def cut_address(ad: pd.Series, cl: str) -> pd.Series:
    """Trim trailing address parts that rank after the incoming class.

    While the last letter of ``ad["class"]`` is positioned later in
    ``CLASSES`` than ``cl[0]``, the text fragment belonging to that letter is
    removed from ``ad["address"]`` and the letter is dropped from
    ``ad["class"]``.

    Args:
        ad: Accumulator holding a ``"class"`` string (one letter per collected
            part) and an ``"address"`` string. It is modified in place.
        cl: Class string of the incoming token; only its first letter is
            compared. An empty string leaves ``ad`` untouched.

    Returns:
        The same ``ad`` object after trimming.

    Raises:
        ValueError: If a compared class letter is not present in ``CLASSES``.
    """
    if not ad["class"] or not cl:
        # Nothing has been collected yet, or there is nothing to compare with.
        return ad

    # The incoming token's rank does not change while we trim, so look it up once.
    limit = CLASSES.index(cl[0])

    while ad["class"] and CLASSES.index(ad["class"][-1]) > limit:
        last = ad["class"][-1]
        if last == "h":
            # NOTE: this branch lower-cases the whole address before stripping.
            ad["address"] = re.sub(
                r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", ad["address"].lower()
            )
        elif last == "b":
            found = re.findall(r"к{0,1}\.? ?\d", ad["address"])
            if found:
                # Remove the matched text literally: feeding it back into
                # re.sub would treat a "." inside it as a wildcard.
                ad["address"] = ad["address"].replace(found[-1], "")
            # NOTE(review): the pattern captures a single digit, so a
            # multi-digit number is only partly matched — confirm intent.
        elif last == "l":
            ad["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$", "", ad["address"])
        elif last == "r":
            ad["address"] = re.sub(r"пом\.? ?\d+", "", ad["address"])
        # Letters without a dedicated branch are dropped from the class string
        # without touching the address text.
        ad["class"] = ad["class"][:-1]
    return ad