New functions

This commit is contained in:
AnastasiaOnimovma 2023-10-21 18:12:36 +03:00
parent a71acc2ddf
commit 1fd7a123f9

View File

@ -130,6 +130,21 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"])
return token
def cut_address(ad: pd.Series, cl: str) -> pd.Series:
    """Trim trailing components from an accumulated address.

    While the last class code in ``ad["class"]`` is positioned later in
    ``CLASSES`` than the first code of ``cl``, the text matching that
    component is removed from ``ad["address"]`` and the code is dropped
    from ``ad["class"]``.

    Args:
        ad: Record with string fields ``"class"`` (one-letter class codes)
            and ``"address"``. It is modified in place.
        cl: Class string of the incoming token; only ``cl[0]`` is used.

    Returns:
        The same ``ad`` object, after trimming.

    Raises:
        ValueError: Propagated from ``CLASSES.index`` for an unknown code.
        IndexError: If ``cl`` is empty while ``ad["class"]`` is not.
    """
    # Nothing to trim; also keeps the original behaviour of never touching
    # ``cl`` when the accumulator has no classes.
    if not ad["class"]:
        return ad

    # Loop-invariant: position of the incoming token's leading class.
    limit = CLASSES.index(cl[0])

    while ad["class"] and CLASSES.index(ad["class"][-1]) > limit:
        last = ad["class"][-1]
        address = ad["address"]
        if last == "h":
            # Presumably the house/plot number component.
            # NOTE: this branch lower-cases the whole address as a side
            # effect, exactly as before.
            ad["address"] = re.sub(
                r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", address.lower()
            )
        elif last == "b":
            # Presumably the building ("к.") component.
            found = re.findall(r"к{0,1}\.? ?\d", address)
            if found:
                # Remove the matched text literally. Feeding it back into
                # re.sub unescaped would turn "." into a wildcard.
                ad["address"] = address.replace(found[-1], "")
        elif last == "l":
            # Presumably the trailing letter ("литера") component.
            ad["address"] = re.sub(
                r"[литера]*\.? ?[А-Яа-я]{1}$", "", address
            )
        elif last == "r":
            # Presumably the room ("пом.") component.
            ad["address"] = re.sub(r"пом\.? ?\d+", "", address)
        # Drop the processed class code even if no text was removed.
        ad["class"] = ad["class"][:-1]
    return ad
# TODO: rework the chain of ifs into a cleaner structure
def split_address(address: str) -> List[str]:
@ -163,31 +178,27 @@ def split_address(address: str) -> List[str]:
accumulator["class"] = cur_tk['class']
accumulator["address"] = cur_tk["obj"]
continue
if CLASSES.index(accumulator["class"][-1]) < CLASSES.index(cur_tk["class"][0]) and accumulator["class"]!="w":
accumulator["class"] += cur_tk['class']
accumulator["address"] += " " + cur_tk["obj"]
else:
ad_no_ranges = unfold_house_ranges(accumulator["address"])
accumulator["address"] = ad_no_ranges[-1]
res.extend(ad_no_ranges)
while accumulator["class"] and CLASSES.index(accumulator["class"][-1]) > CLASSES.index(cur_tk["class"][0]):
if accumulator["class"][-1] == "h":
accumulator["address"] = re.sub(r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", accumulator["address"].lower())
elif accumulator["class"][-1] == "b":
num = re.findall("к{0,1}\.? ?\d", accumulator["address"])[-1]
accumulator["address"] = re.sub(num, "", accumulator["address"])
elif accumulator["class"][-1] == "l":
accumulator ["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$","", accumulator["address"])
elif accumulator["class"][-1] == "r":
accumulator["address"] = re.sub(r"пом\.? ?\d+","", accumulator["address"])
accumulator["class"] = accumulator["class"][:-1]
accumulator = cut_address(accumulator, cur_tk["class"])
if not accumulator["class"] or CLASSES.index(cur_tk["class"][0]) <= CLASSES.index("s") or accumulator["class"]=="w":
accumulator["class"] = cur_tk["class"]
accumulator["address"] = cur_tk["obj"]
if cur_tk["class"][0] == "h":
num = re.findall("\d{1,4} ?\/?\d* ?", cur_tk['obj'])[0]
accumulator["address"] = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator["address"])
cur_tk["class"] =cur_tk["class"][1:]
if cur_tk["class"] and cur_tk["class"][0] == "b":
num = re.findall("\d", cur_tk["obj"])[-1]
if num and not "b" in accumulator["class"]: