From cb422b9a2fc47f8413a3fbe6c4726a6b94ca0fee Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Mon, 16 Oct 2023 00:42:22 +0300 Subject: [PATCH] Classification(ver.1) --- parser/address.py | 144 +++++++++++++++++++++++++++++++++------------- 1 file changed, 103 insertions(+), 41 deletions(-) diff --git a/parser/address.py b/parser/address.py index 79eac1b..a7ca5dd 100644 --- a/parser/address.py +++ b/parser/address.py @@ -7,22 +7,29 @@ import pandas as pd T = TypeVar("T") -SETTLEMENTS_PREFIXES=("г","мо","р-н","п","д","гп","c","хутор","массив","тер","СНТ","СТ","ДСК","ДНП","ДПК","НП","садоводство") -STREET_PREFIXES = ("ул", "бул", "пр", "ш", "пер", "дор", "маг", "наб", "пл", "просп", "туп", "аллея", "мост", "парк", "кольцо","проезд") -HOUSES_PREFIXES = ("д.", "д","уч","участок") -BUILDING_PREFIXES=("к", "корп") -LETTER=("лит", "литера") +CLASSES = ("s", "h", "b", "l", "?") -def unfold_house_ranges(address:str, token: str) -> List[str]: - adresses=[] +SETTLEMENTS_PREFIXES = ( + "г", "мо", "р-н", "п", "д", "гп", "c", "хутор", "массив", "тер", "СНТ", "СТ", "ДСК", "ДНП", "ДПК", "НП", + "садоводство") +STREET_PREFIXES = ( + " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе","линия","аллея", "мост", "парк", "кольцо","проезд", + "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.") +HOUSES_PREFIXES = ("д.", "уч.", "участок") +BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение") +LETTER = ("лит.", "литера"," л.") + + +def unfold_house_ranges(address: str, token: str) -> List[str]: + adresses = [] pairs_strings = re.findall(r"([\d]+-[\d]+)", token) for pair_string in pairs_strings: a, b = pair_string.split("-") a, b = int(a), int(b) if b > a: - token = token.replace(pair_string,"") - adresses += [address + " "+ token + number for number in map(str, range(a, b + 1))] + token = token.replace(pair_string, "") + adresses += [address + " " + token + number for number in map(str, range(a, b + 1))] else: @@ -52,6 +59,46 @@ def flatten(arr: Iterable[List[T]]) -> List[T]: return sum(arr, []) +def find_litera(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(LETTER, token['obj']) \ + or re.search(r"\d{1,3}[А-Яа-я]( |$)", token['obj']): + return "l" + # не работает + if (re.search(r"\b[А-Яа-я]{1}\b", token['obj']) and "l" in pre_token['class']): + return "l" + return "" + + +def find_building(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(BUILDING_PREFIXES, token['obj']) \ + or (re.search(r"\d", token['obj']) and "b" in pre_token['class']) \ + or re.search(r"к\.*\d", token['obj']) \ + or re.search(r"\d", token['obj']) and "b" in pre_token['class']: + return "b" + return "" + + +def find_house(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(HOUSES_PREFIXES, token['obj']): + return "h" + if re.search(r"(д|д\.) ?\d{1,3} ?\/*\d* ?", token['obj']) and not ("-я" in token['obj']): + if "h" in pre_token['class'] \ + or "s" in pre_token['class'] \ + or "s" in token['class']: + return "h" + # не работает + if re.search(r"\d{1,3}", token['obj']) and ("s" in pre_token['class'] or "h" in pre_token['class']): + return "h" + return "" + + +def find_street(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(STREET_PREFIXES, token['obj']) \ + or (re.search(r"[А-Я]{1}[а-я]+", token['obj']) and "s" in pre_token['class']): + return "s" + return "" + + # TODO: переработать систему из if в нормальный вид и классификация чисел/букв def split_address(address: str) -> List[str]: if ";" in address: @@ -59,47 +106,62 @@ def split_address(address: str) -> List[str]: if "," in address: tokens = address.split(",") - tokens = list(map(str.strip, filter(lambda token: token != "", tokens))) + t = list(map(str.strip, filter(lambda token: token != "", tokens))) + # токены в датафрэйм + tokens = pd.DataFrame() + tokens['obj'] = t + tokens.insert(len(tokens.columns), "class", "") res = [] - accumulator = [] + accumulator = "" for i in range(len(tokens)): # TODO: напселённые пункты # if any_of_in(SETTLEMENTS_PREFIXES, tokens[i].lower()) # accumulator += tokens[i] + cur_tk = tokens.iloc[i] - # улицы - if any_of_in(STREET_PREFIXES, tokens[i].lower()): - if accumulator and any_of_in(STREET_PREFIXES, "".join(accumulator).lower() ): - res.append( " ".join(accumulator)) - accumulator=[] - accumulator.append(tokens[i]) - - # дома - elif any_of_in(HOUSES_PREFIXES, tokens[i].lower()): - if accumulator and any_of_in(HOUSES_PREFIXES, accumulator[-1].lower()): - res.append(" ".join(accumulator)) - accumulator.pop() - res.append(unfold_house_ranges(" ".join(accumulator),tokens[i])) - accumulator=res[-1] - res.pop() - # корпус - elif any_of_in(BUILDING_PREFIXES, tokens[i].lower()): - if accumulator and any_of_in(BUILDING_PREFIXES, accumulator[-1].lower() ): - res.append( " ".join(accumulator)) - accumulator.pop() - accumulator.append(tokens[i]) - # литера - elif any_of_in(LETTER, tokens[i].lower()): - if accumulator and any_of_in(LETTER, accumulator[-1].lower() ): - res.append(" ".join(accumulator)) - accumulator.pop() - accumulator.append (tokens[i]) + if i == 0: + pre_token = pd.Series(data=["", ""], index=['obj', 'class']) else: - accumulator.append(tokens[i]) - - res.append(" ".join(accumulator)) + pre_token = tokens.iloc[i - 1] + obj_class = find_street(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "s" in tokens['class'].iloc[i - 1]: + res.append(accumulator) + accumulator = "" + accumulator += tokens["obj"].iloc[i] + obj_class = find_house(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "h" in tokens['class'].iloc[i - 1]: + res.append(accumulator) + num = re.findall("\d{,3}", tokens['obj'].iloc[i])[-1] + accumulator = re.sub(r"\d{,3} ?\/*\d* ?", num,accumulator) + else: + accumulator += tokens["obj"].iloc[i] + obj_class = find_building(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "b" in tokens['class'].iloc[i - 1]: + res.append(accumulator) + num = re.findall("\d", tokens['obj'].iloc[i])[-1] + accumulator = re.sub(r"\d$", num, accumulator) + else: + accumulator += tokens["obj"].iloc[i] + obj_class = find_litera(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "l" in tokens['class'].iloc[i - 1]: + res.append(accumulator) + num = re.findall("[А-яа-я]", tokens['obj'].iloc[i].strip())[-1] + accumulator = re.sub(r"[А-яа-я]$", num, accumulator) + else: + accumulator += tokens["obj"].iloc[i] + if cur_tk['class'] == "": + cur_tk['class'] = "w" + print(cur_tk) return res return [address]