import re

import pandas as pd

from .utils import any_of_in

# One-letter class codes that the find_* functions below may append to
# token["class"]; "w" is assigned by address_classification when no other
# class matched.
CLASSES = ("w", "d", "c", "t", "s", "h", "b", "e", "l", "r")

# Substrings looked up in the lower-cased token text by the find_* functions.
DISTRICTS_PREFIXES = ("мо ", "р-н", "городское", "лесхоз")
# NOTE(review): the single-letter entry "c" below (and the "c" inside the
# regex of find_countryside) looks like a Latin letter; if Cyrillic "с" was
# intended it will never match Cyrillic input -- confirm.
COUNTRYSIDE_PREFIXES = (
    "г",
    "п",
    "д",
    "гп",
    "рп",
    "кп",
    "пгт",
    "c",
    "хутор",
    " урочище",
)
TERRITORY_PREFIXES = (
    "тер.",
    " тер",
    "снт ",
    "ст ",
    "дск ",
    "днп ",
    "дпк ",
    "нп ",
    "пдк ",
    "т/б ",
    "садоводство",
    "массив",
    "хозя",
    "сад-во",
)
STREET_PREFIXES = (
    " ул",
    " бул",
    " пр",
    " ш",
    " пер",
    " дор",
    " маг",
    " наб",
    " пл",
    " просп",
    " туп",
    "шоссе",
    "лини",
    "аллея",
    "мост",
    " парк",
    "кольцо",
    "проезд",
    "съезд",
    "переулок",
    "ул.",
    "бул.",
    "пр.",
    "ш.",
    "пер.",
    "дор.",
    "маг.",
    "наб.",
    "пл.",
    "просп.",
    "туп.",
)
HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом", "дома")
BUILDING_PREFIXES = ("к.", "к ", "корп", "корпус")
EDIFICE_PREFIXES = ("стр.", "строение")
LETTER = ("лит.", "литера", " л.")

# All prefix groups together; find_street uses this to reject tokens that
# already carry a known (longer than two characters) prefix.
PREFIXES = (
    DISTRICTS_PREFIXES,
    COUNTRYSIDE_PREFIXES,
    TERRITORY_PREFIXES,
    STREET_PREFIXES,
    HOUSES_PREFIXES,
    BUILDING_PREFIXES,
    EDIFICE_PREFIXES,
    LETTER,
)

# NOTE(review): any_of_in comes from .utils and is not visible here; the code
# below treats it as "truthy when any of the given substrings occurs in the
# string" -- confirm against its definition.


def find_room(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "r" if the token text contains the room marker, else "".

    Args:
        token: Record being classified; only ``token["obj"]`` is read.
        pre_token: Previous record; unused, kept for a uniform signature.
    """
    if re.search(r"\bпом\.?", token["obj"]):
        return "r"
    return ""


def find_litera(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "l" if the token looks like a litera (letter) part, else "".

    Args:
        token: Record being classified; ``token["obj"]`` is read here and
            ``token["class"]`` by the helpers this function calls.
        pre_token: Previous record; its ``"class"`` string is consulted.
    """
    # A token already recognised as a room is never a litera.
    if find_room(token, pre_token):
        return ""

    # Explicit litera prefix, or a 1-3 digit number immediately followed by a
    # Cyrillic letter and then a space or the end of the text.
    if (
        any_of_in(LETTER, token["obj"].lower())
        or re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"])
    ):
        return "l"

    # A trailing stand-alone Cyrillic letter counts as a litera only right
    # after a litera/house token, and only if the token is neither a " ш"
    # street form nor a countryside token.
    if (
        (
            re.search(r"\b([А-Я]|[а-я]){1}$", token["obj"])
            and ("l" in pre_token["class"] or "h" in pre_token["class"])
        )
        and (" ш" not in token["obj"])
        and not find_countryside(token, pre_token)
    ):
        return "l"
    return ""


def find_edifice(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "e" if the token text contains an edifice prefix, else "".

    Args:
        token: Record being classified; only ``token["obj"]`` is read.
        pre_token: Previous record; unused, kept for a uniform signature.
    """
    if any_of_in(EDIFICE_PREFIXES, token["obj"].lower()):
        return "e"
    return ""


def find_building(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "b" if the token looks like a building (corpus) part, else "".

    Only tokens that contain a digit and are not rooms are considered.

    Args:
        token: Record being classified; ``"obj"`` and ``"class"`` are read.
        pre_token: Previous record; its ``"class"`` string is consulted.
    """
    if re.search(r"\d", token["obj"]) and not find_room(token, pre_token):
        # Python precedence makes this: prefix OR (continuation of a previous
        # building token that is neither a house nor an edifice) OR regex.
        if (
            any_of_in(BUILDING_PREFIXES, token["obj"].lower())
            or "b" in pre_token["class"]
            and ("h" not in token["class"])
            and not find_edifice(token, pre_token)
            or re.search(r"к\.* ?\d", token["obj"])
        ):
            return "b"
    return ""


def find_house(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "h" if the token looks like a house number part, else "".

    Only tokens that contain a digit and are not rooms are considered.

    Args:
        token: Record being classified; ``"obj"`` and ``"class"`` are read.
        pre_token: Previous record; its ``"class"`` string is consulted.
    """
    if re.search(r"\d{1,4}", token["obj"]) and not find_room(token, pre_token):
        if any_of_in(HOUSES_PREFIXES, token["obj"].lower()):
            return "h"
        if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token["obj"]):
            return "h"

        # A number following a street/house token (or inside a token already
        # classified as a street) is a house, unless it carries an ordinal
        # suffix or is a building/edifice.
        if (
            (
                "s" in pre_token["class"]
                or "h" in pre_token["class"]
                or "s" in token["class"]
            )
            and not any_of_in(("-я", "-й", "-Я"), token["obj"])
            and not find_building(token, pre_token)
            and not find_edifice(token, pre_token)
        ):
            return "h"

        # A building token also counts as a house when it holds more than one
        # number, or when its first number is 10 or greater.
        if (
            find_building(token, pre_token)
            and not any_of_in(("-я", "-й", "-Я"), token["obj"])
        ):
            if len(re.findall(r"\d{1,4}", token["obj"])) > 1:
                return "h"
            if int(re.search(r"\d{1,4}", token["obj"]).group()) // 10 > 0:
                return "h"
    return ""


def find_street(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "s" if the token looks like a street name, else "".

    Args:
        token: Record being classified; ``"obj"`` and ``"class"`` are read.
        pre_token: Previous record; unused, kept for a uniform signature.
    """
    if any_of_in(STREET_PREFIXES, token["obj"].lower()):
        return "s"

    # Fallback: a Cyrillic word of four or more letters that contains none of
    # the known prefixes longer than two characters and has not already been
    # classified as district, territory or countryside.
    lowered = token["obj"].lower()
    if (
        re.search(r"\b[А-Яа-я]{4,}\b", token["obj"])
        and not any(
            el in lowered for pr in PREFIXES for el in pr if len(el) > 2
        )
        and not (
            "d" in token["class"]
            or "t" in token["class"]
            or "c" in token["class"]
        )
    ):
        return "s"
    return ""


def find_territory(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "t" if the token text contains a territory prefix, else "".

    Args:
        token: Record being classified; only ``token["obj"]`` is read.
        pre_token: Previous record; unused, kept for a uniform signature.
    """
    if any_of_in(TERRITORY_PREFIXES, token["obj"].lower()):
        return "t"
    return ""


def find_countryside(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "c" if the token looks like a settlement name, else "".

    The token must contain a countryside prefix and a 1-3 letter abbreviation
    built from the prefix letters, and must be neither a house token nor
    contain a street prefix.

    Args:
        token: Record being classified; ``"obj"`` and ``"class"`` are read.
        pre_token: Previous record; passed through to find_house.
    """
    if (
        any_of_in(COUNTRYSIDE_PREFIXES, token["obj"].lower())
        and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token["obj"])
        and not find_house(token, pre_token)
        and not any_of_in(STREET_PREFIXES, token["obj"].lower())
    ):
        return "c"
    return ""


def find_district(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "d" if the token text contains a district prefix, else "".

    Args:
        token: Record being classified; only ``token["obj"]`` is read.
        pre_token: Previous record; unused, kept for a uniform signature.
    """
    if any_of_in(DISTRICTS_PREFIXES, token["obj"].lower()):
        return "d"
    return ""


def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
    """Append class letters to ``token["class"]`` and return the token.

    Parenthesised text in ``token["obj"]`` is temporarily replaced by "()" so
    that it does not influence classification, and is restored afterwards.
    The detectors run in a fixed order (district, countryside, territory,
    street, house, building, edifice, litera, room); later detectors read the
    letters appended by earlier ones. If nothing matched, the class becomes
    "w".

    Args:
        token: Record with a string ``"obj"`` (text) and a string ``"class"``
            (letters accumulated so far). It is modified in place.
        pre_token: Previous record; its ``"class"`` string is consulted by
            several detectors.

    Returns:
        The same ``token`` object with an updated ``"class"``.
    """
    brackets = re.search(r"\(.+\)", token["obj"])
    if brackets:
        token["obj"] = re.sub(r"\(.+\)", "()", token["obj"])

    # Order matters: each detector may inspect letters appended before it.
    token["class"] += find_district(token, pre_token)
    token["class"] += find_countryside(token, pre_token)
    token["class"] += find_territory(token, pre_token)
    token["class"] += find_street(token, pre_token)
    token["class"] += find_house(token, pre_token)
    token["class"] += find_building(token, pre_token)
    token["class"] += find_edifice(token, pre_token)
    token["class"] += find_litera(token, pre_token)
    token["class"] += find_room(token, pre_token)

    if token["class"] == "":
        token["class"] = "w"

    if brackets:
        # Use a callable replacement so the restored text is inserted
        # verbatim; a plain string would be parsed as a replacement template
        # and a backslash in the address (e.g. "(корп 5\2)") would raise
        # re.error or be rewritten as an escape sequence.
        original_text = brackets.group()
        token["obj"] = re.sub(r"\(\)", lambda _match: original_text, token["obj"])
    return token