# lenengro_parser/parser/address/classifier.py

import re

import pandas as pd

from .utils import any_of_in

# One-letter class codes appended to a token's "class" string by the finders
# in this module: "w" is the fallback assigned when no finder matches; the
# others are returned by find_district ("d"), find_countryside ("c"),
# find_territory ("t"), find_street ("s"), find_house ("h"),
# find_building ("b"), find_edifice ("e"), find_litera ("l"), find_room ("r").
CLASSES = ("w", "d", "c", "t", "s", "h", "b", "e", "l", "r")

# Marker strings handed to any_of_in() together with the lower-cased token
# text. Leading/trailing spaces inside a marker are part of the marker and
# must be preserved (presumably they stand in for word boundaries - confirm
# against any_of_in in .utils).

# Markers for the district class ("d").
DISTRICTS_PREFIXES = ("мо ", "р-н", "городское", "лесхоз")
# Markers for the countryside / settlement class ("c").
# NOTE(review): verify which alphabet the single-letter "c" entry below is in;
# Latin "c" and Cyrillic "с" look identical but do not match each other.
COUNTRYSIDE_PREFIXES = (
    "г",
    "п",
    "д",
    "гп",
    "рп",
    "кп",
    "пгт",
    "c",
    "хутор",
    " урочище",
)
# Markers for the territory class ("t").
TERRITORY_PREFIXES = (
    "тер.",
    " тер",
    "снт ",
    "ст ",
    "дск ",
    "днп ",
    "дпк ",
    "нп ",
    "пдк ",
    "т/б ",
    "садоводство",
    "массив",
    "хозя",
    "сад-во",
)
# Markers for the street class ("s"): space-prefixed abbreviations, full or
# partial words, and dot-terminated abbreviations.
STREET_PREFIXES = (
    " ул",
    " бул",
    " пр",
    " ш",
    " пер",
    " дор",
    " маг",
    " наб",
    " пл",
    " просп",
    " туп",
    "шоссе",
    "лини",
    "аллея",
    "мост",
    " парк",
    "кольцо",
    "проезд",
    "съезд",
    "переулок",
    "ул.",
    "бул.",
    "пр.",
    "ш.",
    "пер.",
    "дор.",
    "маг.",
    "наб.",
    "пл.",
    "просп.",
    "туп.",
)
# Markers for the house class ("h").
HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом", "дома")
# Markers for the building class ("b").
BUILDING_PREFIXES = ("к.", "к ", "корп", "корпус")
# Markers for the edifice class ("e").
EDIFICE_PREFIXES = ("стр.", "строение")
# Markers for the litera class ("l").
LETTER = ("лит.", "литера", " л.")
# All marker groups together; find_street scans these (markers longer than
# two characters only) to rule out tokens that carry some other known marker.
PREFIXES = (
    DISTRICTS_PREFIXES,
    COUNTRYSIDE_PREFIXES,
    TERRITORY_PREFIXES,
    STREET_PREFIXES,
    HOUSES_PREFIXES,
    BUILDING_PREFIXES,
    EDIFICE_PREFIXES,
    LETTER,
)


def find_room(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "r" when the token text contains a room marker, else "".

    The marker is a word starting with lower-case "пом", optionally followed
    by a dot. The match is case-sensitive.

    Args:
        token: Mapping-like token; only ``token["obj"]`` (the text) is read.
        pre_token: Previous token; unused, kept for a uniform finder signature.
    """
    return "r" if re.search(r"\bпом\.?", token["obj"]) else ""


def find_litera(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "l" when the token looks like a litera (building letter), else "".

    Args:
        token: Current token; ``token["obj"]`` is the text to inspect.
        pre_token: Previous token; its ``"class"`` string is consulted for the
            trailing-single-letter rule.
    """
    # A room token is never reported as a litera.
    if find_room(token, pre_token):
        return ""

    text = token["obj"]

    # Rule 1: an explicit litera marker, or a 1-3 digit number immediately
    # followed by one Cyrillic letter and then a space or end of text.
    explicit = any_of_in(LETTER, text.lower()) or re.search(
        r"\d{1,3}([А-Я]|[а-я])( |$)", text
    )
    if explicit:
        return "l"

    # Rule 2: the text ends with a stand-alone Cyrillic letter ...
    if not re.search(r"\b([А-Я]|[а-я]){1}$", text):
        return ""
    # ... the previous token was already a litera or a house ...
    if not ("l" in pre_token["class"] or "h" in pre_token["class"]):
        return ""
    # ... the text carries no " ш" street marker ...
    if " ш" in text:
        return ""
    # ... and the token is not a countryside token.
    if find_countryside(token, pre_token):
        return ""
    return "l"


def find_edifice(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "e" when the lower-cased token text has an edifice marker, else "".

    Args:
        token: Current token; only ``token["obj"]`` is read.
        pre_token: Previous token; unused, kept for a uniform finder signature.
    """
    lowered = token["obj"].lower()
    return "e" if any_of_in(EDIFICE_PREFIXES, lowered) else ""


def find_building(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "b" when the token looks like a building (corpus) number, else "".

    The token must contain a digit and must not be a room token. It is then
    accepted by the first of three rules that holds, checked in order.

    Args:
        token: Current token; ``token["obj"]`` and ``token["class"]`` are read.
        pre_token: Previous token; ``pre_token["class"]`` is read.
    """
    text = token["obj"]
    if not re.search(r"\d", text) or find_room(token, pre_token):
        return ""

    # Rule 1: an explicit building marker is present.
    if any_of_in(BUILDING_PREFIXES, text.lower()):
        return "b"

    # Rule 2: continuation of a building sequence - the previous token was a
    # building, this one is not yet classified as a house, and it is not an
    # edifice.
    if (
        "b" in pre_token["class"]
        and "h" not in token["class"]
        and not find_edifice(token, pre_token)
    ):
        return "b"

    # Rule 3: lower-case "к", optional dots, optional space, then a digit.
    if re.search(r"к\.* ?\d", text):
        return "b"

    return ""


def find_house(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "h" when the token looks like a house number, else "".

    The token must contain a digit and must not be a room token. It is then
    accepted by the first of the rules below that holds, checked in order.

    Args:
        token: Current token; ``token["obj"]`` and ``token["class"]`` are read.
        pre_token: Previous token; ``pre_token["class"]`` is read.
    """
    text = token["obj"]
    ordinal_endings = ("-я", "-й", "-Я")

    if not re.search(r"\d{1,4}", text) or find_room(token, pre_token):
        return ""

    # Rule 1: an explicit house marker is present.
    if any_of_in(HOUSES_PREFIXES, text.lower()):
        return "h"

    # Rule 2: "д" / "д." followed by a number, optionally with a "/" fraction.
    if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", text):
        return "h"

    # Rule 3: a number in street/house context that has no ordinal ending and
    # is neither a building nor an edifice.
    in_street_context = (
        "s" in pre_token["class"]
        or "h" in pre_token["class"]
        or "s" in token["class"]
    )
    if (
        in_street_context
        and not any_of_in(ordinal_endings, text)
        and not find_building(token, pre_token)
        and not find_edifice(token, pre_token)
    ):
        return "h"

    # Rule 4 applies only to building tokens without an ordinal ending.
    if not find_building(token, pre_token) or any_of_in(ordinal_endings, text):
        return ""

    # Such a token is also a house when it holds several numbers, or when its
    # first number is 10 or more.
    numbers = re.findall(r"\d{1,4}", text)
    if len(numbers) > 1:
        return "h"
    if int(numbers[0]) >= 10:
        return "h"
    return ""


def find_street(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "s" when the token looks like a street name, else "".

    Args:
        token: Current token; ``token["obj"]`` and ``token["class"]`` are read.
        pre_token: Previous token; unused, kept for a uniform finder signature.
    """
    text = token["obj"]
    lowered = text.lower()

    # An explicit street marker settles it immediately.
    if any_of_in(STREET_PREFIXES, lowered):
        return "s"

    # Otherwise require a Cyrillic word of at least four letters ...
    if not re.search(r"\b[А-Яа-я]{4,}\b", text):
        return ""

    # ... with no known marker (longer than two characters) of any class ...
    for group in PREFIXES:
        for marker in group:
            if len(marker) > 2 and marker in lowered:
                return ""

    # ... on a token not already classified as district, territory or
    # countryside.
    assigned = token["class"]
    if "d" in assigned or "t" in assigned or "c" in assigned:
        return ""

    return "s"


def find_territory(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "t" when the lower-cased token text has a territory marker, else "".

    Args:
        token: Current token; only ``token["obj"]`` is read.
        pre_token: Previous token; unused, kept for a uniform finder signature.
    """
    lowered = token["obj"].lower()
    return "t" if any_of_in(TERRITORY_PREFIXES, lowered) else ""


def find_countryside(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "c" when the token looks like a settlement designation, else "".

    All four conditions below must hold; they are checked in order and the
    first failure ends the check.

    Args:
        token: Current token; ``token["obj"]`` is the text to inspect.
        pre_token: Previous token; forwarded to find_house.
    """
    text = token["obj"]

    # 1. A countryside marker occurs in the lower-cased text.
    if not any_of_in(COUNTRYSIDE_PREFIXES, text.lower()):
        return ""
    # 2. The original-case text has a 1-3 letter abbreviation built from the
    #    marker letters, ending at a word boundary or followed by ". ".
    if not re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", text):
        return ""
    # 3. The token is not a house.
    if find_house(token, pre_token):
        return ""
    # 4. The token carries no street marker.
    if any_of_in(STREET_PREFIXES, text.lower()):
        return ""
    return "c"


def find_district(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "d" when the lower-cased token text has a district marker, else "".

    Args:
        token: Current token; only ``token["obj"]`` is read.
        pre_token: Previous token; unused, kept for a uniform finder signature.
    """
    lowered = token["obj"].lower()
    return "d" if any_of_in(DISTRICTS_PREFIXES, lowered) else ""


def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
    """Classify one address token and return it with its "class" filled in.

    Each finder is run in a fixed order and its one-letter result (or "") is
    appended to ``token["class"]``. The order matters, because later finders
    read the classes appended by earlier ones. If no finder matches, the
    class becomes "w".

    Parenthesised text is hidden from the finders: while they run, it is
    replaced by an empty "()" placeholder, and afterwards ``token["obj"]`` is
    set back to its original value.

    Args:
        token: Current token; ``token["obj"]`` is the text and
            ``token["class"]`` is a string the result is appended to. The
            token is modified in place.
        pre_token: Previous token, passed through to every finder.

    Returns:
        The same ``token`` object, with ``token["class"]`` updated.
    """
    original_obj = token["obj"]
    has_brackets = re.search(r"\(.+\)", original_obj) is not None
    if has_brackets:
        token["obj"] = re.sub(r"\(.+\)", "()", original_obj)

    # Do not reorder: find_street, find_house and find_building inspect
    # token["class"] as accumulated so far.
    finders = (
        find_district,
        find_countryside,
        find_territory,
        find_street,
        find_house,
        find_building,
        find_edifice,
        find_litera,
        find_room,
    )
    for finder in finders:
        token["class"] += finder(token, pre_token)

    if token["class"] == "":
        token["class"] = "w"

    if has_brackets:
        # Restore the saved text directly. Substituting the captured text
        # back through re.sub would treat it as a replacement template, so a
        # backslash inside the parentheses would raise re.error or be
        # rewritten.
        token["obj"] = original_obj

    return token