2023-10-29 15:59:55 +03:00

223 lines
5.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import pandas as pd
from .utils import any_of_in
# One-letter class codes.  The find_* classifiers in this module return
# "d", "c", "t", "s", "h", "b", "e", "l" and "r"; "w" is what
# address_classification assigns when none of them matched.
CLASSES = ("w", "d", "c", "t", "s", "h", "b", "e", "l", "r")

# Markers checked by find_district.
DISTRICTS_PREFIXES = ("мо ", "р", "городское", "лесхоз")

# Markers checked by find_countryside.
# NOTE(review): verify which alphabet the single-letter "c" entry uses;
# the Latin and Cyrillic forms are visually identical.
COUNTRYSIDE_PREFIXES = (
    "г", "п", "д", "гп", "рп",
    "кп", "пгт", "c", "хутор", " урочище",
)

# Markers checked by find_territory.
TERRITORY_PREFIXES = (
    "тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ",
    "нп ", "пдк ", "т/б ", "садоводство", "массив", "хозя", "сад-во",
)

# Markers checked by find_street (and used by find_countryside as an exclusion).
STREET_PREFIXES = (
    # abbreviations preceded by a space
    " ул", " бул", " пр", " ш", " пер", " дор",
    " маг", " наб", " пл", " просп", " туп",
    # full words and word stems
    "шоссе", "лини", "аллея", "мост", " парк",
    "кольцо", "проезд", "съезд", "переулок",
    # abbreviations followed by a dot
    "ул.", "бул.", "пр.", "ш.", "пер.", "дор.",
    "маг.", "наб.", "пл.", "просп.", "туп.",
)

# Markers checked by find_house, find_building, find_edifice and find_litera.
HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом", "дома")
BUILDING_PREFIXES = ("к.", "к ", "корп", "корпус")
EDIFICE_PREFIXES = ("стр.", "строение")
LETTER = ("лит.", "литера", " л.")

# Every marker group above, in one tuple; find_street scans all of them.
PREFIXES = (
    DISTRICTS_PREFIXES,
    COUNTRYSIDE_PREFIXES,
    TERRITORY_PREFIXES,
    STREET_PREFIXES,
    HOUSES_PREFIXES,
    BUILDING_PREFIXES,
    EDIFICE_PREFIXES,
    LETTER,
)
def find_room(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "r" when the token text contains a room marker, otherwise "".

    The marker is "пом" starting at a word boundary, optionally followed by
    a dot.  The match is case-sensitive.  ``pre_token`` is not used here.
    """
    room_marker = re.search(r"\bпом\.?", token["obj"])
    return "r" if room_marker else ""
def find_litera(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "l" when the token looks like a building letter, otherwise "".

    A token that find_room recognises is never a letter.  Otherwise "l" is
    returned when either:

    * the lowered text hits one of the LETTER markers, or the text has one
      to three digits directly followed by a Cyrillic letter and then a
      space or the end of the string; or
    * the text ends in a single stand-alone Cyrillic letter, the previous
      token's class contains "l" or "h", the text does not contain " ш",
      and find_countryside does not claim the token.
    """
    if find_room(token, pre_token):
        return ""

    text = token["obj"]

    # Explicit marker, or a number with a letter glued to it.
    if any_of_in(LETTER, text.lower()) or re.search(
        r"\d{1,3}([А-Я]|[а-я])( |$)", text
    ):
        return "l"

    # Trailing lone letter: only meaningful right after a house or a letter.
    if not re.search(r"\b([А-Я]|[а-я]){1}$", text):
        return ""
    if "l" not in pre_token["class"] and "h" not in pre_token["class"]:
        return ""
    if " ш" in text:
        return ""
    if find_countryside(token, pre_token):
        return ""
    return "l"
def find_edifice(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "e" when the lowered token text hits an EDIFICE_PREFIXES marker.

    Returns "" otherwise.  ``pre_token`` is not used here.
    """
    lowered = token["obj"].lower()
    return "e" if any_of_in(EDIFICE_PREFIXES, lowered) else ""
def find_building(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "b" when the token looks like a building (korpus), otherwise "".

    The token must contain a digit and must not be a room (find_room).
    Given that, any one of these is enough:

    * the lowered text hits a BUILDING_PREFIXES marker;
    * the previous token's class contains "b", this token's class does not
      contain "h", and find_edifice does not claim the token;
    * the text matches "к", optional dots, an optional space and a digit.

    The grouping of the second alternative follows Python precedence
    (``and`` binds tighter than ``or``).
    """
    text = token["obj"]
    if not re.search(r"\d", text) or find_room(token, pre_token):
        return ""

    if any_of_in(BUILDING_PREFIXES, text.lower()):
        return "b"

    continues_building = (
        "b" in pre_token["class"]
        and "h" not in token["class"]
        and not find_edifice(token, pre_token)
    )
    if continues_building or re.search(r"к\.* ?\d", text):
        return "b"
    return ""
def find_house(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "h" when the token looks like a house number, otherwise "".

    The token must contain a number and must not be a room (find_room).
    The rules are then tried in order:

    1. the lowered text hits a HOUSES_PREFIXES marker;
    2. the text matches "д" or "д." followed by a number (optionally with
       a slash part);
    3. the previous token's class contains "s" or "h", or this token's
       class contains "s", and neither find_building nor find_edifice
       claims the token;
    4. find_building claims the token, and the text either has more than
       one number or its first number is at least 10.

    Rules 3 and 4 are additionally gated by an any_of_in check against a
    tuple of three empty strings.
    NOTE(review): those empty strings look like characters that were lost
    somewhere along the way - confirm the intended markers.
    """
    text = token["obj"]
    first_number = re.search(r"\d{1,4}", text)
    if first_number is None or find_room(token, pre_token):
        return ""

    if any_of_in(HOUSES_PREFIXES, text.lower()):
        return "h"

    if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", text):
        return "h"

    if (
        (
            "s" in pre_token["class"]
            or "h" in pre_token["class"]
            or "s" in token["class"]
        )
        and not any_of_in(("", "", ""), text)
        and not find_building(token, pre_token)
        and not find_edifice(token, pre_token)
    ):
        return "h"

    if find_building(token, pre_token) and not any_of_in(("", "", ""), text):
        numbers = re.findall(r"\d{1,4}", text)
        # Several numbers, or a first number with two or more digits' worth
        # of value, is treated as a house rather than a bare building.
        if len(numbers) > 1 or int(first_number.group()) >= 10:
            return "h"

    return ""
def find_street(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "s" when the token looks like a street name, otherwise "".

    A hit on any STREET_PREFIXES marker in the lowered text is enough.
    Failing that, the token is still a street when all of these hold:

    * the text contains a Cyrillic word of four or more letters;
    * the lowered text contains no marker longer than two characters from
      any group in PREFIXES;
    * the token's class contains none of "d", "t", "c".

    ``pre_token`` is not used here.
    """
    lowered = token["obj"].lower()
    if any_of_in(STREET_PREFIXES, lowered):
        return "s"

    if not re.search(r"\b[А-Яа-я]{4,}\b", token["obj"]):
        return ""

    # Any longer marker of any kind disqualifies the fallback rule.
    for group in PREFIXES:
        for marker in group:
            if len(marker) > 2 and marker in lowered:
                return ""

    if "d" in token["class"] or "t" in token["class"] or "c" in token["class"]:
        return ""
    return "s"
def find_territory(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "t" when the lowered token text hits a TERRITORY_PREFIXES marker.

    Returns "" otherwise.  ``pre_token`` is not used here.
    """
    lowered = token["obj"].lower()
    return "t" if any_of_in(TERRITORY_PREFIXES, lowered) else ""
def find_countryside(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "c" when the token looks like a settlement, otherwise "".

    All of the following must hold, checked in this order:

    * the lowered text hits a COUNTRYSIDE_PREFIXES marker;
    * the text (case preserved) contains a stand-alone run of one to three
      of the abbreviation letters, ended by a word boundary or ". ";
    * find_house does not claim the token;
    * the lowered text hits no STREET_PREFIXES marker.
    """
    text = token["obj"]
    lowered = text.lower()

    if not any_of_in(COUNTRYSIDE_PREFIXES, lowered):
        return ""
    if not re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", text):
        return ""
    if find_house(token, pre_token):
        return ""
    if any_of_in(STREET_PREFIXES, lowered):
        return ""
    return "c"
def find_district(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "d" when the lowered token text hits a DISTRICTS_PREFIXES marker.

    Returns "" otherwise.  ``pre_token`` is not used here.
    """
    lowered = token["obj"].lower()
    return "d" if any_of_in(DISTRICTS_PREFIXES, lowered) else ""
def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
    """Append class codes to ``token["class"]`` and return the token.

    Each find_* classifier is run in a fixed order and whatever code it
    returns is appended to ``token["class"]``; if the class is still empty
    afterwards it is set to "w".  ``token`` is modified in place and the
    same object is returned.

    Text in parentheses is hidden from the classifiers: it is replaced by
    "()" before classification and put back afterwards.

    Args:
        token: the token being classified; its "obj" and "class" entries
            are read and written.
        pre_token: the preceding token, passed through to the classifiers.

    Returns:
        ``token``, with "class" updated and "obj" restored.
    """
    brackets = re.search(r"\(.+\)", token["obj"])
    if brackets:
        # Mask the bracketed text so its contents cannot trigger a classifier.
        token["obj"] = re.sub(r"\(.+\)", "()", token["obj"])

    # Order matters: later classifiers read the class built up so far.
    classifiers = (
        find_district,
        find_countryside,
        find_territory,
        find_street,
        find_house,
        find_building,
        find_edifice,
        find_litera,
        find_room,
    )
    for classify in classifiers:
        token["class"] += classify(token, pre_token)

    if token["class"] == "":
        token["class"] = "w"

    if brackets:
        # Restore with a literal replace.  Passing the captured text to
        # re.sub as the replacement would treat backslashes in it as
        # template escapes, which raises re.error or alters the text.
        token["obj"] = token["obj"].replace("()", brackets.group())
    return token