# Heuristic classifiers that tag address tokens with one-letter component codes.
import re
|
||
|
||
import pandas as pd
|
||
|
||
from .utils import any_of_in
|
||
|
||
# One-letter class codes. Judging by the find_* functions in this module:
# w = no class found, d = district, c = countryside, t = territory, s = street,
# h = house, b = building, e = edifice, l = litera (letter), r = room.
CLASSES = ("w", "d", "c", "t", "s", "h", "b", "e", "l", "r")

# Text fragments looked up in the lower-cased token text via any_of_in().
# Leading/trailing spaces inside the literals are significant: keep them as is.
DISTRICTS_PREFIXES = ("мо ", "р-н", "городское", "лесхоз")

COUNTRYSIDE_PREFIXES = (
    "г",
    "п",
    "д",
    "гп",
    "рп",
    "кп",
    "пгт",
    # NOTE(review): this entry looks like Latin "c"; Cyrillic "с" may have
    # been intended - confirm before changing, the value is kept verbatim.
    "c",
    "хутор",
    " урочище",
)

TERRITORY_PREFIXES = (
    "тер.",
    " тер",
    "снт ",
    "ст ",
    "дск ",
    "днп ",
    "дпк ",
    "нп ",
    "пдк ",
    "т/б ",
    "садоводство",
    "массив",
    "хозя",
    "сад-во",
)

STREET_PREFIXES = (
    # Abbreviations that must be preceded by a space.
    " ул",
    " бул",
    " пр",
    " ш",
    " пер",
    " дор",
    " маг",
    " наб",
    " пл",
    " просп",
    " туп",
    # Full words / word stems.
    "шоссе",
    "лини",
    "аллея",
    "мост",
    " парк",
    "кольцо",
    "проезд",
    "съезд",
    "переулок",
    # Abbreviations written with a trailing dot.
    "ул.",
    "бул.",
    "пр.",
    "ш.",
    "пер.",
    "дор.",
    "маг.",
    "наб.",
    "пл.",
    "просп.",
    "туп.",
)

HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом", "дома")

BUILDING_PREFIXES = ("к.", "к ", "корп", "корпус")

EDIFICE_PREFIXES = ("стр.", "строение")

LETTER = ("лит.", "литера", " л.")

# Every prefix group; find_street() scans it to reject tokens that carry any
# known marker longer than two characters.
PREFIXES = (
    DISTRICTS_PREFIXES,
    COUNTRYSIDE_PREFIXES,
    TERRITORY_PREFIXES,
    STREET_PREFIXES,
    HOUSES_PREFIXES,
    BUILDING_PREFIXES,
    EDIFICE_PREFIXES,
    LETTER,
)
|
||
|
||
|
||
def find_room(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"r"`` if the token text contains a room marker, else ``""``.

    The marker is ``пом`` at a word boundary, optionally followed by a dot.
    The match is case-sensitive and runs on ``token["obj"]`` as given.

    Args:
        token: Record whose ``"obj"`` entry holds the token text.
        pre_token: Previous token; unused here, accepted so that every
            ``find_*`` function shares the same signature.
    """
    if re.search(r"\bпом\.?", token["obj"]):
        return "r"
    return ""
|
||
|
||
|
||
def find_litera(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"l"`` if the token looks like a building letter, else ``""``.

    A token that ``find_room`` recognises is never a letter. Otherwise the
    token qualifies when either rule holds:

    1. ``any_of_in`` reports one of ``LETTER`` in the lower-cased text, or the
       text contains digits immediately followed by a single Cyrillic letter
       and then a space or the end of the text.
    2. The text ends with a stand-alone single Cyrillic letter, the previous
       token's class contains ``"l"`` or ``"h"``, the text does not contain
       ``" ш"``, and ``find_countryside`` does not claim the token.

    Args:
        token: Record with ``"obj"`` (text) and ``"class"`` entries.
        pre_token: Previous token; only its ``"class"`` entry is read here.
    """
    if find_room(token, pre_token):
        return ""

    text = token["obj"]

    # Rule 1: explicit marker, or a number with a letter suffix.
    # any_of_in comes from .utils; presumably a substring test - confirm.
    if any_of_in(LETTER, text.lower()) or re.search(
        r"\d{1,3}([А-Я]|[а-я])( |$)", text
    ):
        return "l"

    # Rule 2: a lone trailing letter right after a house or another letter.
    if (
        re.search(r"\b([А-Я]|[а-я]){1}$", text)
        and ("l" in pre_token["class"] or "h" in pre_token["class"])
        and " ш" not in text
        and not find_countryside(token, pre_token)
    ):
        return "l"

    return ""
|
||
|
||
|
||
def find_edifice(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"e"`` if the token carries an edifice marker, else ``""``.

    The lower-cased ``token["obj"]`` is checked against ``EDIFICE_PREFIXES``
    with ``any_of_in`` (helper from ``.utils``; presumably a substring test -
    confirm).

    Args:
        token: Record whose ``"obj"`` entry holds the token text.
        pre_token: Previous token; unused here, kept for the shared
            ``find_*`` signature.
    """
    if any_of_in(EDIFICE_PREFIXES, token["obj"].lower()):
        return "e"
    return ""
|
||
|
||
|
||
def find_building(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"b"`` if the token looks like a building (корпус), else ``""``.

    The token must contain a digit and must not be a room (``find_room``).
    It is then a building when any one of the following holds:

    * ``any_of_in`` reports one of ``BUILDING_PREFIXES`` in the lower-cased
      text;
    * the previous token's class contains ``"b"``, this token's class does
      not contain ``"h"``, and ``find_edifice`` does not claim it;
    * the text matches ``к`` + optional dots + optional space + a digit.

    Args:
        token: Record with ``"obj"`` (text) and ``"class"`` entries.
        pre_token: Previous token; only its ``"class"`` entry is read here.
    """
    text = token["obj"]
    if not re.search(r"\d", text) or find_room(token, pre_token):
        return ""

    # The parentheses spell out Python's precedence (``and`` binds tighter
    # than ``or``); the grouping is the same as in the unparenthesised form.
    if (
        any_of_in(BUILDING_PREFIXES, text.lower())
        or (
            "b" in pre_token["class"]
            and "h" not in token["class"]
            and not find_edifice(token, pre_token)
        )
        or re.search(r"к\.* ?\d", text)
    ):
        return "b"
    return ""
|
||
|
||
|
||
def find_house(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"h"`` if the token looks like a house number, else ``""``.

    The token must contain a digit and must not be a room (``find_room``).
    The rules are then tried in this order:

    1. ``any_of_in`` reports one of ``HOUSES_PREFIXES`` in the lower-cased
       text.
    2. The text matches ``д`` / ``д.`` followed by a number (optionally with
       a ``/`` part).
    3. The token is not a building, it follows a street or a house (or is
       itself already classed as a street), its text has none of the ordinal
       endings ``-я`` / ``-й`` / ``-Я``, and it is not an edifice.
    4. The token is a building without an ordinal ending, and its text holds
       more than one number or its first number is at least 10.

    Args:
        token: Record with ``"obj"`` (text) and ``"class"`` entries.
        pre_token: Previous token; only its ``"class"`` entry is read here.
    """
    text = token["obj"]
    if not re.search(r"\d{1,4}", text) or find_room(token, pre_token):
        return ""

    if any_of_in(HOUSES_PREFIXES, text.lower()):
        return "h"
    if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", text):
        return "h"

    follows_street_or_house = (
        "s" in pre_token["class"]
        or "h" in pre_token["class"]
        or "s" in token["class"]
    )
    # Rules 3 and 4 are mutually exclusive on this flag, so compute it once.
    is_building = find_building(token, pre_token)
    if not follows_street_or_house and not is_building:
        return ""

    # Both remaining rules reject tokens carrying an ordinal ending.
    if any_of_in(("-я", "-й", "-Я"), text):
        return ""

    if not is_building:
        # Rule 3: context already holds here; only the edifice veto is left.
        return "" if find_edifice(token, pre_token) else "h"

    # Rule 4: the guard at the top guarantees at least one number is present.
    numbers = re.findall(r"\d{1,4}", text)
    if len(numbers) > 1 or int(numbers[0]) >= 10:
        return "h"
    return ""
|
||
|
||
|
||
def find_street(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"s"`` if the token looks like a street, else ``""``.

    Two rules are tried in order:

    1. ``any_of_in`` reports one of ``STREET_PREFIXES`` in the lower-cased
       text.
    2. The text contains a Cyrillic word of four or more letters, none of the
       entries of ``PREFIXES`` longer than two characters occurs in the
       lower-cased text, and the token's class does not already contain
       ``"d"``, ``"t"`` or ``"c"``.

    Args:
        token: Record with ``"obj"`` (text) and ``"class"`` entries.
        pre_token: Previous token; unused here, kept for the shared
            ``find_*`` signature.
    """
    text = token["obj"]
    lowered = text.lower()

    if any_of_in(STREET_PREFIXES, lowered):
        return "s"

    if not re.search(r"\b[А-Яа-я]{4,}\b", text):
        return ""

    # Generator form stops at the first hit instead of building a full list.
    has_known_marker = any(
        marker in lowered
        for group in PREFIXES
        for marker in group
        if len(marker) > 2
    )
    if has_known_marker:
        return ""

    current_class = token["class"]
    if "d" in current_class or "t" in current_class or "c" in current_class:
        return ""
    return "s"
|
||
|
||
|
||
def find_territory(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"t"`` if the token carries a territory marker, else ``""``.

    The lower-cased ``token["obj"]`` is checked against ``TERRITORY_PREFIXES``
    with ``any_of_in`` (helper from ``.utils``; presumably a substring test -
    confirm).

    Args:
        token: Record whose ``"obj"`` entry holds the token text.
        pre_token: Previous token; unused here, kept for the shared
            ``find_*`` signature.
    """
    if any_of_in(TERRITORY_PREFIXES, token["obj"].lower()):
        return "t"
    return ""
|
||
|
||
|
||
def find_countryside(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"c"`` if the token looks like a settlement, else ``""``.

    All of the following must hold:

    * ``any_of_in`` reports one of ``COUNTRYSIDE_PREFIXES`` in the lower-cased
      text;
    * the text contains a stand-alone abbreviation of one to three letters
      from the character set in the regex below, ending at a word boundary or
      at ``". "``;
    * ``find_house`` does not claim the token;
    * ``any_of_in`` reports none of ``STREET_PREFIXES`` in the lower-cased
      text.

    Args:
        token: Record with ``"obj"`` (text) and ``"class"`` entries.
        pre_token: Previous token; passed through to ``find_house``.
    """
    text = token["obj"]
    lowered = text.lower()

    if not any_of_in(COUNTRYSIDE_PREFIXES, lowered):
        return ""
    # NOTE(review): the last character of the class looks like Latin "c",
    # mirroring the entry in COUNTRYSIDE_PREFIXES - confirm it is intended.
    if not re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", text):
        return ""
    if find_house(token, pre_token):
        return ""
    if any_of_in(STREET_PREFIXES, lowered):
        return ""
    return "c"
|
||
|
||
|
||
def find_district(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"d"`` if the token carries a district marker, else ``""``.

    The lower-cased ``token["obj"]`` is checked against ``DISTRICTS_PREFIXES``
    with ``any_of_in`` (helper from ``.utils``; presumably a substring test -
    confirm).

    Args:
        token: Record whose ``"obj"`` entry holds the token text.
        pre_token: Previous token; unused here, kept for the shared
            ``find_*`` signature.
    """
    if any_of_in(DISTRICTS_PREFIXES, token["obj"].lower()):
        return "d"
    return ""
|
||
|
||
|
||
def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
    """Append class codes to ``token["class"]`` and return the token.

    Parenthesised text in ``token["obj"]`` is masked as ``"()"`` while the
    classifiers run, so its content cannot trigger any rule, and the original
    text is put back afterwards. The classifiers run in a fixed order
    (district, countryside, territory, street, house, building, edifice,
    litera, room); later ones read the codes appended by earlier ones, so the
    order must not change. If nothing matched, the class becomes ``"w"``.

    Args:
        token: Record with ``"obj"`` (text) and ``"class"`` (codes collected
            so far) entries. It is modified in place.
        pre_token: Previous token, passed to every classifier.

    Returns:
        The same ``token`` object with ``"class"`` updated.
    """
    original_text = token["obj"]
    has_brackets = re.search(r"\(.+\)", original_text) is not None
    if has_brackets:
        token["obj"] = re.sub(r"\(.+\)", "()", original_text)

    classifiers = (
        find_district,
        find_countryside,
        find_territory,
        find_street,
        find_house,
        find_building,
        find_edifice,
        find_litera,
        find_room,
    )
    for classify in classifiers:
        token["class"] += classify(token, pre_token)

    if token["class"] == "":
        token["class"] = "w"

    if has_brackets:
        # Restore the saved string rather than re.sub-ing the bracket text
        # back in: as a replacement template, a backslash inside the brackets
        # would be parsed as an escape (e.g. "\1" raises re.error), and with
        # several masked groups each one would receive the first group's text.
        token["obj"] = original_text

    return token
|