lenengro_parser/parser/address.py
2023-10-16 22:02:12 +03:00

233 lines
8.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import re
from typing import Iterable, List, TypeVar
import pandas as pd
# Generic element type for the list helpers in this module.
T = TypeVar("T")

# One-letter class codes. The find_* classifiers in this module return
# "d", "c", "t", "s", "h", "b", "l" and "r"; split_address assigns "w" to a
# token that none of them matched.
CLASSES = ("d", "c", "t", "s", "h", "b", "l", "r", "w")

# All *_PREFIXES tuples are matched as plain substrings of the lower-cased
# token text (see any_of_in), so leading/trailing spaces inside the entries
# are significant.

# NOTE(review): "р" matches every token that merely contains that letter -
# confirm this is intended.
DISTRICTS_PREFIXES = ("мо ", "р")

# A comma was missing after " урочище", which made Python concatenate it with
# the following "г." literal into the single entry " урочищег."; both entries
# are now separate.
# NOTE(review): the " c" / "c." entries appear to use a Latin "c" - if the
# Cyrillic letter was intended they will not match Cyrillic text; confirm.
COUNTRYSIDE_PREFIXES = (
    " г", " п", " д", " гп", " рп", " кп", " пгт", " c", "хутор", " урочище",
    "г.", "п.", "д.", "гп.", "рп.", "кп.", "пгт.", "c.",
)

TERRITORY_PREFIXES = (
    "тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ",
    "т/б ", "садоводство", "массив", "хоз", "сад-во", "с-во",
)

STREET_PREFIXES = (
    " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл",
    " просп", " туп", "шоссе", "линия", "аллея", "мост", " парк", "кольцо",
    "проезд", "съезд",
    "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.",
    "просп.", "туп.",
)

HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп")

BUILDING_PREFIXES = ("к.", "корп", "стр.", "строение", "корпус")

LETTER = ("лит.", "литера", " л.")
def unfold_house_ranges(address: str, token: str) -> List[str]:
    """Expand ``a-b`` number ranges found in *token* into separate addresses.

    For every ``<digits>-<digits>`` pair in *token*:

    * if the second number is greater than the first, the pair is removed
      from *token* and one address is produced per number in the inclusive
      range;
    * otherwise the pair is not treated as a range: every ``-`` in *token*
      is replaced with ``/`` and a single address is produced.

    Args:
        address: Text placed in front of every produced entry.
        token: House token that may contain one or more ``a-b`` pairs.

    Returns:
        A list of ``address + " " + ...`` strings; when *token* contains no
        pair at all the list holds the single string ``address + " " + token``.
    """
    adresses: List[str] = []
    pairs_strings = re.findall(r"([\d]+-[\d]+)", token)
    for pair_string in pairs_strings:
        a, b = map(int, pair_string.split("-"))
        if b > a:
            # Drop the range text, then append each number of the range to
            # what is left of the token.
            token = token.replace(pair_string, "")
            adresses += [
                address + " " + token + number
                for number in map(str, range(a, b + 1))
            ]
        else:
            token = token.replace("-", "/")
            # append, not "+=": extending a list with a str would add the
            # string one character at a time.
            adresses.append(address + " " + token)
    if not adresses:
        adresses.append(address + " " + token)
    return adresses
def unfold_houses_list(token: str) -> List[str]:
    """Split a comma-separated token into one string per trailing piece.

    When the house-number pattern below matches more than once, the token is
    split on commas and the first piece is prefixed (with a space) to each of
    the remaining pieces; otherwise the token is returned as a one-element
    list.

    NOTE(review): ``unfold_house_ranges`` as defined in this module takes
    ``(address, token)`` and returns a list, so the one-argument call below
    raises ``TypeError``, and its result could not be fed to ``re.findall``
    or ``.split`` anyway. This function looks stale relative to that helper -
    confirm the intended behaviour before using it.
    """
    token = unfold_house_ranges(token)
    # Optional "д"/"д." marker, a number, optional letters or "/", an optional
    # second number, then a comma, end of string, or a space.
    reg = re.compile(r"(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )")
    if len(re.findall(reg, token)) > 1:
        tokens = token.split(",")
        # tokens[0] itself is not returned on its own, only as a prefix.
        return [*[tokens[0] + " " + house_token for house_token in tokens[1:]]]
    return [token]
def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True when at least one of *substrings* occurs inside *string*.

    An empty *substrings* iterable yields False.
    """
    for candidate in substrings:
        if candidate in string:
            return True
    return False
def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate the lists in *arr* into one new list, preserving order.

    Built with a single pass instead of ``sum(arr, [])``, which copies the
    growing result for every inner list and is therefore quadratic.

    Args:
        arr: Iterable of lists to concatenate.

    Returns:
        A new list holding all elements of all inner lists, in order.
    """
    return [item for chunk in arr for item in chunk]
def find_room(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "r" when the token text contains the "пом" marker, else "".

    Args:
        token: Row with the token text under ``'obj'``.
        pre_token: Previous row; not used by this classifier.
    """
    has_room_marker = re.search(r"пом\.?", token['obj']) is not None
    return "r" if has_room_marker else ""
def find_litera(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "l" when the token looks like a building letter, else "".

    A token qualifies when either

    * its lower-cased text contains one of the ``LETTER`` markers, or
    * it contains 1-3 digits immediately followed by one Cyrillic letter and
      then a space or the end of the text, or
    * it ends with a stand-alone Cyrillic letter, the previous token was
      classified as "l" or "h", the text has no " ш" in it, and
      ``find_countryside`` does not claim the token.

    Args:
        token: Row with the token text under ``'obj'``.
        pre_token: Previous row; its ``'class'`` is consulted.
    """
    text = token['obj']
    if any_of_in(LETTER, text.lower()):
        return "l"
    if re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", text):
        return "l"

    # Context-dependent case: a lone trailing letter after a house or letter.
    if not re.search(r"\b([А-Я]|[а-я]){1}$", text):
        return ""
    previous_class = pre_token['class']
    if not ("l" in previous_class or "h" in previous_class):
        return ""
    if " ш" in text:
        return ""
    if find_countryside(token, pre_token):
        return ""
    return "l"
def find_building(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "b" when the token looks like a building/corpus part, else "".

    The token must contain a digit. It then qualifies when its lower-cased
    text contains one of ``BUILDING_PREFIXES``, or the previous token was
    classified "b" while this token is not already "h", or the text contains
    "к" (optionally followed by dots and a space) directly before a digit.

    Args:
        token: Row with the token text under ``'obj'`` and classes so far
            under ``'class'``.
        pre_token: Previous row; its ``'class'`` is consulted.
    """
    text = token['obj']
    if not re.search(r"\d", text):
        return ""
    if any_of_in(BUILDING_PREFIXES, text.lower()):
        return "b"
    if "b" in pre_token['class'] and "h" not in token['class']:
        return "b"
    if re.search(r"к\.* ?\d", text):
        return "b"
    return ""
def find_house(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "h" when the token looks like a house number, else "".

    The token must contain a digit. It then qualifies when its lower-cased
    text contains one of ``HOUSES_PREFIXES`` or it matches the
    "д"/"д." + number pattern. A further context-based rule (previous token
    is a street or house, or this token is already a street) is also present.

    NOTE(review): as visible here the exclusion tuple in the context rule
    holds three empty strings. ``any_of_in`` does a substring test and ""
    is a substring of every string, so that rule can never return "h" -
    confirm whether the intended characters were lost.

    Args:
        token: Row with the token text under ``'obj'`` and classes so far
            under ``'class'``.
        pre_token: Previous row; its ``'class'`` is consulted.
    """
    text = token['obj']
    if not re.search(r"\d{1,4}", text):
        return ""
    if any_of_in(HOUSES_PREFIXES, text.lower()):
        return "h"
    if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", text):
        return "h"

    street_or_house_context = (
        "s" in pre_token['class']
        or "h" in pre_token['class']
        or "s" in token['class']
    )
    if not street_or_house_context:
        return ""
    if any_of_in(("", "", ""), text):
        return ""
    if find_building(token, pre_token):
        return ""
    return "h"
def find_street(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "s" when the token looks like a street name, else "".

    A token qualifies when its lower-cased text contains one of
    ``STREET_PREFIXES`` or it contains a capitalised Cyrillic word part
    ending in "ая".

    Args:
        token: Row with the token text under ``'obj'``.
        pre_token: Previous row; not used by this classifier.
    """
    text = token['obj']
    if any_of_in(STREET_PREFIXES, text.lower()):
        return "s"
    if re.search(r"[А-Я]{1}[а-я]+ая", text):
        return "s"
    return ""
def find_territory(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "t" when the lower-cased token text contains a territory prefix.

    Args:
        token: Row with the token text under ``'obj'``.
        pre_token: Previous row; not used by this classifier.

    Returns:
        "t" on a ``TERRITORY_PREFIXES`` match, otherwise "".
    """
    lowered = token['obj'].lower()
    return "t" if any_of_in(TERRITORY_PREFIXES, lowered) else ""
def find_countryside(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "c" when the token names a settlement, else "".

    The lower-cased text must contain one of ``COUNTRYSIDE_PREFIXES`` and the
    token must not be recognised by ``find_house`` or ``find_street``.

    Args:
        token: Row with the token text under ``'obj'``.
        pre_token: Previous row; forwarded to the house/street classifiers.
    """
    if not any_of_in(COUNTRYSIDE_PREFIXES, token['obj'].lower()):
        return ""
    # House and street classification take priority over a settlement match.
    if find_house(token, pre_token):
        return ""
    if find_street(token, pre_token):
        return ""
    return "c"
def find_district(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "d" when the lower-cased token text contains a district prefix.

    Args:
        token: Row with the token text under ``'obj'``.
        pre_token: Previous row; not used by this classifier.

    Returns:
        "d" on a ``DISTRICTS_PREFIXES`` match, otherwise "".
    """
    lowered = token['obj'].lower()
    return "d" if any_of_in(DISTRICTS_PREFIXES, lowered) else ""
# TODO: rework the chain of ifs into a cleaner structure and add classification of numbers/letters
def split_address(address: str) -> List[str]:
    """Split a raw address line into a list of address strings.

    ``;`` is treated as ``,``. A line without any comma is returned unchanged
    as a one-element list. Otherwise the line is cut on commas, empty pieces
    are dropped, the rest are stripped, and every piece is run through the
    ``find_*`` classifiers in a fixed order (district, countryside,
    territory, street, house, building, litera). Matching pieces are glued
    onto an accumulator string; when a piece repeats a class that the
    previous piece already had, the accumulator is pushed to the result
    first.

    Args:
        address: Raw address text.

    Returns:
        The accumulated address strings, or ``[address]`` when there is no
        comma to split on.

    NOTE(review): points that look unintentional and should be confirmed:
      * whatever is left in the accumulator after the last piece is never
        added to the result;
      * pieces are concatenated without any separator;
      * the building branch appends ``pre_token["obj"]`` where every other
        branch appends the current piece;
      * each processed row is printed to stdout.
    NOTE(review): the block nesting below was reconstructed from a copy that
    had lost its indentation - verify it against the original file.
    """
    if ";" in address:
        address = address.replace(";", ",")
    if "," in address:
        tokens = address.split(",")
        t = list(map(str.strip, filter(lambda token: token != "", tokens)))
        # Put the tokens into a DataFrame: "obj" holds the text, "class" the
        # class letters collected so far (initially empty).
        tokens = pd.DataFrame()
        tokens['obj'] = t
        tokens.insert(len(tokens.columns), "class", "")
        res = []
        accumulator = ""
        for i in range(len(tokens)):
            # TODO: settlements
            # if any_of_in(SETTLEMENTS_PREFIXES, tokens[i].lower())
            # accumulator += tokens[i]
            cur_tk = tokens.iloc[i]
            if i == 0:
                # The first token has no predecessor: use an empty stand-in.
                pre_token = pd.Series(data=["", ""], index=['obj', 'class'])
            else:
                pre_token = tokens.iloc[i - 1]

            obj_class = find_district(cur_tk, pre_token)
            if obj_class:
                cur_tk["class"] += obj_class
                if "d" in pre_token['class']:
                    res.append(accumulator)
                    accumulator = ""
                accumulator += cur_tk["obj"]

            obj_class = find_countryside(cur_tk, pre_token)
            if obj_class:
                cur_tk["class"] += obj_class
                if "c" in pre_token['class']:
                    res.append(accumulator)
                    accumulator = ""
                accumulator += cur_tk["obj"]

            obj_class = find_territory(cur_tk, pre_token)
            if obj_class:
                cur_tk["class"] += obj_class
                if "t" in pre_token['class']:
                    res.append(accumulator)
                    accumulator = ""
                accumulator += cur_tk["obj"]

            obj_class = find_street(cur_tk, pre_token)
            if obj_class:
                cur_tk["class"] += obj_class
                if "s" in pre_token['class']:
                    res.append(accumulator)
                    accumulator = ""
                accumulator += cur_tk["obj"]

            obj_class = find_house(cur_tk, pre_token)
            if obj_class:
                cur_tk["class"] += obj_class
                if "h" in pre_token["class"]:
                    # Another house after a house: emit the current address,
                    # then substitute the new number into the accumulator.
                    res.append(accumulator)
                    # Raw string: "\d" in a plain literal is an invalid
                    # escape sequence.
                    num = re.findall(r"\d{1,4}", cur_tk['obj'])[-1]
                    accumulator = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator)
                else:
                    accumulator += cur_tk["obj"]

            obj_class = find_building(cur_tk, pre_token)
            if obj_class:
                cur_tk["class"] += obj_class
                if "b" in pre_token["class"]:
                    # Another building after a building: emit, then swap the
                    # trailing digit for the last digit of this token.
                    res.append(accumulator)
                    num = re.findall(r"\d", tokens['obj'].iloc[i])[-1]
                    accumulator = re.sub(r"\d$", num, accumulator)
                else:
                    accumulator += pre_token["obj"]

            obj_class = find_litera(cur_tk, pre_token)
            if obj_class:
                cur_tk["class"] += obj_class
                if "l" in pre_token["class"]:
                    # Another letter after a letter: emit, then swap the
                    # trailing letter for the last letter of this token.
                    res.append(accumulator)
                    num = re.findall("[А-яа-я]", cur_tk["obj"].strip())[-1]
                    accumulator = re.sub(r"[А-яа-я]$", num, accumulator)
                else:
                    accumulator += cur_tk["obj"]

            # Nothing matched: mark the token with the "w" class.
            if cur_tk['class'] == "":
                cur_tk['class'] = "w"
            # Write the row back so the next iteration sees the class of its
            # predecessor.
            tokens.iloc[i] = cur_tk
            print(tokens.iloc[i])
            # print(cur_tk)
        return res
    return [address]
def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Return a copy of *row* whose "Улица" cell holds a list of addresses.

    A null cell becomes ``[None]``; any other value is replaced by the result
    of ``split_address``. The input row itself is not modified.

    Args:
        row: Row containing an "Улица" entry.

    Returns:
        The modified copy of the row.
    """
    result = row.copy()
    street = result["Улица"]
    result["Улица"] = [None] if pd.isnull(street) else split_address(street)
    return result
def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Expand every row of *df* into one row per address in "Улица".

    Each row is passed through ``process_row``, the index is reset into a
    column, and the list-valued "Улица" column is exploded with a fresh
    integer index.

    Args:
        df: Frame with an "Улица" column.

    Returns:
        A new frame with one row per address.
    """
    per_row_lists = df.apply(process_row, axis=1)
    with_index_column = per_row_lists.reset_index()
    return with_index_column.explode("Улица", ignore_index=True)