lenengro_parser/parser/address.py
2023-10-16 00:42:22 +03:00

186 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import re
from typing import Iterable, List, TypeVar
import pandas as pd
# Generic element type used in the annotations of flatten().
T = TypeVar("T")

# Token class codes. find_street / find_house / find_building / find_litera
# return "s" / "h" / "b" / "l" respectively.
CLASSES = ("s", "h", "b", "l", "?")

# Settlement markers. Only referenced from commented-out code in split_address.
# NOTE(review): the lone "c" entry looks like a Latin letter - confirm whether
# the Cyrillic "с" was intended.
SETTLEMENTS_PREFIXES = (
    "г",
    "мо",
    "р",
    "п",
    "д",
    "гп",
    "c",
    "хутор",
    "массив",
    "тер",
    "СНТ",
    "СТ",
    "ДСК",
    "ДНП",
    "ДПК",
    "НП",
    "садоводство",
)

# Substrings that make find_street classify a token as a street.
STREET_PREFIXES = (
    # abbreviations preceded by a space
    " ул",
    " бул",
    " пр",
    " ш",
    " пер",
    " дор",
    " маг",
    " наб",
    " пл",
    " просп",
    " туп",
    # full words
    "шоссе",
    "линия",
    "аллея",
    "мост",
    "парк",
    "кольцо",
    "проезд",
    # abbreviations followed by a dot
    "ул.",
    "бул.",
    "пр.",
    "ш.",
    "пер.",
    "дор.",
    "маг.",
    "наб.",
    "пл.",
    "просп.",
    "туп.",
)

# Substrings that make find_house classify a token as a house.
HOUSES_PREFIXES = (
    "д.",
    "уч.",
    "участок",
)

# Substrings that make find_building classify a token as a building.
BUILDING_PREFIXES = (
    "к.",
    "корп",
    "стр.",
    "строение",
)

# Substrings that make find_litera classify a token as a letter ("litera").
LETTER = (
    "лит.",
    "литера",
    " л.",
)
def unfold_house_ranges(address: str, token: str) -> List[str]:
    """Expand numeric ``N-M`` house ranges in *token* into separate addresses.

    Every ``N-M`` pair found in *token* is handled in order:

    * ``M > N``: the pair is removed from the token and one address is
      produced per number in ``N..M`` (inclusive), the number being appended
      after the remaining token text.
    * otherwise the pair is not an ascending range, so its hyphen is rewritten
      as a slash (``10-5`` -> ``10/5``) and a single address is produced.

    Args:
        address: Text placed, followed by one space, in front of every result.
        token: House token that may contain ``N-M`` pairs.

    Returns:
        The expanded addresses, or ``[address + " " + token]`` when the token
        contains no ``N-M`` pair.
    """
    addresses: List[str] = []
    for pair in re.findall(r"\d+-\d+", token):
        low, high = (int(part) for part in pair.split("-"))
        if high > low:
            token = token.replace(pair, "")
            addresses.extend(
                f"{address} {token}{number}" for number in range(low, high + 1)
            )
        else:
            # Rewrite only the matched pair so that hyphens elsewhere in the
            # token (e.g. inside a street name) are left alone.
            token = token.replace(pair, pair.replace("-", "/"))
            # append(), not ``+=``: ``list += str`` would add single characters.
            addresses.append(f"{address} {token}")
    if not addresses:
        addresses.append(f"{address} {token}")
    return addresses
def unfold_houses_list(token: str) -> List[str]:
    """Expand a comma-separated house list into one address per house.

    When *token* contains more than one house-number match and at least one
    comma, the text before the first comma is used as the shared prefix and
    every following comma-separated piece becomes its own address. Otherwise
    the whole token is treated as a single house. In both cases ``N-M``
    ranges are expanded through ``unfold_house_ranges``.

    NOTE(review): the previous body called ``unfold_house_ranges`` with a
    single argument and then ran a regex over the returned list, so it could
    not run; the behaviour above is the presumed intent - confirm.

    Args:
        token: Address fragment, e.g. a street followed by house numbers.

    Returns:
        The expanded addresses; ``[token]`` when nothing could be extracted.
    """
    house_pattern = re.compile(r"(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )")
    pieces = token.split(",")
    if len(house_pattern.findall(token)) > 1 and len(pieces) > 1:
        prefix, houses = pieces[0].strip(), pieces[1:]
    else:
        prefix, houses = "", [token]

    addresses: List[str] = []
    for house in houses:
        house = house.strip()
        if not house:
            continue
        # unfold_house_ranges joins prefix and house with a space; strip the
        # leading space that appears when the prefix is empty.
        addresses.extend(
            item.strip() for item in unfold_house_ranges(prefix, house)
        )
    return addresses or [token]
def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True when at least one of *substrings* occurs inside *string*."""
    for candidate in substrings:
        if candidate in string:
            return True
    return False
def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate the lists in *arr* into one new flat list.

    Args:
        arr: Iterable of lists (any iterable of iterables is accepted).

    Returns:
        A new list with the items of every inner list, in order.
    """
    # A single pass avoids the repeated list copying of ``sum(arr, [])``,
    # which re-creates the growing result for every inner list.
    return [item for chunk in arr for item in chunk]
def find_litera(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"l"`` when *token* looks like a building letter, else ``""``.

    A token qualifies when its ``'obj'`` text contains one of the ``LETTER``
    markers, or 1-3 digits directly followed by a Cyrillic letter and then a
    space or the end of the text. It also qualifies when the text holds a
    stand-alone Cyrillic letter and the ``'class'`` of *pre_token* already
    contains ``"l"``.
    """
    text = token['obj']

    has_marker = any_of_in(LETTER, text)
    if has_marker or re.search(r"\d{1,3}[А-Яа-я]( |$)", text):
        return "l"

    # doesn't work (original author's note)
    lone_letter = re.search(r"\b[А-Яа-я]{1}\b", text)
    if lone_letter and "l" in pre_token['class']:
        return "l"

    return ""
def find_building(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"b"`` when *token* looks like a building/corpus, else ``""``.

    A token qualifies when its ``'obj'`` text:

    * contains one of the ``BUILDING_PREFIXES`` markers, or
    * contains a digit while the ``'class'`` of *pre_token* contains ``"b"``, or
    * contains ``к`` followed by optional dots and a digit.
    """
    text = token['obj']

    if any_of_in(BUILDING_PREFIXES, text):
        return "b"
    # The original ``or`` chain listed this digit/previous-class test twice;
    # the second copy could never change the outcome, so it is checked once.
    if re.search(r"\d", text) and "b" in pre_token['class']:
        return "b"
    if re.search(r"к\.*\d", text):
        return "b"
    return ""
def find_house(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"h"`` when *token* looks like a house number, else ``""``.

    A token qualifies when its ``'obj'`` text contains one of the
    ``HOUSES_PREFIXES`` markers, or when it contains 1-3 digits and the
    ``'class'`` of *pre_token* contains ``"s"`` or ``"h"``.
    """
    text = token['obj']

    if any_of_in(HOUSES_PREFIXES, text):
        return "h"

    numbered = re.search(r"(д|д\.) ?\d{1,3} ?\/*\d* ?", text)
    # NOTE(review): as written, ``"" in text`` is true for every string, so
    # this guard is always false and the branch below never runs. The literal
    # may originally have held a character that was lost - confirm.
    if numbered and not ("" in text):
        after_house = "h" in pre_token['class']
        if after_house or "s" in pre_token['class'] or "s" in token['class']:
            return "h"

    # doesn't work (original author's note)
    if re.search(r"\d{1,3}", text):
        if "s" in pre_token['class'] or "h" in pre_token['class']:
            return "h"

    return ""
def find_street(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"s"`` when *token* looks like a street, else ``""``.

    A token qualifies when its ``'obj'`` text contains one of the
    ``STREET_PREFIXES`` markers, or when it contains an upper-case Cyrillic
    letter followed by lower-case ones and the ``'class'`` of *pre_token*
    contains ``"s"``.
    """
    text = token['obj']

    if any_of_in(STREET_PREFIXES, text):
        return "s"

    capitalised = re.search(r"[А-Я]{1}[а-я]+", text)
    return "s" if capitalised and "s" in pre_token['class'] else ""
# TODO: rework the chain of ifs into a cleaner structure and add classification of numbers/letters
def split_address(address: str) -> List[str]:
    """Split an address string with several comma-separated parts.

    Semicolons are treated as commas. A string without any comma is returned
    unchanged as a one-element list. Otherwise the string is split on commas,
    empty pieces are dropped, the rest are stripped and stored in a DataFrame
    with the columns ``obj`` (text) and ``class`` (initially empty). Each row
    is then passed to find_street, find_house, find_building and find_litera
    in that order; for every non-empty result the class letter is added to
    ``cur_tk["class"]`` and the token is either appended to ``accumulator`` or,
    when the previous row's class already has that letter, ``accumulator`` is
    pushed to ``res`` first.

    Returns:
        ``res`` - the accumulator values pushed during the loop - or
        ``[address]`` when the string has no comma.

    NOTE(review), to be confirmed before relying on this function:
      * the text left in ``accumulator`` after the last token is never added
        to ``res``;
      * ``cur_tk`` is taken with ``tokens.iloc[i]`` and then modified, while
        later reads go through ``tokens`` - whether those writes reach
        ``tokens`` depends on pandas view/copy behaviour;
      * for ``i == 0`` the lookups ``iloc[i - 1]`` address the last row;
      * the patterns using ``\d{,3}`` can match an empty string, so the
        ``[-1]`` element of ``findall`` is presumably the empty match;
      * token texts are concatenated without a separator;
      * the same token is appended once per matching classifier;
      * ``print(cur_tk)`` looks like leftover debug output.
    """
    if ";" in address:
        address = address.replace(";", ",")
    if "," in address:
        tokens = address.split(",")
        t = list(map(str.strip, filter(lambda token: token != "", tokens)))
        # put the tokens into a DataFrame
        tokens = pd.DataFrame()
        tokens['obj'] = t
        tokens.insert(len(tokens.columns), "class", "")
        res = []
        accumulator = ""
        for i in range(len(tokens)):
            # TODO: settlements
            # if any_of_in(SETTLEMENTS_PREFIXES, tokens[i].lower())
            # accumulator += tokens[i]
            cur_tk = tokens.iloc[i]
            # The first token has no predecessor: use an empty placeholder.
            if i == 0:
                pre_token = pd.Series(data=["", ""], index=['obj', 'class'])
            else:
                pre_token = tokens.iloc[i - 1]
            # Street: a street following a street starts a new address.
            obj_class = find_street(cur_tk, pre_token)
            if obj_class:
                cur_tk["class"] += obj_class
                if "s" in tokens['class'].iloc[i - 1]:
                    res.append(accumulator)
                    accumulator = ""
                accumulator += tokens["obj"].iloc[i]
            # House: a house following a house pushes the current address and
            # substitutes a number taken from this token into the accumulator.
            obj_class = find_house(cur_tk, pre_token)
            if obj_class:
                cur_tk["class"] += obj_class
                if "h" in tokens['class'].iloc[i - 1]:
                    res.append(accumulator)
                    num = re.findall("\d{,3}", tokens['obj'].iloc[i])[-1]
                    accumulator = re.sub(r"\d{,3} ?\/*\d* ?", num,accumulator)
                else:
                    accumulator += tokens["obj"].iloc[i]
            # Building: same scheme, replacing the trailing digit.
            obj_class = find_building(cur_tk, pre_token)
            if obj_class:
                cur_tk["class"] += obj_class
                if "b" in tokens['class'].iloc[i - 1]:
                    res.append(accumulator)
                    num = re.findall("\d", tokens['obj'].iloc[i])[-1]
                    accumulator = re.sub(r"\d$", num, accumulator)
                else:
                    accumulator += tokens["obj"].iloc[i]
            # Letter: same scheme, replacing the trailing letter.
            obj_class = find_litera(cur_tk, pre_token)
            if obj_class:
                cur_tk["class"] += obj_class
                if "l" in tokens['class'].iloc[i - 1]:
                    res.append(accumulator)
                    num = re.findall("[А-яа-я]", tokens['obj'].iloc[i].strip())[-1]
                    accumulator = re.sub(r"[А-яа-я]$", num, accumulator)
                else:
                    accumulator += tokens["obj"].iloc[i]
            # Tokens that no classifier recognised are marked "w".
            if cur_tk['class'] == "":
                cur_tk['class'] = "w"
            print(cur_tk)
        return res
    return [address]
def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Return a copy of *row* whose ``"Улица"`` cell holds a list of addresses.

    A null cell becomes ``[None]``; any other value is replaced with the
    result of ``split_address`` for that value. The input row is not modified.
    """
    result = row.copy()
    street = result["Улица"]
    result["Улица"] = [None] if pd.isnull(street) else split_address(street)
    return result
def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Apply ``process_row`` to every row and emit one row per address.

    The row-wise result gets its index reset (the old index is kept as a
    column) and is then exploded on the ``"Улица"`` column with a fresh index.
    """
    processed = df.apply(process_row, axis=1)
    processed = processed.reset_index()
    return processed.explode("Улица", ignore_index=True)