1
0
2023-10-13 10:00:04 +03:00

124 lines
4.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import re
from typing import Iterable, List, TypeVar
import pandas as pd
# Generic element type used in the signature of ``flatten``.
T = TypeVar("T")

# Settlement markers. Currently referenced only from a TODO inside
# ``split_address``; no active code in this module reads this tuple.
# NOTE(review): ``split_address`` lower-cases tokens before matching, so the
# upper-case entries below would need the same treatment to ever match.
SETTLEMENTS_PREFIXES = (
    "г",
    "мо",
    "р",
    "п",
    "д",
    "гп",
    "с",  # Cyrillic "с" (U+0441); a Latin "c" homoglyph never matches Russian text
    "хутор",
    "массив",
    "тер",
    "СНТ",
    "СТ",
    "ДСК",
    "ДНП",
    "ДПК",
    "НП",
    "садоводство",
)

# Street markers; ``split_address`` looks for them as substrings of a
# lower-cased token.
STREET_PREFIXES = (
    "ул",
    "бул",
    "пр",
    "ш",
    "пер",
    "дор",
    "маг",
    "наб",
    "пл",
    "просп",
    "туп",
    "аллея",
    "мост",
    "парк",
    "кольцо",
    "проезд",
)

# House / plot markers (substring match, see ``split_address``).
HOUSES_PREFIXES = ("д.", "д", "уч", "участок")

# Building ("корпус") markers (substring match, see ``split_address``).
BUILDING_PREFIXES = ("к", "корп")

# Letter ("литера") markers (substring match, see ``split_address``).
LETTER = ("лит", "литера")
def unfold_house_ranges(address: str, token: str) -> List[str]:
    """Expand ``<digits>-<digits>`` pairs found in ``token`` into addresses.

    Each pair is handled in the order it appears in ``token``:

    * when the second number is greater than the first, the pair is treated
      as an inclusive range: its text is removed from ``token`` and one
      address per number is produced as ``address + " " + token + number``;
    * otherwise every ``-`` in ``token`` is replaced with ``/`` and a single
      address ``address + " " + token`` is produced.

    Args:
        address: Text placed in front of the house token.
        token: House token that may contain number pairs, e.g. ``"д. 1-3"``.

    Returns:
        The produced addresses. When ``token`` contains no pair, a
        one-element list holding ``address + " " + token``.
    """
    addresses: List[str] = []
    for pair_string in re.findall(r"([\d]+-[\d]+)", token):
        first, last = (int(part) for part in pair_string.split("-"))
        if last > first:
            # The range text is dropped and each number is appended to what
            # is left of the token (typically just the house prefix).
            token = token.replace(pair_string, "")
            addresses.extend(
                f"{address} {token}{number}" for number in range(first, last + 1)
            )
        else:
            token = token.replace("-", "/")
            # Must be ``append``: ``list += str`` would add the address one
            # character at a time.
            addresses.append(f"{address} {token}")
    if not addresses:
        addresses.append(f"{address} {token}")
    return addresses
# A house number, optionally introduced by "д"/"д.", followed by a comma,
# a space or the end of the string.
_HOUSE_NUMBER_RE = re.compile(r"(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )")
# A ``<digits>-<digits>`` pair, the shape ``unfold_house_ranges`` expands.
_HOUSE_RANGE_RE = re.compile(r"\d+-\d+")


def unfold_houses_list(token: str) -> List[str]:
    """Turn ``"<prefix>, <house>, <house>, ..."`` into one address per house.

    The text before the first comma is used as the common prefix. Every
    following non-empty comma-separated part is passed, together with that
    prefix, to ``unfold_house_ranges`` so that plain house numbers and
    number ranges are handled the same way.

    The token is only split when it looks like it lists several houses
    (more than one house-number match, or a number range) and it actually
    contains a comma-separated tail; otherwise it is returned unchanged.

    Args:
        token: Address fragment to expand.

    Returns:
        The expanded addresses, or ``[token]`` when there is nothing to expand.
    """
    parts = [part.strip() for part in token.split(",")]
    prefix = parts[0]
    houses = [part for part in parts[1:] if part]

    lists_several_houses = (
        len(_HOUSE_NUMBER_RE.findall(token)) > 1
        or _HOUSE_RANGE_RE.search(token) is not None
    )
    # Without a comma-separated tail there is nothing to attach the prefix
    # to; return the token as is instead of an empty list.
    if not houses or not lists_several_houses:
        return [token]

    addresses: List[str] = []
    for house in houses:
        addresses.extend(unfold_house_ranges(prefix, house))
    return addresses
def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True when at least one of ``substrings`` occurs inside ``string``.

    Args:
        substrings: Candidate fragments to look for.
        string: Text that is searched.

    Returns:
        ``True`` if any candidate is a substring of ``string``; ``False``
        otherwise, including when ``substrings`` is empty.
    """
    return any(candidate in string for candidate in substrings)
def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate the inner lists of ``arr`` into one new list.

    Built with a single pass instead of ``sum(arr, [])``, which copies the
    growing result on every addition and is quadratic in the total length.

    Args:
        arr: Iterable of lists (any inner iterable is accepted).

    Returns:
        A new list with the items of every inner list, in order.
    """
    return [item for chunk in arr for item in chunk]
# TODO: rework the if-chain into a cleaner design and classify digits/letters
def split_address(address: str) -> List[str]:
    """Split a comma/semicolon separated address string into address entries.

    Semicolons are treated as commas. A string without any comma is returned
    as a single-element list. Otherwise the string is cut at commas, empty
    pieces are dropped, the rest are stripped, and the pieces are folded left
    to right into a pending list. Each piece is classified by substring
    match against its lower-cased text (streets first, then houses,
    buildings, letters):

    * street: if the pending text already contains a street marker, the
      pending entry is emitted and a new one is started;
    * house: if the last pending piece contains a house marker, the pending
      entry is emitted and that piece is dropped; the pending list is then
      replaced by the result of ``unfold_house_ranges``;
    * building / letter: if the last pending piece contains a marker of the
      same kind, the pending entry is emitted and that piece is dropped; the
      new piece is then appended;
    * anything else is appended to the pending list.

    Whatever is pending at the end is emitted as the last entry.

    Args:
        address: Raw address text.

    Returns:
        The address entries, each built by joining pending pieces with a
        single space.
    """
    address = address.replace(";", ",")
    if "," not in address:
        return [address]

    parts = [part.strip() for part in address.split(",") if part != ""]
    finished: List[str] = []
    pending: List[str] = []

    for part in parts:
        lowered = part.lower()
        # TODO: settlements (SETTLEMENTS_PREFIXES) are not recognised yet.

        # streets
        if any_of_in(STREET_PREFIXES, lowered):
            if pending and any_of_in(STREET_PREFIXES, "".join(pending).lower()):
                finished.append(" ".join(pending))
                pending = []
            pending.append(part)
            continue

        # houses
        if any_of_in(HOUSES_PREFIXES, lowered):
            if pending and any_of_in(HOUSES_PREFIXES, pending[-1].lower()):
                finished.append(" ".join(pending))
                pending.pop()
            # NOTE(review): when a range unfolds into several addresses they
            # all stay in ``pending`` and are later joined with spaces into
            # one entry -- confirm whether separate entries are intended.
            pending = unfold_house_ranges(" ".join(pending), part)
            continue

        # building, then letter: both follow the same "replace the previous
        # piece of the same kind" rule
        for markers in (BUILDING_PREFIXES, LETTER):
            if any_of_in(markers, lowered):
                if pending and any_of_in(markers, pending[-1].lower()):
                    finished.append(" ".join(pending))
                    pending.pop()
                break
        pending.append(part)

    finished.append(" ".join(pending))
    return finished
def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Return a copy of ``row`` whose "Улица" cell holds a list of addresses.

    Args:
        row: A row that has an "Улица" entry.

    Returns:
        A copy of ``row`` where "Улица" is ``[None]`` if the original value
        was null, otherwise the list returned by ``split_address`` for it.
        The input row is not modified.
    """
    street = row["Улица"]
    updated = row.copy()
    updated["Улица"] = [None] if pd.isnull(street) else split_address(street)
    return updated
def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Give every address found in the "Улица" column its own row.

    Each row is passed through ``process_row``, the index is reset (the old
    index is kept as a column by ``reset_index``), and the "Улица" lists are
    exploded into separate rows with a fresh 0..n-1 index.

    Args:
        df: Frame with an "Улица" column.

    Returns:
        A new frame with one row per address entry.
    """
    per_row = df.apply(process_row, axis=1)
    indexed = per_row.reset_index()
    return indexed.explode("Улица", ignore_index=True)