from __future__ import annotations

# Utilities that split free-form address strings into one address per house
# and explode a DataFrame so that every address gets its own row.

import re
from itertools import chain
from typing import Iterable, List, Tuple, TypeVar

import pandas as pd

T = TypeVar("T")

# Markers used to recognise the kind of a comma-separated address token.
# NOTE(review): SETTLEMENTS_PREFIXES is not used by any function in this
# module yet; also confirm that its single-letter "c" entry is written in the
# intended alphabet.
SETTLEMENTS_PREFIXES = (
    "г", "мо", "р-н", "п", "д", "гп", "c", "хутор", "массив", "тер",
    "СНТ", "СТ", "ДСК", "ДНП", "ДПК", "НП", "садоводство",
)
STREET_PREFIXES = (
    "ул", "бул", "пр", "ш", "пер", "дор", "маг", "наб", "пл", "просп",
    "туп", "аллея", "мост", "парк", "кольцо", "проезд",
)
HOUSES_PREFIXES = ("д.", "д", "уч", "участок")
BUILDING_PREFIXES = ("к", "корп")
LETTER = ("лит", "литера")

# "<number>-<number>" inside a house token, e.g. the "1-5" in "д. 1-5".
_HOUSE_RANGE_RE = re.compile(r"([\d]+-[\d]+)")
# One house number, optionally preceded by the "д"/"д." marker.
_HOUSE_NUMBER_RE = re.compile(r"(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )")

# Token kinds produced by _classify().
_STREET = "street"
_HOUSE = "house"
_BUILDING = "building"
_LETTER = "letter"
_OTHER = "other"

# Checked in this order: the first group with a matching marker wins.
_KIND_PREFIXES = (
    (_STREET, STREET_PREFIXES),
    (_HOUSE, HOUSES_PREFIXES),
    (_BUILDING, BUILDING_PREFIXES),
    (_LETTER, LETTER),
)


def unfold_house_ranges(address: str, token: str) -> List[str]:
    """Expand numeric house ranges found in *token* into full addresses.

    Args:
        address: Address prefix placed before the house token. When it is
            empty the result carries no leading space.
        token: House token, e.g. ``"д. 1-3"``.

    Returns:
        One address per house number for every ascending range
        (``"д. 1-3"`` -> ``"... д. 1"``, ``"... д. 2"``, ``"... д. 3"``).
        A pair that is not ascending (``"5-3"``, ``"5-5"``) is not treated
        as a range: the dashes of the token are replaced by ``/`` and a
        single address is produced. A token without any ``N-M`` pair yields
        exactly one address.
    """
    prefix = f"{address} " if address else ""
    addresses: List[str] = []
    for pair in _HOUSE_RANGE_RE.findall(token):
        first, last = map(int, pair.split("-"))
        if last > first:
            # Drop the range text and append every number of the range.
            token = token.replace(pair, "")
            addresses.extend(
                f"{prefix}{token}{number}" for number in range(first, last + 1)
            )
        else:
            token = token.replace("-", "/")
            # append(), not "+=": extending a list with a str would add the
            # address character by character.
            addresses.append(prefix + token)
    if not addresses:
        addresses.append(prefix + token)
    return addresses


def unfold_houses_list(token: str) -> List[str]:
    """Split ``"<prefix>, <house>, <house>, ..."`` into one address per house.

    The part before the first comma is used as the common prefix of every
    following house; numeric ranges inside each house are expanded with
    :func:`unfold_house_ranges`. When *token* contains at most one house
    number, or has no comma-separated houses to split, it is returned
    unchanged as a single-element list.
    """
    if len(_HOUSE_NUMBER_RE.findall(token)) <= 1:
        return [token]
    prefix, *houses = (piece.strip() for piece in token.split(","))
    houses = [house for house in houses if house]
    if not houses:
        return [token]
    return flatten(unfold_house_ranges(prefix, house) for house in houses)


def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True if at least one of *substrings* occurs inside *string*."""
    return any(substring in string for substring in substrings)


def flatten(arr: Iterable[Iterable[T]]) -> List[T]:
    """Concatenate the nested iterables of *arr* into a single flat list."""
    return list(chain.from_iterable(arr))


def _classify(token: str) -> str:
    """Return the kind of *token* based on the first marker group it contains."""
    lowered = token.lower()
    for kind, prefixes in _KIND_PREFIXES:
        if any_of_in(prefixes, lowered):
            return kind
    return _OTHER


def _render_addresses(parts: List[Tuple[str, str]]) -> List[str]:
    """Join ``(kind, text)`` parts into addresses, expanding house ranges."""
    addresses = [""]
    for kind, text in parts:
        if kind == _HOUSE:
            addresses = flatten(
                unfold_house_ranges(current, text) for current in addresses
            )
        else:
            addresses = [
                f"{current} {text}" if current else text for current in addresses
            ]
    return addresses


# TODO: rework the if-based classification into a proper structure and
# classify bare numbers/letters.
# TODO: settlement tokens (SETTLEMENTS_PREFIXES) are not handled yet.
def split_address(address: str) -> List[str]:
    """Split a compound address string into individual addresses.

    ``;`` is treated like ``,``. Every comma-separated token is classified
    as street, house, building, letter or other by substring match against
    the marker tuples above. Tokens are accumulated into the current
    address; the current address is emitted when

    * a street token arrives while a street is already accumulated (the
      accumulated parts are then discarded), or
    * a house/building/letter token directly follows a token of the same
      kind (only that previous token is replaced).

    House ranges such as ``"д. 1-3"`` produce one address per house.

    Args:
        address: Raw address text.

    Returns:
        The list of addresses. A string without ``,`` or ``;`` is returned
        unchanged as a single-element list.
    """
    address = address.replace(";", ",")
    if "," not in address:
        return [address]

    # Strip first, then drop empties, so whitespace-only tokens disappear.
    tokens = [token for token in map(str.strip, address.split(",")) if token]

    result: List[str] = []
    parts: List[Tuple[str, str]] = []
    for token in tokens:
        kind = _classify(token)
        if kind == _STREET:
            if any(part_kind == _STREET for part_kind, _ in parts):
                result.extend(_render_addresses(parts))
                parts = []
        elif kind != _OTHER and parts and parts[-1][0] == kind:
            # Same level repeated: emit the current address and swap the
            # last component for the new one.
            result.extend(_render_addresses(parts))
            parts.pop()
        parts.append((kind, token))

    result.extend(_render_addresses(parts))
    return result


def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Return a copy of *row* whose ``"Улица"`` value is a list of addresses.

    A null value becomes ``[None]``; any other value is passed through
    :func:`split_address`. The input row is not modified.
    """
    # astype(object) returns a new Series and guarantees that a list can be
    # stored as a single element.
    row = row.astype(object)
    if pd.isnull(row["Улица"]):
        row["Улица"] = [None]
    else:
        row["Улица"] = split_address(row["Улица"])
    return row


def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Explode *df* so that every address in ``"Улица"`` gets its own row.

    Each row is processed with :func:`process_row`; the previous index is
    moved into a column by ``reset_index()`` and the exploded frame gets a
    fresh index.
    """
    merged_df = df.apply(process_row, axis=1).reset_index()
    return merged_df.explode("Улица", ignore_index=True)