from __future__ import annotations

import re
from typing import Iterable, List, TypeVar

import pandas as pd

T = TypeVar("T")

# Abbreviations that mark a token as a street; they are looked up as plain
# substrings of the lowercased token, so the match is a heuristic.
STREET_PREFIXES = (
    "ул.",
    "бул.",
    "пр.",
    "ул",
    "бул",
    "пр",
    "ш.",
    "ш",
    "пер.",
    "пер",
)
HOUSES_PREFIXES = ("д.", "д")

# Compiled once at import time instead of on every call.
_HOUSE_RANGE_RE = re.compile(r"([\d]+-[\d]+)")
_HOUSE_NUMBER_RE = re.compile(r"(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )")


def unfold_house_ranges(token: str) -> str:
    """Expand every ascending ``a-b`` number range in *token*.

    ``"д. 1-3"`` becomes ``"д. 1, 2, 3"``. A range whose upper bound is not
    greater than its lower bound is left untouched.

    Args:
        token: Address fragment that may contain numeric ranges.

    Returns:
        The fragment with each ascending range replaced by a
        comma-separated list of all numbers in the range (inclusive).
    """

    def _expand(match: "re.Match[str]") -> str:
        low_text, high_text = match.group(0).split("-")
        low, high = int(low_text), int(high_text)
        if high > low:
            return ", ".join(map(str, range(low, high + 1)))
        return match.group(0)

    # Substituting at the matched position (rather than str.replace on the
    # matched text) keeps a short range such as "1-2" from also rewriting the
    # middle of a longer one such as "21-23".
    return _HOUSE_RANGE_RE.sub(_expand, token)


def unfold_houses_list(token: str) -> List[str]:
    """Split a "street, house, house, ..." fragment into one entry per house.

    Ranges are expanded first. If the fragment contains more than one house
    number, it is split on commas: the first piece is taken as the street and
    every following non-blank piece is appended to it with a single space.

    Args:
        token: Address fragment for a single street.

    Returns:
        One string per house, or ``[token]`` (after range expansion) when
        there is at most one house number or nothing to split off.
    """
    token = unfold_house_ranges(token)
    if len(_HOUSE_NUMBER_RE.findall(token)) <= 1:
        return [token]

    street, *house_tokens = token.split(",")
    # Blank pieces come from trailing or doubled commas and would otherwise
    # yield an entry that is just the street name followed by a space.
    addresses = [
        street + " " + house_token
        for house_token in house_tokens
        if house_token.strip()
    ]
    # Several numbers but no comma-separated houses: keep the fragment as is
    # instead of returning an empty list and losing the address.
    return addresses or [token]


def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True if at least one of *substrings* occurs in *string*."""
    return any(substring in string for substring in substrings)


def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate the lists in *arr* into a single new list, in order."""
    return [item for sublist in arr for item in sublist]


def split_address(address: str) -> List[str]:
    """Split a multi-address string into individual address strings.

    * If *address* contains ``;``, each ``;``-separated part is passed to
      :func:`unfold_houses_list`.
    * Otherwise, if it contains ``,``, the comma-separated pieces are grouped
      so that a new group starts whenever a piece containing a street prefix
      follows a group that already contains one; each group is then passed to
      :func:`unfold_houses_list`.
    * Otherwise the string is returned as the only element.

    Args:
        address: Raw address text.

    Returns:
        List of address strings.
    """
    if ";" in address:
        return flatten(map(unfold_houses_list, address.split(";")))

    if "," not in address:
        return [address]

    # The capturing group keeps the commas as separate tokens, so they end up
    # in the accumulated group for unfold_houses_list to split on.
    tokens = [part.strip() for part in re.split(r"(,)", address) if part != ""]

    res: List[str] = []
    accumulator = ""
    for token in tokens:
        starts_new_street = any_of_in(STREET_PREFIXES, token.lower())
        if starts_new_street and any_of_in(STREET_PREFIXES, accumulator.lower()):
            res += unfold_houses_list(accumulator)
            accumulator = ""
        accumulator += token
    res += unfold_houses_list(accumulator)
    return res


def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Return a copy of *row* whose ``"Улица"`` value is a list of addresses.

    A null value becomes ``[None]``; any other value is replaced with the
    result of :func:`split_address`. The input row is not modified.
    """
    row = row.copy()
    street = row["Улица"]
    row["Улица"] = [None] if pd.isnull(street) else split_address(street)
    return row


def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Return a frame with one row per individual address in ``"Улица"``.

    Every row is processed with :func:`process_row`, the index is reset
    (which adds it as a column), and the ``"Улица"`` lists are exploded into
    separate rows with a fresh integer index.
    """
    merged_df = df.apply(process_row, axis=1).reset_index()
    return merged_df.explode("Улица", ignore_index=True)