from __future__ import annotations

import re
from collections.abc import Sequence
from typing import Iterable, List, TypeVar

import pandas as pd

T = TypeVar("T")

# One-letter class codes, ordered from the coarsest address level to the
# finest.  The position in this tuple is what the ordering checks compare.
# "w" is assigned when no other class matched; "d", "c", "t", "s", "h", "b",
# "l" are produced by find_district / find_countryside / find_territory /
# find_street / find_house / find_building / find_litera, "r" by find_room.
CLASSES = ("w", "d", "c", "t", "s", "h", "b", "l", "r")

# Substrings (matched against the lower-cased token) that mark each class.
DISTRICTS_PREFIXES = ("мо ", "р-н", "городское", "лесхоз")
# NOTE(review): the "c" entry below (and in the find_countryside regex) looks
# like a Latin letter; confirm whether a Cyrillic letter was intended.
COUNTRYSIDE_PREFIXES = (
    "г", "п", "д", "гп", "рп", "кп", "пгт", "c", "хутор", " урочище",
)
TERRITORY_PREFIXES = (
    "тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ",
    "т/б ", "садоводство", "массив", "хозя", "сад-во",
)
STREET_PREFIXES = (
    " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл",
    " просп", " туп", "шоссе", "лини", "аллея", "мост", " парк", "кольцо",
    "проезд", "съезд", "переулок", "ул.", "бул.", "пр.", "ш.", "пер.",
    "дор.", "маг.", "наб.", "пл.", "просп.", "туп.",
)
HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом")
BUILDING_PREFIXES = ("к.", "корп", "стр.", "строение", "корпус")
LETTER = ("лит.", "литера", " л.")


def unfold_house_ranges(token: str) -> List[str]:
    """Expand ``A-B`` numeric ranges in *token* into one string per number.

    For every ``digits-digits`` pair with ``B > A`` the whole token is
    repeated once per number in ``A..B`` (inclusive), with every range in the
    token replaced by that number.  A pair with ``B <= A`` is not a range:
    every ``-`` in the token is then turned into ``/``.  If nothing was
    expanded, the (possibly rewritten) token is returned as a single item.
    """
    range_pattern = r"([\d]+-[\d]+)"
    addresses: List[str] = []
    for pair_string in re.findall(range_pattern, token):
        a, b = map(int, pair_string.split("-"))
        if b > a:
            addresses += [
                re.sub(range_pattern, str(number), token)
                for number in range(a, b + 1)
            ]
        else:
            token = token.replace("-", "/")
    if not addresses:
        addresses.append(token)
    return addresses


def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True if at least one of *substrings* occurs in *string*."""
    return any(substring in string for substring in substrings)


def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate the lists in *arr* into one new list (linear time)."""
    flat: List[T] = []
    for chunk in arr:
        flat.extend(chunk)
    return flat


def find_room(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"r"`` if the token text contains ``пом`` / ``пом.``, else ``""``.

    NOTE(review): not called by address_classification in this file.
    """
    if re.search(r"пом\.?", token['obj']):
        return "r"
    return ""


def find_litera(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"l"`` if the token looks like a building letter, else ``""``.

    Matches either an explicit letter marker / a number immediately followed
    by a Cyrillic letter, or a trailing single Cyrillic letter when the
    previous token was a house or a letter and the token is neither a
    " ш" street nor a countryside token.
    """
    if any_of_in(LETTER, token['obj'].lower()) \
            or re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token['obj']):
        return "l"
    if (re.search(r"\b([А-Я]|[а-я]){1}$", token['obj'])
            and ("l" in pre_token['class'] or "h" in pre_token['class'])) \
            and not (" ш" in token["obj"]) \
            and not find_countryside(token, pre_token):
        return "l"
    return ""


def find_building(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"b"`` if the token (which must contain a digit) is a building."""
    if re.search(r"\d", token['obj']):
        if any_of_in(BUILDING_PREFIXES, token['obj'].lower()) \
                or "b" in pre_token['class'] and not ("h" in token['class']) \
                or re.search(r"к\.* ?\d", token['obj']):
            return "b"
    return ""


def find_house(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"h"`` if the token (which must contain a digit) is a house.

    A house is recognised by a house prefix, by a ``д``/``д.`` + number
    pattern, or by position: it follows a street/house token (or the token is
    itself already a street) and is neither an ordinal (``-я``/``-й``) nor a
    building.
    """
    if re.search(r"\d{1,4}", token['obj']):
        if any_of_in(HOUSES_PREFIXES, token['obj'].lower()):
            return "h"
        if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token['obj']):
            return "h"
        if ("s" in pre_token['class'] or "h" in pre_token['class']
                or "s" in token['class']) \
                and not any_of_in(("-я", "-й", "-Я"), token['obj']) \
                and not find_building(token, pre_token):
            return "h"
    return ""


def find_street(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"s"`` if the token has a street marker or an ``-ая`` word."""
    if any_of_in(STREET_PREFIXES, token['obj'].lower()) \
            or re.search(r"[а-я]+ая", token['obj']):
        return "s"
    return ""


def find_territory(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"t"`` if the token contains a territory marker, else ``""``."""
    if any_of_in(TERRITORY_PREFIXES, token['obj'].lower()):
        return "t"
    return ""


def find_countryside(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"c"`` for a settlement token that is not a house or a street."""
    if any_of_in(COUNTRYSIDE_PREFIXES, token['obj'].lower()) \
            and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token['obj']) \
            and not find_house(token, pre_token) \
            and not find_street(token, pre_token):
        return "c"
    return ""


def find_district(token: pd.Series, pre_token: pd.Series) -> str:
    """Return ``"d"`` if the token contains a district marker, else ``""``."""
    if any_of_in(DISTRICTS_PREFIXES, token['obj'].lower()):
        return "d"
    return ""


def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
    """Append class codes to ``token["class"]`` and return the same token.

    The token is modified in place.  Text in parentheses is temporarily
    replaced by ``()`` so it cannot influence the classifiers, then restored.
    The finders run from the coarsest level to the finest; a token that
    matched nothing gets the class ``"w"``.
    """
    brackets = re.search(r"\(.+\)", token["obj"])
    if brackets:
        token["obj"] = re.sub(r"\(.+\)", "()", token["obj"])

    for finder in (
        find_district,
        find_countryside,
        find_territory,
        find_street,
        find_house,
        find_building,
        find_litera,
    ):
        token["class"] += finder(token, pre_token)

    if token['class'] == "":
        token['class'] = "w"

    if brackets:
        # Literal replacement: the bracket text is user data and must not be
        # interpreted as a regex replacement template (backslashes, \1, ...).
        token["obj"] = token["obj"].replace("()", brackets.group())
    return token


def cut_address(ad: pd.Series, cl: str) -> pd.Series:
    """Strip trailing levels of *ad* that are finer than class ``cl[0]``.

    *ad* is a Series with ``"address"`` and ``"class"`` entries and is
    modified in place.  While the last class code of *ad* ranks after
    ``cl[0]`` in CLASSES, the matching text (house number, building number,
    letter or room) is removed from the address and the code is dropped.
    Note that removing a house also lower-cases the whole address.
    """
    while ad["class"] and CLASSES.index(ad["class"][-1]) > CLASSES.index(cl[0]):
        last_class = ad["class"][-1]
        if last_class == "h":
            ad["address"] = re.sub(
                r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", ad["address"].lower()
            )
        elif last_class == "b":
            found = re.findall(r"к{0,1}\.? ?\d", ad["address"])
            # Remove the matched text literally ("." must not act as a regex
            # wildcard) and tolerate an address without a building number.
            if found:
                ad["address"] = ad["address"].replace(found[-1], "")
        elif last_class == "l":
            ad["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$", "", ad["address"])
        elif last_class == "r":
            ad["address"] = re.sub(r"пом\.? ?\d+", "", ad["address"])
        ad["class"] = ad["class"][:-1]
    return ad


def is_nonempty_str(string: str) -> bool:
    """Return True unless *string* is the empty string."""
    return string != ""


def create_token(obj: str = "", token_class: str = ""):
    """Build a token: a Series with the text in ``"obj"`` and codes in ``"class"``."""
    return pd.Series(
        {
            "obj": obj,
            "class": token_class,
        }
    )


class AddressSplitter(Sequence):
    """Split one free-form address string into a sequence of full addresses.

    The input is cut into comma/semicolon separated tokens, each token is
    classified, and tokens are accumulated into an address as long as their
    levels come in coarse-to-fine order.  When a token breaks that order the
    accumulated address is emitted and the accumulator is rewound to the
    level of the new token.  The instance behaves as a read-only sequence of
    the resulting strings; if nothing was produced it holds the input itself.
    """

    addresses: list[str]
    tokens: list[pd.Series]

    def __init__(self, address: str):
        self.input = address
        self.addresses = self.split()

        if not self.addresses:
            self.addresses = [address]

    # Sequence abstract methods implementation

    def __getitem__(self, key: int | slice):
        """Return the address(es) at *key*; raises IndexError when out of range."""
        return self.addresses[key]

    def __len__(self):
        return len(self.addresses)

    # Address token class manipulations

    def next_class(self) -> str:
        """Return the first remaining class code of the current token."""
        return self.token["class"][0]

    def correct_order(self) -> bool:
        """True if the current token is finer than the accumulator's last level.

        Always False when the accumulator holds only an unclassified ("w")
        token.
        """
        prev_class = self.accumulator["class"][-1]

        return (
            CLASSES.index(prev_class) < CLASSES.index(self.next_class())
            and self.accumulator["class"] != "w"
        )

    def next_class_is(self, comparing_class: str) -> bool:
        """True if the token's next class code equals ``comparing_class[0]``."""
        return len(self.token["class"]) > 0 and self.next_class() == comparing_class[0]

    def pop_token_class(self):
        """Drop the first class code of the current token."""
        self.token["class"] = self.token["class"][1:]

    def has_no_class(self, comparing_class: str) -> bool:
        """True if ``comparing_class[0]`` is absent from the accumulator's classes."""
        return comparing_class[0] not in self.accumulator["class"]

    def next_is_street_or_upper(self) -> bool:
        """True if the token is street level or coarser, or the accumulator is unclassified."""
        is_unknown_class = self.accumulator["class"] in ("", "w")

        return (
            CLASSES.index(self.next_class()) <= CLASSES.index("s")
            or is_unknown_class
        )

    # Accumulator manipulation

    def substitue_house(self) -> str:
        """Return the accumulated address with its house number(s) replaced by the token's."""
        num = re.findall(r"\d{1,4} ?\/?\d* ?", self.token["obj"])[0]

        return re.sub(r"\d{1,4} ?\/*\d* ?", num, self.accumulator["address"])

    def append_building(self, num: str) -> pd.Series:
        """Append ``к.<num>`` to the accumulator, mark it with "b" and return it."""
        self.accumulator["class"] += "b"
        self.accumulator["address"] += "к." + num

        return self.accumulator

    def substitue_building(self, num: str) -> str:
        """Return the accumulated address with its trailing digit replaced by *num*."""
        return re.sub(r"\d$", num, self.accumulator["address"])

    def insert_building(self):
        """Add the token's last digit as a building, or replace the existing one."""
        number = re.findall(r"\d", self.token["obj"])[-1]

        if number and self.has_no_class("building"):
            self.accumulator = self.append_building(number)
        else:
            self.accumulator["address"] = self.substitue_building(number)

    def without_letter(self) -> str:
        """Return the stripped accumulated address minus a trailing Cyrillic letter."""
        return re.sub(r"[А-Яа-я]$", "", self.accumulator["address"].strip())

    def substitue_letter(self, letter: str) -> str:
        """Return the accumulated address with its trailing letter replaced by *letter*."""
        address_without_letter = self.without_letter()

        return address_without_letter + letter

    def insert_letter(self):
        """Put the token's last Cyrillic letter at the end of the accumulated address."""
        letter = re.findall(r"[А-Яа-я]", self.token["obj"].strip())[-1]
        self.accumulator["address"] = self.substitue_letter(letter)

        if letter and self.has_no_class("litera"):
            self.accumulator["class"] += "l"

    def has_letter_in(self) -> bool:
        """True if the accumulated address has a number directly followed by a letter."""
        return (
            re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"])
            is not None
        )

    # Data preprocessing

    def split_tokens(self) -> list[pd.Series]:
        """Cut the input on "," / ";" into stripped, non-empty, unclassified tokens."""
        pieces = self.input.replace(";", ",").split(",")
        stripped = (piece.strip() for piece in pieces)

        return [create_token(piece, "") for piece in stripped if is_nonempty_str(piece)]

    def split(self):
        """Classify the tokens and assemble them into a list of address strings."""
        self.tokens = self.split_tokens()
        result = []

        self.accumulator = pd.Series({"address": "", "class": ""})

        prev_token = create_token()

        for cursor in self.tokens:
            self.token = address_classification(cursor, prev_token)
            # Keep a snapshot: the class codes of self.token are popped below.
            prev_token = self.token.copy()

            if self.accumulator["class"] == "":
                self.accumulator = self.token.rename({"obj": "address"})
                continue

            if self.correct_order():
                # Finer level than what we have: extend the current address.
                self.accumulator["address"] += " "
                self.accumulator += self.token.rename({"obj": "address"})
            else:
                # Same or coarser level: emit what was accumulated, then
                # rewind the accumulator to the level of the new token.
                unfolded_address = unfold_house_ranges(self.accumulator["address"])
                self.accumulator["address"] = unfolded_address[-1]

                result.extend(unfolded_address)

                self.accumulator = cut_address(self.accumulator, self.token["class"])

                if self.next_is_street_or_upper():
                    self.accumulator = self.token.rename({"obj": "address"})

                if self.next_class_is("house"):
                    self.accumulator["address"] = self.substitue_house()
                    self.pop_token_class()

                if self.next_class_is("building"):
                    self.insert_building()
                    self.pop_token_class()

                if self.next_class_is("letter"):
                    self.insert_letter()
                elif self.has_letter_in():
                    self.accumulator["address"] = self.without_letter()

        result.extend(unfold_house_ranges(self.accumulator["address"]))

        return result


def split_pesoch_res(address: str) -> List[str]:
    """Split a string that lists several addresses, each led by a region name.

    Commas become spaces, the text is cut on the region markers, and every
    marker is joined with the text that follows it.  Duplicates are dropped
    while keeping first-seen order.  If no pair can be formed the original
    string is returned as the only item.
    """
    t = re.sub(r",", " ", address)
    t = re.split(r"(Санкт-Петербург|Ленинградская обл|Л\.О)", t)
    t = [part.strip() for part in t if part != ""]
    tokens = [t[i] + " " + t[i + 1] for i in range(0, len(t) - 1, 2)]

    if tokens:
        # dict.fromkeys de-duplicates deterministically (insertion order).
        return list(dict.fromkeys(tokens))
    return [address]


def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Return a copy of *row* whose "Улица" cell holds the split addresses.

    A null cell becomes ``[None]``.  Rows whose "РЭС" is "Песочинский РЭС"
    are split with split_pesoch_res, all others with AddressSplitter.
    """
    row = row.copy()

    if pd.isnull(row["Улица"]):
        row["Улица"] = [None]
    else:
        if row["РЭС"] == "Песочинский РЭС":
            addresses = split_pesoch_res(row["Улица"])
        else:
            addresses = AddressSplitter(row["Улица"])
        row["Улица"] = addresses

    return row


def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Split the "Улица" column of *df* and return one row per address."""
    merged_df = df.apply(process_row, axis=1).reset_index()

    return merged_df.explode("Улица", ignore_index=True)