from __future__ import annotations

import re
from collections.abc import Sequence
from itertools import chain
from typing import Iterable, List, TypeVar

import pandas as pd

# Splits free-form strings that list several addresses into individual
# addresses.  Every comma-separated token gets a "class" string built from
# the one-letter codes below, and tokens are merged into a running
# "accumulator" address while their classes keep going down the hierarchy.

T = TypeVar("T")

# Class codes, ordered from the broadest to the narrowest address part.
# Codes as produced by the find_* functions below:
#   w - nothing recognised, d - district, c - countryside, t - territory,
#   s - street, h - house, b - building, e - edifice, l - letter, r - room.
CLASSES = ("w", "d", "c", "t", "s", "h", "b", "e", "l", "r")

DISTRICTS_PREFIXES = ("мо ", "р-н", "городское", "лесхоз")
# NOTE(review): the "c" entry below (and the "c" inside the regex of
# find_countryside) looks like a Latin "c", not Cyrillic "с" - confirm whether
# that is intentional before changing it, since it affects classification.
COUNTRYSIDE_PREFIXES = (
    "г", "п", "д", "гп", "рп", "кп", "пгт", "c", "хутор", " урочище",
)
TERRITORY_PREFIXES = (
    "тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ",
    "т/б ", "садоводство", "массив", "хозя", "сад-во",
)
STREET_PREFIXES = (
    " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл",
    " просп", " туп", "шоссе", "лини", "аллея", "мост", " парк", "кольцо",
    "проезд", "съезд", "переулок", "ул.", "бул.", "пр.", "ш.", "пер.",
    "дор.", "маг.", "наб.", "пл.", "просп.", "туп.",
)
HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом", "дома")
BUILDING_PREFIXES = ("к.", "к ", "корп", "корпус")
EDIFICE_PREFIXES = ("стр.", "строение")
LETTER = ("лит.", "литера", " л.")
PREFIXES = (
    DISTRICTS_PREFIXES,
    COUNTRYSIDE_PREFIXES,
    TERRITORY_PREFIXES,
    STREET_PREFIXES,
    HOUSES_PREFIXES,
    BUILDING_PREFIXES,
    EDIFICE_PREFIXES,
    LETTER,
)

# Suffixes of ordinal ("numbered") street names such as "2-я".
_ORDINAL_SUFFIXES = ("-я", "-й", "-Я")


def unfold_house_ranges(token: str) -> List[str]:
    """Expand ``N-M`` number ranges found in *token* into separate addresses.

    For every ``digits-digits`` pair with ``M > N`` one address per number in
    ``N..M`` is produced (each with all ranges in the token replaced by that
    number).  A pair that is not ascending is treated as a fraction and
    rewritten as ``N/M``.  If no ascending pair exists, the (possibly
    rewritten) token is returned as the only element.
    """
    addresses: List[str] = []
    for pair_string in re.findall(r"([\d]+-[\d]+)", token):
        start, end = map(int, pair_string.split("-"))
        if end > start:
            addresses += [
                re.sub(r"([\d]+-[\d]+)", number, token)
                for number in map(str, range(start, end + 1))
            ]
        else:
            # Rewrite only this pair; replacing every "-" in the token would
            # also damage names such as "1-я линия".
            token = token.replace(pair_string, pair_string.replace("-", "/"))
    if not addresses:
        addresses.append(token)
    return addresses


def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True if at least one of *substrings* occurs in *string*."""
    return any(substring in string for substring in substrings)


def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate the lists in *arr* into a single list."""
    return list(chain.from_iterable(arr))


def find_room(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "r" if the token text contains the room marker "пом", else ""."""
    if re.search(r"\bпом\.?", token["obj"]):
        return "r"
    return ""


def find_litera(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "l" if the token carries a building letter, else ""."""
    if find_room(token, pre_token):
        return ""
    obj = token["obj"]
    # Explicit letter prefix, or a number immediately followed by a letter.
    if any_of_in(LETTER, obj.lower()) or re.search(
        r"\d{1,3}([А-Я]|[а-я])( |$)", obj
    ):
        return "l"
    # A lone trailing letter counts only right after a letter/house token.
    if (
        (
            re.search(r"\b([А-Я]|[а-я]){1}$", obj)
            and ("l" in pre_token["class"] or "h" in pre_token["class"])
        )
        and (" ш" not in obj)
        and not find_countryside(token, pre_token)
    ):
        return "l"
    return ""


def find_edifice(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "e" if the token text contains an edifice prefix, else ""."""
    if any_of_in(EDIFICE_PREFIXES, token["obj"].lower()):
        return "e"
    return ""


def find_building(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "b" if the token looks like a building number, else ""."""
    obj = token["obj"]
    if re.search(r"\d", obj) and not find_room(token, pre_token):
        if (
            any_of_in(BUILDING_PREFIXES, obj.lower())
            or (
                "b" in pre_token["class"]
                and ("h" not in token["class"])
                and not find_edifice(token, pre_token)
            )
            or re.search(r"к\.* ?\d", obj)
        ):
            return "b"
    return ""


def find_house(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "h" if the token looks like a house number, else ""."""
    obj = token["obj"]
    if re.search(r"\d{1,4}", obj) and not find_room(token, pre_token):
        if any_of_in(HOUSES_PREFIXES, obj.lower()):
            return "h"
        if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", obj):
            return "h"
        # A bare number after a street/house token, unless it is an ordinal
        # street name, a building or an edifice.
        if (
            (
                "s" in pre_token["class"]
                or "h" in pre_token["class"]
                or "s" in token["class"]
            )
            and not any_of_in(_ORDINAL_SUFFIXES, obj)
            and not find_building(token, pre_token)
            and not find_edifice(token, pre_token)
        ):
            return "h"
        if find_building(token, pre_token) and not any_of_in(
            _ORDINAL_SUFFIXES, obj
        ):
            # A building token that holds several numbers, or whose first
            # number has two or more digits, also carries a house number.
            if len(re.findall(r"\d{1,4}", obj)) > 1:
                return "h"
            if int(re.search(r"\d{1,4}", obj).group()) // 10 > 0:
                return "h"
    return ""


def find_street(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "s" if the token looks like a street name, else ""."""
    obj = token["obj"]
    if any_of_in(STREET_PREFIXES, obj.lower()):
        return "s"
    # Fallback: a word of 4+ Cyrillic letters with no known prefix longer
    # than two characters, in a token not already classified as d/t/c.
    if (
        re.search(r"\b[А-Яа-я]{4,}\b", obj)
        and not any(
            el in obj.lower() for pr in PREFIXES for el in pr if len(el) > 2
        )
        and not (
            "d" in token["class"]
            or "t" in token["class"]
            or "c" in token["class"]
        )
    ):
        return "s"
    return ""


def find_territory(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "t" if the token text contains a territory prefix, else ""."""
    if any_of_in(TERRITORY_PREFIXES, token["obj"].lower()):
        return "t"
    return ""


def find_countryside(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "c" if the token looks like a settlement name, else ""."""
    obj = token["obj"]
    if (
        any_of_in(COUNTRYSIDE_PREFIXES, obj.lower())
        and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", obj)
        and not find_house(token, pre_token)
        and not any_of_in(STREET_PREFIXES, obj.lower())
    ):
        return "c"
    return ""


def find_district(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "d" if the token text contains a district prefix, else ""."""
    if any_of_in(DISTRICTS_PREFIXES, token["obj"].lower()):
        return "d"
    return ""


def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
    """Fill ``token["class"]`` with the class codes detected in ``token["obj"]``.

    The token is modified in place and returned.  Text in parentheses is
    hidden from the detectors and restored afterwards.  A token in which
    nothing was recognised gets the class "w".
    """
    brackets = re.search(r"\(.+\)", token["obj"])
    if brackets:
        token["obj"] = re.sub(r"\(.+\)", "()", token["obj"])

    # Order matters: later detectors read the classes appended by earlier ones.
    for detector in (
        find_district,
        find_countryside,
        find_territory,
        find_street,
        find_house,
        find_building,
        find_edifice,
        find_litera,
        find_room,
    ):
        token["class"] += detector(token, pre_token)

    if token["class"] == "":
        token["class"] = "w"

    if brackets:
        # Plain replace: the bracket text must not be parsed as a regex
        # replacement template (backslashes in it would be misinterpreted).
        token["obj"] = token["obj"].replace("()", brackets.group())
    return token


def cut_address(ad: pd.Series, cl: str) -> pd.Series:
    """Strip trailing parts of *ad* that are narrower than class ``cl[0]``.

    While the last class code of ``ad["class"]`` comes after ``cl[0]`` in
    CLASSES, the matching text is removed from ``ad["address"]`` and the code
    is dropped.  *ad* is modified in place and returned.
    """
    while ad["class"] and CLASSES.index(ad["class"][-1]) > CLASSES.index(cl[0]):
        last = ad["class"][-1]
        if last == "h":
            # Note: this branch also lower-cases the whole address.
            ad["address"] = re.sub(
                r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?",
                "",
                ad["address"].lower(),
            )
        elif last == "b":
            num = re.findall(r"к{0,1}\.? ?\d", ad["address"])[-1]
            # Literal removal: the matched text may contain ".", which must
            # not act as a regex wildcard.
            ad["address"] = ad["address"].replace(num, "")
        elif last == "e":
            ad["address"] = re.sub(r"стр\.? ?\d", "", ad["address"])
        elif last == "l":
            ad["address"] = re.sub(
                r"[литера]*\.? ?[А-Яа-я]{1}$", "", ad["address"]
            )
        elif last == "r":
            ad["address"] = re.sub(r"пом\.? ?\d+", "", ad["address"])
        ad["class"] = ad["class"][:-1]
    return ad


def is_valid_token(string: str) -> bool:
    """Return False for empty tokens and bare "уг." / "д." markers."""
    return string not in ("", "уг.", "д.")


def create_token(obj: str = "", token_class: str = "") -> pd.Series:
    """Build a token Series with the keys "obj" (text) and "class" (codes)."""
    return pd.Series(
        {
            "obj": obj,
            "class": token_class,
        }
    )


class AddressSplitter(Sequence):
    """Sequence of individual addresses parsed from one combined string.

    The input is split on "," and ";", every token is classified, and tokens
    are accumulated into a running address.  When a token breaks the class
    order, the accumulated address is emitted and partially reused for the
    next one.
    """

    def __init__(self, address: str):
        self.input = address
        self.addresses = self.split()

    ## Sequence abstract methods implementation

    def __getitem__(self, key):
        # Delegating to the list keeps IndexError for bad indices and
        # additionally supports slices.
        return self.addresses[key]

    def __len__(self):
        return len(self.addresses)

    ## Address token class manipulations

    def next_class(self) -> str:
        """Return the first class code of the current token."""
        return self.token["class"][0]

    def correct_order(self) -> bool:
        """Tell whether the current token continues the accumulated address."""
        prev_class = self.accumulator["class"][-1]
        return (
            CLASSES.index(prev_class) < CLASSES.index(self.next_class())
            and self.accumulator["class"] != "w"
        )

    def next_class_is(self, comparing_class: str) -> bool:
        """Compare the token's first class code with ``comparing_class[0]``."""
        return len(self.token["class"]) > 0 and self.next_class() == comparing_class[0]

    def has_no_class(self, comparing_class: str) -> bool:
        """Tell whether the accumulator lacks the code ``comparing_class[0]``."""
        return comparing_class[0] not in self.accumulator["class"]

    def pop_token_class(self):
        """Drop the first class code of the current token."""
        self.token["class"] = self.token["class"][1:]

    ## Accumulator constrains

    def next_is_street_or_upper(self) -> bool:
        """True if the token starts at street level or broader.

        Also true when the accumulator has no recognised class.
        """
        is_unknown_class = self.accumulator["class"] in ("", "w")
        return (
            CLASSES.index(self.next_class()) <= CLASSES.index("s")
            or is_unknown_class
        )

    def has_numbered_street(self) -> bool:
        """Tell whether the accumulated address contains an ordinal suffix."""
        return any_of_in(_ORDINAL_SUFFIXES, self.accumulator["address"])

    ## Accumulator manipulation
    # House

    def substitue_house(self) -> str:
        """Return the accumulated address with the token's house number.

        With a numbered street the first number found belongs to the street
        name, so the second one is replaced.  If the accumulator holds no
        number at that position, the address is returned unchanged.
        """
        house_regex = re.compile(r"\d{1,4} ?[\/\-]?\d* ?")
        number = house_regex.findall(self.token["obj"])[0]
        house_number_index = 1 if self.has_numbered_street() else 0
        address = self.accumulator["address"]
        found = house_regex.findall(address)
        if len(found) <= house_number_index:
            return address
        return address.replace(found[house_number_index], number)

    # Building

    def append_building(self, number: str) -> pd.Series:
        """Append "к.<number>" to the accumulator and mark it with "b"."""
        self.accumulator["class"] += "b"
        self.accumulator["address"] += "к." + number
        return self.accumulator

    def substitue_building(self, number: str) -> str:
        """Return the accumulated address with its last digit replaced."""
        return re.sub(r"\d$", number, self.accumulator["address"])

    def insert_building(self):
        """Add or replace the building number using the current token.

        NOTE(review): only the last single digit of the token is used.
        """
        number = re.findall(r"\d", self.token["obj"])[-1]
        if number and self.has_no_class("building"):
            self.accumulator = self.append_building(number)
        else:
            self.accumulator["address"] = self.substitue_building(number)

    # Edifice

    def substitue_edifice(self, number: str) -> str:
        """Return the stripped address with its "стр. N" part replaced."""
        return re.sub(r"стр\. ?\d", number, self.accumulator["address"].strip())

    def insert_edifice(self):
        """Replace the edifice part using the current token, marking "e"."""
        number = re.findall(r"стр\.? ?\d", self.token["obj"])[-1]
        self.accumulator["address"] = self.substitue_edifice(number)
        if number and self.has_no_class("edifice"):
            self.accumulator["class"] += "e"

    # Letter

    def without_letter(self) -> str:
        """Return the stripped address without a trailing Cyrillic letter."""
        return re.sub(r"[А-Яа-я]$", "", self.accumulator["address"].strip())

    def substitue_letter(self, letter: str) -> str:
        """Return the address with its trailing letter replaced by *letter*."""
        address_without_letter = self.without_letter()
        return address_without_letter + letter

    def insert_letter(self):
        """Set the address letter from the current token, marking "l"."""
        letter = re.findall(r"[А-Яа-я]", self.token["obj"])[-1]
        self.accumulator["address"] = self.substitue_letter(letter)
        if letter and self.has_no_class("litera"):
            self.accumulator["class"] += "l"

    def has_letter_in(self) -> bool:
        """Tell whether the address has a number directly followed by a letter."""
        return bool(
            re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"])
        )

    # Room

    def substitue_room(self, number: str) -> str:
        """Return the stripped address with its "пом. N" part replaced."""
        return re.sub(
            r"пом\. ?\d\-?\d*\w?", number, self.accumulator["address"].strip()
        )

    def insert_room(self):
        """Replace the room part using the current token, marking "r"."""
        number = re.findall(r"пом\. ?\-?\d*\w?", self.token["obj"])[-1]
        self.accumulator["address"] = self.substitue_room(number)
        if number and self.has_no_class("room"):
            self.accumulator["class"] += "r"

    ## Data preprocessing

    def split_tokens(self) -> list[pd.Series]:
        """Split the input on "," / ";" into unclassified token Series."""
        address = self.input.replace(";", ",")
        parts = (part.strip() for part in address.split(","))
        return [create_token(part, "") for part in parts if is_valid_token(part)]

    def split(self):
        """Walk the tokens and return the list of individual addresses."""
        self.tokens = self.split_tokens()
        result = []
        self.accumulator = pd.Series({"address": "", "class": ""})
        prev_token = create_token()

        for cursor in self.tokens:
            self.token = address_classification(cursor, prev_token)
            # Copy: the token's class string is consumed below.
            prev_token = self.token.copy()

            if self.accumulator["class"] == "":
                self.accumulator = self.token.rename({"obj": "address"})
                continue

            if self.correct_order():
                # The token narrows the current address: append text and codes.
                self.accumulator["address"] += " "
                self.accumulator += self.token.rename({"obj": "address"})
            else:
                # The token starts a new address: emit the current one, then
                # reuse its broader part for the next address.
                unfolded_address = unfold_house_ranges(self.accumulator["address"])
                self.accumulator["address"] = unfolded_address[-1]
                result.extend(unfolded_address)

                self.accumulator = cut_address(self.accumulator, self.token["class"])

                if self.next_is_street_or_upper():
                    self.accumulator = self.token.rename({"obj": "address"})

                if self.next_class_is("house"):
                    self.accumulator["address"] = self.substitue_house()
                    self.pop_token_class()

                if self.next_class_is("building"):
                    self.insert_building()
                    self.pop_token_class()

                if self.next_class_is("edifice"):
                    self.insert_edifice()
                    self.pop_token_class()

                if self.next_class_is("letter"):
                    self.insert_letter()
                elif self.has_letter_in():
                    self.accumulator["address"] = self.without_letter()

                if self.next_class_is("room"):
                    self.insert_room()
                    self.pop_token_class()

        result.extend(unfold_house_ranges(self.accumulator["address"]))
        return result


def split_pesoch_res(address: str) -> List[str]:
    """Split an address string on region names into "<region> <rest>" items.

    Commas are replaced by spaces, the string is split on the region names
    (kept as separate parts) and consecutive parts are joined pairwise.
    Duplicates are removed, keeping first-occurrence order.  If no pair can
    be formed, the original string is returned as the only element.
    """
    text = re.sub(r",", " ", address)
    parts = re.split(r"(Санкт-Петербург|Ленинградская обл|Л\.О)", text)
    parts = [part.strip() for part in parts if part != ""]
    tokens = [
        parts[i] + " " + parts[i + 1] for i in range(0, len(parts) - 1, 2)
    ]
    if tokens:
        # dict.fromkeys de-duplicates deterministically (a set does not
        # guarantee any order).
        return list(dict.fromkeys(tokens))
    return [address]


def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Return a copy of *row* whose "Улица" value is a sequence of addresses.

    A null value becomes ``[None]``; rows whose "РЭС" is "Песочинский РЭС"
    are split with split_pesoch_res, all others with AddressSplitter.
    """
    row = row.copy()

    if pd.isnull(row["Улица"]):
        row["Улица"] = [None]
        return row

    if row["РЭС"] == "Песочинский РЭС":
        addresses = split_pesoch_res(row["Улица"])
    else:
        addresses = AddressSplitter(row["Улица"])
    row["Улица"] = addresses
    return row


def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Split the "Улица" column of *df* into one row per address."""
    merged_df = df.apply(process_row, axis=1).reset_index()
    return merged_df.explode("Улица", ignore_index=True)