lenengro_parser/parser/address.py

88 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import re
from typing import Iterable, List, TypeVar
import pandas as pd
# Generic element type used in the signature of ``flatten``.
T = TypeVar("T")
# Lowercase street-type abbreviations, each with and without a trailing dot
# (presumably "улица", "бульвар", "проспект"/"проезд", "шоссе", "переулок" --
# confirm with the data owner). ``split_address`` looks for them as substrings
# of a lowercased token to decide where a new street starts.
STREET_PREFIXES = ("ул.", "бул.", "пр.", "ул", "бул", "пр", "ш.", "ш", "пер.", "пер")
# House-number abbreviations (presumably "дом"). NOTE(review): not referenced
# anywhere in the visible part of this file.
HOUSES_PREFIXES = ("д.", "д")
def unfold_house_ranges(token: str) -> str:
    """Expand ascending numeric ranges such as ``"3-6"`` into ``"3, 4, 5, 6"``.

    Every ``<digits>-<digits>`` span whose right bound is greater than its
    left bound is replaced by the ``", "``-joined list of all integers in the
    range, both bounds included. Spans that are not ascending (``"5-5"``,
    ``"7-2"``) are left untouched.

    Args:
        token: Address fragment that may contain house-number ranges.

    Returns:
        The fragment with every ascending range expanded in place.
    """

    def expand(match: "re.Match[str]") -> str:
        """Return the expanded form of one matched ``a-b`` span."""
        text = match.group(0)
        start, end = (int(bound) for bound in text.split("-"))
        if end <= start:
            return text
        return ", ".join(map(str, range(start, end + 1)))

    # Substitute at the matched position rather than with str.replace on the
    # whole string: replace() would also rewrite the same text where it is
    # only part of another range (e.g. "1-2" inside "21-23").
    return re.sub(r"\d+-\d+", expand, token)
def unfold_houses_list(token: str) -> List[str]:
    """Split a "street, house, house, ..." fragment into one address per house.

    Numeric ranges are expanded first with ``unfold_house_ranges``. If the
    fragment then contains more than one house-number-like match, the text
    before the first comma is treated as the street and every following
    comma-separated piece as a house; each house is returned as
    ``"<street> <house>"``. Otherwise the range-expanded fragment is returned
    unchanged as a single-element list.

    Args:
        token: Address fragment describing a single street.

    Returns:
        A non-empty list of address strings.
    """
    token = unfold_house_ranges(token)
    house_pattern = r"(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )"
    if len(re.findall(house_pattern, token)) <= 1:
        return [token]

    street, *house_parts = token.split(",")
    street = street.strip()
    # Blank pieces come from a trailing or doubled comma and are not houses.
    # Stripping also removes the extra space left by ", "-joined ranges.
    houses = [part.strip() for part in house_parts if part.strip()]
    if not houses:
        # Several numbers matched but nothing follows a comma
        # (e.g. "ул. 1 Мая 5"): keep the fragment rather than returning an
        # empty list and silently dropping the address.
        return [token]
    return [f"{street} {house}" for house in houses]
def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Tell whether ``string`` contains at least one of ``substrings``.

    Args:
        substrings: Candidate substrings to look for.
        string: Text to search in.

    Returns:
        ``True`` as soon as one candidate is found inside ``string``,
        ``False`` if none is (including when ``substrings`` is empty).
    """
    for candidate in substrings:
        if candidate in string:
            return True
    return False
def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate the inner lists of ``arr`` into one new flat list.

    Args:
        arr: Iterable of lists; it is consumed once, in order.

    Returns:
        A new list with the elements of every inner list, in order. An empty
        ``arr`` yields an empty list.
    """
    # One pass over the items. sum(arr, []) would build a fresh intermediate
    # list for every inner list, which is quadratic in the total length.
    return [item for chunk in arr for item in chunk]
def split_address(address: str) -> List[str]:
    """Split a raw address string into individual addresses.

    * If the string contains ``;``, every ``;``-separated part is unfolded
      with ``unfold_houses_list`` and the results are concatenated.
    * Otherwise, if it contains ``,``, the comma-separated pieces are grouped
      per street: a new group starts when a piece containing a street prefix
      follows a group that already contains one. Each group is then unfolded
      with ``unfold_houses_list``.
    * Otherwise the string is returned unchanged as a single-element list.

    NOTE(review): street prefixes are detected by substring search, so any
    piece that merely contains e.g. "ш" or "пр" counts as a street.

    Args:
        address: Raw address text.

    Returns:
        List of address strings extracted from ``address``.
    """
    if ";" in address:
        return flatten(map(unfold_houses_list, address.split(";")))
    if "," not in address:
        return [address]

    # Commas are kept as their own tokens so each street group can be glued
    # back together as "street,house,house" for unfold_houses_list.
    tokens = [piece.strip() for piece in re.split(r"(,)", address) if piece != ""]

    res: List[str] = []
    accumulator = ""
    for token in tokens:
        starts_new_street = any_of_in(STREET_PREFIXES, token.lower()) and any_of_in(
            STREET_PREFIXES, accumulator.lower()
        )
        if starts_new_street:
            # The group still ends with the comma that separated it from the
            # next street; drop it so it neither stays in the address nor
            # yields an empty "house" entry.
            res += unfold_houses_list(accumulator.rstrip(","))
            accumulator = ""
        accumulator += token
    res += unfold_houses_list(accumulator.rstrip(","))
    return res
def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Return a copy of ``row`` whose "Улица" value is a list of addresses.

    A null "Улица" value becomes ``[None]``; any other value is replaced by
    the list returned from ``split_address``. All work is done on a copy
    obtained with ``row.copy()``.

    Args:
        row: Row holding an "Улица" entry.

    Returns:
        The modified copy of the row.
    """
    updated = row.copy()
    street = updated["Улица"]
    updated["Улица"] = [None] if pd.isnull(street) else split_address(street)
    return updated
def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Give every individual address in the "Улица" column its own row.

    Each row is passed through ``process_row`` (which turns "Улица" into a
    list), the index is reset, and the frame is exploded on "Улица" with a
    fresh index.

    Args:
        df: Frame with an "Улица" column.

    Returns:
        The exploded frame.
    """
    with_lists = df.apply(process_row, axis=1)
    with_lists = with_lists.reset_index()
    exploded = with_lists.explode("Улица", ignore_index=True)
    return exploded