from __future__ import annotations from typing import List, Iterable, TypeVar, Any import pandas as pd import re T = TypeVar('T') STREET_PREFIXES = ('ул.', 'бул.', 'пр.', 'ул', 'бул', 'пр', 'ш.', 'ш', 'пер.', 'пер') HOUSES_PREFIXES = ('д.', 'д') def unfold_house_ranges(token: str) -> str: pairs_strings = re.findall(r'([\d]+-[\d]+)', token) for pair_string in pairs_strings: a, b = pair_string.split('-') a, b = int(a), int(b) if b > a: token = token.replace( pair_string, ', '.join(map(str, range(a, b+1)))) return token def unfold_houses_list(token: str) -> List[str]: token = unfold_house_ranges(token) reg = re.compile(r'(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )') if len(re.findall(reg, token)) > 1: tokens = token.split(',') return [*[tokens[0] + ' ' + house_token for house_token in tokens[1:]]] return [token] def any_of_in(substrings: Iterable[str], string: str) -> bool: return any(map(lambda substring: substring in string, substrings)) def flatten(arr: Iterable[List[T]]) -> List[T]: return sum(arr, []) def split_address(address: str) -> List[str]: if ';' in address: return flatten(map(unfold_houses_list, address.split(';'))) elif ',' in address: tokens = re.split(r'(,)', address) tokens = list(map(str.strip, filter( lambda token: token != '', tokens))) res = [] accumulator = '' for i in range(len(tokens)): if (any_of_in(STREET_PREFIXES, tokens[i].lower()) and any_of_in(STREET_PREFIXES, accumulator.lower())): res += unfold_houses_list(accumulator) accumulator = '' accumulator += tokens[i] res += unfold_houses_list(accumulator) return res return [address] def process_row(row: pd.Series[str]) -> pd.Series[str]: row = row.copy() if pd.isnull(row['Улица']): row['Улица'] = [None] else: addresses = split_address(row['Улица']) row['Улица'] = addresses return row def split_addresses(df: pd.DataFrame) -> pd.DataFrame: merged_df = df.apply(process_row, axis=1).reset_index() return merged_df.explode('Улица', ignore_index=True)