# Helpers for splitting free-form address strings into one address per house.
#
# A cell of the 'Улица' column may hold several streets and, per street,
# several houses (as a comma list and/or "a-b" ranges).  `split_addresses`
# turns such a table into one row per individual address.
from __future__ import annotations

import re
from itertools import chain
from typing import List, Iterable, TypeVar, Any

import pandas as pd

T = TypeVar('T')

# Abbreviations whose presence in a fragment marks the start of a street.
# NOTE(review): matching is by plain substring (see `any_of_in`), so short
# entries such as 'ш' also hit inside ordinary words -- confirm this is
# acceptable for the data being processed.
street_prefixes = ('ул.', 'бул.', 'пр.', 'ул', 'бул',
                   'пр', 'ш.', 'ш', 'пер.', 'пер')
houses_prefixes = ('д.', 'д')

# "a-b" house range, e.g. "3-7".
_HOUSE_RANGE_RE = re.compile(r'\d+-\d+')
# A single house number, optionally prefixed with "д"/"д." and followed by a
# comma, a space or the end of the string.
_HOUSE_RE = re.compile(r'(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )')


def unfold_house_ranges(token: str) -> str:
    """Expand every ascending "a-b" range in *token* into "a, a+1, ..., b".

    Ranges where ``b <= a`` are left untouched.  Each range is rewritten at
    its own position, so a range whose text also occurs inside another range
    (``1-3`` inside ``21-31``) does not corrupt the other one.
    """
    def expand(match: re.Match) -> str:
        text = match.group(0)
        first, last = map(int, text.split('-'))
        if last <= first:
            return text
        return ', '.join(map(str, range(first, last + 1)))

    return _HOUSE_RANGE_RE.sub(expand, token)


def unfold_houses_list(token: str) -> List[str]:
    """Split "street, house, house, ..." into one "street house" per house.

    Ranges are expanded first.  The token is returned as a single-element
    list when it holds at most one house number, or when it has no
    comma-separated house part to split on (so the address is never lost).
    """
    token = unfold_house_ranges(token)
    if len(_HOUSE_RE.findall(token)) <= 1:
        return [token]

    street, *house_parts = token.split(',')
    # Drop empty fragments (e.g. after a trailing comma) and normalise the
    # spacing left behind by ", "-separated lists.
    houses = [part.strip() for part in house_parts if part.strip()]
    if not houses:
        return [token]
    return [f'{street} {house}' for house in houses]


def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True if at least one of *substrings* occurs in *string*."""
    return any(substring in string for substring in substrings)


def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate an iterable of lists into a single list."""
    return list(chain.from_iterable(arr))


def _unfold_accumulated(accumulated: str) -> List[str]:
    """Unfold one accumulated street chunk, ignoring a dangling separator."""
    return unfold_houses_list(accumulated.rstrip(',') or accumulated)


def split_address(address: str) -> List[str]:
    """Split an address string that may describe several streets and houses.

    * If *address* contains ``;``, each non-blank ``;``-separated segment is
      treated as one street and unfolded with `unfold_houses_list`.
    * Otherwise, if it contains ``,``, the comma-separated fragments are
      accumulated until a fragment containing a street prefix follows an
      accumulator that already contains one; each accumulated chunk is then
      unfolded.
    * Otherwise the address is returned unchanged as a one-element list.
    """
    if ';' in address:
        segments = [segment.strip() for segment in address.split(';')]
        segments = [segment for segment in segments if segment]
        if not segments:
            return [address]
        return flatten(map(unfold_houses_list, segments))

    if ',' not in address:
        return [address]

    # Keep the commas as separate pieces so each chunk can be re-joined.
    pieces = [piece.strip()
              for piece in re.split(r'(,)', address) if piece != '']
    result: List[str] = []
    accumulator = ''
    for piece in pieces:
        starts_new_street = (
            any_of_in(street_prefixes, piece.lower())
            and any_of_in(street_prefixes, accumulator.lower())
        )
        if starts_new_street:
            result += _unfold_accumulated(accumulator)
            accumulator = ''
        accumulator += piece
    result += _unfold_accumulated(accumulator)
    return result


def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Replace the 'Улица' value of *row* with its list of split addresses.

    Rows whose 'Улица' value is null are returned as-is; otherwise a copy of
    the row is returned so the input is not mutated.
    """
    if pd.isnull(row['Улица']):
        return row
    addresses = split_address(row['Улица'])
    row = row.copy()
    row['Улица'] = addresses
    return row


def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with one row per individual address in column 'Улица'.

    Every row is processed with `process_row`, then the resulting address
    lists are exploded into separate rows with a fresh index.
    """
    return df.apply(process_row, axis=1).explode('Улица', ignore_index=True)