dm1sh c40a1b4f92
Finished work
(Too lazy to split by commits)
2023-09-21 20:41:56 +03:00

89 lines
2.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
from typing import List, Iterable, TypeVar, Any
import pandas as pd
import re
# Type variable for the element type in ``flatten``'s signature.
T = TypeVar('T')

# Lowercase Russian street-type abbreviations, each listed with and without
# the trailing dot. ``split_address`` looks for them as substrings of
# lowercased tokens to decide where a new street begins.
STREET_PREFIXES = (
    'ул.',
    'бул.',
    'пр.',
    'ул',
    'бул',
    'пр',
    'ш.',
    'ш',
    'пер.',
    'пер',
)

# House-number markers, with and without the trailing dot.
# NOTE(review): not referenced anywhere in the visible part of this file.
HOUSES_PREFIXES = (
    'д.',
    'д',
)
def unfold_house_ranges(token: str) -> str:
    """Expand every ascending ``a-b`` number range in *token*.

    Each ``<digits>-<digits>`` span whose right bound is strictly greater
    than its left bound is replaced by the comma-separated run
    ``a, a+1, ..., b``. Spans where ``b <= a`` are left untouched, as is
    all surrounding text.

    Args:
        token: Address fragment that may contain house-number ranges.

    Returns:
        The fragment with every ascending range written out explicitly.
    """
    def expand(match: "re.Match[str]") -> str:
        low, high = int(match.group(1)), int(match.group(2))
        if high <= low:
            # Not an ascending range: keep the original text as-is.
            return match.group(0)
        return ', '.join(map(str, range(low, high + 1)))

    # A single regex pass rewrites only the matched spans. The previous
    # ``str.replace`` approach also rewrote the range text wherever it
    # appeared inside a longer range (e.g. '1-3' inside '21-33').
    return re.sub(r'(\d+)-(\d+)', expand, token)
def unfold_houses_list(token: str) -> List[str]:
    """Split one address fragment into one entry per listed house.

    Number ranges are first written out via ``unfold_house_ranges``. If the
    result contains more than one house-number match, it is split on commas
    and the first piece is prepended (with a single space) to each of the
    remaining pieces. Otherwise the expanded fragment is returned alone.

    Args:
        token: Address fragment, possibly listing several houses.

    Returns:
        A list of address strings. It is empty when several house numbers
        are matched but the fragment contains no comma.
    """
    expanded = unfold_house_ranges(token)
    house_pattern = re.compile(r'(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )')
    if len(house_pattern.findall(expanded)) <= 1:
        return [expanded]
    # The piece before the first comma is used as the common prefix.
    head, *rest = expanded.split(',')
    return [head + ' ' + part for part in rest]
def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True if at least one of *substrings* occurs inside *string*.

    Uses plain substring containment (``in``); an empty *substrings*
    iterable yields False.
    """
    return any(candidate in string for candidate in substrings)
def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate the inner sequences of *arr* into one new list, in order.

    Args:
        arr: Iterable of lists (any inner iterable is accepted).

    Returns:
        A new list holding every inner item; empty when *arr* is empty.
    """
    # ``sum(arr, [])`` copies the growing result once per inner list, which
    # is quadratic; the comprehension visits each item exactly once.
    return [item for inner in arr for item in inner]
def split_address(address: str) -> List[str]:
    """Split a combined address string into individual addresses.

    * If *address* contains ``;``, it is split on ``;`` and every part is
      passed through ``unfold_houses_list``; the results are flattened.
    * Otherwise, if it contains ``,``, it is split on commas (the commas
      are kept as separate tokens) and the tokens are glued back together
      until a token with a street prefix follows an accumulated chunk that
      already has one. Each finished chunk goes through
      ``unfold_houses_list``.
    * Otherwise the address is returned unchanged as a one-item list.

    Args:
        address: Raw address text.

    Returns:
        List of address strings extracted from *address*.
    """
    if ';' in address:
        return flatten(map(unfold_houses_list, address.split(';')))
    if ',' not in address:
        return [address]

    # The capturing group makes re.split keep the commas as tokens.
    pieces = [piece.strip()
              for piece in re.split(r'(,)', address)
              if piece != '']

    addresses = []
    pending = ''
    for piece in pieces:
        # NOTE(review): prefixes are matched as substrings, so a short one
        # such as 'ш' matches any token containing that letter.
        starts_new_street = (
            any_of_in(STREET_PREFIXES, piece.lower())
            and any_of_in(STREET_PREFIXES, pending.lower())
        )
        if starts_new_street:
            addresses.extend(unfold_houses_list(pending))
            pending = ''
        pending += piece
    # Flush whatever is left after the last token.
    addresses.extend(unfold_houses_list(pending))
    return addresses
def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Return a copy of *row* whose 'Улица' cell holds a list of addresses.

    A null cell becomes ``[None]``; any other value is replaced by the
    result of ``split_address``. The input row itself is not modified.
    """
    result = row.copy()
    street = result['Улица']
    result['Улица'] = [None] if pd.isnull(street) else split_address(street)
    return result
def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Return a frame with one row per individual address in 'Улица'.

    Every row is run through ``process_row``, which turns the 'Улица' cell
    into a list. The index is then reset (without ``drop``) and the list
    column is exploded so each element gets its own row; the result has a
    fresh consecutive index.
    """
    per_row = df.apply(process_row, axis=1)
    indexed = per_row.reset_index()
    return indexed.explode('Улица', ignore_index=True)