# Companion changes carried by the same patch, outside parser/address.py:
#   * .gitignore: ".idea/" is ignored in addition to "data*.csv".
#   * parser/__main__.py, job(): the CSV is now saved as
#       f'./data_{parser.today.strftime("%d-%m-%y_%H.%M")}.csv'
#     i.e. "." replaces ":" between hours and minutes in the file name.

T = TypeVar("T")

# Markers used to classify the comma-separated parts of an address.
# NOTE(review): "c" in SETTLEMENTS_PREFIXES is the Latin letter while every
# other entry is Cyrillic, and the upper-case entries cannot match a token
# that has been lower-cased first -- confirm both before this tuple is used.
SETTLEMENTS_PREFIXES = (
    "г", "мо", "р-н", "п", "д", "гп", "c", "хутор", "массив", "тер",
    "СНТ", "СТ", "ДСК", "ДНП", "ДПК", "НП", "садоводство",
)
STREET_PREFIXES = (
    "ул", "бул", "пр", "ш", "пер", "дор", "маг", "наб", "пл", "просп",
    "туп", "аллея", "мост", "парк", "кольцо", "проезд",
)
HOUSES_PREFIXES = ("д.", "д", "уч", "участок")
BUILDING_PREFIXES = ("к", "корп")
LETTER = ("лит", "литера")


def unfold_house_ranges(address: str, token: str) -> List[str]:
    """Expand numeric ranges in a house token into one address per house.

    Every ``<digits>-<digits>`` pair found in *token* is examined:

    * ascending pair (``1-3``): the pair is removed from the token and one
      address is produced per number in the inclusive range, the number
      being appended after the remaining token text;
    * any other pair (``5-3``, ``4-4``): hyphens in the token are turned
      into ``/`` and a single address is produced.

    Args:
        address: Text placed in front of the token (separated by a space).
        token: The house part of the address, e.g. ``"д. 1-3"``.

    Returns:
        The list of full address strings; when *token* holds no pair at
        all, a single ``"<address> <token>"`` entry.
    """
    addresses: List[str] = []
    for pair in re.findall(r"\d+-\d+", token):
        first, last = (int(part) for part in pair.split("-"))
        if last > first:
            token = token.replace(pair, "")
            addresses.extend(
                f"{address} {token}{number}"
                for number in range(first, last + 1)
            )
        else:
            token = token.replace("-", "/")
            # append, not "+=": extending a list with a str would add the
            # address one character at a time.
            addresses.append(f"{address} {token}")

    if not addresses:
        addresses.append(f"{address} {token}")
    return addresses
def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate the lists yielded by *arr* into a single flat list."""
    # Local import: chain.from_iterable is linear, whereas sum(arr, [])
    # copies the growing result on every step.
    from itertools import chain

    return list(chain.from_iterable(arr))


def _classify_token(token: str) -> str:
    """Return which kind of address part *token* looks like.

    The prefix tuples are tried in a fixed order (street, house, building,
    letter) and the first one that matches the lower-cased token wins;
    ``"other"`` is returned when none matches.
    """
    lowered = token.lower()
    for kind, prefixes in (
        ("street", STREET_PREFIXES),
        ("house", HOUSES_PREFIXES),
        ("building", BUILDING_PREFIXES),
        ("letter", LETTER),
    ):
        if any_of_in(prefixes, lowered):
            return kind
    return "other"


# TODO: handle settlement tokens (SETTLEMENTS_PREFIXES) explicitly.
# TODO: classify bare numeric / letter tokens that carry no prefix.
def split_address(address: str) -> List[str]:
    """Split a combined address string into individual full addresses.

    ``;`` is treated like ``,``. The comma-separated parts are classified
    by prefix and assembled as ``<prefix> <house> <tail>``:

    * a street starts a new address once a street has already been seen;
    * a house is expanded with ``unfold_house_ranges`` against the current
      prefix; a further house emits the previous one and reuses the prefix;
    * a building or letter is appended to the tail; a repeated building or
      letter emits the current address(es) first and replaces the earlier
      one, so ``к. 1, к. 2`` yields two addresses;
    * any other part joins the prefix before a house and the tail after it.

    Args:
        address: Raw address text.

    Returns:
        The list of full addresses, or ``[address]`` unchanged when there
        is nothing to split.
    """
    normalized = address.replace(";", ",")
    if "," not in normalized:
        return [address]

    # Strip before filtering so whitespace-only parts are dropped as well.
    tokens = [part.strip() for part in normalized.split(",")]
    tokens = [part for part in tokens if part]

    result: List[str] = []
    prefix: List[str] = []   # parts shared by every house that follows
    houses: List[str] = []   # "<prefix> <house>" strings of the current house token
    tail = []                # (kind, text) pairs following the house
    has_street = False

    def emit() -> None:
        """Append the address(es) described by the current state to result."""
        suffix = [text for _, text in tail]
        if houses:
            result.extend(
                " ".join([house] + suffix).strip() for house in houses
            )
        elif prefix or suffix:
            result.append(" ".join(prefix + suffix))

    for token in tokens:
        kind = _classify_token(token)

        if kind == "street":
            if has_street:
                # A second street begins an unrelated address.
                emit()
                prefix.clear()
                houses.clear()
                tail.clear()
            has_street = True
            if houses:
                tail.append((kind, token))
            else:
                prefix.append(token)

        elif kind == "house":
            if houses:
                # Sibling house: finish the previous one, keep the prefix.
                emit()
                tail.clear()
            houses[:] = unfold_house_ranges(" ".join(prefix), token)

        elif kind in ("building", "letter"):
            for index, (existing, _) in enumerate(tail):
                if existing == kind:
                    # Alternative building/letter for the same house.
                    emit()
                    del tail[index:]
                    break
            tail.append((kind, token))

        elif houses or tail:
            tail.append((kind, token))
        else:
            prefix.append(token)

    emit()
    return result or [address]