From a000bf5867299094bba68d3134a3f6e5c388f3c5 Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Fri, 13 Oct 2023 10:00:04 +0300 Subject: [PATCH 01/12] Change address recognizer (not final) --- .gitignore | 3 +- parser/__main__.py | 2 +- parser/address.py | 72 ++++++++++++++++++++++++++++++++++------------ 3 files changed, 57 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 43024ce..71444f4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ .venv __pycache__ .env -data*.csv \ No newline at end of file +data*.csv +.idea/ \ No newline at end of file diff --git a/parser/__main__.py b/parser/__main__.py index e55c05a..de668c0 100644 --- a/parser/__main__.py +++ b/parser/__main__.py @@ -8,7 +8,7 @@ from . import pipeline def job(): parser = pipeline() - parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H:%M")}.csv') + parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H.%M")}.csv') if len(sys.argv) == 2: diff --git a/parser/address.py b/parser/address.py index 4b22430..79eac1b 100644 --- a/parser/address.py +++ b/parser/address.py @@ -7,20 +7,30 @@ import pandas as pd T = TypeVar("T") -STREET_PREFIXES = ("ул.", "бул.", "пр.", "ул", "бул", "пр", "ш.", "ш", "пер.", "пер") -HOUSES_PREFIXES = ("д.", "д") +SETTLEMENTS_PREFIXES=("г","мо","р-н","п","д","гп","c","хутор","массив","тер","СНТ","СТ","ДСК","ДНП","ДПК","НП","садоводство") +STREET_PREFIXES = ("ул", "бул", "пр", "ш", "пер", "дор", "маг", "наб", "пл", "просп", "туп", "аллея", "мост", "парк", "кольцо","проезд") +HOUSES_PREFIXES = ("д.", "д","уч","участок") +BUILDING_PREFIXES=("к", "корп") +LETTER=("лит", "литера") - -def unfold_house_ranges(token: str) -> str: +def unfold_house_ranges(address:str, token: str) -> List[str]: + adresses=[] pairs_strings = re.findall(r"([\d]+-[\d]+)", token) for pair_string in pairs_strings: a, b = pair_string.split("-") a, b = int(a), int(b) if b > a: - token = token.replace(pair_string, ", ".join(map(str, range(a, b + 1)))) + token = token.replace(pair_string,"") + adresses += [address + " "+ token + number for number in map(str, range(a, b + 1))] - return token + + else: + token = token.replace("-", "/") + adresses += address + " " + token + if not adresses: + adresses.append(address + " " + token) + return adresses def unfold_houses_list(token: str) -> List[str]: @@ -42,28 +52,54 @@ def flatten(arr: Iterable[List[T]]) -> List[T]: return sum(arr, []) +# TODO: переработать систему из if в нормальный вид и классификация чисел/букв def split_address(address: str) -> List[str]: if ";" in address: - return flatten(map(unfold_houses_list, address.split(";"))) - elif "," in address: - tokens = re.split(r"(,)", address) + address = address.replace(";", ",") + if "," in address: + tokens = address.split(",") tokens = list(map(str.strip, filter(lambda token: token != "", tokens))) - res = [] - accumulator = "" + accumulator = [] for i in range(len(tokens)): - if any_of_in(STREET_PREFIXES, tokens[i].lower()) and any_of_in( - STREET_PREFIXES, accumulator.lower() - ): - res += unfold_houses_list(accumulator) - accumulator = "" - accumulator += tokens[i] + # TODO: напселённые пункты + # if any_of_in(SETTLEMENTS_PREFIXES, tokens[i].lower()) + # accumulator += tokens[i] - res += unfold_houses_list(accumulator) + # улицы + if any_of_in(STREET_PREFIXES, tokens[i].lower()): + if accumulator and any_of_in(STREET_PREFIXES, "".join(accumulator).lower() ): + res.append( " ".join(accumulator)) + accumulator=[] + accumulator.append(tokens[i]) + # дома + elif any_of_in(HOUSES_PREFIXES, 
tokens[i].lower()): + if accumulator and any_of_in(HOUSES_PREFIXES, accumulator[-1].lower()): + res.append(" ".join(accumulator)) + accumulator.pop() + res.append(unfold_house_ranges(" ".join(accumulator),tokens[i])) + accumulator=res[-1] + res.pop() + # корпус + elif any_of_in(BUILDING_PREFIXES, tokens[i].lower()): + if accumulator and any_of_in(BUILDING_PREFIXES, accumulator[-1].lower() ): + res.append( " ".join(accumulator)) + accumulator.pop() + accumulator.append(tokens[i]) + # литера + elif any_of_in(LETTER, tokens[i].lower()): + if accumulator and any_of_in(LETTER, accumulator[-1].lower() ): + res.append(" ".join(accumulator)) + accumulator.pop() + accumulator.append (tokens[i]) + else: + accumulator.append(tokens[i]) + + res.append(" ".join(accumulator)) return res return [address] From cb422b9a2fc47f8413a3fbe6c4726a6b94ca0fee Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Mon, 16 Oct 2023 00:42:22 +0300 Subject: [PATCH 02/12] Classification(ver.1) --- parser/address.py | 144 +++++++++++++++++++++++++++++++++------------- 1 file changed, 103 insertions(+), 41 deletions(-) diff --git a/parser/address.py b/parser/address.py index 79eac1b..a7ca5dd 100644 --- a/parser/address.py +++ b/parser/address.py @@ -7,22 +7,29 @@ import pandas as pd T = TypeVar("T") -SETTLEMENTS_PREFIXES=("г","мо","р-н","п","д","гп","c","хутор","массив","тер","СНТ","СТ","ДСК","ДНП","ДПК","НП","садоводство") -STREET_PREFIXES = ("ул", "бул", "пр", "ш", "пер", "дор", "маг", "наб", "пл", "просп", "туп", "аллея", "мост", "парк", "кольцо","проезд") -HOUSES_PREFIXES = ("д.", "д","уч","участок") -BUILDING_PREFIXES=("к", "корп") -LETTER=("лит", "литера") +CLASSES = ("s", "h", "b", "l", "?") -def unfold_house_ranges(address:str, token: str) -> List[str]: - adresses=[] +SETTLEMENTS_PREFIXES = ( + "г", "мо", "р-н", "п", "д", "гп", "c", "хутор", "массив", "тер", "СНТ", "СТ", "ДСК", "ДНП", "ДПК", "НП", + "садоводство") +STREET_PREFIXES = ( + " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе","линия","аллея", "мост", "парк", "кольцо","проезд", + "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.") +HOUSES_PREFIXES = ("д.", "уч.", "участок") +BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение") +LETTER = ("лит.", "литера"," л.") + + +def unfold_house_ranges(address: str, token: str) -> List[str]: + adresses = [] pairs_strings = re.findall(r"([\d]+-[\d]+)", token) for pair_string in pairs_strings: a, b = pair_string.split("-") a, b = int(a), int(b) if b > a: - token = token.replace(pair_string,"") - adresses += [address + " "+ token + number for number in map(str, range(a, b + 1))] + token = token.replace(pair_string, "") + adresses += [address + " " + token + number for number in map(str, range(a, b + 1))] else: @@ -52,6 +59,46 @@ def flatten(arr: Iterable[List[T]]) -> List[T]: return sum(arr, []) +def find_litera(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(LETTER, token['obj']) \ + or re.search(r"\d{1,3}[А-Яа-я]( |$)", token['obj']): + return "l" + # не работает + if (re.search(r"\b[А-Яа-я]{1}\b", token['obj']) and "l" in pre_token['class']): + return "l" + return "" + + +def find_building(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(BUILDING_PREFIXES, token['obj']) \ + or (re.search(r"\d", token['obj']) and "b" in pre_token['class']) \ + or re.search(r"к\.*\d", token['obj']) \ + or re.search(r"\d", token['obj']) and "b" in pre_token['class']: + return "b" + return "" + + +def find_house(token: pd.Series, 
pre_token: pd.Series) -> str: + if any_of_in(HOUSES_PREFIXES, token['obj']): + return "h" + if re.search(r"(д|д\.) ?\d{1,3} ?\/*\d* ?", token['obj']) and not ("-я" in token['obj']): + if "h" in pre_token['class'] \ + or "s" in pre_token['class'] \ + or "s" in token['class']: + return "h" + # не работает + if re.search(r"\d{1,3}", token['obj']) and ("s" in pre_token['class'] or "h" in pre_token['class']): + return "h" + return "" + + +def find_street(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(STREET_PREFIXES, token['obj']) \ + or (re.search(r"[А-Я]{1}[а-я]+", token['obj']) and "s" in pre_token['class']): + return "s" + return "" + + # TODO: переработать систему из if в нормальный вид и классификация чисел/букв def split_address(address: str) -> List[str]: if ";" in address: @@ -59,47 +106,62 @@ def split_address(address: str) -> List[str]: if "," in address: tokens = address.split(",") - tokens = list(map(str.strip, filter(lambda token: token != "", tokens))) + t = list(map(str.strip, filter(lambda token: token != "", tokens))) + # токены в датафрэйм + tokens = pd.DataFrame() + tokens['obj'] = t + tokens.insert(len(tokens.columns), "class", "") res = [] - accumulator = [] + accumulator = "" for i in range(len(tokens)): # TODO: напселённые пункты # if any_of_in(SETTLEMENTS_PREFIXES, tokens[i].lower()) # accumulator += tokens[i] + cur_tk = tokens.iloc[i] - # улицы - if any_of_in(STREET_PREFIXES, tokens[i].lower()): - if accumulator and any_of_in(STREET_PREFIXES, "".join(accumulator).lower() ): - res.append( " ".join(accumulator)) - accumulator=[] - accumulator.append(tokens[i]) - - # дома - elif any_of_in(HOUSES_PREFIXES, tokens[i].lower()): - if accumulator and any_of_in(HOUSES_PREFIXES, accumulator[-1].lower()): - res.append(" ".join(accumulator)) - accumulator.pop() - res.append(unfold_house_ranges(" ".join(accumulator),tokens[i])) - accumulator=res[-1] - res.pop() - # корпус - elif any_of_in(BUILDING_PREFIXES, tokens[i].lower()): - if accumulator and any_of_in(BUILDING_PREFIXES, accumulator[-1].lower() ): - res.append( " ".join(accumulator)) - accumulator.pop() - accumulator.append(tokens[i]) - # литера - elif any_of_in(LETTER, tokens[i].lower()): - if accumulator and any_of_in(LETTER, accumulator[-1].lower() ): - res.append(" ".join(accumulator)) - accumulator.pop() - accumulator.append (tokens[i]) + if i == 0: + pre_token = pd.Series(data=["", ""], index=['obj', 'class']) else: - accumulator.append(tokens[i]) - - res.append(" ".join(accumulator)) + pre_token = tokens.iloc[i - 1] + obj_class = find_street(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "s" in tokens['class'].iloc[i - 1]: + res.append(accumulator) + accumulator = "" + accumulator += tokens["obj"].iloc[i] + obj_class = find_house(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "h" in tokens['class'].iloc[i - 1]: + res.append(accumulator) + num = re.findall("\d{,3}", tokens['obj'].iloc[i])[-1] + accumulator = re.sub(r"\d{,3} ?\/*\d* ?", num,accumulator) + else: + accumulator += tokens["obj"].iloc[i] + obj_class = find_building(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "b" in tokens['class'].iloc[i - 1]: + res.append(accumulator) + num = re.findall("\d", tokens['obj'].iloc[i])[-1] + accumulator = re.sub(r"\d$", num, accumulator) + else: + accumulator += tokens["obj"].iloc[i] + obj_class = find_litera(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "l" in tokens['class'].iloc[i - 1]: + res.append(accumulator) 
+ num = re.findall("[А-яа-я]", tokens['obj'].iloc[i].strip())[-1] + accumulator = re.sub(r"[А-яа-я]$", num, accumulator) + else: + accumulator += tokens["obj"].iloc[i] + if cur_tk['class'] == "": + cur_tk['class'] = "w" + print(cur_tk) return res return [address] From d822c5012b4e29da1478c57bc9edd46a43c45c41 Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Mon, 16 Oct 2023 22:02:12 +0300 Subject: [PATCH 03/12] classification(full address) --- parser/address.py | 127 +++++++++++++++++++++++++++++++--------------- parser/util.py | 6 ++- 2 files changed, 92 insertions(+), 41 deletions(-) diff --git a/parser/address.py b/parser/address.py index a7ca5dd..b9d18c9 100644 --- a/parser/address.py +++ b/parser/address.py @@ -7,16 +7,17 @@ import pandas as pd T = TypeVar("T") -CLASSES = ("s", "h", "b", "l", "?") - -SETTLEMENTS_PREFIXES = ( - "г", "мо", "р-н", "п", "д", "гп", "c", "хутор", "массив", "тер", "СНТ", "СТ", "ДСК", "ДНП", "ДПК", "НП", - "садоводство") +CLASSES = ("d", "c", "t", "s", "h", "b", "l", "r", "w") +DISTRICTS_PREFIXES = ("мо ", "р-н") +COUNTRYSIDE_PREFIXES = ( + " г", " п", " д", " гп", " рп", " кп", " пгт", " c", "хутор", " урочище" + "г.", "п.", "д.", "гп.", "рп.", "кп.", "пгт.", "c.") +TERRITORY_PREFIXES =("тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ", "т/б ", "садоводство", "массив", "хоз","сад-во","с-во") STREET_PREFIXES = ( - " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе","линия","аллея", "мост", "парк", "кольцо","проезд", + " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе","линия","аллея", "мост", " парк", "кольцо","проезд", "съезд", "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.") -HOUSES_PREFIXES = ("д.", "уч.", "участок") -BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение") +HOUSES_PREFIXES = ("д.", "уч.", "участок","мкд","тп") +BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение","корпус") LETTER = ("лит.", "литера"," л.") @@ -58,46 +59,67 @@ def any_of_in(substrings: Iterable[str], string: str) -> bool: def flatten(arr: Iterable[List[T]]) -> List[T]: return sum(arr, []) +def find_room(token: pd.Series, pre_token: pd.Series) -> str: + if re.search(r"пом\.?", token['obj']): + return "r" + return "" def find_litera(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(LETTER, token['obj']) \ - or re.search(r"\d{1,3}[А-Яа-я]( |$)", token['obj']): + if any_of_in(LETTER, token['obj'].lower()) \ + or re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token['obj']): return "l" - # не работает - if (re.search(r"\b[А-Яа-я]{1}\b", token['obj']) and "l" in pre_token['class']): + if (re.search(r"\b([А-Я]|[а-я]){1}$", token['obj']) \ + and ("l" in pre_token['class'] or "h" in pre_token['class'])) \ + and not (" ш" in token["obj"]) \ + and not find_countryside(token,pre_token): return "l" return "" def find_building(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(BUILDING_PREFIXES, token['obj']) \ - or (re.search(r"\d", token['obj']) and "b" in pre_token['class']) \ - or re.search(r"к\.*\d", token['obj']) \ - or re.search(r"\d", token['obj']) and "b" in pre_token['class']: - return "b" + if re.search(r"\d", token['obj']): + if any_of_in(BUILDING_PREFIXES, token['obj'].lower()) \ + or "b" in pre_token['class'] and not ("h" in token['class'])\ + or re.search(r"к\.* ?\d", token['obj']): + return "b" return "" def find_house(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(HOUSES_PREFIXES, token['obj']): - 
return "h" - if re.search(r"(д|д\.) ?\d{1,3} ?\/*\d* ?", token['obj']) and not ("-я" in token['obj']): - if "h" in pre_token['class'] \ - or "s" in pre_token['class'] \ - or "s" in token['class']: + if re.search(r"\d{1,4}", token['obj']): + if any_of_in(HOUSES_PREFIXES, token['obj'].lower()): + return "h" + if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token['obj']): + return "h" + if ("s" in pre_token['class'] or "h" in pre_token['class'] or "s" in token['class']) \ + and not any_of_in(("-я", "-й", "-Я"), token['obj'])\ + and not find_building(token,pre_token): return "h" - # не работает - if re.search(r"\d{1,3}", token['obj']) and ("s" in pre_token['class'] or "h" in pre_token['class']): - return "h" return "" def find_street(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(STREET_PREFIXES, token['obj']) \ - or (re.search(r"[А-Я]{1}[а-я]+", token['obj']) and "s" in pre_token['class']): + if any_of_in(STREET_PREFIXES, token['obj'].lower()) \ + or re.search(r"[А-Я]{1}[а-я]+ая", token['obj']): return "s" return "" +def find_territory(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(TERRITORY_PREFIXES, token['obj'].lower()): + return "t" + return "" +def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(COUNTRYSIDE_PREFIXES, token['obj'].lower()) \ + and not find_house(token,pre_token) \ + and not find_street(token,pre_token): + return "c" + return "" + +def find_district(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(DISTRICTS_PREFIXES, token['obj'].lower()): + return "d" + return "" + # TODO: переработать систему из if в нормальный вид и классификация чисел/букв def split_address(address: str) -> List[str]: @@ -125,43 +147,68 @@ def split_address(address: str) -> List[str]: pre_token = pd.Series(data=["", ""], index=['obj', 'class']) else: pre_token = tokens.iloc[i - 1] + + obj_class = find_district(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "d" in pre_token['class']: + res.append(accumulator) + accumulator = "" + accumulator += cur_tk["obj"] + obj_class = find_countryside(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "c" in pre_token['class']: + res.append(accumulator) + accumulator = "" + accumulator += cur_tk["obj"] + obj_class = find_territory(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "t" in pre_token['class']: + res.append(accumulator) + accumulator = "" + accumulator +=cur_tk["obj"] obj_class = find_street(cur_tk, pre_token) if obj_class: cur_tk["class"] += obj_class - if "s" in tokens['class'].iloc[i - 1]: + if "s" in pre_token['class']: res.append(accumulator) accumulator = "" - accumulator += tokens["obj"].iloc[i] + accumulator += cur_tk["obj"] obj_class = find_house(cur_tk, pre_token) if obj_class: cur_tk["class"] += obj_class - if "h" in tokens['class'].iloc[i - 1]: + if "h" in pre_token["class"]: res.append(accumulator) - num = re.findall("\d{,3}", tokens['obj'].iloc[i])[-1] - accumulator = re.sub(r"\d{,3} ?\/*\d* ?", num,accumulator) + num = re.findall("\d{1,4}", cur_tk['obj'])[-1] + accumulator = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator) else: - accumulator += tokens["obj"].iloc[i] + accumulator += cur_tk["obj"] obj_class = find_building(cur_tk, pre_token) if obj_class: cur_tk["class"] += obj_class - if "b" in tokens['class'].iloc[i - 1]: + if "b" in pre_token["class"]: res.append(accumulator) num = re.findall("\d", tokens['obj'].iloc[i])[-1] accumulator = re.sub(r"\d$", num, accumulator) else: - accumulator += 
tokens["obj"].iloc[i] + accumulator += pre_token["obj"] obj_class = find_litera(cur_tk, pre_token) if obj_class: cur_tk["class"] += obj_class - if "l" in tokens['class'].iloc[i - 1]: + if "l" in pre_token["class"]: res.append(accumulator) - num = re.findall("[А-яа-я]", tokens['obj'].iloc[i].strip())[-1] + num = re.findall("[А-яа-я]", cur_tk["obj"].strip())[-1] accumulator = re.sub(r"[А-яа-я]$", num, accumulator) else: - accumulator += tokens["obj"].iloc[i] + accumulator += cur_tk["obj"] if cur_tk['class'] == "": cur_tk['class'] = "w" - print(cur_tk) + tokens.iloc[i] = cur_tk + print(tokens.iloc[i]) + + # print(cur_tk) return res return [address] diff --git a/parser/util.py b/parser/util.py index fe36f0e..6b54416 100644 --- a/parser/util.py +++ b/parser/util.py @@ -10,12 +10,16 @@ from . import ( def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser: if parser is None: - parser = LenenergoParser() + parser = LenenergoParser(ndays=15) print(parser) parser.df = split_addresses(parser.df) + for i in range(len(parser.df)): + print(parser.df['Улица'].iloc[i]) + + parser.df = concurrent_fetch_builing_ids(parser.df) parser.df = preprocess_df(parser.df) From a71acc2ddf325b6af5406a70312adfde247e6d0f Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Sat, 21 Oct 2023 17:32:37 +0300 Subject: [PATCH 04/12] Working version of classification (not final) --- .gitignore | 3 +- parser/address.py | 225 ++++++++++++++++++++++++---------------------- parser/util.py | 6 +- 3 files changed, 121 insertions(+), 113 deletions(-) diff --git a/.gitignore b/.gitignore index 71444f4..fe14a37 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ __pycache__ .env data*.csv -.idea/ \ No newline at end of file +.idea/ +.ipynb_checkpoints \ No newline at end of file diff --git a/parser/address.py b/parser/address.py index b9d18c9..4c3bdca 100644 --- a/parser/address.py +++ b/parser/address.py @@ -7,50 +7,36 @@ import pandas as pd T = TypeVar("T") -CLASSES = ("d", "c", "t", "s", "h", "b", "l", "r", "w") -DISTRICTS_PREFIXES = ("мо ", "р-н") +CLASSES = ("w", "d", "c", "t", "s", "h", "b", "l", "r") + +DISTRICTS_PREFIXES = ("мо ", "р-н","городское","лесхоз") COUNTRYSIDE_PREFIXES = ( - " г", " п", " д", " гп", " рп", " кп", " пгт", " c", "хутор", " урочище" - "г.", "п.", "д.", "гп.", "рп.", "кп.", "пгт.", "c.") -TERRITORY_PREFIXES =("тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ", "т/б ", "садоводство", "массив", "хоз","сад-во","с-во") + "г", "п", "д", "гп", "рп", "кп", "пгт", "c", "хутор", " урочище") +TERRITORY_PREFIXES = ( +"тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ", "т/б ", "садоводство", "массив", "хозя", "сад-во") STREET_PREFIXES = ( - " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе","линия","аллея", "мост", " парк", "кольцо","проезд", "съезд", + " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе", "лини", "аллея", + "мост", " парк", "кольцо", "проезд", "съезд","переулок", "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.") -HOUSES_PREFIXES = ("д.", "уч.", "участок","мкд","тп") -BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение","корпус") -LETTER = ("лит.", "литера"," л.") +HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп","дом") +BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение", "корпус") +LETTER = ("лит.", "литера", " л.") -def unfold_house_ranges(address: str, token: str) -> List[str]: - adresses = [] +def 
unfold_house_ranges(token: str) -> List[str]: + addresses = [] pairs_strings = re.findall(r"([\d]+-[\d]+)", token) for pair_string in pairs_strings: a, b = pair_string.split("-") a, b = int(a), int(b) if b > a: - token = token.replace(pair_string, "") - adresses += [address + " " + token + number for number in map(str, range(a, b + 1))] - - + addresses += [re.sub(r"([\d]+-[\d]+)", number, token) for number in map(str, range(a, b + 1))] else: token = token.replace("-", "/") - adresses += address + " " + token - if not adresses: - adresses.append(address + " " + token) - return adresses - - -def unfold_houses_list(token: str) -> List[str]: - token = unfold_house_ranges(token) - - reg = re.compile(r"(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )") - - if len(re.findall(reg, token)) > 1: - tokens = token.split(",") - return [*[tokens[0] + " " + house_token for house_token in tokens[1:]]] - return [token] - + if not addresses: + addresses.append(token) + return addresses def any_of_in(substrings: Iterable[str], string: str) -> bool: return any(map(lambda substring: substring in string, substrings)) @@ -59,19 +45,21 @@ def any_of_in(substrings: Iterable[str], string: str) -> bool: def flatten(arr: Iterable[List[T]]) -> List[T]: return sum(arr, []) + def find_room(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"пом\.?", token['obj']): + if re.search(r"пом\.?", token['obj']): return "r" return "" + def find_litera(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(LETTER, token['obj'].lower()) \ or re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token['obj']): return "l" if (re.search(r"\b([А-Я]|[а-я]){1}$", token['obj']) \ - and ("l" in pre_token['class'] or "h" in pre_token['class'])) \ + and ("l" in pre_token['class'] or "h" in pre_token['class'])) \ and not (" ш" in token["obj"]) \ - and not find_countryside(token,pre_token): + and not find_countryside(token, pre_token): return "l" return "" @@ -79,7 +67,7 @@ def find_litera(token: pd.Series, pre_token: pd.Series) -> str: def find_building(token: pd.Series, pre_token: pd.Series) -> str: if re.search(r"\d", token['obj']): if any_of_in(BUILDING_PREFIXES, token['obj'].lower()) \ - or "b" in pre_token['class'] and not ("h" in token['class'])\ + or "b" in pre_token['class'] and not ("h" in token['class']) \ or re.search(r"к\.* ?\d", token['obj']): return "b" return "" @@ -92,36 +80,58 @@ def find_house(token: pd.Series, pre_token: pd.Series) -> str: if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token['obj']): return "h" if ("s" in pre_token['class'] or "h" in pre_token['class'] or "s" in token['class']) \ - and not any_of_in(("-я", "-й", "-Я"), token['obj'])\ - and not find_building(token,pre_token): + and not any_of_in(("-я", "-й", "-Я"), token['obj']) \ + and not find_building(token, pre_token): return "h" return "" def find_street(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(STREET_PREFIXES, token['obj'].lower()) \ - or re.search(r"[А-Я]{1}[а-я]+ая", token['obj']): + or re.search(r"[а-я]+ая", token['obj']): return "s" return "" + def find_territory(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(TERRITORY_PREFIXES, token['obj'].lower()): return "t" return "" + + def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(COUNTRYSIDE_PREFIXES, token['obj'].lower()) \ - and not find_house(token,pre_token) \ - and not find_street(token,pre_token): + and re.search(r"\b[гпдрпктc]{1,3}(\b|\. 
)", token['obj']) \ + and not find_house(token, pre_token) \ + and not find_street(token, pre_token): return "c" return "" + def find_district(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(DISTRICTS_PREFIXES, token['obj'].lower()): return "d" return "" +def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: + brackets = re.search(r"\(.+\)", token["obj"]) + if brackets: + token["obj"] = re.sub(r"\(.+\)", "()", token["obj"]) + token["class"] += find_district(token, pre_token) + token["class"] += find_countryside(token, pre_token) + token["class"] += find_territory(token, pre_token) + token["class"] += find_street(token, pre_token) + token["class"] += find_house(token, pre_token) + token["class"] += find_building(token, pre_token) + token["class"] += find_litera(token, pre_token) + if token['class'] == "": + token['class'] = "w" + if brackets: + token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) + return token -# TODO: переработать систему из if в нормальный вид и классификация чисел/букв + +# TODO: переработать систему из if в нормальный вид def split_address(address: str) -> List[str]: if ";" in address: address = address.replace(";", ",") @@ -129,18 +139,15 @@ def split_address(address: str) -> List[str]: tokens = address.split(",") t = list(map(str.strip, filter(lambda token: token != "", tokens))) - # токены в датафрэйм + tokens = pd.DataFrame() tokens['obj'] = t + tokens = tokens[tokens["obj"] != ""] tokens.insert(len(tokens.columns), "class", "") res = [] - accumulator = "" + accumulator = pd.Series(data={"address": "", "class": ""}) for i in range(len(tokens)): - - # TODO: напселённые пункты - # if any_of_in(SETTLEMENTS_PREFIXES, tokens[i].lower()) - # accumulator += tokens[i] cur_tk = tokens.iloc[i] if i == 0: @@ -148,71 +155,72 @@ def split_address(address: str) -> List[str]: else: pre_token = tokens.iloc[i - 1] - obj_class = find_district(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "d" in pre_token['class']: - res.append(accumulator) - accumulator = "" - accumulator += cur_tk["obj"] - obj_class = find_countryside(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "c" in pre_token['class']: - res.append(accumulator) - accumulator = "" - accumulator += cur_tk["obj"] - obj_class = find_territory(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "t" in pre_token['class']: - res.append(accumulator) - accumulator = "" - accumulator +=cur_tk["obj"] - obj_class = find_street(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "s" in pre_token['class']: - res.append(accumulator) - accumulator = "" - accumulator += cur_tk["obj"] - obj_class = find_house(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "h" in pre_token["class"]: - res.append(accumulator) - num = re.findall("\d{1,4}", cur_tk['obj'])[-1] - accumulator = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator) - else: - accumulator += cur_tk["obj"] - obj_class = find_building(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "b" in pre_token["class"]: - res.append(accumulator) - num = re.findall("\d", tokens['obj'].iloc[i])[-1] - accumulator = re.sub(r"\d$", num, accumulator) - else: - accumulator += pre_token["obj"] - obj_class = find_litera(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "l" in pre_token["class"]: - res.append(accumulator) - num = re.findall("[А-яа-я]", cur_tk["obj"].strip())[-1] - accumulator = re.sub(r"[А-яа-я]$", 
num, accumulator) - else: - accumulator += cur_tk["obj"] - if cur_tk['class'] == "": - cur_tk['class'] = "w" + cur_tk = address_classification(cur_tk, pre_token) tokens.iloc[i] = cur_tk print(tokens.iloc[i]) - # print(cur_tk) + if not accumulator["class"]: + accumulator["class"] = cur_tk['class'] + accumulator["address"] = cur_tk["obj"] + continue + if CLASSES.index(accumulator["class"][-1]) < CLASSES.index(cur_tk["class"][0]) and accumulator["class"]!="w": + accumulator["class"] += cur_tk['class'] + accumulator["address"] += " " + cur_tk["obj"] + else: + ad_no_ranges = unfold_house_ranges(accumulator["address"]) + accumulator["address"] = ad_no_ranges[-1] + res.extend(ad_no_ranges) + while accumulator["class"] and CLASSES.index(accumulator["class"][-1]) > CLASSES.index(cur_tk["class"][0]): + if accumulator["class"][-1] == "h": + accumulator["address"] = re.sub(r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", accumulator["address"].lower()) + elif accumulator["class"][-1] == "b": + num = re.findall("к{0,1}\.? ?\d", accumulator["address"])[-1] + accumulator["address"] = re.sub(num, "", accumulator["address"]) + elif accumulator["class"][-1] == "l": + accumulator ["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$","", accumulator["address"]) + elif accumulator["class"][-1] == "r": + accumulator["address"] = re.sub(r"пом\.? ?\d+","", accumulator["address"]) + accumulator["class"] = accumulator["class"][:-1] + if not accumulator["class"] or CLASSES.index(cur_tk["class"][0]) <= CLASSES.index("s") or accumulator["class"]=="w": + accumulator["class"] = cur_tk["class"] + accumulator["address"] = cur_tk["obj"] + if cur_tk["class"][0] == "h": + num = re.findall("\d{1,4} ?\/?\d* ?", cur_tk['obj'])[0] + accumulator["address"] = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator["address"]) + cur_tk["class"] =cur_tk["class"][1:] + if cur_tk["class"] and cur_tk["class"][0] == "b": + num = re.findall("\d", cur_tk["obj"])[-1] + if num and not "b" in accumulator["class"]: + accumulator["class"] += "b" + accumulator["address"] += "к." 
+ num + else: + accumulator["address"] = re.sub(r"\d$", num, accumulator["address"]) + cur_tk["class"] = cur_tk["class"][1:] + + if cur_tk["class"] and cur_tk["class"][0] == "l": + num = re.findall("[А-Яа-я]", cur_tk["obj"].strip())[-1] + accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) + accumulator["address"] += num + if num and not "l" in accumulator["class"]: + accumulator["class"] += "l" + else: + if re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", accumulator["address"]): + accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) + res.extend(unfold_house_ranges(accumulator["address"])) + print(res) return res return [address] +def split_pesoch_res(address: str) -> List[str]: + t = re.sub(r",", " ", address) + t = re.split(r"(Санкт-Петербург|Ленинградская обл|Л\.О)", t) + t = list(map(str.strip, filter(lambda token: token != "", t))) + tokens = [t[i] + " " + t[i+1] for i in range(0, len(t)-1, 2)] + + if tokens: + return list(set(tokens)) + return [address] def process_row(row: pd.Series[str]) -> pd.Series[str]: row = row.copy() @@ -220,7 +228,10 @@ def process_row(row: pd.Series[str]) -> pd.Series[str]: if pd.isnull(row["Улица"]): row["Улица"] = [None] else: - addresses = split_address(row["Улица"]) + if row["РЭС"] == "Песочинский РЭС": + addresses = split_pesoch_res(row["Улица"]) + else: + addresses = split_address(row["Улица"]) row["Улица"] = addresses return row @@ -229,4 +240,4 @@ def process_row(row: pd.Series[str]) -> pd.Series[str]: def split_addresses(df: pd.DataFrame) -> pd.DataFrame: merged_df = df.apply(process_row, axis=1).reset_index() - return merged_df.explode("Улица", ignore_index=True) + return merged_df.explode("Улица", ignore_index=True) \ No newline at end of file diff --git a/parser/util.py b/parser/util.py index 6b54416..983c9b8 100644 --- a/parser/util.py +++ b/parser/util.py @@ -10,16 +10,12 @@ from . import ( def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser: if parser is None: - parser = LenenergoParser(ndays=15) + parser = LenenergoParser(file_path = r"C:\Users\Юля\PycharmProjects\machine_learning\lenengro_parser\data_Rosseti.csv") print(parser) parser.df = split_addresses(parser.df) - for i in range(len(parser.df)): - print(parser.df['Улица'].iloc[i]) - - parser.df = concurrent_fetch_builing_ids(parser.df) parser.df = preprocess_df(parser.df) From 1fd7a123f93a5f160fc64dd45d692748a4ad4f4e Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Sat, 21 Oct 2023 18:12:36 +0300 Subject: [PATCH 05/12] New fuctions --- parser/address.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/parser/address.py b/parser/address.py index 4c3bdca..73f53e4 100644 --- a/parser/address.py +++ b/parser/address.py @@ -130,6 +130,21 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) return token +def cut_address(ad: pd.Series, cl: str) -> pd.Series: + while ad["class"] and CLASSES.index(ad["class"][-1]) > CLASSES.index(cl[0]): + if ad["class"][-1] == "h": + ad["address"] = re.sub(r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", + ad["address"].lower()) + elif ad["class"][-1] == "b": + num = re.findall("к{0,1}\.? ?\d", ad["address"])[-1] + ad["address"] = re.sub(num, "", ad["address"]) + elif ad["class"][-1] == "l": + ad["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$", "", ad["address"]) + elif ad["class"][-1] == "r": + ad["address"] = re.sub(r"пом\.? 
?\d+", "", ad["address"]) + ad["class"] = ad["class"][:-1] + return ad + # TODO: переработать систему из if в нормальный вид def split_address(address: str) -> List[str]: @@ -163,31 +178,27 @@ def split_address(address: str) -> List[str]: accumulator["class"] = cur_tk['class'] accumulator["address"] = cur_tk["obj"] continue + if CLASSES.index(accumulator["class"][-1]) < CLASSES.index(cur_tk["class"][0]) and accumulator["class"]!="w": accumulator["class"] += cur_tk['class'] accumulator["address"] += " " + cur_tk["obj"] else: ad_no_ranges = unfold_house_ranges(accumulator["address"]) accumulator["address"] = ad_no_ranges[-1] + res.extend(ad_no_ranges) - while accumulator["class"] and CLASSES.index(accumulator["class"][-1]) > CLASSES.index(cur_tk["class"][0]): - if accumulator["class"][-1] == "h": - accumulator["address"] = re.sub(r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", accumulator["address"].lower()) - elif accumulator["class"][-1] == "b": - num = re.findall("к{0,1}\.? ?\d", accumulator["address"])[-1] - accumulator["address"] = re.sub(num, "", accumulator["address"]) - elif accumulator["class"][-1] == "l": - accumulator ["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$","", accumulator["address"]) - elif accumulator["class"][-1] == "r": - accumulator["address"] = re.sub(r"пом\.? ?\d+","", accumulator["address"]) - accumulator["class"] = accumulator["class"][:-1] + + accumulator = cut_address(accumulator, cur_tk["class"]) + if not accumulator["class"] or CLASSES.index(cur_tk["class"][0]) <= CLASSES.index("s") or accumulator["class"]=="w": accumulator["class"] = cur_tk["class"] accumulator["address"] = cur_tk["obj"] + if cur_tk["class"][0] == "h": num = re.findall("\d{1,4} ?\/?\d* ?", cur_tk['obj'])[0] accumulator["address"] = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator["address"]) cur_tk["class"] =cur_tk["class"][1:] + if cur_tk["class"] and cur_tk["class"][0] == "b": num = re.findall("\d", cur_tk["obj"])[-1] if num and not "b" in accumulator["class"]: From 259c71b17bfee77ba4e71eed2a26423319af9b2e Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Mon, 23 Oct 2023 00:42:27 +0300 Subject: [PATCH 06/12] Street recognition --- parser/address.py | 73 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/parser/address.py b/parser/address.py index 73f53e4..2d94cae 100644 --- a/parser/address.py +++ b/parser/address.py @@ -7,8 +7,7 @@ import pandas as pd T = TypeVar("T") -CLASSES = ("w", "d", "c", "t", "s", "h", "b", "l", "r") - +CLASSES = ("w", "d", "c", "t", "s", "h", "b","e", "l", "r") DISTRICTS_PREFIXES = ("мо ", "р-н","городское","лесхоз") COUNTRYSIDE_PREFIXES = ( "г", "п", "д", "гп", "рп", "кп", "пгт", "c", "хутор", " урочище") @@ -18,9 +17,11 @@ STREET_PREFIXES = ( " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе", "лини", "аллея", "мост", " парк", "кольцо", "проезд", "съезд","переулок", "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.") -HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп","дом") -BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение", "корпус") +HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп","дом","дома") +BUILDING_PREFIXES = ("к.", "к ","корп", "корпус") +EDIFICE_PREFIXES=("стр.", "строение") LETTER = ("лит.", "литера", " л.") +PREFIXES = (DISTRICTS_PREFIXES, COUNTRYSIDE_PREFIXES, TERRITORY_PREFIXES, STREET_PREFIXES, HOUSES_PREFIXES, BUILDING_PREFIXES, EDIFICE_PREFIXES,LETTER) def unfold_house_ranges(token: str) -> 
List[str]: @@ -47,12 +48,14 @@ def flatten(arr: Iterable[List[T]]) -> List[T]: def find_room(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"пом\.?", token['obj']): + if re.search(r"\bпом\.?", token['obj']): return "r" return "" def find_litera(token: pd.Series, pre_token: pd.Series) -> str: + if find_room(token, pre_token): + return "" if any_of_in(LETTER, token['obj'].lower()) \ or re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token['obj']): return "l" @@ -62,33 +65,47 @@ def find_litera(token: pd.Series, pre_token: pd.Series) -> str: and not find_countryside(token, pre_token): return "l" return "" - +def find_edifice(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(EDIFICE_PREFIXES, token['obj'].lower()): + return "e" + return "" def find_building(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d", token['obj']): + if re.search(r"\d", token['obj']) and not find_room(token,pre_token): if any_of_in(BUILDING_PREFIXES, token['obj'].lower()) \ - or "b" in pre_token['class'] and not ("h" in token['class']) \ + or "b" in pre_token['class'] and not ("h" in token['class']) and not find_edifice(token,pre_token)\ or re.search(r"к\.* ?\d", token['obj']): return "b" return "" def find_house(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d{1,4}", token['obj']): + if re.search(r"\d{1,4}", token['obj']) and not find_room(token,pre_token): if any_of_in(HOUSES_PREFIXES, token['obj'].lower()): return "h" if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token['obj']): return "h" if ("s" in pre_token['class'] or "h" in pre_token['class'] or "s" in token['class']) \ and not any_of_in(("-я", "-й", "-Я"), token['obj']) \ - and not find_building(token, pre_token): + and not find_building(token, pre_token)\ + and not find_edifice(token,pre_token): return "h" + if find_building(token, pre_token) \ + and not any_of_in(("-я", "-й", "-Я"), token['obj']) \ + and True: + if len(re.findall(r"\d{1,4}", token['obj'])) > 1: + return "h" + if int(re.search(r"\d{1,4}", token['obj']).group()) // 10 >0: + return "h" return "" def find_street(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(STREET_PREFIXES, token['obj'].lower()) \ - or re.search(r"[а-я]+ая", token['obj']): + if any_of_in(STREET_PREFIXES, token['obj'].lower()): + return "s" + if re.search(r"\b[А-Яа-я]{4,}\b", token['obj']) \ + and not any([el in token["obj"].lower() for pr in PREFIXES for el in pr if len(el)>2]) \ + and not ("d" in token["class"] or "t" in token["class"] or "c" in token["class"]): return "s" return "" @@ -103,7 +120,7 @@ def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(COUNTRYSIDE_PREFIXES, token['obj'].lower()) \ and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token['obj']) \ and not find_house(token, pre_token) \ - and not find_street(token, pre_token): + and not any_of_in(STREET_PREFIXES, token['obj'].lower()): return "c" return "" @@ -123,7 +140,9 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: token["class"] += find_street(token, pre_token) token["class"] += find_house(token, pre_token) token["class"] += find_building(token, pre_token) + token["class"] += find_edifice(token, pre_token) token["class"] += find_litera(token, pre_token) + token["class"] += find_room(token, pre_token) if token['class'] == "": token['class'] = "w" if brackets: @@ -138,6 +157,8 @@ def cut_address(ad: pd.Series, cl: str) -> pd.Series: elif ad["class"][-1] == "b": num = re.findall("к{0,1}\.? 
?\d", ad["address"])[-1] ad["address"] = re.sub(num, "", ad["address"]) + elif ad["class"][-1] == "e": + ad["address"] = re.sub(r"cтр\.? ?\d", "", ad["address"]) elif ad["class"][-1] == "l": ad["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$", "", ad["address"]) elif ad["class"][-1] == "r": @@ -157,7 +178,8 @@ def split_address(address: str) -> List[str]: tokens = pd.DataFrame() tokens['obj'] = t - tokens = tokens[tokens["obj"] != ""] + for el in ("", "уг.", "д."): + tokens = tokens[tokens["obj"] != el] tokens.insert(len(tokens.columns), "class", "") res = [] accumulator = pd.Series(data={"address": "", "class": ""}) @@ -195,8 +217,14 @@ def split_address(address: str) -> List[str]: accumulator["address"] = cur_tk["obj"] if cur_tk["class"][0] == "h": - num = re.findall("\d{1,4} ?\/?\d* ?", cur_tk['obj'])[0] - accumulator["address"] = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator["address"]) + num = re.findall("\d{1,4} ?[\/\-]?\d* ?", cur_tk['obj'])[0] + if any_of_in(("-я", "-й", "-Я"), accumulator["address"]): + idx = 1 + else: + idx = 0 + num_ac = re.findall("\d{1,4} ?[\/\-]?\d* ?", accumulator["address"]) + if num_ac: + accumulator["address"] = re.sub(num_ac[idx], num, accumulator["address"]) cur_tk["class"] =cur_tk["class"][1:] if cur_tk["class"] and cur_tk["class"][0] == "b": @@ -208,6 +236,13 @@ def split_address(address: str) -> List[str]: accumulator["address"] = re.sub(r"\d$", num, accumulator["address"]) cur_tk["class"] = cur_tk["class"][1:] + if cur_tk["class"] and cur_tk["class"][0] == "e": + num = re.findall("стр\.? ?\d", cur_tk["obj"].strip())[-1] + accumulator["address"] = re.sub(r"cтр\. ?\d", num, accumulator["address"].strip()) + if num and not "e" in accumulator["class"]: + accumulator["class"] += "e" + cur_tk["class"] = cur_tk["class"][1:] + if cur_tk["class"] and cur_tk["class"][0] == "l": num = re.findall("[А-Яа-я]", cur_tk["obj"].strip())[-1] accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) @@ -217,6 +252,12 @@ def split_address(address: str) -> List[str]: else: if re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", accumulator["address"]): accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) + if cur_tk["class"] and cur_tk["class"][0] == "r": + num = re.findall("пом\. ?\-?\d*\w?", cur_tk["obj"].strip())[-1] + accumulator["address"] = re.sub(r"пом\. ?\d\-?\d*\w?", num, accumulator["address"].strip()) + if num and not "r" in accumulator["class"]: + accumulator["class"] += "r" + cur_tk["class"] = cur_tk["class"][1:] res.extend(unfold_house_ranges(accumulator["address"])) print(res) return res From 5722fc86fbb8964e62a704f332f61008264e0b82 Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 10:44:08 +0300 Subject: [PATCH 07/12] Rewrote split_address as a class AddressSplitter --- parser/__main__.py | 4 +- parser/address.py | 218 ++++++++++++++++++++++++++++++++------------- parser/util.py | 2 +- 3 files changed, 160 insertions(+), 64 deletions(-) diff --git a/parser/__main__.py b/parser/__main__.py index de668c0..5616d83 100644 --- a/parser/__main__.py +++ b/parser/__main__.py @@ -3,11 +3,11 @@ import time import schedule -from . import pipeline +from . 
import pipeline, LenenergoParser def job(): - parser = pipeline() + parser = pipeline(LenenergoParser(file_path="./data.csv")) parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H.%M")}.csv') diff --git a/parser/address.py b/parser/address.py index 73f53e4..4239bf7 100644 --- a/parser/address.py +++ b/parser/address.py @@ -2,6 +2,7 @@ from __future__ import annotations import re from typing import Iterable, List, TypeVar +from collections.abc import Sequence import pandas as pd @@ -146,82 +147,177 @@ def cut_address(ad: pd.Series, cl: str) -> pd.Series: return ad -# TODO: переработать систему из if в нормальный вид -def split_address(address: str) -> List[str]: - if ";" in address: - address = address.replace(";", ",") - if "," in address: - tokens = address.split(",") +def is_nonempty_str(string: str) -> bool: + return string != "" - t = list(map(str.strip, filter(lambda token: token != "", tokens))) - tokens = pd.DataFrame() - tokens['obj'] = t - tokens = tokens[tokens["obj"] != ""] - tokens.insert(len(tokens.columns), "class", "") - res = [] - accumulator = pd.Series(data={"address": "", "class": ""}) +def create_token(obj: str = "", token_class: str = ""): + return pd.Series( + { + "obj": obj, + "class": token_class, + } + ) - for i in range(len(tokens)): - cur_tk = tokens.iloc[i] - if i == 0: - pre_token = pd.Series(data=["", ""], index=['obj', 'class']) - else: - pre_token = tokens.iloc[i - 1] +class AddressSplitter(Sequence): + addresses: list[str] + tokens: list[pd.Series] - cur_tk = address_classification(cur_tk, pre_token) - tokens.iloc[i] = cur_tk - print(tokens.iloc[i]) + def __init__(self, address: str): + self.input = address - if not accumulator["class"]: - accumulator["class"] = cur_tk['class'] - accumulator["address"] = cur_tk["obj"] + self.addresses = self.split() + + if len(self.addresses) == 0: + self.addresses = [address] + + # Sequence abstract methods implementation + + def __getitem__(self, key: int): + if key < len(self.addresses): + return self.addresses[key] + else: + raise IndexError() + + def __len__(self): + return len(self.addresses) + + # Address token class manipulations + + def next_class(self) -> str: + return self.token["class"][0] + + def correct_order(self) -> bool: + prev_class = self.accumulator["class"][-1] + + return ( + CLASSES.index(prev_class) < CLASSES.index(self.next_class()) + and self.accumulator["class"] != "w" + ) + + def next_class_is(self, comparing_class: str) -> bool: + return len(self.token["class"]) > 0 and self.next_class() == comparing_class[0] + + def pop_token_class(self): + self.token["class"] = self.token["class"][1:] + + def has_no_class(self, comparing_class: str) -> bool: + return comparing_class[0] not in self.accumulator["class"] + + def next_is_street_or_upper(self) -> bool: + is_unknown_class = self.accumulator["class"] in ("", "w") + + return ( + CLASSES.index(self.next_class()) <= CLASSES.index("s") or is_unknown_class + ) + + # Accumulator manipulation + + def substitue_house(self) -> str: + num = re.findall(r"\d{1,4} ?\/?\d* ?", self.token["obj"])[0] + + return re.sub(r"\d{1,4} ?\/*\d* ?", num, self.accumulator["address"]) + + def append_building(self, num: int) -> pd.Series: + self.accumulator["class"] += "b" + self.accumulator["address"] += "к." 
+ num + + return self.accumulator + + def substitue_building(self, num: int) -> str: + return re.sub(r"\d$", num, self.accumulator["address"]) + + def insert_building(self): + number = re.findall(r"\d", self.token["obj"])[-1] + + if number and self.has_no_class("building"): + self.accumulator = self.append_building(number) + else: + self.accumulator["address"] = self.substitue_building(number) + + def without_letter(self) -> str: + return re.sub(r"[А-Яа-я]$", "", self.accumulator["address"].strip()) + + def substitue_letter(self, letter: str) -> str: + address_without_letter = self.without_letter() + + return address_without_letter + letter + + def insert_letter(self): + letter = re.findall(r"[А-Яа-я]", self.token["obj"].strip())[-1] + self.accumulator["address"] = self.substitue_letter(letter) + + if letter and self.has_no_class("litera"): + self.accumulator["class"] += "l" + + def has_letter_in(self) -> bool: + return ( + re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"]) + is not None + ) + + # Data preprocessing + + def split_tokens(self) -> list[pd.Series]: + address = self.input.replace(";", ",") + + parts = address.split(",") + parts = map(str.strip, parts) + parts = filter(is_nonempty_str, parts) + + tokens = map(lambda part: create_token(part, ""), parts) + + return list(tokens) + + def split(self): + self.tokens = self.split_tokens() + + result = [] + + self.accumulator = pd.Series({"address": "", "class": ""}) + + prev_token = create_token() + + for cursor in self.tokens: + self.token = address_classification(cursor, prev_token) + prev_token = self.token.copy() + + if self.accumulator["class"] == "": + self.accumulator = self.token.rename({"obj": "address"}) continue - if CLASSES.index(accumulator["class"][-1]) < CLASSES.index(cur_tk["class"][0]) and accumulator["class"]!="w": - accumulator["class"] += cur_tk['class'] - accumulator["address"] += " " + cur_tk["obj"] + if self.correct_order(): + self.accumulator["address"] += " " + self.accumulator += self.token.rename({"obj": "address"}) else: - ad_no_ranges = unfold_house_ranges(accumulator["address"]) - accumulator["address"] = ad_no_ranges[-1] + unfolded_address = unfold_house_ranges(self.accumulator["address"]) + self.accumulator["address"] = unfolded_address[-1] - res.extend(ad_no_ranges) + result.extend(unfolded_address) - accumulator = cut_address(accumulator, cur_tk["class"]) + self.accumulator = cut_address(self.accumulator, self.token["class"]) - if not accumulator["class"] or CLASSES.index(cur_tk["class"][0]) <= CLASSES.index("s") or accumulator["class"]=="w": - accumulator["class"] = cur_tk["class"] - accumulator["address"] = cur_tk["obj"] + if self.next_is_street_or_upper(): + self.accumulator = self.token.rename({"obj": "address"}) - if cur_tk["class"][0] == "h": - num = re.findall("\d{1,4} ?\/?\d* ?", cur_tk['obj'])[0] - accumulator["address"] = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator["address"]) - cur_tk["class"] =cur_tk["class"][1:] + if self.next_class_is("house"): + self.accumulator["address"] = self.substitue_house() + self.pop_token_class() - if cur_tk["class"] and cur_tk["class"][0] == "b": - num = re.findall("\d", cur_tk["obj"])[-1] - if num and not "b" in accumulator["class"]: - accumulator["class"] += "b" - accumulator["address"] += "к." 
+ num - else: - accumulator["address"] = re.sub(r"\d$", num, accumulator["address"]) - cur_tk["class"] = cur_tk["class"][1:] + if self.next_class_is("building"): + self.insert_building() + self.pop_token_class() - if cur_tk["class"] and cur_tk["class"][0] == "l": - num = re.findall("[А-Яа-я]", cur_tk["obj"].strip())[-1] - accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) - accumulator["address"] += num - if num and not "l" in accumulator["class"]: - accumulator["class"] += "l" - else: - if re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", accumulator["address"]): - accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) - res.extend(unfold_house_ranges(accumulator["address"])) - print(res) - return res + if self.next_class_is("letter"): + self.insert_letter() + elif self.has_letter_in(): + self.accumulator["address"] = self.without_letter() + + result.extend(unfold_house_ranges(self.accumulator["address"])) + + return result - return [address] def split_pesoch_res(address: str) -> List[str]: t = re.sub(r",", " ", address) @@ -242,7 +338,7 @@ def process_row(row: pd.Series[str]) -> pd.Series[str]: if row["РЭС"] == "Песочинский РЭС": addresses = split_pesoch_res(row["Улица"]) else: - addresses = split_address(row["Улица"]) + addresses = AddressSplitter(row["Улица"]) row["Улица"] = addresses return row diff --git a/parser/util.py b/parser/util.py index 983c9b8..9fad768 100644 --- a/parser/util.py +++ b/parser/util.py @@ -10,7 +10,7 @@ from . import ( def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser: if parser is None: - parser = LenenergoParser(file_path = r"C:\Users\Юля\PycharmProjects\machine_learning\lenengro_parser\data_Rosseti.csv") + parser = LenenergoParser(parser) print(parser) From e6af86703e8a84953d75b02530dee5f735e783c1 Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 12:24:49 +0300 Subject: [PATCH 08/12] Applied formatter --- parser/__main__.py | 2 +- parser/address.py | 220 +++++++++++++++++++++++++++++++++------------ 2 files changed, 162 insertions(+), 60 deletions(-) diff --git a/parser/__main__.py b/parser/__main__.py index 5616d83..b9de621 100644 --- a/parser/__main__.py +++ b/parser/__main__.py @@ -3,7 +3,7 @@ import time import schedule -from . import pipeline, LenenergoParser +from . 
import LenenergoParser, pipeline def job(): diff --git a/parser/address.py b/parser/address.py index 28769f7..d9111dd 100644 --- a/parser/address.py +++ b/parser/address.py @@ -1,28 +1,90 @@ from __future__ import annotations import re -from typing import Iterable, List, TypeVar from collections.abc import Sequence +from typing import Iterable, List, TypeVar import pandas as pd T = TypeVar("T") CLASSES = ("w", "d", "c", "t", "s", "h", "b", "e", "l", "r") -DISTRICTS_PREFIXES = ("мо ", "р-н","городское","лесхоз") +DISTRICTS_PREFIXES = ("мо ", "р-н", "городское", "лесхоз") COUNTRYSIDE_PREFIXES = ( - "г", "п", "д", "гп", "рп", "кп", "пгт", "c", "хутор", " урочище") + "г", + "п", + "д", + "гп", + "рп", + "кп", + "пгт", + "c", + "хутор", + " урочище", +) TERRITORY_PREFIXES = ( -"тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ", "т/б ", "садоводство", "массив", "хозя", "сад-во") + "тер.", + " тер", + "снт ", + "ст ", + "дск ", + "днп ", + "дпк ", + "нп ", + "пдк ", + "т/б ", + "садоводство", + "массив", + "хозя", + "сад-во", +) STREET_PREFIXES = ( - " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе", "лини", "аллея", - "мост", " парк", "кольцо", "проезд", "съезд","переулок", - "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.") -HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп","дом","дома") -BUILDING_PREFIXES = ("к.", "к ","корп", "корпус") -EDIFICE_PREFIXES=("стр.", "строение") + " ул", + " бул", + " пр", + " ш", + " пер", + " дор", + " маг", + " наб", + " пл", + " просп", + " туп", + "шоссе", + "лини", + "аллея", + "мост", + " парк", + "кольцо", + "проезд", + "съезд", + "переулок", + "ул.", + "бул.", + "пр.", + "ш.", + "пер.", + "дор.", + "маг.", + "наб.", + "пл.", + "просп.", + "туп.", +) +HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом", "дома") +BUILDING_PREFIXES = ("к.", "к ", "корп", "корпус") +EDIFICE_PREFIXES = ("стр.", "строение") LETTER = ("лит.", "литера", " л.") -PREFIXES = (DISTRICTS_PREFIXES, COUNTRYSIDE_PREFIXES, TERRITORY_PREFIXES, STREET_PREFIXES, HOUSES_PREFIXES, BUILDING_PREFIXES, EDIFICE_PREFIXES,LETTER) +PREFIXES = ( + DISTRICTS_PREFIXES, + COUNTRYSIDE_PREFIXES, + TERRITORY_PREFIXES, + STREET_PREFIXES, + HOUSES_PREFIXES, + BUILDING_PREFIXES, + EDIFICE_PREFIXES, + LETTER, +) def unfold_house_ranges(token: str) -> List[str]: @@ -33,13 +95,17 @@ def unfold_house_ranges(token: str) -> List[str]: a, b = int(a), int(b) if b > a: - addresses += [re.sub(r"([\d]+-[\d]+)", number, token) for number in map(str, range(a, b + 1))] + addresses += [ + re.sub(r"([\d]+-[\d]+)", number, token) + for number in map(str, range(a, b + 1)) + ] else: token = token.replace("-", "/") if not addresses: addresses.append(token) return addresses + def any_of_in(substrings: Iterable[str], string: str) -> bool: return any(map(lambda substring: substring in string, substrings)) @@ -49,7 +115,7 @@ def flatten(arr: Iterable[List[T]]) -> List[T]: def find_room(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\bпом\.?", token['obj']): + if re.search(r"\bпом\.?", token["obj"]): return "r" return "" @@ -57,80 +123,109 @@ def find_room(token: pd.Series, pre_token: pd.Series) -> str: def find_litera(token: pd.Series, pre_token: pd.Series) -> str: if find_room(token, pre_token): return "" - if any_of_in(LETTER, token['obj'].lower()) \ - or re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token['obj']): + if any_of_in(LETTER, token["obj"].lower()) or re.search( + r"\d{1,3}([А-Я]|[а-я])( |$)", 
token["obj"] + ): return "l" - if (re.search(r"\b([А-Я]|[а-я]){1}$", token['obj']) \ - and ("l" in pre_token['class'] or "h" in pre_token['class'])) \ - and (" ш" not in token["obj"]) \ - and not find_countryside(token, pre_token): + if ( + ( + re.search(r"\b([А-Я]|[а-я]){1}$", token["obj"]) + and ("l" in pre_token["class"] or "h" in pre_token["class"]) + ) + and (" ш" not in token["obj"]) + and not find_countryside(token, pre_token) + ): return "l" return "" + + def find_edifice(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(EDIFICE_PREFIXES, token['obj'].lower()): + if any_of_in(EDIFICE_PREFIXES, token["obj"].lower()): return "e" return "" + def find_building(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d", token['obj']) and not find_room(token,pre_token): - if any_of_in(BUILDING_PREFIXES, token['obj'].lower()) \ - or "b" in pre_token['class'] and ("h" not in token['class']) and not find_edifice(token,pre_token)\ - or re.search(r"к\.* ?\d", token['obj']): + if re.search(r"\d", token["obj"]) and not find_room(token, pre_token): + if ( + any_of_in(BUILDING_PREFIXES, token["obj"].lower()) + or "b" in pre_token["class"] + and ("h" not in token["class"]) + and not find_edifice(token, pre_token) + or re.search(r"к\.* ?\d", token["obj"]) + ): return "b" return "" def find_house(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d{1,4}", token['obj']) and not find_room(token,pre_token): - if any_of_in(HOUSES_PREFIXES, token['obj'].lower()): + if re.search(r"\d{1,4}", token["obj"]) and not find_room(token, pre_token): + if any_of_in(HOUSES_PREFIXES, token["obj"].lower()): return "h" - if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token['obj']): + if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token["obj"]): return "h" - if ("s" in pre_token['class'] or "h" in pre_token['class'] or "s" in token['class']) \ - and not any_of_in(("-я", "-й", "-Я"), token['obj']) \ - and not find_building(token, pre_token)\ - and not find_edifice(token,pre_token): + if ( + ( + "s" in pre_token["class"] + or "h" in pre_token["class"] + or "s" in token["class"] + ) + and not any_of_in(("-я", "-й", "-Я"), token["obj"]) + and not find_building(token, pre_token) + and not find_edifice(token, pre_token) + ): return "h" - if find_building(token, pre_token) \ - and not any_of_in(("-я", "-й", "-Я"), token['obj']) \ - and True: - if len(re.findall(r"\d{1,4}", token['obj'])) > 1: + if ( + find_building(token, pre_token) + and not any_of_in(("-я", "-й", "-Я"), token["obj"]) + and True + ): + if len(re.findall(r"\d{1,4}", token["obj"])) > 1: return "h" - if int(re.search(r"\d{1,4}", token['obj']).group()) // 10 >0: + if int(re.search(r"\d{1,4}", token["obj"]).group()) // 10 > 0: return "h" return "" def find_street(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(STREET_PREFIXES, token['obj'].lower()): + if any_of_in(STREET_PREFIXES, token["obj"].lower()): return "s" - if re.search(r"\b[А-Яа-я]{4,}\b", token['obj']) \ - and not any([el in token["obj"].lower() for pr in PREFIXES for el in pr if len(el)>2]) \ - and not ("d" in token["class"] or "t" in token["class"] or "c" in token["class"]): + if ( + re.search(r"\b[А-Яа-я]{4,}\b", token["obj"]) + and not any( + [el in token["obj"].lower() for pr in PREFIXES for el in pr if len(el) > 2] + ) + and not ( + "d" in token["class"] or "t" in token["class"] or "c" in token["class"] + ) + ): return "s" return "" def find_territory(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(TERRITORY_PREFIXES, 
token['obj'].lower()): + if any_of_in(TERRITORY_PREFIXES, token["obj"].lower()): return "t" return "" def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(COUNTRYSIDE_PREFIXES, token['obj'].lower()) \ - and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token['obj']) \ - and not find_house(token, pre_token) \ - and not any_of_in(STREET_PREFIXES, token['obj'].lower()): + if ( + any_of_in(COUNTRYSIDE_PREFIXES, token["obj"].lower()) + and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token["obj"]) + and not find_house(token, pre_token) + and not any_of_in(STREET_PREFIXES, token["obj"].lower()) + ): return "c" return "" def find_district(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(DISTRICTS_PREFIXES, token['obj'].lower()): + if any_of_in(DISTRICTS_PREFIXES, token["obj"].lower()): return "d" return "" + def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: brackets = re.search(r"\(.+\)", token["obj"]) if brackets: @@ -144,17 +239,19 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: token["class"] += find_edifice(token, pre_token) token["class"] += find_litera(token, pre_token) token["class"] += find_room(token, pre_token) - if token['class'] == "": - token['class'] = "w" + if token["class"] == "": + token["class"] = "w" if brackets: token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) return token + def cut_address(ad: pd.Series, cl: str) -> pd.Series: while ad["class"] and CLASSES.index(ad["class"][-1]) > CLASSES.index(cl[0]): if ad["class"][-1] == "h": - ad["address"] = re.sub(r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", - ad["address"].lower()) + ad["address"] = re.sub( + r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", ad["address"].lower() + ) elif ad["class"][-1] == "b": num = re.findall(r"к{0,1}\.? ?\d", ad["address"])[-1] ad["address"] = re.sub(num, "", ad["address"]) @@ -239,7 +336,7 @@ class AddressSplitter(Sequence): def substitue_house(self) -> str: house_regex = re.compile(r"\d{1,4} ?[\/\-]?\d* ?") - number = house_regex.findall(self.token['obj'])[0] + number = house_regex.findall(self.token["obj"])[0] if self.has_numbered_street(): house_number_index = 1 @@ -249,7 +346,11 @@ class AddressSplitter(Sequence): number_in_accumulator = house_regex.findall(self.accumulator["address"]) if number_in_accumulator: - return re.sub(number_in_accumulator[house_number_index], number, self.accumulator["address"]) + return re.sub( + number_in_accumulator[house_number_index], + number, + self.accumulator["address"], + ) else: return self.accumulator["address"] @@ -303,19 +404,19 @@ class AddressSplitter(Sequence): self.accumulator["class"] += "l" def has_letter_in(self) -> bool: - return ( - re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"]) - ) + return re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"]) # Room def substitue_room(self, number: int) -> str: - return re.sub(r"пом\. ?\d\-?\d*\w?", number, self.accumulator["address"].strip()) + return re.sub( + r"пом\. ?\d\-?\d*\w?", number, self.accumulator["address"].strip() + ) def insert_room(self): number = re.findall("пом\. 
?\-?\d*\w?", self.token["obj"])[-1] self.accumulator["address"] = self.substitue_room(number) - + if number and self.has_no_class("room"): self.accumulator["class"] += "r" @@ -393,12 +494,13 @@ def split_pesoch_res(address: str) -> List[str]: t = re.sub(r",", " ", address) t = re.split(r"(Санкт-Петербург|Ленинградская обл|Л\.О)", t) t = list(map(str.strip, filter(lambda token: token != "", t))) - tokens = [t[i] + " " + t[i+1] for i in range(0, len(t)-1, 2)] + tokens = [t[i] + " " + t[i + 1] for i in range(0, len(t) - 1, 2)] if tokens: return list(set(tokens)) return [address] + def process_row(row: pd.Series[str]) -> pd.Series[str]: row = row.copy() @@ -417,4 +519,4 @@ def process_row(row: pd.Series[str]) -> pd.Series[str]: def split_addresses(df: pd.DataFrame) -> pd.DataFrame: merged_df = df.apply(process_row, axis=1).reset_index() - return merged_df.explode("Улица", ignore_index=True) \ No newline at end of file + return merged_df.explode("Улица", ignore_index=True) From 06f08d493315e1e103a6420cfb4ab1bf7bd0aedb Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 12:27:14 +0300 Subject: [PATCH 09/12] Renamed pipeline file --- parser/__init__.py | 2 +- parser/{util.py => pipeline.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename parser/{util.py => pipeline.py} (100%) diff --git a/parser/__init__.py b/parser/__init__.py index cd4b8b3..b8ce701 100644 --- a/parser/__init__.py +++ b/parser/__init__.py @@ -7,6 +7,7 @@ from .building_id import ( get_building_id, ) from .lenenergo import LenenergoParser +from .pipeline import pipeline from .preprocess import ( COL_NS, ICOL_NS, @@ -14,7 +15,6 @@ from .preprocess import ( preprocess_df, preprocess_read_df, ) -from .util import pipeline __all__ = ( "async_fetch_building_id", diff --git a/parser/util.py b/parser/pipeline.py similarity index 100% rename from parser/util.py rename to parser/pipeline.py From cc2029802b07d5689028db185a319733d1d6fdd8 Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 14:24:39 +0300 Subject: [PATCH 10/12] Split address.py to module --- parser/address.py | 522 ----------------------------------- parser/address/__init__.py | 12 + parser/address/classifier.py | 215 +++++++++++++++ parser/address/splitter.py | 292 ++++++++++++++++++++ parser/address/utils.py | 45 +++ 5 files changed, 564 insertions(+), 522 deletions(-) delete mode 100644 parser/address.py create mode 100644 parser/address/__init__.py create mode 100644 parser/address/classifier.py create mode 100644 parser/address/splitter.py create mode 100644 parser/address/utils.py diff --git a/parser/address.py b/parser/address.py deleted file mode 100644 index d9111dd..0000000 --- a/parser/address.py +++ /dev/null @@ -1,522 +0,0 @@ -from __future__ import annotations - -import re -from collections.abc import Sequence -from typing import Iterable, List, TypeVar - -import pandas as pd - -T = TypeVar("T") - -CLASSES = ("w", "d", "c", "t", "s", "h", "b", "e", "l", "r") -DISTRICTS_PREFIXES = ("мо ", "р-н", "городское", "лесхоз") -COUNTRYSIDE_PREFIXES = ( - "г", - "п", - "д", - "гп", - "рп", - "кп", - "пгт", - "c", - "хутор", - " урочище", -) -TERRITORY_PREFIXES = ( - "тер.", - " тер", - "снт ", - "ст ", - "дск ", - "днп ", - "дпк ", - "нп ", - "пдк ", - "т/б ", - "садоводство", - "массив", - "хозя", - "сад-во", -) -STREET_PREFIXES = ( - " ул", - " бул", - " пр", - " ш", - " пер", - " дор", - " маг", - " наб", - " пл", - " просп", - " туп", - "шоссе", - "лини", - "аллея", - "мост", - " парк", - "кольцо", - "проезд", - "съезд", - "переулок", - "ул.", 
- "бул.", - "пр.", - "ш.", - "пер.", - "дор.", - "маг.", - "наб.", - "пл.", - "просп.", - "туп.", -) -HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом", "дома") -BUILDING_PREFIXES = ("к.", "к ", "корп", "корпус") -EDIFICE_PREFIXES = ("стр.", "строение") -LETTER = ("лит.", "литера", " л.") -PREFIXES = ( - DISTRICTS_PREFIXES, - COUNTRYSIDE_PREFIXES, - TERRITORY_PREFIXES, - STREET_PREFIXES, - HOUSES_PREFIXES, - BUILDING_PREFIXES, - EDIFICE_PREFIXES, - LETTER, -) - - -def unfold_house_ranges(token: str) -> List[str]: - addresses = [] - pairs_strings = re.findall(r"([\d]+-[\d]+)", token) - for pair_string in pairs_strings: - a, b = pair_string.split("-") - a, b = int(a), int(b) - - if b > a: - addresses += [ - re.sub(r"([\d]+-[\d]+)", number, token) - for number in map(str, range(a, b + 1)) - ] - else: - token = token.replace("-", "/") - if not addresses: - addresses.append(token) - return addresses - - -def any_of_in(substrings: Iterable[str], string: str) -> bool: - return any(map(lambda substring: substring in string, substrings)) - - -def flatten(arr: Iterable[List[T]]) -> List[T]: - return sum(arr, []) - - -def find_room(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\bпом\.?", token["obj"]): - return "r" - return "" - - -def find_litera(token: pd.Series, pre_token: pd.Series) -> str: - if find_room(token, pre_token): - return "" - if any_of_in(LETTER, token["obj"].lower()) or re.search( - r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"] - ): - return "l" - if ( - ( - re.search(r"\b([А-Я]|[а-я]){1}$", token["obj"]) - and ("l" in pre_token["class"] or "h" in pre_token["class"]) - ) - and (" ш" not in token["obj"]) - and not find_countryside(token, pre_token) - ): - return "l" - return "" - - -def find_edifice(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(EDIFICE_PREFIXES, token["obj"].lower()): - return "e" - return "" - - -def find_building(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d", token["obj"]) and not find_room(token, pre_token): - if ( - any_of_in(BUILDING_PREFIXES, token["obj"].lower()) - or "b" in pre_token["class"] - and ("h" not in token["class"]) - and not find_edifice(token, pre_token) - or re.search(r"к\.* ?\d", token["obj"]) - ): - return "b" - return "" - - -def find_house(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d{1,4}", token["obj"]) and not find_room(token, pre_token): - if any_of_in(HOUSES_PREFIXES, token["obj"].lower()): - return "h" - if re.search(r"(д|д\.) 
?\d{1,4} ?\/*\d* ?", token["obj"]): - return "h" - if ( - ( - "s" in pre_token["class"] - or "h" in pre_token["class"] - or "s" in token["class"] - ) - and not any_of_in(("-я", "-й", "-Я"), token["obj"]) - and not find_building(token, pre_token) - and not find_edifice(token, pre_token) - ): - return "h" - if ( - find_building(token, pre_token) - and not any_of_in(("-я", "-й", "-Я"), token["obj"]) - and True - ): - if len(re.findall(r"\d{1,4}", token["obj"])) > 1: - return "h" - if int(re.search(r"\d{1,4}", token["obj"]).group()) // 10 > 0: - return "h" - return "" - - -def find_street(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(STREET_PREFIXES, token["obj"].lower()): - return "s" - if ( - re.search(r"\b[А-Яа-я]{4,}\b", token["obj"]) - and not any( - [el in token["obj"].lower() for pr in PREFIXES for el in pr if len(el) > 2] - ) - and not ( - "d" in token["class"] or "t" in token["class"] or "c" in token["class"] - ) - ): - return "s" - return "" - - -def find_territory(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(TERRITORY_PREFIXES, token["obj"].lower()): - return "t" - return "" - - -def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: - if ( - any_of_in(COUNTRYSIDE_PREFIXES, token["obj"].lower()) - and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token["obj"]) - and not find_house(token, pre_token) - and not any_of_in(STREET_PREFIXES, token["obj"].lower()) - ): - return "c" - return "" - - -def find_district(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(DISTRICTS_PREFIXES, token["obj"].lower()): - return "d" - return "" - - -def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: - brackets = re.search(r"\(.+\)", token["obj"]) - if brackets: - token["obj"] = re.sub(r"\(.+\)", "()", token["obj"]) - token["class"] += find_district(token, pre_token) - token["class"] += find_countryside(token, pre_token) - token["class"] += find_territory(token, pre_token) - token["class"] += find_street(token, pre_token) - token["class"] += find_house(token, pre_token) - token["class"] += find_building(token, pre_token) - token["class"] += find_edifice(token, pre_token) - token["class"] += find_litera(token, pre_token) - token["class"] += find_room(token, pre_token) - if token["class"] == "": - token["class"] = "w" - if brackets: - token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) - return token - - -def cut_address(ad: pd.Series, cl: str) -> pd.Series: - while ad["class"] and CLASSES.index(ad["class"][-1]) > CLASSES.index(cl[0]): - if ad["class"][-1] == "h": - ad["address"] = re.sub( - r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", ad["address"].lower() - ) - elif ad["class"][-1] == "b": - num = re.findall(r"к{0,1}\.? ?\d", ad["address"])[-1] - ad["address"] = re.sub(num, "", ad["address"]) - elif ad["class"][-1] == "e": - ad["address"] = re.sub(r"cтр\.? ?\d", "", ad["address"]) - elif ad["class"][-1] == "l": - ad["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$", "", ad["address"]) - elif ad["class"][-1] == "r": - ad["address"] = re.sub(r"пом\.? 
?\d+", "", ad["address"]) - ad["class"] = ad["class"][:-1] - return ad - - -def is_valid_token(string: str) -> bool: - return string not in ("", "уг.", "д.") - - -def create_token(obj: str = "", token_class: str = ""): - return pd.Series( - { - "obj": obj, - "class": token_class, - } - ) - - -class AddressSplitter(Sequence): - def __init__(self, address: str): - self.input = address - - self.addresses = self.split() - - ## Sequence abstract methods implementation - - def __getitem__(self, key: int): - if key < len(self.addresses): - return self.addresses[key] - else: - raise IndexError() - - def __len__(self): - return len(self.addresses) - - ## Address token class manipulations - - def next_class(self) -> str: - return self.token["class"][0] - - def correct_order(self) -> bool: - prev_class = self.accumulator["class"][-1] - - return ( - CLASSES.index(prev_class) < CLASSES.index(self.next_class()) - and self.accumulator["class"] != "w" - ) - - def next_class_is(self, comparing_class: str) -> bool: - return len(self.token["class"]) > 0 and self.next_class() == comparing_class[0] - - def has_no_class(self, comparing_class: str) -> bool: - return comparing_class[0] not in self.accumulator["class"] - - def pop_token_class(self): - self.token["class"] = self.token["class"][1:] - - ## Accumulator constrains - - def next_is_street_or_upper(self) -> bool: - is_unknown_class = self.accumulator["class"] in ("", "w") - - return ( - CLASSES.index(self.next_class()) <= CLASSES.index("s") or is_unknown_class - ) - - def has_numbered_street(self) -> bool: - return any_of_in(("-я", "-й", "-Я"), self.accumulator["address"]) - - ## Accumulator manipulation - - # House - - def substitue_house(self) -> str: - house_regex = re.compile(r"\d{1,4} ?[\/\-]?\d* ?") - - number = house_regex.findall(self.token["obj"])[0] - - if self.has_numbered_street(): - house_number_index = 1 - else: - house_number_index = 0 - - number_in_accumulator = house_regex.findall(self.accumulator["address"]) - - if number_in_accumulator: - return re.sub( - number_in_accumulator[house_number_index], - number, - self.accumulator["address"], - ) - else: - return self.accumulator["address"] - - # Building - - def append_building(self, number: int) -> pd.Series: - self.accumulator["class"] += "b" - self.accumulator["address"] += "к." + number - - return self.accumulator - - def substitue_building(self, number: int) -> str: - return re.sub(r"\d$", number, self.accumulator["address"]) - - def insert_building(self): - number = re.findall(r"\d", self.token["obj"])[-1] - - if number and self.has_no_class("building"): - self.accumulator = self.append_building(number) - else: - self.accumulator["address"] = self.substitue_building(number) - - # Edifice - - def substitue_edifice(self, number: int) -> str: - return re.sub(r"cтр\. ?\d", number, self.accumulator["address"].strip()) - - def insert_edifice(self): - number = re.findall("стр\.? 
?\d", self.token["obj"])[-1] - - self.accumulator["address"] = self.substitue_edifice(number) - - if number and self.has_no_class("edifice"): - self.accumulator["class"] += "e" - - # Letter - - def without_letter(self) -> str: - return re.sub(r"[А-Яа-я]$", "", self.accumulator["address"].strip()) - - def substitue_letter(self, letter: str) -> str: - address_without_letter = self.without_letter() - - return address_without_letter + letter - - def insert_letter(self): - letter = re.findall(r"[А-Яа-я]", self.token["obj"])[-1] - self.accumulator["address"] = self.substitue_letter(letter) - - if letter and self.has_no_class("litera"): - self.accumulator["class"] += "l" - - def has_letter_in(self) -> bool: - return re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"]) - - # Room - - def substitue_room(self, number: int) -> str: - return re.sub( - r"пом\. ?\d\-?\d*\w?", number, self.accumulator["address"].strip() - ) - - def insert_room(self): - number = re.findall("пом\. ?\-?\d*\w?", self.token["obj"])[-1] - self.accumulator["address"] = self.substitue_room(number) - - if number and self.has_no_class("room"): - self.accumulator["class"] += "r" - - ## Data preprocessing - - def split_tokens(self) -> list[pd.Series]: - address = self.input.replace(";", ",") - - parts = address.split(",") - parts = map(str.strip, parts) - parts = filter(is_valid_token, parts) - - tokens = map(lambda part: create_token(part, ""), parts) - - return list(tokens) - - def split(self): - self.tokens = self.split_tokens() - - result = [] - - self.accumulator = pd.Series({"address": "", "class": ""}) - - prev_token = create_token() - - for cursor in self.tokens: - self.token = address_classification(cursor, prev_token) - prev_token = self.token.copy() - - if self.accumulator["class"] == "": - self.accumulator = self.token.rename({"obj": "address"}) - continue - - if self.correct_order(): - self.accumulator["address"] += " " - self.accumulator += self.token.rename({"obj": "address"}) - else: - unfolded_address = unfold_house_ranges(self.accumulator["address"]) - self.accumulator["address"] = unfolded_address[-1] - - result.extend(unfolded_address) - - self.accumulator = cut_address(self.accumulator, self.token["class"]) - - if self.next_is_street_or_upper(): - self.accumulator = self.token.rename({"obj": "address"}) - - if self.next_class_is("house"): - self.accumulator["address"] = self.substitue_house() - self.pop_token_class() - - if self.next_class_is("building"): - self.insert_building() - self.pop_token_class() - - if self.next_class_is("edifice"): - self.insert_edifice() - self.pop_token_class() - - if self.next_class_is("letter"): - self.insert_letter() - elif self.has_letter_in(): - self.accumulator["address"] = self.without_letter() - - if self.next_class_is("room"): - self.insert_room() - self.pop_token_class() - - result.extend(unfold_house_ranges(self.accumulator["address"])) - - return result - - -def split_pesoch_res(address: str) -> List[str]: - t = re.sub(r",", " ", address) - t = re.split(r"(Санкт-Петербург|Ленинградская обл|Л\.О)", t) - t = list(map(str.strip, filter(lambda token: token != "", t))) - tokens = [t[i] + " " + t[i + 1] for i in range(0, len(t) - 1, 2)] - - if tokens: - return list(set(tokens)) - return [address] - - -def process_row(row: pd.Series[str]) -> pd.Series[str]: - row = row.copy() - - if pd.isnull(row["Улица"]): - row["Улица"] = [None] - else: - if row["РЭС"] == "Песочинский РЭС": - addresses = split_pesoch_res(row["Улица"]) - else: - addresses = 
AddressSplitter(row["Улица"]) - row["Улица"] = addresses - - return row - - -def split_addresses(df: pd.DataFrame) -> pd.DataFrame: - merged_df = df.apply(process_row, axis=1).reset_index() - - return merged_df.explode("Улица", ignore_index=True) diff --git a/parser/address/__init__.py b/parser/address/__init__.py new file mode 100644 index 0000000..370717b --- /dev/null +++ b/parser/address/__init__.py @@ -0,0 +1,12 @@ +from .classifier import CLASSES, address_classification +from .splitter import AddressSplitter, split_addresses, split_pesoch_res +from .utils import create_token + +__all__ = ( + "address_classification", + "AddressSplitter", + "CLASSES", + "create_token", + "split_addresses", + "split_pesoch_res", +) diff --git a/parser/address/classifier.py b/parser/address/classifier.py new file mode 100644 index 0000000..a333ace --- /dev/null +++ b/parser/address/classifier.py @@ -0,0 +1,215 @@ +import re + +import pandas as pd + +from .utils import any_of_in + +CLASSES = ("w", "d", "c", "t", "s", "h", "b", "e", "l", "r") +DISTRICTS_PREFIXES = ("мо ", "р-н", "городское", "лесхоз") +COUNTRYSIDE_PREFIXES = ( + "г", + "п", + "д", + "гп", + "рп", + "кп", + "пгт", + "c", + "хутор", + " урочище", +) +TERRITORY_PREFIXES = ( + "тер.", + " тер", + "снт ", + "ст ", + "дск ", + "днп ", + "дпк ", + "нп ", + "пдк ", + "т/б ", + "садоводство", + "массив", + "хозя", + "сад-во", +) +STREET_PREFIXES = ( + " ул", + " бул", + " пр", + " ш", + " пер", + " дор", + " маг", + " наб", + " пл", + " просп", + " туп", + "шоссе", + "лини", + "аллея", + "мост", + " парк", + "кольцо", + "проезд", + "съезд", + "переулок", + "ул.", + "бул.", + "пр.", + "ш.", + "пер.", + "дор.", + "маг.", + "наб.", + "пл.", + "просп.", + "туп.", +) +HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом", "дома") +BUILDING_PREFIXES = ("к.", "к ", "корп", "корпус") +EDIFICE_PREFIXES = ("стр.", "строение") +LETTER = ("лит.", "литера", " л.") +PREFIXES = ( + DISTRICTS_PREFIXES, + COUNTRYSIDE_PREFIXES, + TERRITORY_PREFIXES, + STREET_PREFIXES, + HOUSES_PREFIXES, + BUILDING_PREFIXES, + EDIFICE_PREFIXES, + LETTER, +) + + +def find_room(token: pd.Series, pre_token: pd.Series) -> str: + if re.search(r"\bпом\.?", token["obj"]): + return "r" + return "" + + +def find_litera(token: pd.Series, pre_token: pd.Series) -> str: + if find_room(token, pre_token): + return "" + if any_of_in(LETTER, token["obj"].lower()) or re.search( + r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"] + ): + return "l" + if ( + ( + re.search(r"\b([А-Я]|[а-я]){1}$", token["obj"]) + and ("l" in pre_token["class"] or "h" in pre_token["class"]) + ) + and (" ш" not in token["obj"]) + and not find_countryside(token, pre_token) + ): + return "l" + return "" + + +def find_edifice(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(EDIFICE_PREFIXES, token["obj"].lower()): + return "e" + return "" + + +def find_building(token: pd.Series, pre_token: pd.Series) -> str: + if re.search(r"\d", token["obj"]) and not find_room(token, pre_token): + if ( + any_of_in(BUILDING_PREFIXES, token["obj"].lower()) + or "b" in pre_token["class"] + and ("h" not in token["class"]) + and not find_edifice(token, pre_token) + or re.search(r"к\.* ?\d", token["obj"]) + ): + return "b" + return "" + + +def find_house(token: pd.Series, pre_token: pd.Series) -> str: + if re.search(r"\d{1,4}", token["obj"]) and not find_room(token, pre_token): + if any_of_in(HOUSES_PREFIXES, token["obj"].lower()): + return "h" + if re.search(r"(д|д\.) 
?\d{1,4} ?\/*\d* ?", token["obj"]): + return "h" + if ( + ( + "s" in pre_token["class"] + or "h" in pre_token["class"] + or "s" in token["class"] + ) + and not any_of_in(("-я", "-й", "-Я"), token["obj"]) + and not find_building(token, pre_token) + and not find_edifice(token, pre_token) + ): + return "h" + if ( + find_building(token, pre_token) + and not any_of_in(("-я", "-й", "-Я"), token["obj"]) + and True + ): + if len(re.findall(r"\d{1,4}", token["obj"])) > 1: + return "h" + if int(re.search(r"\d{1,4}", token["obj"]).group()) // 10 > 0: + return "h" + return "" + + +def find_street(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(STREET_PREFIXES, token["obj"].lower()): + return "s" + if ( + re.search(r"\b[А-Яа-я]{4,}\b", token["obj"]) + and not any( + [el in token["obj"].lower() for pr in PREFIXES for el in pr if len(el) > 2] + ) + and not ( + "d" in token["class"] or "t" in token["class"] or "c" in token["class"] + ) + ): + return "s" + return "" + + +def find_territory(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(TERRITORY_PREFIXES, token["obj"].lower()): + return "t" + return "" + + +def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: + if ( + any_of_in(COUNTRYSIDE_PREFIXES, token["obj"].lower()) + and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token["obj"]) + and not find_house(token, pre_token) + and not any_of_in(STREET_PREFIXES, token["obj"].lower()) + ): + return "c" + return "" + + +def find_district(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(DISTRICTS_PREFIXES, token["obj"].lower()): + return "d" + return "" + + +def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: + brackets = re.search(r"\(.+\)", token["obj"]) + if brackets: + token["obj"] = re.sub(r"\(.+\)", "()", token["obj"]) + token["class"] += find_district(token, pre_token) + token["class"] += find_countryside(token, pre_token) + token["class"] += find_territory(token, pre_token) + token["class"] += find_street(token, pre_token) + token["class"] += find_house(token, pre_token) + token["class"] += find_building(token, pre_token) + token["class"] += find_edifice(token, pre_token) + token["class"] += find_litera(token, pre_token) + token["class"] += find_room(token, pre_token) + if token["class"] == "": + token["class"] = "w" + if brackets: + token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) + return token diff --git a/parser/address/splitter.py b/parser/address/splitter.py new file mode 100644 index 0000000..8cc4ffc --- /dev/null +++ b/parser/address/splitter.py @@ -0,0 +1,292 @@ +from __future__ import annotations + +import re +from collections.abc import Sequence + +import pandas as pd + +from .classifier import CLASSES, address_classification +from .utils import any_of_in, create_token, is_valid_token, unfold_house_ranges + + +class AddressSplitter(Sequence): + def __init__(self, address: str): + self.input = address + + self.addresses = self.split() + + # Sequence abstract methods implementation + + def __getitem__(self, key: int): + if key < len(self.addresses): + return self.addresses[key] + else: + raise IndexError() + + def __len__(self): + return len(self.addresses) + + # Address token class manipulations + + def next_class(self) -> str: + return self.token["class"][0] + + def prev_class(self) -> str: + return self.accumulator["class"][-1] + + def correct_order(self) -> bool: + return ( + len(self.accumulator["class"]) > 0 + and CLASSES.index(self.prev_class()) < CLASSES.index(self.next_class()) + and 
self.accumulator["class"] != "w" + ) + + def next_class_is(self, comparing_class: str) -> bool: + return len(self.token["class"]) > 0 and self.next_class() == comparing_class[0] + + def has_no_class(self, comparing_class: str) -> bool: + return comparing_class[0] not in self.accumulator["class"] + + def pop_token_class(self): + self.token["class"] = self.token["class"][1:] + + # Accumulator constrains + + def next_is_street_or_upper(self) -> bool: + is_unknown_class = self.accumulator["class"] in ("", "w") + + return ( + CLASSES.index(self.next_class()) <= CLASSES.index("s") or is_unknown_class + ) + + def has_numbered_street(self) -> bool: + return any_of_in(("-я", "-й", "-Я"), self.accumulator["address"]) + + # Accumulator manipulation + + ## House + + def substitue_house(self) -> str: + house_regex = re.compile(r"\d{1,4} ?[\/\-]?\d* ?") + + number = house_regex.findall(self.token["obj"])[0] + + if self.has_numbered_street(): + house_number_index = 1 + else: + house_number_index = 0 + + number_in_accumulator = house_regex.findall(self.accumulator["address"]) + + if number_in_accumulator: + return re.sub( + number_in_accumulator[house_number_index], + number, + self.accumulator["address"], + ) + else: + return self.accumulator["address"] + + ## Building + + def append_building(self, number: int) -> pd.Series: + self.accumulator["class"] += "b" + self.accumulator["address"] += "к." + number + + return self.accumulator + + def substitue_building(self, number: int) -> str: + return re.sub(r"\d$", number, self.accumulator["address"]) + + def insert_building(self): + number = re.findall(r"\d", self.token["obj"])[-1] + + if number and self.has_no_class("building"): + self.accumulator = self.append_building(number) + else: + self.accumulator["address"] = self.substitue_building(number) + + ## Edifice + + def substitue_edifice(self, number: int) -> str: + return re.sub(r"cтр\. ?\d", number, self.accumulator["address"].strip()) + + def insert_edifice(self): + number = re.findall("стр\.? ?\d", self.token["obj"])[-1] + + self.accumulator["address"] = self.substitue_edifice(number) + + if number and self.has_no_class("edifice"): + self.accumulator["class"] += "e" + + ## Letter + + def without_letter(self) -> str: + return re.sub(r"[А-Яа-я]$", "", self.accumulator["address"].strip()) + + def substitue_letter(self, letter: str) -> str: + address_without_letter = self.without_letter() + + return address_without_letter + letter + + def insert_letter(self): + letter = re.findall(r"[А-Яа-я]", self.token["obj"])[-1] + self.accumulator["address"] = self.substitue_letter(letter) + + if letter and self.has_no_class("litera"): + self.accumulator["class"] += "l" + + def has_letter_in(self) -> bool: + return re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"]) + + ## Room + + def substitue_room(self, number: int) -> str: + return re.sub( + r"пом\. ?\d\-?\d*\w?", number, self.accumulator["address"].strip() + ) + + def insert_room(self): + number = re.findall("пом\. 
?\-?\d*\w?", self.token["obj"])[-1] + self.accumulator["address"] = self.substitue_room(number) + + if number and self.has_no_class("room"): + self.accumulator["class"] += "r" + + # Data preprocessing + + def split_tokens(self) -> list[pd.Series]: + address = self.input.replace(";", ",") + + parts = address.split(",") + parts = map(str.strip, parts) + parts = filter(is_valid_token, parts) + + tokens = map(lambda part: create_token(part, ""), parts) + + return list(tokens) + + def cut_address(self) -> pd.Series: + while len(self.accumulator["class"]) > 0 and CLASSES.index( + self.prev_class() + ) > CLASSES.index(self.next_class()): + match self.accumulator["class"][-1]: + case "h": + self.accumulator["addresses"] = re.sub( + r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", + "", + self.accumulator["address"].lower(), + ) + case "b": + number = re.findall(r"к{0,1}\.? ?\d", self.accumulator["address"])[ + -1 + ] + self.accumulator["address"] = re.sub( + number, "", self.accumulator["address"] + ) + case "e": + self.accumulator["address"] = re.sub( + r"cтр\.? ?\d", "", self.accumulator["address"] + ) + case "l": + self.accumulator["address"] = re.sub( + r"[литера]*\.? ?[А-Яа-я]{1}$", "", self.accumulator["address"] + ) + case "r": + self.accumulator["address"] = re.sub( + r"пом\.? ?\d+", "", self.accumulator["address"] + ) + + self.accumulator["class"] = self.accumulator["class"][:-1] + + return self.accumulator + + # Splitting + + def split(self): + self.tokens = self.split_tokens() + + result = [] + + self.accumulator = pd.Series({"address": "", "class": ""}) + + prev_token = create_token() + + for cursor in self.tokens: + self.token = address_classification(cursor, prev_token) + prev_token = self.token.copy() + + if self.accumulator["class"] == "": + self.accumulator = self.token.rename({"obj": "address"}) + continue + + if self.correct_order(): + self.accumulator["address"] += " " + self.accumulator += self.token.rename({"obj": "address"}) + else: + unfolded_address = unfold_house_ranges(self.accumulator["address"]) + self.accumulator["address"] = unfolded_address[-1] + + result.extend(unfolded_address) + + self.accumulator = self.cut_address() + + if self.next_is_street_or_upper(): + self.accumulator = self.token.rename({"obj": "address"}) + + if self.next_class_is("house"): + self.accumulator["address"] = self.substitue_house() + self.pop_token_class() + + if self.next_class_is("building"): + self.insert_building() + self.pop_token_class() + + if self.next_class_is("edifice"): + self.insert_edifice() + self.pop_token_class() + + if self.next_class_is("letter"): + self.insert_letter() + elif self.has_letter_in(): + self.accumulator["address"] = self.without_letter() + + if self.next_class_is("room"): + self.insert_room() + self.pop_token_class() + + result.extend(unfold_house_ranges(self.accumulator["address"])) + + return result + + +def split_pesoch_res(address: str) -> list[str]: + t = re.sub(r",", " ", address) + t = re.split(r"(Санкт-Петербург|Ленинградская обл|Л\.О)", t) + t = list(map(str.strip, filter(lambda token: token != "", t))) + tokens = [t[i] + " " + t[i + 1] for i in range(0, len(t) - 1, 2)] + + if tokens: + return list(set(tokens)) + return [address] + + +def process_row(row: pd.Series[str]) -> pd.Series[str]: + row = row.copy() + + if pd.isnull(row["Улица"]): + row["Улица"] = [None] + else: + if row["РЭС"] == "Песочинский РЭС": + addresses = split_pesoch_res(row["Улица"]) + else: + addresses = AddressSplitter(row["Улица"]) + row["Улица"] = addresses + + return row + + +def 
split_addresses(df: pd.DataFrame) -> pd.DataFrame: + merged_df = df.apply(process_row, axis=1).reset_index() + + return merged_df.explode("Улица", ignore_index=True) diff --git a/parser/address/utils.py b/parser/address/utils.py new file mode 100644 index 0000000..6bfe1f9 --- /dev/null +++ b/parser/address/utils.py @@ -0,0 +1,45 @@ +import re +from collections.abc import Iterable +from typing import TypeVar + +import pandas as pd + +T = TypeVar("T") + +def any_of_in(substrings: Iterable[str], string: str) -> bool: + return any(map(lambda substring: substring in string, substrings)) + + +def flatten(arr: Iterable[list[T]]) -> list[T]: + return sum(arr, []) + +def unfold_house_ranges(token: str) -> list[str]: + addresses = [] + pairs_strings = re.findall(r"([\d]+-[\d]+)", token) + for pair_string in pairs_strings: + a, b = pair_string.split("-") + a, b = int(a), int(b) + + if b > a: + addresses += [ + re.sub(r"([\d]+-[\d]+)", number, token) + for number in map(str, range(a, b + 1)) + ] + else: + token = token.replace("-", "/") + if not addresses: + addresses.append(token) + return addresses + + +def is_valid_token(string: str) -> bool: + return string not in ("", "уг.", "д.") + + +def create_token(obj: str = "", token_class: str = ""): + return pd.Series( + { + "obj": obj, + "class": token_class, + } + ) \ No newline at end of file From 931ff1270b0e865b40ddfb6f7ce8239d247bbd71 Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 14:25:00 +0300 Subject: [PATCH 11/12] Fixed import errors in parser --- parser/pipeline.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/parser/pipeline.py b/parser/pipeline.py index 9fad768..07f25ca 100644 --- a/parser/pipeline.py +++ b/parser/pipeline.py @@ -1,16 +1,14 @@ from typing import Optional -from . 
import ( - LenenergoParser, - concurrent_fetch_builing_ids, - preprocess_df, - split_addresses, -) +from .lenenergo import LenenergoParser +from .building_id import concurrent_fetch_builing_ids +from .preprocess import preprocess_df +from .address import split_addresses def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser: if parser is None: - parser = LenenergoParser(parser) + parser = LenenergoParser() print(parser) From 3bd1deb8db40ada9f8cf789d77e1970d8fc2d3da Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 15:59:55 +0300 Subject: [PATCH 12/12] Code formatting --- parser/address/classifier.py | 11 +++++++++-- parser/address/splitter.py | 9 ++++++--- parser/address/utils.py | 4 +++- parser/pipeline.py | 6 +++--- runner/database.py | 2 +- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/parser/address/classifier.py b/parser/address/classifier.py index a333ace..2ce1488 100644 --- a/parser/address/classifier.py +++ b/parser/address/classifier.py @@ -92,9 +92,12 @@ def find_room(token: pd.Series, pre_token: pd.Series) -> str: def find_litera(token: pd.Series, pre_token: pd.Series) -> str: if find_room(token, pre_token): return "" - if any_of_in(LETTER, token["obj"].lower()) or re.search( - r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"] + # fmt: off + if ( + any_of_in(LETTER, token["obj"].lower()) or + re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"]) ): + #fmt: on return "l" if ( ( @@ -199,6 +202,7 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: brackets = re.search(r"\(.+\)", token["obj"]) if brackets: token["obj"] = re.sub(r"\(.+\)", "()", token["obj"]) + token["class"] += find_district(token, pre_token) token["class"] += find_countryside(token, pre_token) token["class"] += find_territory(token, pre_token) @@ -208,8 +212,11 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: token["class"] += find_edifice(token, pre_token) token["class"] += find_litera(token, pre_token) token["class"] += find_room(token, pre_token) + if token["class"] == "": token["class"] = "w" + if brackets: token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) + return token diff --git a/parser/address/splitter.py b/parser/address/splitter.py index 8cc4ffc..e698d28 100644 --- a/parser/address/splitter.py +++ b/parser/address/splitter.py @@ -167,9 +167,12 @@ class AddressSplitter(Sequence): return list(tokens) def cut_address(self) -> pd.Series: - while len(self.accumulator["class"]) > 0 and CLASSES.index( - self.prev_class() - ) > CLASSES.index(self.next_class()): + # fmt: off + while ( + len(self.accumulator["class"]) > 0 + and CLASSES.index(self.prev_class()) > CLASSES.index(self.next_class()) + ): + # fmt: on match self.accumulator["class"][-1]: case "h": self.accumulator["addresses"] = re.sub( diff --git a/parser/address/utils.py b/parser/address/utils.py index 6bfe1f9..0935245 100644 --- a/parser/address/utils.py +++ b/parser/address/utils.py @@ -6,6 +6,7 @@ import pandas as pd T = TypeVar("T") + def any_of_in(substrings: Iterable[str], string: str) -> bool: return any(map(lambda substring: substring in string, substrings)) @@ -13,6 +14,7 @@ def any_of_in(substrings: Iterable[str], string: str) -> bool: def flatten(arr: Iterable[list[T]]) -> list[T]: return sum(arr, []) + def unfold_house_ranges(token: str) -> list[str]: addresses = [] pairs_strings = re.findall(r"([\d]+-[\d]+)", token) @@ -42,4 +44,4 @@ def create_token(obj: str = "", token_class: str = ""): "obj": obj, "class": token_class, 
} - ) \ No newline at end of file + ) diff --git a/parser/pipeline.py b/parser/pipeline.py index 07f25ca..920c23e 100644 --- a/parser/pipeline.py +++ b/parser/pipeline.py @@ -1,9 +1,9 @@ from typing import Optional -from .lenenergo import LenenergoParser -from .building_id import concurrent_fetch_builing_ids -from .preprocess import preprocess_df from .address import split_addresses +from .building_id import concurrent_fetch_builing_ids +from .lenenergo import LenenergoParser +from .preprocess import preprocess_df def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser: diff --git a/runner/database.py b/runner/database.py index 0da4e76..d70743b 100644 --- a/runner/database.py +++ b/runner/database.py @@ -1,10 +1,10 @@ from .config import ( + DB_URL, POSTGRES_DB, POSTGRES_HOST, POSTGRES_PASSWORD, POSTGRES_PORT, POSTGRES_USER, - DB_URL, ) db_credentials = {"conninfo": DB_URL}
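
Below is a minimal usage sketch, not part of the patch series above, illustrating how the refactored parser.address package introduced in the "Split address.py to module" and "Code formatting" patches is expected to be driven. Only the imported names (AddressSplitter, split_addresses) and the DataFrame column names ("Улица", "РЭС") come from the code in this series; the sample address strings and the sample district value are illustrative assumptions, and the snippet presumes it is run from the repository root with the project's dependencies installed (importing parser.address triggers parser/__init__.py and its submodule imports).

import pandas as pd

from parser.address import AddressSplitter, split_addresses

# Expand one combined address string into per-house addresses;
# house ranges such as "3-5" are unfolded by unfold_house_ranges().
for address in AddressSplitter("ул. Школьная, д. 3-5, лит. А"):
    print(address)

# Split the address column of a whole outages DataFrame. process_row()
# reads the raw address from "Улица" and the district name from "РЭС",
# and split_addresses() explodes the resulting per-row address lists.
df = pd.DataFrame(
    {"РЭС": ["Пригородный РЭС"], "Улица": ["пр. Ленина, д. 10, к. 2"]}
)
print(split_addresses(df))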