lenengro_parser/parser/address.py
AnastasiaOnimovma 1fd7a123f9 New fuctions
2023-10-21 18:12:36 +03:00

254 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import re
from typing import Iterable, List, TypeVar
import pandas as pd
# Generic element type used in the signature of flatten().
T = TypeVar("T")
# One-letter class codes. The position of a code in this tuple is used as an
# ordering (via CLASSES.index) by cut_address() and split_address().
# Codes produced in this module: "d" find_district, "c" find_countryside,
# "t" find_territory, "s" find_street, "h" find_house, "b" find_building,
# "l" find_litera, "r" find_room; "w" is the fallback assigned by
# address_classification() when no other code matched.
CLASSES = ("w", "d", "c", "t", "s", "h", "b", "l", "r")
# The tuples below are substring lists: any_of_in() tests each entry with `in`
# against the lower-cased token text, so leading/trailing spaces inside an
# entry are part of the match, and a single-letter entry matches any token
# that contains that letter anywhere.
# Substrings checked by find_district().
DISTRICTS_PREFIXES = ("мо ", "р","городское","лесхоз")
# Substrings checked by find_countryside().
# NOTE(review): the page banner warns about ambiguous Unicode characters;
# confirm that the "c" entry here uses the intended alphabet.
COUNTRYSIDE_PREFIXES = (
"г", "п", "д", "гп", "рп", "кп", "пгт", "c", "хутор", " урочище")
# Substrings checked by find_territory().
TERRITORY_PREFIXES = (
"тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ", "т/б ", "садоводство", "массив", "хозя", "сад-во")
# Substrings checked by find_street().
STREET_PREFIXES = (
" ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе", "лини", "аллея",
"мост", " парк", "кольцо", "проезд", "съезд","переулок",
"ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.")
# Substrings checked by find_house().
HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп","дом")
# Substrings checked by find_building().
BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение", "корпус")
# Substrings checked by find_litera().
LETTER = ("лит.", "литера", " л.")
def unfold_house_ranges(token: str) -> List[str]:
    """Expand ``N-M`` number ranges found in *token* into separate strings.

    Every ``digits-digits`` fragment is inspected in order of appearance:

    * when the second number is greater than the first, one copy of the
      token is produced for each number in the inclusive range, with every
      range-like fragment in the token replaced by that number;
    * otherwise the fragment is not treated as a range and only that
      fragment is rewritten with ``/`` instead of ``-`` (``10-2`` becomes
      ``10/2``).  Hyphens elsewhere in the token are left untouched.

    Args:
        token: Address text that may contain number ranges.

    Returns:
        The expanded strings, or a single-element list with the (possibly
        rewritten) token when no ascending range was found.
    """
    range_pattern = r"([\d]+-[\d]+)"
    addresses = []
    for pair_string in re.findall(range_pattern, token):
        low, high = (int(part) for part in pair_string.split("-"))
        if high > low:
            addresses += [
                re.sub(range_pattern, str(number), token)
                for number in range(low, high + 1)
            ]
        else:
            # Rewrite only this number pair; the digit look-arounds keep the
            # substitution from touching a longer number that merely
            # contains the same digits.
            token = re.sub(
                rf"(?<!\d){re.escape(pair_string)}(?!\d)",
                pair_string.replace("-", "/"),
                token,
            )
    if not addresses:
        addresses.append(token)
    return addresses
def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True when at least one of *substrings* occurs inside *string*.

    The check is a plain ``in`` containment test and stops at the first hit.
    """
    for candidate in substrings:
        if candidate in string:
            return True
    return False
def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate the lists in *arr* into one new list, preserving order.

    Built with a single pass instead of ``sum(arr, [])``, which copies the
    growing result on every addition.

    Args:
        arr: Iterable of lists to join.

    Returns:
        A new list holding the items of every inner list in order.
    """
    return [item for chunk in arr for item in chunk]
def find_room(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "r" when the token text contains the marker "пом", else "".

    Args:
        token: Row whose 'obj' entry holds the token text.
        pre_token: Previous token; not used by this check.
    """
    has_room_marker = re.search(r"пом\.?", token['obj']) is not None
    return "r" if has_room_marker else ""
def find_litera(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "l" when the token looks like it carries a letter suffix, else "".

    Args:
        token: Row with the token text in 'obj'.
        pre_token: Previous token; its 'class' string is consulted for the
            bare-trailing-letter case.
    """
    text = token['obj']

    # An explicit marker from LETTER, or a letter directly following a
    # 1-3 digit number and then a space or the end of the text.
    if any_of_in(LETTER, text.lower()):
        return "l"
    if re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", text):
        return "l"

    # Otherwise only a stand-alone trailing letter qualifies, and only when
    # the previous token was classified with "l" or "h".
    if not re.search(r"\b([А-Я]|[а-я]){1}$", text):
        return ""
    follows_house_or_letter = "l" in pre_token['class'] or "h" in pre_token['class']
    if not follows_house_or_letter:
        return ""
    if " ш" in token["obj"]:
        return ""
    if find_countryside(token, pre_token):
        return ""
    return "l"
def find_building(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "b" when the token looks like a building part, else "".

    A token without any digit never qualifies.  With a digit present, any of
    the following is enough: an entry of BUILDING_PREFIXES occurs in the
    lower-cased text; the previous token has "b" in its class while this
    token has no "h" yet; or the text matches "к", optional dots, an
    optional space and a digit.

    Args:
        token: Row with 'obj' text and the 'class' codes assigned so far.
        pre_token: Previous token row.
    """
    text = token['obj']
    if not re.search(r"\d", text):
        return ""
    if any_of_in(BUILDING_PREFIXES, text.lower()):
        return "b"
    if "b" in pre_token['class'] and "h" not in token['class']:
        return "b"
    if re.search(r"к\.* ?\d", text):
        return "b"
    return ""
def find_house(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "h" when the token looks like a house number, else "".

    Args:
        token: Row with 'obj' text and the 'class' codes assigned so far.
        pre_token: Previous token row; its 'class' is used by the fallback.
    """
    text = token['obj']
    if not re.search(r"\d{1,4}", text):
        return ""

    # Explicit house marker from HOUSES_PREFIXES.
    if any_of_in(HOUSES_PREFIXES, text.lower()):
        return "h"
    # "д" / "д." followed by a number, optionally with a slash part.
    if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", text):
        return "h"

    # Fallback: a number next to a street or house context.
    in_street_or_house_context = (
        "s" in pre_token['class']
        or "h" in pre_token['class']
        or "s" in token['class']
    )
    if not in_street_or_house_context:
        return ""
    # NOTE(review): the tuple below holds three empty strings as visible in
    # this copy.  An empty string is contained in every string, so this check
    # always succeeds and the fallback never returns "h".  Possibly the
    # intended characters were lost - confirm against the original file.
    if any_of_in(("", "", ""), text):
        return ""
    if find_building(token, pre_token):
        return ""
    return "h"
def find_street(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "s" when the token looks like a street, else "".

    The token qualifies when its lower-cased text contains an entry of
    STREET_PREFIXES, or when the text contains lowercase Cyrillic letters
    followed by "ая".

    Args:
        token: Row with the token text in 'obj'.
        pre_token: Previous token; not used by this check.
    """
    text = token['obj']
    if any_of_in(STREET_PREFIXES, text.lower()):
        return "s"
    if re.search(r"[а-я]+ая", text):
        return "s"
    return ""
def find_territory(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "t" when the lower-cased token text contains an entry of
    TERRITORY_PREFIXES, else "".

    Args:
        token: Row with the token text in 'obj'.
        pre_token: Previous token; not used by this check.
    """
    lowered = token['obj'].lower()
    return "t" if any_of_in(TERRITORY_PREFIXES, lowered) else ""
def find_countryside(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "c" when the token looks like a settlement, else "".

    Both a COUNTRYSIDE_PREFIXES substring in the lower-cased text and a
    short stand-alone abbreviation (1-3 letters from the listed set) must be
    present, and the token must not already qualify as a house or a street.

    Args:
        token: Row with 'obj' text and the 'class' codes assigned so far.
        pre_token: Previous token row, forwarded to the house/street checks.
    """
    text = token['obj']
    has_marker = (
        any_of_in(COUNTRYSIDE_PREFIXES, text.lower())
        and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", text)
    )
    if not has_marker:
        return ""
    # House and street classifications take precedence over settlement.
    if find_house(token, pre_token) or find_street(token, pre_token):
        return ""
    return "c"
def find_district(token: pd.Series, pre_token: pd.Series) -> str:
    """Return "d" when the lower-cased token text contains an entry of
    DISTRICTS_PREFIXES, else "".

    Args:
        token: Row with the token text in 'obj'.
        pre_token: Previous token; not used by this check.
    """
    lowered = token['obj'].lower()
    if not any_of_in(DISTRICTS_PREFIXES, lowered):
        return ""
    return "d"
def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
    """Fill in the 'class' codes of *token* and return it.

    Parenthesised text is temporarily collapsed to "()" so that it does not
    influence the classifiers, then put back afterwards.  The classifiers
    run in a fixed order (district, countryside, territory, street, house,
    building, litera) and each appends its code to ``token["class"]``;
    later classifiers read the codes appended by earlier ones.  When no
    classifier matched, the class is set to "w".  find_room() is not part of
    this sequence.

    Args:
        token: Row with 'obj' text and a 'class' string; modified in place.
        pre_token: Previous token row, passed to every classifier.

    Returns:
        The same *token* object with 'class' filled in and 'obj' restored.
    """
    brackets = re.search(r"\(.+\)", token["obj"])
    if brackets:
        token["obj"] = re.sub(r"\(.+\)", "()", token["obj"])

    classifiers = (
        find_district,
        find_countryside,
        find_territory,
        find_street,
        find_house,
        find_building,
        find_litera,
    )
    for classifier in classifiers:
        token["class"] += classifier(token, pre_token)

    if token['class'] == "":
        token['class'] = "w"

    if brackets:
        # Restore the original text literally.  Passing it to re.sub as the
        # replacement would interpret backslashes in it as escapes or group
        # references.
        token["obj"] = token["obj"].replace("()", brackets.group())
    return token
def cut_address(ad: pd.Series, cl: str) -> pd.Series:
    """Strip trailing address parts from *ad* down to the level of *cl*.

    While the last code in ``ad["class"]`` comes later in CLASSES than the
    first code of *cl*, the text belonging to that last code is removed from
    ``ad["address"]`` and the code is dropped from ``ad["class"]``.  Codes
    other than "h", "b", "l" and "r" are dropped without touching the text.

    Args:
        ad: Series with 'address' text and 'class' codes; modified in place.
        cl: Class string of the incoming token; only its first code is used.

    Returns:
        The same *ad* object after trimming.
    """
    while ad["class"] and CLASSES.index(ad["class"][-1]) > CLASSES.index(cl[0]):
        last_code = ad["class"][-1]
        if last_code == "h":
            # Note: this branch also lower-cases the whole address.
            ad["address"] = re.sub(r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "",
                                   ad["address"].lower())
        elif last_code == "b":
            # Cut out only the last building-number match.  Feeding the
            # matched text back into re.sub as a pattern would remove every
            # occurrence and treat "." in it as a wildcard.
            matches = list(re.finditer(r"к{0,1}\.? ?\d", ad["address"]))
            if matches:
                start, end = matches[-1].span()
                ad["address"] = ad["address"][:start] + ad["address"][end:]
        elif last_code == "l":
            ad["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$", "", ad["address"])
        elif last_code == "r":
            ad["address"] = re.sub(r"пом\.? ?\d+", "", ad["address"])
        ad["class"] = ad["class"][:-1]
    return ad
# TODO: rework this chain of ifs into a cleaner structure
# Split one free-form address string into a list of individual addresses.
# NOTE(review): indentation was lost in this copy of the file, so the nesting
# of the if/else chain below must be restored from the original before the
# function can run; the comments describe each statement on its own.
def split_address(address: str) -> List[str]:
# Treat ';' as one more separator by rewriting it to ','.
if ";" in address:
address = address.replace(";", ",")
if "," in address:
# Split on commas, drop pieces that are already empty, then strip whitespace.
tokens = address.split(",")
t = list(map(str.strip, filter(lambda token: token != "", tokens)))
# One row per token: 'obj' holds the token text.
tokens = pd.DataFrame()
tokens['obj'] = t
# Remove tokens that became empty after stripping.
tokens = tokens[tokens["obj"] != ""]
# 'class' starts empty and is filled in by address_classification().
tokens.insert(len(tokens.columns), "class", "")
# res collects the output addresses; accumulator holds the address being
# assembled together with the class codes of its parts.
res = []
accumulator = pd.Series(data={"address": "", "class": ""})
for i in range(len(tokens)):
cur_tk = tokens.iloc[i]
# The first token has no predecessor, so an empty placeholder is used.
if i == 0:
pre_token = pd.Series(data=["", ""], index=['obj', 'class'])
else:
pre_token = tokens.iloc[i - 1]
# Classify the token and store the result back so the next token can read it.
cur_tk = address_classification(cur_tk, pre_token)
tokens.iloc[i] = cur_tk
# Debug output of the classified token.
print(tokens.iloc[i])
# Nothing accumulated yet: start the accumulator from this token.
if not accumulator["class"]:
accumulator["class"] = cur_tk['class']
accumulator["address"] = cur_tk["obj"]
continue
# The token's first code comes later in CLASSES than the accumulator's last
# code (and the accumulator is not a bare "w"): append the token.
if CLASSES.index(accumulator["class"][-1]) < CLASSES.index(cur_tk["class"][0]) and accumulator["class"]!="w":
accumulator["class"] += cur_tk['class']
accumulator["address"] += " " + cur_tk["obj"]
else:
# Emit the accumulated address with number ranges expanded; the last expanded
# form stays as the working address.
ad_no_ranges = unfold_house_ranges(accumulator["address"])
accumulator["address"] = ad_no_ranges[-1]
res.extend(ad_no_ranges)
# Trim trailing parts of the working address according to the token's class.
accumulator = cut_address(accumulator, cur_tk["class"])
# Restart from this token when nothing is left, when the token's first code
# is at or before "s" in CLASSES, or when the accumulator is a bare "w".
if not accumulator["class"] or CLASSES.index(cur_tk["class"][0]) <= CLASSES.index("s") or accumulator["class"]=="w":
accumulator["class"] = cur_tk["class"]
accumulator["address"] = cur_tk["obj"]
# Token starts with a house code: its first number match replaces every match
# of the number pattern in the working address, then the "h" code is consumed.
# NOTE(review): this pattern and the two findall patterns below are non-raw
# string literals containing backslash escapes such as \d; newer Python
# versions warn about these - consider raw strings.
if cur_tk["class"][0] == "h":
num = re.findall("\d{1,4} ?\/?\d* ?", cur_tk['obj'])[0]
accumulator["address"] = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator["address"])
cur_tk["class"] =cur_tk["class"][1:]
# Next code is a building: num is the last single digit found in the token.
if cur_tk["class"] and cur_tk["class"][0] == "b":
num = re.findall("\d", cur_tk["obj"])[-1]
# No building part yet: append "к." plus the digit; otherwise overwrite the
# trailing digit of the working address.
if num and not "b" in accumulator["class"]:
accumulator["class"] += "b"
accumulator["address"] += "к." + num
else:
accumulator["address"] = re.sub(r"\d$", num, accumulator["address"])
cur_tk["class"] = cur_tk["class"][1:]
# Next code is a letter: take the last Cyrillic letter of the token, drop a
# trailing Cyrillic letter from the working address and append the new one.
if cur_tk["class"] and cur_tk["class"][0] == "l":
num = re.findall("[А-Яа-я]", cur_tk["obj"].strip())[-1]
accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip())
accumulator["address"] += num
if num and not "l" in accumulator["class"]:
accumulator["class"] += "l"
# NOTE(review): which `if` this `else` pairs with cannot be determined from
# this copy. It strips a trailing Cyrillic letter when the working address
# contains a number immediately followed by a letter.
else:
if re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", accumulator["address"]):
accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip())
# Emit whatever is still accumulated, with ranges expanded.
res.extend(unfold_house_ranges(accumulator["address"]))
# Debug output of the final result.
print(res)
return res
# Fallback: the input is returned unchanged as a single-element list
# (presumably for input without any comma - confirm once nesting is restored).
return [address]
def split_pesoch_res(address: str) -> List[str]:
    """Split an address string on region names into separate addresses.

    Commas are replaced with spaces, the text is split around the region
    names "Санкт-Петербург", "Ленинградская обл" and "Л.О" (the names are
    kept as separate pieces), and the non-empty pieces are joined pairwise
    in order: pieces 0 and 1, pieces 2 and 3, and so on.  A trailing
    unpaired piece is ignored.

    Duplicates are removed while keeping the order of first appearance, so
    the result is the same on every run.

    Args:
        address: Raw address text.

    Returns:
        The de-duplicated joined pairs, or ``[address]`` when no pair could
        be formed.
    """
    text = address.replace(",", " ")
    pieces = re.split(r"(Санкт-Петербург|Ленинградская обл|Л\.О)", text)
    # Empty pieces are dropped before stripping, as in the pairing logic the
    # callers rely on; whitespace-only pieces therefore survive as "".
    pieces = [piece.strip() for piece in pieces if piece != ""]
    tokens = [
        pieces[i] + " " + pieces[i + 1]
        for i in range(0, len(pieces) - 1, 2)
    ]
    if tokens:
        # dict.fromkeys keeps first-seen order, unlike set().
        return list(dict.fromkeys(tokens))
    return [address]
def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Return a copy of *row* whose "Улица" entry is a list of addresses.

    A null "Улица" becomes ``[None]``.  Otherwise the text is split with
    split_pesoch_res() when "РЭС" equals "Песочинский РЭС", and with
    split_address() for every other value.

    Args:
        row: Row with at least the "Улица" and "РЭС" entries.

    Returns:
        A new row; the input row is not modified.
    """
    row = row.copy()
    street = row["Улица"]
    if pd.isnull(street):
        row["Улица"] = [None]
        return row

    splitter = split_pesoch_res if row["РЭС"] == "Песочинский РЭС" else split_address
    row["Улица"] = splitter(street)
    return row
def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Return a frame with one row per individual address.

    Each row is passed through process_row(), the index is reset into a
    column, and the list-valued "Улица" column is exploded into separate
    rows with a fresh index.

    Args:
        df: Frame with the columns process_row() reads.
    """
    processed = df.apply(process_row, axis=1)
    with_index_column = processed.reset_index()
    return with_index_column.explode("Улица", ignore_index=True)