Change address recognizer (not final)

This commit is contained in:
AnastasiaOnimovma 2023-10-13 10:00:04 +03:00
parent 5fddedb709
commit a000bf5867
3 changed files with 57 additions and 20 deletions

3
.gitignore vendored
View File

@ -2,4 +2,5 @@
.venv
__pycache__
.env
data*.csv
data*.csv
.idea/

View File

@ -8,7 +8,7 @@ from . import pipeline
def job():
    """Run ``pipeline()`` and save its result to a timestamped CSV file.

    The file is written to the current directory as
    ``data_<dd-mm-yy_HH.MM>.csv``, with the timestamp taken from
    ``parser.today``.
    """
    parser = pipeline()
    # "." (not ":") separates hours and minutes: ":" is not allowed in
    # Windows file names, and the superseded ":"-variant call is dropped so
    # the data is saved exactly once.
    parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H.%M")}.csv')
if len(sys.argv) == 2:

View File

@ -7,20 +7,30 @@ import pandas as pd
T = TypeVar("T")

# Settlement-type abbreviations. In the visible code they are referenced only
# from a commented-out branch of split_address.
# NOTE(review): verify that the single-letter "c" entry is the intended
# character (Cyrillic "с" vs. Latin "c") — TODO confirm.
SETTLEMENTS_PREFIXES = (
    "г",
    "мо",
    "р",
    "п",
    "д",
    "гп",
    "c",
    "хутор",
    "массив",
    "тер",
    "СНТ",
    "СТ",
    "ДСК",
    "ДНП",
    "ДПК",
    "НП",
    "садоводство",
)

# Abbreviations that mark a token as a street (checked in split_address).
STREET_PREFIXES = (
    "ул",
    "бул",
    "пр",
    "ш",
    "пер",
    "дор",
    "маг",
    "наб",
    "пл",
    "просп",
    "туп",
    "аллея",
    "мост",
    "парк",
    "кольцо",
    "проезд",
)

# Abbreviations that mark a token as a house / land plot.
HOUSES_PREFIXES = ("д.", "д", "уч", "участок")

# Abbreviations that mark a token as a building (korpus).
BUILDING_PREFIXES = ("к", "корп")

# Abbreviations that mark a token as a letter (litera).
LETTER = ("лит", "литера")
def unfold_house_ranges(address: str, token: str) -> List[str]:
    """Expand numeric ranges in a house token into full address strings.

    Every ``<a>-<b>`` pair of integers found in ``token`` is handled in turn:

    * ``b > a``: the pair is removed from ``token`` and one address is
      produced for each number from ``a`` to ``b`` inclusive;
    * otherwise: the pair is not treated as a range; every "-" in ``token``
      is replaced with "/" and a single address is produced.

    Args:
        address: Address part that precedes the house token. It is prepended
            to every result, separated by a single space.
        token: House token that may contain ``<a>-<b>`` pairs.

    Returns:
        The list of produced addresses. When ``token`` contains no
        ``<a>-<b>`` pair the result is ``[address + " " + token]``.
    """
    addresses: List[str] = []
    for pair_string in re.findall(r"([\d]+-[\d]+)", token):
        start, end = map(int, pair_string.split("-"))
        if end > start:
            token = token.replace(pair_string, "")
            addresses.extend(
                f"{address} {token}{number}" for number in range(start, end + 1)
            )
        else:
            token = token.replace("-", "/")
            # append, not "+=": extending a list with a str would add the
            # address one character at a time.
            addresses.append(f"{address} {token}")
    if not addresses:
        addresses.append(f"{address} {token}")
    return addresses
def unfold_houses_list(token: str) -> List[str]:
@ -42,28 +52,54 @@ def flatten(arr: Iterable[List[T]]) -> List[T]:
return sum(arr, [])
# TODO: rework the chain of ifs into a cleaner structure and add classification of numbers/letters
# split_address: split one address string into a list of address strings.
# Returns the collected list `res` when the string contains "," and a
# single-element list otherwise.
# NOTE(review): this span appears to interleave lines from two revisions with
# indentation stripped (a ";"/re.split path with a string accumulator next to
# a replace()+split(",") path with a list accumulator) — reconcile before
# relying on it.
# NOTE(review): any_of_in is defined outside this view; presumably it checks
# whether any of the given prefixes occurs in the string — verify.
def split_address(address: str) -> List[str]:
if ";" in address:
return flatten(map(unfold_houses_list, address.split(";")))
elif "," in address:
tokens = re.split(r"(,)", address)
# Treat ";" as just another "," separator.
address = address.replace(";", ",")
if "," in address:
tokens = address.split(",")
# Drop empty tokens, then strip surrounding whitespace from the rest.
tokens = list(map(str.strip, filter(lambda token: token != "", tokens)))
res = []
accumulator = ""
accumulator = []
for i in range(len(tokens)):
if any_of_in(STREET_PREFIXES, tokens[i].lower()) and any_of_in(
STREET_PREFIXES, accumulator.lower()
):
res += unfold_houses_list(accumulator)
accumulator = ""
accumulator += tokens[i]
# TODO: settlements (populated places)
# if any_of_in(SETTLEMENTS_PREFIXES, tokens[i].lower())
# accumulator += tokens[i]
res += unfold_houses_list(accumulator)
# streets: a street token first flushes the accumulated parts to res when
# they already contain a street prefix, then joins the accumulator
if any_of_in(STREET_PREFIXES, tokens[i].lower()):
if accumulator and any_of_in(STREET_PREFIXES, "".join(accumulator).lower() ):
res.append( " ".join(accumulator))
accumulator=[]
accumulator.append(tokens[i])
# houses
elif any_of_in(HOUSES_PREFIXES, tokens[i].lower()):
if accumulator and any_of_in(HOUSES_PREFIXES, accumulator[-1].lower()):
res.append(" ".join(accumulator))
accumulator.pop()
# NOTE(review): unfold_house_ranges is annotated as returning List[str]; that
# list is appended to res, rebound as the accumulator and popped from res
# again, so the accumulator now holds whole address strings — confirm intent.
res.append(unfold_house_ranges(" ".join(accumulator),tokens[i]))
accumulator=res[-1]
res.pop()
# building (korpus)
elif any_of_in(BUILDING_PREFIXES, tokens[i].lower()):
if accumulator and any_of_in(BUILDING_PREFIXES, accumulator[-1].lower() ):
res.append( " ".join(accumulator))
accumulator.pop()
accumulator.append(tokens[i])
# letter (litera)
elif any_of_in(LETTER, tokens[i].lower()):
if accumulator and any_of_in(LETTER, accumulator[-1].lower() ):
res.append(" ".join(accumulator))
accumulator.pop()
accumulator.append (tokens[i])
else:
# Unclassified token: keep it as part of the current address.
accumulator.append(tokens[i])
# Flush whatever is still accumulated as the last address.
res.append(" ".join(accumulator))
return res
# No "," in the address: return it as the only element.
return [address]