Change address recognizer (not final)
This commit is contained in:
parent
5fddedb709
commit
a000bf5867
3
.gitignore
vendored
3
.gitignore
vendored
@ -2,4 +2,5 @@
|
||||
.venv
|
||||
__pycache__
|
||||
.env
|
||||
data*.csv
|
||||
data*.csv
|
||||
.idea/
|
@ -8,7 +8,7 @@ from . import pipeline
|
||||
|
||||
def job():
    """Run the pipeline and save its data to a timestamped CSV file.

    Calls ``pipeline()`` and hands ``parser.save_df`` a path of the form
    ``./data_<dd-mm-yy_HH.MM>.csv`` built from ``parser.today``.
    """
    parser = pipeline()
    # "." separates hours and minutes: a ":" in that position is not a valid
    # file-name character on Windows.
    parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H.%M")}.csv')
|
||||
|
||||
|
||||
if len(sys.argv) == 2:
|
||||
|
@ -7,20 +7,30 @@ import pandas as pd
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
# Prefix tables used to classify comma-separated address tokens.
# NOTE(review): split_address lower-cases a token before matching it against
# these tuples; whether any_of_in is case-sensitive is not visible here, so
# confirm that the upper-case SETTLEMENTS_PREFIXES entries can ever match.

# Settlement markers. In the visible part of this file they are referenced
# only from commented-out code.
# NOTE(review): verify the one-letter "c" entry uses the intended alphabet
# (Cyrillic vs Latin look identical).
SETTLEMENTS_PREFIXES = (
    "г",
    "мо",
    "р-н",
    "п",
    "д",
    "гп",
    "c",
    "хутор",
    "массив",
    "тер",
    "СНТ",
    "СТ",
    "ДСК",
    "ДНП",
    "ДПК",
    "НП",
    "садоводство",
)

# Street markers (stored without a trailing dot).
STREET_PREFIXES = (
    "ул",
    "бул",
    "пр",
    "ш",
    "пер",
    "дор",
    "маг",
    "наб",
    "пл",
    "просп",
    "туп",
    "аллея",
    "мост",
    "парк",
    "кольцо",
    "проезд",
)

# House / plot markers.
# NOTE(review): "д" is also listed in SETTLEMENTS_PREFIXES — confirm how the
# two meanings are meant to be told apart.
HOUSES_PREFIXES = ("д.", "д", "уч", "участок")

# Building (korpus) markers.
BUILDING_PREFIXES = ("к", "корп")

# Letter (litera) markers.
LETTER = ("лит", "литера")
|
||||
|
||||
|
||||
def unfold_house_ranges(address: str, token: str) -> List[str]:
    """Expand numeric house ranges in *token* into full address strings.

    Every ``<digits>-<digits>`` pair found in *token* is handled in turn:

    * ascending pair (``a < b``): the pair is removed from *token* and one
      entry ``address + " " + token + number`` is produced for every number
      from ``a`` to ``b`` inclusive;
    * any other pair: every ``-`` in *token* is replaced with ``/`` and a
      single entry ``address + " " + token`` is produced.

    When *token* contains no such pair the result is
    ``[address + " " + token]``.

    Args:
        address: Text placed in front of every produced entry.
        token: House token that may contain a range, e.g. ``"д. 1-3"``.

    Returns:
        The produced address strings, in order.
    """
    addresses: List[str] = []

    # NOTE: ``token`` is rewritten while the pairs are processed, so for a
    # token holding several pairs the entries of an earlier pair still
    # contain the text of the later ones.
    for pair in re.findall(r"\d+-\d+", token):
        low, high = map(int, pair.split("-"))

        if high > low:
            token = token.replace(pair, "")
            addresses += [
                f"{address} {token}{number}" for number in range(low, high + 1)
            ]
        else:
            token = token.replace("-", "/")
            # append(), not "+=": extending a list with a str would add the
            # string one character at a time.
            addresses.append(f"{address} {token}")

    if not addresses:
        addresses.append(f"{address} {token}")
    return addresses
|
||||
|
||||
|
||||
def unfold_houses_list(token: str) -> List[str]:
|
||||
@ -42,28 +52,54 @@ def flatten(arr: Iterable[List[T]]) -> List[T]:
|
||||
return sum(arr, [])
|
||||
|
||||
|
||||
# TODO: rework the chain of ifs into a proper structure and add classification of numbers/letters
# Split an address string into a list of address strings.
# NOTE(review): this span is a rendered diff — statements of the previous and
# of the new implementation are interleaved and indentation is lost.
# Visible flow of the new version: ";" is replaced with ",", the string is
# split on ",", the tokens are stripped, and each token is routed by the prefix
# tuple it matches (STREET_PREFIXES, HOUSES_PREFIXES, BUILDING_PREFIXES, LETTER)
# into `accumulator` / `res`. The trailing `return [address]` is presumably the
# fallback for a string without "," — confirm against the real indentation.
|
||||
def split_address(address: str) -> List[str]:
|
||||
if ";" in address:
|
||||
return flatten(map(unfold_houses_list, address.split(";")))
|
||||
elif "," in address:
|
||||
tokens = re.split(r"(,)", address)
|
||||
address = address.replace(";", ",")
|
||||
if "," in address:
|
||||
tokens = address.split(",")
|
||||
|
||||
# NOTE(review): empty tokens are filtered out before stripping, so a
# whitespace-only token survives the filter and becomes "" after str.strip.
tokens = list(map(str.strip, filter(lambda token: token != "", tokens)))
|
||||
|
||||
res = []
|
||||
accumulator = ""
|
||||
accumulator = []
|
||||
|
||||
for i in range(len(tokens)):
|
||||
if any_of_in(STREET_PREFIXES, tokens[i].lower()) and any_of_in(
|
||||
STREET_PREFIXES, accumulator.lower()
|
||||
):
|
||||
res += unfold_houses_list(accumulator)
|
||||
accumulator = ""
|
||||
|
||||
accumulator += tokens[i]
|
||||
# TODO: settlements (populated places)
|
||||
# if any_of_in(SETTLEMENTS_PREFIXES, tokens[i].lower())
|
||||
# accumulator += tokens[i]
|
||||
|
||||
res += unfold_houses_list(accumulator)
|
||||
# streets
|
||||
if any_of_in(STREET_PREFIXES, tokens[i].lower()):
|
||||
if accumulator and any_of_in(STREET_PREFIXES, "".join(accumulator).lower() ):
|
||||
res.append( " ".join(accumulator))
|
||||
accumulator=[]
|
||||
accumulator.append(tokens[i])
|
||||
|
||||
# houses
# NOTE(review): the value returned by unfold_house_ranges is appended to res,
# read back as the new accumulator and then popped from res again, so the net
# effect is that accumulator becomes that return value instead of the list of
# raw tokens — confirm that later " ".join(accumulator) calls expect this.
|
||||
elif any_of_in(HOUSES_PREFIXES, tokens[i].lower()):
|
||||
if accumulator and any_of_in(HOUSES_PREFIXES, accumulator[-1].lower()):
|
||||
res.append(" ".join(accumulator))
|
||||
accumulator.pop()
|
||||
res.append(unfold_house_ranges(" ".join(accumulator),tokens[i]))
|
||||
accumulator=res[-1]
|
||||
res.pop()
|
||||
# building (korpus)
|
||||
elif any_of_in(BUILDING_PREFIXES, tokens[i].lower()):
|
||||
if accumulator and any_of_in(BUILDING_PREFIXES, accumulator[-1].lower() ):
|
||||
res.append( " ".join(accumulator))
|
||||
accumulator.pop()
|
||||
accumulator.append(tokens[i])
|
||||
# letter (litera)
|
||||
elif any_of_in(LETTER, tokens[i].lower()):
|
||||
if accumulator and any_of_in(LETTER, accumulator[-1].lower() ):
|
||||
res.append(" ".join(accumulator))
|
||||
accumulator.pop()
|
||||
accumulator.append (tokens[i])
|
||||
else:
|
||||
accumulator.append(tokens[i])
|
||||
|
||||
res.append(" ".join(accumulator))
|
||||
return res
|
||||
|
||||
return [address]
|
||||
|
Loading…
x
Reference in New Issue
Block a user