Change address recognizer (not final)
This commit is contained in:
parent
5fddedb709
commit
a000bf5867
3
.gitignore
vendored
3
.gitignore
vendored
@ -2,4 +2,5 @@
|
|||||||
.venv
|
.venv
|
||||||
__pycache__
|
__pycache__
|
||||||
.env
|
.env
|
||||||
data*.csv
|
data*.csv
|
||||||
|
.idea/
|
@ -8,7 +8,7 @@ from . import pipeline
|
|||||||
|
|
||||||
def job():
    """Run the pipeline once and save its result to a timestamped CSV file.

    The file is written to the current directory as
    ``data_<dd-mm-yy_HH.MM>.csv``, where the timestamp comes from
    ``parser.today``.
    """
    parser = pipeline()
    # Hours and minutes are separated by "." rather than ":" -- presumably
    # because ":" is not a valid file-name character on every platform.
    stamp = parser.today.strftime("%d-%m-%y_%H.%M")
    parser.save_df(f'./data_{stamp}.csv')
|
||||||
|
|
||||||
|
|
||||||
if len(sys.argv) == 2:
|
if len(sys.argv) == 2:
|
||||||
|
@ -7,20 +7,30 @@ import pandas as pd
|
|||||||
|
|
||||||
T = TypeVar("T")

# Marker tuples used to classify the comma-separated parts of an address.
# split_address() tests each part in lower case against these tuples.

# Settlement markers. Not referenced by split_address() yet (see its TODO).
# NOTE(review): several entries are upper-case ("СНТ", "СТ", ...); they can
# only match lower-cased input if any_of_in() ignores case -- confirm.
SETTLEMENTS_PREFIXES = (
    "г",
    "мо",
    "р-н",
    "п",
    "д",
    "гп",
    "c",
    "хутор",
    "массив",
    "тер",
    "СНТ",
    "СТ",
    "ДСК",
    "ДНП",
    "ДПК",
    "НП",
    "садоводство",
)

# Street markers: a part containing one of these starts a new street.
STREET_PREFIXES = (
    "ул",
    "бул",
    "пр",
    "ш",
    "пер",
    "дор",
    "маг",
    "наб",
    "пл",
    "просп",
    "туп",
    "аллея",
    "мост",
    "парк",
    "кольцо",
    "проезд",
)

# House / land-plot markers.
HOUSES_PREFIXES = ("д.", "д", "уч", "участок")

# Building (block) markers.
BUILDING_PREFIXES = ("к", "корп")

# Building-letter markers.
LETTER = ("лит", "литера")
|
||||||
|
|
||||||
|
def unfold_house_ranges(address: str, token: str) -> List[str]:
    """Expand numeric house ranges found in *token* into full addresses.

    Every ``<a>-<b>`` pair of integers in *token* is handled as follows:

    * ``b > a`` -- the pair is treated as a range. It is removed from the
      token and one address per number in ``a..b`` (inclusive) is produced
      as ``"<address> <remaining token><number>"``.
    * otherwise -- the pair is not a range; its dash is replaced with a
      slash (``"7-5"`` becomes ``"7/5"``) and a single address is produced.

    If the token contains no such pair, the result is the single address
    ``"<address> <token>"``.

    Args:
        address: The address prefix collected so far (street, etc.).
        token: The house part of the address, e.g. ``"д. 5-7"``.

    Returns:
        The list of complete address strings.
    """
    addresses: List[str] = []

    for pair_string in re.findall(r"\d+-\d+", token):
        first, last = map(int, pair_string.split("-"))

        if last > first:
            # Drop the range text; what is left of the token is the prefix
            # that every expanded house number is appended to.
            token = token.replace(pair_string, "")
            addresses.extend(
                f"{address} {token}{number}" for number in range(first, last + 1)
            )
        else:
            # Rewrite only this pair, so dashes elsewhere in the token
            # (including other, valid ranges) are left untouched.
            token = token.replace(pair_string, pair_string.replace("-", "/"))
            # append(), not "+=": "+=" with a string would add the address
            # to the list one character at a time.
            addresses.append(f"{address} {token}")

    if not addresses:
        addresses.append(f"{address} {token}")

    return addresses
|
||||||
|
|
||||||
|
|
||||||
def unfold_houses_list(token: str) -> List[str]:
|
def unfold_houses_list(token: str) -> List[str]:
|
||||||
@ -42,28 +52,54 @@ def flatten(arr: Iterable[List[T]]) -> List[T]:
|
|||||||
return sum(arr, [])
|
return sum(arr, [])
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: rework the if-chain into a proper structure and classify numbers/letters
def split_address(address: str) -> List[str]:
    """Split a combined address string into individual addresses.

    ``;`` is treated as ``,``. A string without either separator is returned
    unchanged as a one-element list. Otherwise the string is split on commas,
    empty parts are dropped, the rest are stripped, and the parts are walked
    left to right while an accumulator collects the pieces of the address
    currently being built:

    * street part -- if the accumulator already holds a street, the
      accumulated address is emitted and a new one is started;
    * house part -- the accumulator is replaced by the result of
      ``unfold_house_ranges(<accumulated address>, <part>)``;
    * building / letter part -- appended; if the last accumulated piece is
      of the same kind, the accumulated address is emitted first and that
      last piece is dropped;
    * anything else -- appended as is.

    Whatever remains in the accumulator is emitted at the end.

    Part classification is delegated to ``any_of_in`` (defined elsewhere;
    presumably it reports whether any of the given markers occurs in the
    string -- confirm).

    Args:
        address: Raw address text, possibly listing several addresses.

    Returns:
        The list of address strings.
    """
    address = address.replace(";", ",")
    if "," not in address:
        return [address]

    # Empty parts are dropped *before* stripping, so whitespace-only parts
    # survive as empty strings.
    tokens = [part.strip() for part in address.split(",") if part != ""]

    res = []
    accumulator = []

    for token in tokens:
        lowered = token.lower()

        # TODO: settlements (SETTLEMENTS_PREFIXES) are not handled yet.

        # Streets
        if any_of_in(STREET_PREFIXES, lowered):
            if accumulator and any_of_in(STREET_PREFIXES, "".join(accumulator).lower()):
                res.append(" ".join(accumulator))
                accumulator = []
            accumulator.append(token)

        # Houses
        elif any_of_in(HOUSES_PREFIXES, lowered):
            if accumulator and any_of_in(HOUSES_PREFIXES, accumulator[-1].lower()):
                res.append(" ".join(accumulator))
                accumulator.pop()
            # The expanded addresses become the new accumulator contents.
            accumulator = unfold_house_ranges(" ".join(accumulator), token)

        # Buildings
        elif any_of_in(BUILDING_PREFIXES, lowered):
            if accumulator and any_of_in(BUILDING_PREFIXES, accumulator[-1].lower()):
                res.append(" ".join(accumulator))
                accumulator.pop()
            accumulator.append(token)

        # Letters
        elif any_of_in(LETTER, lowered):
            if accumulator and any_of_in(LETTER, accumulator[-1].lower()):
                res.append(" ".join(accumulator))
                accumulator.pop()
            accumulator.append(token)

        else:
            accumulator.append(token)

    res.append(" ".join(accumulator))
    return res
|
||||||
|
Loading…
x
Reference in New Issue
Block a user