Code formatting
This commit is contained in:
parent
931ff1270b
commit
3bd1deb8db
@ -92,9 +92,12 @@ def find_room(token: pd.Series, pre_token: pd.Series) -> str:
|
||||
def find_litera(token: pd.Series, pre_token: pd.Series) -> str:
|
||||
if find_room(token, pre_token):
|
||||
return ""
|
||||
if any_of_in(LETTER, token["obj"].lower()) or re.search(
|
||||
r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"]
|
||||
# fmt: off
|
||||
if (
|
||||
any_of_in(LETTER, token["obj"].lower()) or
|
||||
re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"])
|
||||
):
|
||||
# fmt: on
|
||||
return "l"
|
||||
if (
|
||||
(
|
||||
@ -199,6 +202,7 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
|
||||
brackets = re.search(r"\(.+\)", token["obj"])
|
||||
if brackets:
|
||||
token["obj"] = re.sub(r"\(.+\)", "()", token["obj"])
|
||||
|
||||
token["class"] += find_district(token, pre_token)
|
||||
token["class"] += find_countryside(token, pre_token)
|
||||
token["class"] += find_territory(token, pre_token)
|
||||
@ -208,8 +212,11 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
|
||||
token["class"] += find_edifice(token, pre_token)
|
||||
token["class"] += find_litera(token, pre_token)
|
||||
token["class"] += find_room(token, pre_token)
|
||||
|
||||
if token["class"] == "":
|
||||
token["class"] = "w"
|
||||
|
||||
if brackets:
|
||||
token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"])
|
||||
|
||||
return token
|
||||
|
@ -167,9 +167,12 @@ class AddressSplitter(Sequence):
|
||||
return list(tokens)
|
||||
|
||||
def cut_address(self) -> pd.Series:
|
||||
while len(self.accumulator["class"]) > 0 and CLASSES.index(
|
||||
self.prev_class()
|
||||
) > CLASSES.index(self.next_class()):
|
||||
# fmt: off
|
||||
while (
|
||||
len(self.accumulator["class"]) > 0
|
||||
and CLASSES.index(self.prev_class()) > CLASSES.index(self.next_class())
|
||||
):
|
||||
# fmt: on
|
||||
match self.accumulator["class"][-1]:
|
||||
case "h":
|
||||
self.accumulator["addresses"] = re.sub(
|
||||
|
@ -6,6 +6,7 @@ import pandas as pd
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True if at least one of *substrings* occurs inside *string*.

    Args:
        substrings: Candidate substrings to look for. Any iterable works,
            including a one-shot iterator; it is consumed lazily and
            iteration stops at the first hit.
        string: The text that is searched.

    Returns:
        True as soon as one candidate is found in *string*, False if none
        is found or if *substrings* is empty. An empty candidate ``""``
        always matches, because the empty string is contained in every
        string.
    """
    # A generator expression keeps the short-circuiting of any() without
    # the extra lambda call per element that map() required.
    return any(substring in string for substring in substrings)
|
||||
|
||||
@ -13,6 +14,7 @@ def any_of_in(substrings: Iterable[str], string: str) -> bool:
|
||||
def flatten(arr: Iterable[list[T]]) -> list[T]:
    """Concatenate the lists in *arr* into one new flat list.

    Only one level of nesting is removed; elements of the inner lists are
    copied over as-is and in their original order.

    Args:
        arr: Iterable of lists to join. It may be empty and may be a
            one-shot iterator.

    Returns:
        A new list holding every element of every inner list. The inputs
        are not modified.
    """
    # sum(arr, []) re-copies the growing accumulator on every addition,
    # which is quadratic in the total number of elements; a nested
    # comprehension visits each element exactly once.
    return [item for inner in arr for item in inner]
|
||||
|
||||
|
||||
def unfold_house_ranges(token: str) -> list[str]:
|
||||
addresses = []
|
||||
pairs_strings = re.findall(r"([\d]+-[\d]+)", token)
|
||||
@ -42,4 +44,4 @@ def create_token(obj: str = "", token_class: str = ""):
|
||||
"obj": obj,
|
||||
"class": token_class,
|
||||
}
|
||||
)
|
||||
)
|
||||
|
@ -1,9 +1,9 @@
|
||||
from typing import Optional
|
||||
|
||||
from .lenenergo import LenenergoParser
|
||||
from .building_id import concurrent_fetch_builing_ids
|
||||
from .preprocess import preprocess_df
|
||||
from .address import split_addresses
|
||||
from .building_id import concurrent_fetch_builing_ids
|
||||
from .lenenergo import LenenergoParser
|
||||
from .preprocess import preprocess_df
|
||||
|
||||
|
||||
def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser:
|
||||
|
@ -1,10 +1,10 @@
|
||||
from .config import (
|
||||
DB_URL,
|
||||
POSTGRES_DB,
|
||||
POSTGRES_HOST,
|
||||
POSTGRES_PASSWORD,
|
||||
POSTGRES_PORT,
|
||||
POSTGRES_USER,
|
||||
DB_URL,
|
||||
)
|
||||
|
||||
# Database connection settings keyed by "conninfo", taken from the DB_URL
# value imported from .config.
# NOTE(review): presumably unpacked as keyword arguments into the database
# driver's connect call — confirm against the call sites.
db_credentials = {"conninfo": DB_URL}
|
||||
|
Loading…
x
Reference in New Issue
Block a user