From 3bd1deb8db40ada9f8cf789d77e1970d8fc2d3da Mon Sep 17 00:00:00 2001
From: dm1sh
Date: Sun, 29 Oct 2023 15:59:55 +0300
Subject: [PATCH] Code formatting

---
 parser/address/classifier.py | 11 +++++++++--
 parser/address/splitter.py   |  9 ++++++---
 parser/address/utils.py      |  4 +++-
 parser/pipeline.py           |  6 +++---
 runner/database.py           |  2 +-
 5 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/parser/address/classifier.py b/parser/address/classifier.py
index a333ace..2ce1488 100644
--- a/parser/address/classifier.py
+++ b/parser/address/classifier.py
@@ -92,9 +92,12 @@ def find_room(token: pd.Series, pre_token: pd.Series) -> str:
 def find_litera(token: pd.Series, pre_token: pd.Series) -> str:
     if find_room(token, pre_token):
         return ""
-    if any_of_in(LETTER, token["obj"].lower()) or re.search(
-        r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"]
+    # fmt: off
+    if (
+        any_of_in(LETTER, token["obj"].lower()) or
+        re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"])
     ):
+    #fmt: on
         return "l"
     if (
         (
@@ -199,6 +202,7 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
     brackets = re.search(r"\(.+\)", token["obj"])
     if brackets:
         token["obj"] = re.sub(r"\(.+\)", "()", token["obj"])
+
     token["class"] += find_district(token, pre_token)
     token["class"] += find_countryside(token, pre_token)
     token["class"] += find_territory(token, pre_token)
@@ -208,8 +212,11 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series:
     token["class"] += find_edifice(token, pre_token)
     token["class"] += find_litera(token, pre_token)
     token["class"] += find_room(token, pre_token)
+
     if token["class"] == "":
         token["class"] = "w"
+
     if brackets:
         token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"])
+
     return token
diff --git a/parser/address/splitter.py b/parser/address/splitter.py
index 8cc4ffc..e698d28 100644
--- a/parser/address/splitter.py
+++ b/parser/address/splitter.py
@@ -167,9 +167,12 @@ class AddressSplitter(Sequence):
         return list(tokens)
 
     def cut_address(self) -> pd.Series:
-        while len(self.accumulator["class"]) > 0 and CLASSES.index(
-            self.prev_class()
-        ) > CLASSES.index(self.next_class()):
+        # fmt: off
+        while (
+            len(self.accumulator["class"]) > 0
+            and CLASSES.index(self.prev_class()) > CLASSES.index(self.next_class())
+        ):
+        # fmt: on
             match self.accumulator["class"][-1]:
                 case "h":
                     self.accumulator["addresses"] = re.sub(
diff --git a/parser/address/utils.py b/parser/address/utils.py
index 6bfe1f9..0935245 100644
--- a/parser/address/utils.py
+++ b/parser/address/utils.py
@@ -6,6 +6,7 @@ import pandas as pd
 
 T = TypeVar("T")
 
+
 def any_of_in(substrings: Iterable[str], string: str) -> bool:
     return any(map(lambda substring: substring in string, substrings))
 
@@ -13,6 +14,7 @@ def any_of_in(substrings: Iterable[str], string: str) -> bool:
 def flatten(arr: Iterable[list[T]]) -> list[T]:
     return sum(arr, [])
 
+
 def unfold_house_ranges(token: str) -> list[str]:
     addresses = []
     pairs_strings = re.findall(r"([\d]+-[\d]+)", token)
@@ -42,4 +44,4 @@ def create_token(obj: str = "", token_class: str = ""):
             "obj": obj,
             "class": token_class,
         }
-    )
\ No newline at end of file
+    )
diff --git a/parser/pipeline.py b/parser/pipeline.py
index 07f25ca..920c23e 100644
--- a/parser/pipeline.py
+++ b/parser/pipeline.py
@@ -1,9 +1,9 @@
 from typing import Optional
 
-from .lenenergo import LenenergoParser
-from .building_id import concurrent_fetch_builing_ids
-from .preprocess import preprocess_df
 from .address import split_addresses
+from .building_id import concurrent_fetch_builing_ids
+from .lenenergo import LenenergoParser
+from .preprocess import preprocess_df
 
 
 def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser:
diff --git a/runner/database.py b/runner/database.py
index 0da4e76..d70743b 100644
--- a/runner/database.py
+++ b/runner/database.py
@@ -1,10 +1,10 @@
 from .config import (
+    DB_URL,
     POSTGRES_DB,
     POSTGRES_HOST,
     POSTGRES_PASSWORD,
     POSTGRES_PORT,
     POSTGRES_USER,
-    DB_URL,
 )
 
 db_credentials = {"conninfo": DB_URL}