From a000bf5867299094bba68d3134a3f6e5c388f3c5 Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Fri, 13 Oct 2023 10:00:04 +0300 Subject: [PATCH 01/12] Change address recognizer (not final) --- .gitignore | 3 +- parser/__main__.py | 2 +- parser/address.py | 72 ++++++++++++++++++++++++++++++++++------------ 3 files changed, 57 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 43024ce..71444f4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ .venv __pycache__ .env -data*.csv \ No newline at end of file +data*.csv +.idea/ \ No newline at end of file diff --git a/parser/__main__.py b/parser/__main__.py index e55c05a..de668c0 100644 --- a/parser/__main__.py +++ b/parser/__main__.py @@ -8,7 +8,7 @@ from . import pipeline def job(): parser = pipeline() - parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H:%M")}.csv') + parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H.%M")}.csv') if len(sys.argv) == 2: diff --git a/parser/address.py b/parser/address.py index 4b22430..79eac1b 100644 --- a/parser/address.py +++ b/parser/address.py @@ -7,20 +7,30 @@ import pandas as pd T = TypeVar("T") -STREET_PREFIXES = ("ул.", "бул.", "пр.", "ул", "бул", "пр", "ш.", "ш", "пер.", "пер") -HOUSES_PREFIXES = ("д.", "д") +SETTLEMENTS_PREFIXES=("г","мо","р-н","п","д","гп","c","хутор","массив","тер","СНТ","СТ","ДСК","ДНП","ДПК","НП","садоводство") +STREET_PREFIXES = ("ул", "бул", "пр", "ш", "пер", "дор", "маг", "наб", "пл", "просп", "туп", "аллея", "мост", "парк", "кольцо","проезд") +HOUSES_PREFIXES = ("д.", "д","уч","участок") +BUILDING_PREFIXES=("к", "корп") +LETTER=("лит", "литера") - -def unfold_house_ranges(token: str) -> str: +def unfold_house_ranges(address:str, token: str) -> List[str]: + adresses=[] pairs_strings = re.findall(r"([\d]+-[\d]+)", token) for pair_string in pairs_strings: a, b = pair_string.split("-") a, b = int(a), int(b) if b > a: - token = token.replace(pair_string, ", ".join(map(str, range(a, b + 1)))) + token = token.replace(pair_string,"") + adresses += [address + " "+ token + number for number in map(str, range(a, b + 1))] - return token + + else: + token = token.replace("-", "/") + adresses += address + " " + token + if not adresses: + adresses.append(address + " " + token) + return adresses def unfold_houses_list(token: str) -> List[str]: @@ -42,28 +52,54 @@ def flatten(arr: Iterable[List[T]]) -> List[T]: return sum(arr, []) +# TODO: переработать систему из if в нормальный вид и классификация чисел/букв def split_address(address: str) -> List[str]: if ";" in address: - return flatten(map(unfold_houses_list, address.split(";"))) - elif "," in address: - tokens = re.split(r"(,)", address) + address = address.replace(";", ",") + if "," in address: + tokens = address.split(",") tokens = list(map(str.strip, filter(lambda token: token != "", tokens))) - res = [] - accumulator = "" + accumulator = [] for i in range(len(tokens)): - if any_of_in(STREET_PREFIXES, tokens[i].lower()) and any_of_in( - STREET_PREFIXES, accumulator.lower() - ): - res += unfold_houses_list(accumulator) - accumulator = "" - accumulator += tokens[i] + # TODO: напселённые пункты + # if any_of_in(SETTLEMENTS_PREFIXES, tokens[i].lower()) + # accumulator += tokens[i] - res += unfold_houses_list(accumulator) + # улицы + if any_of_in(STREET_PREFIXES, tokens[i].lower()): + if accumulator and any_of_in(STREET_PREFIXES, "".join(accumulator).lower() ): + res.append( " ".join(accumulator)) + accumulator=[] + accumulator.append(tokens[i]) + # дома + elif any_of_in(HOUSES_PREFIXES, 
tokens[i].lower()): + if accumulator and any_of_in(HOUSES_PREFIXES, accumulator[-1].lower()): + res.append(" ".join(accumulator)) + accumulator.pop() + res.append(unfold_house_ranges(" ".join(accumulator),tokens[i])) + accumulator=res[-1] + res.pop() + # корпус + elif any_of_in(BUILDING_PREFIXES, tokens[i].lower()): + if accumulator and any_of_in(BUILDING_PREFIXES, accumulator[-1].lower() ): + res.append( " ".join(accumulator)) + accumulator.pop() + accumulator.append(tokens[i]) + # литера + elif any_of_in(LETTER, tokens[i].lower()): + if accumulator and any_of_in(LETTER, accumulator[-1].lower() ): + res.append(" ".join(accumulator)) + accumulator.pop() + accumulator.append (tokens[i]) + else: + accumulator.append(tokens[i]) + + res.append(" ".join(accumulator)) return res return [address] From cb422b9a2fc47f8413a3fbe6c4726a6b94ca0fee Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Mon, 16 Oct 2023 00:42:22 +0300 Subject: [PATCH 02/12] Classification(ver.1) --- parser/address.py | 144 +++++++++++++++++++++++++++++++++------------- 1 file changed, 103 insertions(+), 41 deletions(-) diff --git a/parser/address.py b/parser/address.py index 79eac1b..a7ca5dd 100644 --- a/parser/address.py +++ b/parser/address.py @@ -7,22 +7,29 @@ import pandas as pd T = TypeVar("T") -SETTLEMENTS_PREFIXES=("г","мо","р-н","п","д","гп","c","хутор","массив","тер","СНТ","СТ","ДСК","ДНП","ДПК","НП","садоводство") -STREET_PREFIXES = ("ул", "бул", "пр", "ш", "пер", "дор", "маг", "наб", "пл", "просп", "туп", "аллея", "мост", "парк", "кольцо","проезд") -HOUSES_PREFIXES = ("д.", "д","уч","участок") -BUILDING_PREFIXES=("к", "корп") -LETTER=("лит", "литера") +CLASSES = ("s", "h", "b", "l", "?") -def unfold_house_ranges(address:str, token: str) -> List[str]: - adresses=[] +SETTLEMENTS_PREFIXES = ( + "г", "мо", "р-н", "п", "д", "гп", "c", "хутор", "массив", "тер", "СНТ", "СТ", "ДСК", "ДНП", "ДПК", "НП", + "садоводство") +STREET_PREFIXES = ( + " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе","линия","аллея", "мост", "парк", "кольцо","проезд", + "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.") +HOUSES_PREFIXES = ("д.", "уч.", "участок") +BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение") +LETTER = ("лит.", "литера"," л.") + + +def unfold_house_ranges(address: str, token: str) -> List[str]: + adresses = [] pairs_strings = re.findall(r"([\d]+-[\d]+)", token) for pair_string in pairs_strings: a, b = pair_string.split("-") a, b = int(a), int(b) if b > a: - token = token.replace(pair_string,"") - adresses += [address + " "+ token + number for number in map(str, range(a, b + 1))] + token = token.replace(pair_string, "") + adresses += [address + " " + token + number for number in map(str, range(a, b + 1))] else: @@ -52,6 +59,46 @@ def flatten(arr: Iterable[List[T]]) -> List[T]: return sum(arr, []) +def find_litera(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(LETTER, token['obj']) \ + or re.search(r"\d{1,3}[А-Яа-я]( |$)", token['obj']): + return "l" + # не работает + if (re.search(r"\b[А-Яа-я]{1}\b", token['obj']) and "l" in pre_token['class']): + return "l" + return "" + + +def find_building(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(BUILDING_PREFIXES, token['obj']) \ + or (re.search(r"\d", token['obj']) and "b" in pre_token['class']) \ + or re.search(r"к\.*\d", token['obj']) \ + or re.search(r"\d", token['obj']) and "b" in pre_token['class']: + return "b" + return "" + + +def find_house(token: pd.Series, 
pre_token: pd.Series) -> str: + if any_of_in(HOUSES_PREFIXES, token['obj']): + return "h" + if re.search(r"(д|д\.) ?\d{1,3} ?\/*\d* ?", token['obj']) and not ("-я" in token['obj']): + if "h" in pre_token['class'] \ + or "s" in pre_token['class'] \ + or "s" in token['class']: + return "h" + # не работает + if re.search(r"\d{1,3}", token['obj']) and ("s" in pre_token['class'] or "h" in pre_token['class']): + return "h" + return "" + + +def find_street(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(STREET_PREFIXES, token['obj']) \ + or (re.search(r"[А-Я]{1}[а-я]+", token['obj']) and "s" in pre_token['class']): + return "s" + return "" + + # TODO: переработать систему из if в нормальный вид и классификация чисел/букв def split_address(address: str) -> List[str]: if ";" in address: @@ -59,47 +106,62 @@ def split_address(address: str) -> List[str]: if "," in address: tokens = address.split(",") - tokens = list(map(str.strip, filter(lambda token: token != "", tokens))) + t = list(map(str.strip, filter(lambda token: token != "", tokens))) + # токены в датафрэйм + tokens = pd.DataFrame() + tokens['obj'] = t + tokens.insert(len(tokens.columns), "class", "") res = [] - accumulator = [] + accumulator = "" for i in range(len(tokens)): # TODO: напселённые пункты # if any_of_in(SETTLEMENTS_PREFIXES, tokens[i].lower()) # accumulator += tokens[i] + cur_tk = tokens.iloc[i] - # улицы - if any_of_in(STREET_PREFIXES, tokens[i].lower()): - if accumulator and any_of_in(STREET_PREFIXES, "".join(accumulator).lower() ): - res.append( " ".join(accumulator)) - accumulator=[] - accumulator.append(tokens[i]) - - # дома - elif any_of_in(HOUSES_PREFIXES, tokens[i].lower()): - if accumulator and any_of_in(HOUSES_PREFIXES, accumulator[-1].lower()): - res.append(" ".join(accumulator)) - accumulator.pop() - res.append(unfold_house_ranges(" ".join(accumulator),tokens[i])) - accumulator=res[-1] - res.pop() - # корпус - elif any_of_in(BUILDING_PREFIXES, tokens[i].lower()): - if accumulator and any_of_in(BUILDING_PREFIXES, accumulator[-1].lower() ): - res.append( " ".join(accumulator)) - accumulator.pop() - accumulator.append(tokens[i]) - # литера - elif any_of_in(LETTER, tokens[i].lower()): - if accumulator and any_of_in(LETTER, accumulator[-1].lower() ): - res.append(" ".join(accumulator)) - accumulator.pop() - accumulator.append (tokens[i]) + if i == 0: + pre_token = pd.Series(data=["", ""], index=['obj', 'class']) else: - accumulator.append(tokens[i]) - - res.append(" ".join(accumulator)) + pre_token = tokens.iloc[i - 1] + obj_class = find_street(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "s" in tokens['class'].iloc[i - 1]: + res.append(accumulator) + accumulator = "" + accumulator += tokens["obj"].iloc[i] + obj_class = find_house(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "h" in tokens['class'].iloc[i - 1]: + res.append(accumulator) + num = re.findall("\d{,3}", tokens['obj'].iloc[i])[-1] + accumulator = re.sub(r"\d{,3} ?\/*\d* ?", num,accumulator) + else: + accumulator += tokens["obj"].iloc[i] + obj_class = find_building(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "b" in tokens['class'].iloc[i - 1]: + res.append(accumulator) + num = re.findall("\d", tokens['obj'].iloc[i])[-1] + accumulator = re.sub(r"\d$", num, accumulator) + else: + accumulator += tokens["obj"].iloc[i] + obj_class = find_litera(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "l" in tokens['class'].iloc[i - 1]: + res.append(accumulator) 
+ num = re.findall("[А-яа-я]", tokens['obj'].iloc[i].strip())[-1] + accumulator = re.sub(r"[А-яа-я]$", num, accumulator) + else: + accumulator += tokens["obj"].iloc[i] + if cur_tk['class'] == "": + cur_tk['class'] = "w" + print(cur_tk) return res return [address] From d822c5012b4e29da1478c57bc9edd46a43c45c41 Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Mon, 16 Oct 2023 22:02:12 +0300 Subject: [PATCH 03/12] classification(full address) --- parser/address.py | 127 +++++++++++++++++++++++++++++++--------------- parser/util.py | 6 ++- 2 files changed, 92 insertions(+), 41 deletions(-) diff --git a/parser/address.py b/parser/address.py index a7ca5dd..b9d18c9 100644 --- a/parser/address.py +++ b/parser/address.py @@ -7,16 +7,17 @@ import pandas as pd T = TypeVar("T") -CLASSES = ("s", "h", "b", "l", "?") - -SETTLEMENTS_PREFIXES = ( - "г", "мо", "р-н", "п", "д", "гп", "c", "хутор", "массив", "тер", "СНТ", "СТ", "ДСК", "ДНП", "ДПК", "НП", - "садоводство") +CLASSES = ("d", "c", "t", "s", "h", "b", "l", "r", "w") +DISTRICTS_PREFIXES = ("мо ", "р-н") +COUNTRYSIDE_PREFIXES = ( + " г", " п", " д", " гп", " рп", " кп", " пгт", " c", "хутор", " урочище" + "г.", "п.", "д.", "гп.", "рп.", "кп.", "пгт.", "c.") +TERRITORY_PREFIXES =("тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ", "т/б ", "садоводство", "массив", "хоз","сад-во","с-во") STREET_PREFIXES = ( - " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе","линия","аллея", "мост", "парк", "кольцо","проезд", + " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе","линия","аллея", "мост", " парк", "кольцо","проезд", "съезд", "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.") -HOUSES_PREFIXES = ("д.", "уч.", "участок") -BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение") +HOUSES_PREFIXES = ("д.", "уч.", "участок","мкд","тп") +BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение","корпус") LETTER = ("лит.", "литера"," л.") @@ -58,46 +59,67 @@ def any_of_in(substrings: Iterable[str], string: str) -> bool: def flatten(arr: Iterable[List[T]]) -> List[T]: return sum(arr, []) +def find_room(token: pd.Series, pre_token: pd.Series) -> str: + if re.search(r"пом\.?", token['obj']): + return "r" + return "" def find_litera(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(LETTER, token['obj']) \ - or re.search(r"\d{1,3}[А-Яа-я]( |$)", token['obj']): + if any_of_in(LETTER, token['obj'].lower()) \ + or re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token['obj']): return "l" - # не работает - if (re.search(r"\b[А-Яа-я]{1}\b", token['obj']) and "l" in pre_token['class']): + if (re.search(r"\b([А-Я]|[а-я]){1}$", token['obj']) \ + and ("l" in pre_token['class'] or "h" in pre_token['class'])) \ + and not (" ш" in token["obj"]) \ + and not find_countryside(token,pre_token): return "l" return "" def find_building(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(BUILDING_PREFIXES, token['obj']) \ - or (re.search(r"\d", token['obj']) and "b" in pre_token['class']) \ - or re.search(r"к\.*\d", token['obj']) \ - or re.search(r"\d", token['obj']) and "b" in pre_token['class']: - return "b" + if re.search(r"\d", token['obj']): + if any_of_in(BUILDING_PREFIXES, token['obj'].lower()) \ + or "b" in pre_token['class'] and not ("h" in token['class'])\ + or re.search(r"к\.* ?\d", token['obj']): + return "b" return "" def find_house(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(HOUSES_PREFIXES, token['obj']): - 
return "h" - if re.search(r"(д|д\.) ?\d{1,3} ?\/*\d* ?", token['obj']) and not ("-я" in token['obj']): - if "h" in pre_token['class'] \ - or "s" in pre_token['class'] \ - or "s" in token['class']: + if re.search(r"\d{1,4}", token['obj']): + if any_of_in(HOUSES_PREFIXES, token['obj'].lower()): + return "h" + if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token['obj']): + return "h" + if ("s" in pre_token['class'] or "h" in pre_token['class'] or "s" in token['class']) \ + and not any_of_in(("-я", "-й", "-Я"), token['obj'])\ + and not find_building(token,pre_token): return "h" - # не работает - if re.search(r"\d{1,3}", token['obj']) and ("s" in pre_token['class'] or "h" in pre_token['class']): - return "h" return "" def find_street(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(STREET_PREFIXES, token['obj']) \ - or (re.search(r"[А-Я]{1}[а-я]+", token['obj']) and "s" in pre_token['class']): + if any_of_in(STREET_PREFIXES, token['obj'].lower()) \ + or re.search(r"[А-Я]{1}[а-я]+ая", token['obj']): return "s" return "" +def find_territory(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(TERRITORY_PREFIXES, token['obj'].lower()): + return "t" + return "" +def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(COUNTRYSIDE_PREFIXES, token['obj'].lower()) \ + and not find_house(token,pre_token) \ + and not find_street(token,pre_token): + return "c" + return "" + +def find_district(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(DISTRICTS_PREFIXES, token['obj'].lower()): + return "d" + return "" + # TODO: переработать систему из if в нормальный вид и классификация чисел/букв def split_address(address: str) -> List[str]: @@ -125,43 +147,68 @@ def split_address(address: str) -> List[str]: pre_token = pd.Series(data=["", ""], index=['obj', 'class']) else: pre_token = tokens.iloc[i - 1] + + obj_class = find_district(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "d" in pre_token['class']: + res.append(accumulator) + accumulator = "" + accumulator += cur_tk["obj"] + obj_class = find_countryside(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "c" in pre_token['class']: + res.append(accumulator) + accumulator = "" + accumulator += cur_tk["obj"] + obj_class = find_territory(cur_tk, pre_token) + if obj_class: + cur_tk["class"] += obj_class + if "t" in pre_token['class']: + res.append(accumulator) + accumulator = "" + accumulator +=cur_tk["obj"] obj_class = find_street(cur_tk, pre_token) if obj_class: cur_tk["class"] += obj_class - if "s" in tokens['class'].iloc[i - 1]: + if "s" in pre_token['class']: res.append(accumulator) accumulator = "" - accumulator += tokens["obj"].iloc[i] + accumulator += cur_tk["obj"] obj_class = find_house(cur_tk, pre_token) if obj_class: cur_tk["class"] += obj_class - if "h" in tokens['class'].iloc[i - 1]: + if "h" in pre_token["class"]: res.append(accumulator) - num = re.findall("\d{,3}", tokens['obj'].iloc[i])[-1] - accumulator = re.sub(r"\d{,3} ?\/*\d* ?", num,accumulator) + num = re.findall("\d{1,4}", cur_tk['obj'])[-1] + accumulator = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator) else: - accumulator += tokens["obj"].iloc[i] + accumulator += cur_tk["obj"] obj_class = find_building(cur_tk, pre_token) if obj_class: cur_tk["class"] += obj_class - if "b" in tokens['class'].iloc[i - 1]: + if "b" in pre_token["class"]: res.append(accumulator) num = re.findall("\d", tokens['obj'].iloc[i])[-1] accumulator = re.sub(r"\d$", num, accumulator) else: - accumulator += 
tokens["obj"].iloc[i] + accumulator += pre_token["obj"] obj_class = find_litera(cur_tk, pre_token) if obj_class: cur_tk["class"] += obj_class - if "l" in tokens['class'].iloc[i - 1]: + if "l" in pre_token["class"]: res.append(accumulator) - num = re.findall("[А-яа-я]", tokens['obj'].iloc[i].strip())[-1] + num = re.findall("[А-яа-я]", cur_tk["obj"].strip())[-1] accumulator = re.sub(r"[А-яа-я]$", num, accumulator) else: - accumulator += tokens["obj"].iloc[i] + accumulator += cur_tk["obj"] if cur_tk['class'] == "": cur_tk['class'] = "w" - print(cur_tk) + tokens.iloc[i] = cur_tk + print(tokens.iloc[i]) + + # print(cur_tk) return res return [address] diff --git a/parser/util.py b/parser/util.py index fe36f0e..6b54416 100644 --- a/parser/util.py +++ b/parser/util.py @@ -10,12 +10,16 @@ from . import ( def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser: if parser is None: - parser = LenenergoParser() + parser = LenenergoParser(ndays=15) print(parser) parser.df = split_addresses(parser.df) + for i in range(len(parser.df)): + print(parser.df['Улица'].iloc[i]) + + parser.df = concurrent_fetch_builing_ids(parser.df) parser.df = preprocess_df(parser.df) From a71acc2ddf325b6af5406a70312adfde247e6d0f Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Sat, 21 Oct 2023 17:32:37 +0300 Subject: [PATCH 04/12] Working version of classification (not final) --- .gitignore | 3 +- parser/address.py | 225 ++++++++++++++++++++++++---------------------- parser/util.py | 6 +- 3 files changed, 121 insertions(+), 113 deletions(-) diff --git a/.gitignore b/.gitignore index 71444f4..fe14a37 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ __pycache__ .env data*.csv -.idea/ \ No newline at end of file +.idea/ +.ipynb_checkpoints \ No newline at end of file diff --git a/parser/address.py b/parser/address.py index b9d18c9..4c3bdca 100644 --- a/parser/address.py +++ b/parser/address.py @@ -7,50 +7,36 @@ import pandas as pd T = TypeVar("T") -CLASSES = ("d", "c", "t", "s", "h", "b", "l", "r", "w") -DISTRICTS_PREFIXES = ("мо ", "р-н") +CLASSES = ("w", "d", "c", "t", "s", "h", "b", "l", "r") + +DISTRICTS_PREFIXES = ("мо ", "р-н","городское","лесхоз") COUNTRYSIDE_PREFIXES = ( - " г", " п", " д", " гп", " рп", " кп", " пгт", " c", "хутор", " урочище" - "г.", "п.", "д.", "гп.", "рп.", "кп.", "пгт.", "c.") -TERRITORY_PREFIXES =("тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ", "т/б ", "садоводство", "массив", "хоз","сад-во","с-во") + "г", "п", "д", "гп", "рп", "кп", "пгт", "c", "хутор", " урочище") +TERRITORY_PREFIXES = ( +"тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ", "т/б ", "садоводство", "массив", "хозя", "сад-во") STREET_PREFIXES = ( - " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе","линия","аллея", "мост", " парк", "кольцо","проезд", "съезд", + " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе", "лини", "аллея", + "мост", " парк", "кольцо", "проезд", "съезд","переулок", "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.") -HOUSES_PREFIXES = ("д.", "уч.", "участок","мкд","тп") -BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение","корпус") -LETTER = ("лит.", "литера"," л.") +HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп","дом") +BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение", "корпус") +LETTER = ("лит.", "литера", " л.") -def unfold_house_ranges(address: str, token: str) -> List[str]: - adresses = [] +def 
unfold_house_ranges(token: str) -> List[str]: + addresses = [] pairs_strings = re.findall(r"([\d]+-[\d]+)", token) for pair_string in pairs_strings: a, b = pair_string.split("-") a, b = int(a), int(b) if b > a: - token = token.replace(pair_string, "") - adresses += [address + " " + token + number for number in map(str, range(a, b + 1))] - - + addresses += [re.sub(r"([\d]+-[\d]+)", number, token) for number in map(str, range(a, b + 1))] else: token = token.replace("-", "/") - adresses += address + " " + token - if not adresses: - adresses.append(address + " " + token) - return adresses - - -def unfold_houses_list(token: str) -> List[str]: - token = unfold_house_ranges(token) - - reg = re.compile(r"(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )") - - if len(re.findall(reg, token)) > 1: - tokens = token.split(",") - return [*[tokens[0] + " " + house_token for house_token in tokens[1:]]] - return [token] - + if not addresses: + addresses.append(token) + return addresses def any_of_in(substrings: Iterable[str], string: str) -> bool: return any(map(lambda substring: substring in string, substrings)) @@ -59,19 +45,21 @@ def any_of_in(substrings: Iterable[str], string: str) -> bool: def flatten(arr: Iterable[List[T]]) -> List[T]: return sum(arr, []) + def find_room(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"пом\.?", token['obj']): + if re.search(r"пом\.?", token['obj']): return "r" return "" + def find_litera(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(LETTER, token['obj'].lower()) \ or re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token['obj']): return "l" if (re.search(r"\b([А-Я]|[а-я]){1}$", token['obj']) \ - and ("l" in pre_token['class'] or "h" in pre_token['class'])) \ + and ("l" in pre_token['class'] or "h" in pre_token['class'])) \ and not (" ш" in token["obj"]) \ - and not find_countryside(token,pre_token): + and not find_countryside(token, pre_token): return "l" return "" @@ -79,7 +67,7 @@ def find_litera(token: pd.Series, pre_token: pd.Series) -> str: def find_building(token: pd.Series, pre_token: pd.Series) -> str: if re.search(r"\d", token['obj']): if any_of_in(BUILDING_PREFIXES, token['obj'].lower()) \ - or "b" in pre_token['class'] and not ("h" in token['class'])\ + or "b" in pre_token['class'] and not ("h" in token['class']) \ or re.search(r"к\.* ?\d", token['obj']): return "b" return "" @@ -92,36 +80,58 @@ def find_house(token: pd.Series, pre_token: pd.Series) -> str: if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token['obj']): return "h" if ("s" in pre_token['class'] or "h" in pre_token['class'] or "s" in token['class']) \ - and not any_of_in(("-я", "-й", "-Я"), token['obj'])\ - and not find_building(token,pre_token): + and not any_of_in(("-я", "-й", "-Я"), token['obj']) \ + and not find_building(token, pre_token): return "h" return "" def find_street(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(STREET_PREFIXES, token['obj'].lower()) \ - or re.search(r"[А-Я]{1}[а-я]+ая", token['obj']): + or re.search(r"[а-я]+ая", token['obj']): return "s" return "" + def find_territory(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(TERRITORY_PREFIXES, token['obj'].lower()): return "t" return "" + + def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(COUNTRYSIDE_PREFIXES, token['obj'].lower()) \ - and not find_house(token,pre_token) \ - and not find_street(token,pre_token): + and re.search(r"\b[гпдрпктc]{1,3}(\b|\. 
)", token['obj']) \ + and not find_house(token, pre_token) \ + and not find_street(token, pre_token): return "c" return "" + def find_district(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(DISTRICTS_PREFIXES, token['obj'].lower()): return "d" return "" +def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: + brackets = re.search(r"\(.+\)", token["obj"]) + if brackets: + token["obj"] = re.sub(r"\(.+\)", "()", token["obj"]) + token["class"] += find_district(token, pre_token) + token["class"] += find_countryside(token, pre_token) + token["class"] += find_territory(token, pre_token) + token["class"] += find_street(token, pre_token) + token["class"] += find_house(token, pre_token) + token["class"] += find_building(token, pre_token) + token["class"] += find_litera(token, pre_token) + if token['class'] == "": + token['class'] = "w" + if brackets: + token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) + return token -# TODO: переработать систему из if в нормальный вид и классификация чисел/букв + +# TODO: переработать систему из if в нормальный вид def split_address(address: str) -> List[str]: if ";" in address: address = address.replace(";", ",") @@ -129,18 +139,15 @@ def split_address(address: str) -> List[str]: tokens = address.split(",") t = list(map(str.strip, filter(lambda token: token != "", tokens))) - # токены в датафрэйм + tokens = pd.DataFrame() tokens['obj'] = t + tokens = tokens[tokens["obj"] != ""] tokens.insert(len(tokens.columns), "class", "") res = [] - accumulator = "" + accumulator = pd.Series(data={"address": "", "class": ""}) for i in range(len(tokens)): - - # TODO: напселённые пункты - # if any_of_in(SETTLEMENTS_PREFIXES, tokens[i].lower()) - # accumulator += tokens[i] cur_tk = tokens.iloc[i] if i == 0: @@ -148,71 +155,72 @@ def split_address(address: str) -> List[str]: else: pre_token = tokens.iloc[i - 1] - obj_class = find_district(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "d" in pre_token['class']: - res.append(accumulator) - accumulator = "" - accumulator += cur_tk["obj"] - obj_class = find_countryside(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "c" in pre_token['class']: - res.append(accumulator) - accumulator = "" - accumulator += cur_tk["obj"] - obj_class = find_territory(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "t" in pre_token['class']: - res.append(accumulator) - accumulator = "" - accumulator +=cur_tk["obj"] - obj_class = find_street(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "s" in pre_token['class']: - res.append(accumulator) - accumulator = "" - accumulator += cur_tk["obj"] - obj_class = find_house(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "h" in pre_token["class"]: - res.append(accumulator) - num = re.findall("\d{1,4}", cur_tk['obj'])[-1] - accumulator = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator) - else: - accumulator += cur_tk["obj"] - obj_class = find_building(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "b" in pre_token["class"]: - res.append(accumulator) - num = re.findall("\d", tokens['obj'].iloc[i])[-1] - accumulator = re.sub(r"\d$", num, accumulator) - else: - accumulator += pre_token["obj"] - obj_class = find_litera(cur_tk, pre_token) - if obj_class: - cur_tk["class"] += obj_class - if "l" in pre_token["class"]: - res.append(accumulator) - num = re.findall("[А-яа-я]", cur_tk["obj"].strip())[-1] - accumulator = re.sub(r"[А-яа-я]$", 
num, accumulator) - else: - accumulator += cur_tk["obj"] - if cur_tk['class'] == "": - cur_tk['class'] = "w" + cur_tk = address_classification(cur_tk, pre_token) tokens.iloc[i] = cur_tk print(tokens.iloc[i]) - # print(cur_tk) + if not accumulator["class"]: + accumulator["class"] = cur_tk['class'] + accumulator["address"] = cur_tk["obj"] + continue + if CLASSES.index(accumulator["class"][-1]) < CLASSES.index(cur_tk["class"][0]) and accumulator["class"]!="w": + accumulator["class"] += cur_tk['class'] + accumulator["address"] += " " + cur_tk["obj"] + else: + ad_no_ranges = unfold_house_ranges(accumulator["address"]) + accumulator["address"] = ad_no_ranges[-1] + res.extend(ad_no_ranges) + while accumulator["class"] and CLASSES.index(accumulator["class"][-1]) > CLASSES.index(cur_tk["class"][0]): + if accumulator["class"][-1] == "h": + accumulator["address"] = re.sub(r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", accumulator["address"].lower()) + elif accumulator["class"][-1] == "b": + num = re.findall("к{0,1}\.? ?\d", accumulator["address"])[-1] + accumulator["address"] = re.sub(num, "", accumulator["address"]) + elif accumulator["class"][-1] == "l": + accumulator ["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$","", accumulator["address"]) + elif accumulator["class"][-1] == "r": + accumulator["address"] = re.sub(r"пом\.? ?\d+","", accumulator["address"]) + accumulator["class"] = accumulator["class"][:-1] + if not accumulator["class"] or CLASSES.index(cur_tk["class"][0]) <= CLASSES.index("s") or accumulator["class"]=="w": + accumulator["class"] = cur_tk["class"] + accumulator["address"] = cur_tk["obj"] + if cur_tk["class"][0] == "h": + num = re.findall("\d{1,4} ?\/?\d* ?", cur_tk['obj'])[0] + accumulator["address"] = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator["address"]) + cur_tk["class"] =cur_tk["class"][1:] + if cur_tk["class"] and cur_tk["class"][0] == "b": + num = re.findall("\d", cur_tk["obj"])[-1] + if num and not "b" in accumulator["class"]: + accumulator["class"] += "b" + accumulator["address"] += "к." 
+ num + else: + accumulator["address"] = re.sub(r"\d$", num, accumulator["address"]) + cur_tk["class"] = cur_tk["class"][1:] + + if cur_tk["class"] and cur_tk["class"][0] == "l": + num = re.findall("[А-Яа-я]", cur_tk["obj"].strip())[-1] + accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) + accumulator["address"] += num + if num and not "l" in accumulator["class"]: + accumulator["class"] += "l" + else: + if re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", accumulator["address"]): + accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) + res.extend(unfold_house_ranges(accumulator["address"])) + print(res) return res return [address] +def split_pesoch_res(address: str) -> List[str]: + t = re.sub(r",", " ", address) + t = re.split(r"(Санкт-Петербург|Ленинградская обл|Л\.О)", t) + t = list(map(str.strip, filter(lambda token: token != "", t))) + tokens = [t[i] + " " + t[i+1] for i in range(0, len(t)-1, 2)] + + if tokens: + return list(set(tokens)) + return [address] def process_row(row: pd.Series[str]) -> pd.Series[str]: row = row.copy() @@ -220,7 +228,10 @@ def process_row(row: pd.Series[str]) -> pd.Series[str]: if pd.isnull(row["Улица"]): row["Улица"] = [None] else: - addresses = split_address(row["Улица"]) + if row["РЭС"] == "Песочинский РЭС": + addresses = split_pesoch_res(row["Улица"]) + else: + addresses = split_address(row["Улица"]) row["Улица"] = addresses return row @@ -229,4 +240,4 @@ def process_row(row: pd.Series[str]) -> pd.Series[str]: def split_addresses(df: pd.DataFrame) -> pd.DataFrame: merged_df = df.apply(process_row, axis=1).reset_index() - return merged_df.explode("Улица", ignore_index=True) + return merged_df.explode("Улица", ignore_index=True) \ No newline at end of file diff --git a/parser/util.py b/parser/util.py index 6b54416..983c9b8 100644 --- a/parser/util.py +++ b/parser/util.py @@ -10,16 +10,12 @@ from . import ( def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser: if parser is None: - parser = LenenergoParser(ndays=15) + parser = LenenergoParser(file_path = r"C:\Users\Юля\PycharmProjects\machine_learning\lenengro_parser\data_Rosseti.csv") print(parser) parser.df = split_addresses(parser.df) - for i in range(len(parser.df)): - print(parser.df['Улица'].iloc[i]) - - parser.df = concurrent_fetch_builing_ids(parser.df) parser.df = preprocess_df(parser.df) From 1fd7a123f93a5f160fc64dd45d692748a4ad4f4e Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Sat, 21 Oct 2023 18:12:36 +0300 Subject: [PATCH 05/12] New fuctions --- parser/address.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/parser/address.py b/parser/address.py index 4c3bdca..73f53e4 100644 --- a/parser/address.py +++ b/parser/address.py @@ -130,6 +130,21 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) return token +def cut_address(ad: pd.Series, cl: str) -> pd.Series: + while ad["class"] and CLASSES.index(ad["class"][-1]) > CLASSES.index(cl[0]): + if ad["class"][-1] == "h": + ad["address"] = re.sub(r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", + ad["address"].lower()) + elif ad["class"][-1] == "b": + num = re.findall("к{0,1}\.? ?\d", ad["address"])[-1] + ad["address"] = re.sub(num, "", ad["address"]) + elif ad["class"][-1] == "l": + ad["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$", "", ad["address"]) + elif ad["class"][-1] == "r": + ad["address"] = re.sub(r"пом\.? 
?\d+", "", ad["address"]) + ad["class"] = ad["class"][:-1] + return ad + # TODO: переработать систему из if в нормальный вид def split_address(address: str) -> List[str]: @@ -163,31 +178,27 @@ def split_address(address: str) -> List[str]: accumulator["class"] = cur_tk['class'] accumulator["address"] = cur_tk["obj"] continue + if CLASSES.index(accumulator["class"][-1]) < CLASSES.index(cur_tk["class"][0]) and accumulator["class"]!="w": accumulator["class"] += cur_tk['class'] accumulator["address"] += " " + cur_tk["obj"] else: ad_no_ranges = unfold_house_ranges(accumulator["address"]) accumulator["address"] = ad_no_ranges[-1] + res.extend(ad_no_ranges) - while accumulator["class"] and CLASSES.index(accumulator["class"][-1]) > CLASSES.index(cur_tk["class"][0]): - if accumulator["class"][-1] == "h": - accumulator["address"] = re.sub(r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", accumulator["address"].lower()) - elif accumulator["class"][-1] == "b": - num = re.findall("к{0,1}\.? ?\d", accumulator["address"])[-1] - accumulator["address"] = re.sub(num, "", accumulator["address"]) - elif accumulator["class"][-1] == "l": - accumulator ["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$","", accumulator["address"]) - elif accumulator["class"][-1] == "r": - accumulator["address"] = re.sub(r"пом\.? ?\d+","", accumulator["address"]) - accumulator["class"] = accumulator["class"][:-1] + + accumulator = cut_address(accumulator, cur_tk["class"]) + if not accumulator["class"] or CLASSES.index(cur_tk["class"][0]) <= CLASSES.index("s") or accumulator["class"]=="w": accumulator["class"] = cur_tk["class"] accumulator["address"] = cur_tk["obj"] + if cur_tk["class"][0] == "h": num = re.findall("\d{1,4} ?\/?\d* ?", cur_tk['obj'])[0] accumulator["address"] = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator["address"]) cur_tk["class"] =cur_tk["class"][1:] + if cur_tk["class"] and cur_tk["class"][0] == "b": num = re.findall("\d", cur_tk["obj"])[-1] if num and not "b" in accumulator["class"]: From 259c71b17bfee77ba4e71eed2a26423319af9b2e Mon Sep 17 00:00:00 2001 From: AnastasiaOnimovma Date: Mon, 23 Oct 2023 00:42:27 +0300 Subject: [PATCH 06/12] Street recognition --- parser/address.py | 73 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/parser/address.py b/parser/address.py index 73f53e4..2d94cae 100644 --- a/parser/address.py +++ b/parser/address.py @@ -7,8 +7,7 @@ import pandas as pd T = TypeVar("T") -CLASSES = ("w", "d", "c", "t", "s", "h", "b", "l", "r") - +CLASSES = ("w", "d", "c", "t", "s", "h", "b","e", "l", "r") DISTRICTS_PREFIXES = ("мо ", "р-н","городское","лесхоз") COUNTRYSIDE_PREFIXES = ( "г", "п", "д", "гп", "рп", "кп", "пгт", "c", "хутор", " урочище") @@ -18,9 +17,11 @@ STREET_PREFIXES = ( " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе", "лини", "аллея", "мост", " парк", "кольцо", "проезд", "съезд","переулок", "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.") -HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп","дом") -BUILDING_PREFIXES = ("к.", "корп", 'стр.', "строение", "корпус") +HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп","дом","дома") +BUILDING_PREFIXES = ("к.", "к ","корп", "корпус") +EDIFICE_PREFIXES=("стр.", "строение") LETTER = ("лит.", "литера", " л.") +PREFIXES = (DISTRICTS_PREFIXES, COUNTRYSIDE_PREFIXES, TERRITORY_PREFIXES, STREET_PREFIXES, HOUSES_PREFIXES, BUILDING_PREFIXES, EDIFICE_PREFIXES,LETTER) def unfold_house_ranges(token: str) -> 
List[str]: @@ -47,12 +48,14 @@ def flatten(arr: Iterable[List[T]]) -> List[T]: def find_room(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"пом\.?", token['obj']): + if re.search(r"\bпом\.?", token['obj']): return "r" return "" def find_litera(token: pd.Series, pre_token: pd.Series) -> str: + if find_room(token, pre_token): + return "" if any_of_in(LETTER, token['obj'].lower()) \ or re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token['obj']): return "l" @@ -62,33 +65,47 @@ def find_litera(token: pd.Series, pre_token: pd.Series) -> str: and not find_countryside(token, pre_token): return "l" return "" - +def find_edifice(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(EDIFICE_PREFIXES, token['obj'].lower()): + return "e" + return "" def find_building(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d", token['obj']): + if re.search(r"\d", token['obj']) and not find_room(token,pre_token): if any_of_in(BUILDING_PREFIXES, token['obj'].lower()) \ - or "b" in pre_token['class'] and not ("h" in token['class']) \ + or "b" in pre_token['class'] and not ("h" in token['class']) and not find_edifice(token,pre_token)\ or re.search(r"к\.* ?\d", token['obj']): return "b" return "" def find_house(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d{1,4}", token['obj']): + if re.search(r"\d{1,4}", token['obj']) and not find_room(token,pre_token): if any_of_in(HOUSES_PREFIXES, token['obj'].lower()): return "h" if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token['obj']): return "h" if ("s" in pre_token['class'] or "h" in pre_token['class'] or "s" in token['class']) \ and not any_of_in(("-я", "-й", "-Я"), token['obj']) \ - and not find_building(token, pre_token): + and not find_building(token, pre_token)\ + and not find_edifice(token,pre_token): return "h" + if find_building(token, pre_token) \ + and not any_of_in(("-я", "-й", "-Я"), token['obj']) \ + and True: + if len(re.findall(r"\d{1,4}", token['obj'])) > 1: + return "h" + if int(re.search(r"\d{1,4}", token['obj']).group()) // 10 >0: + return "h" return "" def find_street(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(STREET_PREFIXES, token['obj'].lower()) \ - or re.search(r"[а-я]+ая", token['obj']): + if any_of_in(STREET_PREFIXES, token['obj'].lower()): + return "s" + if re.search(r"\b[А-Яа-я]{4,}\b", token['obj']) \ + and not any([el in token["obj"].lower() for pr in PREFIXES for el in pr if len(el)>2]) \ + and not ("d" in token["class"] or "t" in token["class"] or "c" in token["class"]): return "s" return "" @@ -103,7 +120,7 @@ def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: if any_of_in(COUNTRYSIDE_PREFIXES, token['obj'].lower()) \ and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token['obj']) \ and not find_house(token, pre_token) \ - and not find_street(token, pre_token): + and not any_of_in(STREET_PREFIXES, token['obj'].lower()): return "c" return "" @@ -123,7 +140,9 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: token["class"] += find_street(token, pre_token) token["class"] += find_house(token, pre_token) token["class"] += find_building(token, pre_token) + token["class"] += find_edifice(token, pre_token) token["class"] += find_litera(token, pre_token) + token["class"] += find_room(token, pre_token) if token['class'] == "": token['class'] = "w" if brackets: @@ -138,6 +157,8 @@ def cut_address(ad: pd.Series, cl: str) -> pd.Series: elif ad["class"][-1] == "b": num = re.findall("к{0,1}\.? 
?\d", ad["address"])[-1] ad["address"] = re.sub(num, "", ad["address"]) + elif ad["class"][-1] == "e": + ad["address"] = re.sub(r"cтр\.? ?\d", "", ad["address"]) elif ad["class"][-1] == "l": ad["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$", "", ad["address"]) elif ad["class"][-1] == "r": @@ -157,7 +178,8 @@ def split_address(address: str) -> List[str]: tokens = pd.DataFrame() tokens['obj'] = t - tokens = tokens[tokens["obj"] != ""] + for el in ("", "уг.", "д."): + tokens = tokens[tokens["obj"] != el] tokens.insert(len(tokens.columns), "class", "") res = [] accumulator = pd.Series(data={"address": "", "class": ""}) @@ -195,8 +217,14 @@ def split_address(address: str) -> List[str]: accumulator["address"] = cur_tk["obj"] if cur_tk["class"][0] == "h": - num = re.findall("\d{1,4} ?\/?\d* ?", cur_tk['obj'])[0] - accumulator["address"] = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator["address"]) + num = re.findall("\d{1,4} ?[\/\-]?\d* ?", cur_tk['obj'])[0] + if any_of_in(("-я", "-й", "-Я"), accumulator["address"]): + idx = 1 + else: + idx = 0 + num_ac = re.findall("\d{1,4} ?[\/\-]?\d* ?", accumulator["address"]) + if num_ac: + accumulator["address"] = re.sub(num_ac[idx], num, accumulator["address"]) cur_tk["class"] =cur_tk["class"][1:] if cur_tk["class"] and cur_tk["class"][0] == "b": @@ -208,6 +236,13 @@ def split_address(address: str) -> List[str]: accumulator["address"] = re.sub(r"\d$", num, accumulator["address"]) cur_tk["class"] = cur_tk["class"][1:] + if cur_tk["class"] and cur_tk["class"][0] == "e": + num = re.findall("стр\.? ?\d", cur_tk["obj"].strip())[-1] + accumulator["address"] = re.sub(r"cтр\. ?\d", num, accumulator["address"].strip()) + if num and not "e" in accumulator["class"]: + accumulator["class"] += "e" + cur_tk["class"] = cur_tk["class"][1:] + if cur_tk["class"] and cur_tk["class"][0] == "l": num = re.findall("[А-Яа-я]", cur_tk["obj"].strip())[-1] accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) @@ -217,6 +252,12 @@ def split_address(address: str) -> List[str]: else: if re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", accumulator["address"]): accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) + if cur_tk["class"] and cur_tk["class"][0] == "r": + num = re.findall("пом\. ?\-?\d*\w?", cur_tk["obj"].strip())[-1] + accumulator["address"] = re.sub(r"пом\. ?\d\-?\d*\w?", num, accumulator["address"].strip()) + if num and not "r" in accumulator["class"]: + accumulator["class"] += "r" + cur_tk["class"] = cur_tk["class"][1:] res.extend(unfold_house_ranges(accumulator["address"])) print(res) return res From 5722fc86fbb8964e62a704f332f61008264e0b82 Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 10:44:08 +0300 Subject: [PATCH 07/12] Rewrote split_address as a class AddressSplitter --- parser/__main__.py | 4 +- parser/address.py | 218 ++++++++++++++++++++++++++++++++------------- parser/util.py | 2 +- 3 files changed, 160 insertions(+), 64 deletions(-) diff --git a/parser/__main__.py b/parser/__main__.py index de668c0..5616d83 100644 --- a/parser/__main__.py +++ b/parser/__main__.py @@ -3,11 +3,11 @@ import time import schedule -from . import pipeline +from . 
import pipeline, LenenergoParser def job(): - parser = pipeline() + parser = pipeline(LenenergoParser(file_path="./data.csv")) parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H.%M")}.csv') diff --git a/parser/address.py b/parser/address.py index 73f53e4..4239bf7 100644 --- a/parser/address.py +++ b/parser/address.py @@ -2,6 +2,7 @@ from __future__ import annotations import re from typing import Iterable, List, TypeVar +from collections.abc import Sequence import pandas as pd @@ -146,82 +147,177 @@ def cut_address(ad: pd.Series, cl: str) -> pd.Series: return ad -# TODO: переработать систему из if в нормальный вид -def split_address(address: str) -> List[str]: - if ";" in address: - address = address.replace(";", ",") - if "," in address: - tokens = address.split(",") +def is_nonempty_str(string: str) -> bool: + return string != "" - t = list(map(str.strip, filter(lambda token: token != "", tokens))) - tokens = pd.DataFrame() - tokens['obj'] = t - tokens = tokens[tokens["obj"] != ""] - tokens.insert(len(tokens.columns), "class", "") - res = [] - accumulator = pd.Series(data={"address": "", "class": ""}) +def create_token(obj: str = "", token_class: str = ""): + return pd.Series( + { + "obj": obj, + "class": token_class, + } + ) - for i in range(len(tokens)): - cur_tk = tokens.iloc[i] - if i == 0: - pre_token = pd.Series(data=["", ""], index=['obj', 'class']) - else: - pre_token = tokens.iloc[i - 1] +class AddressSplitter(Sequence): + addresses: list[str] + tokens: list[pd.Series] - cur_tk = address_classification(cur_tk, pre_token) - tokens.iloc[i] = cur_tk - print(tokens.iloc[i]) + def __init__(self, address: str): + self.input = address - if not accumulator["class"]: - accumulator["class"] = cur_tk['class'] - accumulator["address"] = cur_tk["obj"] + self.addresses = self.split() + + if len(self.addresses) == 0: + self.addresses = [address] + + # Sequence abstract methods implementation + + def __getitem__(self, key: int): + if key < len(self.addresses): + return self.addresses[key] + else: + raise IndexError() + + def __len__(self): + return len(self.addresses) + + # Address token class manipulations + + def next_class(self) -> str: + return self.token["class"][0] + + def correct_order(self) -> bool: + prev_class = self.accumulator["class"][-1] + + return ( + CLASSES.index(prev_class) < CLASSES.index(self.next_class()) + and self.accumulator["class"] != "w" + ) + + def next_class_is(self, comparing_class: str) -> bool: + return len(self.token["class"]) > 0 and self.next_class() == comparing_class[0] + + def pop_token_class(self): + self.token["class"] = self.token["class"][1:] + + def has_no_class(self, comparing_class: str) -> bool: + return comparing_class[0] not in self.accumulator["class"] + + def next_is_street_or_upper(self) -> bool: + is_unknown_class = self.accumulator["class"] in ("", "w") + + return ( + CLASSES.index(self.next_class()) <= CLASSES.index("s") or is_unknown_class + ) + + # Accumulator manipulation + + def substitue_house(self) -> str: + num = re.findall(r"\d{1,4} ?\/?\d* ?", self.token["obj"])[0] + + return re.sub(r"\d{1,4} ?\/*\d* ?", num, self.accumulator["address"]) + + def append_building(self, num: int) -> pd.Series: + self.accumulator["class"] += "b" + self.accumulator["address"] += "к." 
+ num + + return self.accumulator + + def substitue_building(self, num: int) -> str: + return re.sub(r"\d$", num, self.accumulator["address"]) + + def insert_building(self): + number = re.findall(r"\d", self.token["obj"])[-1] + + if number and self.has_no_class("building"): + self.accumulator = self.append_building(number) + else: + self.accumulator["address"] = self.substitue_building(number) + + def without_letter(self) -> str: + return re.sub(r"[А-Яа-я]$", "", self.accumulator["address"].strip()) + + def substitue_letter(self, letter: str) -> str: + address_without_letter = self.without_letter() + + return address_without_letter + letter + + def insert_letter(self): + letter = re.findall(r"[А-Яа-я]", self.token["obj"].strip())[-1] + self.accumulator["address"] = self.substitue_letter(letter) + + if letter and self.has_no_class("litera"): + self.accumulator["class"] += "l" + + def has_letter_in(self) -> bool: + return ( + re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"]) + is not None + ) + + # Data preprocessing + + def split_tokens(self) -> list[pd.Series]: + address = self.input.replace(";", ",") + + parts = address.split(",") + parts = map(str.strip, parts) + parts = filter(is_nonempty_str, parts) + + tokens = map(lambda part: create_token(part, ""), parts) + + return list(tokens) + + def split(self): + self.tokens = self.split_tokens() + + result = [] + + self.accumulator = pd.Series({"address": "", "class": ""}) + + prev_token = create_token() + + for cursor in self.tokens: + self.token = address_classification(cursor, prev_token) + prev_token = self.token.copy() + + if self.accumulator["class"] == "": + self.accumulator = self.token.rename({"obj": "address"}) continue - if CLASSES.index(accumulator["class"][-1]) < CLASSES.index(cur_tk["class"][0]) and accumulator["class"]!="w": - accumulator["class"] += cur_tk['class'] - accumulator["address"] += " " + cur_tk["obj"] + if self.correct_order(): + self.accumulator["address"] += " " + self.accumulator += self.token.rename({"obj": "address"}) else: - ad_no_ranges = unfold_house_ranges(accumulator["address"]) - accumulator["address"] = ad_no_ranges[-1] + unfolded_address = unfold_house_ranges(self.accumulator["address"]) + self.accumulator["address"] = unfolded_address[-1] - res.extend(ad_no_ranges) + result.extend(unfolded_address) - accumulator = cut_address(accumulator, cur_tk["class"]) + self.accumulator = cut_address(self.accumulator, self.token["class"]) - if not accumulator["class"] or CLASSES.index(cur_tk["class"][0]) <= CLASSES.index("s") or accumulator["class"]=="w": - accumulator["class"] = cur_tk["class"] - accumulator["address"] = cur_tk["obj"] + if self.next_is_street_or_upper(): + self.accumulator = self.token.rename({"obj": "address"}) - if cur_tk["class"][0] == "h": - num = re.findall("\d{1,4} ?\/?\d* ?", cur_tk['obj'])[0] - accumulator["address"] = re.sub(r"\d{1,4} ?\/*\d* ?", num, accumulator["address"]) - cur_tk["class"] =cur_tk["class"][1:] + if self.next_class_is("house"): + self.accumulator["address"] = self.substitue_house() + self.pop_token_class() - if cur_tk["class"] and cur_tk["class"][0] == "b": - num = re.findall("\d", cur_tk["obj"])[-1] - if num and not "b" in accumulator["class"]: - accumulator["class"] += "b" - accumulator["address"] += "к." 
+ num - else: - accumulator["address"] = re.sub(r"\d$", num, accumulator["address"]) - cur_tk["class"] = cur_tk["class"][1:] + if self.next_class_is("building"): + self.insert_building() + self.pop_token_class() - if cur_tk["class"] and cur_tk["class"][0] == "l": - num = re.findall("[А-Яа-я]", cur_tk["obj"].strip())[-1] - accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) - accumulator["address"] += num - if num and not "l" in accumulator["class"]: - accumulator["class"] += "l" - else: - if re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", accumulator["address"]): - accumulator["address"] = re.sub(r"[А-Яа-я]$", "", accumulator["address"].strip()) - res.extend(unfold_house_ranges(accumulator["address"])) - print(res) - return res + if self.next_class_is("letter"): + self.insert_letter() + elif self.has_letter_in(): + self.accumulator["address"] = self.without_letter() + + result.extend(unfold_house_ranges(self.accumulator["address"])) + + return result - return [address] def split_pesoch_res(address: str) -> List[str]: t = re.sub(r",", " ", address) @@ -242,7 +338,7 @@ def process_row(row: pd.Series[str]) -> pd.Series[str]: if row["РЭС"] == "Песочинский РЭС": addresses = split_pesoch_res(row["Улица"]) else: - addresses = split_address(row["Улица"]) + addresses = AddressSplitter(row["Улица"]) row["Улица"] = addresses return row diff --git a/parser/util.py b/parser/util.py index 983c9b8..9fad768 100644 --- a/parser/util.py +++ b/parser/util.py @@ -10,7 +10,7 @@ from . import ( def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser: if parser is None: - parser = LenenergoParser(file_path = r"C:\Users\Юля\PycharmProjects\machine_learning\lenengro_parser\data_Rosseti.csv") + parser = LenenergoParser(parser) print(parser) From e6af86703e8a84953d75b02530dee5f735e783c1 Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 12:24:49 +0300 Subject: [PATCH 08/12] Applied formatter --- parser/__main__.py | 2 +- parser/address.py | 220 +++++++++++++++++++++++++++++++++------------ 2 files changed, 162 insertions(+), 60 deletions(-) diff --git a/parser/__main__.py b/parser/__main__.py index 5616d83..b9de621 100644 --- a/parser/__main__.py +++ b/parser/__main__.py @@ -3,7 +3,7 @@ import time import schedule -from . import pipeline, LenenergoParser +from . 
import LenenergoParser, pipeline def job(): diff --git a/parser/address.py b/parser/address.py index 28769f7..d9111dd 100644 --- a/parser/address.py +++ b/parser/address.py @@ -1,28 +1,90 @@ from __future__ import annotations import re -from typing import Iterable, List, TypeVar from collections.abc import Sequence +from typing import Iterable, List, TypeVar import pandas as pd T = TypeVar("T") CLASSES = ("w", "d", "c", "t", "s", "h", "b", "e", "l", "r") -DISTRICTS_PREFIXES = ("мо ", "р-н","городское","лесхоз") +DISTRICTS_PREFIXES = ("мо ", "р-н", "городское", "лесхоз") COUNTRYSIDE_PREFIXES = ( - "г", "п", "д", "гп", "рп", "кп", "пгт", "c", "хутор", " урочище") + "г", + "п", + "д", + "гп", + "рп", + "кп", + "пгт", + "c", + "хутор", + " урочище", +) TERRITORY_PREFIXES = ( -"тер.", " тер", "снт ", "ст ", "дск ", "днп ", "дпк ", "нп ", "пдк ", "т/б ", "садоводство", "массив", "хозя", "сад-во") + "тер.", + " тер", + "снт ", + "ст ", + "дск ", + "днп ", + "дпк ", + "нп ", + "пдк ", + "т/б ", + "садоводство", + "массив", + "хозя", + "сад-во", +) STREET_PREFIXES = ( - " ул", " бул", " пр", " ш", " пер", " дор", " маг", " наб", " пл", " просп", " туп", "шоссе", "лини", "аллея", - "мост", " парк", "кольцо", "проезд", "съезд","переулок", - "ул.", "бул.", "пр.", "ш.", "пер.", "дор.", "маг.", "наб.", "пл.", "просп.", "туп.") -HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп","дом","дома") -BUILDING_PREFIXES = ("к.", "к ","корп", "корпус") -EDIFICE_PREFIXES=("стр.", "строение") + " ул", + " бул", + " пр", + " ш", + " пер", + " дор", + " маг", + " наб", + " пл", + " просп", + " туп", + "шоссе", + "лини", + "аллея", + "мост", + " парк", + "кольцо", + "проезд", + "съезд", + "переулок", + "ул.", + "бул.", + "пр.", + "ш.", + "пер.", + "дор.", + "маг.", + "наб.", + "пл.", + "просп.", + "туп.", +) +HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом", "дома") +BUILDING_PREFIXES = ("к.", "к ", "корп", "корпус") +EDIFICE_PREFIXES = ("стр.", "строение") LETTER = ("лит.", "литера", " л.") -PREFIXES = (DISTRICTS_PREFIXES, COUNTRYSIDE_PREFIXES, TERRITORY_PREFIXES, STREET_PREFIXES, HOUSES_PREFIXES, BUILDING_PREFIXES, EDIFICE_PREFIXES,LETTER) +PREFIXES = ( + DISTRICTS_PREFIXES, + COUNTRYSIDE_PREFIXES, + TERRITORY_PREFIXES, + STREET_PREFIXES, + HOUSES_PREFIXES, + BUILDING_PREFIXES, + EDIFICE_PREFIXES, + LETTER, +) def unfold_house_ranges(token: str) -> List[str]: @@ -33,13 +95,17 @@ def unfold_house_ranges(token: str) -> List[str]: a, b = int(a), int(b) if b > a: - addresses += [re.sub(r"([\d]+-[\d]+)", number, token) for number in map(str, range(a, b + 1))] + addresses += [ + re.sub(r"([\d]+-[\d]+)", number, token) + for number in map(str, range(a, b + 1)) + ] else: token = token.replace("-", "/") if not addresses: addresses.append(token) return addresses + def any_of_in(substrings: Iterable[str], string: str) -> bool: return any(map(lambda substring: substring in string, substrings)) @@ -49,7 +115,7 @@ def flatten(arr: Iterable[List[T]]) -> List[T]: def find_room(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\bпом\.?", token['obj']): + if re.search(r"\bпом\.?", token["obj"]): return "r" return "" @@ -57,80 +123,109 @@ def find_room(token: pd.Series, pre_token: pd.Series) -> str: def find_litera(token: pd.Series, pre_token: pd.Series) -> str: if find_room(token, pre_token): return "" - if any_of_in(LETTER, token['obj'].lower()) \ - or re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token['obj']): + if any_of_in(LETTER, token["obj"].lower()) or re.search( + r"\d{1,3}([А-Я]|[а-я])( |$)", 
token["obj"] + ): return "l" - if (re.search(r"\b([А-Я]|[а-я]){1}$", token['obj']) \ - and ("l" in pre_token['class'] or "h" in pre_token['class'])) \ - and (" ш" not in token["obj"]) \ - and not find_countryside(token, pre_token): + if ( + ( + re.search(r"\b([А-Я]|[а-я]){1}$", token["obj"]) + and ("l" in pre_token["class"] or "h" in pre_token["class"]) + ) + and (" ш" not in token["obj"]) + and not find_countryside(token, pre_token) + ): return "l" return "" + + def find_edifice(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(EDIFICE_PREFIXES, token['obj'].lower()): + if any_of_in(EDIFICE_PREFIXES, token["obj"].lower()): return "e" return "" + def find_building(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d", token['obj']) and not find_room(token,pre_token): - if any_of_in(BUILDING_PREFIXES, token['obj'].lower()) \ - or "b" in pre_token['class'] and ("h" not in token['class']) and not find_edifice(token,pre_token)\ - or re.search(r"к\.* ?\d", token['obj']): + if re.search(r"\d", token["obj"]) and not find_room(token, pre_token): + if ( + any_of_in(BUILDING_PREFIXES, token["obj"].lower()) + or "b" in pre_token["class"] + and ("h" not in token["class"]) + and not find_edifice(token, pre_token) + or re.search(r"к\.* ?\d", token["obj"]) + ): return "b" return "" def find_house(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d{1,4}", token['obj']) and not find_room(token,pre_token): - if any_of_in(HOUSES_PREFIXES, token['obj'].lower()): + if re.search(r"\d{1,4}", token["obj"]) and not find_room(token, pre_token): + if any_of_in(HOUSES_PREFIXES, token["obj"].lower()): return "h" - if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token['obj']): + if re.search(r"(д|д\.) ?\d{1,4} ?\/*\d* ?", token["obj"]): return "h" - if ("s" in pre_token['class'] or "h" in pre_token['class'] or "s" in token['class']) \ - and not any_of_in(("-я", "-й", "-Я"), token['obj']) \ - and not find_building(token, pre_token)\ - and not find_edifice(token,pre_token): + if ( + ( + "s" in pre_token["class"] + or "h" in pre_token["class"] + or "s" in token["class"] + ) + and not any_of_in(("-я", "-й", "-Я"), token["obj"]) + and not find_building(token, pre_token) + and not find_edifice(token, pre_token) + ): return "h" - if find_building(token, pre_token) \ - and not any_of_in(("-я", "-й", "-Я"), token['obj']) \ - and True: - if len(re.findall(r"\d{1,4}", token['obj'])) > 1: + if ( + find_building(token, pre_token) + and not any_of_in(("-я", "-й", "-Я"), token["obj"]) + and True + ): + if len(re.findall(r"\d{1,4}", token["obj"])) > 1: return "h" - if int(re.search(r"\d{1,4}", token['obj']).group()) // 10 >0: + if int(re.search(r"\d{1,4}", token["obj"]).group()) // 10 > 0: return "h" return "" def find_street(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(STREET_PREFIXES, token['obj'].lower()): + if any_of_in(STREET_PREFIXES, token["obj"].lower()): return "s" - if re.search(r"\b[А-Яа-я]{4,}\b", token['obj']) \ - and not any([el in token["obj"].lower() for pr in PREFIXES for el in pr if len(el)>2]) \ - and not ("d" in token["class"] or "t" in token["class"] or "c" in token["class"]): + if ( + re.search(r"\b[А-Яа-я]{4,}\b", token["obj"]) + and not any( + [el in token["obj"].lower() for pr in PREFIXES for el in pr if len(el) > 2] + ) + and not ( + "d" in token["class"] or "t" in token["class"] or "c" in token["class"] + ) + ): return "s" return "" def find_territory(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(TERRITORY_PREFIXES, 
token['obj'].lower()): + if any_of_in(TERRITORY_PREFIXES, token["obj"].lower()): return "t" return "" def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(COUNTRYSIDE_PREFIXES, token['obj'].lower()) \ - and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token['obj']) \ - and not find_house(token, pre_token) \ - and not any_of_in(STREET_PREFIXES, token['obj'].lower()): + if ( + any_of_in(COUNTRYSIDE_PREFIXES, token["obj"].lower()) + and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token["obj"]) + and not find_house(token, pre_token) + and not any_of_in(STREET_PREFIXES, token["obj"].lower()) + ): return "c" return "" def find_district(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(DISTRICTS_PREFIXES, token['obj'].lower()): + if any_of_in(DISTRICTS_PREFIXES, token["obj"].lower()): return "d" return "" + def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: brackets = re.search(r"\(.+\)", token["obj"]) if brackets: @@ -144,17 +239,19 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: token["class"] += find_edifice(token, pre_token) token["class"] += find_litera(token, pre_token) token["class"] += find_room(token, pre_token) - if token['class'] == "": - token['class'] = "w" + if token["class"] == "": + token["class"] = "w" if brackets: token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) return token + def cut_address(ad: pd.Series, cl: str) -> pd.Series: while ad["class"] and CLASSES.index(ad["class"][-1]) > CLASSES.index(cl[0]): if ad["class"][-1] == "h": - ad["address"] = re.sub(r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", - ad["address"].lower()) + ad["address"] = re.sub( + r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", ad["address"].lower() + ) elif ad["class"][-1] == "b": num = re.findall(r"к{0,1}\.? ?\d", ad["address"])[-1] ad["address"] = re.sub(num, "", ad["address"]) @@ -239,7 +336,7 @@ class AddressSplitter(Sequence): def substitue_house(self) -> str: house_regex = re.compile(r"\d{1,4} ?[\/\-]?\d* ?") - number = house_regex.findall(self.token['obj'])[0] + number = house_regex.findall(self.token["obj"])[0] if self.has_numbered_street(): house_number_index = 1 @@ -249,7 +346,11 @@ class AddressSplitter(Sequence): number_in_accumulator = house_regex.findall(self.accumulator["address"]) if number_in_accumulator: - return re.sub(number_in_accumulator[house_number_index], number, self.accumulator["address"]) + return re.sub( + number_in_accumulator[house_number_index], + number, + self.accumulator["address"], + ) else: return self.accumulator["address"] @@ -303,19 +404,19 @@ class AddressSplitter(Sequence): self.accumulator["class"] += "l" def has_letter_in(self) -> bool: - return ( - re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"]) - ) + return re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"]) # Room def substitue_room(self, number: int) -> str: - return re.sub(r"пом\. ?\d\-?\d*\w?", number, self.accumulator["address"].strip()) + return re.sub( + r"пом\. ?\d\-?\d*\w?", number, self.accumulator["address"].strip() + ) def insert_room(self): number = re.findall("пом\. 
?\-?\d*\w?", self.token["obj"])[-1] self.accumulator["address"] = self.substitue_room(number) - + if number and self.has_no_class("room"): self.accumulator["class"] += "r" @@ -393,12 +494,13 @@ def split_pesoch_res(address: str) -> List[str]: t = re.sub(r",", " ", address) t = re.split(r"(Санкт-Петербург|Ленинградская обл|Л\.О)", t) t = list(map(str.strip, filter(lambda token: token != "", t))) - tokens = [t[i] + " " + t[i+1] for i in range(0, len(t)-1, 2)] + tokens = [t[i] + " " + t[i + 1] for i in range(0, len(t) - 1, 2)] if tokens: return list(set(tokens)) return [address] + def process_row(row: pd.Series[str]) -> pd.Series[str]: row = row.copy() @@ -417,4 +519,4 @@ def process_row(row: pd.Series[str]) -> pd.Series[str]: def split_addresses(df: pd.DataFrame) -> pd.DataFrame: merged_df = df.apply(process_row, axis=1).reset_index() - return merged_df.explode("Улица", ignore_index=True) \ No newline at end of file + return merged_df.explode("Улица", ignore_index=True) From 06f08d493315e1e103a6420cfb4ab1bf7bd0aedb Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 12:27:14 +0300 Subject: [PATCH 09/12] Renamed pipeline file --- parser/__init__.py | 2 +- parser/{util.py => pipeline.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename parser/{util.py => pipeline.py} (100%) diff --git a/parser/__init__.py b/parser/__init__.py index cd4b8b3..b8ce701 100644 --- a/parser/__init__.py +++ b/parser/__init__.py @@ -7,6 +7,7 @@ from .building_id import ( get_building_id, ) from .lenenergo import LenenergoParser +from .pipeline import pipeline from .preprocess import ( COL_NS, ICOL_NS, @@ -14,7 +15,6 @@ from .preprocess import ( preprocess_df, preprocess_read_df, ) -from .util import pipeline __all__ = ( "async_fetch_building_id", diff --git a/parser/util.py b/parser/pipeline.py similarity index 100% rename from parser/util.py rename to parser/pipeline.py From cc2029802b07d5689028db185a319733d1d6fdd8 Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 14:24:39 +0300 Subject: [PATCH 10/12] Split address.py to module --- parser/address.py | 522 ----------------------------------- parser/address/__init__.py | 12 + parser/address/classifier.py | 215 +++++++++++++++ parser/address/splitter.py | 292 ++++++++++++++++++++ parser/address/utils.py | 45 +++ 5 files changed, 564 insertions(+), 522 deletions(-) delete mode 100644 parser/address.py create mode 100644 parser/address/__init__.py create mode 100644 parser/address/classifier.py create mode 100644 parser/address/splitter.py create mode 100644 parser/address/utils.py diff --git a/parser/address.py b/parser/address.py deleted file mode 100644 index d9111dd..0000000 --- a/parser/address.py +++ /dev/null @@ -1,522 +0,0 @@ -from __future__ import annotations - -import re -from collections.abc import Sequence -from typing import Iterable, List, TypeVar - -import pandas as pd - -T = TypeVar("T") - -CLASSES = ("w", "d", "c", "t", "s", "h", "b", "e", "l", "r") -DISTRICTS_PREFIXES = ("мо ", "р-н", "городское", "лесхоз") -COUNTRYSIDE_PREFIXES = ( - "г", - "п", - "д", - "гп", - "рп", - "кп", - "пгт", - "c", - "хутор", - " урочище", -) -TERRITORY_PREFIXES = ( - "тер.", - " тер", - "снт ", - "ст ", - "дск ", - "днп ", - "дпк ", - "нп ", - "пдк ", - "т/б ", - "садоводство", - "массив", - "хозя", - "сад-во", -) -STREET_PREFIXES = ( - " ул", - " бул", - " пр", - " ш", - " пер", - " дор", - " маг", - " наб", - " пл", - " просп", - " туп", - "шоссе", - "лини", - "аллея", - "мост", - " парк", - "кольцо", - "проезд", - "съезд", - "переулок", - "ул.", 
- "бул.", - "пр.", - "ш.", - "пер.", - "дор.", - "маг.", - "наб.", - "пл.", - "просп.", - "туп.", -) -HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом", "дома") -BUILDING_PREFIXES = ("к.", "к ", "корп", "корпус") -EDIFICE_PREFIXES = ("стр.", "строение") -LETTER = ("лит.", "литера", " л.") -PREFIXES = ( - DISTRICTS_PREFIXES, - COUNTRYSIDE_PREFIXES, - TERRITORY_PREFIXES, - STREET_PREFIXES, - HOUSES_PREFIXES, - BUILDING_PREFIXES, - EDIFICE_PREFIXES, - LETTER, -) - - -def unfold_house_ranges(token: str) -> List[str]: - addresses = [] - pairs_strings = re.findall(r"([\d]+-[\d]+)", token) - for pair_string in pairs_strings: - a, b = pair_string.split("-") - a, b = int(a), int(b) - - if b > a: - addresses += [ - re.sub(r"([\d]+-[\d]+)", number, token) - for number in map(str, range(a, b + 1)) - ] - else: - token = token.replace("-", "/") - if not addresses: - addresses.append(token) - return addresses - - -def any_of_in(substrings: Iterable[str], string: str) -> bool: - return any(map(lambda substring: substring in string, substrings)) - - -def flatten(arr: Iterable[List[T]]) -> List[T]: - return sum(arr, []) - - -def find_room(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\bпом\.?", token["obj"]): - return "r" - return "" - - -def find_litera(token: pd.Series, pre_token: pd.Series) -> str: - if find_room(token, pre_token): - return "" - if any_of_in(LETTER, token["obj"].lower()) or re.search( - r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"] - ): - return "l" - if ( - ( - re.search(r"\b([А-Я]|[а-я]){1}$", token["obj"]) - and ("l" in pre_token["class"] or "h" in pre_token["class"]) - ) - and (" ш" not in token["obj"]) - and not find_countryside(token, pre_token) - ): - return "l" - return "" - - -def find_edifice(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(EDIFICE_PREFIXES, token["obj"].lower()): - return "e" - return "" - - -def find_building(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d", token["obj"]) and not find_room(token, pre_token): - if ( - any_of_in(BUILDING_PREFIXES, token["obj"].lower()) - or "b" in pre_token["class"] - and ("h" not in token["class"]) - and not find_edifice(token, pre_token) - or re.search(r"к\.* ?\d", token["obj"]) - ): - return "b" - return "" - - -def find_house(token: pd.Series, pre_token: pd.Series) -> str: - if re.search(r"\d{1,4}", token["obj"]) and not find_room(token, pre_token): - if any_of_in(HOUSES_PREFIXES, token["obj"].lower()): - return "h" - if re.search(r"(д|д\.) 
?\d{1,4} ?\/*\d* ?", token["obj"]): - return "h" - if ( - ( - "s" in pre_token["class"] - or "h" in pre_token["class"] - or "s" in token["class"] - ) - and not any_of_in(("-я", "-й", "-Я"), token["obj"]) - and not find_building(token, pre_token) - and not find_edifice(token, pre_token) - ): - return "h" - if ( - find_building(token, pre_token) - and not any_of_in(("-я", "-й", "-Я"), token["obj"]) - and True - ): - if len(re.findall(r"\d{1,4}", token["obj"])) > 1: - return "h" - if int(re.search(r"\d{1,4}", token["obj"]).group()) // 10 > 0: - return "h" - return "" - - -def find_street(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(STREET_PREFIXES, token["obj"].lower()): - return "s" - if ( - re.search(r"\b[А-Яа-я]{4,}\b", token["obj"]) - and not any( - [el in token["obj"].lower() for pr in PREFIXES for el in pr if len(el) > 2] - ) - and not ( - "d" in token["class"] or "t" in token["class"] or "c" in token["class"] - ) - ): - return "s" - return "" - - -def find_territory(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(TERRITORY_PREFIXES, token["obj"].lower()): - return "t" - return "" - - -def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: - if ( - any_of_in(COUNTRYSIDE_PREFIXES, token["obj"].lower()) - and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token["obj"]) - and not find_house(token, pre_token) - and not any_of_in(STREET_PREFIXES, token["obj"].lower()) - ): - return "c" - return "" - - -def find_district(token: pd.Series, pre_token: pd.Series) -> str: - if any_of_in(DISTRICTS_PREFIXES, token["obj"].lower()): - return "d" - return "" - - -def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: - brackets = re.search(r"\(.+\)", token["obj"]) - if brackets: - token["obj"] = re.sub(r"\(.+\)", "()", token["obj"]) - token["class"] += find_district(token, pre_token) - token["class"] += find_countryside(token, pre_token) - token["class"] += find_territory(token, pre_token) - token["class"] += find_street(token, pre_token) - token["class"] += find_house(token, pre_token) - token["class"] += find_building(token, pre_token) - token["class"] += find_edifice(token, pre_token) - token["class"] += find_litera(token, pre_token) - token["class"] += find_room(token, pre_token) - if token["class"] == "": - token["class"] = "w" - if brackets: - token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) - return token - - -def cut_address(ad: pd.Series, cl: str) -> pd.Series: - while ad["class"] and CLASSES.index(ad["class"][-1]) > CLASSES.index(cl[0]): - if ad["class"][-1] == "h": - ad["address"] = re.sub( - r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", "", ad["address"].lower() - ) - elif ad["class"][-1] == "b": - num = re.findall(r"к{0,1}\.? ?\d", ad["address"])[-1] - ad["address"] = re.sub(num, "", ad["address"]) - elif ad["class"][-1] == "e": - ad["address"] = re.sub(r"cтр\.? ?\d", "", ad["address"]) - elif ad["class"][-1] == "l": - ad["address"] = re.sub(r"[литера]*\.? ?[А-Яа-я]{1}$", "", ad["address"]) - elif ad["class"][-1] == "r": - ad["address"] = re.sub(r"пом\.? 
?\d+", "", ad["address"]) - ad["class"] = ad["class"][:-1] - return ad - - -def is_valid_token(string: str) -> bool: - return string not in ("", "уг.", "д.") - - -def create_token(obj: str = "", token_class: str = ""): - return pd.Series( - { - "obj": obj, - "class": token_class, - } - ) - - -class AddressSplitter(Sequence): - def __init__(self, address: str): - self.input = address - - self.addresses = self.split() - - ## Sequence abstract methods implementation - - def __getitem__(self, key: int): - if key < len(self.addresses): - return self.addresses[key] - else: - raise IndexError() - - def __len__(self): - return len(self.addresses) - - ## Address token class manipulations - - def next_class(self) -> str: - return self.token["class"][0] - - def correct_order(self) -> bool: - prev_class = self.accumulator["class"][-1] - - return ( - CLASSES.index(prev_class) < CLASSES.index(self.next_class()) - and self.accumulator["class"] != "w" - ) - - def next_class_is(self, comparing_class: str) -> bool: - return len(self.token["class"]) > 0 and self.next_class() == comparing_class[0] - - def has_no_class(self, comparing_class: str) -> bool: - return comparing_class[0] not in self.accumulator["class"] - - def pop_token_class(self): - self.token["class"] = self.token["class"][1:] - - ## Accumulator constrains - - def next_is_street_or_upper(self) -> bool: - is_unknown_class = self.accumulator["class"] in ("", "w") - - return ( - CLASSES.index(self.next_class()) <= CLASSES.index("s") or is_unknown_class - ) - - def has_numbered_street(self) -> bool: - return any_of_in(("-я", "-й", "-Я"), self.accumulator["address"]) - - ## Accumulator manipulation - - # House - - def substitue_house(self) -> str: - house_regex = re.compile(r"\d{1,4} ?[\/\-]?\d* ?") - - number = house_regex.findall(self.token["obj"])[0] - - if self.has_numbered_street(): - house_number_index = 1 - else: - house_number_index = 0 - - number_in_accumulator = house_regex.findall(self.accumulator["address"]) - - if number_in_accumulator: - return re.sub( - number_in_accumulator[house_number_index], - number, - self.accumulator["address"], - ) - else: - return self.accumulator["address"] - - # Building - - def append_building(self, number: int) -> pd.Series: - self.accumulator["class"] += "b" - self.accumulator["address"] += "к." + number - - return self.accumulator - - def substitue_building(self, number: int) -> str: - return re.sub(r"\d$", number, self.accumulator["address"]) - - def insert_building(self): - number = re.findall(r"\d", self.token["obj"])[-1] - - if number and self.has_no_class("building"): - self.accumulator = self.append_building(number) - else: - self.accumulator["address"] = self.substitue_building(number) - - # Edifice - - def substitue_edifice(self, number: int) -> str: - return re.sub(r"cтр\. ?\d", number, self.accumulator["address"].strip()) - - def insert_edifice(self): - number = re.findall("стр\.? 
?\d", self.token["obj"])[-1] - - self.accumulator["address"] = self.substitue_edifice(number) - - if number and self.has_no_class("edifice"): - self.accumulator["class"] += "e" - - # Letter - - def without_letter(self) -> str: - return re.sub(r"[А-Яа-я]$", "", self.accumulator["address"].strip()) - - def substitue_letter(self, letter: str) -> str: - address_without_letter = self.without_letter() - - return address_without_letter + letter - - def insert_letter(self): - letter = re.findall(r"[А-Яа-я]", self.token["obj"])[-1] - self.accumulator["address"] = self.substitue_letter(letter) - - if letter and self.has_no_class("litera"): - self.accumulator["class"] += "l" - - def has_letter_in(self) -> bool: - return re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"]) - - # Room - - def substitue_room(self, number: int) -> str: - return re.sub( - r"пом\. ?\d\-?\d*\w?", number, self.accumulator["address"].strip() - ) - - def insert_room(self): - number = re.findall("пом\. ?\-?\d*\w?", self.token["obj"])[-1] - self.accumulator["address"] = self.substitue_room(number) - - if number and self.has_no_class("room"): - self.accumulator["class"] += "r" - - ## Data preprocessing - - def split_tokens(self) -> list[pd.Series]: - address = self.input.replace(";", ",") - - parts = address.split(",") - parts = map(str.strip, parts) - parts = filter(is_valid_token, parts) - - tokens = map(lambda part: create_token(part, ""), parts) - - return list(tokens) - - def split(self): - self.tokens = self.split_tokens() - - result = [] - - self.accumulator = pd.Series({"address": "", "class": ""}) - - prev_token = create_token() - - for cursor in self.tokens: - self.token = address_classification(cursor, prev_token) - prev_token = self.token.copy() - - if self.accumulator["class"] == "": - self.accumulator = self.token.rename({"obj": "address"}) - continue - - if self.correct_order(): - self.accumulator["address"] += " " - self.accumulator += self.token.rename({"obj": "address"}) - else: - unfolded_address = unfold_house_ranges(self.accumulator["address"]) - self.accumulator["address"] = unfolded_address[-1] - - result.extend(unfolded_address) - - self.accumulator = cut_address(self.accumulator, self.token["class"]) - - if self.next_is_street_or_upper(): - self.accumulator = self.token.rename({"obj": "address"}) - - if self.next_class_is("house"): - self.accumulator["address"] = self.substitue_house() - self.pop_token_class() - - if self.next_class_is("building"): - self.insert_building() - self.pop_token_class() - - if self.next_class_is("edifice"): - self.insert_edifice() - self.pop_token_class() - - if self.next_class_is("letter"): - self.insert_letter() - elif self.has_letter_in(): - self.accumulator["address"] = self.without_letter() - - if self.next_class_is("room"): - self.insert_room() - self.pop_token_class() - - result.extend(unfold_house_ranges(self.accumulator["address"])) - - return result - - -def split_pesoch_res(address: str) -> List[str]: - t = re.sub(r",", " ", address) - t = re.split(r"(Санкт-Петербург|Ленинградская обл|Л\.О)", t) - t = list(map(str.strip, filter(lambda token: token != "", t))) - tokens = [t[i] + " " + t[i + 1] for i in range(0, len(t) - 1, 2)] - - if tokens: - return list(set(tokens)) - return [address] - - -def process_row(row: pd.Series[str]) -> pd.Series[str]: - row = row.copy() - - if pd.isnull(row["Улица"]): - row["Улица"] = [None] - else: - if row["РЭС"] == "Песочинский РЭС": - addresses = split_pesoch_res(row["Улица"]) - else: - addresses = 
AddressSplitter(row["Улица"]) - row["Улица"] = addresses - - return row - - -def split_addresses(df: pd.DataFrame) -> pd.DataFrame: - merged_df = df.apply(process_row, axis=1).reset_index() - - return merged_df.explode("Улица", ignore_index=True) diff --git a/parser/address/__init__.py b/parser/address/__init__.py new file mode 100644 index 0000000..370717b --- /dev/null +++ b/parser/address/__init__.py @@ -0,0 +1,12 @@ +from .classifier import CLASSES, address_classification +from .splitter import AddressSplitter, split_addresses, split_pesoch_res +from .utils import create_token + +__all__ = ( + "address_classification", + "AddressSplitter", + "CLASSES", + "create_token", + "split_addresses", + "split_pesoch_res", +) diff --git a/parser/address/classifier.py b/parser/address/classifier.py new file mode 100644 index 0000000..a333ace --- /dev/null +++ b/parser/address/classifier.py @@ -0,0 +1,215 @@ +import re + +import pandas as pd + +from .utils import any_of_in + +CLASSES = ("w", "d", "c", "t", "s", "h", "b", "e", "l", "r") +DISTRICTS_PREFIXES = ("мо ", "р-н", "городское", "лесхоз") +COUNTRYSIDE_PREFIXES = ( + "г", + "п", + "д", + "гп", + "рп", + "кп", + "пгт", + "c", + "хутор", + " урочище", +) +TERRITORY_PREFIXES = ( + "тер.", + " тер", + "снт ", + "ст ", + "дск ", + "днп ", + "дпк ", + "нп ", + "пдк ", + "т/б ", + "садоводство", + "массив", + "хозя", + "сад-во", +) +STREET_PREFIXES = ( + " ул", + " бул", + " пр", + " ш", + " пер", + " дор", + " маг", + " наб", + " пл", + " просп", + " туп", + "шоссе", + "лини", + "аллея", + "мост", + " парк", + "кольцо", + "проезд", + "съезд", + "переулок", + "ул.", + "бул.", + "пр.", + "ш.", + "пер.", + "дор.", + "маг.", + "наб.", + "пл.", + "просп.", + "туп.", +) +HOUSES_PREFIXES = ("д.", "уч.", "участок", "мкд", "тп", "дом", "дома") +BUILDING_PREFIXES = ("к.", "к ", "корп", "корпус") +EDIFICE_PREFIXES = ("стр.", "строение") +LETTER = ("лит.", "литера", " л.") +PREFIXES = ( + DISTRICTS_PREFIXES, + COUNTRYSIDE_PREFIXES, + TERRITORY_PREFIXES, + STREET_PREFIXES, + HOUSES_PREFIXES, + BUILDING_PREFIXES, + EDIFICE_PREFIXES, + LETTER, +) + + +def find_room(token: pd.Series, pre_token: pd.Series) -> str: + if re.search(r"\bпом\.?", token["obj"]): + return "r" + return "" + + +def find_litera(token: pd.Series, pre_token: pd.Series) -> str: + if find_room(token, pre_token): + return "" + if any_of_in(LETTER, token["obj"].lower()) or re.search( + r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"] + ): + return "l" + if ( + ( + re.search(r"\b([А-Я]|[а-я]){1}$", token["obj"]) + and ("l" in pre_token["class"] or "h" in pre_token["class"]) + ) + and (" ш" not in token["obj"]) + and not find_countryside(token, pre_token) + ): + return "l" + return "" + + +def find_edifice(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(EDIFICE_PREFIXES, token["obj"].lower()): + return "e" + return "" + + +def find_building(token: pd.Series, pre_token: pd.Series) -> str: + if re.search(r"\d", token["obj"]) and not find_room(token, pre_token): + if ( + any_of_in(BUILDING_PREFIXES, token["obj"].lower()) + or "b" in pre_token["class"] + and ("h" not in token["class"]) + and not find_edifice(token, pre_token) + or re.search(r"к\.* ?\d", token["obj"]) + ): + return "b" + return "" + + +def find_house(token: pd.Series, pre_token: pd.Series) -> str: + if re.search(r"\d{1,4}", token["obj"]) and not find_room(token, pre_token): + if any_of_in(HOUSES_PREFIXES, token["obj"].lower()): + return "h" + if re.search(r"(д|д\.) 
?\d{1,4} ?\/*\d* ?", token["obj"]): + return "h" + if ( + ( + "s" in pre_token["class"] + or "h" in pre_token["class"] + or "s" in token["class"] + ) + and not any_of_in(("-я", "-й", "-Я"), token["obj"]) + and not find_building(token, pre_token) + and not find_edifice(token, pre_token) + ): + return "h" + if ( + find_building(token, pre_token) + and not any_of_in(("-я", "-й", "-Я"), token["obj"]) + and True + ): + if len(re.findall(r"\d{1,4}", token["obj"])) > 1: + return "h" + if int(re.search(r"\d{1,4}", token["obj"]).group()) // 10 > 0: + return "h" + return "" + + +def find_street(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(STREET_PREFIXES, token["obj"].lower()): + return "s" + if ( + re.search(r"\b[А-Яа-я]{4,}\b", token["obj"]) + and not any( + [el in token["obj"].lower() for pr in PREFIXES for el in pr if len(el) > 2] + ) + and not ( + "d" in token["class"] or "t" in token["class"] or "c" in token["class"] + ) + ): + return "s" + return "" + + +def find_territory(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(TERRITORY_PREFIXES, token["obj"].lower()): + return "t" + return "" + + +def find_countryside(token: pd.Series, pre_token: pd.Series) -> str: + if ( + any_of_in(COUNTRYSIDE_PREFIXES, token["obj"].lower()) + and re.search(r"\b[гпдрпктc]{1,3}(\b|\. )", token["obj"]) + and not find_house(token, pre_token) + and not any_of_in(STREET_PREFIXES, token["obj"].lower()) + ): + return "c" + return "" + + +def find_district(token: pd.Series, pre_token: pd.Series) -> str: + if any_of_in(DISTRICTS_PREFIXES, token["obj"].lower()): + return "d" + return "" + + +def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: + brackets = re.search(r"\(.+\)", token["obj"]) + if brackets: + token["obj"] = re.sub(r"\(.+\)", "()", token["obj"]) + token["class"] += find_district(token, pre_token) + token["class"] += find_countryside(token, pre_token) + token["class"] += find_territory(token, pre_token) + token["class"] += find_street(token, pre_token) + token["class"] += find_house(token, pre_token) + token["class"] += find_building(token, pre_token) + token["class"] += find_edifice(token, pre_token) + token["class"] += find_litera(token, pre_token) + token["class"] += find_room(token, pre_token) + if token["class"] == "": + token["class"] = "w" + if brackets: + token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) + return token diff --git a/parser/address/splitter.py b/parser/address/splitter.py new file mode 100644 index 0000000..8cc4ffc --- /dev/null +++ b/parser/address/splitter.py @@ -0,0 +1,292 @@ +from __future__ import annotations + +import re +from collections.abc import Sequence + +import pandas as pd + +from .classifier import CLASSES, address_classification +from .utils import any_of_in, create_token, is_valid_token, unfold_house_ranges + + +class AddressSplitter(Sequence): + def __init__(self, address: str): + self.input = address + + self.addresses = self.split() + + # Sequence abstract methods implementation + + def __getitem__(self, key: int): + if key < len(self.addresses): + return self.addresses[key] + else: + raise IndexError() + + def __len__(self): + return len(self.addresses) + + # Address token class manipulations + + def next_class(self) -> str: + return self.token["class"][0] + + def prev_class(self) -> str: + return self.accumulator["class"][-1] + + def correct_order(self) -> bool: + return ( + len(self.accumulator["class"]) > 0 + and CLASSES.index(self.prev_class()) < CLASSES.index(self.next_class()) + and 
self.accumulator["class"] != "w" + ) + + def next_class_is(self, comparing_class: str) -> bool: + return len(self.token["class"]) > 0 and self.next_class() == comparing_class[0] + + def has_no_class(self, comparing_class: str) -> bool: + return comparing_class[0] not in self.accumulator["class"] + + def pop_token_class(self): + self.token["class"] = self.token["class"][1:] + + # Accumulator constrains + + def next_is_street_or_upper(self) -> bool: + is_unknown_class = self.accumulator["class"] in ("", "w") + + return ( + CLASSES.index(self.next_class()) <= CLASSES.index("s") or is_unknown_class + ) + + def has_numbered_street(self) -> bool: + return any_of_in(("-я", "-й", "-Я"), self.accumulator["address"]) + + # Accumulator manipulation + + ## House + + def substitue_house(self) -> str: + house_regex = re.compile(r"\d{1,4} ?[\/\-]?\d* ?") + + number = house_regex.findall(self.token["obj"])[0] + + if self.has_numbered_street(): + house_number_index = 1 + else: + house_number_index = 0 + + number_in_accumulator = house_regex.findall(self.accumulator["address"]) + + if number_in_accumulator: + return re.sub( + number_in_accumulator[house_number_index], + number, + self.accumulator["address"], + ) + else: + return self.accumulator["address"] + + ## Building + + def append_building(self, number: int) -> pd.Series: + self.accumulator["class"] += "b" + self.accumulator["address"] += "к." + number + + return self.accumulator + + def substitue_building(self, number: int) -> str: + return re.sub(r"\d$", number, self.accumulator["address"]) + + def insert_building(self): + number = re.findall(r"\d", self.token["obj"])[-1] + + if number and self.has_no_class("building"): + self.accumulator = self.append_building(number) + else: + self.accumulator["address"] = self.substitue_building(number) + + ## Edifice + + def substitue_edifice(self, number: int) -> str: + return re.sub(r"cтр\. ?\d", number, self.accumulator["address"].strip()) + + def insert_edifice(self): + number = re.findall("стр\.? ?\d", self.token["obj"])[-1] + + self.accumulator["address"] = self.substitue_edifice(number) + + if number and self.has_no_class("edifice"): + self.accumulator["class"] += "e" + + ## Letter + + def without_letter(self) -> str: + return re.sub(r"[А-Яа-я]$", "", self.accumulator["address"].strip()) + + def substitue_letter(self, letter: str) -> str: + address_without_letter = self.without_letter() + + return address_without_letter + letter + + def insert_letter(self): + letter = re.findall(r"[А-Яа-я]", self.token["obj"])[-1] + self.accumulator["address"] = self.substitue_letter(letter) + + if letter and self.has_no_class("litera"): + self.accumulator["class"] += "l" + + def has_letter_in(self) -> bool: + return re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", self.accumulator["address"]) + + ## Room + + def substitue_room(self, number: int) -> str: + return re.sub( + r"пом\. ?\d\-?\d*\w?", number, self.accumulator["address"].strip() + ) + + def insert_room(self): + number = re.findall("пом\. 
?\-?\d*\w?", self.token["obj"])[-1] + self.accumulator["address"] = self.substitue_room(number) + + if number and self.has_no_class("room"): + self.accumulator["class"] += "r" + + # Data preprocessing + + def split_tokens(self) -> list[pd.Series]: + address = self.input.replace(";", ",") + + parts = address.split(",") + parts = map(str.strip, parts) + parts = filter(is_valid_token, parts) + + tokens = map(lambda part: create_token(part, ""), parts) + + return list(tokens) + + def cut_address(self) -> pd.Series: + while len(self.accumulator["class"]) > 0 and CLASSES.index( + self.prev_class() + ) > CLASSES.index(self.next_class()): + match self.accumulator["class"][-1]: + case "h": + self.accumulator["addresses"] = re.sub( + r"[мкдтпучасток]*\.? ?\d{1,4} ?\/*\d* ?", + "", + self.accumulator["address"].lower(), + ) + case "b": + number = re.findall(r"к{0,1}\.? ?\d", self.accumulator["address"])[ + -1 + ] + self.accumulator["address"] = re.sub( + number, "", self.accumulator["address"] + ) + case "e": + self.accumulator["address"] = re.sub( + r"cтр\.? ?\d", "", self.accumulator["address"] + ) + case "l": + self.accumulator["address"] = re.sub( + r"[литера]*\.? ?[А-Яа-я]{1}$", "", self.accumulator["address"] + ) + case "r": + self.accumulator["address"] = re.sub( + r"пом\.? ?\d+", "", self.accumulator["address"] + ) + + self.accumulator["class"] = self.accumulator["class"][:-1] + + return self.accumulator + + # Splitting + + def split(self): + self.tokens = self.split_tokens() + + result = [] + + self.accumulator = pd.Series({"address": "", "class": ""}) + + prev_token = create_token() + + for cursor in self.tokens: + self.token = address_classification(cursor, prev_token) + prev_token = self.token.copy() + + if self.accumulator["class"] == "": + self.accumulator = self.token.rename({"obj": "address"}) + continue + + if self.correct_order(): + self.accumulator["address"] += " " + self.accumulator += self.token.rename({"obj": "address"}) + else: + unfolded_address = unfold_house_ranges(self.accumulator["address"]) + self.accumulator["address"] = unfolded_address[-1] + + result.extend(unfolded_address) + + self.accumulator = self.cut_address() + + if self.next_is_street_or_upper(): + self.accumulator = self.token.rename({"obj": "address"}) + + if self.next_class_is("house"): + self.accumulator["address"] = self.substitue_house() + self.pop_token_class() + + if self.next_class_is("building"): + self.insert_building() + self.pop_token_class() + + if self.next_class_is("edifice"): + self.insert_edifice() + self.pop_token_class() + + if self.next_class_is("letter"): + self.insert_letter() + elif self.has_letter_in(): + self.accumulator["address"] = self.without_letter() + + if self.next_class_is("room"): + self.insert_room() + self.pop_token_class() + + result.extend(unfold_house_ranges(self.accumulator["address"])) + + return result + + +def split_pesoch_res(address: str) -> list[str]: + t = re.sub(r",", " ", address) + t = re.split(r"(Санкт-Петербург|Ленинградская обл|Л\.О)", t) + t = list(map(str.strip, filter(lambda token: token != "", t))) + tokens = [t[i] + " " + t[i + 1] for i in range(0, len(t) - 1, 2)] + + if tokens: + return list(set(tokens)) + return [address] + + +def process_row(row: pd.Series[str]) -> pd.Series[str]: + row = row.copy() + + if pd.isnull(row["Улица"]): + row["Улица"] = [None] + else: + if row["РЭС"] == "Песочинский РЭС": + addresses = split_pesoch_res(row["Улица"]) + else: + addresses = AddressSplitter(row["Улица"]) + row["Улица"] = addresses + + return row + + +def 
split_addresses(df: pd.DataFrame) -> pd.DataFrame: + merged_df = df.apply(process_row, axis=1).reset_index() + + return merged_df.explode("Улица", ignore_index=True) diff --git a/parser/address/utils.py b/parser/address/utils.py new file mode 100644 index 0000000..6bfe1f9 --- /dev/null +++ b/parser/address/utils.py @@ -0,0 +1,45 @@ +import re +from collections.abc import Iterable +from typing import TypeVar + +import pandas as pd + +T = TypeVar("T") + +def any_of_in(substrings: Iterable[str], string: str) -> bool: + return any(map(lambda substring: substring in string, substrings)) + + +def flatten(arr: Iterable[list[T]]) -> list[T]: + return sum(arr, []) + +def unfold_house_ranges(token: str) -> list[str]: + addresses = [] + pairs_strings = re.findall(r"([\d]+-[\d]+)", token) + for pair_string in pairs_strings: + a, b = pair_string.split("-") + a, b = int(a), int(b) + + if b > a: + addresses += [ + re.sub(r"([\d]+-[\d]+)", number, token) + for number in map(str, range(a, b + 1)) + ] + else: + token = token.replace("-", "/") + if not addresses: + addresses.append(token) + return addresses + + +def is_valid_token(string: str) -> bool: + return string not in ("", "уг.", "д.") + + +def create_token(obj: str = "", token_class: str = ""): + return pd.Series( + { + "obj": obj, + "class": token_class, + } + ) \ No newline at end of file From 931ff1270b0e865b40ddfb6f7ce8239d247bbd71 Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 14:25:00 +0300 Subject: [PATCH 11/12] Fixed import errors in parser --- parser/pipeline.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/parser/pipeline.py b/parser/pipeline.py index 9fad768..07f25ca 100644 --- a/parser/pipeline.py +++ b/parser/pipeline.py @@ -1,16 +1,14 @@ from typing import Optional -from . 
import ( - LenenergoParser, - concurrent_fetch_builing_ids, - preprocess_df, - split_addresses, -) +from .lenenergo import LenenergoParser +from .building_id import concurrent_fetch_builing_ids +from .preprocess import preprocess_df +from .address import split_addresses def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser: if parser is None: - parser = LenenergoParser(parser) + parser = LenenergoParser() print(parser) From 3bd1deb8db40ada9f8cf789d77e1970d8fc2d3da Mon Sep 17 00:00:00 2001 From: dm1sh Date: Sun, 29 Oct 2023 15:59:55 +0300 Subject: [PATCH 12/12] Code formatting --- parser/address/classifier.py | 11 +++++++++-- parser/address/splitter.py | 9 ++++++--- parser/address/utils.py | 4 +++- parser/pipeline.py | 6 +++--- runner/database.py | 2 +- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/parser/address/classifier.py b/parser/address/classifier.py index a333ace..2ce1488 100644 --- a/parser/address/classifier.py +++ b/parser/address/classifier.py @@ -92,9 +92,12 @@ def find_room(token: pd.Series, pre_token: pd.Series) -> str: def find_litera(token: pd.Series, pre_token: pd.Series) -> str: if find_room(token, pre_token): return "" - if any_of_in(LETTER, token["obj"].lower()) or re.search( - r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"] + # fmt: off + if ( + any_of_in(LETTER, token["obj"].lower()) or + re.search(r"\d{1,3}([А-Я]|[а-я])( |$)", token["obj"]) ): + #fmt: on return "l" if ( ( @@ -199,6 +202,7 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: brackets = re.search(r"\(.+\)", token["obj"]) if brackets: token["obj"] = re.sub(r"\(.+\)", "()", token["obj"]) + token["class"] += find_district(token, pre_token) token["class"] += find_countryside(token, pre_token) token["class"] += find_territory(token, pre_token) @@ -208,8 +212,11 @@ def address_classification(token: pd.Series, pre_token: pd.Series) -> pd.Series: token["class"] += find_edifice(token, pre_token) token["class"] += find_litera(token, pre_token) token["class"] += find_room(token, pre_token) + if token["class"] == "": token["class"] = "w" + if brackets: token["obj"] = re.sub(r"\(\)", brackets.group(), token["obj"]) + return token diff --git a/parser/address/splitter.py b/parser/address/splitter.py index 8cc4ffc..e698d28 100644 --- a/parser/address/splitter.py +++ b/parser/address/splitter.py @@ -167,9 +167,12 @@ class AddressSplitter(Sequence): return list(tokens) def cut_address(self) -> pd.Series: - while len(self.accumulator["class"]) > 0 and CLASSES.index( - self.prev_class() - ) > CLASSES.index(self.next_class()): + # fmt: off + while ( + len(self.accumulator["class"]) > 0 + and CLASSES.index(self.prev_class()) > CLASSES.index(self.next_class()) + ): + # fmt: on match self.accumulator["class"][-1]: case "h": self.accumulator["addresses"] = re.sub( diff --git a/parser/address/utils.py b/parser/address/utils.py index 6bfe1f9..0935245 100644 --- a/parser/address/utils.py +++ b/parser/address/utils.py @@ -6,6 +6,7 @@ import pandas as pd T = TypeVar("T") + def any_of_in(substrings: Iterable[str], string: str) -> bool: return any(map(lambda substring: substring in string, substrings)) @@ -13,6 +14,7 @@ def any_of_in(substrings: Iterable[str], string: str) -> bool: def flatten(arr: Iterable[list[T]]) -> list[T]: return sum(arr, []) + def unfold_house_ranges(token: str) -> list[str]: addresses = [] pairs_strings = re.findall(r"([\d]+-[\d]+)", token) @@ -42,4 +44,4 @@ def create_token(obj: str = "", token_class: str = ""): "obj": obj, "class": token_class, 
} - ) \ No newline at end of file + ) diff --git a/parser/pipeline.py b/parser/pipeline.py index 07f25ca..920c23e 100644 --- a/parser/pipeline.py +++ b/parser/pipeline.py @@ -1,9 +1,9 @@ from typing import Optional -from .lenenergo import LenenergoParser -from .building_id import concurrent_fetch_builing_ids -from .preprocess import preprocess_df from .address import split_addresses +from .building_id import concurrent_fetch_builing_ids +from .lenenergo import LenenergoParser +from .preprocess import preprocess_df def pipeline(parser: Optional[LenenergoParser] = None) -> LenenergoParser: diff --git a/runner/database.py b/runner/database.py index 0da4e76..d70743b 100644 --- a/runner/database.py +++ b/runner/database.py @@ -1,10 +1,10 @@ from .config import ( + DB_URL, POSTGRES_DB, POSTGRES_HOST, POSTGRES_PASSWORD, POSTGRES_PORT, POSTGRES_USER, - DB_URL, ) db_credentials = {"conninfo": DB_URL}
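
Below is a minimal usage sketch, not part of the patch series above, illustrating how the refactored parser.address package introduced in the "Split address.py to module" and "Code formatting" patches is expected to be driven. Only the imported names (AddressSplitter, split_addresses) and the DataFrame column names ("Улица", "РЭС") come from the code in this series; the sample address strings and the sample district value are illustrative assumptions, and the snippet presumes it is run from the repository root with the project's dependencies installed (importing parser.address triggers parser/__init__.py and its submodule imports).

import pandas as pd

from parser.address import AddressSplitter, split_addresses

# Expand one combined address string into per-house addresses;
# house ranges such as "3-5" are unfolded by unfold_house_ranges().
for address in AddressSplitter("ул. Школьная, д. 3-5, лит. А"):
    print(address)

# Split the address column of a whole outages DataFrame. process_row()
# reads the raw address from "Улица" and the district name from "РЭС",
# and split_addresses() explodes the resulting per-row address lists.
df = pd.DataFrame(
    {"РЭС": ["Пригородный РЭС"], "Улица": ["пр. Ленина, д. 10, к. 2"]}
)
print(split_addresses(df))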