Compare commits

6 Commits
dev...main

SHA1        Message                                                      Date
d653810dcb  Update README.md                                             2023-10-30 10:04:09 +03:00
c735ca2ee5  Added readme                                                 2023-10-29 17:09:54 +03:00
2efb4d4846  Merge branch 'dev'                                           2023-10-29 16:27:41 +03:00
430f36619a  Added saving null responses setting and column name config   2023-10-29 15:36:18 +03:00
367abfc325  Added error handling for geocoder HTTP error                 2023-10-29 15:34:42 +03:00
68b92b8bd2  Removed .vscode                                              2023-10-29 14:36:45 +03:00
12 changed files with 109 additions and 42 deletions

View File

@@ -1,5 +1,9 @@
 REFETCH_PERIOD_H=6
+STORE_NULL_BID=True
 POSTGRES_USER=lenenergo
 POSTGRES_PASSWORD=lenenergo
 POSTGRES_DB=lenenergo
 POSTGRES_HOST=db
+
+# or
+DB_URL=postgresql://lenenergo:lenenergo@localhost:5432

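Aside: `runner/database.py` is not part of this diff, so how `DB_URL` interacts with the discrete `POSTGRES_*` settings is an assumption. A minimal sketch of a `db_credentials` that prefers `DB_URL` when it is set:

```python
# Hypothetical sketch of runner/database.py (not shown in this diff):
# prefer the full DB_URL when provided, otherwise assemble a DSN from
# the discrete POSTGRES_* settings.
from .config import (
    DB_URL,
    POSTGRES_DB,
    POSTGRES_HOST,
    POSTGRES_PASSWORD,
    POSTGRES_PORT,
    POSTGRES_USER,
)

db_credentials = DB_URL or (
    f"postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}"
    f"@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
)
```
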
.gitignore vendored (2 changes)
View File

@@ -5,3 +5,5 @@ __pycache__
 data*.csv
 .idea/
 .ipynb_checkpoints
+.vscode/
+*.odb

.vscode/launch.json vendored (15 changes)
View File

@ -1,15 +0,0 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Module",
"type": "python",
"request": "launch",
"module": "parser",
"justMyCode": true,
}
]
}

README.md Normal file (37 changes)
View File

@@ -0,0 +1,37 @@
+# Lenenergo Parser
+
+## DB columns settings
+
+Append to `runner/config.py`
+
+```python
+DB_COLUMNS_MAP["<COL_NS key>"] = "<corresponding db column name>"
+```
+
+## Running instructions
+
+```bash
+docker build . -t lenenergo_parser
+docker run -d \
+    [-e REFETCH_PERIOD_H=4] \ # Refetch period
+    [-e STORE_NULL_BID=False] \ # Store rows with null building_id
+    # DB auth variants
+    [-e POSTGRES_USER=lenenergo] \
+    [-e POSTGRES_PASSWORD=lenenergo] \
+    [-e POSTGRES_DB=lenenergo] \
+    [-e POSTGRES_HOST=localhost] \
+    [-e POSTGRES_PORT=5432] \
+    # or
+    [-e DB_URL=postgresql://lenenergo:lenenergo@localhost:5432/lenenergo] \
+    lenenergo_parser
+```
+
+## Dev instructions
+
+```bash
+python -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+python -m runner
+```

View File

@@ -31,12 +31,18 @@ def split_addresses(df: pd.DataFrame) -> pd.DataFrame
 ```
 - `get_building_id`:
 ```python
-def get_building_id(street: str) -> Tuple[Optional[int], Optional[float], Optional[float]]
+def get_building_id(street: str) -> GeoTupleType
 ```
 - `fetch_builing_ids`:
 ```python
 def fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame
 ```
+- `async_fetch_building_id`:
+```python
+async def async_fetch_building_id(
+    session: aiohttp.ClientSession, street: str
+) -> GeoTupleType
+```
 - `async_fetch_building_ids`:
 ```python
 async def async_fetch_building_ids(df: pd.DataFrame) -> pd.DataFrame
@@ -51,11 +57,15 @@ def preprocess_df(df: pd.DataFrame) -> pd.DataFrame
 ```
 - `COL_NS`:
 ```python
-COL_NS: Dict[str, str]
+COL_NS: dict[str, str]
 ```
 - `ICOL_NS`:
 ```python
-ICOL_NS: Dict[str, str]
+ICOL_NS: dict[str, str]
 ```
+- `PR_COL_NS`:
+```python
+PR_COL_NS: tuple[str]
+```
 - `preprocess_read_df`:
 ```python
@@ -82,4 +92,4 @@ pip install -r requirements.txt
 python -m parser [<Period in hours>]
 ```
-Format of saved files: `data_%d-%m-%y_%H:%M.csv`
+Format of saved files: `data_%d-%m-%y_%H.%M.csv`

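The replaced signature spells out exactly what `GeoTupleType` now abbreviates, so a plausible alias definition in the parser code (the alias itself is not shown in this diff) would be:

```python
from typing import Optional, Tuple

# Assumed definition matching the old signature of get_building_id:
# (building_id, latitude, longitude), any of which may be missing.
GeoTupleType = Tuple[Optional[int], Optional[float], Optional[float]]
```
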
View File

@@ -1,5 +1,6 @@
 from .address import split_addresses
 from .building_id import (
+    GeoTupleType,
     async_fetch_building_id,
     async_fetch_building_ids,
     concurrent_fetch_builing_ids,
@@ -11,20 +12,23 @@ from .pipeline import pipeline
 from .preprocess import (
     COL_NS,
     ICOL_NS,
+    PR_COL_NS,
     group_by_index,
     preprocess_df,
     preprocess_read_df,
 )

 __all__ = (
+    "COL_NS",
+    "GeoTupleType",
+    "ICOL_NS",
+    "PR_COL_NS",
     "async_fetch_building_id",
     "async_fetch_building_ids",
-    "COL_NS",
     "concurrent_fetch_builing_ids",
     "fetch_builing_ids",
     "get_building_id",
     "group_by_index",
-    "ICOL_NS",
     "LenenergoParser",
     "pipeline",
     "preprocess_df",

View File

@@ -45,7 +45,10 @@ async def async_fetch_building_id(
     async with session.get(
         "https://geocode.gate.petersburg.ru/parse/eas", params={"street": street}
     ) as r:
-        res = await r.json()
+        try:
+            res = await r.json()
+        except aiohttp.client_exceptions.ContentTypeError:
+            res = "error"

         if "error" in res:
             return None, None, None

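Note why the `res = "error"` sentinel works: `in` is a substring test on strings and a key test on dicts, so both the sentinel and a genuine geocoder error payload take the same early-return path.

```python
# Both forms satisfy the subsequent `if "error" in res` check:
"error" in "error"             # True: substring test on str
"error" in {"error": "404"}    # True: key test on dict
```
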
View File

@@ -21,6 +21,23 @@ COL_NS = {
     "lng": "Долгота",
 }

+PR_COL_NS = (
+    "index",
+    "region",
+    "area",
+    "town",
+    "street",
+    "branch",
+    "res",
+    "comment",
+    "building_id",
+    "lat",
+    "lng",
+    "start",
+    "finish",
+)
+
 ICOL_NS = dict(map(reversed, COL_NS.items()))

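For clarity, `dict(map(reversed, COL_NS.items()))` flips each `(key, value)` pair, so `ICOL_NS` maps the Russian source-data headers back to the internal column names. A toy run using the one pair visible in this hunk:

```python
# Excerpt of COL_NS for illustration; the full dict has more entries.
COL_NS = {"lng": "Долгота"}
ICOL_NS = dict(map(reversed, COL_NS.items()))
print(ICOL_NS)  # {'Долгота': 'lng'}
```
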
View File

@@ -1,11 +1,17 @@
 import schedule

-from .config import REFETCH_PERIOD_H
+from .config import REFETCH_PERIOD_H, STORE_NULL_BID
 from .job import job
 from .scheduler import run_continuously

 schedule.every(REFETCH_PERIOD_H).hours.do(job)
 stop_run_continuously = run_continuously()

+print(
+    f"Scheduled to run every {REFETCH_PERIOD_H} hour and "
+    + ("" if STORE_NULL_BID else "not ")
+    + "to store NULL building_id"
+)
+
 # First run
 job()

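`runner/scheduler.py` is not shown in this diff; `run_continuously` is presumably the standard background-thread recipe from the `schedule` library documentation, which polls pending jobs until the returned event is set:

```python
# Sketch following the schedule docs recipe; the repository's actual
# scheduler.py may differ.
import threading
import time

import schedule


def run_continuously(interval: int = 1) -> threading.Event:
    cease_continuous_run = threading.Event()

    class ScheduleThread(threading.Thread):
        @classmethod
        def run(cls):
            while not cease_continuous_run.is_set():
                schedule.run_pending()
                time.sleep(interval)

    continuous_thread = ScheduleThread()
    continuous_thread.daemon = True
    continuous_thread.start()
    return cease_continuous_run
```
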
View File

@@ -1,6 +1,8 @@
 import os
+from parser import PR_COL_NS

 REFETCH_PERIOD_H = int(os.environ.get("REFETCH_PERIOD_H", "4"))
+STORE_NULL_BID = os.environ.get("STORE_NULL_BID", "False") == "True"

 POSTGRES_USER = os.environ.get("POSTGRES_USER", "lenenergo")
 POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "lenenergo")
@@ -9,3 +11,9 @@ POSTGRES_HOST = os.environ.get("POSTGRES_HOST", "localhost")
 POSTGRES_PORT = int(os.environ.get("POSTGRES_PORT", "5432"))
 DB_URL = os.environ.get("DB_URL", None)
+
+DB_COLUMNS_MAP = dict(zip(PR_COL_NS, PR_COL_NS))
+"""
+Feel free to rewrite mapping like
+DB_COLUMNS_MAP["<COL_NS key>"] = "<corresponding db column name>"
+"""

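The docstring invites overriding individual entries after the identity map is built, for example (the DB column name here is purely illustrative):

```python
# Map the parser's "building_id" key to a differently named DB column.
# "building_uuid" is a hypothetical column name, not from this repo.
DB_COLUMNS_MAP["building_id"] = "building_uuid"
```
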
View File

@@ -4,23 +4,12 @@ from parser import pipeline

 import pandas as pd
 import psycopg

+from .config import DB_COLUMNS_MAP, STORE_NULL_BID
 from .database import db_credentials

-sql_statement = """COPY records (
-    index,
-    region,
-    area,
-    town,
-    street,
-    branch,
-    res,
-    comment,
-    building_id,
-    lat,
-    lng,
-    start,
-    finish
-) FROM STDIN"""
+sql_statement = "".join(
+    ("COPY records (", ", ".join(DB_COLUMNS_MAP.values()), ") FROM STDIN")
+)


 def job():
@@ -36,6 +25,8 @@ def job():
         with cursor.copy(sql_statement) as copy:
             for _, row in parser.df.iterrows():
                 row = row.where((pd.notnull(row)), None)
-                copy.write_row(row.to_list())
+                if row["building_id"] is not None or STORE_NULL_BID:
+                    db_row = row.rename(DB_COLUMNS_MAP)
+                    copy.write_row(db_row.to_list())

     print(f"Fetched in {datetime.now() - fetch_start}\n{parser}")