Format code, renamed modules

2023-10-11 20:06:50 +03:00
parent 4e619ae414
commit d53ffda0f2
24 changed files with 365 additions and 339 deletions

parser/README.md Normal file

@@ -0,0 +1,85 @@
# Rosseti parser
## Description
A library that bundles the code needed to fetch, process, and store data from the [Россети Ленэнерго](https://rosseti-lenenergo.ru/planned_work/) planned-outage listings.
## Interface
The public interface is listed below; a short usage sketch follows the list.
- `RossetiParser`:
```python
class RossetiParser:
    def __init__(self, ndays: int = 7, today: Optional[datetime] = None, file_path: Optional[str] = None) -> None
self.base_url: str
self.ndays: int
self.today: datetime
self.df: pd.DataFrame
def __str__(self) -> str
def fetch(self, ndays: Optional[int] = None, today: Optional[datetime] = None) -> None
def save_df(self, file_path: str) -> None
def load_df(self, file_path: str) -> None
```
- `split_addresses`:
```python
def split_addresses(df: pd.DataFrame) -> pd.DataFrame
```
- `get_building_id`:
```python
def get_building_id(street: str) -> Tuple[Optional[int], Optional[float], Optional[float]]
```
- `fetch_builing_ids`:
```python
def fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame
```
- `async_fetch_building_ids`:
```python
async def async_fetch_building_ids(df: pd.DataFrame) -> pd.DataFrame
```
- `concurrent_fetch_builing_ids`:
```python
def concurrent_fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame
```
- `preprocess_df`:
```python
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame
```
- `COL_NS`:
```python
COL_NS: Dict[str, str]
```
- `ICOL_NS`:
```python
ICOL_NS: Dict[str, str]
```
- `preprocess_read_df`:
```python
def preprocess_read_df(df: pd.DataFrame) -> pd.DataFrame
```
- `group_by_index`:
```python
def group_by_index(df: pd.DataFrame) -> pd.DataFrame
```
- `pipeline`:
```python
def pipeline(parser: Optional[RossetiParser] = None) -> RossetiParser
```
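
A minimal usage sketch, assuming the package is importable as `parser` (the module layout in this commit):

```python
from parser import RossetiParser, pipeline

# One call: fetch the schedule, split addresses, geocode, and preprocess.
parser = pipeline()
parser.save_df("data.csv")

# Or drive the parser directly with a custom fetch window:
raw = RossetiParser(ndays=3)  # fetches on construction
print(raw)  # "From <date> for 3 days with <N> records"
```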
## How to run
From the project root:
```bash
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
python -m parser [<period in hours>]
```
Saved files are named `data_%d-%m-%y_%H:%M.csv` (a `strftime` pattern).
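For example, a fetch kicked off at 20:06 on 11 October 2023 is saved as `data_11-10-23_20:06.csv` (note that the `:` in the name is not valid on Windows filesystems).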

parser/__init__.py Normal file

@@ -0,0 +1,31 @@
from .address import split_addresses
from .building_id import (
async_fetch_building_ids,
concurrent_fetch_builing_ids,
fetch_builing_ids,
get_building_id,
)
from .preprocess import (
COL_NS,
ICOL_NS,
group_by_index,
preprocess_df,
preprocess_read_df,
)
from .rosseti import RossetiParser
from .util import pipeline
__all__ = (
"RossetiParser",
"split_addresses",
"get_building_id",
"fetch_builing_ids",
"async_fetch_building_ids",
"concurrent_fetch_builing_ids",
"preprocess_df",
"COL_NS",
"ICOL_NS",
"preprocess_read_df",
"group_by_index",
"pipeline",
)

parser/__main__.py Normal file

@@ -0,0 +1,29 @@
import sys
import time

import schedule

from . import pipeline


def job():
    parser = pipeline()
    parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H:%M")}.csv')


if len(sys.argv) == 2:
    if sys.argv[1] in ("-h", "--help"):
        print("python -m parser [<running period in hours>]")
        exit(0)

    interval = int(sys.argv[1])
    if interval > 0:
        schedule.every(interval).hours.do(job)
        job()

        while True:
            schedule.run_pending()
            # schedule.idle_seconds() may be None (no jobs scheduled) or
            # negative (a job is overdue), so guard it before sleeping.
            idle = schedule.idle_seconds()
            if idle is None:
                break
            if idle > 0:
                time.sleep(idle)
else:
    job()
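
Invocation follows the help text above: a single run by default, a repeating schedule when a positive hour count is given:

```bash
python -m parser      # fetch once and save a timestamped CSV
python -m parser 6    # re-fetch every 6 hours
```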

parser/address.py Normal file

@@ -0,0 +1,87 @@
from __future__ import annotations
import re
from itertools import chain
from typing import Iterable, List, TypeVar
import pandas as pd
T = TypeVar("T")
STREET_PREFIXES = ("ул.", "бул.", "пр.", "ул", "бул", "пр", "ш.", "ш", "пер.", "пер")
HOUSES_PREFIXES = ("д.", "д")
def unfold_house_ranges(token: str) -> str:
pairs_strings = re.findall(r"([\d]+-[\d]+)", token)
for pair_string in pairs_strings:
a, b = pair_string.split("-")
a, b = int(a), int(b)
if b > a:
token = token.replace(pair_string, ", ".join(map(str, range(a, b + 1))))
return token
def unfold_houses_list(token: str) -> List[str]:
token = unfold_house_ranges(token)
reg = re.compile(r"(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )")
if len(re.findall(reg, token)) > 1:
tokens = token.split(",")
        return [tokens[0] + " " + house_token for house_token in tokens[1:]]
return [token]
def any_of_in(substrings: Iterable[str], string: str) -> bool:
return any(map(lambda substring: substring in string, substrings))
def flatten(arr: Iterable[List[T]]) -> List[T]:
    # Flatten one level of nesting into a single list.
    return list(chain.from_iterable(arr))
def split_address(address: str) -> List[str]:
if ";" in address:
return flatten(map(unfold_houses_list, address.split(";")))
elif "," in address:
tokens = re.split(r"(,)", address)
tokens = list(map(str.strip, filter(lambda token: token != "", tokens)))
res = []
accumulator = ""
for i in range(len(tokens)):
if any_of_in(STREET_PREFIXES, tokens[i].lower()) and any_of_in(
STREET_PREFIXES, accumulator.lower()
):
res += unfold_houses_list(accumulator)
accumulator = ""
accumulator += tokens[i]
res += unfold_houses_list(accumulator)
return res
return [address]
def process_row(row: pd.Series[str]) -> pd.Series[str]:
row = row.copy()
if pd.isnull(row["Улица"]):
row["Улица"] = [None]
else:
addresses = split_address(row["Улица"])
row["Улица"] = addresses
return row
def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
merged_df = df.apply(process_row, axis=1).reset_index()
return merged_df.explode("Улица", ignore_index=True)
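
A quick illustration of the splitting helpers (hypothetical street names; the exact whitespace in the output may differ):

```python
import pandas as pd

from parser.address import split_addresses, unfold_house_ranges

# House ranges are expanded into explicit house numbers.
assert unfold_house_ranges("д. 5-7") == "д. 5, 6, 7"

# A row listing several addresses becomes one row per address.
df = pd.DataFrame({"Улица": ["ул. Садовая, д. 1-2; пр. Невский, д. 10"]})
print(split_addresses(df)["Улица"].tolist())
# ['ул. Садовая  д. 1', 'ул. Садовая  2', ' пр. Невский, д. 10'] (roughly)
```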

parser/building_id.py Normal file

@@ -0,0 +1,73 @@
from __future__ import annotations
import asyncio
from typing import Optional, Tuple
import aiohttp
import pandas as pd
import requests
GeoTupleType = Tuple[Optional[int], Optional[float], Optional[float]]
def get_building_id(street: str) -> GeoTupleType:
if pd.isnull(street):
return None, None, None
r = requests.get(
"https://geocode.gate.petersburg.ru/parse/eas",
params={"street": street},
timeout=10,
)
res = r.json()
if "error" in res:
return None, None, None
return res["Building_ID"], res["Latitude"], res["Longitude"]
def fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame:
df[["ID здания", "Широта", "Долгота"]] = df.apply(
lambda row: get_building_id(row["Улица"]), axis=1, result_type="expand"
)
return df
async def async_fetch_building_id(
session: aiohttp.ClientSession, street: str
) -> GeoTupleType:
if pd.isnull(street):
return None, None, None
async with session.get(
"https://geocode.gate.petersburg.ru/parse/eas", params={"street": street}
) as r:
res = await r.json()
if "error" in res:
return None, None, None
return res["Building_ID"], res["Latitude"], res["Longitude"]
async def async_fetch_building_ids(df: pd.DataFrame) -> pd.DataFrame:
    # Use the same 10-second timeout as the synchronous variant above.
    async with aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=10)
    ) as session:
tasks = []
for _, row in df.iterrows():
tasks.append(
asyncio.ensure_future(async_fetch_building_id(session, row["Улица"]))
)
res = await asyncio.gather(*tasks)
df[["ID здания", "Широта", "Долгота"]] = res
return df
def concurrent_fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame:
return asyncio.run(async_fetch_building_ids(df))
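
A sketch of the enrichment call (needs network access to the Petersburg geocoder used above; the address is a toy value):

```python
import pandas as pd

from parser.building_id import concurrent_fetch_builing_ids

df = pd.DataFrame({"Улица": ["пр. Невский д. 1"]})
df = concurrent_fetch_builing_ids(df)
# Adds the "ID здания", "Широта" and "Долгота" columns (None on errors).
print(df.head())
```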

parser/preprocess.py Normal file

@@ -0,0 +1,63 @@
from __future__ import annotations
from typing import Any, List
import pandas as pd
COL_NS = {
"region": "Регион РФ (область, край, город фед. значения, округ)",
"area": "Административный район",
"town": "Населённый пункт",
"street": "Улица",
"start_date": "Плановая дата начала отключения электроснабжения",
"start_time": "Плановое время начала отключения электроснабжения",
"finish_date": "Плановая дата восстановления отключения электроснабжения",
"finish_time": "Плановое время восстановления отключения электроснабжения",
"branch": "Филиал",
"res": "РЭС",
"comment": "Комментарий",
"building_id": "ID здания",
"lat": "Широта",
"lng": "Долгота",
}
ICOL_NS = {v: k for k, v in COL_NS.items()}
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
df.rename(columns=ICOL_NS, inplace=True)
    for a in ("start", "finish"):
        # Combine the separate date and time columns into one datetime column.
        df[a] = pd.to_datetime(
            df[f"{a}_date"].astype(str) + " " + df[f"{a}_time"].astype(str),
            dayfirst=True,
        )
        df.drop(columns=[f"{a}_date", f"{a}_time"], inplace=True)
return df
def preprocess_read_df(df: pd.DataFrame) -> pd.DataFrame:
for name in ("start", "finish"):
df[name] = pd.to_datetime(df[name])
return df
def join_columns(col: pd.Series[Any]) -> List[Any] | Any:
first = col.iloc[0]
if col.name in ("street", "building_id", "lat", "lng") and pd.notnull(first):
return list(col)
return first
def group_by_index(df: pd.DataFrame) -> pd.DataFrame:
    grouped = df.groupby("index")
    res_df = grouped.apply(lambda index_df: index_df.apply(join_columns)).drop(
        columns="index"
    )
    return res_df
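
A toy run of the renaming and date-combining logic (made-up values; only the four date/time columns are shown):

```python
import pandas as pd

from parser.preprocess import preprocess_df

df = pd.DataFrame({
    "Плановая дата начала отключения электроснабжения": ["11.10.2023"],
    "Плановое время начала отключения электроснабжения": ["09:00"],
    "Плановая дата восстановления отключения электроснабжения": ["11.10.2023"],
    "Плановое время восстановления отключения электроснабжения": ["17:00"],
})
out = preprocess_df(df)
print(out["start"].iloc[0])  # 2023-10-11 09:00:00
```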

parser/rosseti.py Normal file

@@ -0,0 +1,110 @@
import io
from datetime import datetime, timedelta
from typing import Mapping, Optional, Tuple
import pandas as pd
import requests
from bs4 import BeautifulSoup
class RossetiParser:
    def __init__(
        self,
        ndays: int = 7,
        today: Optional[datetime] = None,
        file_path: Optional[str] = None,
    ) -> None:
self.base_url = "https://rosseti-lenenergo.ru/planned_work"
if today is None:
today = datetime.today()
self.ndays = ndays
self.today = today
self.df = pd.DataFrame()
if file_path is None:
self.fetch(ndays)
else:
self.load_df(file_path)
def __str__(self) -> str:
return f"From {self.today.date()} for {self.ndays} days with {len(self.df)} records"
@staticmethod
def __format_date(date: datetime) -> str:
return date.strftime("%d.%m.%y")
def __compose_date_params(self, ndays: int, today: datetime) -> Mapping[str, str]:
date_start = self.__format_date(today)
date_finish = self.__format_date(today + timedelta(days=ndays))
return {"date_start": date_start, "date_finish": date_finish}
    def __get_page(self, url: str, params: Mapping[str, str]) -> None:
        # Pass params explicitly and bound the request so a stalled
        # connection cannot hang the whole fetch loop.
        r = requests.get(url, params=params, timeout=30)
        self.soup = BeautifulSoup(r.text, features="html.parser")
def __parse_nav(self) -> Tuple[str, str]:
navigator = self.soup.find("span", attrs={"class": "page-nav-i"})
next_uri = navigator.find("a", attrs={"class": "next"})["href"]
last_uri = navigator.find_all("a")[-1]["href"]
return next_uri, last_uri
def __parse_table(self) -> pd.DataFrame:
table = self.soup.find("table", attrs={"class": "tableous_facts funds"})
return pd.read_html(io.StringIO(str(table)))[0]
def __save_page(self, uri: str) -> None:
print(f'Processing page "{uri}"')
self.__get_page(self.base_url + uri, self.__params)
self.df = pd.concat((self.df, self.__parse_table()), ignore_index=True)
def __set_columns(self) -> None:
self.df.columns = pd.Index(
(
"Регион РФ (область, край, город фед. значения, округ)",
"Административный район",
"Населённый пункт",
"Улица",
"Плановая дата начала отключения электроснабжения",
"Плановое время начала отключения электроснабжения",
"Плановая дата восстановления отключения электроснабжения",
"Плановое время восстановления отключения электроснабжения",
"Филиал",
"РЭС",
"Комментарий",
)
)
def fetch(
self, ndays: Optional[int] = None, today: Optional[datetime] = None
) -> None:
if ndays is None:
ndays = self.ndays
if today is None:
today = self.today
self.__params = self.__compose_date_params(ndays, today)
self.__save_page("")
next_uri, last_uri = self.__parse_nav()
while next_uri != last_uri:
self.__save_page(next_uri)
next_uri, _ = self.__parse_nav()
self.__save_page(next_uri)
self.__set_columns()
def save_df(self, file_path: str) -> None:
print(f'Saved as "{file_path}"')
self.df.to_csv(file_path, index=False)
def load_df(self, file_path: str) -> None:
print(f'Read from "{file_path}"')
self.df = pd.read_csv(file_path)

parser/util.py Normal file

@@ -0,0 +1,23 @@
from typing import Optional

# Import from the concrete submodules rather than the package root, so this
# module does not rely on the import order inside __init__.py.
from .address import split_addresses
from .building_id import concurrent_fetch_builing_ids
from .preprocess import preprocess_df
from .rosseti import RossetiParser
def pipeline(parser: Optional[RossetiParser] = None) -> RossetiParser:
if parser is None:
parser = RossetiParser()
print(parser)
parser.df = split_addresses(parser.df)
parser.df = concurrent_fetch_builing_ids(parser.df)
parser.df = preprocess_df(parser.df)
return parser