Format code, renamed modules
85 parser/README.md Normal file
@@ -0,0 +1,85 @@
# Rosseti parser

## Description

A library that bundles the code needed to fetch, process and store data from the [Rosseti Lenenergo](https://rosseti-lenenergo.ru/planned_work/) website.

## Interface

- `RossetiParser`:

```python
class RossetiParser:
    def __init__(self, ndays=7, today: Optional[datetime] = None, file_path: Optional[str] = None) -> None

    self.base_url: str
    self.ndays: int
    self.today: datetime
    self.df: pd.DataFrame

    def __str__(self) -> str

    def fetch(self, ndays: Optional[int] = None, today: Optional[datetime] = None) -> None

    def save_df(self, file_path: str) -> None

    def load_df(self, file_path: str) -> None
```

- `split_addresses`:

```python
def split_addresses(df: pd.DataFrame) -> pd.DataFrame
```

- `get_building_id`:

```python
def get_building_id(street: str) -> Tuple[Optional[int], Optional[float], Optional[float]]
```

- `fetch_builing_ids`:

```python
def fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame
```

- `async_fetch_building_ids`:

```python
async def async_fetch_building_ids(df: pd.DataFrame) -> pd.DataFrame
```

- `concurrent_fetch_builing_ids`:

```python
def concurrent_fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame
```

- `preprocess_df`:

```python
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame
```

- `COL_NS`:

```python
COL_NS: Dict[str, str]
```

- `ICOL_NS`:

```python
ICOL_NS: Dict[str, str]
```

- `preprocess_read_df`:

```python
def preprocess_read_df(df: pd.DataFrame) -> pd.DataFrame
```

- `group_by_index`:

```python
def group_by_index(df: pd.DataFrame) -> pd.DataFrame
```

- `pipeline`:

```python
def pipeline(parser: Optional[RossetiParser] = None) -> RossetiParser
```
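
A minimal usage sketch of the exported API (the call performs live requests to the Rosseti site; the output path is made up):

```python
from parser import pipeline

parser = pipeline()  # fetch, split addresses, geocode, preprocess
parser.save_df("./data.csv")  # hypothetical output path
```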

## How to run

In the project root folder:

```bash
python -m venv .venv
source .venv/bin/activate

pip install -r requirements.txt

python -m parser [<period in hours>]
```

Format of the saved files: `data_%d-%m-%y_%H:%M.csv`
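
A saved file can be loaded back and its datetime columns restored, e.g. (a sketch; the file name is hypothetical):

```python
from parser import RossetiParser, preprocess_read_df

parser = RossetiParser(file_path="./data_01-02-24_10:00.csv")  # hypothetical file
parser.df = preprocess_read_df(parser.df)
```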
31 parser/__init__.py Normal file
@@ -0,0 +1,31 @@
from .address import split_addresses
from .building_id import (
    async_fetch_building_ids,
    concurrent_fetch_builing_ids,
    fetch_builing_ids,
    get_building_id,
)
from .preprocess import (
    COL_NS,
    ICOL_NS,
    group_by_index,
    preprocess_df,
    preprocess_read_df,
)
from .rosseti import RossetiParser
from .util import pipeline

__all__ = (
    "RossetiParser",
    "split_addresses",
    "get_building_id",
    "fetch_builing_ids",
    "async_fetch_building_ids",
    "concurrent_fetch_builing_ids",
    "preprocess_df",
    "COL_NS",
    "ICOL_NS",
    "preprocess_read_df",
    "group_by_index",
    "pipeline",
)
29 parser/__main__.py Normal file
@@ -0,0 +1,29 @@
import sys
import time

import schedule

from . import pipeline


def job():
    # run the full pipeline and save a timestamped CSV in the cwd
    parser = pipeline()
    parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H:%M")}.csv')


if len(sys.argv) == 2:
    if sys.argv[1] == "-h" or sys.argv[1] == "--help":
        print("python -m parser [<running period in hours>]")
        sys.exit(0)

    interval = int(sys.argv[1])
    if interval > 0:
        schedule.every(interval).hours.do(job)

        # run once immediately, then keep firing on the schedule
        job()

        while True:
            schedule.run_pending()
            time.sleep(schedule.idle_seconds())
else:
    job()
87 parser/address.py Normal file
@@ -0,0 +1,87 @@
from __future__ import annotations

import re
from typing import Iterable, List, TypeVar

import pandas as pd

T = TypeVar("T")

STREET_PREFIXES = ("ул.", "бул.", "пр.", "ул", "бул", "пр", "ш.", "ш", "пер.", "пер")
HOUSES_PREFIXES = ("д.", "д")


def unfold_house_ranges(token: str) -> str:
    pairs_strings = re.findall(r"([\d]+-[\d]+)", token)
    for pair_string in pairs_strings:
        a, b = pair_string.split("-")
        a, b = int(a), int(b)

        if b > a:
            token = token.replace(pair_string, ", ".join(map(str, range(a, b + 1))))

    return token
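# e.g. unfold_house_ranges("д. 1-3") -> "д. 1, 2, 3";
# only ascending ranges are unfolded ("3-1" is left as is)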


def unfold_houses_list(token: str) -> List[str]:
    token = unfold_house_ranges(token)

    reg = re.compile(r"(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )")

    # several house numbers after one street name: prepend the street to each
    if len(re.findall(reg, token)) > 1:
        tokens = token.split(",")
        return [tokens[0] + " " + house_token for house_token in tokens[1:]]
    return [token]


def any_of_in(substrings: Iterable[str], string: str) -> bool:
    return any(map(lambda substring: substring in string, substrings))


def flatten(arr: Iterable[List[T]]) -> List[T]:
    return sum(arr, [])


def split_address(address: str) -> List[str]:
    if ";" in address:
        return flatten(map(unfold_houses_list, address.split(";")))
    elif "," in address:
        tokens = re.split(r"(,)", address)

        tokens = list(map(str.strip, filter(lambda token: token != "", tokens)))

        res = []
        accumulator = ""

        # flush the accumulator whenever a new street prefix starts
        for i in range(len(tokens)):
            if any_of_in(STREET_PREFIXES, tokens[i].lower()) and any_of_in(
                STREET_PREFIXES, accumulator.lower()
            ):
                res += unfold_houses_list(accumulator)
                accumulator = ""

            accumulator += tokens[i]

        res += unfold_houses_list(accumulator)

        return res

    return [address]


def process_row(row: pd.Series[str]) -> pd.Series[str]:
    row = row.copy()

    if pd.isnull(row["Улица"]):
        row["Улица"] = [None]
    else:
        row["Улица"] = split_address(row["Улица"])

    return row


def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    # one row per address: explode the per-row address lists
    merged_df = df.apply(process_row, axis=1).reset_index()

    return merged_df.explode("Улица", ignore_index=True)
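
For illustration, a rough sketch of how the splitting behaves on a made-up address (exact whitespace in the results depends on the tokenization above):

```python
from parser.address import split_address  # internal helper, not re-exported

print(split_address("ул. Ленина, д. 1-3"))
# the range unfolds house by house, roughly:
# ['ул. Ленина д. 1', 'ул. Ленина 2', 'ул. Ленина 3']
```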
73 parser/building_id.py Normal file
@@ -0,0 +1,73 @@
from __future__ import annotations

import asyncio
from typing import Optional, Tuple

import aiohttp
import pandas as pd
import requests

# (building id, latitude, longitude); all None when the lookup fails
GeoTupleType = Tuple[Optional[int], Optional[float], Optional[float]]


def get_building_id(street: str) -> GeoTupleType:
    if pd.isnull(street):
        return None, None, None

    r = requests.get(
        "https://geocode.gate.petersburg.ru/parse/eas",
        params={"street": street},
        timeout=10,
    )

    res = r.json()

    if "error" in res:
        return None, None, None

    return res["Building_ID"], res["Latitude"], res["Longitude"]


def fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame:
    df[["ID здания", "Широта", "Долгота"]] = df.apply(
        lambda row: get_building_id(row["Улица"]), axis=1, result_type="expand"
    )

    return df


async def async_fetch_building_id(
    session: aiohttp.ClientSession, street: str
) -> GeoTupleType:
    if pd.isnull(street):
        return None, None, None

    async with session.get(
        "https://geocode.gate.petersburg.ru/parse/eas", params={"street": street}
    ) as r:
        res = await r.json()

    if "error" in res:
        return None, None, None

    return res["Building_ID"], res["Latitude"], res["Longitude"]


async def async_fetch_building_ids(df: pd.DataFrame) -> pd.DataFrame:
    async with aiohttp.ClientSession() as session:
        tasks = []

        for _, row in df.iterrows():
            tasks.append(
                asyncio.ensure_future(async_fetch_building_id(session, row["Улица"]))
            )

        res = await asyncio.gather(*tasks)

        df[["ID здания", "Широта", "Долгота"]] = res

    return df


def concurrent_fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame:
    return asyncio.run(async_fetch_building_ids(df))
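
A sketch of a single lookup against the Petersburg geocoder (a live network call; the street string is made up):

```python
from parser import get_building_id

building_id, lat, lng = get_building_id("пр. Невский, д. 1")
# each element is None when the street is NaN or the service reports an error
```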
63 parser/preprocess.py Normal file
@@ -0,0 +1,63 @@
from __future__ import annotations

from typing import Any, List

import pandas as pd

COL_NS = {
    "region": "Регион РФ (область, край, город фед. значения, округ)",
    "area": "Административный район",
    "town": "Населённый пункт",
    "street": "Улица",
    "start_date": "Плановая дата начала отключения электроснабжения",
    "start_time": "Плановое время начала отключения электроснабжения",
    "finish_date": "Плановая дата восстановления отключения электроснабжения",
    "finish_time": "Плановое время восстановления отключения электроснабжения",
    "branch": "Филиал",
    "res": "РЭС",
    "comment": "Комментарий",
    "building_id": "ID здания",
    "lat": "Широта",
    "lng": "Долгота",
}

ICOL_NS = dict(map(reversed, COL_NS.items()))
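# e.g. COL_NS["street"] == "Улица" and ICOL_NS["Улица"] == "street":
# ICOL_NS maps the site's Russian headers back to the short ASCII names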


def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    df.rename(columns=ICOL_NS, inplace=True)

    # merge the separate date and time columns into single datetimes
    for a in ("start", "finish"):
        df[f"{a}"] = pd.to_datetime(
            df[f"{a}_date"].astype(str) + " " + df[f"{a}_time"].astype(str),
            dayfirst=True,
        )
        df.drop(columns=[f"{a}_date", f"{a}_time"], inplace=True)

    return df


def preprocess_read_df(df: pd.DataFrame) -> pd.DataFrame:
    for name in ("start", "finish"):
        df[name] = pd.to_datetime(df[name])

    return df


def join_columns(col: pd.Series[Any]) -> List[Any] | Any:
    first = col.iloc[0]

    if col.name in ("street", "building_id", "lat", "lng") and pd.notnull(first):
        return list(col)

    return first


def group_by_index(df: pd.DataFrame) -> pd.DataFrame:
    # collapse the rows produced by split_addresses back into one row per
    # original record, collecting the per-address columns into lists
    grouped = df.groupby("index")

    res_df = grouped.apply(lambda index_df: index_df.apply(join_columns)).drop(
        columns="index"
    )

    return res_df
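
A worked sketch of preprocess_df on a tiny stub frame (values are made up; only the date/time columns and one extra column are shown):

```python
import pandas as pd
from parser import preprocess_df

df = pd.DataFrame({
    "Улица": ["ул. Ленина д. 1"],
    "Плановая дата начала отключения электроснабжения": ["01.02.24"],
    "Плановое время начала отключения электроснабжения": ["10:00"],
    "Плановая дата восстановления отключения электроснабжения": ["01.02.24"],
    "Плановое время восстановления отключения электроснабжения": ["18:00"],
})

df = preprocess_df(df)
print(list(df.columns))     # ['street', 'start', 'finish']
print(df["start"].iloc[0])  # 2024-02-01 10:00:00
```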
110 parser/rosseti.py Normal file
@@ -0,0 +1,110 @@
import io
from datetime import datetime, timedelta
from typing import Mapping, Optional, Tuple

import pandas as pd
import requests
from bs4 import BeautifulSoup


class RossetiParser:
    def __init__(
        self, ndays=7, today: Optional[datetime] = None, file_path: Optional[str] = None
    ) -> None:
        self.base_url = "https://rosseti-lenenergo.ru/planned_work"

        if today is None:
            today = datetime.today()

        self.ndays = ndays
        self.today = today
        self.df = pd.DataFrame()

        if file_path is None:
            self.fetch(ndays)
        else:
            self.load_df(file_path)

    def __str__(self) -> str:
        return f"From {self.today.date()} for {self.ndays} days with {len(self.df)} records"

    @staticmethod
    def __format_date(date: datetime) -> str:
        return date.strftime("%d.%m.%y")

    def __compose_date_params(self, ndays: int, today: datetime) -> Mapping[str, str]:
        date_start = self.__format_date(today)
        date_finish = self.__format_date(today + timedelta(days=ndays))

        return {"date_start": date_start, "date_finish": date_finish}
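    # e.g. with today=01.02.24 and ndays=7 this returns
    # {"date_start": "01.02.24", "date_finish": "08.02.24"}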

    def __get_page(self, url: str, params: Mapping[str, str]) -> None:
        r = requests.get(url, params)

        self.soup = BeautifulSoup(r.text, features="html.parser")

    def __parse_nav(self) -> Tuple[str, str]:
        navigator = self.soup.find("span", attrs={"class": "page-nav-i"})

        next_uri = navigator.find("a", attrs={"class": "next"})["href"]
        last_uri = navigator.find_all("a")[-1]["href"]

        return next_uri, last_uri

    def __parse_table(self) -> pd.DataFrame:
        table = self.soup.find("table", attrs={"class": "tableous_facts funds"})

        return pd.read_html(io.StringIO(str(table)))[0]

    def __save_page(self, uri: str) -> None:
        print(f'Processing page "{uri}"')
        self.__get_page(self.base_url + uri, self.__params)
        self.df = pd.concat((self.df, self.__parse_table()), ignore_index=True)

    def __set_columns(self) -> None:
        self.df.columns = pd.Index(
            (
                "Регион РФ (область, край, город фед. значения, округ)",
                "Административный район",
                "Населённый пункт",
                "Улица",
                "Плановая дата начала отключения электроснабжения",
                "Плановое время начала отключения электроснабжения",
                "Плановая дата восстановления отключения электроснабжения",
                "Плановое время восстановления отключения электроснабжения",
                "Филиал",
                "РЭС",
                "Комментарий",
            )
        )

    def fetch(
        self, ndays: Optional[int] = None, today: Optional[datetime] = None
    ) -> None:
        if ndays is None:
            ndays = self.ndays
        if today is None:
            today = self.today

        self.__params = self.__compose_date_params(ndays, today)

        # walk the paginated listing, accumulating every table into self.df
        self.__save_page("")

        next_uri, last_uri = self.__parse_nav()

        while next_uri != last_uri:
            self.__save_page(next_uri)

            next_uri, _ = self.__parse_nav()

        self.__save_page(next_uri)

        self.__set_columns()

    def save_df(self, file_path: str) -> None:
        print(f'Saved as "{file_path}"')
        self.df.to_csv(file_path, index=False)

    def load_df(self, file_path: str) -> None:
        print(f'Read from "{file_path}"')
        self.df = pd.read_csv(file_path)
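
A minimal construction sketch (without file_path the constructor fetches from the live site immediately; paths are made up):

```python
from parser import RossetiParser

p = RossetiParser(ndays=3)  # fetches right away
print(p)                    # "From <date> for 3 days with N records"
p.save_df("./outages.csv")  # hypothetical path

p2 = RossetiParser(file_path="./outages.csv")  # rehydrate without fetching
```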
23 parser/util.py Normal file
@@ -0,0 +1,23 @@
from typing import Optional

from . import (
    RossetiParser,
    concurrent_fetch_builing_ids,
    preprocess_df,
    split_addresses,
)


def pipeline(parser: Optional[RossetiParser] = None) -> RossetiParser:
    if parser is None:
        parser = RossetiParser()

    print(parser)

    parser.df = split_addresses(parser.df)

    parser.df = concurrent_fetch_builing_ids(parser.df)

    parser.df = preprocess_df(parser.df)

    return parser
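
pipeline can also be handed a pre-built parser, e.g. one fetched over a custom window (a sketch):

```python
from parser import RossetiParser, pipeline

parser = pipeline(RossetiParser(ndays=1))  # full chain on a 1-day window
print(parser.df.head())
```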