Finished work

(Too lazy to split by commits)
This commit is contained in:
2023-09-21 20:41:56 +03:00
parent 600daa5498
commit c40a1b4f92
30 changed files with 2424 additions and 144 deletions

View File

@@ -0,0 +1,5 @@
from .rosseti import RossetiParser
from .address import split_addresses
from .building_id import fetch_builing_ids
from .preprocess import preprocess_df, COL_NS, ICOL_NS, preprocess_read_df, group_by_index
from .util import pipeline

View File

@@ -0,0 +1,28 @@
import sys
import schedule
import time
from . import pipeline
def job() -> None:
    """Run the whole parsing pipeline once and dump the result to a
    timestamped CSV in the working directory."""
    parser = pipeline()
    # NOTE(review): ':' in the filename is invalid on Windows — confirm
    # this tool only targets POSIX systems.
    parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H:%M")}.csv')


if len(sys.argv) == 2:
    if sys.argv[1] in ('-h', '--help'):
        print("python -m parser [<running period in hours>]")
        exit(0)
    try:
        interval = int(sys.argv[1])
    except ValueError:
        # A non-numeric argument used to die with a bare traceback;
        # show the usage line instead.
        print("python -m parser [<running period in hours>]")
        exit(1)
    if interval > 0:
        schedule.every(interval).hours.do(job)
        job()
        while True:
            schedule.run_pending()
            # idle_seconds() is negative when a job is overdue and None
            # when nothing is scheduled; time.sleep() rejects both.
            time.sleep(max(schedule.idle_seconds() or 0, 0))
else:
    job()

88
rosseti_parser/address.py Normal file
View File

@@ -0,0 +1,88 @@
from __future__ import annotations
from typing import List, Iterable, TypeVar, Any
import pandas as pd
import re
T = TypeVar('T')
STREET_PREFIXES = ('ул.', 'бул.', 'пр.', 'ул', 'бул',
'пр', 'ш.', 'ш', 'пер.', 'пер')
HOUSES_PREFIXES = ('д.', 'д')
def unfold_house_ranges(token: str) -> str:
    """Expand every numeric span "A-B" inside *token* into the explicit
    comma-separated list "A, A+1, ..., B" (only when B > A)."""
    for span in re.findall(r'([\d]+-[\d]+)', token):
        low, high = (int(part) for part in span.split('-'))
        if high > low:
            expanded = ', '.join(str(n) for n in range(low, high + 1))
            token = token.replace(span, expanded)
    return token
def unfold_houses_list(token: str) -> List[str]:
    """Split a street token that enumerates several houses into one
    entry per house, each prefixed with the leading (street) part."""
    token = unfold_house_ranges(token)
    house_pattern = re.compile(r'(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )')
    # Split only when more than one house number is present.
    if len(house_pattern.findall(token)) > 1:
        parts = token.split(',')
        return [parts[0] + ' ' + house for house in parts[1:]]
    return [token]
def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True when at least one of *substrings* occurs in *string*."""
    return any(needle in string for needle in substrings)
def flatten(arr: Iterable[List[T]]) -> List[T]:
    """Concatenate an iterable of lists into one flat list.

    A nested comprehension replaces ``sum(arr, [])``, which rebuilds the
    accumulator list on every step and is quadratic in the total number
    of elements.
    """
    return [item for sub in arr for item in sub]
def split_address(address: str) -> List[str]:
    """Split a raw address string into a list of per-house addresses.

    Three shapes are handled:
    * ';'-separated lists: each part is expanded independently;
    * ','-separated lists: tokens accumulate until a second street
      prefix appears, then the accumulated street+houses is flushed;
    * anything else is returned unchanged as a single-element list.
    """
    if ';' in address:
        return flatten(map(unfold_houses_list, address.split(';')))
    elif ',' in address:
        # Capturing split keeps the commas as their own tokens so the
        # accumulator can rebuild the original spelling.
        tokens = re.split(r'(,)', address)
        tokens = list(map(str.strip, filter(
            lambda token: token != '', tokens)))
        res = []
        accumulator = ''
        for i in range(len(tokens)):
            # A new street prefix while the accumulator already holds
            # one means the accumulated address is complete — flush it.
            if (any_of_in(STREET_PREFIXES, tokens[i].lower()) and
                    any_of_in(STREET_PREFIXES, accumulator.lower())):
                res += unfold_houses_list(accumulator)
                accumulator = ''
            accumulator += tokens[i]
        # Flush whatever remains after the last token.
        res += unfold_houses_list(accumulator)
        return res
    return [address]
def process_row(row: pd.Series[str]) -> pd.Series[str]:
    """Return a copy of *row* whose 'Улица' field holds a list of
    individual addresses ([None] when the street is missing)."""
    result = row.copy()
    street = result['Улица']
    if pd.isnull(street):
        result['Улица'] = [None]
    else:
        result['Улица'] = split_address(street)
    return result
def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Expand the frame so each row carries exactly one address in
    'Улица', preserving the original row index in an 'index' column."""
    with_lists = df.apply(process_row, axis=1).reset_index()
    return with_lists.explode('Улица', ignore_index=True)

View File

@@ -0,0 +1,31 @@
from __future__ import annotations
from typing import Optional, Tuple, Any, List
import requests
import pandas as pd
import numpy as np
GeoTupleType = Tuple[Optional[int], Optional[float], Optional[float]]
def get_building_id(street: str) -> GeoTupleType:
    """Resolve *street* via the Petersburg geocoder.

    Returns a ``(building_id, latitude, longitude)`` tuple, or
    ``(None, None, None)`` when the street is missing or the lookup
    fails in any way.
    """
    if pd.isnull(street):
        return None, None, None
    try:
        r = requests.get('https://geocode.gate.petersburg.ru/parse/eas', params={
            'street': street
        }, timeout=10)
        res = r.json()
    except (requests.RequestException, ValueError):
        # Network failure or a non-JSON payload previously crashed the
        # whole per-row apply; degrade gracefully instead, matching the
        # service-level 'error' branch below.
        return None, None, None
    if 'error' in res:
        return None, None, None
    return res['Building_ID'], res['Latitude'], res['Longitude']
def fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame:
    """Annotate *df* in place with geocoder columns derived from each
    row's 'Улица' value, then return it."""
    geo_columns = ['ID здания', 'Широта', 'Долгота']
    df[geo_columns] = df.apply(
        lambda row: get_building_id(row['Улица']),
        axis=1,
        result_type='expand',
    )
    return df

View File

@@ -0,0 +1,62 @@
from __future__ import annotations
from typing import Any, List
import pandas as pd
# Short machine-friendly column names -> original (Russian) headers as
# produced by RossetiParser.
COL_NS = {
    'region': 'Регион РФ (область, край, город фед. значения, округ)',
    'area': 'Административный район',
    'town': 'Населённый пункт',
    'street': 'Улица',
    'start_date': 'Плановая дата начала отключения электроснабжения',
    'start_time': 'Плановое время начала отключения электроснабжения',
    'finish_date': 'Плановая дата восстановления отключения электроснабжения',
    'finish_time': 'Плановое время восстановления отключения электроснабжения',
    'branch': 'Филиал',
    'res': 'РЭС',
    'comment': 'Комментарий',
    'building_id': 'ID здания',
    'lat': 'Широта',
    'lng': 'Долгота'
}
# Inverse mapping: original header -> short name (used for renaming).
ICOL_NS = dict(map(reversed, COL_NS.items()))
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Rename columns to their short names and merge each date+time
    column pair into a single datetime column ('start'/'finish').

    Mutates *df* in place and returns it.
    """
    df.rename(columns=ICOL_NS, inplace=True)
    for prefix in ('start', 'finish'):
        date_col, time_col = f'{prefix}_date', f'{prefix}_time'
        combined = df[date_col].astype(str) + ' ' + df[time_col].astype(str)
        # Source dates are day-first (DD.MM.YY).
        df[prefix] = pd.to_datetime(combined, dayfirst=True)
        df.drop(columns=[date_col, time_col], inplace=True)
    return df
def preprocess_read_df(df: pd.DataFrame) -> pd.DataFrame:
    """Re-parse the 'start'/'finish' columns (plain strings after a CSV
    round-trip) back into datetimes, in place."""
    for column in ('start', 'finish'):
        df[column] = pd.to_datetime(df[column])
    return df
def join_columns(col: pd.Series[Any]) -> List[Any] | Any:
    """Collapse one per-group column: address-related columns become the
    list of all their values, every other column keeps its first value."""
    first = col.iloc[0]
    is_address_col = col.name in ('street', 'building_id', 'lat', 'lng')
    if is_address_col and pd.notnull(first):
        return list(col)
    return first
def group_by_index(df: pd.DataFrame) -> pd.DataFrame:
    """Merge rows that share the same original 'index' value back into a
    single row, aggregating columns via ``join_columns``."""
    grouped = df.groupby('index')
    merged = grouped.apply(lambda sub_df: sub_df.apply(join_columns))
    return merged.drop(columns='index')

109
rosseti_parser/rosseti.py Normal file
View File

@@ -0,0 +1,109 @@
from datetime import datetime, timedelta
import io
from typing import Mapping, Optional, Tuple, no_type_check
from bs4 import BeautifulSoup
import requests
import pandas as pd
class RossetiParser:
    """Scraper for the Rosseti Lenenergo planned-outages listing.

    Walks every page of the paginated table at ``base_url`` for a date
    window and accumulates the rows into ``self.df``; can also load a
    previously saved CSV instead of fetching.
    """

    def __init__(self, ndays=7, today: Optional[datetime] = None, file_path: Optional[str] = None) -> None:
        # ndays: size of the date window to request, counted from `today`.
        # file_path: when given, read a saved CSV instead of scraping.
        self.base_url = "https://rosseti-lenenergo.ru/planned_work"
        if today is None:
            today = datetime.today()
        self.ndays = ndays
        self.today = today
        self.df = pd.DataFrame()
        if file_path is None:
            self.fetch(ndays)
        else:
            self.load_df(file_path)

    def __str__(self) -> str:
        return f"From {self.today.date()} for {self.ndays} days with {len(self.df)} records"

    @staticmethod
    def __format_date(date: datetime) -> str:
        # Site expects DD.MM.YY query parameters.
        return date.strftime("%d.%m.%y")

    def __compose_date_params(self, ndays: int, today: datetime) -> Mapping[str, str]:
        """Build the date_start/date_finish query parameters for a window
        of *ndays* days starting at *today*."""
        date_start = self.__format_date(today)
        date_finish = self.__format_date(today + timedelta(days=ndays))
        return {
            'date_start': date_start,
            'date_finish': date_finish
        }

    def __get_page(self, url: str, params: Mapping[str, str]) -> None:
        """Fetch *url* and keep the parsed HTML in ``self.soup``."""
        r = requests.get(url, params)
        self.soup = BeautifulSoup(r.text, features='html.parser')

    def __parse_nav(self) -> Tuple[str, str]:
        """Return (next-page URI, last-page URI) from the pagination bar
        of the current page."""
        navigator = self.soup.find('span', attrs={'class': 'page-nav-i'})
        next_uri = navigator.find('a', attrs={'class': 'next'})['href']
        last_uri = navigator.find_all('a')[-1]['href']
        return next_uri, last_uri

    def __parse_table(self) -> pd.DataFrame:
        """Extract the outage table of the current page as a DataFrame."""
        table = self.soup.find(
            'table', attrs={'class': 'tableous_facts funds'})
        return pd.read_html(io.StringIO(str(table)))[0]

    def __save_page(self, uri: str) -> None:
        """Fetch page *uri* and append its table to ``self.df``."""
        print(f'Processing page "{uri}"')
        self.__get_page(self.base_url + uri, self.__params)
        self.df = pd.concat(
            (self.df, self.__parse_table()), ignore_index=True)

    def __set_columns(self) -> None:
        """Replace the scraped column headers with canonical names."""
        self.df.columns = pd.Index((
            "Регион РФ (область, край, город фед. значения, округ)",
            "Административный район",
            "Населённый пункт",
            "Улица",
            "Плановая дата начала отключения электроснабжения",
            "Плановое время начала отключения электроснабжения",
            "Плановая дата восстановления отключения электроснабжения",
            "Плановое время восстановления отключения электроснабжения",
            "Филиал",
            "РЭС",
            "Комментарий",
        ))

    def fetch(self, ndays: Optional[int] = None, today: Optional[datetime] = None) -> None:
        """Scrape every page of the listing for the given window
        (defaults to the values set at construction) into ``self.df``."""
        if ndays is None:
            ndays = self.ndays
        if today is None:
            today = self.today
        self.__params = self.__compose_date_params(ndays, today)
        self.__save_page('')
        next_uri, last_uri = self.__parse_nav()
        # Follow the "next" link until it equals the last-page link,
        # then fetch that final page too.
        while next_uri != last_uri:
            self.__save_page(next_uri)
            next_uri, _ = self.__parse_nav()
        self.__save_page(next_uri)
        self.__set_columns()

    def save_df(self, file_path: str) -> None:
        """Write the accumulated table to *file_path* as CSV."""
        print(f'Saved as "{file_path}"')
        self.df.to_csv(file_path, index=False)

    def load_df(self, file_path: str) -> None:
        """Load a previously saved CSV into ``self.df``."""
        print(f'Read from "{file_path}"')
        self.df = pd.read_csv(file_path)

18
rosseti_parser/util.py Normal file
View File

@@ -0,0 +1,18 @@
from typing import Optional
from . import RossetiParser, split_addresses, fetch_builing_ids, preprocess_df
def pipeline(parser: Optional[RossetiParser] = None) -> RossetiParser:
    """Run the full processing chain on a parser's DataFrame: split
    addresses, fetch building ids, normalise columns.

    Creates a fresh RossetiParser (which scrapes immediately) when none
    is supplied; returns the processed parser.
    """
    if parser is None:
        parser = RossetiParser()
    print(parser)
    for step in (split_addresses, fetch_builing_ids, preprocess_df):
        parser.df = step(parser.df)
    return parser