Finished work
(Too lazy to split by commits)
This commit is contained in:
5
rosseti_parser/__init__.py
Normal file
5
rosseti_parser/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from .rosseti import RossetiParser
|
||||
from .address import split_addresses
|
||||
from .building_id import fetch_builing_ids
|
||||
from .preprocess import preprocess_df, COL_NS, ICOL_NS, preprocess_read_df, group_by_index
|
||||
from .util import pipeline
|
28
rosseti_parser/__main__.py
Normal file
28
rosseti_parser/__main__.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import sys
|
||||
import schedule
|
||||
import time
|
||||
|
||||
from . import pipeline
|
||||
|
||||
|
||||
def job():
|
||||
parser = pipeline()
|
||||
parser.save_df(f'./data_{parser.today.strftime("%d-%m-%y_%H:%M")}.csv')
|
||||
|
||||
|
||||
if len(sys.argv) == 2:
|
||||
if sys.argv[1] == '-h' or sys.argv[1] == '--help':
|
||||
print("python -m parser [<running period in hours>]")
|
||||
exit(0)
|
||||
|
||||
interval = int(sys.argv[1])
|
||||
if interval > 0:
|
||||
schedule.every(interval).hours.do(job)
|
||||
|
||||
job()
|
||||
|
||||
while True:
|
||||
schedule.run_pending()
|
||||
time.sleep(schedule.idle_seconds())
|
||||
else:
|
||||
job()
|
88
rosseti_parser/address.py
Normal file
88
rosseti_parser/address.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from __future__ import annotations
|
||||
from typing import List, Iterable, TypeVar, Any
|
||||
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
STREET_PREFIXES = ('ул.', 'бул.', 'пр.', 'ул', 'бул',
|
||||
'пр', 'ш.', 'ш', 'пер.', 'пер')
|
||||
HOUSES_PREFIXES = ('д.', 'д')
|
||||
|
||||
|
||||
def unfold_house_ranges(token: str) -> str:
|
||||
pairs_strings = re.findall(r'([\d]+-[\d]+)', token)
|
||||
for pair_string in pairs_strings:
|
||||
a, b = pair_string.split('-')
|
||||
a, b = int(a), int(b)
|
||||
|
||||
if b > a:
|
||||
token = token.replace(
|
||||
pair_string, ', '.join(map(str, range(a, b+1))))
|
||||
|
||||
return token
|
||||
|
||||
|
||||
def unfold_houses_list(token: str) -> List[str]:
|
||||
token = unfold_house_ranges(token)
|
||||
|
||||
reg = re.compile(r'(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )')
|
||||
|
||||
if len(re.findall(reg, token)) > 1:
|
||||
tokens = token.split(',')
|
||||
return [*[tokens[0] + ' ' + house_token for house_token in tokens[1:]]]
|
||||
return [token]
|
||||
|
||||
|
||||
def any_of_in(substrings: Iterable[str], string: str) -> bool:
|
||||
return any(map(lambda substring: substring in string, substrings))
|
||||
|
||||
|
||||
def flatten(arr: Iterable[List[T]]) -> List[T]:
|
||||
return sum(arr, [])
|
||||
|
||||
|
||||
def split_address(address: str) -> List[str]:
|
||||
if ';' in address:
|
||||
return flatten(map(unfold_houses_list, address.split(';')))
|
||||
elif ',' in address:
|
||||
tokens = re.split(r'(,)', address)
|
||||
|
||||
tokens = list(map(str.strip, filter(
|
||||
lambda token: token != '', tokens)))
|
||||
|
||||
res = []
|
||||
accumulator = ''
|
||||
|
||||
for i in range(len(tokens)):
|
||||
if (any_of_in(STREET_PREFIXES, tokens[i].lower()) and
|
||||
any_of_in(STREET_PREFIXES, accumulator.lower())):
|
||||
res += unfold_houses_list(accumulator)
|
||||
accumulator = ''
|
||||
|
||||
accumulator += tokens[i]
|
||||
|
||||
res += unfold_houses_list(accumulator)
|
||||
|
||||
return res
|
||||
|
||||
return [address]
|
||||
|
||||
|
||||
def process_row(row: pd.Series[str]) -> pd.Series[str]:
|
||||
row = row.copy()
|
||||
|
||||
if pd.isnull(row['Улица']):
|
||||
row['Улица'] = [None]
|
||||
else:
|
||||
addresses = split_address(row['Улица'])
|
||||
row['Улица'] = addresses
|
||||
|
||||
return row
|
||||
|
||||
|
||||
def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
|
||||
merged_df = df.apply(process_row, axis=1).reset_index()
|
||||
|
||||
return merged_df.explode('Улица', ignore_index=True)
|
31
rosseti_parser/building_id.py
Normal file
31
rosseti_parser/building_id.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from __future__ import annotations
|
||||
from typing import Optional, Tuple, Any, List
|
||||
|
||||
import requests
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
GeoTupleType = Tuple[Optional[int], Optional[float], Optional[float]]
|
||||
|
||||
|
||||
def get_building_id(street: str) -> GeoTupleType:
|
||||
if pd.isnull(street):
|
||||
return None, None, None
|
||||
|
||||
r = requests.get('https://geocode.gate.petersburg.ru/parse/eas', params={
|
||||
'street': street
|
||||
}, timeout=10)
|
||||
|
||||
res = r.json()
|
||||
|
||||
if 'error' in res:
|
||||
return None, None, None
|
||||
|
||||
return res['Building_ID'], res['Latitude'], res['Longitude']
|
||||
|
||||
|
||||
def fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame:
|
||||
df[['ID здания', 'Широта', 'Долгота']] = df.apply(
|
||||
lambda row: get_building_id(row['Улица']), axis=1, result_type='expand')
|
||||
|
||||
return df
|
62
rosseti_parser/preprocess.py
Normal file
62
rosseti_parser/preprocess.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from __future__ import annotations
|
||||
from typing import Any, List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
COL_NS = {
|
||||
'region': 'Регион РФ (область, край, город фед. значения, округ)',
|
||||
'area': 'Административный район',
|
||||
'town': 'Населённый пункт',
|
||||
'street': 'Улица',
|
||||
'start_date': 'Плановая дата начала отключения электроснабжения',
|
||||
'start_time': 'Плановое время начала отключения электроснабжения',
|
||||
'finish_date': 'Плановая дата восстановления отключения электроснабжения',
|
||||
'finish_time': 'Плановое время восстановления отключения электроснабжения',
|
||||
'branch': 'Филиал',
|
||||
'res': 'РЭС',
|
||||
'comment': 'Комментарий',
|
||||
'building_id': 'ID здания',
|
||||
'lat': 'Широта',
|
||||
'lng': 'Долгота'
|
||||
}
|
||||
|
||||
ICOL_NS = dict(map(reversed, COL_NS.items()))
|
||||
|
||||
|
||||
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
|
||||
df.rename(columns=ICOL_NS, inplace=True)
|
||||
|
||||
for a in ('start', 'finish'):
|
||||
df[f'{a}'] = pd.to_datetime(
|
||||
df[f'{a}_date'].astype(str) + ' ' + df[f'{a}_time'].astype(str),
|
||||
dayfirst=True
|
||||
)
|
||||
df.drop(columns=[f'{a}_date', f'{a}_time'], inplace=True)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def preprocess_read_df(df: pd.DataFrame) -> pd.DataFrame:
|
||||
for name in ('start', 'finish'):
|
||||
df[name] = pd.to_datetime(df[name])
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def join_columns(col: pd.Series[Any]) -> List[Any] | Any:
|
||||
first = col.iloc[0]
|
||||
|
||||
if col.name in ('street', 'building_id', 'lat', 'lng') and pd.notnull(first):
|
||||
return list(col)
|
||||
|
||||
return first
|
||||
|
||||
|
||||
def group_by_index(df: pd.DataFrame) -> pd.DataFrame:
|
||||
groupped = df.groupby('index')
|
||||
|
||||
res_df = groupped.apply(
|
||||
lambda index_df: index_df.apply(join_columns)
|
||||
).drop(columns='index')
|
||||
|
||||
return res_df
|
109
rosseti_parser/rosseti.py
Normal file
109
rosseti_parser/rosseti.py
Normal file
@@ -0,0 +1,109 @@
|
||||
from datetime import datetime, timedelta
|
||||
import io
|
||||
from typing import Mapping, Optional, Tuple, no_type_check
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class RossetiParser:
|
||||
def __init__(self, ndays=7, today: Optional[datetime] = None, file_path: Optional[str] = None) -> None:
|
||||
self.base_url = "https://rosseti-lenenergo.ru/planned_work"
|
||||
|
||||
if today is None:
|
||||
today = datetime.today()
|
||||
|
||||
self.ndays = ndays
|
||||
self.today = today
|
||||
self.df = pd.DataFrame()
|
||||
|
||||
if file_path is None:
|
||||
self.fetch(ndays)
|
||||
else:
|
||||
self.load_df(file_path)
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"From {self.today.date()} for {self.ndays} days with {len(self.df)} records"
|
||||
|
||||
@staticmethod
|
||||
def __format_date(date: datetime) -> str:
|
||||
return date.strftime("%d.%m.%y")
|
||||
|
||||
def __compose_date_params(self, ndays: int, today: datetime) -> Mapping[str, str]:
|
||||
date_start = self.__format_date(today)
|
||||
date_finish = self.__format_date(today + timedelta(days=ndays))
|
||||
|
||||
return {
|
||||
'date_start': date_start,
|
||||
'date_finish': date_finish
|
||||
}
|
||||
|
||||
def __get_page(self, url: str, params: Mapping[str, str]) -> None:
|
||||
r = requests.get(url, params)
|
||||
|
||||
self.soup = BeautifulSoup(r.text, features='html.parser')
|
||||
|
||||
def __parse_nav(self) -> Tuple[str, str]:
|
||||
navigator = self.soup.find('span', attrs={'class': 'page-nav-i'})
|
||||
|
||||
next_uri = navigator.find('a', attrs={'class': 'next'})['href']
|
||||
last_uri = navigator.find_all('a')[-1]['href']
|
||||
|
||||
return next_uri, last_uri
|
||||
|
||||
def __parse_table(self) -> pd.DataFrame:
|
||||
table = self.soup.find(
|
||||
'table', attrs={'class': 'tableous_facts funds'})
|
||||
|
||||
return pd.read_html(io.StringIO(str(table)))[0]
|
||||
|
||||
def __save_page(self, uri: str) -> None:
|
||||
print(f'Processing page "{uri}"')
|
||||
self.__get_page(self.base_url + uri, self.__params)
|
||||
self.df = pd.concat(
|
||||
(self.df, self.__parse_table()), ignore_index=True)
|
||||
|
||||
def __set_columns(self) -> None:
|
||||
self.df.columns = pd.Index((
|
||||
"Регион РФ (область, край, город фед. значения, округ)",
|
||||
"Административный район",
|
||||
"Населённый пункт",
|
||||
"Улица",
|
||||
"Плановая дата начала отключения электроснабжения",
|
||||
"Плановое время начала отключения электроснабжения",
|
||||
"Плановая дата восстановления отключения электроснабжения",
|
||||
"Плановое время восстановления отключения электроснабжения",
|
||||
"Филиал",
|
||||
"РЭС",
|
||||
"Комментарий",
|
||||
))
|
||||
|
||||
def fetch(self, ndays: Optional[int] = None, today: Optional[datetime] = None) -> None:
|
||||
if ndays is None:
|
||||
ndays = self.ndays
|
||||
if today is None:
|
||||
today = self.today
|
||||
|
||||
self.__params = self.__compose_date_params(ndays, today)
|
||||
|
||||
self.__save_page('')
|
||||
|
||||
next_uri, last_uri = self.__parse_nav()
|
||||
|
||||
while next_uri != last_uri:
|
||||
self.__save_page(next_uri)
|
||||
|
||||
next_uri, _ = self.__parse_nav()
|
||||
|
||||
self.__save_page(next_uri)
|
||||
|
||||
self.__set_columns()
|
||||
|
||||
def save_df(self, file_path: str) -> None:
|
||||
print(f'Saved as "{file_path}"')
|
||||
self.df.to_csv(file_path, index=False)
|
||||
|
||||
def load_df(self, file_path: str) -> None:
|
||||
print(f'Read from "{file_path}"')
|
||||
self.df = pd.read_csv(file_path)
|
18
rosseti_parser/util.py
Normal file
18
rosseti_parser/util.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from typing import Optional
|
||||
|
||||
from . import RossetiParser, split_addresses, fetch_builing_ids, preprocess_df
|
||||
|
||||
|
||||
def pipeline(parser: Optional[RossetiParser] = None) -> RossetiParser:
|
||||
if parser is None:
|
||||
parser = RossetiParser()
|
||||
|
||||
print(parser)
|
||||
|
||||
parser.df = split_addresses(parser.df)
|
||||
|
||||
parser.df = fetch_builing_ids(parser.df)
|
||||
|
||||
parser.df = preprocess_df(parser.df)
|
||||
|
||||
return parser
|
Reference in New Issue
Block a user