Added class for web page parsing
This commit is contained in:
parent
618f13fbd5
commit
bdca9a6ebf
116
parser/__init__.py
Normal file
116
parser/__init__.py
Normal file
@ -0,0 +1,116 @@
|
||||
import io
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Mapping, Optional, Tuple, no_type_check
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class RossetiParser:
    """Scraper for the planned power-outage schedule on rosseti-lenenergo.ru.

    The parser requests the ``planned_work`` listing for a date window,
    follows the pagination links and accumulates every page's table into a
    single ``pandas.DataFrame`` stored in ``self.df``. Alternatively, a
    previously saved CSV can be loaded instead of hitting the network.

    Attributes:
        base_url: Address of the listing page that is queried.
        ndays: Default length of the requested window, in days.
        today: Default first day of the requested window.
        df: The collected table (set by ``fetch`` or ``load_df``).
        soup: Parsed HTML of the most recently downloaded page.
    """

    # Seconds to wait for the server on each request; without a timeout
    # ``requests`` may block forever on an unresponsive host.
    REQUEST_TIMEOUT = 30

    def __init__(self, ndays: int = 7, today: Optional[datetime] = None,
                 file_path: Optional[str] = None):
        """Create a parser and populate ``self.df``.

        Args:
            ndays: Length of the window to request, in days.
            today: First day of the window. ``None`` (the default) means
                "now", resolved at call time rather than once at import time.
            file_path: When given, load the table from this CSV instead of
                downloading it.
        """
        self.base_url = "https://rosseti-lenenergo.ru/planned_work"

        self.ndays = ndays
        # Resolve the default here: a ``datetime.today()`` default argument
        # would be evaluated only once, when the class body is executed.
        self.today = datetime.today() if today is None else today

        if file_path is None:
            self.fetch(ndays)
        else:
            self.load_df(file_path)

    def __str__(self):
        """Return a one-line summary: start date, window length, row count."""
        return f"From {self.today.date()} for {self.ndays} days with {len(self.df)}_records"

    @staticmethod
    def __format_date(date: datetime) -> str:
        """Format *date* as ``DD.MM.YY`` for use in the query string."""
        # NOTE(review): two-digit year is kept from the original code;
        # confirm that the site does not expect a four-digit year.
        return date.strftime("%d.%m.%y")

    def __compose_date_params(self, ndays: int, today: datetime) -> Mapping[str, str]:
        """Build the ``date_start`` / ``date_finish`` query parameters.

        The window starts at *today* and ends *ndays* days later.
        """
        return {
            'date_start': self.__format_date(today),
            'date_finish': self.__format_date(today + timedelta(days=ndays)),
        }

    def __get_page(self, url: str, params: Mapping[str, str]):
        """Download *url* with *params* and store the parsed HTML in ``self.soup``.

        Raises:
            requests.RequestException: On timeout, connection failure or a
                non-success HTTP status.
        """
        r = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on an error page instead of trying to parse it.
        r.raise_for_status()

        self.soup = BeautifulSoup(r.text, features='html.parser')

    def __parse_nav(self) -> Tuple[Optional[str], Optional[str]]:
        """Read the pagination links from the current page.

        Returns:
            ``(next_uri, last_uri)``: the ``href`` of the link with class
            ``next`` and the ``href`` of the last link in the pagination
            block. ``(None, None)`` when the page has no pagination block
            or no "next" link, i.e. there is nothing further to fetch.
        """
        navigator = self.soup.find('span', attrs={'class': 'page-nav-i'})
        if navigator is None:
            return None, None

        next_link = navigator.find('a', attrs={'class': 'next'})
        if next_link is None:
            return None, None

        last_link = navigator.find_all('a')[-1]
        return next_link['href'], last_link['href']

    def __parse_table(self) -> pd.DataFrame:
        """Extract the outage table from the current page as a DataFrame.

        Raises:
            ValueError: If the page does not contain the expected table.
        """
        table = self.soup.find(
            'table', attrs={'class': 'tableous_facts funds'})
        if table is None:
            raise ValueError("Outage table not found on the page")

        return pd.read_html(io.StringIO(str(table)))[0]

    def __set_columns(self):
        """Replace the column labels of ``self.df`` with the fixed header set."""
        self.df.columns = (
            "Регион РФ (область, край, город фед. значения, округ)",
            "Административный район",
            "Населённый пункт",
            "Улица",
            "Плановая дата начала отключения электроснабжения",
            "Плановое время начала отключения электроснабжения",
            "Плановая дата восстановления отключения электроснабжения",
            "Плановое время восстановления отключения электроснабжения",
            "Филиал",
            "РЭС",
            "Комментарий",
        )

    def fetch(self, ndays: Optional[int] = None, today: Optional[datetime] = None):
        """Download every page of the listing and store the result in ``self.df``.

        Args:
            ndays: Window length in days; defaults to ``self.ndays``.
            today: First day of the window; defaults to ``self.today``.

        Raises:
            requests.RequestException: If any page cannot be downloaded.
            ValueError: If a page lacks the outage table.
        """
        if ndays is None:
            ndays = self.ndays
        if today is None:
            today = self.today

        params = self.__compose_date_params(ndays, today)

        self.__get_page(self.base_url, params)
        # Collect per-page frames and concatenate once at the end rather
        # than re-copying the growing frame on every page.
        frames = [self.__parse_table()]
        next_uri, last_uri = self.__parse_nav()

        while next_uri is not None:
            # Instead of adding params to every request, a session could be
            # used to store them with cookies.
            self.__get_page(self.base_url + next_uri, params)
            frames.append(self.__parse_table())

            # The page just fetched was the last one: stop before looking
            # for a "next" link that it does not have.
            if next_uri == last_uri:
                break

            next_uri, _ = self.__parse_nav()

        self.df = pd.concat(frames, ignore_index=True)

        self.__set_columns()

    def save_df(self, file_path: str):
        """Write ``self.df`` to *file_path* as CSV (row index included)."""
        self.df.to_csv(file_path)

    def load_df(self, file_path: str):
        """Load ``self.df`` from a CSV previously written by ``save_df``.

        The first CSV column is the row index that ``save_df`` writes, so it
        is read back as the index rather than as an extra data column.
        """
        self.df = pd.read_csv(file_path, index_col=0)
|
||||
|
||||
|
||||
def main():
    """Download the schedule with default settings, print a summary, save it.

    The collected table is written to ``./data.csv``.
    """
    outages = RossetiParser()
    print(outages)
    outages.save_df('./data.csv')


if __name__ == "__main__":
    main()
|
Loading…
x
Reference in New Issue
Block a user