import io
from datetime import datetime, timedelta
from typing import Mapping, Optional, Tuple

from bs4 import BeautifulSoup
import requests
import pandas as pd


class RossetiParser:
    """Scraper for the Rosseti Lenenergo planned-outage listing.

    Downloads every page of https://rosseti-lenenergo.ru/planned_work for a
    date window and accumulates the rows into ``self.df`` (a pandas
    DataFrame), or loads a previously saved CSV instead of fetching.
    """

    def __init__(self, ndays: int = 7, today: Optional[datetime] = None,
                 file_path: Optional[str] = None):
        """
        Args:
            ndays: how many days ahead of ``today`` to query.
            today: start date of the window.  ``None`` means "now, at call
                time" — a literal ``datetime.today()`` default would be
                evaluated once at import time and then silently go stale,
                so a ``None`` sentinel is used instead.
            file_path: if given, skip the network fetch entirely and load
                this CSV (as written by :meth:`save_df`).
        """
        self.base_url = "https://rosseti-lenenergo.ru/planned_work"

        self.ndays = ndays
        self.today = today if today is not None else datetime.today()

        if file_path is None:
            self.fetch(self.ndays)
        else:
            self.load_df(file_path)

    def __str__(self) -> str:
        return f"From {self.today.date()} for {self.ndays} days with {len(self.df)} records"

    @staticmethod
    def __format_date(date: datetime) -> str:
        # The site expects dd.mm.yy in its query parameters.
        return date.strftime("%d.%m.%y")

    def __compose_date_params(self, ndays: int, today: datetime) -> Mapping[str, str]:
        """Build the ``date_start``/``date_finish`` query parameters."""
        return {
            'date_start': self.__format_date(today),
            'date_finish': self.__format_date(today + timedelta(days=ndays)),
        }

    def __get_page(self, url: str, params: Mapping[str, str]) -> None:
        """GET *url* and keep the parsed HTML in ``self.soup``."""
        r = requests.get(url, params=params)
        # Fail loudly on HTTP errors instead of scraping an error page.
        r.raise_for_status()
        self.soup = BeautifulSoup(r.text, features='html.parser')

    def __parse_nav(self) -> Optional[Tuple[str, str]]:
        """Return ``(next_uri, last_uri)`` from the pager.

        Returns ``None`` when the result set fits on a single page (no pager
        span, or no "next" link) — the original code crashed on ``['href']``
        of ``None`` in that case.
        """
        navigator = self.soup.find('span', attrs={'class': 'page-nav-i'})
        if navigator is None:
            return None

        next_link = navigator.find('a', attrs={'class': 'next'})
        if next_link is None:
            return None

        return next_link['href'], navigator.find_all('a')[-1]['href']

    def __parse_table(self) -> pd.DataFrame:
        """Extract the outage table from the current page as a DataFrame."""
        table = self.soup.find(
            'table', attrs={'class': 'tableous_facts funds'})

        return pd.read_html(io.StringIO(str(table)))[0]

    def __set_columns(self) -> None:
        """Assign the canonical (Russian) column headers."""
        self.df.columns = (
            "Регион РФ (область, край, город фед. значения, округ)",
            "Административный район",
            "Населённый пункт",
            "Улица",
            "Плановая дата начала отключения электроснабжения",
            "Плановое время начала отключения электроснабжения",
            "Плановая дата восстановления отключения электроснабжения",
            "Плановое время восстановления отключения электроснабжения",
            "Филиал",
            "РЭС",
            "Комментарий",
        )

    def fetch(self, ndays: Optional[int] = None, today: Optional[datetime] = None) -> None:
        """Download every result page into ``self.df``.

        Args:
            ndays: window length in days; defaults to ``self.ndays``.
            today: window start; defaults to ``self.today``.
        """
        if ndays is None:
            ndays = self.ndays
        if today is None:
            today = self.today

        params = self.__compose_date_params(ndays, today)

        self.__get_page(self.base_url, params)
        self.df = self.__parse_table()

        def get(next_uri: str) -> None:
            # The date params are re-sent with every request; a
            # requests.Session carrying them via cookies would also work.
            self.__get_page(self.base_url + next_uri, params)
            self.df = pd.concat(
                (self.df, self.__parse_table()), ignore_index=True)

        # Follow the pager until the "next" link equals the "last" link,
        # then fetch that final page too (same visit order as before, but
        # a single-page result no longer crashes).
        nav = self.__parse_nav()
        while nav is not None:
            next_uri, last_uri = nav
            get(next_uri)
            if next_uri == last_uri:
                break
            nav = self.__parse_nav()

        self.__set_columns()

    def save_df(self, file_path: str) -> None:
        """Write ``self.df`` as CSV without the index, so that
        :meth:`load_df` round-trips cleanly (the old code re-imported the
        index as a spurious ``Unnamed: 0`` column)."""
        self.df.to_csv(file_path, index=False)

    def load_df(self, file_path: str) -> None:
        """Load a DataFrame previously written by :meth:`save_df`."""
        self.df = pd.read_csv(file_path)


def main():
    """Fetch the default window and dump it to ./data.csv."""
    parser = RossetiParser()

    print(parser)

    parser.save_df('./data.csv')


if __name__ == "__main__":
    main()