"""Planned power-outage parser for rosseti-lenenergo.ru plus address helpers.

Consolidates the code added by the patch that split ``parser/__init__.py``
into ``parser/address.py`` (address splitting) and ``parser/rosseti.py``
(``RossetiParser``).  Both names the package re-exported, ``RossetiParser``
and ``split_addresses``, are defined here.
"""
from datetime import datetime, timedelta
import io
from itertools import chain
import re
from typing import Iterable, List, Mapping, Optional, Tuple, TypeVar, no_type_check

import pandas as pd

T = TypeVar('T')

# NOTE(review): these are matched as plain substrings (see ``any_of_in``), so
# short entries such as 'ш' or 'пр' also fire inside unrelated words.
street_prefixes = ('ул.', 'бул.', 'пр.', 'ул', 'бул',
                   'пр', 'ш.', 'ш', 'пер.', 'пер')
houses_prefixes = ('д.', 'д')

# Compiled once: both patterns are used for every address that is processed.
_RANGE_RE = re.compile(r'([\d]+-[\d]+)')
_HOUSE_RE = re.compile(r'(д|д\.)? ?\d+[а-яА-Я\/]*\d*(,|$| )')


def unfold_house_ranges(token: str) -> str:
    """Expand ascending numeric ranges in *token* into comma-separated lists.

    ``"1-3"`` becomes ``"1, 2, 3"``.  A range whose end is not greater than
    its start (``"5-3"``, ``"4-4"``) is left untouched.

    Each range is substituted exactly where it was matched, so expanding
    ``"1-2"`` can no longer damage a longer range that merely contains it
    (``"21-23"``).
    """
    def expand(match: 're.Match') -> str:
        first, last = map(int, match.group(0).split('-'))
        if last > first:
            return ', '.join(map(str, range(first, last + 1)))
        return match.group(0)

    return _RANGE_RE.sub(expand, token)


def unfold_houses_list(token: str) -> List[str]:
    """Turn ``"street, house, house, ..."`` into one address per house.

    Ranges are expanded first.  When more than one house-number-like fragment
    is found, the text is split on commas, the first piece is treated as the
    street and every following non-empty piece is appended to it.  Otherwise
    (or when the split yields no house pieces) the token is returned as the
    only element, so an address is never dropped.

    NOTE(review): the first comma-separated piece is assumed to be the street
    alone; for input like ``"ул. Ленина д. 1, 2"`` the first house stays glued
    to the street in every result.
    """
    token = unfold_house_ranges(token)

    if len(_HOUSE_RE.findall(token)) > 1:
        street, *houses = (part.strip() for part in token.split(','))
        # Empty pieces come from trailing or doubled commas; skip them.
        unfolded = [f'{street} {house}' for house in houses if house]
        if unfolded:
            return unfolded

    return [token]


def any_of_in(substrings: Iterable[str], string: str) -> bool:
    """Return True when at least one of *substrings* occurs in *string*."""
    return any(substring in string for substring in substrings)


def flatten(arr: Iterable[Iterable[T]]) -> List[T]:
    """Concatenate an iterable of iterables into a single list."""
    return list(chain.from_iterable(arr))


def split_address(address: str) -> List[str]:
    """Split a free-form address cell into individual addresses.

    * With ``;`` present, every ``;``-separated part is unfolded on its own.
    * Otherwise, with ``,`` present, comma pieces are grouped: a new group
      starts when a piece contains a street prefix and the current group
      already has one.  Each group is re-joined with ``", "`` and unfolded.
    * Otherwise the address is returned unchanged.

    Blank parts are ignored.  If nothing usable remains, the original string
    is returned so the caller always gets at least one entry.
    """
    if ';' in address:
        parts = (part.strip() for part in address.split(';'))
        result = flatten(unfold_houses_list(part) for part in parts if part)
        return result or [address]

    if ',' not in address:
        return [address]

    pieces = [piece for piece in (raw.strip() for raw in address.split(','))
              if piece]

    groups: List[List[str]] = []
    current: List[str] = []
    for piece in pieces:
        starts_new_street = (
            any_of_in(street_prefixes, piece.lower())
            and any(any_of_in(street_prefixes, seen.lower())
                    for seen in current)
        )
        if starts_new_street:
            groups.append(current)
            current = []
        current.append(piece)
    if current:
        groups.append(current)

    result = flatten(unfold_houses_list(', '.join(group)) for group in groups)
    return result or [address]


def process_row(row):
    """Return a copy of *row* whose ``'Улица'`` value is a list of addresses.

    Rows with a null ``'Улица'`` are returned unchanged.
    """
    if pd.isnull(row['Улица']):
        return row

    addresses = split_address(row['Улица'])

    row = row.copy()
    row['Улица'] = addresses

    return row


def split_addresses(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with one row per individual address in ``'Улица'``.

    Only the ``'Улица'`` column is transformed, which leaves the dtypes of the
    other columns alone.  Null cells stay null.
    """
    result = df.copy()
    result['Улица'] = result['Улица'].map(
        lambda value: value if pd.isnull(value) else split_address(value))
    return result.explode('Улица', ignore_index=True)


class RossetiParser:
    """Download the planned-work table from rosseti-lenenergo.ru.

    The result is kept in ``self.df``.  Constructing an instance without
    *file_path* immediately performs the download.
    """

    # Seconds to wait for each HTTP response before giving up.
    timeout = 30

    # Column names assigned to the downloaded table, in table order.
    COLUMNS = (
        "Регион РФ (область, край, город фед. значения, округ)",
        "Административный район",
        "Населённый пункт",
        "Улица",
        "Плановая дата начала отключения электроснабжения",
        "Плановое время начала отключения электроснабжения",
        "Плановая дата восстановления отключения электроснабжения",
        "Плановое время восстановления отключения электроснабжения",
        "Филиал",
        "РЭС",
        "Комментарий",
    )

    def __init__(self, ndays: int = 7, today: Optional[datetime] = None,
                 file_path: Optional[str] = None):
        """Create a parser and fill ``self.df``.

        Args:
            ndays: Length of the requested period in days.
            today: First day of the period; defaults to the current date.
            file_path: When given, load a previously saved CSV instead of
                downloading.
        """
        self.base_url = "https://rosseti-lenenergo.ru/planned_work"

        if today is None:
            today = datetime.today()

        self.ndays = ndays
        self.today = today
        self.df = pd.DataFrame()

        if file_path is None:
            self.fetch(ndays)
        else:
            self.load_df(file_path)

    def __str__(self):
        return f"From {self.today.date()} for {self.ndays} days with {len(self.df)} records"

    @staticmethod
    def __format_date(date: datetime) -> str:
        """Format *date* as ``DD.MM.YY``."""
        return date.strftime("%d.%m.%y")

    def __compose_date_params(self, ndays: int, today: datetime) -> Mapping[str, str]:
        """Build the query parameters for the period ``today .. today + ndays``."""
        return {
            'date_start': self.__format_date(today),
            'date_finish': self.__format_date(today + timedelta(days=ndays)),
        }

    def __get_page(self, url: str, params: Mapping[str, str]):
        """Download *url* and store the parsed document in ``self.soup``.

        Raises:
            requests.HTTPError: The server answered with an error status.
            requests.Timeout: No response within ``self.timeout`` seconds.
        """
        # Imported here so the address helpers above stay importable when the
        # scraping dependencies are not installed.
        from bs4 import BeautifulSoup
        import requests

        response = requests.get(url, params=params, timeout=self.timeout)
        # Fail here instead of trying to parse an error page as the table.
        response.raise_for_status()

        self.soup = BeautifulSoup(response.text, features='html.parser')

    def __parse_nav(self) -> Optional[Tuple[str, str]]:
        """Return ``(next_uri, last_uri)`` from the current page's paginator.

        Returns ``None`` when the page has no paginator or no "next" link.
        """
        navigator = self.soup.find('span', attrs={'class': 'page-nav-i'})
        if navigator is None:
            return None

        next_link = navigator.find('a', attrs={'class': 'next'})
        links = navigator.find_all('a')
        if next_link is None or not links:
            return None

        return next_link['href'], links[-1]['href']

    def __parse_table(self) -> pd.DataFrame:
        """Return the outage table of the current page (empty if absent)."""
        table = self.soup.find(
            'table', attrs={'class': 'tableous_facts funds'})
        if table is None:
            return pd.DataFrame()

        return pd.read_html(io.StringIO(str(table)))[0]

    def __save_page(self, uri: str):
        """Download the page at *uri* and append its table to ``self.df``."""
        print(f'Processing page "{uri}"')
        self.__get_page(self.base_url + uri, self.__params)
        self.df = pd.concat(
            (self.df, self.__parse_table()), ignore_index=True)

    def __set_columns(self):
        """Give ``self.df`` the names listed in ``COLUMNS``."""
        if self.df.shape[1] == 0:
            # Nothing was downloaded: still expose the expected columns.
            self.df = pd.DataFrame(columns=self.COLUMNS)
        else:
            self.df.columns = self.COLUMNS

    def fetch(self, ndays: Optional[int] = None, today: Optional[datetime] = None):
        """Download every page of the requested period into ``self.df``.

        Previous contents of ``self.df`` are discarded, and ``self.ndays`` /
        ``self.today`` are updated so ``str(self)`` describes the data held.

        Args:
            ndays: Period length in days; defaults to ``self.ndays``.
            today: First day of the period; defaults to ``self.today``.
        """
        if ndays is None:
            ndays = self.ndays
        if today is None:
            today = self.today
        self.ndays, self.today = ndays, today

        self.__params = self.__compose_date_params(ndays, today)
        # Start clean so a repeated call does not mix old and new rows.
        self.df = pd.DataFrame()

        self.__save_page('')

        nav = self.__parse_nav()
        if nav is not None:
            next_uri, last_uri = nav
            while True:
                self.__save_page(next_uri)
                if next_uri == last_uri:
                    break
                nav = self.__parse_nav()
                if nav is None:
                    break
                next_uri = nav[0]

        self.__set_columns()

    def save_df(self, file_path: str):
        """Write ``self.df`` to *file_path* as CSV without the index column."""
        self.df.to_csv(file_path, index=False)
        print(f'Saved as "{file_path}"')

    def load_df(self, file_path: str):
        """Replace ``self.df`` with the CSV stored at *file_path*."""
        print(f'Read from "{file_path}"')
        self.df = pd.read_csv(file_path)