Added async building_id fetching and defaulted it

This commit is contained in:
Dmitriy Shishkov 2023-09-21 22:37:10 +03:00
parent c705a0ed02
commit 464d0d3640
Signed by: dm1sh
GPG Key ID: 027994B0AA357688
7 changed files with 210 additions and 310 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,10 @@
aiohttp==3.8.5
aiosignal==1.3.1
annotated-types==0.5.0
anyio==3.7.1
asttokens==2.4.0
async-timeout==4.0.3
attrs==23.1.0
autopep8==2.0.4
backcall==0.2.0
beautifulsoup4==4.12.2
@ -14,6 +18,7 @@ decorator==5.1.1
executing==1.2.0
fastapi==0.103.1
fonttools==4.42.1
frozenlist==1.4.0
greenlet==2.0.2
idna==3.4
ipykernel==6.25.2
@ -25,6 +30,7 @@ kiwisolver==1.4.5
lxml==4.9.3
matplotlib==3.8.0
matplotlib-inline==0.1.6
multidict==6.0.4
mypy==1.5.1
mypy-extensions==1.0.0
nest-asyncio==1.5.8
@ -70,3 +76,4 @@ typing_extensions==4.8.0
tzdata==2023.3
urllib3==2.0.4
wcwidth==0.2.6
yarl==1.9.2

View File

@ -1,15 +1,21 @@
aiohttp==3.8.5
aiosignal==1.3.1
annotated-types==0.5.0
anyio==3.7.1
async-timeout==4.0.3
attrs==23.1.0
beautifulsoup4==4.12.2
bs4==0.0.1
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.7
fastapi==0.103.1
frozenlist==1.4.0
greenlet==2.0.2
h11==0.14.0
idna==3.4
lxml==4.9.3
multidict==6.0.4
numpy==1.26.0
pandas==2.1.1
psycopg==3.1.10
@ -30,3 +36,4 @@ typing_extensions==4.8.0
tzdata==2023.3
urllib3==2.0.5
uvicorn==0.23.2
yarl==1.9.2

View File

@ -33,6 +33,14 @@ def split_addresses(df: pd.DataFrame) -> pd.DataFrame
```python
def fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame
```
- `async_fetch_building_ids`:
```python
async def async_fetch_building_ids(df: pd.DataFrame) -> pd.DataFrame
```
- `concurrent_fetch_builing_ids`:
```python
def concurrent_fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame
```
- `preprocess_df`:
```python
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame

View File

@ -1,5 +1,5 @@
from .rosseti import RossetiParser
from .address import split_addresses
from .building_id import fetch_builing_ids
from .building_id import fetch_builing_ids, async_fetch_building_ids, concurrent_fetch_builing_ids
from .preprocess import preprocess_df, COL_NS, ICOL_NS, preprocess_read_df, group_by_index
from .util import pipeline

View File

@ -4,6 +4,8 @@ from typing import Optional, Tuple, Any, List
import requests
import pandas as pd
import numpy as np
import asyncio
import aiohttp
GeoTupleType = Tuple[Optional[int], Optional[float], Optional[float]]
@ -29,3 +31,42 @@ def fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame:
lambda row: get_building_id(row['Улица']), axis=1, result_type='expand')
return df
async def async_fetch_building_id(session: aiohttp.ClientSession, street: str) -> GeoTupleType:
    """Look up a single street via the geocode.gate.petersburg.ru service.

    Issues a GET request to the ``/parse/eas`` endpoint through ``session``,
    passing ``street`` as the ``street`` query parameter, and decodes the
    reply as JSON.

    Args:
        session: Open aiohttp client session used to send the request.
        street: Street string to look up; may be null.

    Returns:
        ``(Building_ID, Latitude, Longitude)`` read from the reply, or
        ``(None, None, None)`` when ``street`` is null or the reply
        contains an ``'error'`` key.
    """
    missing = (None, None, None)

    # Null streets are never sent to the service.
    if pd.isnull(street):
        return missing

    query = {'street': street}
    async with session.get('https://geocode.gate.petersburg.ru/parse/eas', params=query) as response:
        payload = await response.json()

    if 'error' not in payload:
        return payload['Building_ID'], payload['Latitude'], payload['Longitude']
    return missing
async def async_fetch_building_ids(df: pd.DataFrame) -> pd.DataFrame:
    """Fetch building id and coordinates for every row of ``df`` concurrently.

    One ``async_fetch_building_id`` lookup is scheduled per value of the
    ``'Улица'`` column; all lookups share a single ``aiohttp.ClientSession``
    and are awaited together with ``asyncio.gather``, which returns results
    in the same order as the rows.

    Args:
        df: Frame with an ``'Улица'`` column. It is modified in place.

    Returns:
        The same ``df`` object with the ``'ID здания'``, ``'Широта'`` and
        ``'Долгота'`` columns set from the lookup results.
    """
    geo_columns = ['ID здания', 'Широта', 'Долгота']

    # With zero rows gather() yields [], and assigning an empty list to three
    # columns raises "Columns must be same length as key". Add the (empty)
    # columns directly instead and skip opening a session altogether.
    if len(df) == 0:
        for column in geo_columns:
            df[column] = None
        return df

    async with aiohttp.ClientSession() as session:
        # gather() wraps each coroutine in a task itself, so no explicit
        # ensure_future()/append loop is needed.
        res = await asyncio.gather(
            *(async_fetch_building_id(session, street) for street in df['Улица'])
        )

    df[geo_columns] = res

    return df
def concurrent_fetch_builing_ids(df: pd.DataFrame) -> pd.DataFrame:
    """Synchronous entry point that runs ``async_fetch_building_ids``.

    Starts an event loop with ``asyncio.run``, executes the asynchronous
    fetch for ``df`` and returns its result.

    Args:
        df: Frame passed through to ``async_fetch_building_ids``.

    Returns:
        Whatever ``async_fetch_building_ids`` returns for ``df``.

    Note:
        ``asyncio.run`` cannot be called while another event loop is already
        running in the same thread; in that situation await
        ``async_fetch_building_ids`` directly instead.
    """
    return asyncio.run(async_fetch_building_ids(df))

View File

@ -1,6 +1,6 @@
from typing import Optional
from . import RossetiParser, split_addresses, fetch_builing_ids, preprocess_df
from . import RossetiParser, split_addresses, concurrent_fetch_builing_ids, preprocess_df
def pipeline(parser: Optional[RossetiParser] = None) -> RossetiParser:
@ -11,7 +11,7 @@ def pipeline(parser: Optional[RossetiParser] = None) -> RossetiParser:
parser.df = split_addresses(parser.df)
parser.df = fetch_builing_ids(parser.df)
parser.df = concurrent_fetch_builing_ids(parser.df)
parser.df = preprocess_df(parser.df)