Added epub file processing: currently epub file tokenization is ready
This commit is contained in:
parent
54e1dca653
commit
c24a26ab29
4
.gitignore
vendored
4
.gitignore
vendored
@ -1 +1,3 @@
|
|||||||
.venv
|
.venv
|
||||||
|
__pycache__/
|
||||||
|
.vscode
|
59
app/epub.py
Normal file
59
app/epub.py
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
import aiofiles as aiof
|
||||||
|
from base64 import b64encode
|
||||||
|
|
||||||
|
import ebooklib
|
||||||
|
from ebooklib import epub
|
||||||
|
|
||||||
|
from tempfile import SpooledTemporaryFile
|
||||||
|
|
||||||
|
|
||||||
|
async def epub2html(file: SpooledTemporaryFile):
    """
    Split an epub into tokens and join them into one html document.

    :param file: the uploaded epub as a file-like object
    :return: the html content as a string; if tokenization failed, the
        error message string produced by ``epub_to_tokens`` is returned
        instead so that the caller can show it to the user
    """

    tokens = await epub_to_tokens(file)

    # epub_to_tokens reports failure by returning a message string rather
    # than a dict; pass it through instead of silently dropping it.
    if isinstance(tokens, str):
        return tokens

    # TODO: join tokens to HTML
    html_content = ""

    return html_content
|
||||||
|
|
||||||
|
|
||||||
|
async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    """
    Pass the file content to ebooklib and parse the epub items into a dict
    of the following format:

        "<file_name>": "<file_content>"

    File content is plain text for document (xhtml) items and a base64
    ``data:`` URI for cover, image, style and video items, prepared for
    embedding into html. Items of any other type are skipped.

    :param file: the uploaded epub as a file-like object
    :return: mapping of item name to content. NOTE: if the book cannot be
        read, an error message string is returned instead of a dict
        (existing behaviour, kept for backward compatibility).
    """

    async with aiof.tempfile.NamedTemporaryFile() as tmp:
        await tmp.write(file.read())
        # read_epub reopens the file by name, so the buffered bytes must
        # be on disk before it is called.
        await tmp.flush()

        try:
            book = epub.read_epub(tmp.name)
            tokens = {}
            embedded_types = (
                ebooklib.ITEM_COVER,
                ebooklib.ITEM_IMAGE,
                ebooklib.ITEM_STYLE,
                ebooklib.ITEM_VIDEO,
            )
            for item in book.get_items():
                item_type = item.get_type()
                if item_type in embedded_types:
                    name = item.get_name()
                    media_type = item.media_type
                    b64_content = b64encode(item.get_content()).decode()

                    tokens[name] = f'data:{media_type};base64,{b64_content}'
                elif item_type == ebooklib.ITEM_DOCUMENT:
                    name = item.get_name()
                    content = item.get_content()
                    # The contract promises text; decode raw bytes so the
                    # tokens can later be joined into an html string.
                    if isinstance(content, bytes):
                        content = content.decode('utf-8')

                    tokens[name] = content
            return tokens
        except Exception as e:
            return 'Error! Wrong epub file format: ' + str(e)
|
17
app/main.py
17
app/main.py
@ -1,7 +1,20 @@
|
|||||||
from fastapi import FastAPI
|
from fastapi import FastAPI, File, UploadFile
|
||||||
|
from fastapi.responses import HTMLResponse
|
||||||
|
|
||||||
|
from .epub import epub2html
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
@app.get('/')
def root():
    """Return a fixed greeting for the index route."""
    greeting = "Hello, World!"
    return greeting
|
||||||
|
|
||||||
|
@app.post('/uploadfile/')
async def create_upload_file(file: UploadFile = File(...)):
    """
    Convert an uploaded book to html and return it as an HTML response.

    The format is chosen by file extension (case-insensitive). Only epub
    is handled for now; every other extension yields an error message in
    the response body.
    """
    # Compare in lower case so that e.g. "BOOK.EPUB" is accepted too.
    filename = (file.filename or '').lower()

    if filename.endswith('.epub'):
        content = await epub2html(file.file)
    elif filename.endswith('.fb2'):
        # TODO: call the fb2 converter once it exists. The previous code
        # called fb22html, which is neither defined nor imported here, so
        # every .fb2 upload failed with a NameError.
        content = 'Error! Unsupported file type'
    else:
        content = 'Error! Unsupported file type'
    return HTMLResponse(content=content)
|
@ -1,3 +1,4 @@
|
|||||||
|
aiofiles==0.7.0
|
||||||
asgiref==3.4.0
|
asgiref==3.4.0
|
||||||
click==8.0.1
|
click==8.0.1
|
||||||
EbookLib==0.17.1
|
EbookLib==0.17.1
|
||||||
@ -5,6 +6,7 @@ fastapi==0.65.2
|
|||||||
h11==0.12.0
|
h11==0.12.0
|
||||||
lxml==4.6.3
|
lxml==4.6.3
|
||||||
pydantic==1.8.2
|
pydantic==1.8.2
|
||||||
|
python-multipart==0.0.5
|
||||||
six==1.16.0
|
six==1.16.0
|
||||||
starlette==0.14.2
|
starlette==0.14.2
|
||||||
typing-extensions==3.10.0.0
|
typing-extensions==3.10.0.0
|
||||||
|
Loading…
x
Reference in New Issue
Block a user