Added epub file processing: currently epub file tokenization is ready

2021-06-29 17:59:27 +05:00
parent 54e1dca653
commit c24a26ab29
4 changed files with 79 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,3 @@
-.venv
+.venv
 __pycache__/
 .vscode
--- a/app/epub.py
+++ b/app/epub.py
@ -0,0 +1,59 @@
 import aiofiles as aiof
 from base64 import b64encode
 import ebooklib
 from ebooklib import epub
 from tempfile import SpooledTemporaryFile
 async def epub2html(file: SpooledTemporaryFile):
    """
    Splits epub to tokens and joins them to one html file.

    `file` is the uploaded epub as a file-like object; it is passed on to
    epub_to_tokens unchanged.

    Returns the html content as a string. Joining the tokens is not implemented
    yet, so a successfully parsed book currently produces an empty string.
    If tokenization fails, the error message returned by epub_to_tokens is
    passed back to the caller instead of being dropped.
    """
    tokens = await epub_to_tokens(file)
    # epub_to_tokens signals failure by returning an error message string
    # instead of a dict; surface it rather than answering with an empty page
    if isinstance(tokens, str):
        return tokens
    # TODO: join tokens to HTML
    html_content = ""
    return html_content
 async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    """
    Passes file content to ebooklib library and parses epub tokens into dict of the following format:
    "<file_name>": "<file_content>"
    Where file content is either plain text for xhtml documents or a base64 encoded
    data URI for covers, images, styles and videos, prepared for embedding to html.
    Items of any other type are skipped.

    If the book cannot be read or parsed, no exception is raised: an error message
    string starting with 'Error! Wrong epub file format: ' is returned instead of the dict.
    """
    # Item types that are embedded as "data:<media_type>;base64,<payload>" URIs
    embedded_types = (ebooklib.ITEM_COVER, ebooklib.ITEM_IMAGE, ebooklib.ITEM_STYLE, ebooklib.ITEM_VIDEO)
    async with aiof.tempfile.NamedTemporaryFile() as tmp:
        # read_epub is given a path below, so the upload is copied to a named temporary file
        await tmp.write(file.read())
        # Make sure the buffered bytes are on disk before the file is reopened by name
        await tmp.flush()
        try:
            book = epub.read_epub(tmp.name)
            tokens = {}
            for item in book.get_items():
                item_type = item.get_type()
                if item_type in embedded_types:
                    b64_content = b64encode(item.get_content()).decode()
                    tokens[item.get_name()] = f'data:{item.media_type};base64,{b64_content}'
                elif item_type == ebooklib.ITEM_DOCUMENT:
                    content = item.get_content()
                    # Document content may come back as bytes; decode it so that
                    # the value is plain text as promised above
                    if isinstance(content, bytes):
                        content = content.decode('utf-8')
                    tokens[item.get_name()] = content
            return tokens
        except Exception as e:
            # Deliberately broad: any failure while reading the book is reported
            # to the caller as a message rather than propagated
            return 'Error! Wrong epub file format: ' + str(e)
--- a/app/main.py
+++ b/app/main.py
@ -1,7 +1,20 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, File, UploadFile
 from fastapi.responses import HTMLResponse
 from .epub import epub2html
 app = FastAPI()
# Handler for GET '/': returns the fixed greeting string "Hello, World!".
@app.get('/')
 def root():
-    return "Hello, World!"
+    return "Hello, World!"
@app.post('/uploadfile/', )
 async def create_upload_file(file: UploadFile = File(...)):
    if file.filename.endswith('.epub'):
        content = await epub2html(file.file)
    elif file.filename.endswith('.fb2'):
        content = await fb22html(file.file)
    else:
        content = 'Error! Unsupported file type'
    return HTMLResponse(content=content)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,4 @@
 aiofiles==0.7.0
 asgiref==3.4.0
 click==8.0.1
 EbookLib==0.17.1
@ -5,6 +6,7 @@ fastapi==0.65.2
 h11==0.12.0
 lxml==4.6.3
 pydantic==1.8.2
 python-multipart==0.0.5
 six==1.16.0
 starlette==0.14.2
 typing-extensions==3.10.0.0