Added fb2 file parsing to tokens

2021-07-01 19:38:30 +05:00
parent 52057c0cd5
commit d8529740fd
3 changed files with 117 additions and 4 deletions
--- a/app/epub.py
+++ b/app/epub.py
@@ -14,7 +14,6 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
    """
    try:
        tokens = await epub_to_tokens(file)
        ...
        # TODO: join tokens to HTML
@@ -29,14 +28,14 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
 async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    """
-    Passes file content to ebooklib library and parses epub tokens into dict of the following format:
+    Passes file content to EbookLib library and parses epub tokens into dict of the following format:
-    "\<file_name\>": "\<file_content\>"
+    { "\<file_name\>": "\<file_content\>" }
    Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embedding in html
    """
-    tokens = {"metadata": {"test"}}
+    tokens = {"metadata": {"test": "t"}}
    async with aiof.tempfile.NamedTemporaryFile() as tmp:
        await tmp.write(file.read())
@@ -49,6 +48,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
            content = item.get_content()
            if item_type == ebooklib.ITEM_DOCUMENT:
                # Adding book chapters to tokens list
                tokens[name] = content
            elif item_type in (
@@ -58,6 +58,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
                ebooklib.ITEM_VIDEO,
                ebooklib.ITEM_VECTOR,
            ):
                # Adding assets to tokens list
                media_type = item.media_type
                b64_content = b64encode(content).decode()
--- a/app/fb2.py
+++ b/app/fb2.py
@@ -0,0 +1,111 @@
 from tempfile import SpooledTemporaryFile
 import xml.etree.ElementTree as ET
 from xml.etree.ElementTree import Element
 from typing import Optional
 # Prefix -> namespace URI map passed to ElementTree's find()/findall().
 # The empty prefix is the default namespace, so un-prefixed tag names used in
 # lookups are resolved inside the FictionBook 2.0 namespace.
 namespaces = {
    "": "http://www.gribuser.ru/xml/fictionbook/2.0",
    "xlink": "http://www.w3.org/1999/xlink",
 }
 # Fully qualified ("{uri}name") form of the xlink `href` attribute name
 HREF = "{" + namespaces["xlink"] + "}href"
 async def fb22html(file: SpooledTemporaryFile) -> str:
    """
    Converts an fb2 file to a single html document

    The file is first split into tokens by `fb22tokens`. Joining the tokens
    into html is not implemented yet, so an empty string is returned on success.
    Any exception raised on the way is turned into an error message string
    instead of being propagated.
    """
    try:
        tokens = fb22tokens(file)
        # TODO: join tokens to HTML
        html_content = ""
    except Exception as err:
        return f"Error! Wrong FB2 file format: {err}"
    return html_content
 def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    """
    Parses fb2 file as xml document. It puts book metadata, its content and media to `tokens` dictionary and returns it.

    `tokens` format:

    { "metadata": { "title": ..., "author": ..., "cover": ... },
    "content": "<body xml as string>",
    "<asset_id>": "data:<content-type>;base64,<base64_data>" }

    Metadata keys are only present when the matching element exists in the
    book's `title-info`; "metadata" stays an empty dict otherwise.

    :param file: file object holding the fb2 (xml) document
    :return: tokens dictionary described above
    :raises xml.etree.ElementTree.ParseError: if the file is not well-formed xml
    :raises ValueError: if the root element has fewer than two children
    """
    tokens = {"metadata": {}, "content": ""}

    # NOTE(review): the stdlib ElementTree parser is documented as not secure
    # against maliciously constructed data; the file comes from an upload.
    book = ET.parse(file)
    # The root is expected to hold the description, the body and then the
    # binary assets, in that order
    description, body, *assets = book.getroot()
    description: Element
    body: Element
    assets: list[Element]

    # Reading book metadata
    # The namespace map is required here: without it the lookup never matches
    # an element of a namespaced fb2 document
    book_info = description.find("title-info", namespaces)
    # Explicit None check: an Element's truth value depends on its children
    if book_info is not None:
        metadata = {}

        title = book_info.find("book-title", namespaces)
        if title is not None and title.text:
            metadata["title"] = title.text

        author = book_info.find("author", namespaces)
        if author is not None:
            metadata["author"] = get_author(author)

        cover = get_cover(book_info.find("coverpage", namespaces))
        if cover is not None:
            metadata["cover"] = cover

        if metadata:
            tokens["metadata"] = metadata

    # Reading book content
    # encoding="unicode" returns str (not bytes) and keeps non-ASCII text as is;
    # the generated "ns0:" prefix of the default namespace is dropped from tags
    tokens["content"] = ET.tostring(body, encoding="unicode").replace("ns0:", "")

    # Reading assets
    for asset in assets:
        key = asset.get("id")
        media_type = asset.get("content-type")
        # Base64 payload may be empty or wrapped over several lines;
        # whitespace carries no data, so it is removed before embedding
        b64_content = "".join((asset.text or "").split())
        tokens[key] = f"data:{media_type};base64,{b64_content}"

    return tokens
 def get_author(author: Optional[Element]) -> str:
    """
    Converts author xml structure to string

    First, middle and last name (those present and non-empty) are joined with
    single spaces. If none of them is available, the nickname is used instead.

    :param author: `author` element, or None when there is no such element
    :return: author name, or an empty string when no name information is found
    """
    if author is None:
        return ""

    parts = []
    for tag_name in ("first-name", "middle-name", "last-name"):
        el = author.find(tag_name, namespaces)
        # An empty tag (e.g. <middle-name/>) has text None, which must not reach join()
        if el is not None and el.text and el.text.strip():
            parts.append(el.text.strip())
    if parts:
        return " ".join(parts)

    # No name parts: fall back to nickname, which may be missing as well
    nickname = author.find("nickname", namespaces)
    if nickname is not None and nickname.text:
        return nickname.text.strip()
    return ""
 def get_cover(coverpage: Optional[Element]) -> Optional[str]:
    """
    Extracts cover image id if exists

    :param coverpage: `coverpage` element, or None when the book has none
    :return: value of the xlink `href` attribute of the first `image` child;
        None when the coverpage, the image or the attribute is missing
    """
    # Explicit None checks: an Element's truth value depends on its children,
    # not on whether the element exists
    if coverpage is None:
        return None
    image = coverpage.find("image", namespaces)
    if image is None:
        return None
    # NOTE(review): fb2 hrefs are usually written as "#<binary id>"; confirm
    # whether callers expect the leading "#" to be kept
    return image.get(HREF)
--- a/app/main.py
+++ b/app/main.py
@@ -2,6 +2,7 @@ from fastapi import FastAPI, File, UploadFile
 from fastapi.responses import HTMLResponse
 from .epub import epub2html
 from .fb2 import fb22html
 # FastAPI application instance of this module
 app = FastAPI()