Added fb2 file parsing to tokens

2021-07-01 19:38:30 +05:00
parent 52057c0cd5
commit d8529740fd
3 changed files with 117 additions and 4 deletions
--- a/app/epub.py
+++ b/app/epub.py
@ -14,7 +14,6 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
    """

    try:
-
        tokens = await epub_to_tokens(file)
        ...
        # TODO: join tokens to HTML
@ -29,14 +28,14 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
 async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:

    """
-    Passes file content to ebooklib library and parses epub tokens into dict of the following format:
+    Passes file content to EbookLib library and parses epub tokens into dict of the following format:

-    "\<file_name\>": "\<file_content\>"
+    { "\<file_name\>": "\<file_content\>" }

    Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html
    """

-    tokens = {"metadata": {"test"}}
+    tokens = {"metadata": {"test": "t"}}

    async with aiof.tempfile.NamedTemporaryFile() as tmp:
        await tmp.write(file.read())
@ -49,6 +48,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
            content = item.get_content()

            if item_type == ebooklib.ITEM_DOCUMENT:
+                # Adding book chapters to tokens list
                tokens[name] = content

            elif item_type in (
@ -58,6 +58,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
                ebooklib.ITEM_VIDEO,
                ebooklib.ITEM_VECTOR,
            ):
+                # Adding assets to tokens list
                media_type = item.media_type
                b64_content = b64encode(content).decode()

--- a/app/fb2.py
+++ b/app/fb2.py
@ -0,0 +1,111 @@
+from tempfile import SpooledTemporaryFile
+import xml.etree.ElementTree as ET
+from xml.etree.ElementTree import Element
+from typing import Optional
+
+
+# Namespace URIs that appear in the fb2 documents handled by this module.
+_FB2_NS = "http://www.gribuser.ru/xml/fictionbook/2.0"
+_XLINK_NS = "http://www.w3.org/1999/xlink"
+
+# Prefix map passed to Element.find()/findall(); the "" entry is the default
+# namespace that ElementTree applies to unprefixed tag names.
+namespaces = {"": _FB2_NS, "xlink": _XLINK_NS}
+
+# Fully qualified ("{uri}local") name of the xlink:href attribute.
+HREF = "{" + _XLINK_NS + "}href"
+
+
+async def fb22html(file: SpooledTemporaryFile) -> str:
+
+    """
+    Converts an uploaded fb2 file to a single html document.
+
+    The file is tokenized with `fb22tokens`. Joining the tokens into html is
+    not implemented yet, so an empty string is returned on success. If
+    tokenizing raises, the exception text is returned inside an error message
+    instead of being propagated to the caller.
+    """
+
+    # Only the tokenizer call can fail, so it is the only guarded statement.
+    try:
+        tokens = fb22tokens(file)
+    except Exception as err:
+        return f"Error! Wrong FB2 file format: {err}"
+
+    # TODO: join tokens to HTML
+    html_content = ""
+    return html_content
+
+
+def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
+
+    """
+    Parses fb2 file as xml document. It puts book metadata, its content and media to `tokens` dictionary and returns it.
+
+    `tokens` format:
+
+    { "metadata": { "title": ..., "author": ..., "cover": ... },
+
+    "content": serialized first body element (bytes, as returned by `ET.tostring`),
+
+    "asset_id": "data:content_type;base64,base64_data" }
+
+    Every key inside "metadata" is optional and is present only when the
+    matching element was found under `title-info`.
+
+    Raises `ValueError` when the document has no `description` or no `body`
+    element; errors raised by the xml parser itself are not caught here.
+    """
+
+    tokens = {"metadata": {}, "content": ""}
+
+    root = ET.parse(file).getroot()
+
+    # Sections are looked up by tag name rather than by position, so extra
+    # children of the root (for example a second body) are not mistaken
+    # for binary assets.
+    description = root.find("description", namespaces)
+    body = root.find("body", namespaces)
+    if description is None or body is None:
+        raise ValueError("description or body element is missing")
+
+    # Reading book metadata
+
+    # The namespace map is required here: without it the lookup searches for
+    # an un-namespaced tag and never matches.
+    book_info = description.find("title-info", namespaces)
+    if book_info is not None:
+        metadata = {}
+
+        title = book_info.find("book-title", namespaces)
+        if title is not None and title.text:
+            metadata["title"] = title.text
+
+        author = book_info.find("author", namespaces)
+        if author is not None:
+            metadata["author"] = get_author(author)
+
+        cover = get_cover(book_info.find("coverpage", namespaces))
+        if cover is not None:
+            metadata["cover"] = cover
+
+        tokens["metadata"] = metadata
+
+    # Reading book content
+
+    # ElementTree serializes namespaced tags with a generated "ns0:" prefix;
+    # it is removed so the markup uses the bare fb2 tag names.
+    tokens["content"] = ET.tostring(body).replace(b"ns0:", b"")
+
+    # Reading assets
+
+    for asset in root.findall("binary", namespaces):
+        key = asset.get("id")
+        if key is None:
+            # Without an id the asset cannot be referenced from the content.
+            continue
+        media_type = asset.get("content-type")
+        # The payload may be empty or wrapped over several lines; drop all
+        # whitespace so the data URI is one continuous base64 string.
+        b64_content = "".join((asset.text or "").split())
+        tokens[key] = f"data:{media_type};base64,{b64_content}"
+
+    return tokens
+
+
+def get_author(author: Element) -> str:
+
+    """
+    Converts author xml structure to string
+
+    The non-empty `first-name`, `middle-name` and `last-name` values are
+    joined with single spaces. When none of them carries text, the `nickname`
+    text is used instead. An empty string is returned when the element holds
+    no usable name at all.
+    """
+
+    parts = []
+    for tag_name in ("first-name", "middle-name", "last-name"):
+        el = author.find(tag_name, namespaces)
+        # An element without text has text == None, which str.join rejects.
+        if el is not None and el.text and el.text.strip():
+            parts.append(el.text.strip())
+
+    if parts:
+        return " ".join(parts)
+
+    nickname = author.find("nickname", namespaces)
+    if nickname is not None and nickname.text:
+        return nickname.text.strip()
+
+    return ""
+
+
+def get_cover(coverpage: Optional[Element]) -> Optional[str]:
+
+    """
+    Extracts cover image reference if exists
+
+    Returns the value of the `xlink:href` attribute of the first `image`
+    element inside `coverpage`, or `None` when `coverpage` is `None`, has no
+    `image` child, or the image has no such attribute.
+
+    NOTE(review): the attribute value is returned unchanged; if it is a
+    "#id" style fragment, the leading "#" is kept - confirm what the code
+    that joins tokens to html expects.
+    """
+
+    # Compare with None explicitly: the truth value of an Element depends on
+    # whether it has children, not on whether it exists.
+    if coverpage is None:
+        return None
+
+    image = coverpage.find("image", namespaces)
+    if image is None:
+        return None
+
+    return image.get(HREF)
--- a/app/main.py
+++ b/app/main.py
@ -2,6 +2,7 @@ from fastapi import FastAPI, File, UploadFile
 from fastapi.responses import HTMLResponse

 from .epub import epub2html
+from .fb2 import fb22html

 app = FastAPI()