Added fb2 xml to html conversion

2021-07-02 03:10:03 +05:00
parent f717291823
commit 4eae488f45
3 changed files with 138 additions and 10 deletions
--- a/app/epub.py
+++ b/app/epub.py
@ -6,6 +6,7 @@ from ebooklib import epub

 from tempfile import SpooledTemporaryFile

+from .utils import Document_Tokens

 async def epub2html(file: SpooledTemporaryFile) -> str:

@ -27,7 +28,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
        return "Error! Wrong epub file format: " + str(e)


-async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
+async def epub_to_tokens(file: SpooledTemporaryFile) -> Document_Tokens:

    """
    Passes file content to EbookLib library and parses epub tokens into dict of the following format:
--- a/app/fb2.py
+++ b/app/fb2.py
@ -3,6 +3,8 @@ import xml.etree.ElementTree as ET
 from xml.etree.ElementTree import Element
 from typing import Optional

+from .utils import Document_Tokens
+

 namespaces = {
    "": "http://www.gribuser.ru/xml/fictionbook/2.0",
@ -20,22 +22,19 @@ async def fb22html(file: SpooledTemporaryFile) -> str:
    try:

        tokens = fb22tokens(file)
-        ...
-        # TODO: join tokens to HTML
-        html_content = ""
-        ...
+        html_content = fb2body2html(tokens)

-        print(tokens.keys())
        return html_content

    except Exception as e:
        return "Error! Wrong FB2 file format: " + str(e)


-def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
+def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:

    """
-    Parses fb2 file as xml document. It puts book metadata, its content and media to `tokens` dictionary and returns it.
+    Parses fb2 file as xml document.
+    It puts book metadata, its content and media to `tokens` dictionary and returns it.

    `tokens` format:

@ -46,7 +45,10 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    "\<asset_id\>": "\<base64_data\>" }
    """

-    tokens = {"metadata": {}, "content": b""}
+    tokens = {
+        "metadata": {},
+        "content": b"<root>",
+    }

    book = ET.parse(file)
    root = book.getroot()
@ -74,6 +76,8 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    for body in bodies:
        tokens["content"] += ET.tostring(body).replace(b"ns0:", b"")

+    tokens["content"] += b"</root>"
+
    # Reading assets

    for asset in assets:
@ -92,7 +96,11 @@ def get_author(author: Element) -> str:
    """

    res = []
-    for tag_name in ("first-name", "middle-name", "last-name"):
+    for tag_name in (
+        "first-name",
+        "middle-name",
+        "last-name",
+    ):
        el = author.find(tag_name, namespaces)
        if not el is None:
            res.append(el.text)
@ -112,3 +120,119 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]:

    if coverpage:
        return coverpage.find("image", namespaces).get(HREF)
+
+
+def fb2body2html(tokens: Document_Tokens) -> str:
+
+    """
+    Converts the fb2 bodies stored in `tokens["content"]` to html and joins them into one string.
+
+    `tokens["content"]` is parsed as xml and every `<body>` directly under its root element
+    is converted with `process_section`. `tokens` is passed along so that image
+    references can be resolved to their asset data.
+
+    Returns the html of all bodies, in document order, as `str`.
+    """
+
+    xml_root = ET.fromstring(tokens["content"])
+
+    # `process_section` produces bytes: collect the pieces and join them once
+    # instead of growing the result with repeated `+=`
+    html = b"".join(
+        process_section(body, tokens) for body in xml_root.iterfind("./body")
+    )
+
+    # The signature (and `fb22html`, which returns this value) promises `str`,
+    # so decode here instead of handing bytes to the caller
+    return html.decode("utf-8")
+
+
+def process_section(body: Element, tokens: Document_Tokens) -> bytes:
+
+    """
+    Converts a `<body>`, or a `<section>` that holds nested sections, to an html `<section>`.
+
+    Output order: children of the first title, epigraph and annotation, then the
+    optional image, then every direct child `<section>`. A child that itself holds
+    nested sections is processed recursively, any other child is converted as plain content.
+    The converted elements are modified in place.
+
+    Returns the html as bytes.
+    """
+
+    parts = [b"<section>\n"]
+
+    for tag in ("title", "epigraph", "annotation"):
+        el = body.find("./" + tag)
+        # Compare with None: an Element without children is falsy,
+        # so a plain truth test does not tell "missing" from "empty"
+        if el is not None:
+            process_content(el, tokens)
+            parts.append(children_to_html(el))
+
+    image = body.find("./image")
+    # An <image /> has no children, so `if image:` would always skip it
+    if image is not None:
+        process_image(image, tokens)
+        parts.append(ET.tostring(image))
+
+    for section in body.findall("./section"):
+        if section.find("./section") is not None:
+            parts.append(process_section(section, tokens))
+        else:
+            process_content(section, tokens)
+            parts.append(b"<section>\n" + children_to_html(section) + b"</section>\n")
+
+    parts.append(b"</section>\n")
+
+    return b"".join(parts)
+
+
+def children_to_html(root: Element) -> bytes:
+
+    """
+    Serializes every direct child of `root` and joins the results.
+
+    `root` itself is not part of the output. Returns bytes, because that is what
+    `ET.tostring` produces with its default encoding. The result is empty when
+    `root` has no children.
+    """
+
+    # a single join instead of repeated `+=` on a growing bytes object
+    return b"".join(ET.tostring(child) for child in root)
+
+
+def process_image(el: Element, tokens: Document_Tokens) -> None:
+
+    """
+    Converts fb2 <image /> to html <img /> in place.
+
+    The xlink:href attribute is removed and replaced with `src`:
+    a reference of the form "#<asset_id>" is replaced with `tokens["<asset_id>"]`,
+    any other reference is copied unchanged.
+    An element without a reference (or with an empty one) only gets renamed.
+
+    Raises KeyError when the referenced asset id is not present in `tokens`.
+    """
+
+    el.tag = "img"
+
+    # pop with a default: a missing xlink:href must not raise here
+    href = el.attrib.pop(HREF, None)
+    if not href:
+        return
+
+    # `startswith` is also safe for a one-character reference
+    el.set("src", tokens[href[1:]] if href.startswith("#") else href)
+
+
+# fb2 tags that are renamed to an html tag without any other change
+tag_replacement = {
+    "empty-line": "br",
+    "emphasis": "em",
+    "strikethrough": "strike",
+    "v": "p",
+}
+
+# fb2 tags that become a generic html tag; `process_content` stores the original
+# fb2 tag name in the `class` attribute so the element can still be styled
+tag_with_class = {
+    "subtitle": "p",
+    "cite": "div",
+    "poem": "div",
+    "stanza": "div",
+    "epigraph": "div",
+    "text-author": "p",
+}
+
+
+def process_content(root: Element, tokens: Document_Tokens) -> None:
+
+    """
+    Converts the fb2 tags below `root` to html equivalents and my own styled elements.
+    Resolves binary data dependencies.
+
+    Works in place and depth first. `root` itself is not renamed, only its descendants:
+
+    - `<a>`: the xlink:href attribute is moved to a plain `href`
+    - `<image>`: converted with `process_image`
+    - tags listed in `tag_replacement`: renamed
+    - tags listed in `tag_with_class`: renamed, the fb2 name is kept in `class`
+
+    Any other tag is left unchanged.
+    """
+
+    for child in root:
+        # descendants first, then the child itself
+        process_content(child, tokens)
+
+        if child.tag == "a":
+            # a link without a target is left as it is instead of raising KeyError
+            href = child.attrib.pop(HREF, None)
+            if href is not None:
+                child.set("href", href)
+
+        elif child.tag == "image":
+            process_image(child, tokens)
+
+        elif child.tag in tag_replacement:
+            child.tag = tag_replacement[child.tag]
+
+        elif child.tag in tag_with_class:
+            # set the class before renaming, `child.tag` still holds the fb2 name here
+            child.set("class", child.tag)
+            child.tag = tag_with_class[child.tag]
--- a/app/utils.py
+++ b/app/utils.py
@ -0,0 +1,3 @@
+from typing import Union
+
+# Shape of the `tokens` dictionary built by the parsers: "metadata" maps to a dict,
+# "content" holds the serialized document (fb2 builds it as bytes),
+# every other key is an asset id mapped to its data
+Document_Tokens = dict[str, Union[str, bytes, dict[str, str]]]