Added fb2 xml to html convertion

2021-07-02 03:10:03 +05:00
parent f717291823
commit 4eae488f45
3 changed files with 138 additions and 10 deletions
--- a/app/epub.py
+++ b/app/epub.py
@ -6,6 +6,7 @@ from ebooklib import epub
 from tempfile import SpooledTemporaryFile
 from .utils import Document_Tokens
 async def epub2html(file: SpooledTemporaryFile) -> str:
@ -27,7 +28,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
        return "Error! Wrong epub file format: " + str(e)
-async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
+async def epub_to_tokens(file: SpooledTemporaryFile) -> Document_Tokens:
    """
    Passes file content to EbookLib library and parses epub tokens into dict of the following format:
--- a/app/fb2.py
+++ b/app/fb2.py
@ -3,6 +3,8 @@ import xml.etree.ElementTree as ET
 from xml.etree.ElementTree import Element
 from typing import Optional
 from .utils import Document_Tokens
 namespaces = {
    "": "http://www.gribuser.ru/xml/fictionbook/2.0",
@ -20,22 +22,19 @@ async def fb22html(file: SpooledTemporaryFile) -> str:
    try:
        tokens = fb22tokens(file)
-        ...
+        html_content = fb2body2html(tokens)
        # TODO: join tokens to HTML
        html_content = ""
        ...
        print(tokens.keys())
        return html_content
    except Exception as e:
        return "Error! Wrong FB2 file format: " + str(e)
-def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
+def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
    """
-    Parses fb2 file as xml document. It puts book metadata, its content and media to `tokens` dictionary and returns it.
+    Parses fb2 file as xml document.
    It puts book metadata, its content and media to `tokens` dictionary and returns it.
    `tokens` format:
@ -46,7 +45,10 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    "\<asset_id\>": "\<base64_data\>" }
    """
-    tokens = {"metadata": {}, "content": b""}
+    tokens = {
        "metadata": {},
        "content": b"<root>",
    }
    book = ET.parse(file)
    root = book.getroot()
@ -74,6 +76,8 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    for body in bodies:
        tokens["content"] += ET.tostring(body).replace(b"ns0:", b"")
    tokens["content"] += b"</root>"
    # Reading assets
    for asset in assets:
@ -92,7 +96,11 @@ def get_author(author: Element) -> str:
    """
    res = []
-    for tag_name in ("first-name", "middle-name", "last-name"):
+    for tag_name in (
        "first-name",
        "middle-name",
        "last-name",
    ):
        el = author.find(tag_name, namespaces)
        if not el is None:
            res.append(el.text)
@ -112,3 +120,119 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]:
    if coverpage:
        return coverpage.find("image", namespaces).get(HREF)
 def fb2body2html(tokens: Document_Tokens) -> str:
    """
    Convert fb2 xml to html, joins bodies into one string
    """
    res = b""
    xml_root = ET.fromstring(tokens["content"])
    for body in xml_root.iterfind("./body"):
        res += process_section(body, tokens)
    return res
 def process_section(body: Element, tokens: Document_Tokens) -> str:
    """
    Processes individual sections, recursively goes throw sections tree
    """
    res = b"<section>\n"
    for tag in ("title", "epigraph", "annotation"):
        el = body.find("./" + tag)
        if el:
            process_content(el, tokens)
            res += children_to_html(el)
    image = body.find("./image")
    if image:
        process_image(image, tokens)
        res += ET.tostring(image)
    for section in body.findall("./section"):
        if section.find("./section"):
            res += process_section(section, tokens)
        else:
            process_content(section, tokens)
            res += b"<section>\n" + children_to_html(section) + b"</section>\n"
    return res + b"</section>\n"
 def children_to_html(root: Element) -> str:
    """
    Converts xml tag children to string
    """
    res = b""
    for child in root:
        res += ET.tostring(child)
    return res
 def process_image(el: Element, tokens: Document_Tokens) -> None:
    """
    Converts fb2 \<image /\> to html \<img /\>. Replaces xlink:href with src="\<base64_image_data\>"
    """
    el.tag = "img"
    href = el.get(HREF)
    el.attrib.pop(HREF)
    el.set("src", tokens[href[1:]] if href[0] == "#" else href)
 tag_replacement = {
    "empty-line": "br",
    "emphasis": "em",
    "strikethrough": "strike",
    "v": "p",
 }
 tag_with_class = {
    "subtitle": "p",
    "cite": "div",
    "poem": "div",
    "stanza": "div",
    "poem": "div",
    "poem": "div",
    "epigraph": "div",
    "text-author": "p",
 }
 def process_content(root: Element, tokens: Document_Tokens) -> None:
    """
    Converts fb2 xml tag names to html equivalents and my own styled elements.
    Resolves binary data dependencies
    """
    for child in root:
        process_content(child, tokens)
        if child.tag == "a":
            href = child.get(HREF)
            child.attrib.pop(HREF)
            child.set("href", href)
        if child.tag == "image":
            process_image(child, tokens)
        elif child.tag in tag_replacement.keys():
            child.tag = tag_replacement[child.tag]
        elif child.tag in tag_with_class.keys():
            child.set("class", child.tag)
            child.tag = tag_with_class[child.tag]
--- a/app/utils.py
+++ b/app/utils.py
@ -0,0 +1,3 @@
 from typing import Union
 Document_Tokens = dict[str, Union[str, dict[str, str]]]
		`@ -0,0 +1,3 @@`
							`from typing import Union`

							`Document_Tokens = dict[str, Union[str, dict[str, str]]]`