Finished epub to html conversion functionality, fixed all pylint errors

Dmitriy Shishkov 2021-07-07 17:35:31 +05:00
parent 2f4a683cb4
commit 5155790357
4 changed files with 283 additions and 65 deletions

View File

@ -1,13 +1,25 @@
import aiofiles as aiof
from base64 import b64encode
from fastapi import HTTPException
"""
Module for EPUB file conversion to html
"""
from base64 import b64encode
from functools import cache
import html
import os
from tempfile import SpooledTemporaryFile
import aiofiles as aiof
from fastapi import HTTPException
from lxml import etree
import ebooklib
from ebooklib import epub
from tempfile import SpooledTemporaryFile
from .utils import DocumentTokens, strip_whitespace, HTMLBook
from .utils import Document_Tokens, strip_whitespace, HTMLBook
parser = etree.XMLParser(recover=True)
IMAGE = "{http://www.w3.org/2000/svg}image"
HREF = "{http://www.w3.org/1999/xlink}href"
async def epub2html(file: SpooledTemporaryFile) -> HTMLBook:
@ -22,24 +34,29 @@ async def epub2html(file: SpooledTemporaryFile) -> HTMLBook:
html_content = epub_tokens2html(spine, tokens)
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
return {
**(tokens["metadata"]),
"content": html_content,
}
except Exception as e:
except Exception as err:
raise HTTPException(
status_code=500, detail="Error! Wrong epub file format: " + str(e)
)
status_code=500, detail="Error! Wrong epub file format: " + str(err)
) from err
async def epub_to_tokens(
file: SpooledTemporaryFile,
) -> tuple[Document_Tokens, list[tuple[str, str]]]:
) -> tuple[DocumentTokens, list[tuple[str, str]]]:
"""
Passes file content to EbookLib library and parses epub tokens into dict of the following format:
r"""
Passes file content to EbookLib library and parses epub tokens into dict of
the following format:
{ "\<file_name\>": "\<file_content\>" }
Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html
Where file content is either plain text for xhtml or base64 encoded data
for other formats, prepared for embedding to html
"""
tokens = {}
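For orientation, a sketch of the structure this function builds; the file names, values, and the exact data-URL prefix for binary assets are hypothetical, not taken from a real book:
example_tokens = {
    "metadata": {"title": "Example Book", "author": "Jane Doe"},
    "toc": {"ch1": "OEBPS/ch1.xhtml"},
    "OEBPS/ch1.xhtml": b"<body><p>Chapter text</p></body>",
    # binary assets are stored base64-encoded, ready to embed into html
    "OEBPS/img/cover.jpg": "data:image/jpeg;base64,/9j/AAAA",
}
# the second return value is the epub spine, e.g. [("ch1", "yes")]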
@ -61,19 +78,20 @@ async def epub_to_tokens(
item: epub.EpubItem
item_type = item.get_type()
file_path = reader.opf_dir + "/" + item.get_name()
file_path = os.path.join(reader.opf_dir, item.get_name())
if item_type == ebooklib.ITEM_DOCUMENT:
# Adding book chapters to tokens list
name = item.get_id()
tokens[file_path] = item.get_body_content()
tokens[file_path] = strip_whitespace(item.get_body_content())
tokens["toc"][name] = file_path
elif item_type in (
ebooklib.ITEM_AUDIO,
ebooklib.ITEM_COVER,
ebooklib.ITEM_IMAGE,
ebooklib.ITEM_VIDEO,
ebooklib.ITEM_VECTOR,
ebooklib.ITEM_VIDEO,
):
# Adding assets to tokens list
@ -89,7 +107,12 @@ async def epub_to_tokens(
return tokens, book.spine.copy()
def read_metadata(book: epub.EpubBook):
def read_metadata(book: epub.EpubBook) -> dict[str, str]:
"""
Reads metadata from xml to dict
"""
metadata = {}
metadata["title"] = book.get_metadata("DC", "title")[0][0]
metadata["author"] = convert_list(book.get_metadata("DC", "creator"))
@ -97,7 +120,12 @@ def read_metadata(book: epub.EpubBook):
return metadata.copy()
def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
def convert_list(titles_list: list[tuple[str, dict[str, str]]]) -> str:
"""
Joins titles list to one string
"""
res = []
for title_obj in titles_list:
res.append(title_obj[0])
@ -105,23 +133,156 @@ def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
return "; ".join(res)
def set_cover(tokens: Document_Tokens):
def set_cover(tokens: DocumentTokens) -> None:
"""
Converts cover file name to base64 image stored in `tokens`
"""
cover_name = tokens["metadata"].get("cover")
if cover_name in tokens.keys():
tokens["metadata"]["cover"] = tokens[cover_name]
def epub_tokens2html(spine: list[tuple[str, str]], tokens: Document_Tokens):
res = b""
def epub_tokens2html(spine: list[tuple[str, str]], tokens: DocumentTokens) -> str:
"""
Joins chapters in `spine` into one html string
"""
res = ""
for name, _ in spine:
file_path = tokens["toc"].get(name)
if file_path:
res += process_xhtml(file_path, tokens)
return html.escape(html.unescape(res))
def process_xhtml(path: str, tokens: DocumentTokens) -> str:
"""
Processes content of one xml body
"""
xml: etree.Element = etree.fromstring(tokens[path], parser=parser)
if xml.tag == "body":
xml.tag = "div"
process_content(xml, path, tokens)
return (
f'<section id="b_{path_to_name(path)}">{etree.tostring(xml).decode()}</section>'
)
def process_content(node: etree.Element, path: str, tokens: DocumentTokens) -> None:
"""
Recursive function for xml element conversion to valid html
"""
# Process universal tags
if node.get("epub:type"):
node.attrib.pop("epub:type")
el_id = node.get("id")
if el_id:
node.set("id", f"{path_to_name(path)}_{el_id}")
# Tag processing
if node.tag == "a":
process_a_element(node, path)
elif node.tag == "hgroup":
node.tag = "div"
elif node.tag in ("img", "source", "video", "audio"):
process_media_element(node, path, tokens)
elif node.tag == IMAGE:
href = node.get(HREF)
media_path = rel_to_abs_path(path, href)
if media_path in tokens.keys():
node.set(HREF, tokens[media_path])
elif node.tag == "trigger":
node.getparent().remove(node)
# Recursively run for all children
for child in node:
process_content(child, path, tokens)
def process_a_element(node: etree.Element, path: str):
r"""
Converts file links to in-page ids in \<a\> elements
"""
href = node.get("href")
if href.count(".xhtml") or href.count(".html"):
id_pos = href.rfind("#")
if id_pos != -1:
href_path, el_id = href[:id_pos], href[id_pos:]
node.set("href", f"#{path_to_name(href_path)}_{el_id[1:]}")
else:
node.set("href", f"#b_{path_to_name(href)}")
elif href.count("#"):
node.set("href", f"#{path_to_name(path)}_{href[1:]}")
def process_media_element(node: etree.Element, path: str, tokens: DocumentTokens):
"""
Replaces file paths with base64 encoded media in `src` and `srcset` attributes
"""
src = node.get("src")
attr = "src"
if not src:
src = node.get("srcset")
attr = "srcset"
if src:
media_path = rel_to_abs_path(path, src)
if media_path in tokens.keys():
node.set(attr, tokens[media_path])
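A sketch of what happens for a typical image reference (paths hypothetical):
# inside OEBPS/text/ch1.xhtml
#   <img src="../img/pic.png">
# resolves to "OEBPS/img/pic.png"; if that key is present in tokens,
# src is replaced with the stored base64 data for the image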
def rel_to_abs_path(parent: str, rel: str):
"""
Helper for converting a relative media path to an absolute one
"""
return os.path.normpath(os.path.join(os.path.dirname(parent), rel))
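With hypothetical paths, for instance:
# rel_to_abs_path("OEBPS/text/ch1.xhtml", "../img/cover.jpg") == "OEBPS/img/cover.jpg"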
@cache
def path_to_name(path: str) -> str:
"""
Helper function for getting file name
"""
return os.path.basename(path).split(".")[0]
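For example:
# path_to_name("OEBPS/text/ch1.xhtml") == "ch1"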
def children_to_html(root: etree.Element) -> bytes:
"""
Converts all xml children of element to string and joins them
"""
res = b""
for child in root:
res += etree.tostring(child)
return res
def process_xhtml(path: str, tokens: Document_Tokens):
# TODO: Add xhtml procession
return tokens[path]

View File

@ -1,10 +1,16 @@
"""
Module for FB2 file conversion to html
"""
from tempfile import SpooledTemporaryFile
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element
from typing import Optional
import html
from fastapi import HTTPException
from .utils import Document_Tokens, strip_whitespace, HTMLBook
from .utils import DocumentTokens, strip_whitespace, HTMLBook
namespaces = {
@ -25,17 +31,20 @@ async def fb22html(file: SpooledTemporaryFile) -> HTMLBook:
set_cover(tokens)
html_content = fb2body2html(tokens)
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
return {
**(tokens["metadata"]),
"content": html.escape(html.unescape(html_content.decode())),
}
except Exception as e:
except Exception as err:
raise HTTPException(
status_code=500, detail="Error! Wrong fb2 file format: " + str(e)
)
status_code=500, detail="Error! Wrong fb2 file format: " + str(err)
) from err
def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
def fb22tokens(file: SpooledTemporaryFile) -> DocumentTokens:
"""
r"""
Parses fb2 file as xml document.
It puts book metadata, its content and media into the `tokens` dictionary and returns it.
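Roughly, the resulting dict looks like this; keys and values are hypothetical, and the exact data-URL prefix for binaries is an assumption:
# {
#     "metadata": {"title": "Example Book", "author": "Jane Doe", "cover": "#cover.jpg"},
#     "content": b"<FictionBook>...</FictionBook>",
#     "cover.jpg": "data:image/jpeg;base64,/9j/AAAA",
# }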
@ -74,7 +83,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
if "cover" not in metadata.keys():
metadata.pop("cover")
if len(metadata.keys()):
if len(metadata.keys()) != 0:
tokens["metadata"] = metadata.copy()
# Reading book content
@ -107,9 +116,9 @@ def get_author(author: Element) -> str:
"middle-name",
"last-name",
):
el = author.find("./" + tag_name, namespaces)
if el is not None:
res.append(el.text)
tag = author.find("./" + tag_name, namespaces)
if tag is not None:
res.append(tag.text)
if len(res) == 0:
res = author.find("./nickname", namespaces).text
else:
@ -127,8 +136,11 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]:
if coverpage:
return coverpage.find("./image", namespaces).get(HREF)
return None
def set_cover(tokens: Document_Tokens) -> None:
def set_cover(tokens: DocumentTokens) -> None:
"""Gets cover from book and sets it in metadata"""
cover = tokens["metadata"]["cover"]
if cover is None:
tokens["metadata"]["cover"] = "none"
@ -136,7 +148,7 @@ def set_cover(tokens: Document_Tokens) -> None:
tokens["metadata"]["cover"] = tokens[cover[1:]]
def fb2body2html(tokens: Document_Tokens) -> str:
def fb2body2html(tokens: DocumentTokens) -> str:
"""
Converts fb2 xml to html, joining bodies into one string
@ -144,14 +156,14 @@ def fb2body2html(tokens: Document_Tokens) -> str:
res = b""
xml_root = ET.fromstring(tokens["content"])
xml_root = ET.fromstring(strip_whitespace(tokens["content"]))
for body in xml_root.iterfind("./body"):
res += process_section(body, tokens)
return res
def process_section(body: Element, tokens: Document_Tokens) -> str:
def process_section(body: Element, tokens: DocumentTokens) -> str:
"""
Processes individual sections, recursively walking through the section tree
@ -159,11 +171,11 @@ def process_section(body: Element, tokens: Document_Tokens) -> str:
res = b"<section>\n"
for tag in ("title", "epigraph", "annotation"):
el = body.find("./" + tag)
if el:
process_content(el, tokens)
res += children_to_html(el)
for tag_name in ("title", "epigraph", "annotation"):
tag = body.find("./" + tag_name)
if tag:
process_content(tag, tokens)
res += children_to_html(tag)
image = body.find("./image")
if image:
process_image(image, tokens)
@ -193,18 +205,18 @@ def children_to_html(root: Element) -> str:
return res
def process_image(el: Element, tokens: Document_Tokens) -> None:
def process_image(element: Element, tokens: DocumentTokens) -> None:
"""
r"""
Converts fb2 \<image /\> to html \<img /\>. Replaces xlink:href with src="\<base64_image_data\>"
"""
el.tag = "img"
element.tag = "img"
href = el.get(HREF)
el.attrib.pop(HREF)
href = element.get(HREF)
element.attrib.pop(HREF)
el.set("src", tokens[href[1:]] if href[0] == "#" else href)
element.set("src", tokens[href[1:]] if href[0] == "#" else href)
tag_replacement = {
@ -219,14 +231,12 @@ tag_with_class = {
"cite": "div",
"poem": "div",
"stanza": "div",
"poem": "div",
"poem": "div",
"epigraph": "div",
"text-author": "p",
}
def process_content(root: Element, tokens: Document_Tokens) -> None:
def process_content(root: Element, tokens: DocumentTokens) -> None:
"""
Converts fb2 xml tag names to html equivalents and my own styled elements.

View File

@ -1,19 +1,47 @@
"""Webserver for epub and fb2 files convertation to html"""
from datetime import datetime
from fastapi import FastAPI, File, UploadFile, HTTPException
from pydantic import BaseModel # pylint: disable=no-name-in-module
from .epub import epub2html
from .fb2 import fb22html
from .utils import HashedHTMLBook, add_hash
class DebugInfo(BaseModel): # pylint: disable=too-few-public-methods
"""Main handler return types"""
startup_time: str
app = FastAPI()
start_time = datetime.now()
@app.get("/")
@app.get("/", response_model=DebugInfo)
def root():
return "Hello, World!"
"""
Test if server is running.
Returns startup time
"""
return {"startup_time": start_time.isoformat()}
@app.post("/uploadfile/", response_model=HashedHTMLBook)
async def create_upload_file(file: UploadFile = File(...)):
"""
Main api handler:
Accepts files with fb2 and epub extensions
Returns HTTP 415 error if file has unsupported format
Else returns object with book metadata and its html
"""
if file.filename.endswith(".fb2"):
content = await fb22html(file.file)
elif file.filename.endswith(".epub"):

View File

@ -1,30 +1,44 @@
"""
Utils for publite_backend module
"""
from typing import Union, Optional
from pydantic import BaseModel
import re
from hashlib import sha256
Document_Tokens = dict[str, Union[str, dict[str, str]]]
from pydantic import BaseModel # pylint: disable=no-name-in-module
DocumentTokens = dict[str, Union[str, dict[str, str]]]
class HTMLBook(BaseModel):
class HTMLBook(BaseModel): # pylint: disable=too-few-public-methods
"""Transitional model for returned book data"""
title: str
author: str
cover: Optional[str]
content: str
class HashedHTMLBook(HTMLBook):
class HashedHTMLBook(HTMLBook): # pylint: disable=too-few-public-methods
"""Model for returned book data with content hash"""
hash: str
replacements = [
("&#13;", "\r"),
(">\s+?<", "><"),
("&#13;", ""),
("&#17;", ""),
(r">\s+?<", "><"),
]
def strip_whitespace(s: bytes) -> str:
res = s.decode()
def strip_whitespace(string: bytes) -> str:
"""Removes"""
res = string.decode()
for old, new in replacements:
res = re.sub(old, new, res)
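With the updated replacement list, for example:
# strip_whitespace(b"<p>one</p>&#13;\n   <p>two</p>") == "<p>one</p><p>two</p>"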
@ -33,6 +47,11 @@ def strip_whitespace(s: bytes) -> str:
def add_hash(content: HTMLBook) -> HashedHTMLBook:
"""
Adds hash of book content
"""
h_content: HashedHTMLBook = content.copy()
h_content["hash"] = sha256(content["content"].encode()).hexdigest()