From 515579035754f2d2033da41a9dd801f61168e3fa Mon Sep 17 00:00:00 2001
From: dm1sh <me@dmitriy.icu>
Date: Wed, 7 Jul 2021 17:35:31 +0500
Subject: [PATCH] Finished epub to html conversion functionality, fixed all
 pylint errors

---
 app/epub.py  | 213 ++++++++++++++++++++++++++++++++++++++++++++-------
 app/fb2.py   |  68 +++++++++-------
 app/main.py  |  32 +++++++-
 app/utils.py |  35 +++++++--
 4 files changed, 283 insertions(+), 65 deletions(-)

diff --git a/app/epub.py b/app/epub.py
index bce98ed..5fed27f 100644
--- a/app/epub.py
+++ b/app/epub.py
@@ -1,13 +1,25 @@
-import aiofiles as aiof
-from base64 import b64encode
-from fastapi import HTTPException
+"""
+Module for EPUB file conversion to html
+"""
 
+from base64 import b64encode
+from functools import cache
+import html
+import os
+from tempfile import SpooledTemporaryFile
+
+import aiofiles as aiof
+from fastapi import HTTPException
+from lxml import etree
 import ebooklib
 from ebooklib import epub
 
-from tempfile import SpooledTemporaryFile
+from .utils import DocumentTokens, strip_whitespace, HTMLBook
 
-from .utils import Document_Tokens, strip_whitespace, HTMLBook
+parser = etree.XMLParser(recover=True)
+
+IMAGE = "{http://www.w3.org/2000/svg}image"
+HREF = "{http://www.w3.org/1999/xlink}href"
 
 
 async def epub2html(file: SpooledTemporaryFile) -> HTMLBook:
@@ -22,24 +34,29 @@ async def epub2html(file: SpooledTemporaryFile) -> HTMLBook:
 
         html_content = epub_tokens2html(spine, tokens)
 
-        return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
+        return {
+            **(tokens["metadata"]),
+            "content": html_content,
+        }
 
-    except Exception as e:
+    except Exception as err:
         raise HTTPException(
-            status_code=500, detail="Error! Wrong epub file format: " + str(e)
-        )
+            status_code=500, detail="Error! Wrong epub file format: " + str(err)
+        ) from err
 
 
 async def epub_to_tokens(
     file: SpooledTemporaryFile,
-) -> tuple[Document_Tokens, list[tuple[str, str]]]:
+) -> tuple[DocumentTokens, list[tuple[str, str]]]:
 
-    """
-    Passes file content to EbookLib library and parses epub tokens into dict of the following format:
+    r"""
+    Passes file content to EbookLib library and parses epub tokens into dict of
+    the following format:
 
     { "\<file_name\>": "\<file_content\>" }
 
-    Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html
+    Where file content is either plain text for xhtml or base64 encoded data
+    for other formats, prepared for embedding into html
     """
 
     tokens = {}
@@ -61,19 +78,20 @@ async def epub_to_tokens(
             item: epub.EpubItem
 
             item_type = item.get_type()
-            file_path = reader.opf_dir + "/" + item.get_name()
+            file_path = os.path.join(reader.opf_dir, item.get_name())
 
             if item_type == ebooklib.ITEM_DOCUMENT:
                 # Adding book chapters to tokens list
                 name = item.get_id()
-                tokens[file_path] = item.get_body_content()
+                tokens[file_path] = strip_whitespace(item.get_body_content())
                 tokens["toc"][name] = file_path
 
             elif item_type in (
+                ebooklib.ITEM_AUDIO,
                 ebooklib.ITEM_COVER,
                 ebooklib.ITEM_IMAGE,
-                ebooklib.ITEM_VIDEO,
                 ebooklib.ITEM_VECTOR,
+                ebooklib.ITEM_VIDEO,
             ):
                 # Adding assets to tokens list
 
@@ -89,7 +107,12 @@ async def epub_to_tokens(
     return tokens, book.spine.copy()
 
 
-def read_metadata(book: epub.EpubBook):
+def read_metadata(book: epub.EpubBook) -> dict[str, str]:
+
+    """
+    Reads metadata from xml to dict
+    """
+
     metadata = {}
     metadata["title"] = book.get_metadata("DC", "title")[0][0]
     metadata["author"] = convert_list(book.get_metadata("DC", "creator"))
@@ -97,7 +120,12 @@ def read_metadata(book: epub.EpubBook):
     return metadata.copy()
 
 
-def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
+def convert_list(titles_list: list[tuple[str, dict[str, str]]]) -> str:
+
+    """
+    Joins titles list to one string
+    """
+
     res = []
     for title_obj in titles_list:
         res.append(title_obj[0])
@@ -105,23 +133,156 @@ def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
     return "; ".join(res)
 
 
-def set_cover(tokens: Document_Tokens):
+def set_cover(tokens: DocumentTokens) -> None:
+
+    """
+    Converts cover file name to base64 image stored in `tokens`
+    """
+
     cover_name = tokens["metadata"].get("cover")
     if cover_name in tokens.keys():
         tokens["metadata"]["cover"] = tokens[cover_name]
 
 
-def epub_tokens2html(spine: list[tuple[str, str]], tokens: Document_Tokens):
-    res = b""
+def epub_tokens2html(spine: list[tuple[str, str]], tokens: DocumentTokens) -> str:
+
+    """
+    Joins chapters listed in `spine` into one html-escaped string
+    """
+
+    res = ""
 
     for name, _ in spine:
         file_path = tokens["toc"].get(name)
         if file_path:
             res += process_xhtml(file_path, tokens)
 
+    return html.escape(html.unescape(res))
+
+
+def process_xhtml(path: str, tokens: DocumentTokens) -> str:
+
+    """
+    Parses one xhtml body, converts it in place and wraps it in a <section>
+    """
+
+    xml: etree.Element = etree.fromstring(tokens[path], parser=parser)
+
+    if xml.tag == "body":
+        xml.tag = "div"
+
+    process_content(xml, path, tokens)
+
+    return (
+        f'<section id="b_{path_to_name(path)}">{etree.tostring(xml).decode()}</section>'
+    )
+
+
+def process_content(node: etree.Element, path: str, tokens: DocumentTokens) -> None:
+
+    """
+    Recursively converts an xml element and its children to valid html in place
+    """
+
+    # Process universal attributes
+
+    if node.get("epub:type"):
+        node.attrib.pop("epub:type")
+    el_id = node.get("id")
+    if el_id:
+        node.set("id", f"{path_to_name(path)}_{el_id}")
+
+    # Tag processing
+
+    if node.tag == "a":
+        process_a_element(node, path)
+
+    elif node.tag == "hgroup":
+        node.tag = "div"
+
+    elif node.tag in ("img", "source", "video", "audio"):
+        process_media_element(node, path, tokens)
+
+    elif node.tag == IMAGE:
+        href = node.get(HREF)
+        media_path = rel_to_abs_path(path, href) if href else None
+        if media_path in tokens:
+            node.set(HREF, tokens[media_path])
+
+    elif node.tag == "trigger":
+        node.getparent().remove(node)
+
+    # Recursively run for all children (snapshot: a child may remove itself)
+
+    for child in list(node):
+        process_content(child, path, tokens)
+
+
+def process_a_element(node: etree.Element, path: str) -> None:
+
+    r"""
+    Rewrites in-book links in \<a\> to merged-document ids; other hrefs are kept
+    """
+
+    href = node.get("href", "")
+    if "://" not in href and (".xhtml" in href or ".html" in href):
+        id_pos = href.rfind("#")
+        if id_pos != -1:
+            href_path, el_id = href[:id_pos], href[id_pos:]
+            node.set("href", f"#{path_to_name(href_path)}_{el_id[1:]}")
+        else:
+            node.set("href", f"#b_{path_to_name(href)}")
+    elif href.startswith("#"):
+        node.set("href", f"#{path_to_name(path)}_{href[1:]}")
+
+
+def process_media_element(node: etree.Element, path: str, tokens: DocumentTokens):
+
+    """
+    Replaces a file path in the `src` (else `srcset`) attribute with its token
+    """
+
+    src = node.get("src")
+    attr = "src"
+
+    if not src:
+        src = node.get("srcset")
+        attr = "srcset"
+
+    if src:
+        media_path = rel_to_abs_path(path, src)
+        if media_path in tokens:
+            node.set(attr, tokens[media_path])
+
+
+def rel_to_abs_path(parent: str, rel: str):
+
+    """
+    Joins `rel` to the directory of `parent` and normalizes the resulting path
+    """
+
+    return os.path.normpath(os.path.join(os.path.dirname(parent), rel))
+
+
+@cache
+def path_to_name(path: str) -> str:
+
+    """
+    Returns the base name of `path` cut at its first dot (memoized by `cache`)
+    """
+
+    return os.path.basename(path).split(".")[0]
+
+
+def children_to_html(root: etree.Element) -> bytes:
+
+    """
+    Serializes each child of `root` with `etree.tostring` and joins the bytes
+    """
+
+    res = b""
+
+    for child in root:
+        res += etree.tostring(child)
+
     return res
-
-
-def process_xhtml(path: str, tokens: Document_Tokens):
-    # TODO: Add xhtml procession
-    return tokens[path]
diff --git a/app/fb2.py b/app/fb2.py
index cb8ad2b..f829217 100644
--- a/app/fb2.py
+++ b/app/fb2.py
@@ -1,10 +1,16 @@
+"""
+Module for FB2 file conversion to html
+"""
+
 from tempfile import SpooledTemporaryFile
 import xml.etree.ElementTree as ET
 from xml.etree.ElementTree import Element
 from typing import Optional
+import html
+
 from fastapi import HTTPException
 
-from .utils import Document_Tokens, strip_whitespace, HTMLBook
+from .utils import DocumentTokens, strip_whitespace, HTMLBook
 
 
 namespaces = {
@@ -25,17 +31,20 @@ async def fb22html(file: SpooledTemporaryFile) -> HTMLBook:
         set_cover(tokens)
         html_content = fb2body2html(tokens)
 
-        return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
+        return {
+            **(tokens["metadata"]),
+            "content": html.escape(html.unescape(html_content.decode())),
+        }
 
-    except Exception as e:
+    except Exception as err:
         raise HTTPException(
-            status_code=500, detail="Error! Wrong fb2 file format: " + str(e)
-        )
+            status_code=500, detail="Error! Wrong fb2 file format: " + str(err)
+        ) from err
 
 
-def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
+def fb22tokens(file: SpooledTemporaryFile) -> DocumentTokens:
 
-    """
+    r"""
     Parses fb2 file as xml document.
     It puts book metadata, its content and media to `tokens` dictionary and returns it.
 
@@ -74,7 +83,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
         if "cover" not in metadata.keys():
             metadata.pop("cover")
 
-        if len(metadata.keys()):
+        if len(metadata.keys()) != 0:
             tokens["metadata"] = metadata.copy()
 
     # Reading book content
@@ -107,9 +116,9 @@ def get_author(author: Element) -> str:
         "middle-name",
         "last-name",
     ):
-        el = author.find("./" + tag_name, namespaces)
-        if el is not None:
-            res.append(el.text)
+        tag = author.find("./" + tag_name, namespaces)
+        if tag is not None:
+            res.append(tag.text)
     if len(res) == 0:
         res = author.find("./nickname", namespaces).text
     else:
@@ -127,8 +136,11 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]:
     if coverpage:
         return coverpage.find("./image", namespaces).get(HREF)
 
+    return None
 
-def set_cover(tokens: Document_Tokens) -> None:
+
+def set_cover(tokens: DocumentTokens) -> None:
+    """Gets cover from book and sets it in metadata"""
     cover = tokens["metadata"]["cover"]
     if cover is None:
         tokens["metadata"]["cover"] = "none"
@@ -136,7 +148,7 @@ def set_cover(tokens: Document_Tokens) -> None:
         tokens["metadata"]["cover"] = tokens[cover[1:]]
 
 
-def fb2body2html(tokens: Document_Tokens) -> str:
+def fb2body2html(tokens: DocumentTokens) -> str:
 
     """
     Convert fb2 xml to html, joins bodies into one string
@@ -144,14 +156,14 @@ def fb2body2html(tokens: Document_Tokens) -> str:
 
     res = b""
 
-    xml_root = ET.fromstring(tokens["content"])
+    xml_root = ET.fromstring(strip_whitespace(tokens["content"]))
     for body in xml_root.iterfind("./body"):
         res += process_section(body, tokens)
 
     return res
 
 
-def process_section(body: Element, tokens: Document_Tokens) -> str:
+def process_section(body: Element, tokens: DocumentTokens) -> str:
 
     """
     Processes individual sections, recursively goes throw sections tree
@@ -159,11 +171,11 @@ def process_section(body: Element, tokens: Document_Tokens) -> str:
 
     res = b"<section>\n"
 
-    for tag in ("title", "epigraph", "annotation"):
-        el = body.find("./" + tag)
-        if el:
-            process_content(el, tokens)
-            res += children_to_html(el)
+    for tag_name in ("title", "epigraph", "annotation"):
+        tag = body.find("./" + tag_name)
+        if tag:
+            process_content(tag, tokens)
+            res += children_to_html(tag)
     image = body.find("./image")
     if image:
         process_image(image, tokens)
@@ -193,18 +205,18 @@ def children_to_html(root: Element) -> str:
     return res
 
 
-def process_image(el: Element, tokens: Document_Tokens) -> None:
+def process_image(element: Element, tokens: DocumentTokens) -> None:
 
-    """
+    r"""
     Converts fb2 \<image /\> to html \<img /\>. Replaces xlink:href with src="\<base64_image_data\>"
     """
 
-    el.tag = "img"
+    element.tag = "img"
 
-    href = el.get(HREF)
-    el.attrib.pop(HREF)
+    href = element.get(HREF)
+    element.attrib.pop(HREF)
 
-    el.set("src", tokens[href[1:]] if href[0] == "#" else href)
+    element.set("src", tokens[href[1:]] if href[0] == "#" else href)
 
 
 tag_replacement = {
@@ -219,14 +231,12 @@ tag_with_class = {
     "cite": "div",
     "poem": "div",
     "stanza": "div",
-    "poem": "div",
-    "poem": "div",
     "epigraph": "div",
     "text-author": "p",
 }
 
 
-def process_content(root: Element, tokens: Document_Tokens) -> None:
+def process_content(root: Element, tokens: DocumentTokens) -> None:
 
     """
     Converts fb2 xml tag names to html equivalents and my own styled elements.
diff --git a/app/main.py b/app/main.py
index 57a9192..57b6ee2 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,19 +1,47 @@
+"""Webserver for epub and fb2 files conversion to html"""
+
+from datetime import datetime
+
 from fastapi import FastAPI, File, UploadFile, HTTPException
+from pydantic import BaseModel  # pylint: disable=no-name-in-module
 
 from .epub import epub2html
 from .fb2 import fb22html
 from .utils import HashedHTMLBook, add_hash
 
+
+class DebugInfo(BaseModel):  # pylint: disable=too-few-public-methods
+    """Response model of the root handler: server startup time"""
+
+    startup_time: str
+
+
 app = FastAPI()
 
+start_time = datetime.now()
 
-@app.get("/")
+
+@app.get("/", response_model=DebugInfo)
 def root():
-    return "Hello, World!"
+    """
+    Test if server is running.
+
+    Returns startup time
+    """
+    return {"startup_time": start_time.isoformat()}
 
 
 @app.post("/uploadfile/", response_model=HashedHTMLBook)
 async def create_upload_file(file: UploadFile = File(...)):
+    """
+    Main api handler:
+
+    Accepts files with fb2 and epub extensions
+
+    Returns HTTP 415 error if file has unsupported format
+
+    Else returns object with book metadata and its html
+    """
     if file.filename.endswith(".fb2"):
         content = await fb22html(file.file)
     elif file.filename.endswith(".epub"):
diff --git a/app/utils.py b/app/utils.py
index 52da556..3ea9e46 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -1,30 +1,44 @@
+"""
+Utils for publite_backend module
+"""
+
+
 from typing import Union, Optional
-from pydantic import BaseModel
 import re
 from hashlib import sha256
 
-Document_Tokens = dict[str, Union[str, dict[str, str]]]
+from pydantic import BaseModel  # pylint: disable=no-name-in-module
+
+DocumentTokens = dict[str, Union[str, dict[str, str]]]
 
 
-class HTMLBook(BaseModel):
+class HTMLBook(BaseModel):  # pylint: disable=too-few-public-methods
+    """Transitional model for returned book data"""
+
     title: str
     author: str
     cover: Optional[str]
     content: str
 
 
-class HashedHTMLBook(HTMLBook):
+class HashedHTMLBook(HTMLBook):  # pylint: disable=too-few-public-methods
+    """Model for returned book data with content hash"""
+
     hash: str
 
 
 replacements = [
-    ("&#13;", "\r"),
-    (">\s+?<", "><"),
+    ("&#13;", ""),
+    ("&#17;", ""),
+    (r">\s+?<", "><"),
 ]
 
 
-def strip_whitespace(s: bytes) -> str:
-    res = s.decode()
+def strip_whitespace(string: bytes) -> str:
+
+    """Decodes `string` and removes &#13;, &#17; and whitespace between tags"""
+
+    res = string.decode()
 
     for old, new in replacements:
         res = re.sub(old, new, res)
@@ -33,6 +47,11 @@ def strip_whitespace(s: bytes) -> str:
 
 
 def add_hash(content: HTMLBook) -> HashedHTMLBook:
+
+    """
+    Adds hash of book content
+    """
+
     h_content: HashedHTMLBook = content.copy()
     h_content["hash"] = sha256(content["content"].encode()).hexdigest()