Finished epub to html conversion functionality, fixed all pylint errors
parent 2f4a683cb4
commit 5155790357
app/epub.py (213 lines changed)
@@ -1,13 +1,25 @@
-import aiofiles as aiof
-from base64 import b64encode
-from fastapi import HTTPException
+"""
+Module for EPUB file conversion to html
+"""
+
+from base64 import b64encode
+from functools import cache
+import html
+import os
+from tempfile import SpooledTemporaryFile
+
+import aiofiles as aiof
+from fastapi import HTTPException
+from lxml import etree
 import ebooklib
 from ebooklib import epub
 
-from tempfile import SpooledTemporaryFile
-
-from .utils import Document_Tokens, strip_whitespace, HTMLBook
+from .utils import DocumentTokens, strip_whitespace, HTMLBook
 
+parser = etree.XMLParser(recover=True)
+
+IMAGE = "{http://www.w3.org/2000/svg}image"
+HREF = "{http://www.w3.org/1999/xlink}href"
 
 
 async def epub2html(file: SpooledTemporaryFile) -> HTMLBook:
@@ -22,24 +34,29 @@ async def epub2html(file: SpooledTemporaryFile) -> HTMLBook:
 
         html_content = epub_tokens2html(spine, tokens)
 
-        return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
+        return {
+            **(tokens["metadata"]),
+            "content": html_content,
+        }
 
-    except Exception as e:
+    except Exception as err:
         raise HTTPException(
-            status_code=500, detail="Error! Wrong epub file format: " + str(e)
-        )
+            status_code=500, detail="Error! Wrong epub file format: " + str(err)
+        ) from err
 
 
 async def epub_to_tokens(
     file: SpooledTemporaryFile,
-) -> tuple[Document_Tokens, list[tuple[str, str]]]:
+) -> tuple[DocumentTokens, list[tuple[str, str]]]:
 
-    """
-    Passes file content to EbookLib library and parses epub tokens into dict of the following format:
+    r"""
+    Passes file content to EbookLib library and parses epub tokens into dict of
+    the following format:
 
     { "\<file_name\>": "\<file_content\>" }
 
-    Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html
+    Where file content is either plain text for xhtml or base64 encoded data
+    for other formats, prepared for embeding to html
     """
 
     tokens = {}
@@ -61,19 +78,20 @@ async def epub_to_tokens(
         item: epub.EpubItem
 
         item_type = item.get_type()
-        file_path = reader.opf_dir + "/" + item.get_name()
+        file_path = os.path.join(reader.opf_dir, item.get_name())
 
         if item_type == ebooklib.ITEM_DOCUMENT:
             # Adding book chapters to tokens list
             name = item.get_id()
-            tokens[file_path] = item.get_body_content()
+            tokens[file_path] = strip_whitespace(item.get_body_content())
             tokens["toc"][name] = file_path
 
         elif item_type in (
+            ebooklib.ITEM_AUDIO,
             ebooklib.ITEM_COVER,
             ebooklib.ITEM_IMAGE,
-            ebooklib.ITEM_VIDEO,
             ebooklib.ITEM_VECTOR,
+            ebooklib.ITEM_VIDEO,
         ):
             # Adding assets to tokens list
 
@@ -89,7 +107,12 @@ async def epub_to_tokens(
     return tokens, book.spine.copy()
 
 
-def read_metadata(book: epub.EpubBook):
+def read_metadata(book: epub.EpubBook) -> dict[str, str]:
+
+    """
+    Reads metadata from xml to dict
+    """
+
     metadata = {}
     metadata["title"] = book.get_metadata("DC", "title")[0][0]
     metadata["author"] = convert_list(book.get_metadata("DC", "creator"))
@@ -97,7 +120,12 @@ def read_metadata(book: epub.EpubBook):
     return metadata.copy()
 
 
-def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
+def convert_list(titles_list: list[tuple[str, dict[str, str]]]) -> str:
+
+    """
+    Joins titles list to one string
+    """
+
     res = []
     for title_obj in titles_list:
         res.append(title_obj[0])
@@ -105,23 +133,156 @@ def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
     return "; ".join(res)
 
 
-def set_cover(tokens: Document_Tokens):
+def set_cover(tokens: DocumentTokens) -> None:
+
+    """
+    Converts cover file name to base64 image stored in `tokens`
+    """
+
     cover_name = tokens["metadata"].get("cover")
     if cover_name in tokens.keys():
         tokens["metadata"]["cover"] = tokens[cover_name]
 
 
-def epub_tokens2html(spine: list[tuple[str, str]], tokens: Document_Tokens):
-    res = b""
+def epub_tokens2html(spine: list[tuple[str, str]], tokens: DocumentTokens) -> bytes:
+
+    """
+    Joins chapters in `spice` to one html string
+    """
+
+    res = ""
+
     for name, _ in spine:
         file_path = tokens["toc"].get(name)
         if file_path:
             res += process_xhtml(file_path, tokens)
 
+    return html.escape(html.unescape(res))
+
+
+def process_xhtml(path: str, tokens: DocumentTokens) -> bytes:
+
+    """
+    Processes content of one xml body
+    """
+
+    xml: etree.Element = etree.fromstring(tokens[path], parser=parser)
+
+    if xml.tag == "body":
+        xml.tag = "div"
+
+    process_content(xml, path, tokens)
+
+    return (
+        f'<section id="b_{path_to_name(path)}">{etree.tostring(xml).decode()}</section>'
+    )
+
+
+def process_content(node: etree.Element, path: str, tokens: DocumentTokens) -> None:
+
+    """
+    Recursive function for xml element convertion to valid html
+    """
+
+    # Process universal tags
+
+    if node.get("epub:type"):
+        node.attrib.pop("epub:type")
+    el_id = node.get("id")
+    if el_id:
+        node.set("id", f"{path_to_name(path)}_{el_id}")
+
+    # Tag processing
+
+    if node.tag == "a":
+        process_a_element(node, path)
+
+    elif node.tag == "hgroup":
+        node.tag = "div"
+
+    elif node.tag in ("img", "source", "video", "audio"):
+        process_media_element(node, path, tokens)
+
+    elif node.tag == IMAGE:
+        href = node.get(HREF)
+        media_path = rel_to_abs_path(path, href)
+        if media_path in tokens.keys():
+            node.set(HREF, tokens[media_path])
+
+    elif node.tag == "trigger":
+        node.getparent().remove(node)
+
+    # Recursively run for all children
+
+    for child in node:
+        process_content(child, path, tokens)
+
+
+def process_a_element(node: etree.Element, path: str):
+
+    r"""
+    Converts `filed` links to ids in \<a\> element
+    """
+
+    href = node.get("href")
+    if href.count(".xhtml") or href.count(".html"):
+        id_pos = href.rfind("#")
+        if id_pos != -1:
+            href_path, el_id = href[:id_pos], href[id_pos:]
+            node.set("href", f"#{path_to_name(href_path)}_{el_id[1:]}")
+        else:
+            node.set("href", f"#b_{path_to_name(href)}")
+    elif href.count("#"):
+        node.set("href", f"#{path_to_name(path)}_{href[1:]}")
+
+
+def process_media_element(node: etree.Element, path: str, tokens: DocumentTokens):
+
+    """
+    Replaces file paths to base64 encoded media in `src` and `srcset` tags
+    """
+
+    src = node.get("src")
+    attr = "src"
+
+    if not src:
+        src = node.get("srcset")
+        attr = "srcset"
+
+    if src:
+        media_path = rel_to_abs_path(path, src)
+        if media_path in tokens.keys():
+            node.set(attr, tokens[media_path])
+
+
+def rel_to_abs_path(parent: str, rel: str):
+
+    """
+    Helper for relative path to media convertion to absolute
+    """
+
+    return os.path.normpath(os.path.join(os.path.dirname(parent), rel))
+
+
+@cache
+def path_to_name(path: str) -> str:
+
+    """
+    Helper function for getting file name
+    """
+
+    return os.path.basename(path).split(".")[0]
+
+
+def children_to_html(root: etree.Element) -> bytes:
+
+    """
+    Converts all xml children of element to string and joins them
+    """
+
+    res = b""
+
+    for child in root:
+        res += etree.tostring(child)
+
     return res
-
-
-def process_xhtml(path: str, tokens: Document_Tokens):
-
-    # TODO: Add xhtml procession
-
-    return tokens[path]
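Editor's note: the two path helpers introduced above (rel_to_abs_path and path_to_name) do most of the link and media rewriting. A minimal standalone sketch of their behaviour, copied out of the diff for illustration; the example paths are hypothetical and not part of the commit:

import os
from functools import cache


def rel_to_abs_path(parent: str, rel: str):
    # Resolve a path relative to the chapter file into a book-wide path
    return os.path.normpath(os.path.join(os.path.dirname(parent), rel))


@cache
def path_to_name(path: str) -> str:
    # File name without directories or extension, used as an id prefix
    return os.path.basename(path).split(".")[0]


print(rel_to_abs_path("OEBPS/text/ch01.xhtml", "../images/cover.jpg"))  # OEBPS/images/cover.jpg
print(path_to_name("OEBPS/text/ch01.xhtml"))  # ch01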
app/fb2.py (68 lines changed)
@@ -1,10 +1,16 @@
+"""
+Module for FB2 file conversion to html
+"""
+
 from tempfile import SpooledTemporaryFile
 import xml.etree.ElementTree as ET
 from xml.etree.ElementTree import Element
 from typing import Optional
+import html
+
 from fastapi import HTTPException
 
-from .utils import Document_Tokens, strip_whitespace, HTMLBook
+from .utils import DocumentTokens, strip_whitespace, HTMLBook
 
 
 namespaces = {
@@ -25,17 +31,20 @@ async def fb22html(file: SpooledTemporaryFile) -> HTMLBook:
         set_cover(tokens)
         html_content = fb2body2html(tokens)
 
-        return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
+        return {
+            **(tokens["metadata"]),
+            "content": html.escape(html.unescape(html_content.decode())),
+        }
 
-    except Exception as e:
+    except Exception as err:
         raise HTTPException(
-            status_code=500, detail="Error! Wrong fb2 file format: " + str(e)
-        )
+            status_code=500, detail="Error! Wrong fb2 file format: " + str(err)
+        ) from err
 
 
-def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
+def fb22tokens(file: SpooledTemporaryFile) -> DocumentTokens:
 
-    """
+    r"""
     Parses fb2 file as xml document.
     It puts book metadata, its content and media to `tokens` dictionary and returns it.
 
@@ -74,7 +83,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
     if "cover" not in metadata.keys():
         metadata.pop("cover")
 
-    if len(metadata.keys()):
+    if len(metadata.keys()) != 0:
         tokens["metadata"] = metadata.copy()
 
     # Reading book content
@@ -107,9 +116,9 @@ def get_author(author: Element) -> str:
         "middle-name",
         "last-name",
     ):
-        el = author.find("./" + tag_name, namespaces)
-        if el is not None:
-            res.append(el.text)
+        tag = author.find("./" + tag_name, namespaces)
+        if tag is not None:
+            res.append(tag.text)
     if len(res) == 0:
         res = author.find("./nickname", namespaces).text
     else:
@@ -127,8 +136,11 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]:
     if coverpage:
         return coverpage.find("./image", namespaces).get(HREF)
 
+    return None
+
 
-def set_cover(tokens: Document_Tokens) -> None:
+def set_cover(tokens: DocumentTokens) -> None:
+    """Gets cover from book and sets it in metadata"""
     cover = tokens["metadata"]["cover"]
     if cover is None:
         tokens["metadata"]["cover"] = "none"
@@ -136,7 +148,7 @@ def set_cover(tokens: Document_Tokens) -> None:
         tokens["metadata"]["cover"] = tokens[cover[1:]]
 
 
-def fb2body2html(tokens: Document_Tokens) -> str:
+def fb2body2html(tokens: DocumentTokens) -> str:
 
     """
     Convert fb2 xml to html, joins bodies into one string
@@ -144,14 +156,14 @@ def fb2body2html(tokens: Document_Tokens) -> str:
 
     res = b""
 
-    xml_root = ET.fromstring(tokens["content"])
+    xml_root = ET.fromstring(strip_whitespace(tokens["content"]))
     for body in xml_root.iterfind("./body"):
         res += process_section(body, tokens)
 
     return res
 
 
-def process_section(body: Element, tokens: Document_Tokens) -> str:
+def process_section(body: Element, tokens: DocumentTokens) -> str:
 
     """
     Processes individual sections, recursively goes throw sections tree
@@ -159,11 +171,11 @@ def process_section(body: Element, tokens: Document_Tokens) -> str:
 
     res = b"<section>\n"
 
-    for tag in ("title", "epigraph", "annotation"):
-        el = body.find("./" + tag)
-        if el:
-            process_content(el, tokens)
-            res += children_to_html(el)
+    for tag_name in ("title", "epigraph", "annotation"):
+        tag = body.find("./" + tag_name)
+        if tag:
+            process_content(tag, tokens)
+            res += children_to_html(tag)
     image = body.find("./image")
     if image:
         process_image(image, tokens)
@@ -193,18 +205,18 @@ def children_to_html(root: Element) -> str:
     return res
 
 
-def process_image(el: Element, tokens: Document_Tokens) -> None:
+def process_image(element: Element, tokens: DocumentTokens) -> None:
 
-    """
+    r"""
     Converts fb2 \<image /\> to html \<img /\>. Replaces xlink:href with src="\<base64_image_data\>"
     """
 
-    el.tag = "img"
+    element.tag = "img"
 
-    href = el.get(HREF)
-    el.attrib.pop(HREF)
+    href = element.get(HREF)
+    element.attrib.pop(HREF)
 
-    el.set("src", tokens[href[1:]] if href[0] == "#" else href)
+    element.set("src", tokens[href[1:]] if href[0] == "#" else href)
 
 
 tag_replacement = {
@@ -219,14 +231,12 @@ tag_with_class = {
     "cite": "div",
     "poem": "div",
     "stanza": "div",
-    "poem": "div",
-    "poem": "div",
     "epigraph": "div",
     "text-author": "p",
 }
 
 
-def process_content(root: Element, tokens: Document_Tokens) -> None:
+def process_content(root: Element, tokens: DocumentTokens) -> None:
 
     """
     Converts fb2 xml tag names to html equivalents and my own styled elements.
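Editor's note: fb22html now normalises HTML entities before returning the content (html.escape applied over html.unescape of the decoded body). A small standalone illustration of that round trip, not part of the commit:

import html

raw = b"<p>Fish &amp; Chips</p>".decode()
print(html.escape(html.unescape(raw)))  # &lt;p&gt;Fish &amp; Chips&lt;/p&gt;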
app/main.py (32 lines changed)
@@ -1,19 +1,47 @@
+"""Webserver for epub and fb2 files convertation to html"""
+
+from datetime import datetime
+
 from fastapi import FastAPI, File, UploadFile, HTTPException
+from pydantic import BaseModel  # pylint: disable=no-name-in-module
+
 from .epub import epub2html
 from .fb2 import fb22html
 from .utils import HashedHTMLBook, add_hash
 
 
+class DebugInfo(BaseModel):  # pylint: disable=too-few-public-methods
+    """Main handler return types"""
+
+    startup_time: str
+
+
 app = FastAPI()
 
+start_time = datetime.now()
+
 
-@app.get("/")
+@app.get("/", response_model=DebugInfo)
 def root():
-    return "Hello, World!"
+    """
+    Test if server is running.
+
+    Returns startup time
+    """
+    return {"startup_time": start_time.isoformat()}
 
 
 @app.post("/uploadfile/", response_model=HashedHTMLBook)
 async def create_upload_file(file: UploadFile = File(...)):
+    """
+    Main api handler:
+
+    Accepts files with fb2 and epub extensions
+
+    Returns HTTP 415 error if file has unsupported format
+
+    Else returns object with book metadata and its html
+    """
     if file.filename.endswith(".fb2"):
         content = await fb22html(file.file)
     elif file.filename.endswith(".epub"):
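Editor's note: a hedged sketch of exercising the two handlers above with FastAPI's TestClient; the import path app.main and the input file name book.fb2 are assumptions for illustration, not from the commit:

from fastapi.testclient import TestClient

from app.main import app  # assumed module path for this repo layout

client = TestClient(app)

# Debug handler added in this commit returns the server startup time
print(client.get("/").json())

# Conversion handler: upload an fb2 file, receive HTMLBook fields plus a content hash
with open("book.fb2", "rb") as book:  # hypothetical input file
    response = client.post("/uploadfile/", files={"file": ("book.fb2", book)})
print(response.json()["hash"])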
app/utils.py (35 lines changed)
@@ -1,30 +1,44 @@
+"""
+Utils for publite_backend module
+"""
+
 from typing import Union, Optional
-from pydantic import BaseModel
 import re
 from hashlib import sha256
 
-Document_Tokens = dict[str, Union[str, dict[str, str]]]
+from pydantic import BaseModel  # pylint: disable=no-name-in-module
+
+DocumentTokens = dict[str, Union[str, dict[str, str]]]
 
 
-class HTMLBook(BaseModel):
+class HTMLBook(BaseModel):  # pylint: disable=too-few-public-methods
+    """Transitional model for returned book data"""
+
     title: str
     author: str
     cover: Optional[str]
     content: str
 
 
-class HashedHTMLBook(HTMLBook):
+class HashedHTMLBook(HTMLBook):  # pylint: disable=too-few-public-methods
+    """Model for returned book data with content hash"""
+
     hash: str
 
 
 replacements = [
-    (" ", "\r"),
-    (">\s+?<", "><"),
+    (" ", ""),
+    ("", ""),
+    (r">\s+?<", "><"),
 ]
 
 
-def strip_whitespace(s: bytes) -> str:
-    res = s.decode()
+def strip_whitespace(string: bytes) -> str:
+
+    """Removes"""
+
+    res = string.decode()
+
     for old, new in replacements:
         res = re.sub(old, new, res)
@@ -33,6 +47,11 @@ def strip_whitespace(s: bytes) -> str:
 
 
 def add_hash(content: HTMLBook) -> HashedHTMLBook:
+
+    """
+    Adds hash of book content
+    """
+
     h_content: HashedHTMLBook = content.copy()
     h_content["hash"] = sha256(content["content"].encode()).hexdigest()
 
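Editor's note: the hash added by add_hash is simply a sha256 hex digest of the content string, so a client can recompute and compare it; a one-line illustration (not from the commit):

from hashlib import sha256

print(sha256("<section><p>Hello</p></section>".encode()).hexdigest())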