From 515579035754f2d2033da41a9dd801f61168e3fa Mon Sep 17 00:00:00 2001 From: dm1sh Date: Wed, 7 Jul 2021 17:35:31 +0500 Subject: [PATCH] Finished epub to html convertion functionality, fixed all pylint errors --- app/epub.py | 213 ++++++++++++++++++++++++++++++++++++++++++++------- app/fb2.py | 68 +++++++++------- app/main.py | 32 +++++++- app/utils.py | 35 +++++++-- 4 files changed, 283 insertions(+), 65 deletions(-) diff --git a/app/epub.py b/app/epub.py index bce98ed..5fed27f 100644 --- a/app/epub.py +++ b/app/epub.py @@ -1,13 +1,25 @@ -import aiofiles as aiof -from base64 import b64encode -from fastapi import HTTPException +""" +Module for EPUB file conversion to html +""" +from base64 import b64encode +from functools import cache +import html +import os +from tempfile import SpooledTemporaryFile + +import aiofiles as aiof +from fastapi import HTTPException +from lxml import etree import ebooklib from ebooklib import epub -from tempfile import SpooledTemporaryFile +from .utils import DocumentTokens, strip_whitespace, HTMLBook -from .utils import Document_Tokens, strip_whitespace, HTMLBook +parser = etree.XMLParser(recover=True) + +IMAGE = "{http://www.w3.org/2000/svg}image" +HREF = "{http://www.w3.org/1999/xlink}href" async def epub2html(file: SpooledTemporaryFile) -> HTMLBook: @@ -22,24 +34,29 @@ async def epub2html(file: SpooledTemporaryFile) -> HTMLBook: html_content = epub_tokens2html(spine, tokens) - return {**(tokens["metadata"]), "content": strip_whitespace(html_content)} + return { + **(tokens["metadata"]), + "content": html_content, + } - except Exception as e: + except Exception as err: raise HTTPException( - status_code=500, detail="Error! Wrong epub file format: " + str(e) - ) + status_code=500, detail="Error! Wrong epub file format: " + str(err) + ) from err async def epub_to_tokens( file: SpooledTemporaryFile, -) -> tuple[Document_Tokens, list[tuple[str, str]]]: +) -> tuple[DocumentTokens, list[tuple[str, str]]]: - """ - Passes file content to EbookLib library and parses epub tokens into dict of the following format: + r""" + Passes file content to EbookLib library and parses epub tokens into dict of + the following format: { "\": "\" } - Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html + Where file content is either plain text for xhtml or base64 encoded data + for other formats, prepared for embeding to html """ tokens = {} @@ -61,19 +78,20 @@ async def epub_to_tokens( item: epub.EpubItem item_type = item.get_type() - file_path = reader.opf_dir + "/" + item.get_name() + file_path = os.path.join(reader.opf_dir, item.get_name()) if item_type == ebooklib.ITEM_DOCUMENT: # Adding book chapters to tokens list name = item.get_id() - tokens[file_path] = item.get_body_content() + tokens[file_path] = strip_whitespace(item.get_body_content()) tokens["toc"][name] = file_path elif item_type in ( + ebooklib.ITEM_AUDIO, ebooklib.ITEM_COVER, ebooklib.ITEM_IMAGE, - ebooklib.ITEM_VIDEO, ebooklib.ITEM_VECTOR, + ebooklib.ITEM_VIDEO, ): # Adding assets to tokens list @@ -89,7 +107,12 @@ async def epub_to_tokens( return tokens, book.spine.copy() -def read_metadata(book: epub.EpubBook): +def read_metadata(book: epub.EpubBook) -> dict[str, str]: + + """ + Reads metadata from xml to dict + """ + metadata = {} metadata["title"] = book.get_metadata("DC", "title")[0][0] metadata["author"] = convert_list(book.get_metadata("DC", "creator")) @@ -97,7 +120,12 @@ def read_metadata(book: epub.EpubBook): return metadata.copy() -def convert_list(titles_list: list[tuple[str, dict[str, str]]]): +def convert_list(titles_list: list[tuple[str, dict[str, str]]]) -> str: + + """ + Joins titles list to one string + """ + res = [] for title_obj in titles_list: res.append(title_obj[0]) @@ -105,23 +133,156 @@ def convert_list(titles_list: list[tuple[str, dict[str, str]]]): return "; ".join(res) -def set_cover(tokens: Document_Tokens): +def set_cover(tokens: DocumentTokens) -> None: + + """ + Converts cover file name to base64 image stored in `tokens` + """ + cover_name = tokens["metadata"].get("cover") if cover_name in tokens.keys(): tokens["metadata"]["cover"] = tokens[cover_name] -def epub_tokens2html(spine: list[tuple[str, str]], tokens: Document_Tokens): - res = b"" +def epub_tokens2html(spine: list[tuple[str, str]], tokens: DocumentTokens) -> bytes: + + """ + Joins chapters in `spice` to one html string + """ + + res = "" for name, _ in spine: file_path = tokens["toc"].get(name) if file_path: res += process_xhtml(file_path, tokens) + return html.escape(html.unescape(res)) + + +def process_xhtml(path: str, tokens: DocumentTokens) -> bytes: + + """ + Processes content of one xml body + """ + + xml: etree.Element = etree.fromstring(tokens[path], parser=parser) + + if xml.tag == "body": + xml.tag = "div" + + process_content(xml, path, tokens) + + return ( + f'
{etree.tostring(xml).decode()}
' + ) + + +def process_content(node: etree.Element, path: str, tokens: DocumentTokens) -> None: + + """ + Recursive function for xml element convertion to valid html + """ + + # Process universal tags + + if node.get("epub:type"): + node.attrib.pop("epub:type") + el_id = node.get("id") + if el_id: + node.set("id", f"{path_to_name(path)}_{el_id}") + + # Tag processing + + if node.tag == "a": + process_a_element(node, path) + + elif node.tag == "hgroup": + node.tag = "div" + + elif node.tag in ("img", "source", "video", "audio"): + process_media_element(node, path, tokens) + + elif node.tag == IMAGE: + href = node.get(HREF) + media_path = rel_to_abs_path(path, href) + if media_path in tokens.keys(): + node.set(HREF, tokens[media_path]) + + elif node.tag == "trigger": + node.getparent().remove(node) + + # Recursively run for all children + + for child in node: + process_content(child, path, tokens) + + +def process_a_element(node: etree.Element, path: str): + + r""" + Converts `filed` links to ids in \ element + """ + + href = node.get("href") + if href.count(".xhtml") or href.count(".html"): + id_pos = href.rfind("#") + if id_pos != -1: + href_path, el_id = href[:id_pos], href[id_pos:] + node.set("href", f"#{path_to_name(href_path)}_{el_id[1:]}") + else: + node.set("href", f"#b_{path_to_name(href)}") + elif href.count("#"): + node.set("href", f"#{path_to_name(path)}_{href[1:]}") + + +def process_media_element(node: etree.Element, path: str, tokens: DocumentTokens): + + """ + Replaces file paths to base64 encoded media in `src` and `srcset` tags + """ + + src = node.get("src") + attr = "src" + + if not src: + src = node.get("srcset") + attr = "srcset" + + if src: + media_path = rel_to_abs_path(path, src) + if media_path in tokens.keys(): + node.set(attr, tokens[media_path]) + + +def rel_to_abs_path(parent: str, rel: str): + + """ + Helper for relative path to media convertion to absolute + """ + + return os.path.normpath(os.path.join(os.path.dirname(parent), rel)) + + +@cache +def path_to_name(path: str) -> str: + + """ + Helper function for getting file name + """ + + return os.path.basename(path).split(".")[0] + + +def children_to_html(root: etree.Element) -> bytes: + + """ + Converts all xml children of element to string and joins them + """ + + res = b"" + + for child in root: + res += etree.tostring(child) + return res - - -def process_xhtml(path: str, tokens: Document_Tokens): - # TODO: Add xhtml procession - return tokens[path] diff --git a/app/fb2.py b/app/fb2.py index cb8ad2b..f829217 100644 --- a/app/fb2.py +++ b/app/fb2.py @@ -1,10 +1,16 @@ +""" +Module for FB2 file conversion to html +""" + from tempfile import SpooledTemporaryFile import xml.etree.ElementTree as ET from xml.etree.ElementTree import Element from typing import Optional +import html + from fastapi import HTTPException -from .utils import Document_Tokens, strip_whitespace, HTMLBook +from .utils import DocumentTokens, strip_whitespace, HTMLBook namespaces = { @@ -25,17 +31,20 @@ async def fb22html(file: SpooledTemporaryFile) -> HTMLBook: set_cover(tokens) html_content = fb2body2html(tokens) - return {**(tokens["metadata"]), "content": strip_whitespace(html_content)} + return { + **(tokens["metadata"]), + "content": html.escape(html.unescape(html_content.decode())), + } - except Exception as e: + except Exception as err: raise HTTPException( - status_code=500, detail="Error! Wrong fb2 file format: " + str(e) - ) + status_code=500, detail="Error! Wrong fb2 file format: " + str(err) + ) from err -def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens: +def fb22tokens(file: SpooledTemporaryFile) -> DocumentTokens: - """ + r""" Parses fb2 file as xml document. It puts book metadata, its content and media to `tokens` dictionary and returns it. @@ -74,7 +83,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens: if "cover" not in metadata.keys(): metadata.pop("cover") - if len(metadata.keys()): + if len(metadata.keys()) != 0: tokens["metadata"] = metadata.copy() # Reading book content @@ -107,9 +116,9 @@ def get_author(author: Element) -> str: "middle-name", "last-name", ): - el = author.find("./" + tag_name, namespaces) - if el is not None: - res.append(el.text) + tag = author.find("./" + tag_name, namespaces) + if tag is not None: + res.append(tag.text) if len(res) == 0: res = author.find("./nickname", namespaces).text else: @@ -127,8 +136,11 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]: if coverpage: return coverpage.find("./image", namespaces).get(HREF) + return None -def set_cover(tokens: Document_Tokens) -> None: + +def set_cover(tokens: DocumentTokens) -> None: + """Gets cover from book and sets it in metadata""" cover = tokens["metadata"]["cover"] if cover is None: tokens["metadata"]["cover"] = "none" @@ -136,7 +148,7 @@ def set_cover(tokens: Document_Tokens) -> None: tokens["metadata"]["cover"] = tokens[cover[1:]] -def fb2body2html(tokens: Document_Tokens) -> str: +def fb2body2html(tokens: DocumentTokens) -> str: """ Convert fb2 xml to html, joins bodies into one string @@ -144,14 +156,14 @@ def fb2body2html(tokens: Document_Tokens) -> str: res = b"" - xml_root = ET.fromstring(tokens["content"]) + xml_root = ET.fromstring(strip_whitespace(tokens["content"])) for body in xml_root.iterfind("./body"): res += process_section(body, tokens) return res -def process_section(body: Element, tokens: Document_Tokens) -> str: +def process_section(body: Element, tokens: DocumentTokens) -> str: """ Processes individual sections, recursively goes throw sections tree @@ -159,11 +171,11 @@ def process_section(body: Element, tokens: Document_Tokens) -> str: res = b"
\n" - for tag in ("title", "epigraph", "annotation"): - el = body.find("./" + tag) - if el: - process_content(el, tokens) - res += children_to_html(el) + for tag_name in ("title", "epigraph", "annotation"): + tag = body.find("./" + tag_name) + if tag: + process_content(tag, tokens) + res += children_to_html(tag) image = body.find("./image") if image: process_image(image, tokens) @@ -193,18 +205,18 @@ def children_to_html(root: Element) -> str: return res -def process_image(el: Element, tokens: Document_Tokens) -> None: +def process_image(element: Element, tokens: DocumentTokens) -> None: - """ + r""" Converts fb2 \ to html \. Replaces xlink:href with src="\" """ - el.tag = "img" + element.tag = "img" - href = el.get(HREF) - el.attrib.pop(HREF) + href = element.get(HREF) + element.attrib.pop(HREF) - el.set("src", tokens[href[1:]] if href[0] == "#" else href) + element.set("src", tokens[href[1:]] if href[0] == "#" else href) tag_replacement = { @@ -219,14 +231,12 @@ tag_with_class = { "cite": "div", "poem": "div", "stanza": "div", - "poem": "div", - "poem": "div", "epigraph": "div", "text-author": "p", } -def process_content(root: Element, tokens: Document_Tokens) -> None: +def process_content(root: Element, tokens: DocumentTokens) -> None: """ Converts fb2 xml tag names to html equivalents and my own styled elements. diff --git a/app/main.py b/app/main.py index 57a9192..57b6ee2 100644 --- a/app/main.py +++ b/app/main.py @@ -1,19 +1,47 @@ +"""Webserver for epub and fb2 files convertation to html""" + +from datetime import datetime + from fastapi import FastAPI, File, UploadFile, HTTPException +from pydantic import BaseModel # pylint: disable=no-name-in-module from .epub import epub2html from .fb2 import fb22html from .utils import HashedHTMLBook, add_hash + +class DebugInfo(BaseModel): # pylint: disable=too-few-public-methods + """Main handler return types""" + + startup_time: str + + app = FastAPI() +start_time = datetime.now() -@app.get("/") + +@app.get("/", response_model=DebugInfo) def root(): - return "Hello, World!" + """ + Test if server is running. + + Returns startup time + """ + return {"startup_time": start_time.isoformat()} @app.post("/uploadfile/", response_model=HashedHTMLBook) async def create_upload_file(file: UploadFile = File(...)): + """ + Main api handler: + + Accepts files with fb2 and epub extensions + + Returns HTTP 415 error if file has unsupported format + + Else returns object with book metadata and its html + """ if file.filename.endswith(".fb2"): content = await fb22html(file.file) elif file.filename.endswith(".epub"): diff --git a/app/utils.py b/app/utils.py index 52da556..3ea9e46 100644 --- a/app/utils.py +++ b/app/utils.py @@ -1,30 +1,44 @@ +""" +Utils for publite_backend module +""" + + from typing import Union, Optional -from pydantic import BaseModel import re from hashlib import sha256 -Document_Tokens = dict[str, Union[str, dict[str, str]]] +from pydantic import BaseModel # pylint: disable=no-name-in-module + +DocumentTokens = dict[str, Union[str, dict[str, str]]] -class HTMLBook(BaseModel): +class HTMLBook(BaseModel): # pylint: disable=too-few-public-methods + """Transitional model for returned book data""" + title: str author: str cover: Optional[str] content: str -class HashedHTMLBook(HTMLBook): +class HashedHTMLBook(HTMLBook): # pylint: disable=too-few-public-methods + """Model for returned book data with content hash""" + hash: str replacements = [ - (" ", "\r"), - (">\s+?<", "><"), + (" ", ""), + ("", ""), + (r">\s+?<", "><"), ] -def strip_whitespace(s: bytes) -> str: - res = s.decode() +def strip_whitespace(string: bytes) -> str: + + """Removes""" + + res = string.decode() for old, new in replacements: res = re.sub(old, new, res) @@ -33,6 +47,11 @@ def strip_whitespace(s: bytes) -> str: def add_hash(content: HTMLBook) -> HashedHTMLBook: + + """ + Adds hash of book content + """ + h_content: HashedHTMLBook = content.copy() h_content["hash"] = sha256(content["content"].encode()).hexdigest()