diff --git a/app/epub.py b/app/epub.py index 69f8009..ad7ec43 100644 --- a/app/epub.py +++ b/app/epub.py @@ -6,6 +6,7 @@ from ebooklib import epub from tempfile import SpooledTemporaryFile +from .utils import Document_Tokens async def epub2html(file: SpooledTemporaryFile) -> str: @@ -27,7 +28,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str: return "Error! Wrong epub file format: " + str(e) -async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]: +async def epub_to_tokens(file: SpooledTemporaryFile) -> Document_Tokens: """ Passes file content to EbookLib library and parses epub tokens into dict of the following format: diff --git a/app/fb2.py b/app/fb2.py index 4d1d8bb..330711a 100644 --- a/app/fb2.py +++ b/app/fb2.py @@ -3,6 +3,8 @@ import xml.etree.ElementTree as ET from xml.etree.ElementTree import Element from typing import Optional +from .utils import Document_Tokens + namespaces = { "": "http://www.gribuser.ru/xml/fictionbook/2.0", @@ -20,22 +22,19 @@ async def fb22html(file: SpooledTemporaryFile) -> str: try: tokens = fb22tokens(file) - ... - # TODO: join tokens to HTML - html_content = "" - ... + html_content = fb2body2html(tokens) - print(tokens.keys()) return html_content except Exception as e: return "Error! Wrong FB2 file format: " + str(e) -def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]: +def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens: """ - Parses fb2 file as xml document. It puts book metadata, its content and media to `tokens` dictionary and returns it. + Parses fb2 file as xml document. + It puts book metadata, its content and media to `tokens` dictionary and returns it. `tokens` format: @@ -46,7 +45,10 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]: "\": "\" } """ - tokens = {"metadata": {}, "content": b""} + tokens = { + "metadata": {}, + "content": b"", + } book = ET.parse(file) root = book.getroot() @@ -74,6 +76,8 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]: for body in bodies: tokens["content"] += ET.tostring(body).replace(b"ns0:", b"") + tokens["content"] += b"" + # Reading assets for asset in assets: @@ -92,7 +96,11 @@ def get_author(author: Element) -> str: """ res = [] - for tag_name in ("first-name", "middle-name", "last-name"): + for tag_name in ( + "first-name", + "middle-name", + "last-name", + ): el = author.find(tag_name, namespaces) if not el is None: res.append(el.text) @@ -112,3 +120,119 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]: if coverpage: return coverpage.find("image", namespaces).get(HREF) + + +def fb2body2html(tokens: Document_Tokens) -> str: + + """ + Convert fb2 xml to html, joins bodies into one string + """ + + res = b"" + + xml_root = ET.fromstring(tokens["content"]) + for body in xml_root.iterfind("./body"): + res += process_section(body, tokens) + + return res + + +def process_section(body: Element, tokens: Document_Tokens) -> str: + + """ + Processes individual sections, recursively goes throw sections tree + """ + + res = b"
\n" + + for tag in ("title", "epigraph", "annotation"): + el = body.find("./" + tag) + if el: + process_content(el, tokens) + res += children_to_html(el) + image = body.find("./image") + if image: + process_image(image, tokens) + res += ET.tostring(image) + + for section in body.findall("./section"): + if section.find("./section"): + res += process_section(section, tokens) + else: + process_content(section, tokens) + res += b"
\n" + children_to_html(section) + b"
\n" + + return res + b"
\n" + + +def children_to_html(root: Element) -> str: + + """ + Converts xml tag children to string + """ + + res = b"" + + for child in root: + res += ET.tostring(child) + + return res + + +def process_image(el: Element, tokens: Document_Tokens) -> None: + + """ + Converts fb2 \ to html \. Replaces xlink:href with src="\" + """ + + el.tag = "img" + + href = el.get(HREF) + el.attrib.pop(HREF) + + el.set("src", tokens[href[1:]] if href[0] == "#" else href) + + +tag_replacement = { + "empty-line": "br", + "emphasis": "em", + "strikethrough": "strike", + "v": "p", +} + +tag_with_class = { + "subtitle": "p", + "cite": "div", + "poem": "div", + "stanza": "div", + "poem": "div", + "poem": "div", + "epigraph": "div", + "text-author": "p", +} + + +def process_content(root: Element, tokens: Document_Tokens) -> None: + + """ + Converts fb2 xml tag names to html equivalents and my own styled elements. + Resolves binary data dependencies + """ + + for child in root: + process_content(child, tokens) + + if child.tag == "a": + href = child.get(HREF) + child.attrib.pop(HREF) + child.set("href", href) + + if child.tag == "image": + process_image(child, tokens) + + elif child.tag in tag_replacement.keys(): + child.tag = tag_replacement[child.tag] + + elif child.tag in tag_with_class.keys(): + child.set("class", child.tag) + child.tag = tag_with_class[child.tag] diff --git a/app/utils.py b/app/utils.py new file mode 100644 index 0000000..60a2a52 --- /dev/null +++ b/app/utils.py @@ -0,0 +1,3 @@ +from typing import Union + +Document_Tokens = dict[str, Union[str, dict[str, str]]] \ No newline at end of file