backend/app/fb2.py
dm1sh 87e5a16a06
Fixed empty cover error
Updated deployment instructions
Switched to alpine python image
2023-09-17 16:40:26 +03:00

255 lines
6.1 KiB
Python

"""
Module for FB2 file conversion to html
"""
import html
import xml.etree.ElementTree as ET
from tempfile import SpooledTemporaryFile
from typing import Optional
from xml.etree.ElementTree import Element
from fastapi import HTTPException
from .utils import DocumentTokens, HTMLBook, strip_whitespace
namespaces = {
"": "http://www.gribuser.ru/xml/fictionbook/2.0",
"xlink": "http://www.w3.org/1999/xlink",
}
HREF = f"{{{namespaces['xlink']}}}href"
async def fb22html(file: SpooledTemporaryFile) -> HTMLBook:
"""
Splits fb2 to tokens and joins them to one html file
"""
try:
tokens = fb22tokens(file)
set_cover(tokens)
html_content = fb2body2html(tokens)
return {
**(tokens["metadata"]),
"content": html.unescape(html_content.decode()),
}
except Exception as err:
raise HTTPException(
status_code=500, detail="Error! Wrong fb2 file format: " + str(err)
) from err
def fb22tokens(file: SpooledTemporaryFile) -> DocumentTokens:
r"""
Parses fb2 file as xml document.
It puts book metadata, its content and media to `tokens` dictionary and returns it.
`tokens` format:
{ "metadata": { ... },
"content": "\<string\>",
"\<asset_id\>": "\<base64_data\>" }
"""
tokens = {
"metadata": {
"title": "",
"author": "",
},
"content": b"<root>",
}
book = ET.parse(file)
root = book.getroot()
description = root.find("./description", namespaces)
bodies = root.findall("./body", namespaces)
assets = root.findall("./binary", namespaces)
# Reading book metadata
book_info = description.find("./title-info", namespaces)
if book_info:
metadata = {}
metadata["title"] = book_info.find("./book-title", namespaces).text
metadata["author"] = get_author(book_info.find("./author", namespaces))
metadata["cover"] = get_cover(
book_info.find("./coverpage", namespaces))
if "cover" not in metadata.keys():
metadata.pop("cover")
if len(metadata.keys()) != 0:
tokens["metadata"] = metadata.copy()
# Reading book content
for body in bodies:
tokens["content"] += ET.tostring(body).replace(b"ns0:", b"")
tokens["content"] += b"</root>"
# Reading assets
for asset in assets:
key = asset.get("id")
media_type = asset.get("content-type")
b64_content = asset.text
tokens[key] = f"data:{media_type};base64,{b64_content}"
return tokens
def get_author(author: Element) -> str:
"""
Converts author xml structure to string
"""
res = []
for tag_name in (
"first-name",
"middle-name",
"last-name",
):
tag = author.find("./" + tag_name, namespaces)
if tag is not None and tag.text is not None:
res.append(tag.text)
if len(res) == 0:
res = author.find("./nickname", namespaces).text
else:
res = " ".join(res)
return res
def get_cover(coverpage: Optional[Element]) -> Optional[str]:
"""
Extracts cover image id if exists
"""
if coverpage:
return coverpage.find("./image", namespaces).get(HREF)
return None
def set_cover(tokens: DocumentTokens) -> None:
"""Gets cover from book and sets it in metadata"""
cover = tokens["metadata"]["cover"]
if cover is None:
tokens["metadata"]["cover"] = "none"
elif cover[0] == "#":
tokens["metadata"]["cover"] = tokens[cover[1:]]
def fb2body2html(tokens: DocumentTokens) -> str:
"""
Convert fb2 xml to html, joins bodies into one string
"""
res = b""
xml_root = ET.fromstring(strip_whitespace(tokens["content"]))
for body in xml_root.iterfind("./body"):
res += process_section(body, tokens)
return res
def process_section(body: Element, tokens: DocumentTokens) -> str:
"""
Processes individual sections, recursively goes throw sections tree
"""
res = b"<section>\n"
for tag_name in ("title", "epigraph", "annotation"):
tag = body.find("./" + tag_name)
if tag:
process_content(tag, tokens)
res += children_to_html(tag)
image = body.find("./image")
if image:
process_image(image, tokens)
res += ET.tostring(image)
for section in body.findall("./section"):
if section.find("./section"):
res += process_section(section, tokens)
else:
process_content(section, tokens)
res += b"<section>\n" + children_to_html(section) + b"</section>\n"
return res + b"</section>\n"
def children_to_html(root: Element) -> str:
"""
Converts xml tag children to string
"""
res = b""
for child in root:
res += ET.tostring(child)
return res
def process_image(element: Element, tokens: DocumentTokens) -> None:
r"""
Converts fb2 \<image /\> to html \<img /\>. Replaces xlink:href with src="\<base64_image_data\>"
"""
element.tag = "img"
href = element.get(HREF)
element.attrib.pop(HREF)
element.set("src", tokens[href[1:]] if href[0] == "#" else href)
tag_replacement = {
"empty-line": "br",
"emphasis": "em",
"strikethrough": "strike",
"v": "p",
}
tag_with_class = {
"subtitle": "p",
"cite": "div",
"poem": "div",
"stanza": "div",
"epigraph": "div",
"text-author": "p",
}
def process_content(root: Element, tokens: DocumentTokens) -> None:
"""
Converts fb2 xml tag names to html equivalents and my own styled elements.
Resolves binary data dependencies
"""
for child in root:
process_content(child, tokens)
if child.tag == "a":
href = child.get(HREF)
child.attrib.pop(HREF)
child.set("href", href)
if child.tag == "image":
process_image(child, tokens)
elif child.tag in tag_replacement.keys():
child.tag = tag_replacement[child.tag]
elif child.tag in tag_with_class.keys():
child.set("class", child.tag)
child.tag = tag_with_class[child.tag]