262 lines
6.1 KiB
Python
262 lines
6.1 KiB
Python
"""
|
|
Module for FB2 file conversion to html
|
|
"""
|
|
|
|
import html
|
|
import xml.etree.ElementTree as ET
|
|
from tempfile import SpooledTemporaryFile
|
|
from typing import Optional
|
|
from xml.etree.ElementTree import Element
|
|
|
|
from fastapi import HTTPException
|
|
|
|
from .utils import DocumentTokens, HTMLBook, strip_whitespace
|
|
|
|
namespaces = {
|
|
"": "http://www.gribuser.ru/xml/fictionbook/2.0",
|
|
"xlink": "http://www.w3.org/1999/xlink",
|
|
}
|
|
HREF = f"{{{namespaces['xlink']}}}href"
|
|
|
|
|
|
async def fb22html(file: SpooledTemporaryFile) -> HTMLBook:
|
|
|
|
"""
|
|
Splits fb2 to tokens and joins them to one html file
|
|
"""
|
|
|
|
try:
|
|
tokens = fb22tokens(file)
|
|
set_cover(tokens)
|
|
html_content = fb2body2html(tokens)
|
|
|
|
return {
|
|
**(tokens["metadata"]),
|
|
"content": html.escape(html.unescape(html_content.decode())),
|
|
}
|
|
|
|
except Exception as err:
|
|
raise HTTPException(
|
|
status_code=500, detail="Error! Wrong fb2 file format: " + str(err)
|
|
) from err
|
|
|
|
|
|
def fb22tokens(file: SpooledTemporaryFile) -> DocumentTokens:
|
|
|
|
r"""
|
|
Parses fb2 file as xml document.
|
|
It puts book metadata, its content and media to `tokens` dictionary and returns it.
|
|
|
|
`tokens` format:
|
|
|
|
{ "metadata": { ... },
|
|
|
|
"content": "\<string\>",
|
|
|
|
"\<asset_id\>": "\<base64_data\>" }
|
|
"""
|
|
|
|
tokens = {
|
|
"metadata": {
|
|
"title": "",
|
|
"author": "",
|
|
},
|
|
"content": b"<root>",
|
|
}
|
|
|
|
book = ET.parse(file)
|
|
root = book.getroot()
|
|
|
|
description = root.find("./description", namespaces)
|
|
bodies = root.findall("./body", namespaces)
|
|
assets = root.findall("./binary", namespaces)
|
|
|
|
# Reading book metadata
|
|
|
|
book_info = description.find("./title-info", namespaces)
|
|
if book_info:
|
|
metadata = {}
|
|
metadata["title"] = book_info.find("./book-title", namespaces).text
|
|
metadata["author"] = get_author(book_info.find("./author", namespaces))
|
|
metadata["cover"] = get_cover(book_info.find("./coverpage", namespaces))
|
|
if "cover" not in metadata.keys():
|
|
metadata.pop("cover")
|
|
|
|
if len(metadata.keys()) != 0:
|
|
tokens["metadata"] = metadata.copy()
|
|
|
|
# Reading book content
|
|
|
|
for body in bodies:
|
|
tokens["content"] += ET.tostring(body).replace(b"ns0:", b"")
|
|
|
|
tokens["content"] += b"</root>"
|
|
|
|
# Reading assets
|
|
|
|
for asset in assets:
|
|
key = asset.get("id")
|
|
media_type = asset.get("content-type")
|
|
b64_content = asset.text
|
|
tokens[key] = f"data:{media_type};base64,{b64_content}"
|
|
|
|
return tokens
|
|
|
|
|
|
def get_author(author: Element) -> str:
|
|
|
|
"""
|
|
Converts author xml structure to string
|
|
"""
|
|
|
|
res = []
|
|
for tag_name in (
|
|
"first-name",
|
|
"middle-name",
|
|
"last-name",
|
|
):
|
|
tag = author.find("./" + tag_name, namespaces)
|
|
if tag is not None:
|
|
res.append(tag.text)
|
|
if len(res) == 0:
|
|
res = author.find("./nickname", namespaces).text
|
|
else:
|
|
res = " ".join(res)
|
|
|
|
return res
|
|
|
|
|
|
def get_cover(coverpage: Optional[Element]) -> Optional[str]:
|
|
|
|
"""
|
|
Extracts cover image id if exists
|
|
"""
|
|
|
|
if coverpage:
|
|
return coverpage.find("./image", namespaces).get(HREF)
|
|
|
|
return None
|
|
|
|
|
|
def set_cover(tokens: DocumentTokens) -> None:
|
|
"""Gets cover from book and sets it in metadata"""
|
|
cover = tokens["metadata"]["cover"]
|
|
if cover is None:
|
|
tokens["metadata"]["cover"] = "none"
|
|
elif cover[0] == "#":
|
|
tokens["metadata"]["cover"] = tokens[cover[1:]]
|
|
|
|
|
|
def fb2body2html(tokens: DocumentTokens) -> str:
|
|
|
|
"""
|
|
Convert fb2 xml to html, joins bodies into one string
|
|
"""
|
|
|
|
res = b""
|
|
|
|
xml_root = ET.fromstring(strip_whitespace(tokens["content"]))
|
|
for body in xml_root.iterfind("./body"):
|
|
res += process_section(body, tokens)
|
|
|
|
return res
|
|
|
|
|
|
def process_section(body: Element, tokens: DocumentTokens) -> str:
|
|
|
|
"""
|
|
Processes individual sections, recursively goes throw sections tree
|
|
"""
|
|
|
|
res = b"<section>\n"
|
|
|
|
for tag_name in ("title", "epigraph", "annotation"):
|
|
tag = body.find("./" + tag_name)
|
|
if tag:
|
|
process_content(tag, tokens)
|
|
res += children_to_html(tag)
|
|
image = body.find("./image")
|
|
if image:
|
|
process_image(image, tokens)
|
|
res += ET.tostring(image)
|
|
|
|
for section in body.findall("./section"):
|
|
if section.find("./section"):
|
|
res += process_section(section, tokens)
|
|
else:
|
|
process_content(section, tokens)
|
|
res += b"<section>\n" + children_to_html(section) + b"</section>\n"
|
|
|
|
return res + b"</section>\n"
|
|
|
|
|
|
def children_to_html(root: Element) -> str:
|
|
|
|
"""
|
|
Converts xml tag children to string
|
|
"""
|
|
|
|
res = b""
|
|
|
|
for child in root:
|
|
res += ET.tostring(child)
|
|
|
|
return res
|
|
|
|
|
|
def process_image(element: Element, tokens: DocumentTokens) -> None:
|
|
|
|
r"""
|
|
Converts fb2 \<image /\> to html \<img /\>. Replaces xlink:href with src="\<base64_image_data\>"
|
|
"""
|
|
|
|
element.tag = "img"
|
|
|
|
href = element.get(HREF)
|
|
element.attrib.pop(HREF)
|
|
|
|
element.set("src", tokens[href[1:]] if href[0] == "#" else href)
|
|
|
|
|
|
tag_replacement = {
|
|
"empty-line": "br",
|
|
"emphasis": "em",
|
|
"strikethrough": "strike",
|
|
"v": "p",
|
|
}
|
|
|
|
tag_with_class = {
|
|
"subtitle": "p",
|
|
"cite": "div",
|
|
"poem": "div",
|
|
"stanza": "div",
|
|
"epigraph": "div",
|
|
"text-author": "p",
|
|
}
|
|
|
|
|
|
def process_content(root: Element, tokens: DocumentTokens) -> None:
|
|
|
|
"""
|
|
Converts fb2 xml tag names to html equivalents and my own styled elements.
|
|
Resolves binary data dependencies
|
|
"""
|
|
|
|
for child in root:
|
|
process_content(child, tokens)
|
|
|
|
if child.tag == "a":
|
|
href = child.get(HREF)
|
|
child.attrib.pop(HREF)
|
|
child.set("href", href)
|
|
|
|
if child.tag == "image":
|
|
process_image(child, tokens)
|
|
|
|
elif child.tag in tag_replacement.keys():
|
|
child.tag = tag_replacement[child.tag]
|
|
|
|
elif child.tag in tag_with_class.keys():
|
|
child.set("class", child.tag)
|
|
child.tag = tag_with_class[child.tag]
|