115 lines
2.7 KiB
Python
115 lines
2.7 KiB
Python
from tempfile import SpooledTemporaryFile
|
|
import xml.etree.ElementTree as ET
|
|
from xml.etree.ElementTree import Element
|
|
from typing import Optional
|
|
|
|
|
|
# XML namespaces referenced when querying the parsed fb2 tree. The empty
# prefix maps the default namespace, so unprefixed tag names passed to
# find()/findall() together with this mapping resolve into it.
namespaces = {
    "": "http://www.gribuser.ru/xml/fictionbook/2.0",
    "xlink": "http://www.w3.org/1999/xlink",
}

# Fully qualified ("{namespace-uri}local-name") form of the xlink:href attribute.
HREF = "{%s}href" % namespaces["xlink"]
|
|
|
|
|
|
async def fb22html(file: SpooledTemporaryFile) -> str:
    """
    Split an fb2 file into tokens and join them into one HTML document.

    The file is tokenized with `fb22tokens`. Joining the tokens into HTML is
    not implemented yet, so an empty string is returned on success.

    Returns the HTML content, or a message starting with
    "Error! Wrong FB2 file format: " followed by the exception text when
    tokenizing raises.
    """
    # Function-scope import so this block does not depend on file-level imports.
    import logging

    try:
        tokens = fb22tokens(file)
    except Exception as e:
        # Boundary handler: any parsing failure is reported to the caller as
        # an error string instead of propagating.
        return "Error! Wrong FB2 file format: " + str(e)

    # Debug-level log instead of printing to stdout.
    logging.getLogger(__name__).debug("fb2 tokens: %s", list(tokens))

    # TODO: join tokens to HTML
    html_content = ""
    return html_content
|
|
|
|
|
|
def fb22tokens(file: SpooledTemporaryFile) -> dict[str, object]:
    """
    Parse an fb2 file as an XML document and collect its parts into `tokens`.

    `tokens` format:

        { "metadata": { "title": ..., "author": ..., "cover": ... },
          "content": b"<serialized body elements>",
          "<asset_id>": "data:<content-type>;base64,<base64_data>" }

    Metadata keys are present only when the corresponding element exists in
    the document's `description/title-info`. "content" is a bytes object
    holding every `body` element serialized one after another.

    Raises whatever `ET.parse` raises for malformed XML.
    """
    tokens = {"metadata": {}, "content": b""}

    # NOTE(review): the standard-library XML parser is not hardened against
    # maliciously constructed documents (see "XML vulnerabilities" in the
    # Python docs). If `file` is an untrusted upload, consider a hardened parser.
    root = ET.parse(file).getroot()

    description = root.find("./description", namespaces)
    bodies = root.findall("./body", namespaces)
    assets = root.findall("./binary", namespaces)

    # Reading book metadata
    book_info = None
    if description is not None:
        # The namespace map is required here too; without it the lookup uses
        # an un-namespaced tag and never matches in a namespaced document.
        book_info = description.find("title-info", namespaces)

    # Compare against None explicitly: the truth value of an Element depends
    # on its number of children, not on whether it was found.
    if book_info is not None:
        metadata = {}

        title_el = book_info.find("book-title", namespaces)
        if title_el is not None and title_el.text:
            metadata["title"] = title_el.text

        author_el = book_info.find("author", namespaces)
        if author_el is not None:
            metadata["author"] = get_author(author_el)

        cover = get_cover(book_info.find("coverpage", namespaces))
        if cover is not None:
            metadata["cover"] = cover

        if metadata:
            tokens["metadata"] = metadata

    # Reading book content
    # ET.tostring() emits the document namespace under a generated "ns0"
    # prefix; the prefix is stripped so tags serialize as plain names.
    # NOTE(review): this also rewrites any literal "ns0:" inside book text.
    tokens["content"] = b"".join(
        ET.tostring(body).replace(b"ns0:", b"") for body in bodies
    )

    # Reading assets
    for asset in assets:
        key = asset.get("id")
        # Skip assets that cannot be addressed, and ids that would overwrite
        # the reserved "metadata" / "content" entries.
        if key is None or key in ("metadata", "content"):
            continue
        media_type = asset.get("content-type")
        b64_content = asset.text or ""
        tokens[key] = f"data:{media_type};base64,{b64_content}"

    return tokens
|
|
|
|
|
|
def get_author(author: Optional[Element]) -> str:
    """
    Convert an author xml structure to a display string.

    Joins the non-empty texts of the `first-name`, `middle-name` and
    `last-name` children with single spaces. When none of them carries text,
    falls back to the text of the `nickname` child.

    Returns an empty string when `author` is None or when no usable name
    element is present.
    """
    if author is None:
        return ""

    name_parts = []
    for tag_name in ("first-name", "middle-name", "last-name"):
        # findtext() yields "" both for a missing element (the default given
        # here) and for an element without text, so empty parts are skipped
        # below and never reach the join.
        part = author.findtext(tag_name, "", namespaces).strip()
        if part:
            name_parts.append(part)

    if name_parts:
        return " ".join(name_parts)

    return author.findtext("nickname", "", namespaces).strip()
|
|
|
|
|
|
def get_cover(coverpage: Optional[Element]) -> Optional[str]:
    """
    Extract the cover image reference if it exists.

    Returns the value of the `xlink:href` attribute of the `image` child of
    `coverpage`, or None when `coverpage` is None, has no `image` child, or
    the image has no such attribute.

    NOTE(review): the href is returned unchanged; if it has the form
    "#<binary id>", callers presumably need to drop the leading "#" before
    looking the asset up by id - confirm against the caller.
    """
    # Compare against None explicitly: the truth value of an Element depends
    # on its number of children, not on whether it was found.
    if coverpage is None:
        return None

    image = coverpage.find("image", namespaces)
    if image is None:
        return None

    return image.get(HREF)
|