Added fb2 xml to html conversion

This commit is contained in:
Dmitriy Shishkov 2021-07-02 03:10:03 +05:00
parent f717291823
commit 4eae488f45
No known key found for this signature in database
GPG Key ID: 14358F96FCDD8060
3 changed files with 138 additions and 10 deletions

View File

@ -6,6 +6,7 @@ from ebooklib import epub
from tempfile import SpooledTemporaryFile
from .utils import Document_Tokens
async def epub2html(file: SpooledTemporaryFile) -> str:
@ -27,7 +28,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
return "Error! Wrong epub file format: " + str(e)
async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
async def epub_to_tokens(file: SpooledTemporaryFile) -> Document_Tokens:
"""
Passes file content to EbookLib library and parses epub tokens into dict of the following format:

View File

@ -3,6 +3,8 @@ import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element
from typing import Optional
from .utils import Document_Tokens
namespaces = {
"": "http://www.gribuser.ru/xml/fictionbook/2.0",
@ -20,22 +22,19 @@ async def fb22html(file: SpooledTemporaryFile) -> str:
try:
tokens = fb22tokens(file)
...
# TODO: join tokens to HTML
html_content = ""
...
html_content = fb2body2html(tokens)
print(tokens.keys())
return html_content
except Exception as e:
return "Error! Wrong FB2 file format: " + str(e)
def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
"""
Parses fb2 file as xml document. It puts book metadata, its content and media to `tokens` dictionary and returns it.
Parses fb2 file as xml document.
It puts book metadata, its content and media to `tokens` dictionary and returns it.
`tokens` format:
@ -46,7 +45,10 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
"\<asset_id\>": "\<base64_data\>" }
"""
tokens = {"metadata": {}, "content": b""}
tokens = {
"metadata": {},
"content": b"<root>",
}
book = ET.parse(file)
root = book.getroot()
@ -74,6 +76,8 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
for body in bodies:
tokens["content"] += ET.tostring(body).replace(b"ns0:", b"")
tokens["content"] += b"</root>"
# Reading assets
for asset in assets:
@ -92,7 +96,11 @@ def get_author(author: Element) -> str:
"""
res = []
for tag_name in ("first-name", "middle-name", "last-name"):
for tag_name in (
"first-name",
"middle-name",
"last-name",
):
el = author.find(tag_name, namespaces)
if not el is None:
res.append(el.text)
@ -112,3 +120,119 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]:
if coverpage:
return coverpage.find("image", namespaces).get(HREF)
def fb2body2html(tokens: Document_Tokens) -> bytes:
    """
    Convert fb2 xml to html, joins bodies into one string.

    Parses `tokens["content"]` as xml, renders every `<body>` that is a direct
    child of the parsed root element with `process_section` and concatenates
    the results in document order.

    Returns the html markup as bytes (the type `process_section` produces);
    an input without any `<body>` yields `b""`.
    """
    xml_root = ET.fromstring(tokens["content"])
    # join instead of repeated `+=` so that the bytes are copied only once
    return b"".join(
        process_section(body, tokens) for body in xml_root.iterfind("./body")
    )
def process_section(body: Element, tokens: Document_Tokens) -> bytes:
    """
    Processes individual sections, recursively goes through the sections tree.

    For `body` itself only the children of its direct `title`, `epigraph` and
    `annotation` elements, its direct `image` and its direct `section` elements
    are rendered. A section that contains nested sections is rendered
    recursively; a section without nested sections is converted in place with
    `process_content` and all of its children are emitted.

    Elements are modified in place while they are converted.
    Returns the html markup wrapped in `<section>` ... `</section>` as bytes.
    """
    parts = [b"<section>\n"]

    for tag in ("title", "epigraph", "annotation"):
        el = body.find("./" + tag)
        # Explicit None check: an Element without children is falsy
        if el is not None:
            process_content(el, tokens)
            parts.append(children_to_html(el))

    image = body.find("./image")
    # <image /> has no children, so a plain truth test would always skip it
    if image is not None:
        process_image(image, tokens)
        parts.append(ET.tostring(image))

    for section in body.findall("./section"):
        if section.find("./section") is not None:
            parts.append(process_section(section, tokens))
        else:
            process_content(section, tokens)
            parts.append(
                b"<section>\n" + children_to_html(section) + b"</section>\n"
            )

    parts.append(b"</section>\n")
    return b"".join(parts)
def children_to_html(root: Element) -> bytes:
    """
    Converts xml tag children to a single bytes string.

    Every direct child of `root` is serialized with `ET.tostring` (which also
    includes the text that follows the child) and the pieces are concatenated
    in document order. `root` itself - its tag, attributes and leading text -
    is not part of the result.

    Returns `b""` when `root` has no children.
    """
    return b"".join(ET.tostring(child) for child in root)
def process_image(el: Element, tokens: Document_Tokens) -> None:
    """
    Converts fb2 <image /> to html <img /> in place.

    The xlink:href attribute is removed and replaced with `src`:
    a reference of the form "#<asset_id>" is resolved to `tokens["<asset_id>"]`,
    any other reference is copied as is.

    An element whose reference is missing or empty is only renamed and gets
    no `src`. A "#<asset_id>" reference that is absent from `tokens` raises
    KeyError.
    """
    el.tag = "img"

    # pop with a default so that an <image> without xlink:href does not raise
    href = el.attrib.pop(HREF, None)
    if not href:
        # nothing to resolve: leave the <img> without src
        return

    el.set("src", tokens[href[1:]] if href.startswith("#") else href)
# fb2 tags that are simply renamed to an html tag
tag_replacement = {
    "empty-line": "br",
    "emphasis": "em",
    "strikethrough": "strike",
    "v": "p",
}

# fb2 tags that are renamed to a generic html tag;
# the original fb2 tag name is kept as the value of the `class` attribute
tag_with_class = {
    "subtitle": "p",
    "cite": "div",
    "poem": "div",
    "stanza": "div",
    "epigraph": "div",
    "text-author": "p",
}
def process_content(root: Element, tokens: Document_Tokens) -> None:
    """
    Converts fb2 xml tag names to html equivalents and my own styled elements.
    Resolves binary data dependencies.

    Walks all descendants of `root` and modifies them in place (`root` itself
    is left untouched):

    - `a`: the xlink:href attribute is replaced with a plain `href`
    - `image`: converted with `process_image`
    - tags listed in `tag_replacement`: renamed to the mapped html tag
    - tags listed in `tag_with_class`: renamed to the mapped html tag, the
      original tag name becomes the value of the `class` attribute
    """
    for child in root:
        # Depth first: descendants are converted before the child is renamed
        process_content(child, tokens)

        if child.tag == "a":
            # pop with a default so that a link without xlink:href does not raise
            href = child.attrib.pop(HREF, None)
            if href is not None:
                child.set("href", href)
        elif child.tag == "image":
            process_image(child, tokens)
        elif child.tag in tag_replacement:
            child.tag = tag_replacement[child.tag]
        elif child.tag in tag_with_class:
            child.set("class", child.tag)
            child.tag = tag_with_class[child.tag]

3
app/utils.py Normal file
View File

@ -0,0 +1,3 @@
from typing import Union
# Type of the token dictionaries used as the return annotation of the document
# parsers (`epub_to_tokens`, `fb22tokens`): string keys that map either to a
# string or to a nested str -> str dictionary.
# NOTE(review): `fb22tokens` stores bytes under "content", which this alias
# does not allow - confirm whether `bytes` should be added to the Union.
Document_Tokens = dict[str, Union[str, dict[str, str]]]