Added fb2 xml to html conversion
This commit is contained in:
parent
f717291823
commit
4eae488f45
@ -6,6 +6,7 @@ from ebooklib import epub
|
||||
|
||||
from tempfile import SpooledTemporaryFile
|
||||
|
||||
from .utils import Document_Tokens
|
||||
|
||||
async def epub2html(file: SpooledTemporaryFile) -> str:
|
||||
|
||||
@ -27,7 +28,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
|
||||
return "Error! Wrong epub file format: " + str(e)
|
||||
|
||||
|
||||
async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
||||
async def epub_to_tokens(file: SpooledTemporaryFile) -> Document_Tokens:
|
||||
|
||||
"""
|
||||
Passes file content to EbookLib library and parses epub tokens into dict of the following format:
|
||||
|
142
app/fb2.py
142
app/fb2.py
@ -3,6 +3,8 @@ import xml.etree.ElementTree as ET
|
||||
from xml.etree.ElementTree import Element
|
||||
from typing import Optional
|
||||
|
||||
from .utils import Document_Tokens
|
||||
|
||||
|
||||
namespaces = {
|
||||
"": "http://www.gribuser.ru/xml/fictionbook/2.0",
|
||||
@ -20,22 +22,19 @@ async def fb22html(file: SpooledTemporaryFile) -> str:
|
||||
try:
|
||||
|
||||
tokens = fb22tokens(file)
|
||||
...
|
||||
# TODO: join tokens to HTML
|
||||
html_content = ""
|
||||
...
|
||||
html_content = fb2body2html(tokens)
|
||||
|
||||
print(tokens.keys())
|
||||
return html_content
|
||||
|
||||
except Exception as e:
|
||||
return "Error! Wrong FB2 file format: " + str(e)
|
||||
|
||||
|
||||
def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
||||
def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
|
||||
|
||||
"""
|
||||
Parses fb2 file as xml document. It puts book metadata, its content and media to `tokens` dictionary and returns it.
|
||||
Parses fb2 file as xml document.
|
||||
It puts book metadata, its content and media to `tokens` dictionary and returns it.
|
||||
|
||||
`tokens` format:
|
||||
|
||||
@ -46,7 +45,10 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
||||
"\<asset_id\>": "\<base64_data\>" }
|
||||
"""
|
||||
|
||||
tokens = {"metadata": {}, "content": b""}
|
||||
tokens = {
|
||||
"metadata": {},
|
||||
"content": b"<root>",
|
||||
}
|
||||
|
||||
book = ET.parse(file)
|
||||
root = book.getroot()
|
||||
@ -74,6 +76,8 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
||||
for body in bodies:
|
||||
tokens["content"] += ET.tostring(body).replace(b"ns0:", b"")
|
||||
|
||||
tokens["content"] += b"</root>"
|
||||
|
||||
# Reading assets
|
||||
|
||||
for asset in assets:
|
||||
@ -92,7 +96,11 @@ def get_author(author: Element) -> str:
|
||||
"""
|
||||
|
||||
res = []
|
||||
for tag_name in ("first-name", "middle-name", "last-name"):
|
||||
for tag_name in (
|
||||
"first-name",
|
||||
"middle-name",
|
||||
"last-name",
|
||||
):
|
||||
el = author.find(tag_name, namespaces)
|
||||
if not el is None:
|
||||
res.append(el.text)
|
||||
@ -112,3 +120,119 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]:
|
||||
|
||||
if coverpage:
|
||||
return coverpage.find("image", namespaces).get(HREF)
|
||||
|
||||
|
||||
def fb2body2html(tokens: Document_Tokens) -> bytes:
    """Convert the fb2 xml stored in ``tokens["content"]`` to html.

    Parses the ``<root>``-wrapped content produced by fb22tokens and joins
    every ``<body>`` element, rendered by process_section, into one byte
    string.

    :param tokens: parsed document tokens; ``tokens["content"]`` must be a
        well-formed xml document with ``<body>`` children under its root.
    :return: concatenated html fragments as bytes (the original ``-> str``
        annotation was wrong — every piece appended here is ``bytes``).
    """
    xml_root = ET.fromstring(tokens["content"])
    # b"".join avoids quadratic bytes concatenation in a += loop.
    return b"".join(
        process_section(body, tokens) for body in xml_root.iterfind("./body")
    )
|
||||
|
||||
|
||||
def process_section(body: Element, tokens: Document_Tokens) -> bytes:
    """Render one fb2 ``<body>``/``<section>`` element to an html fragment.

    Recursively walks the section tree; leaf sections are converted in place
    by process_content and serialized, nested sections recurse.

    :param body: the fb2 element to render.
    :param tokens: document tokens, passed through for asset resolution.
    :return: html fragment as bytes (the original ``-> str`` annotation was
        wrong — this function builds and returns ``bytes``).
    """
    parts = [b"<section>\n"]

    for tag in ("title", "epigraph", "annotation"):
        el = body.find("./" + tag)
        # Compare against None: truth-testing an Element is False when it has
        # no child elements, which would wrongly skip found-but-childless tags.
        if el is not None:
            process_content(el, tokens)
            parts.append(children_to_html(el))

    image = body.find("./image")
    # BUG FIX: `if image:` was always False here — fb2 <image/> is
    # self-closing and therefore childless/falsy, so body-level images were
    # silently dropped. Presence must be tested with `is not None`.
    if image is not None:
        process_image(image, tokens)
        parts.append(ET.tostring(image))

    for section in body.findall("./section"):
        # Same None-test fix: an empty <section/> child still means this
        # section contains subsections and must be recursed into.
        if section.find("./section") is not None:
            parts.append(process_section(section, tokens))
        else:
            process_content(section, tokens)
            parts.append(b"<section>\n" + children_to_html(section) + b"</section>\n")

    parts.append(b"</section>\n")
    return b"".join(parts)
|
||||
|
||||
|
||||
def children_to_html(root: Element) -> bytes:
    """Serialize every direct child of *root* and concatenate the results.

    :param root: parent element whose children are serialized; *root* itself
        is not included in the output.
    :return: concatenated serialized children as bytes (the original
        ``-> str`` annotation was wrong — ``ET.tostring`` returns bytes by
        default, and the empty result is ``b""``).
    """
    # b"".join is linear, unlike repeated bytes +=.
    return b"".join(ET.tostring(child) for child in root)
|
||||
|
||||
|
||||
def process_image(el: Element, tokens: Document_Tokens) -> None:
    """Convert an fb2 ``<image/>`` element to an html ``<img/>`` in place.

    Replaces the xlink ``href`` attribute with ``src``; a ``#``-prefixed
    reference is resolved to the asset data stored under that id in *tokens*
    (presumably base64 data per the fb22tokens format — confirm), while any
    other reference is passed through unchanged.

    :param el: the ``<image>`` element, mutated in place.
    :param tokens: document tokens holding asset data keyed by asset id.
    """
    el.tag = "img"

    # pop with a default both removes the attribute and avoids the KeyError
    # the old get()+pop() pair raised when the attribute was missing; it also
    # prevents href[0] blowing up on None or "".
    href = el.attrib.pop(HREF, None)
    if not href:
        return

    el.set("src", tokens[href[1:]] if href.startswith("#") else href)
|
||||
|
||||
|
||||
# fb2 tag names replaced one-for-one with their html equivalents
# (e.g. <empty-line/> becomes <br/>); consumed by process_content.
tag_replacement = {
    "empty-line": "br",
    "emphasis": "em",
    "strikethrough": "strike",
    "v": "p",
}
|
||||
|
||||
# fb2 tags mapped to a generic html tag while keeping the original fb2 name
# as a css class (applied by process_content), so the elements stay stylable.
# The original literal repeated the "poem" key three times; Python silently
# keeps only the last duplicate, so deduplicating changes nothing at runtime
# but removes the misleading entries.
tag_with_class = {
    "subtitle": "p",
    "cite": "div",
    "poem": "div",
    "stanza": "div",
    "epigraph": "div",
    "text-author": "p",
}
|
||||
|
||||
|
||||
def process_content(root: Element, tokens: Document_Tokens) -> None:
    """Recursively rewrite fb2 tags under *root* to html equivalents in place.

    Renames link/image attributes, swaps fb2 tag names via tag_replacement,
    and applies tag_with_class mappings (keeping the fb2 name as a css
    class). Binary dependencies are resolved through process_image.

    :param root: element whose subtree is rewritten; mutated in place.
    :param tokens: document tokens used to resolve image asset references.
    """
    for child in root:
        process_content(child, tokens)

        if child.tag == "a":
            # Rename the xlink href attribute to plain html href. pop with a
            # default avoids the KeyError/TypeError the old get()+pop()+set()
            # sequence raised on an <a> without the attribute.
            href = child.attrib.pop(HREF, None)
            if href is not None:
                child.set("href", href)
        elif child.tag == "image":
            process_image(child, tokens)
        elif child.tag in tag_replacement:
            child.tag = tag_replacement[child.tag]
        elif child.tag in tag_with_class:
            # Preserve the fb2 tag name as a class for styling.
            child.set("class", child.tag)
            child.tag = tag_with_class[child.tag]
|
||||
|
3
app/utils.py
Normal file
3
app/utils.py
Normal file
@ -0,0 +1,3 @@
|
||||
from typing import Union
|
||||
|
||||
# Parsed document tokens: "metadata" maps to a nested str->str dict,
# "content" holds the assembled document body — fb22tokens builds it as
# bytes (starting from b"<root>") — and asset ids map to their data.
# `bytes` is added to the union because the original alias did not admit
# the bytes content fb22tokens actually stores; widening is backward-
# compatible for all existing annotations.
Document_Tokens = dict[str, Union[str, bytes, dict[str, str]]]
|
Loading…
x
Reference in New Issue
Block a user