Added fb2 xml to html conversion
This commit is contained in:
parent
f717291823
commit
4eae488f45
@ -6,6 +6,7 @@ from ebooklib import epub
|
|||||||
|
|
||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
|
|
||||||
|
from .utils import Document_Tokens
|
||||||
|
|
||||||
async def epub2html(file: SpooledTemporaryFile) -> str:
|
async def epub2html(file: SpooledTemporaryFile) -> str:
|
||||||
|
|
||||||
@ -27,7 +28,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
|
|||||||
return "Error! Wrong epub file format: " + str(e)
|
return "Error! Wrong epub file format: " + str(e)
|
||||||
|
|
||||||
|
|
||||||
async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
async def epub_to_tokens(file: SpooledTemporaryFile) -> Document_Tokens:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Passes file content to EbookLib library and parses epub tokens into dict of the following format:
|
Passes file content to EbookLib library and parses epub tokens into dict of the following format:
|
||||||
|
142
app/fb2.py
142
app/fb2.py
@ -3,6 +3,8 @@ import xml.etree.ElementTree as ET
|
|||||||
from xml.etree.ElementTree import Element
|
from xml.etree.ElementTree import Element
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
from .utils import Document_Tokens
|
||||||
|
|
||||||
|
|
||||||
namespaces = {
|
namespaces = {
|
||||||
"": "http://www.gribuser.ru/xml/fictionbook/2.0",
|
"": "http://www.gribuser.ru/xml/fictionbook/2.0",
|
||||||
@ -20,22 +22,19 @@ async def fb22html(file: SpooledTemporaryFile) -> str:
|
|||||||
try:
|
try:
|
||||||
|
|
||||||
tokens = fb22tokens(file)
|
tokens = fb22tokens(file)
|
||||||
...
|
html_content = fb2body2html(tokens)
|
||||||
# TODO: join tokens to HTML
|
|
||||||
html_content = ""
|
|
||||||
...
|
|
||||||
|
|
||||||
print(tokens.keys())
|
|
||||||
return html_content
|
return html_content
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return "Error! Wrong FB2 file format: " + str(e)
|
return "Error! Wrong FB2 file format: " + str(e)
|
||||||
|
|
||||||
|
|
||||||
def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Parses fb2 file as xml document. It puts book metadata, its content and media to `tokens` dictionary and returns it.
|
Parses fb2 file as xml document.
|
||||||
|
It puts book metadata, its content and media to `tokens` dictionary and returns it.
|
||||||
|
|
||||||
`tokens` format:
|
`tokens` format:
|
||||||
|
|
||||||
@ -46,7 +45,10 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
|||||||
"\<asset_id\>": "\<base64_data\>" }
|
"\<asset_id\>": "\<base64_data\>" }
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tokens = {"metadata": {}, "content": b""}
|
tokens = {
|
||||||
|
"metadata": {},
|
||||||
|
"content": b"<root>",
|
||||||
|
}
|
||||||
|
|
||||||
book = ET.parse(file)
|
book = ET.parse(file)
|
||||||
root = book.getroot()
|
root = book.getroot()
|
||||||
@ -74,6 +76,8 @@ def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
|||||||
for body in bodies:
|
for body in bodies:
|
||||||
tokens["content"] += ET.tostring(body).replace(b"ns0:", b"")
|
tokens["content"] += ET.tostring(body).replace(b"ns0:", b"")
|
||||||
|
|
||||||
|
tokens["content"] += b"</root>"
|
||||||
|
|
||||||
# Reading assets
|
# Reading assets
|
||||||
|
|
||||||
for asset in assets:
|
for asset in assets:
|
||||||
@ -92,7 +96,11 @@ def get_author(author: Element) -> str:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
res = []
|
res = []
|
||||||
for tag_name in ("first-name", "middle-name", "last-name"):
|
for tag_name in (
|
||||||
|
"first-name",
|
||||||
|
"middle-name",
|
||||||
|
"last-name",
|
||||||
|
):
|
||||||
el = author.find(tag_name, namespaces)
|
el = author.find(tag_name, namespaces)
|
||||||
if not el is None:
|
if not el is None:
|
||||||
res.append(el.text)
|
res.append(el.text)
|
||||||
@ -112,3 +120,119 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]:
|
|||||||
|
|
||||||
if coverpage:
|
if coverpage:
|
||||||
return coverpage.find("image", namespaces).get(HREF)
|
return coverpage.find("image", namespaces).get(HREF)
|
||||||
|
|
||||||
|
|
||||||
|
def fb2body2html(tokens: Document_Tokens) -> bytes:
    """
    Convert the fb2 xml stored in `tokens["content"]` to html.

    The content is parsed as a single xml document. Every `<body>` that is a
    direct child of its root is rendered with `process_section`, and the
    rendered bodies are joined in document order.

    The result is `bytes`, not `str`: the section renderer builds its output
    from `ET.tostring`. An input without any `<body>` yields `b""`.

    NOTE(review): `fb22html` is annotated as returning `str` but passes this
    value through unchanged - confirm whether the caller should decode it.
    """

    xml_root = ET.fromstring(tokens["content"])

    # Join once instead of growing a bytes object with repeated `+=`.
    return b"".join(
        process_section(body, tokens) for body in xml_root.iterfind("./body")
    )
|
||||||
|
|
||||||
|
|
||||||
|
def process_section(body: Element, tokens: Document_Tokens) -> bytes:
    """
    Render a `<body>` or `<section>` element as an html `<section>`.

    Output order:
      1. the converted children of the direct `title`, `epigraph` and
         `annotation` children (the wrapper tags themselves are dropped);
      2. the direct `<image>` child, converted to `<img>`;
      3. every direct `<section>` child. A section that contains further
         sections is rendered recursively; a leaf section is converted with
         `process_content` and wrapped in its own `<section>` tag.

    The elements are modified in place while they are converted.
    Returns the html as `bytes`.
    """

    parts = [b"<section>\n"]

    for tag in ("title", "epigraph", "annotation"):
        el = body.find("./" + tag)
        # Compare with None: the truth value of an Element reflects whether it
        # has children, not whether `find` located it.
        if el is not None:
            process_content(el, tokens)
            parts.append(children_to_html(el))

    image = body.find("./image")
    # An <image> has no children, so a plain `if image:` would always skip it.
    if image is not None:
        process_image(image, tokens)
        parts.append(ET.tostring(image))

    for section in body.findall("./section"):
        if section.find("./section") is not None:
            # Nested sections: walk down the section tree.
            parts.append(process_section(section, tokens))
        else:
            process_content(section, tokens)
            parts.append(
                b"<section>\n" + children_to_html(section) + b"</section>\n"
            )

    parts.append(b"</section>\n")
    return b"".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def children_to_html(root: Element) -> bytes:
    """
    Serialize every direct child of `root` and concatenate the results.

    The `root` tag itself is not part of the output. Each child is serialized
    with `ET.tostring`, so the result is `bytes` and includes the text that
    follows the child (its tail). An element without children yields `b""`.
    """

    # One join instead of repeated `+=` on a bytes object.
    return b"".join(ET.tostring(child) for child in root)
|
||||||
|
|
||||||
|
|
||||||
|
def process_image(el: Element, tokens: Document_Tokens) -> None:
    """
    Convert an fb2 <image /> element to an html <img /> in place.

    The link attribute (`HREF`) is removed and replaced with `src`:
    a link of the form `#<asset_id>` is resolved to `tokens["<asset_id>"]`,
    any other link is copied unchanged. An element without a link (or with
    an empty one) is only renamed and gets no `src`.

    Raises `KeyError` when the link points to an asset id that is missing
    from `tokens`.
    """

    el.tag = "img"

    # Pop with a default so an <image> without the link attribute does not
    # raise KeyError.
    href = el.attrib.pop(HREF, None)
    if not href:
        return

    el.set("src", tokens[href[1:]] if href.startswith("#") else href)
|
||||||
|
|
||||||
|
|
||||||
|
# fb2 tags that are simply renamed to an html tag.
tag_replacement = {
    "empty-line": "br",
    "emphasis": "em",
    "strikethrough": "strike",
    "v": "p",
}

# fb2 tags that are renamed to a generic html tag; the original fb2 tag name
# is kept on the element as its `class` attribute.
tag_with_class = {
    "subtitle": "p",
    "cite": "div",
    "poem": "div",
    "stanza": "div",
    "epigraph": "div",
    "text-author": "p",
}
|
||||||
|
|
||||||
|
|
||||||
|
def process_content(root: Element, tokens: Document_Tokens) -> None:
    """
    Convert the fb2 tags below `root` to their html equivalents, in place.

    Descendants are converted depth-first (a child's own children are
    handled before the child itself); `root` itself is left untouched.

    Per element:
      * `a`     - the link attribute (`HREF`) is renamed to `href`;
      * `image` - handed to `process_image`, which resolves binary data;
      * tags in `tag_replacement` - renamed to the mapped html tag;
      * tags in `tag_with_class`  - renamed to the mapped html tag, with the
        original fb2 tag name stored in the `class` attribute.

    Any other tag is kept as is.
    """

    for child in root:
        process_content(child, tokens)

        tag = child.tag
        if tag == "a":
            # Pop with a default so an <a> without the link attribute does not
            # raise KeyError; in that case no `href` is set.
            href = child.attrib.pop(HREF, None)
            if href is not None:
                child.set("href", href)

        elif tag == "image":
            process_image(child, tokens)

        elif tag in tag_replacement:
            child.tag = tag_replacement[tag]

        elif tag in tag_with_class:
            child.set("class", tag)
            child.tag = tag_with_class[tag]
|
||||||
|
3
app/utils.py
Normal file
3
app/utils.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
from typing import Union
|
||||||
|
|
||||||
|
# Parsed document as produced by the format parsers. Values are plain strings,
# a nested string dictionary (e.g. the "metadata" entry), or bytes (the fb2
# parser stores the serialized body markup under "content" as bytes).
Document_Tokens = dict[str, Union[str, bytes, dict[str, str]]]
|
Loading…
x
Reference in New Issue
Block a user