Added fb2 file parsing to tokens
This commit is contained in:
parent
52057c0cd5
commit
d8529740fd
@ -14,7 +14,6 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
|
||||
"""
|
||||
|
||||
try:
|
||||
|
||||
tokens = await epub_to_tokens(file)
|
||||
...
|
||||
# TODO: join tokens to HTML
|
||||
@ -29,14 +28,14 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
|
||||
async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
||||
|
||||
"""
|
||||
Passes file content to ebooklib library and parses epub tokens into dict of the following format:
|
||||
Passes file content to EbookLib library and parses epub tokens into dict of the following format:
|
||||
|
||||
"\<file_name\>": "\<file_content\>"
|
||||
{ "\<file_name\>": "\<file_content\>" }
|
||||
|
||||
Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embedding into html
|
||||
"""
|
||||
|
||||
tokens = {"metadata": {"test"}}
|
||||
tokens = {"metadata": {"test": "t"}}
|
||||
|
||||
async with aiof.tempfile.NamedTemporaryFile() as tmp:
|
||||
await tmp.write(file.read())
|
||||
@ -49,6 +48,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
||||
content = item.get_content()
|
||||
|
||||
if item_type == ebooklib.ITEM_DOCUMENT:
|
||||
# Adding book chapters to tokens list
|
||||
tokens[name] = content
|
||||
|
||||
elif item_type in (
|
||||
@ -58,6 +58,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
||||
ebooklib.ITEM_VIDEO,
|
||||
ebooklib.ITEM_VECTOR,
|
||||
):
|
||||
# Adding assets to tokens list
|
||||
media_type = item.media_type
|
||||
b64_content = b64encode(content).decode()
|
||||
|
||||
|
111
app/fb2.py
Normal file
111
app/fb2.py
Normal file
@ -0,0 +1,111 @@
|
||||
from tempfile import SpooledTemporaryFile
|
||||
import xml.etree.ElementTree as ET
|
||||
from xml.etree.ElementTree import Element
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# XML namespaces used when querying FB2 documents with ElementTree.
# The empty prefix maps the default namespace, so unprefixed tag names passed
# to find()/findall() together with this mapping resolve to FictionBook tags.
namespaces = {
    "": "http://www.gribuser.ru/xml/fictionbook/2.0",
    "xlink": "http://www.w3.org/1999/xlink",
}

# Fully qualified ("{namespace-uri}local-name") name of the xlink:href
# attribute, as ElementTree exposes namespaced attributes under that form.
HREF = "{" + namespaces["xlink"] + "}href"
|
||||
|
||||
|
||||
async def fb22html(file: SpooledTemporaryFile) -> str:
    """
    Splits fb2 to tokens and joins them to one html file

    The joining step is not implemented yet, so a successful call currently
    returns an empty string. Any exception raised while tokenizing is caught
    and reported in the returned string instead of being propagated.
    """

    try:
        tokens = fb22tokens(file)
        # TODO: join tokens to HTML
        html_content = ""
    except Exception as e:
        return f"Error! Wrong FB2 file format: {e}"
    else:
        return html_content
|
||||
|
||||
|
||||
def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    """
    Parses fb2 file as xml document. It puts book metadata, its content and
    media to `tokens` dictionary and returns it.

    `tokens` format:

        { "metadata": { ... },
          "content": "<string>",
          "<asset_id>": "data:<content-type>;base64,<base64_data>" }

    `metadata` may contain the keys "title", "author" and "cover"; a key is
    omitted when the corresponding element is missing from `<title-info>`.

    Elements are looked up by tag name rather than by position, so a leading
    `<stylesheet>` or an additional `<body>` (e.g. notes) does not shift the
    description, the main body or the binary assets.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        ValueError: if the document has no `<description>` or `<body>`.
    """

    tokens = {"metadata": {}, "content": ""}

    root = ET.parse(file).getroot()

    description = root.find("description", namespaces)
    body = root.find("body", namespaces)  # first body is the main text
    if description is None or body is None:
        raise ValueError("FB2 document must contain <description> and <body>")

    # Reading book metadata

    # Compare with None explicitly: an Element without children is falsy.
    book_info = description.find("title-info", namespaces)
    if book_info is not None:
        metadata = {}

        title = book_info.find("book-title", namespaces)
        if title is not None and title.text is not None:
            metadata["title"] = title.text

        author = book_info.find("author", namespaces)
        if author is not None:
            metadata["author"] = get_author(author)

        cover = get_cover(book_info.find("coverpage", namespaces))
        if cover is not None:
            metadata["cover"] = cover

        tokens["metadata"] = metadata

    # Reading book content

    # Serialize to str (not bytes) so the value matches the documented format;
    # the auto-generated "ns0:" prefix of the FB2 namespace is stripped from
    # the tag names.
    tokens["content"] = ET.tostring(body, encoding="unicode").replace("ns0:", "")

    # Reading assets

    for asset in root.findall("binary", namespaces):
        key = asset.get("id")
        if key is None:
            # Without an id the asset cannot be referenced from the content.
            continue
        media_type = asset.get("content-type")
        b64_content = asset.text or ""
        tokens[key] = f"data:{media_type};base64,{b64_content}"

    return tokens
|
||||
|
||||
|
||||
def get_author(author: Element) -> str:
    """
    Converts author xml structure to string

    Joins the texts of `<first-name>`, `<middle-name>` and `<last-name>` (in
    that order) with single spaces, skipping elements that are missing or
    empty. When none of them yields any text, the `<nickname>` text is used
    instead. Returns an empty string if no name information is present.
    """

    name_parts = []
    for tag_name in ("first-name", "middle-name", "last-name"):
        el = author.find(tag_name, namespaces)
        # An element like <middle-name/> has text None; it must not reach join().
        text = el.text.strip() if el is not None and el.text else ""
        if text:
            name_parts.append(text)

    if name_parts:
        return " ".join(name_parts)

    nickname = author.find("nickname", namespaces)
    if nickname is not None and nickname.text:
        return nickname.text.strip()

    return ""
|
||||
|
||||
|
||||
def get_cover(coverpage: Optional[Element]) -> Optional[str]:
    """
    Extracts cover image id if exists

    Returns the value of the xlink:href attribute of the first `<image>`
    child of `coverpage`, exactly as written in the document (presumably a
    "#<binary id>" reference - it is not normalized here). Returns None when
    `coverpage` is None, has no `<image>` child, or the image has no href.
    """

    # Explicit None check: Element truthiness depends on the child count and
    # is deprecated, so `if coverpage:` is not a reliable presence test.
    if coverpage is None:
        return None

    image = coverpage.find("image", namespaces)
    if image is None:
        return None

    return image.get(HREF)
|
@ -2,6 +2,7 @@ from fastapi import FastAPI, File, UploadFile
|
||||
from fastapi.responses import HTMLResponse
|
||||
|
||||
from .epub import epub2html
|
||||
from .fb2 import fb22html
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user