Added fb2 file parsing to tokens

This commit is contained in:
Dmitriy Shishkov 2021-07-01 19:38:30 +05:00
parent 52057c0cd5
commit d8529740fd
No known key found for this signature in database
GPG Key ID: 14358F96FCDD8060
3 changed files with 117 additions and 4 deletions

View File

@ -14,7 +14,6 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
"""
try:
tokens = await epub_to_tokens(file)
...
# TODO: join tokens to HTML
@ -29,14 +28,14 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
"""
Passes file content to ebooklib library and parses epub tokens into dict of the following format:
Passes file content to EbookLib library and parses epub tokens into dict of the following format:
"\<file_name\>": "\<file_content\>"
{ "\<file_name\>": "\<file_content\>" }
Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embedding to html
"""
tokens = {"metadata": {"test"}}
tokens = {"metadata": {"test": "t"}}
async with aiof.tempfile.NamedTemporaryFile() as tmp:
await tmp.write(file.read())
@ -49,6 +48,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
content = item.get_content()
if item_type == ebooklib.ITEM_DOCUMENT:
# Adding book chapters to tokens list
tokens[name] = content
elif item_type in (
@ -58,6 +58,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
ebooklib.ITEM_VIDEO,
ebooklib.ITEM_VECTOR,
):
# Adding assets to tokens list
media_type = item.media_type
b64_content = b64encode(content).decode()

111
app/fb2.py Normal file
View File

@ -0,0 +1,111 @@
from tempfile import SpooledTemporaryFile
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element
from typing import Optional
# Prefix -> namespace URI map passed to ElementTree find()/findall() calls.
# The empty prefix is the entry ElementTree applies to unprefixed tag names
# (here the FictionBook 2.0 namespace); "xlink" is the XLink namespace.
namespaces = {
    "": "http://www.gribuser.ru/xml/fictionbook/2.0",
    "xlink": "http://www.w3.org/1999/xlink",
}
# Qualified name of the xlink:href attribute, written in ElementTree's
# "{namespace-uri}local-name" notation so it can be passed to Element.get().
HREF = "{" + namespaces["xlink"] + "}href"
async def fb22html(file: SpooledTemporaryFile) -> str:
    """
    Converts an fb2 file to a single html document.

    The file is first split into tokens by `fb22tokens`. Joining those tokens
    into html is not implemented yet, so an empty string is returned on success.
    Any exception raised along the way is not propagated: a message starting
    with "Error! Wrong FB2 file format: " followed by the exception text is
    returned instead.
    """
    try:
        tokens = fb22tokens(file)
        ...
        # TODO: join tokens to HTML
        html_content = ""
        ...
    except Exception as error:
        return "Error! Wrong FB2 file format: " + str(error)
    else:
        return html_content
def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    """
    Parses fb2 file as xml document. It puts book metadata, its content and media to `tokens` dictionary and returns it.

    `tokens` format:

    { "metadata": { ... },
    "content": "<string>",
    "<asset_id>": "data:<content_type>;base64,<base64_data>" }

    "metadata" may contain the keys "title", "author" and "cover"; a key is
    omitted when the corresponding element is missing from `title-info`.
    "content" is the first `body` element serialized to a string.

    Raises `ValueError` when the document has no `description` or no `body`
    element, and `xml.etree.ElementTree.ParseError` when it is not valid xml.
    """
    tokens = {"metadata": {}, "content": ""}

    root = ET.parse(file).getroot()

    # Sections are looked up by tag rather than by position: a leading
    # <stylesheet> or an additional <body> (e.g. notes) would otherwise shift
    # the positional unpacking and end up among the assets.
    description = root.find("description", namespaces)
    body = root.find("body", namespaces)
    if description is None or body is None:
        raise ValueError("FB2 document must contain description and body elements")
    assets = root.findall("binary", namespaces)

    # Reading book metadata
    # `is not None` is required here: the truth value of an Element depends on
    # whether it has children, not on whether it was found.
    book_info = description.find("title-info", namespaces)
    if book_info is not None:
        metadata = {}

        title = book_info.find("book-title", namespaces)
        if title is not None and title.text:
            metadata["title"] = title.text

        author = book_info.find("author", namespaces)
        if author is not None:
            metadata["author"] = get_author(author)

        cover = get_cover(book_info.find("coverpage", namespaces))
        if cover is not None:
            metadata["cover"] = cover

        tokens["metadata"] = metadata

    # Reading book content
    # ElementTree serializes the namespaced tags with a generated "ns0:" prefix,
    # which is removed to get plain tag names. encoding="unicode" yields a str,
    # as promised by the documented token format.
    tokens["content"] = ET.tostring(body, encoding="unicode").replace("ns0:", "")

    # Reading assets
    for asset in assets:
        key = asset.get("id")
        if key is None:
            # An asset without an id cannot be referenced from the content
            continue
        media_type = asset.get("content-type")
        b64_content = asset.text or ""
        tokens[key] = f"data:{media_type};base64,{b64_content}"

    return tokens
def get_author(author: Element) -> str:
    """
    Converts author xml structure to string

    Joins the non-empty `first-name`, `middle-name` and `last-name` values with
    single spaces. If none of them is present, the `nickname` value is returned
    instead. Returns an empty string when the element carries no usable name.
    Surrounding whitespace of every value is stripped.
    """
    parts = []
    for tag_name in ("first-name", "middle-name", "last-name"):
        # findtext() gives "" for a missing or empty element, so neither case
        # can break the join below
        text = (author.findtext(tag_name, "", namespaces) or "").strip()
        if text:
            parts.append(text)

    if parts:
        return " ".join(parts)

    return (author.findtext("nickname", "", namespaces) or "").strip()
def get_cover(coverpage: Optional[Element]) -> Optional[str]:
    """
    Extracts cover image id if exists

    Returns the value of the `xlink:href` attribute of the `image` element
    inside `coverpage`, or `None` when `coverpage` is `None`, has no `image`
    child, or the image has no such attribute.
    """
    # Explicit None checks: the truth value of an Element reflects its number
    # of children, not whether the element exists.
    if coverpage is None:
        return None

    image = coverpage.find("image", namespaces)
    if image is None:
        return None

    return image.get(HREF)

View File

@ -2,6 +2,7 @@ from fastapi import FastAPI, File, UploadFile
from fastapi.responses import HTMLResponse
from .epub import epub2html
from .fb2 import fb22html
app = FastAPI()