Added fb2 file parsing to tokens

This commit is contained in:
Dmitriy Shishkov 2021-07-01 19:38:30 +05:00
parent 52057c0cd5
commit d8529740fd
No known key found for this signature in database
GPG Key ID: 14358F96FCDD8060
3 changed files with 117 additions and 4 deletions

View File

@ -14,7 +14,6 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
""" """
try: try:
tokens = await epub_to_tokens(file) tokens = await epub_to_tokens(file)
... ...
# TODO: join tokens to HTML # TODO: join tokens to HTML
@ -29,14 +28,14 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]: async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
""" """
Passes file content to ebooklib library and parses epub tokens into dict of the following format: Passes file content to EbookLib library and parses epub tokens into dict of the following format:
"\<file_name\>": "\<file_content\>" { "\<file_name\>": "\<file_content\>" }
Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html
""" """
tokens = {"metadata": {"test"}} tokens = {"metadata": {"test": "t"}}
async with aiof.tempfile.NamedTemporaryFile() as tmp: async with aiof.tempfile.NamedTemporaryFile() as tmp:
await tmp.write(file.read()) await tmp.write(file.read())
@ -49,6 +48,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
content = item.get_content() content = item.get_content()
if item_type == ebooklib.ITEM_DOCUMENT: if item_type == ebooklib.ITEM_DOCUMENT:
# Adding book chapters to tokens list
tokens[name] = content tokens[name] = content
elif item_type in ( elif item_type in (
@ -58,6 +58,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
ebooklib.ITEM_VIDEO, ebooklib.ITEM_VIDEO,
ebooklib.ITEM_VECTOR, ebooklib.ITEM_VECTOR,
): ):
# Adding assets to tokens list
media_type = item.media_type media_type = item.media_type
b64_content = b64encode(content).decode() b64_content = b64encode(content).decode()

111
app/fb2.py Normal file
View File

@ -0,0 +1,111 @@
from tempfile import SpooledTemporaryFile
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element
from typing import Optional
# Prefix -> namespace URI map handed to every Element.find() call in this
# module. The empty-string key is the default namespace, so unprefixed tag
# names such as "title-info" resolve to FictionBook 2.0 elements.
namespaces = {
    "": "http://www.gribuser.ru/xml/fictionbook/2.0",
    "xlink": "http://www.w3.org/1999/xlink",
}

# Fully qualified ("{uri}local-name") name of the xlink:href attribute, which
# is the form ElementTree uses for namespaced attribute keys.
HREF = "{" + namespaces["xlink"] + "}href"
async def fb22html(file: SpooledTemporaryFile) -> str:
    """
    Converts an fb2 file into a single html document.

    The file is first split into tokens by `fb22tokens`. Joining the tokens
    into html is not implemented yet, so an empty string is returned on
    success. Any exception raised on the way is not propagated; it is
    reported to the caller as an error message string instead.
    """
    try:
        parsed = fb22tokens(file)
        # TODO: join tokens to HTML
        page = ""
        return page
    except Exception as err:
        return f"Error! Wrong FB2 file format: {err}"
def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    """
    Parses an fb2 file as an xml document and collects its parts into a
    `tokens` dictionary, which is returned.

    `tokens` format:

        { "metadata": { "title": ..., "author": ..., "cover": ... },
          "content": <serialized body element>,
          "<asset_id>": "data:<content-type>;base64,<data>" }

    A metadata key is only present when the matching element exists in the
    book's `title-info`; `metadata` stays empty when there is no `title-info`.

    Raises `xml.etree.ElementTree.ParseError` for malformed xml and
    `ValueError` when the root element has fewer than two children.
    """
    tokens = {"metadata": {}, "content": ""}

    book = ET.parse(file)
    # The first two children of the root are taken as the description and the
    # body; every following child is treated as a potential asset.
    description, body, *assets = book.getroot()

    # Reading book metadata
    # The lookup has to be namespace-aware like the nested ones below, and the
    # result is compared with None because the truth value of an Element only
    # tells whether it has children.
    book_info = description.find("title-info", namespaces)
    if book_info is not None:
        metadata = {}

        title = book_info.find("book-title", namespaces)
        if title is not None and title.text:
            metadata["title"] = title.text

        author = book_info.find("author", namespaces)
        if author is not None:
            try:
                metadata["author"] = get_author(author)
            except (AttributeError, TypeError):
                # Metadata is best-effort: an author entry without usable
                # name parts must not abort parsing of the whole book.
                pass

        cover = get_cover(book_info.find("coverpage", namespaces))
        if cover is not None:
            metadata["cover"] = cover

        tokens["metadata"] = metadata

    # Reading book content
    # ET.tostring() returns bytes here; the replace drops the "ns0:" prefix
    # that ElementTree generates for the unregistered FictionBook namespace.
    # NOTE(review): this stores bytes although the annotation promises str -
    # confirm what the html joiner expects before decoding.
    tokens["content"] = ET.tostring(body).replace(b"ns0:", b"")

    # Reading assets
    for asset in assets:
        key = asset.get("id")
        if key is None:
            # Without an id the element cannot be referenced from the content
            # (presumably an extra body or similar), so it is not an asset.
            continue
        media_type = asset.get("content-type")
        b64_content = asset.text or ""
        tokens[key] = f"data:{media_type};base64,{b64_content}"

    return tokens
def get_author(author: Element) -> str:
    """
    Converts author xml structure to string.

    Joins the texts of the `first-name`, `middle-name` and `last-name`
    children (in that order, skipping missing or empty ones) with single
    spaces. When no name part is available, falls back to the text of the
    `nickname` child. Returns an empty string when that is missing too, or
    when `author` itself is None.
    """
    if author is None:
        return ""

    name_parts = []
    for tag_name in ("first-name", "middle-name", "last-name"):
        el = author.find(tag_name, namespaces)
        # An empty element has text None, which str.join() would reject.
        if el is not None and el.text:
            name_parts.append(el.text)
    if name_parts:
        return " ".join(name_parts)

    nickname = author.find("nickname", namespaces)
    if nickname is not None and nickname.text:
        return nickname.text
    return ""
def get_cover(coverpage: Optional[Element]) -> Optional[str]:
    """
    Extracts cover image id if exists.

    Returns the value of the `xlink:href` attribute of the `image` child of
    `coverpage`, or None when `coverpage` is None, has no `image` child, or
    the image carries no such attribute.
    """
    # Compare with None explicitly: the truth value of an Element only tells
    # whether it has children, and relying on it is deprecated.
    if coverpage is None:
        return None

    image = coverpage.find("image", namespaces)
    if image is None:
        return None
    return image.get(HREF)

View File

@ -2,6 +2,7 @@ from fastapi import FastAPI, File, UploadFile
from fastapi.responses import HTMLResponse from fastapi.responses import HTMLResponse
from .epub import epub2html from .epub import epub2html
from .fb2 import fb22html
app = FastAPI() app = FastAPI()