Added fb2 file parsing to tokens
This commit is contained in:
parent
52057c0cd5
commit
d8529740fd
@ -14,7 +14,6 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
tokens = await epub_to_tokens(file)
|
tokens = await epub_to_tokens(file)
|
||||||
...
|
...
|
||||||
# TODO: join tokens to HTML
|
# TODO: join tokens to HTML
|
||||||
@ -29,14 +28,14 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
|
|||||||
async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Passes file content to ebooklib library and parses epub tokens into dict of the following format:
|
Passes file content to EbookLib library and parses epub tokens into dict of the following format:
|
||||||
|
|
||||||
"\<file_name\>": "\<file_content\>"
|
{ "\<file_name\>": "\<file_content\>" }
|
||||||
|
|
||||||
Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html
|
Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tokens = {"metadata": {"test"}}
|
tokens = {"metadata": {"test": "t"}}
|
||||||
|
|
||||||
async with aiof.tempfile.NamedTemporaryFile() as tmp:
|
async with aiof.tempfile.NamedTemporaryFile() as tmp:
|
||||||
await tmp.write(file.read())
|
await tmp.write(file.read())
|
||||||
@ -49,6 +48,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
|||||||
content = item.get_content()
|
content = item.get_content()
|
||||||
|
|
||||||
if item_type == ebooklib.ITEM_DOCUMENT:
|
if item_type == ebooklib.ITEM_DOCUMENT:
|
||||||
|
# Adding book chapters to tokens list
|
||||||
tokens[name] = content
|
tokens[name] = content
|
||||||
|
|
||||||
elif item_type in (
|
elif item_type in (
|
||||||
@ -58,6 +58,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> dict[str, str]:
|
|||||||
ebooklib.ITEM_VIDEO,
|
ebooklib.ITEM_VIDEO,
|
||||||
ebooklib.ITEM_VECTOR,
|
ebooklib.ITEM_VECTOR,
|
||||||
):
|
):
|
||||||
|
# Adding assets to tokens list
|
||||||
media_type = item.media_type
|
media_type = item.media_type
|
||||||
b64_content = b64encode(content).decode()
|
b64_content = b64encode(content).decode()
|
||||||
|
|
||||||
|
111
app/fb2.py
Normal file
111
app/fb2.py
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
from tempfile import SpooledTemporaryFile
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from xml.etree.ElementTree import Element
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
# Prefix -> namespace URI map handed to ElementTree ``find``/``findall``.
# The empty prefix is the default namespace, applied to unprefixed tag names
# in a search path (supported by ElementTree since Python 3.8).
namespaces: dict[str, str] = {
    "": "http://www.gribuser.ru/xml/fictionbook/2.0",
    "xlink": "http://www.w3.org/1999/xlink",
}

# Fully qualified name of the ``xlink:href`` attribute in ElementTree's
# ``{uri}local`` notation, used to read image references.
HREF: str = f"{{{namespaces['xlink']}}}href"
|
||||||
|
|
||||||
|
|
||||||
|
async def fb22html(file: SpooledTemporaryFile) -> str:
    """
    Convert an uploaded fb2 file into a single html document.

    The file is first split into tokens by ``fb22tokens``; joining those
    tokens into html is not implemented yet, so an empty string is returned
    on success. Any exception raised while tokenizing is reported to the
    caller as an error message string instead of being propagated.
    """

    try:
        tokens = fb22tokens(file)  # kept for the pending html assembly step
    except Exception as exc:
        return "Error! Wrong FB2 file format: " + str(exc)

    # TODO: join tokens to HTML
    html_content = ""
    return html_content
|
||||||
|
|
||||||
|
|
||||||
|
def fb22tokens(file: SpooledTemporaryFile) -> dict[str, str]:
    """
    Parse an fb2 file as an xml document and collect its parts into a dict.

    The first child of the root element is treated as the book description,
    the second as the book body, and every following child as a potential
    asset.

    Returned ``tokens`` format::

        {
            "metadata": {"title": ..., "author": ..., "cover": ...},
            "content": "<serialized body as text>",
            "<asset_id>": "data:<content-type>;base64,<base64_data>",
        }

    Metadata keys are present only when the matching element exists in the
    ``title-info`` section; ``metadata`` is an empty dict when there is no
    such section.

    Raises:
        ValueError: if the root element has fewer than two children.
        xml.etree.ElementTree.ParseError: if the file is not well-formed xml.
    """

    tokens = {"metadata": {}, "content": ""}

    root = ET.parse(file).getroot()
    children = list(root)
    if len(children) < 2:
        raise ValueError(
            "FB2 document must contain description and body elements"
        )

    description: Element
    body: Element
    assets: list[Element]
    description, body, *assets = children

    # Reading book metadata

    # The namespace map is required here: fb2 tags live in the FictionBook
    # namespace, so an unqualified lookup never matches.
    book_info = description.find("title-info", namespaces)
    # Compare with None explicitly: Element truthiness reflects the number
    # of children and is deprecated.
    if book_info is not None:
        metadata = {}

        title = book_info.find("book-title", namespaces)
        if title is not None and title.text:
            metadata["title"] = title.text

        author = book_info.find("author", namespaces)
        if author is not None:
            metadata["author"] = get_author(author)

        cover = get_cover(book_info.find("coverpage", namespaces))
        if cover is not None:
            metadata["cover"] = cover

        tokens["metadata"] = metadata

    # Reading book content

    # Serialize to text (not bytes) so "content" matches the documented
    # format; the "ns0:" prefix that ElementTree puts on namespaced tags is
    # dropped to leave plain tag names.
    tokens["content"] = ET.tostring(body, encoding="unicode").replace("ns0:", "")

    # Reading assets

    for asset in assets:
        key = asset.get("id")
        if key is None:
            # Without an id the element cannot be referenced from the
            # content, and it must not be stored under the key None.
            continue
        media_type = asset.get("content-type")
        b64_content = asset.text or ""
        tokens[key] = f"data:{media_type};base64,{b64_content}"

    return tokens
|
||||||
|
|
||||||
|
|
||||||
|
def get_author(author: Optional[Element]) -> str:
    """
    Convert an author xml structure to a display string.

    The texts of the ``first-name``, ``middle-name`` and ``last-name`` child
    elements that are present and non-empty are joined with single spaces.
    When none of them yields text, the text of the ``nickname`` child is
    used instead. An empty string is returned when ``author`` is None or
    carries no usable name at all.
    """

    if author is None:
        return ""

    name_parts = []
    for tag_name in ("first-name", "middle-name", "last-name"):
        el = author.find(tag_name, namespaces)
        # An empty element has text None, which str.join cannot handle.
        if el is not None and el.text:
            name_parts.append(el.text)

    if name_parts:
        return " ".join(name_parts)

    nickname = author.find("nickname", namespaces)
    if nickname is not None and nickname.text:
        return nickname.text

    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def get_cover(coverpage: Optional[Element]) -> Optional[str]:
    """
    Extract the cover image reference if it exists.

    Returns the value of the ``xlink:href`` attribute of the first ``image``
    child of ``coverpage``. Returns None when ``coverpage`` is None, has no
    ``image`` child, or the image has no such attribute.
    """

    # Explicit None checks: Element truthiness depends on the number of
    # children and is deprecated.
    if coverpage is None:
        return None

    image = coverpage.find("image", namespaces)
    if image is None:
        return None

    return image.get(HREF)
|
@ -2,6 +2,7 @@ from fastapi import FastAPI, File, UploadFile
|
|||||||
from fastapi.responses import HTMLResponse
|
from fastapi.responses import HTMLResponse
|
||||||
|
|
||||||
from .epub import epub2html
|
from .epub import epub2html
|
||||||
|
from .fb2 import fb22html
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user