Finished epub to html conversion functionality, fixed all pylint errors
This commit is contained in:
parent
2f4a683cb4
commit
5155790357
213
app/epub.py
213
app/epub.py
@ -1,13 +1,25 @@
|
||||
import aiofiles as aiof
|
||||
from base64 import b64encode
|
||||
from fastapi import HTTPException
|
||||
"""
|
||||
Module for EPUB file conversion to html
|
||||
"""
|
||||
|
||||
from base64 import b64encode
|
||||
from functools import cache
|
||||
import html
|
||||
import os
|
||||
from tempfile import SpooledTemporaryFile
|
||||
|
||||
import aiofiles as aiof
|
||||
from fastapi import HTTPException
|
||||
from lxml import etree
|
||||
import ebooklib
|
||||
from ebooklib import epub
|
||||
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from .utils import DocumentTokens, strip_whitespace, HTMLBook
|
||||
|
||||
from .utils import Document_Tokens, strip_whitespace, HTMLBook
|
||||
parser = etree.XMLParser(recover=True)
|
||||
|
||||
IMAGE = "{http://www.w3.org/2000/svg}image"
|
||||
HREF = "{http://www.w3.org/1999/xlink}href"
|
||||
|
||||
|
||||
async def epub2html(file: SpooledTemporaryFile) -> HTMLBook:
|
||||
@ -22,24 +34,29 @@ async def epub2html(file: SpooledTemporaryFile) -> HTMLBook:
|
||||
|
||||
html_content = epub_tokens2html(spine, tokens)
|
||||
|
||||
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
|
||||
return {
|
||||
**(tokens["metadata"]),
|
||||
"content": html_content,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
except Exception as err:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Error! Wrong epub file format: " + str(e)
|
||||
)
|
||||
status_code=500, detail="Error! Wrong epub file format: " + str(err)
|
||||
) from err
|
||||
|
||||
|
||||
async def epub_to_tokens(
|
||||
file: SpooledTemporaryFile,
|
||||
) -> tuple[Document_Tokens, list[tuple[str, str]]]:
|
||||
) -> tuple[DocumentTokens, list[tuple[str, str]]]:
|
||||
|
||||
"""
|
||||
Passes file content to EbookLib library and parses epub tokens into dict of the following format:
|
||||
r"""
|
||||
Passes file content to EbookLib library and parses epub tokens into dict of
|
||||
the following format:
|
||||
|
||||
{ "\<file_name\>": "\<file_content\>" }
|
||||
|
||||
Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html
|
||||
Where file content is either plain text for xhtml or base64 encoded data
|
||||
for other formats, prepared for embeding to html
|
||||
"""
|
||||
|
||||
tokens = {}
|
||||
@ -61,19 +78,20 @@ async def epub_to_tokens(
|
||||
item: epub.EpubItem
|
||||
|
||||
item_type = item.get_type()
|
||||
file_path = reader.opf_dir + "/" + item.get_name()
|
||||
file_path = os.path.join(reader.opf_dir, item.get_name())
|
||||
|
||||
if item_type == ebooklib.ITEM_DOCUMENT:
|
||||
# Adding book chapters to tokens list
|
||||
name = item.get_id()
|
||||
tokens[file_path] = item.get_body_content()
|
||||
tokens[file_path] = strip_whitespace(item.get_body_content())
|
||||
tokens["toc"][name] = file_path
|
||||
|
||||
elif item_type in (
|
||||
ebooklib.ITEM_AUDIO,
|
||||
ebooklib.ITEM_COVER,
|
||||
ebooklib.ITEM_IMAGE,
|
||||
ebooklib.ITEM_VIDEO,
|
||||
ebooklib.ITEM_VECTOR,
|
||||
ebooklib.ITEM_VIDEO,
|
||||
):
|
||||
# Adding assets to tokens list
|
||||
|
||||
@ -89,7 +107,12 @@ async def epub_to_tokens(
|
||||
return tokens, book.spine.copy()
|
||||
|
||||
|
||||
def read_metadata(book: epub.EpubBook):
|
||||
def read_metadata(book: epub.EpubBook) -> dict[str, str]:
|
||||
|
||||
"""
|
||||
Reads metadata from xml to dict
|
||||
"""
|
||||
|
||||
metadata = {}
|
||||
metadata["title"] = book.get_metadata("DC", "title")[0][0]
|
||||
metadata["author"] = convert_list(book.get_metadata("DC", "creator"))
|
||||
@ -97,7 +120,12 @@ def read_metadata(book: epub.EpubBook):
|
||||
return metadata.copy()
|
||||
|
||||
|
||||
def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
|
||||
def convert_list(titles_list: list[tuple[str, dict[str, str]]]) -> str:
|
||||
|
||||
"""
|
||||
Joins titles list to one string
|
||||
"""
|
||||
|
||||
res = []
|
||||
for title_obj in titles_list:
|
||||
res.append(title_obj[0])
|
||||
@ -105,23 +133,156 @@ def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
|
||||
return "; ".join(res)
|
||||
|
||||
|
||||
def set_cover(tokens: DocumentTokens) -> None:
    """
    Swaps the cover file name kept in `tokens["metadata"]` for the value
    stored in `tokens` under that name.

    Metadata is left untouched when `tokens` has no entry for the cover name.
    """

    metadata = tokens["metadata"]
    cover_key = metadata.get("cover")

    # No asset registered under this name (or no cover at all): nothing to do
    if cover_key not in tokens:
        return

    metadata["cover"] = tokens[cover_key]
|
||||
|
||||
|
||||
def epub_tokens2html(spine: list[tuple[str, str]], tokens: DocumentTokens) -> str:
    """
    Joins chapters listed in `spine` into one html string

    Each spine entry is looked up by its id in `tokens["toc"]`; entries
    without a registered path are skipped. The joined markup is unescaped
    and then escaped again, so the returned string is entity-escaped html.
    """

    chapters = []

    for name, _ in spine:
        file_path = tokens["toc"].get(name)
        if file_path:
            chapters.append(process_xhtml(file_path, tokens))

    # Join once instead of growing a string chapter by chapter
    return html.escape(html.unescape("".join(chapters)))
|
||||
|
||||
|
||||
def process_xhtml(path: str, tokens: DocumentTokens) -> str:
    """
    Processes content of one xml body

    Parses `tokens[path]` with the module-level recovering parser, renames a
    root <body> to <div>, rewrites the tree in place with `process_content`
    and returns it serialized inside <section id="b_<file name>">.
    """

    section_id = f"b_{path_to_name(path)}"

    xml = etree.fromstring(tokens[path], parser=parser)

    # NOTE(review): a recovering lxml parser is reported to return None when
    # it cannot recover any root element — confirm. An empty section keeps
    # links to "#b_<file name>" resolvable in that case.
    if xml is None:
        return f'<section id="{section_id}"></section>'

    if xml.tag == "body":
        xml.tag = "div"

    process_content(xml, path, tokens)

    return f'<section id="{section_id}">{etree.tostring(xml).decode()}</section>'
|
||||
|
||||
|
||||
def process_content(node: etree.Element, path: str, tokens: DocumentTokens) -> None:
    """
    Recursive function for xml element conversion to valid html

    Modifies `node` and its descendants in place:

    * drops the `epub:type` attribute;
    * prefixes `id` values with the file name taken from `path`;
    * rewrites <a> links and media sources through the dedicated helpers;
    * turns <hgroup> into <div>;
    * replaces svg <image> hrefs with the matching entry of `tokens`;
    * removes <trigger> elements.
    """

    # Process universal tags

    if node.get("epub:type") is not None:
        node.attrib.pop("epub:type")

    el_id = node.get("id")
    if el_id:
        node.set("id", f"{path_to_name(path)}_{el_id}")

    # Tag processing

    if node.tag == "a":
        process_a_element(node, path)

    elif node.tag == "hgroup":
        node.tag = "div"

    elif node.tag in ("img", "source", "video", "audio"):
        process_media_element(node, path, tokens)

    elif node.tag == IMAGE:
        href = node.get(HREF)
        # An <image> without xlink:href has nothing to resolve
        if href:
            media_path = rel_to_abs_path(path, href)
            if media_path in tokens:
                node.set(HREF, tokens[media_path])

    elif node.tag == "trigger":
        parent = node.getparent()
        # A root element has no parent and cannot be detached
        if parent is not None:
            # lxml's remove() discards the tail text together with the
            # element, so hand the tail over to the preceding content first
            if node.tail:
                previous = node.getprevious()
                if previous is not None:
                    previous.tail = (previous.tail or "") + node.tail
                else:
                    parent.text = (parent.text or "") + node.tail
            parent.remove(node)
        # The subtree is gone from the document, no need to walk it
        return

    # Recursively run for all children

    # Iterate over a snapshot: a child may remove itself from `node`, which
    # would make live iteration skip its next sibling
    for child in list(node):
        process_content(child, path, tokens)
|
||||
|
||||
|
||||
def process_a_element(node: etree.Element, path: str) -> None:
    r"""
    Converts file links to ids in \<a\> element

    * `chapter.xhtml#note` becomes `#chapter_note`;
    * `chapter.xhtml` becomes `#b_chapter`;
    * `#note` becomes `#<name of the current file>_note`.

    Elements without `href` and links that leave the book (absolute or
    protocol-relative URLs, `mailto:`) are left unchanged.
    """

    href = node.get("href")

    # <a> used as a plain anchor (id/name only) has no href at all
    if not href:
        return

    # External targets may contain ".html" too but must keep pointing outside
    if "://" in href or href.startswith(("//", "mailto:")):
        return

    if ".xhtml" in href or ".html" in href:
        href_path, hash_sign, el_id = href.rpartition("#")
        if hash_sign:
            node.set("href", f"#{path_to_name(href_path)}_{el_id}")
        else:
            node.set("href", f"#b_{path_to_name(href)}")

    # Only a bare fragment refers to the current file; the first character
    # is dropped below, so it has to be the "#" itself
    elif href.startswith("#"):
        node.set("href", f"#{path_to_name(path)}_{href[1:]}")
|
||||
|
||||
|
||||
def process_media_element(node: etree.Element, path: str, tokens: DocumentTokens) -> None:
    """
    Replaces file paths with the matching `tokens` entries in `src` and
    `srcset` attributes

    Both attributes are handled independently, so an element carrying `src`
    and `srcset` at once gets both rewritten. A value that does not resolve
    to a key of `tokens` is left as it is.
    """

    for attr in ("src", "srcset"):
        value = node.get(attr)
        if not value:
            continue

        # NOTE(review): `srcset` is resolved as a single path; a candidate
        # list with descriptors ("a.png 1x, b.png 2x") matches no key and
        # stays untouched
        media_path = rel_to_abs_path(path, value)
        if media_path in tokens:
            node.set(attr, tokens[media_path])
|
||||
|
||||
|
||||
def rel_to_abs_path(parent: str, rel: str) -> str:
    """
    Helper for converting a relative media path to a path from the archive
    root

    `rel` is resolved against the directory of `parent` and normalized.
    Percent-escapes in `rel` are decoded first, because references inside
    markup are URL-encoded (`my%20pic.png`).
    NOTE(review): assumes the keys of the tokens dict are decoded file
    names — verify against how EbookLib reports item names.
    """

    from urllib.parse import unquote  # pylint: disable=import-outside-toplevel

    return os.path.normpath(os.path.join(os.path.dirname(parent), unquote(rel)))
|
||||
|
||||
|
||||
@cache
def path_to_name(path: str) -> str:
    """
    Helper returning the last component of `path` cut at its first dot
    """

    file_name = os.path.basename(path)
    stem, _, _ = file_name.partition(".")

    return stem
|
||||
|
||||
|
||||
def children_to_html(root: etree.Element) -> bytes:
    """
    Serializes every direct child of `root` and concatenates the results
    """

    return b"".join(etree.tostring(child) for child in root)
|
||||
|
||||
|
||||
def process_xhtml(path: str, tokens: Document_Tokens):
|
||||
# TODO: Add xhtml procession
|
||||
return tokens[path]
|
||||
|
68
app/fb2.py
68
app/fb2.py
@ -1,10 +1,16 @@
|
||||
"""
|
||||
Module for FB2 file conversion to html
|
||||
"""
|
||||
|
||||
from tempfile import SpooledTemporaryFile
|
||||
import xml.etree.ElementTree as ET
|
||||
from xml.etree.ElementTree import Element
|
||||
from typing import Optional
|
||||
import html
|
||||
|
||||
from fastapi import HTTPException
|
||||
|
||||
from .utils import Document_Tokens, strip_whitespace, HTMLBook
|
||||
from .utils import DocumentTokens, strip_whitespace, HTMLBook
|
||||
|
||||
|
||||
namespaces = {
|
||||
@ -25,17 +31,20 @@ async def fb22html(file: SpooledTemporaryFile) -> HTMLBook:
|
||||
set_cover(tokens)
|
||||
html_content = fb2body2html(tokens)
|
||||
|
||||
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
|
||||
return {
|
||||
**(tokens["metadata"]),
|
||||
"content": html.escape(html.unescape(html_content.decode())),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
except Exception as err:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Error! Wrong fb2 file format: " + str(e)
|
||||
)
|
||||
status_code=500, detail="Error! Wrong fb2 file format: " + str(err)
|
||||
) from err
|
||||
|
||||
|
||||
def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
|
||||
def fb22tokens(file: SpooledTemporaryFile) -> DocumentTokens:
|
||||
|
||||
"""
|
||||
r"""
|
||||
Parses fb2 file as xml document.
|
||||
It puts book metadata, its content and media to `tokens` dictionary and returns it.
|
||||
|
||||
@ -74,7 +83,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
|
||||
if "cover" not in metadata.keys():
|
||||
metadata.pop("cover")
|
||||
|
||||
if len(metadata.keys()):
|
||||
if len(metadata.keys()) != 0:
|
||||
tokens["metadata"] = metadata.copy()
|
||||
|
||||
# Reading book content
|
||||
@ -107,9 +116,9 @@ def get_author(author: Element) -> str:
|
||||
"middle-name",
|
||||
"last-name",
|
||||
):
|
||||
el = author.find("./" + tag_name, namespaces)
|
||||
if el is not None:
|
||||
res.append(el.text)
|
||||
tag = author.find("./" + tag_name, namespaces)
|
||||
if tag is not None:
|
||||
res.append(tag.text)
|
||||
if len(res) == 0:
|
||||
res = author.find("./nickname", namespaces).text
|
||||
else:
|
||||
@ -127,8 +136,11 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]:
|
||||
if coverpage:
|
||||
return coverpage.find("./image", namespaces).get(HREF)
|
||||
|
||||
return None
|
||||
|
||||
def set_cover(tokens: Document_Tokens) -> None:
|
||||
|
||||
def set_cover(tokens: DocumentTokens) -> None:
|
||||
"""Gets cover from book and sets it in metadata"""
|
||||
cover = tokens["metadata"]["cover"]
|
||||
if cover is None:
|
||||
tokens["metadata"]["cover"] = "none"
|
||||
@ -136,7 +148,7 @@ def set_cover(tokens: Document_Tokens) -> None:
|
||||
tokens["metadata"]["cover"] = tokens[cover[1:]]
|
||||
|
||||
|
||||
def fb2body2html(tokens: Document_Tokens) -> str:
|
||||
def fb2body2html(tokens: DocumentTokens) -> str:
|
||||
|
||||
"""
|
||||
Convert fb2 xml to html, joins bodies into one string
|
||||
@ -144,14 +156,14 @@ def fb2body2html(tokens: Document_Tokens) -> str:
|
||||
|
||||
res = b""
|
||||
|
||||
xml_root = ET.fromstring(tokens["content"])
|
||||
xml_root = ET.fromstring(strip_whitespace(tokens["content"]))
|
||||
for body in xml_root.iterfind("./body"):
|
||||
res += process_section(body, tokens)
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def process_section(body: Element, tokens: Document_Tokens) -> str:
|
||||
def process_section(body: Element, tokens: DocumentTokens) -> str:
|
||||
|
||||
"""
|
||||
Processes individual sections, recursively goes throw sections tree
|
||||
@ -159,11 +171,11 @@ def process_section(body: Element, tokens: Document_Tokens) -> str:
|
||||
|
||||
res = b"<section>\n"
|
||||
|
||||
for tag in ("title", "epigraph", "annotation"):
|
||||
el = body.find("./" + tag)
|
||||
if el:
|
||||
process_content(el, tokens)
|
||||
res += children_to_html(el)
|
||||
for tag_name in ("title", "epigraph", "annotation"):
|
||||
tag = body.find("./" + tag_name)
|
||||
if tag:
|
||||
process_content(tag, tokens)
|
||||
res += children_to_html(tag)
|
||||
image = body.find("./image")
|
||||
if image:
|
||||
process_image(image, tokens)
|
||||
@ -193,18 +205,18 @@ def children_to_html(root: Element) -> str:
|
||||
return res
|
||||
|
||||
|
||||
def process_image(element: Element, tokens: DocumentTokens) -> None:
    r"""
    Converts fb2 \<image /\> to html \<img /\>. Replaces xlink:href with src="\<base64_image_data\>"

    A reference of the form `#<id>` is replaced by `tokens["<id>"]`; any
    other reference is copied to `src` as is. An element without a
    reference is only renamed, and a `#<id>` that has no entry in `tokens`
    is kept unchanged instead of aborting the whole conversion.
    """

    element.tag = "img"

    # Remove the attribute and read its value in one step; it may be absent
    href = element.attrib.pop(HREF, None)
    if not href:
        return

    if href.startswith("#"):
        element.set("src", tokens.get(href[1:], href))
    else:
        element.set("src", href)
|
||||
|
||||
|
||||
tag_replacement = {
|
||||
@ -219,14 +231,12 @@ tag_with_class = {
|
||||
"cite": "div",
|
||||
"poem": "div",
|
||||
"stanza": "div",
|
||||
"poem": "div",
|
||||
"poem": "div",
|
||||
"epigraph": "div",
|
||||
"text-author": "p",
|
||||
}
|
||||
|
||||
|
||||
def process_content(root: Element, tokens: Document_Tokens) -> None:
|
||||
def process_content(root: Element, tokens: DocumentTokens) -> None:
|
||||
|
||||
"""
|
||||
Converts fb2 xml tag names to html equivalents and my own styled elements.
|
||||
|
32
app/main.py
32
app/main.py
@ -1,19 +1,47 @@
|
||||
"""Webserver for converting epub and fb2 files to html"""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from fastapi import FastAPI, File, UploadFile, HTTPException
|
||||
from pydantic import BaseModel # pylint: disable=no-name-in-module
|
||||
|
||||
from .epub import epub2html
|
||||
from .fb2 import fb22html
|
||||
from .utils import HashedHTMLBook, add_hash
|
||||
|
||||
|
||||
class DebugInfo(BaseModel):  # pylint: disable=too-few-public-methods
    """Response schema of the root ("/") handler"""

    # Filled by the root handler with `start_time.isoformat()`
    startup_time: str
|
||||
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
start_time = datetime.now()
|
||||
|
||||
@app.get("/", response_model=DebugInfo)
def root():
    """
    Test if server is running.

    Responds with the module-level `start_time` rendered in ISO format.
    """

    started_at = start_time.isoformat()

    return {"startup_time": started_at}
|
||||
|
||||
|
||||
@app.post("/uploadfile/", response_model=HashedHTMLBook)
|
||||
async def create_upload_file(file: UploadFile = File(...)):
|
||||
"""
|
||||
Main api handler:
|
||||
|
||||
Accepts files with fb2 and epub extensions
|
||||
|
||||
Returns HTTP 415 error if file has unsupported format
|
||||
|
||||
Else returns object with book metadata and its html
|
||||
"""
|
||||
if file.filename.endswith(".fb2"):
|
||||
content = await fb22html(file.file)
|
||||
elif file.filename.endswith(".epub"):
|
||||
|
35
app/utils.py
35
app/utils.py
@ -1,30 +1,44 @@
|
||||
"""
|
||||
Utils for publite_backend module
|
||||
"""
|
||||
|
||||
|
||||
from typing import Union, Optional
|
||||
from pydantic import BaseModel
|
||||
import re
|
||||
from hashlib import sha256
|
||||
|
||||
Document_Tokens = dict[str, Union[str, dict[str, str]]]
|
||||
from pydantic import BaseModel # pylint: disable=no-name-in-module
|
||||
|
||||
DocumentTokens = dict[str, Union[str, dict[str, str]]]
|
||||
|
||||
|
||||
class HTMLBook(BaseModel):  # pylint: disable=too-few-public-methods
    """Intermediate model of a converted book: metadata plus html content"""

    title: str
    author: str
    # Optional: a book may come without a cover
    cover: Optional[str]
    content: str
|
||||
|
||||
|
||||
class HashedHTMLBook(HTMLBook):  # pylint: disable=too-few-public-methods
    """Book model extended with a hash of its content"""

    hash: str
|
||||
|
||||
|
||||
replacements = [
|
||||
(" ", "\r"),
|
||||
(">\s+?<", "><"),
|
||||
(" ", ""),
|
||||
("", ""),
|
||||
(r">\s+?<", "><"),
|
||||
]
|
||||
|
||||
|
||||
def strip_whitespace(s: bytes) -> str:
|
||||
res = s.decode()
|
||||
def strip_whitespace(string: bytes) -> str:
|
||||
|
||||
"""Removes"""
|
||||
|
||||
res = string.decode()
|
||||
|
||||
for old, new in replacements:
|
||||
res = re.sub(old, new, res)
|
||||
@ -33,6 +47,11 @@ def strip_whitespace(s: bytes) -> str:
|
||||
|
||||
|
||||
def add_hash(content: HTMLBook) -> HashedHTMLBook:
|
||||
|
||||
"""
|
||||
Adds hash of book content
|
||||
"""
|
||||
|
||||
h_content: HashedHTMLBook = content.copy()
|
||||
h_content["hash"] = sha256(content["content"].encode()).hexdigest()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user