Added basic epub html creation

This commit is contained in:
Dmitriy Shishkov 2021-07-03 00:17:03 +05:00
parent c3e79b8b85
commit 58e7b1488b
No known key found for this signature in database
GPG Key ID: 14358F96FCDD8060
3 changed files with 50 additions and 14 deletions

View File

@ -17,13 +17,11 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
"""
try:
tokens = await epub_to_tokens(file)
tokens, spine = await epub_to_tokens(file)
set_cover(tokens)
html_content = epub_tokens2html(spine, tokens)
print(tokens["metadata"])
...
# TODO: join tokens to HTML
html_content = ""
...
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
except Exception as e:
@ -32,7 +30,9 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
)
async def epub_to_tokens(file: SpooledTemporaryFile) -> Document_Tokens:
async def epub_to_tokens(
file: SpooledTemporaryFile,
) -> tuple[Document_Tokens, list[tuple[str, str]]]:
"""
Passes file content to EbookLib library and parses epub tokens into dict of the following format:
@ -60,14 +60,14 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> Document_Tokens:
# Iterating over Items
for item in book.get_items():
item: epub.EpubItem
item_type = item.get_type()
name = item.get_name()
content = item.get_content()
if item_type == ebooklib.ITEM_DOCUMENT:
# Adding book chapters to tokens list
tokens[name] = content
name = item.id
tokens[name] = item.get_body_content()
elif item_type in (
ebooklib.ITEM_COVER,
@ -77,6 +77,8 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> Document_Tokens:
ebooklib.ITEM_VECTOR,
):
# Adding assets to tokens list
name = item.get_name()
content = item.get_content()
media_type = item.media_type
b64_content = b64encode(content).decode()
@ -85,7 +87,7 @@ async def epub_to_tokens(file: SpooledTemporaryFile) -> Document_Tokens:
if item_type == ebooklib.ITEM_COVER:
tokens["metadata"]["cover"] = name
return tokens
return tokens, book.spine.copy()
def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
@ -94,3 +96,26 @@ def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
res.append(title_obj[0])
return "; ".join(res)
def set_cover(tokens: Document_Tokens):
    """Swap the cover *name* stored in the metadata for the cover's token value.

    ``tokens["metadata"]["cover"]`` is read as a key into ``tokens``. When
    ``tokens`` has an entry under that key, the metadata field is overwritten
    in place with that entry. Otherwise ``tokens`` is left untouched.

    Args:
        tokens: Mapping with a ``"metadata"`` mapping and, optionally, an
            entry keyed by the cover name.

    Returns:
        None. ``tokens`` is modified in place.
    """
    metadata = tokens["metadata"]
    # A book without a cover never gets the "cover" key, so look it up
    # defensively instead of raising KeyError.
    cover_name = metadata.get("cover")
    if cover_name is not None and cover_name in tokens:
        metadata["cover"] = tokens[cover_name]
def epub_tokens2html(spine: list[tuple[str, str]], tokens: Document_Tokens) -> bytes:
    """Concatenate the chapters referenced by ``spine`` into one bytes blob.

    Args:
        spine: ``(name, flag)`` pairs, processed in the order given. Only
            ``name`` is used (as a key into ``tokens``); the second element
            is ignored.
        tokens: Mapping searched for each spine ``name``. It is also passed
            through to ``process_xhtml``.

    Returns:
        The ``process_xhtml`` result for every spine entry present in
        ``tokens``, joined in spine order. Entries missing from ``tokens``
        are skipped, and ``b""`` is returned when nothing matches.
    """
    # Join once at the end: repeated ``bytes +=`` re-copies the growing
    # buffer for every chapter.
    return b"".join(
        process_xhtml(tokens[name], tokens)
        for name, _ in spine
        if name in tokens
    )
def process_xhtml(xhtml: bytes, tokens: Document_Tokens) -> bytes:
    """Return ``xhtml`` unchanged; placeholder for per-chapter processing.

    Args:
        xhtml: Chapter markup as bytes.
        tokens: Accepted for the future implementation; currently unused.

    Returns:
        The ``xhtml`` argument, as is.
    """
    # TODO: Add xhtml processing
    return xhtml

View File

@ -16,8 +16,8 @@ def root():
async def create_upload_file(file: UploadFile = File(...)):
    """Convert an uploaded book, choosing the converter by file extension.

    ``.fb2`` uploads go to ``fb22html`` and ``.epub`` uploads go to
    ``epub2html``; the converter's result is returned as is.

    Raises:
        HTTPException: 415 when the filename has neither extension
            (including when no filename was supplied).
    """
    # The filename may be missing, and extensions are matched
    # case-insensitively so "BOOK.EPUB" is accepted as well.
    filename = (file.filename or "").lower()
    if filename.endswith(".fb2"):
        content = await fb22html(file.file)
    elif filename.endswith(".epub"):
        content = await epub2html(file.file)
    else:
        raise HTTPException(status_code=415, detail="Error! Unsupported file type")
    return content

View File

@ -12,5 +12,16 @@ class HTMLBook(BaseModel):
content: str
# Ordered (pattern, replacement) pairs that strip_whitespace applies with
# re.sub. Order matters: character references are decoded first so the
# whitespace they produce can be collapsed by the later pattern.
replacements = [
    # NOTE(review): this literal was split across two lines in the rendered
    # source; restored as the carriage-return character reference, which
    # matches its "\r" replacement -- confirm against the repository.
    (r"&#13;", "\r"),
    # Drop whitespace that sits only between two adjacent tags.
    (r">\s+?<", "><"),
]


def strip_whitespace(s: bytes) -> str:
    """Decode markup bytes and remove insignificant whitespace.

    Every ``(pattern, replacement)`` pair in ``replacements`` is applied in
    order with ``re.sub``, then leading and trailing whitespace is stripped.
    Whitespace between text and a tag (``"one <b>"``) is preserved.

    Args:
        s: Markup as bytes; decoded with the default codec (UTF-8).

    Returns:
        The cleaned-up markup as ``str``.

    Raises:
        UnicodeDecodeError: If ``s`` is not valid UTF-8.
    """
    res = s.decode()
    for old, new in replacements:
        res = re.sub(old, new, res)
    return res.strip()