Added whitespace stripping
This commit is contained in:
parent
e5ade4b81e
commit
9327163e97
@ -7,7 +7,7 @@ from ebooklib import epub
|
||||
|
||||
from tempfile import SpooledTemporaryFile
|
||||
|
||||
from .utils import Document_Tokens
|
||||
from .utils import Document_Tokens, strip_whitespace
|
||||
|
||||
|
||||
async def epub2html(file: SpooledTemporaryFile) -> str:
|
||||
@ -24,7 +24,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
|
||||
# TODO: join tokens to HTML
|
||||
html_content = ""
|
||||
...
|
||||
return {**(tokens["metadata"]), "content": html_content}
|
||||
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
|
||||
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
|
@ -4,7 +4,7 @@ from xml.etree.ElementTree import Element
|
||||
from typing import Optional, Union
|
||||
from fastapi import HTTPException
|
||||
|
||||
from .utils import Document_Tokens
|
||||
from .utils import Document_Tokens, strip_whitespace
|
||||
|
||||
|
||||
namespaces = {
|
||||
@ -25,7 +25,7 @@ async def fb22html(file: SpooledTemporaryFile) -> dict[str, str]:
|
||||
set_cover(tokens)
|
||||
html_content = fb2body2html(tokens)
|
||||
|
||||
return {**(tokens["metadata"]), "content": html_content}
|
||||
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
|
||||
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
@ -71,7 +71,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
|
||||
metadata["title"] = book_info.find("./book-title", namespaces).text
|
||||
metadata["author"] = get_author(book_info.find("./author", namespaces))
|
||||
metadata["cover"] = get_cover(book_info.find("./coverpage", namespaces))
|
||||
if not 'cover' in metadata.keys():
|
||||
if not "cover" in metadata.keys():
|
||||
metadata.pop("cover")
|
||||
|
||||
if len(metadata.keys()):
|
||||
|
@ -1,5 +1,6 @@
|
||||
from typing import Union, Optional
|
||||
from pydantic import BaseModel
|
||||
import re
|
||||
|
||||
Document_Tokens = dict[str, Union[str, dict[str, str]]]
|
||||
|
||||
@ -9,3 +10,7 @@ class HTMLBook(BaseModel):
|
||||
author: str
|
||||
cover: Optional[str]
|
||||
content: str
|
||||
|
||||
|
||||
def strip_whitespace(s: Union[bytes, str]) -> str:
    """Remove whitespace that directly precedes a ``<`` and trim both ends.

    Every run of whitespace immediately followed by ``<`` is deleted, which
    collapses indentation and newlines sitting in front of tags. Whitespace
    that is followed by anything else (for example text inside an element)
    is left untouched, apart from the final leading/trailing ``strip()``.

    Args:
        s: Markup as ``bytes`` (decoded with the default UTF-8 codec) or as
            an already-decoded ``str``. Accepting ``str`` keeps callers that
            build their HTML as text from failing on ``.decode()``.

    Returns:
        The cleaned markup as ``str``.

    Raises:
        UnicodeDecodeError: If ``s`` is bytes that are not valid UTF-8.
    """
    # Only bytes-like input needs decoding; str has no .decode() method.
    text = s.decode() if isinstance(s, (bytes, bytearray)) else s
    # Raw string so that \s reaches the regex engine instead of being an
    # invalid string escape; the lookahead keeps the "<" itself in place.
    return re.sub(r"\s+(?=<)", "", text).strip()
|
||||
|
Loading…
x
Reference in New Issue
Block a user