diff --git a/app/epub.py b/app/epub.py index c9c6946..08bbad0 100644 --- a/app/epub.py +++ b/app/epub.py @@ -7,7 +7,7 @@ from ebooklib import epub from tempfile import SpooledTemporaryFile -from .utils import Document_Tokens +from .utils import Document_Tokens, strip_whitespace async def epub2html(file: SpooledTemporaryFile) -> str: @@ -24,7 +24,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str: # TODO: join tokens to HTML html_content = "" ... - return {**(tokens["metadata"]), "content": html_content} + return {**(tokens["metadata"]), "content": strip_whitespace(html_content)} except Exception as e: raise HTTPException( diff --git a/app/fb2.py b/app/fb2.py index 2a2e49f..1771147 100644 --- a/app/fb2.py +++ b/app/fb2.py @@ -4,7 +4,7 @@ from xml.etree.ElementTree import Element from typing import Optional, Union from fastapi import HTTPException -from .utils import Document_Tokens +from .utils import Document_Tokens, strip_whitespace namespaces = { @@ -25,7 +25,7 @@ async def fb22html(file: SpooledTemporaryFile) -> dict[str, str]: set_cover(tokens) html_content = fb2body2html(tokens) - return {**(tokens["metadata"]), "content": html_content} + return {**(tokens["metadata"]), "content": strip_whitespace(html_content)} except Exception as e: raise HTTPException( @@ -71,7 +71,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens: metadata["title"] = book_info.find("./book-title", namespaces).text metadata["author"] = get_author(book_info.find("./author", namespaces)) metadata["cover"] = get_cover(book_info.find("./coverpage", namespaces)) - if not 'cover' in metadata.keys(): + if not "cover" in metadata.keys(): metadata.pop("cover") if len(metadata.keys()): diff --git a/app/utils.py b/app/utils.py index b277cca..92fcd74 100644 --- a/app/utils.py +++ b/app/utils.py @@ -1,5 +1,6 @@ from typing import Union, Optional from pydantic import BaseModel +import re Document_Tokens = dict[str, Union[str, dict[str, str]]] @@ -9,3 +10,7 @@ class 
def strip_whitespace(s: Union[bytes, str]) -> str:
    """Remove whitespace that directly precedes a ``<`` and trim both ends.

    Every run of whitespace immediately followed by ``<`` is deleted, then
    leading and trailing whitespace is stripped from the result. Whitespace
    that is not followed by ``<`` (for example between two words) is kept.

    Args:
        s: Markup to compact. ``bytes`` input is decoded with the default
            codec (UTF-8); ``str`` input is used as-is.

    Returns:
        The compacted markup as ``str``.

    Raises:
        UnicodeDecodeError: If ``s`` is ``bytes`` that is not valid UTF-8.
    """
    # Callers hand over both raw bytes and already-decoded text; only the
    # former has (and needs) ``.decode()``.
    text = s.decode() if isinstance(s, (bytes, bytearray)) else s
    # Raw string: ``\s`` must reach the regex engine untouched instead of
    # being treated as an (invalid) Python string escape.
    return re.sub(r"\s+(?=<)", "", text).strip()