Added whitespace stripping

2021-07-02 04:05:45 +05:00
parent e5ade4b81e
commit 9327163e97
3 changed files with 10 additions and 5 deletions
--- a/app/epub.py
+++ b/app/epub.py
@ -7,7 +7,7 @@ from ebooklib import epub
 from tempfile import SpooledTemporaryFile
-from .utils import Document_Tokens
+from .utils import Document_Tokens, strip_whitespace
 async def epub2html(file: SpooledTemporaryFile) -> str:
@ -24,7 +24,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
        # TODO: join tokens to HTML
        html_content = ""
        ...
-        return {**(tokens["metadata"]), "content": html_content}
+        return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
    except Exception as e:
        raise HTTPException(
--- a/app/fb2.py
+++ b/app/fb2.py
@ -4,7 +4,7 @@ from xml.etree.ElementTree import Element
 from typing import Optional, Union
 from fastapi import HTTPException
-from .utils import Document_Tokens
+from .utils import Document_Tokens, strip_whitespace
 namespaces = {
@ -25,7 +25,7 @@ async def fb22html(file: SpooledTemporaryFile) -> dict[str, str]:
        set_cover(tokens)
        html_content = fb2body2html(tokens)
-        return {**(tokens["metadata"]), "content": html_content}
+        return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
    except Exception as e:
        raise HTTPException(
@ -71,7 +71,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
        metadata["title"] = book_info.find("./book-title", namespaces).text
        metadata["author"] = get_author(book_info.find("./author", namespaces))
        metadata["cover"] = get_cover(book_info.find("./coverpage", namespaces))
-        if not 'cover' in metadata.keys():
+        if not "cover" in metadata.keys():
            metadata.pop("cover")
        if len(metadata.keys()):
--- a/app/utils.py
+++ b/app/utils.py
@ -1,5 +1,6 @@
 from typing import Union, Optional
 from pydantic import BaseModel
 import re
 Document_Tokens = dict[str, Union[str, dict[str, str]]]
@ -9,3 +10,7 @@ class HTMLBook(BaseModel):
    author: str
    cover: Optional[str]
    content: str
 def strip_whitespace(s: Union[bytes, str]) -> str:
    """Remove whitespace that directly precedes a ``<`` and trim both ends.

    Args:
        s: Markup to clean. ``bytes`` input is decoded with the default
            codec (UTF-8); ``str`` input is used as-is, so callers that
            already hold decoded text do not fail on a missing ``.decode``.

    Returns:
        The text with every run of whitespace immediately followed by ``<``
        removed, and leading/trailing whitespace stripped. Whitespace that
        is not directly before a ``<`` is left untouched.
    """
    if isinstance(s, (bytes, bytearray)):
        text = s.decode()
    else:
        text = s
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    return re.sub(r"\s+(?=<)", "", text).strip()