Added whitespace stripping

2021-07-02 04:05:45 +05:00
parent e5ade4b81e
commit 9327163e97
3 changed files with 10 additions and 5 deletions
--- a/app/epub.py
+++ b/app/epub.py
@ -7,7 +7,7 @@ from ebooklib import epub

 from tempfile import SpooledTemporaryFile

-from .utils import Document_Tokens
+from .utils import Document_Tokens, strip_whitespace


 async def epub2html(file: SpooledTemporaryFile) -> str:
@ -24,7 +24,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
        # TODO: join tokens to HTML
        html_content = ""
        ...
-        return {**(tokens["metadata"]), "content": html_content}
+        return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}

    except Exception as e:
        raise HTTPException(
--- a/app/fb2.py
+++ b/app/fb2.py
@ -4,7 +4,7 @@ from xml.etree.ElementTree import Element
 from typing import Optional, Union
 from fastapi import HTTPException

-from .utils import Document_Tokens
+from .utils import Document_Tokens, strip_whitespace


 namespaces = {
@ -25,7 +25,7 @@ async def fb22html(file: SpooledTemporaryFile) -> dict[str, str]:
        set_cover(tokens)
        html_content = fb2body2html(tokens)

-        return {**(tokens["metadata"]), "content": html_content}
+        return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}

    except Exception as e:
        raise HTTPException(
@ -71,7 +71,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
        metadata["title"] = book_info.find("./book-title", namespaces).text
        metadata["author"] = get_author(book_info.find("./author", namespaces))
        metadata["cover"] = get_cover(book_info.find("./coverpage", namespaces))
-        if not 'cover' in metadata.keys():
+        if not "cover" in metadata.keys():
            metadata.pop("cover")

        if len(metadata.keys()):
--- a/app/utils.py
+++ b/app/utils.py
@ -1,5 +1,6 @@
 from typing import Union, Optional
 from pydantic import BaseModel
+import re

 Document_Tokens = dict[str, Union[str, dict[str, str]]]

@ -9,3 +10,7 @@ class HTMLBook(BaseModel):
    author: str
    cover: Optional[str]
    content: str
+
+
+def strip_whitespace(s: Union[bytes, str]) -> str:  # str is used as-is; anything else is decoded via .decode() (UTF-8 default)
+    return re.sub(r"\s+(?=<)", "", s if isinstance(s, str) else s.decode()).strip()  # drop whitespace runs right before "<", then trim both ends