Added whitespace stripping

This commit is contained in:
Dmitriy Shishkov 2021-07-02 04:05:45 +05:00
parent e5ade4b81e
commit 9327163e97
No known key found for this signature in database
GPG Key ID: 14358F96FCDD8060
3 changed files with 10 additions and 5 deletions

View File

@ -7,7 +7,7 @@ from ebooklib import epub
from tempfile import SpooledTemporaryFile
from .utils import Document_Tokens
from .utils import Document_Tokens, strip_whitespace
async def epub2html(file: SpooledTemporaryFile) -> str:
@ -24,7 +24,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
# TODO: join tokens to HTML
html_content = ""
...
return {**(tokens["metadata"]), "content": html_content}
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
except Exception as e:
raise HTTPException(

View File

@ -4,7 +4,7 @@ from xml.etree.ElementTree import Element
from typing import Optional, Union
from fastapi import HTTPException
from .utils import Document_Tokens
from .utils import Document_Tokens, strip_whitespace
namespaces = {
@ -25,7 +25,7 @@ async def fb22html(file: SpooledTemporaryFile) -> dict[str, str]:
set_cover(tokens)
html_content = fb2body2html(tokens)
return {**(tokens["metadata"]), "content": html_content}
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
except Exception as e:
raise HTTPException(
@ -71,7 +71,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
metadata["title"] = book_info.find("./book-title", namespaces).text
metadata["author"] = get_author(book_info.find("./author", namespaces))
metadata["cover"] = get_cover(book_info.find("./coverpage", namespaces))
if not 'cover' in metadata.keys():
if not "cover" in metadata.keys():
metadata.pop("cover")
if len(metadata.keys()):

View File

@ -1,5 +1,6 @@
from typing import Union, Optional
from pydantic import BaseModel
import re
Document_Tokens = dict[str, Union[str, dict[str, str]]]
@ -9,3 +10,7 @@ class HTMLBook(BaseModel):
author: str
cover: Optional[str]
content: str
def strip_whitespace(s: Union[bytes, str]) -> str:
    """Remove whitespace that directly precedes a ``<`` and trim both ends.

    Args:
        s: Markup to clean up. ``bytes`` input is decoded with the default
            codec (UTF-8); ``str`` input is used as-is, so callers that
            already hold text do not fail on a missing ``.decode``.

    Returns:
        The text with every whitespace run immediately before a ``<``
        removed and leading/trailing whitespace stripped.
    """
    text = s.decode() if isinstance(s, (bytes, bytearray)) else s
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    # The lookahead keeps the "<" itself and drops only the whitespace.
    return re.sub(r"\s+(?=<)", "", text).strip()