Added whitespace stripping
This commit is contained in:
parent
e5ade4b81e
commit
9327163e97
@ -7,7 +7,7 @@ from ebooklib import epub
|
|||||||
|
|
||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
|
|
||||||
from .utils import Document_Tokens
|
from .utils import Document_Tokens, strip_whitespace
|
||||||
|
|
||||||
|
|
||||||
async def epub2html(file: SpooledTemporaryFile) -> str:
|
async def epub2html(file: SpooledTemporaryFile) -> str:
|
||||||
@ -24,7 +24,7 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
|
|||||||
# TODO: join tokens to HTML
|
# TODO: join tokens to HTML
|
||||||
html_content = ""
|
html_content = ""
|
||||||
...
|
...
|
||||||
return {**(tokens["metadata"]), "content": html_content}
|
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
|
@ -4,7 +4,7 @@ from xml.etree.ElementTree import Element
|
|||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
from fastapi import HTTPException
|
from fastapi import HTTPException
|
||||||
|
|
||||||
from .utils import Document_Tokens
|
from .utils import Document_Tokens, strip_whitespace
|
||||||
|
|
||||||
|
|
||||||
namespaces = {
|
namespaces = {
|
||||||
@ -25,7 +25,7 @@ async def fb22html(file: SpooledTemporaryFile) -> dict[str, str]:
|
|||||||
set_cover(tokens)
|
set_cover(tokens)
|
||||||
html_content = fb2body2html(tokens)
|
html_content = fb2body2html(tokens)
|
||||||
|
|
||||||
return {**(tokens["metadata"]), "content": html_content}
|
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
@ -71,7 +71,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
|
|||||||
metadata["title"] = book_info.find("./book-title", namespaces).text
|
metadata["title"] = book_info.find("./book-title", namespaces).text
|
||||||
metadata["author"] = get_author(book_info.find("./author", namespaces))
|
metadata["author"] = get_author(book_info.find("./author", namespaces))
|
||||||
metadata["cover"] = get_cover(book_info.find("./coverpage", namespaces))
|
metadata["cover"] = get_cover(book_info.find("./coverpage", namespaces))
|
||||||
if not 'cover' in metadata.keys():
|
if not "cover" in metadata.keys():
|
||||||
metadata.pop("cover")
|
metadata.pop("cover")
|
||||||
|
|
||||||
if len(metadata.keys()):
|
if len(metadata.keys()):
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from typing import Union, Optional
|
from typing import Union, Optional
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
import re
|
||||||
|
|
||||||
Document_Tokens = dict[str, Union[str, dict[str, str]]]
|
Document_Tokens = dict[str, Union[str, dict[str, str]]]
|
||||||
|
|
||||||
@ -9,3 +10,7 @@ class HTMLBook(BaseModel):
|
|||||||
author: str
|
author: str
|
||||||
cover: Optional[str]
|
cover: Optional[str]
|
||||||
content: str
|
content: str
|
||||||
|
|
||||||
|
|
||||||
|
def strip_whitespace(s: Union[bytes, str]) -> str:
    """Remove whitespace that precedes a tag and trim both ends of the markup.

    Args:
        s: Markup either as ``bytes`` (decoded with the default codec,
            UTF-8) or as an already-decoded ``str``.

    Returns:
        The text with every whitespace run that is immediately followed by
        ``<`` removed, and with leading/trailing whitespace stripped.
    """
    # Callers pass both raw bytes and plain strings; only bytes-like input
    # has (and needs) ``.decode()``.
    text = s.decode() if isinstance(s, (bytes, bytearray)) else s

    # Raw string: "\s" in a regular literal is an invalid escape sequence.
    # The lookahead leaves the "<" itself in place.
    return re.sub(r"\s+(?=<)", "", text).strip()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user