Compare commits

...

12 Commits
v0.1.0 ... main

17 changed files with 412 additions and 150 deletions

View File

@ -19,12 +19,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 black
pip install -r requirements/dev.txt
- name: Lint with flake8
run: |
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Lint with pylint
run: pylint app --extension-pkg-allow-list=lxml
- name: Format with black
run: black .

View File

@ -13,12 +13,6 @@ jobs:
- name: Checkout
uses: actions/checkout@v2
- name: Docker meta
id: meta
uses: docker/metadata-action@v3
with:
images: ghcr.io/${{ github.repository }}
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v1
@ -44,9 +38,10 @@ jobs:
with:
context: .
file: ./Dockerfile
builder: ${{ steps.buildx.outputs.name }}
cache-from: type=local,src=/tmp/.buildx-cache
cache-to: type=local,dest=/tmp/.buildx-cache
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
tags: ghcr.io/${{ github.repository }}:${{ github.sha }},ghcr.io/${{ github.repository }}:latest

View File

@ -50,9 +50,11 @@ jobs:
with:
context: .
file: ./Dockerfile
builder: ${{ steps.buildx.outputs.name }}
cache-from: type=local,src=/tmp/.buildx-cache
cache-to: type=local,dest=/tmp/.buildx-cache
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}

3
.gitignore vendored
View File

@ -1,3 +1,4 @@
.venv
__pycache__/
.vscode
.vscode
.vercel

View File

@ -1,13 +1,11 @@
FROM python
FROM python:alpine
WORKDIR /srv
COPY ./requirements.txt /srv/requirements.txt
COPY ./requirements /srv/requirements
RUN pip install -r requirements.txt
EXPOSE 80
RUN pip install -r requirements/prod.txt
COPY ./app /srv/app
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"]
CMD uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8081}

View File

@ -10,19 +10,54 @@ Backend for online ebook viewer publite
## Deploy
Run app locally (development only!)
```bash
# install requirements
pip install -r requirements/dev.txt
# run app with uvicorn
uvicorn app.main:app --reload --port <port>
```
Run app locally (test prod)
```bash
# install requirements
pip install -r requirements/prod.txt
# run app with uvicorn
uvicorn app.main:app --port <port>
# or
# run with python script
python run.py
```
Simple docker deployment
```bash
# build docker image
# build docker image
docker build . -t publite_backend
# run it with docker
docker run -p <port>:80 publite_backend
docker run -p <port>:8081 publite_backend
```
Dokku deployment with image from Docker Hub
```bash
dokku apps:create publitebackend
# increase file size limit to be able to upload bigger books
dokku nginx:set publitebackend client_max_body_size 50m
dokku git:from-image publitebackend publite/backend:latest
```
```
# TODO
- Separate epub and fb2 files to python modules
- Rewrite own `.opf` file parsing to get rid of dependency on EbookLib
- Add cli interfaces for epub and fb2 libs

1
api Symbolic link
View File

@ -0,0 +1 @@
app

0
app/__init__.py Normal file
View File

View File

@ -1,16 +1,28 @@
import aiofiles as aiof
"""
Module for EPUB file conversion to html
"""
import html
import os
from base64 import b64encode
from fastapi import HTTPException
import ebooklib
from ebooklib import epub
from functools import cache
from tempfile import SpooledTemporaryFile
from .utils import Document_Tokens, strip_whitespace
import aiofiles
import ebooklib
from ebooklib import epub
from fastapi import HTTPException
from lxml import etree
from .utils import DocumentTokens, HTMLBook, strip_whitespace
parser = etree.XMLParser(recover=True)
IMAGE = "{http://www.w3.org/2000/svg}image"
HREF = "{http://www.w3.org/1999/xlink}href"
async def epub2html(file: SpooledTemporaryFile) -> str:
async def epub2html(file: SpooledTemporaryFile) -> HTMLBook:
"""
Splits epub to tokens and joins them to one html file
@ -22,40 +34,43 @@ async def epub2html(file: SpooledTemporaryFile) -> str:
html_content = epub_tokens2html(spine, tokens)
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
return {
**(tokens["metadata"]),
"content": html_content,
}
except Exception as e:
except Exception as err:
raise HTTPException(
status_code=500, detail="Error! Wrong epub file format: " + str(e)
)
status_code=500, detail="Error! Wrong epub file format: " + str(err)
) from err
async def epub_to_tokens(
file: SpooledTemporaryFile,
) -> tuple[Document_Tokens, list[tuple[str, str]]]:
) -> tuple[DocumentTokens, list[tuple[str, str]]]:
"""
Passes file content to EbookLib library and parses epub tokens into dict of the following format:
r"""
Passes file content to EbookLib library and parses epub tokens into dict of
the following format:
{ "\<file_name\>": "\<file_content\>" }
Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html
Where file content is either plain text for xhtml or base64 encoded data
for other formats, prepared for embedding to html
"""
tokens = {}
async with aiof.tempfile.NamedTemporaryFile() as tmp:
async with aiofiles.tempfile.NamedTemporaryFile() as tmp:
await tmp.write(file.read())
book = epub.read_epub(tmp.name)
# Reading book file
reader = epub.EpubReader(tmp.name)
book = reader.load()
reader.process()
# Adding book metadata to tokens list
metadata = {}
metadata["title"] = convert_list(book.get_metadata("DC", "title"))
metadata["author"] = convert_list(book.get_metadata("DC", "creator"))
tokens["metadata"] = metadata.copy()
tokens["metadata"] = read_metadata(book)
tokens["toc"] = {}
# Iterating over Items
@ -63,34 +78,52 @@ async def epub_to_tokens(
item: epub.EpubItem
item_type = item.get_type()
file_path = os.path.join(reader.opf_dir, item.get_name())
if item_type == ebooklib.ITEM_DOCUMENT:
# Adding book chapters to tokens list
name = item.id
tokens[name] = item.get_body_content()
name = item.get_id()
tokens[file_path] = strip_whitespace(item.get_body_content())
tokens["toc"][name] = file_path
elif item_type in (
ebooklib.ITEM_AUDIO,
ebooklib.ITEM_COVER,
ebooklib.ITEM_IMAGE,
ebooklib.ITEM_STYLE,
ebooklib.ITEM_VIDEO,
ebooklib.ITEM_VECTOR,
ebooklib.ITEM_VIDEO,
):
# Adding assets to tokens list
name = item.get_name()
content = item.get_content()
media_type = item.media_type
b64_content = b64encode(content).decode()
tokens[name] = f"data:{media_type};base64,{b64_content}"
tokens[file_path] = f"data:{media_type};base64,{b64_content}"
if item_type == ebooklib.ITEM_COVER:
tokens["metadata"]["cover"] = name
tokens["metadata"]["cover"] = file_path
return tokens, book.spine.copy()
def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
def read_metadata(book: epub.EpubBook) -> dict[str, str]:
"""
Reads metadata from xml to dict
"""
metadata = {}
metadata["title"] = book.get_metadata("DC", "title")[0][0]
metadata["author"] = convert_list(book.get_metadata("DC", "creator"))
return metadata.copy()
def convert_list(titles_list: list[tuple[str, dict[str, str]]]) -> str:
"""
Joins titles list to one string
"""
res = []
for title_obj in titles_list:
res.append(title_obj[0])
@ -98,24 +131,147 @@ def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
return "; ".join(res)
def set_cover(tokens: Document_Tokens):
cover_name = tokens["metadata"]["cover"]
def set_cover(tokens: DocumentTokens) -> None:
"""
Converts cover file name to base64 image stored in `tokens`
"""
cover_name = tokens["metadata"].get("cover")
if cover_name in tokens.keys():
tokens["metadata"]["cover"] = tokens[cover_name]
def epub_tokens2html(spine: list[tuple[str, str]], tokens: Document_Tokens):
def epub_tokens2html(spine: list[tuple[str, str]], tokens: DocumentTokens) -> bytes:
"""
Joins chapters in `spine` to one html string
"""
res = ""
for name, _ in spine:
file_path = tokens["toc"].get(name)
if file_path:
res += process_xhtml(file_path, tokens)
return html.unescape(res)
def process_xhtml(path: str, tokens: DocumentTokens) -> bytes:
"""
Processes content of one xml body
"""
xml: etree.Element = etree.fromstring(tokens[path], parser=parser)
if xml.tag == "body":
xml.tag = "div"
process_content(xml, path, tokens)
return (
f'<section id="b_{path_to_name(path)}">{etree.tostring(xml).decode()}</section>'
)
def process_content(node: etree.Element, path: str, tokens: DocumentTokens) -> None:
"""
Recursive function for xml element conversion to valid html
"""
# Process universal tags
if node.get("epub:type"):
node.attrib.pop("epub:type")
el_id = node.get("id")
if el_id:
node.set("id", f"{path_to_name(path)}_{el_id}")
# Tag processing
if node.tag == "a":
process_a_element(node, path)
elif node.tag == "hgroup":
node.tag = "div"
elif node.tag in ("img", "source", "video", "audio"):
process_media_element(node, path, tokens)
elif node.tag == IMAGE:
href = node.get(HREF)
media_path = rel_to_abs_path(path, href)
if media_path in tokens.keys():
node.set(HREF, tokens[media_path])
elif node.tag == "trigger":
node.getparent().remove(node)
# Recursively run for all children
for child in node:
process_content(child, path, tokens)
def process_a_element(node: etree.Element, path: str):
r"""
Converts `filed` links to ids in \<a\> element
"""
href = node.get("href")
if href.count(".xhtml") or href.count(".html"):
id_pos = href.rfind("#")
if id_pos != -1:
href_path, el_id = href[:id_pos], href[id_pos:]
node.set("href", f"#{path_to_name(href_path)}_{el_id[1:]}")
else:
node.set("href", f"#b_{path_to_name(href)}")
elif href.count("#"):
node.set("href", f"#{path_to_name(path)}_{href[1:]}")
def process_media_element(node: etree.Element, path: str, tokens: DocumentTokens):
"""
Replaces file paths to base64 encoded media in `src` and `srcset` tags
"""
src = node.get("src")
attr = "src"
if not src:
src = node.get("srcset")
attr = "srcset"
if src:
media_path = rel_to_abs_path(path, src)
if media_path in tokens.keys():
node.set(attr, tokens[media_path])
def rel_to_abs_path(parent: str, rel: str):
"""
Helper for converting a relative media path to an absolute path
"""
return os.path.normpath(os.path.join(os.path.dirname(parent), rel))
@cache
def path_to_name(path: str) -> str:
"""
Helper function for getting file name
"""
return os.path.basename(path).split(".")[0]
def children_to_html(root: etree.Element) -> bytes:
"""
Converts all xml children of element to string and joins them
"""
res = b""
print(spine)
for name, enabled in spine:
if name in tokens.keys():
res += process_xhtml(tokens[name], tokens)
for child in root:
res += etree.tostring(child)
return res
def process_xhtml(xhtml: bytes, tokens: Document_Tokens):
# TODO: Add xhtml procession
return xhtml

View File

@ -1,11 +1,16 @@
from tempfile import SpooledTemporaryFile
"""
Module for FB2 file conversion to html
"""
import html
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element
from tempfile import SpooledTemporaryFile
from typing import Optional
from xml.etree.ElementTree import Element
from fastapi import HTTPException
from .utils import Document_Tokens, strip_whitespace
from .utils import DocumentTokens, HTMLBook, strip_whitespace
namespaces = {
"": "http://www.gribuser.ru/xml/fictionbook/2.0",
@ -14,7 +19,7 @@ namespaces = {
HREF = f"{{{namespaces['xlink']}}}href"
async def fb22html(file: SpooledTemporaryFile) -> dict[str, str]:
async def fb22html(file: SpooledTemporaryFile) -> HTMLBook:
"""
Splits fb2 to tokens and joins them to one html file
@ -25,17 +30,19 @@ async def fb22html(file: SpooledTemporaryFile) -> dict[str, str]:
set_cover(tokens)
html_content = fb2body2html(tokens)
return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
return {
**(tokens["metadata"]),
"content": html.unescape(html_content.decode()),
}
except Exception as e:
except Exception as err:
raise HTTPException(
status_code=500, detail="Error! Wrong fb2 file format: " + str(e)
)
status_code=500, detail="Error! Wrong fb2 file format: " + str(err)
) from err
def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
"""
def fb22tokens(file: SpooledTemporaryFile) -> DocumentTokens:
r"""
Parses fb2 file as xml document.
It puts book metadata, its content and media to `tokens` dictionary and returns it.
@ -70,11 +77,12 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
metadata = {}
metadata["title"] = book_info.find("./book-title", namespaces).text
metadata["author"] = get_author(book_info.find("./author", namespaces))
metadata["cover"] = get_cover(book_info.find("./coverpage", namespaces))
metadata["cover"] = get_cover(
book_info.find("./coverpage", namespaces))
if "cover" not in metadata.keys():
metadata.pop("cover")
if len(metadata.keys()):
if len(metadata.keys()) != 0:
tokens["metadata"] = metadata.copy()
# Reading book content
@ -96,7 +104,6 @@ def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
def get_author(author: Element) -> str:
"""
Converts author xml structure to string
"""
@ -107,9 +114,9 @@ def get_author(author: Element) -> str:
"middle-name",
"last-name",
):
el = author.find("./" + tag_name, namespaces)
if el is not None:
res.append(el.text)
tag = author.find("./" + tag_name, namespaces)
if tag is not None and tag.text is not None:
res.append(tag.text)
if len(res) == 0:
res = author.find("./nickname", namespaces).text
else:
@ -119,7 +126,6 @@ def get_author(author: Element) -> str:
def get_cover(coverpage: Optional[Element]) -> Optional[str]:
"""
Extracts cover image id if exists
"""
@ -127,8 +133,11 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]:
if coverpage:
return coverpage.find("./image", namespaces).get(HREF)
return None
def set_cover(tokens: Document_Tokens) -> None:
def set_cover(tokens: DocumentTokens) -> None:
"""Gets cover from book and sets it in metadata"""
cover = tokens["metadata"]["cover"]
if cover is None:
tokens["metadata"]["cover"] = "none"
@ -136,34 +145,32 @@ def set_cover(tokens: Document_Tokens) -> None:
tokens["metadata"]["cover"] = tokens[cover[1:]]
def fb2body2html(tokens: Document_Tokens) -> str:
def fb2body2html(tokens: DocumentTokens) -> str:
"""
Convert fb2 xml to html, joins bodies into one string
"""
res = b""
xml_root = ET.fromstring(tokens["content"])
xml_root = ET.fromstring(strip_whitespace(tokens["content"]))
for body in xml_root.iterfind("./body"):
res += process_section(body, tokens)
return res
def process_section(body: Element, tokens: Document_Tokens) -> str:
def process_section(body: Element, tokens: DocumentTokens) -> str:
"""
Processes individual sections, recursively goes through the sections tree
"""
res = b"<section>\n"
for tag in ("title", "epigraph", "annotation"):
el = body.find("./" + tag)
if el:
process_content(el, tokens)
res += children_to_html(el)
for tag_name in ("title", "epigraph", "annotation"):
tag = body.find("./" + tag_name)
if tag:
process_content(tag, tokens)
res += children_to_html(tag)
image = body.find("./image")
if image:
process_image(image, tokens)
@ -180,7 +187,6 @@ def process_section(body: Element, tokens: Document_Tokens) -> str:
def children_to_html(root: Element) -> str:
"""
Converts xml tag children to string
"""
@ -193,18 +199,17 @@ def children_to_html(root: Element) -> str:
return res
def process_image(el: Element, tokens: Document_Tokens) -> None:
"""
def process_image(element: Element, tokens: DocumentTokens) -> None:
r"""
Converts fb2 \<image /\> to html \<img /\>. Replaces xlink:href with src="\<base64_image_data\>"
"""
el.tag = "img"
element.tag = "img"
href = el.get(HREF)
el.attrib.pop(HREF)
href = element.get(HREF)
element.attrib.pop(HREF)
el.set("src", tokens[href[1:]] if href[0] == "#" else href)
element.set("src", tokens[href[1:]] if href[0] == "#" else href)
tag_replacement = {
@ -219,15 +224,12 @@ tag_with_class = {
"cite": "div",
"poem": "div",
"stanza": "div",
"poem": "div",
"poem": "div",
"epigraph": "div",
"text-author": "p",
}
def process_content(root: Element, tokens: Document_Tokens) -> None:
def process_content(root: Element, tokens: DocumentTokens) -> None:
"""
Converts fb2 xml tag names to html equivalents and my own styled elements.
Resolves binary data dependencies

View File

@ -1,23 +1,68 @@
from fastapi import FastAPI, File, UploadFile, HTTPException
"""Webserver for epub and fb2 files convertation to html"""
from datetime import datetime
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel # pylint: disable=no-name-in-module
from .epub import epub2html
from .fb2 import fb22html
from .utils import HTMLBook
from .utils import HashedHTMLBook, add_hash
origins = (
"*"
)
class DebugInfo(BaseModel): # pylint: disable=too-few-public-methods
"""Main handler return types"""
startup_time: str
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/")
start_time = datetime.now()
@app.get("/", response_model=DebugInfo)
def root():
return "Hello, World!"
"""
Test if server is running.
Returns startup time
"""
return {"startup_time": start_time.isoformat()}
@app.post("/uploadfile/", response_model=HTMLBook)
@app.post("/uploadfile/", response_model=HashedHTMLBook)
async def create_upload_file(file: UploadFile = File(...)):
"""
Main api handler:
Accepts files with fb2 and epub extensions
Returns HTTP 415 error if file has unsupported format
Else returns object with book metadata and its html
"""
if file.filename.endswith(".fb2"):
content = await fb22html(file.file)
elif file.filename.endswith(".epub"):
content = await epub2html(file.file)
else:
raise HTTPException(status_code=415, detail="Error! Unsupported file type")
return content
raise HTTPException(
status_code=415, detail="Error! Unsupported file type")
h_content = add_hash(content)
return h_content

View File

@ -1,27 +1,58 @@
from typing import Union, Optional
from pydantic import BaseModel
"""
Utils for publite_backend module
"""
import re
from hashlib import sha256
from typing import Optional, Union
Document_Tokens = dict[str, Union[str, dict[str, str]]]
from pydantic import BaseModel # pylint: disable=no-name-in-module
DocumentTokens = dict[str, Union[str, dict[str, str]]]
class HTMLBook(BaseModel):
class HTMLBook(BaseModel): # pylint: disable=too-few-public-methods
"""Transitional model for returned book data"""
title: str
author: str
cover: Optional[str]
cover: Optional[str] = None
content: str
class HashedHTMLBook(HTMLBook): # pylint: disable=too-few-public-methods
"""Model for returned book data with content hash"""
hash: str
replacements = [
("&#13;", "\r"),
(">\s+?<", "><"),
("&#13;", ""),
("&#17;", ""),
(r">\s+?<", "><"),
]
def strip_whitespace(s: bytes) -> str:
res = s.decode()
def strip_whitespace(string: bytes) -> str:
"""Removes"""
res = string.decode()
for old, new in replacements:
res = re.sub(old, new, res)
return res.strip()
def add_hash(content: HTMLBook) -> HashedHTMLBook:
"""
Adds hash of book content
"""
h_content: HashedHTMLBook = content.copy()
h_content["hash"] = sha256(content["content"].encode()).hexdigest()
return h_content

View File

@ -1,23 +1 @@
aiofiles==0.7.0
appdirs==1.4.4
asgiref==3.4.0
black==21.6b0
click==8.0.1
EbookLib==0.17.1
fastapi==0.65.2
flake8==3.9.2
h11==0.12.0
lxml==4.6.3
mccabe==0.6.1
mypy-extensions==0.4.3
pathspec==0.8.1
pycodestyle==2.7.0
pydantic==1.8.2
pyflakes==2.3.1
python-multipart==0.0.5
regex==2021.7.1
six==1.16.0
starlette==0.14.2
toml==0.10.2
typing-extensions==3.10.0.0
uvicorn==0.14.0
-r requirements/prod.txt

4
requirements/dev.txt Normal file
View File

@ -0,0 +1,4 @@
-r prod.txt
pylint
rope
black

7
requirements/prod.txt Normal file
View File

@ -0,0 +1,7 @@
fastapi
uvicorn
aiofiles
ebooklib
python-multipart
lxml
pydantic

4
run.py Normal file
View File

@ -0,0 +1,4 @@
import uvicorn
if __name__ == "__main__":
uvicorn.run("app.main:app")

5
vercel.json Normal file
View File

@ -0,0 +1,5 @@
{
"rewrites": [
{ "source": "/(.*)", "destination": "/api/main"}
]
}