17 changed files with 147 additions and 409 deletions
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@ -19,10 +19,12 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install -r requirements/dev.txt
+          pip install flake8 black
-      - name: Lint with pylint
+      - name: Lint with flake8
-        run: pylint app --extension-pkg-allow-list=lxml
+        run: |
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Format with black
        run: black .
--- a/.github/workflows/main-ci.yml
+++ b/.github/workflows/main-ci.yml
@ -13,6 +13,12 @@ jobs:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v3
        with:
          images: ghcr.io/${{ github.repository }}
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v1
@ -38,10 +44,9 @@ jobs:
        with:
          context: .
          file: ./Dockerfile
          builder: ${{ steps.buildx.outputs.name }}
          cache-from: type=local,src=/tmp/.buildx-cache
          cache-to: type=local,dest=/tmp/.buildx-cache
          push: ${{ github.event_name != 'pull_request' }}
-          tags: ghcr.io/${{ github.repository }}:${{ github.sha }},ghcr.io/${{ github.repository }}:latest
+          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
--- a/.github/workflows/release-ci.yml
+++ b/.github/workflows/release-ci.yml
@ -50,11 +50,9 @@ jobs:
        with:
          context: .
          file: ./Dockerfile
          builder: ${{ steps.buildx.outputs.name }}
          cache-from: type=local,src=/tmp/.buildx-cache
          cache-to: type=local,dest=/tmp/.buildx-cache
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,3 @@
 .venv
 __pycache__/
 .vscode
 .vercel
--- a/10
+++ b/10
@ -1,11 +1,13 @@
-FROM python:alpine
+FROM python
 WORKDIR /srv
-COPY ./requirements /srv/requirements
+COPY ./requirements.txt /srv/requirements.txt
-RUN pip install -r requirements/prod.txt
+RUN pip install -r requirements.txt
 EXPOSE 80
 COPY ./app /srv/app
-CMD uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8081}
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"]
--- a/README.md
+++ b/README.md
@ -10,31 +10,6 @@ Backend for online ebook viewer publite
 ## Deploy
 Run app locally (development only!)
 ```bash
 # install requirements
 pip install -r requirements/dev.txt
 # run app with uvicorn
 uvicorn app.main:app --reload --port <port>
 ```
 Run app locally (test prod)
 ```bash
 # install requirements
 pip install -r requirements/prod.txt
 # run app with uvicorn
 uvicorn app.main:app --port <port>
 # or
 # run with python script
 python run.py
 ```
 Simple docker deployment
 ```bash
@ -42,22 +17,12 @@ Simple docker deployment
 docker build . -t publite_backend
 # run it with docker
-docker run -p <port>:8081 publite_backend
+docker run -p <port>:80 publite_backend
 ```
 Dokku deployment with image from Docker Hub
 ```bash
 dokku apps:create publitebackend
 # increase file size limit to be able to upload bigger books
 dokku nginx:set publitebackend client_max_body_size 50m
 dokku git:from-image publitebackend publite/backend:latest
 ```
 # TODO
 - Separate epub and fb2 files to python modules
 - Rewrite own `.opf` file parsing to get rid of dependency on EbookLib
 - Add cli interfaces for epub and fb2 libs
--- a/1
+++ b/1
@ -1 +0,0 @@
 app
--- a/app/init.py
+++ b/app/init.py
--- a/app/epub.py
+++ b/app/epub.py
@ -1,28 +1,16 @@
-"""
+import aiofiles as aiof
 Module for EPUB file conversion to html
 """
 import html
 import os
 from base64 import b64encode
-from functools import cache
+from fastapi import HTTPException
 from tempfile import SpooledTemporaryFile
 import aiofiles
 import ebooklib
 from ebooklib import epub
 from fastapi import HTTPException
 from lxml import etree
-from .utils import DocumentTokens, HTMLBook, strip_whitespace
+from tempfile import SpooledTemporaryFile
-parser = etree.XMLParser(recover=True)
+from .utils import Document_Tokens, strip_whitespace
 IMAGE = "{http://www.w3.org/2000/svg}image"
 HREF = "{http://www.w3.org/1999/xlink}href"
-async def epub2html(file: SpooledTemporaryFile) -> HTMLBook:
+async def epub2html(file: SpooledTemporaryFile) -> str:
    """
    Splits epub to tokens and joins them to one html file
@ -34,43 +22,40 @@ async def epub2html(file: SpooledTemporaryFile) -> HTMLBook:
        html_content = epub_tokens2html(spine, tokens)
-        return {
+        return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
            **(tokens["metadata"]),
            "content": html_content,
        }
-    except Exception as err:
+    except Exception as e:
        raise HTTPException(
-            status_code=500, detail="Error! Wrong epub file format: " + str(err)
+            status_code=500, detail="Error! Wrong epub file format: " + str(e)
-        ) from err
+        )
 async def epub_to_tokens(
    file: SpooledTemporaryFile,
-) -> tuple[DocumentTokens, list[tuple[str, str]]]:
+) -> tuple[Document_Tokens, list[tuple[str, str]]]:
-    r"""
+    """
-    Passes file content to EbookLib library and parses epub tokens into dict of
+    Passes file content to EbookLib library and parses epub tokens into dict of the following format:
    the following format:
    { "\<file_name\>": "\<file_content\>" }
-    Where file content is either plain text for xhtml or base64 encoded data
+    Where file content is either plain text for xhtml or base64 encoded data for other formats, prepared for embeding to html
    for other formats, prepared for embedding in html
    """
    tokens = {}
-    async with aiofiles.tempfile.NamedTemporaryFile() as tmp:
+    async with aiof.tempfile.NamedTemporaryFile() as tmp:
        await tmp.write(file.read())
-        # Reading book file
+        book = epub.read_epub(tmp.name)
        reader = epub.EpubReader(tmp.name)
        book = reader.load()
        reader.process()
-        tokens["metadata"] = read_metadata(book)
+        # Adding book metadata to tokens list
-        tokens["toc"] = {}
+
        metadata = {}
        metadata["title"] = convert_list(book.get_metadata("DC", "title"))
        metadata["author"] = convert_list(book.get_metadata("DC", "creator"))
        tokens["metadata"] = metadata.copy()
        # Iterating over Items
@ -78,52 +63,34 @@ async def epub_to_tokens(
            item: epub.EpubItem
            item_type = item.get_type()
            file_path = os.path.join(reader.opf_dir, item.get_name())
            if item_type == ebooklib.ITEM_DOCUMENT:
                # Adding book chapters to tokens list
-                name = item.get_id()
+                name = item.id
-                tokens[file_path] = strip_whitespace(item.get_body_content())
+                tokens[name] = item.get_body_content()
                tokens["toc"][name] = file_path
            elif item_type in (
                ebooklib.ITEM_AUDIO,
                ebooklib.ITEM_COVER,
                ebooklib.ITEM_IMAGE,
-                ebooklib.ITEM_VECTOR,
+                ebooklib.ITEM_STYLE,
                ebooklib.ITEM_VIDEO,
                ebooklib.ITEM_VECTOR,
            ):
                # Adding assets to tokens list
-
+                name = item.get_name()
                content = item.get_content()
                media_type = item.media_type
                b64_content = b64encode(content).decode()
-                tokens[file_path] = f"data:{media_type};base64,{b64_content}"
+                tokens[name] = f"data:{media_type};base64,{b64_content}"
                if item_type == ebooklib.ITEM_COVER:
-                    tokens["metadata"]["cover"] = file_path
+                    tokens["metadata"]["cover"] = name
    return tokens, book.spine.copy()
-def read_metadata(book: epub.EpubBook) -> dict[str, str]:
+def convert_list(titles_list: list[tuple[str, dict[str, str]]]):
    """
    Reads metadata from xml to dict
    """
    metadata = {}
    metadata["title"] = book.get_metadata("DC", "title")[0][0]
    metadata["author"] = convert_list(book.get_metadata("DC", "creator"))
    return metadata.copy()
 def convert_list(titles_list: list[tuple[str, dict[str, str]]]) -> str:
    """
    Joins titles list to one string
    """
    res = []
    for title_obj in titles_list:
        res.append(title_obj[0])
@ -131,147 +98,24 @@ def convert_list(titles_list: list[tuple[str, dict[str, str]]]) -> str:
    return "; ".join(res)
-def set_cover(tokens: DocumentTokens) -> None:
+def set_cover(tokens: Document_Tokens):
-    """
+    cover_name = tokens["metadata"]["cover"]
    Converts cover file name to base64 image stored in `tokens`
    """
    cover_name = tokens["metadata"].get("cover")
    if cover_name in tokens.keys():
        tokens["metadata"]["cover"] = tokens[cover_name]
-def epub_tokens2html(spine: list[tuple[str, str]], tokens: DocumentTokens) -> bytes:
+def epub_tokens2html(spine: list[tuple[str, str]], tokens: Document_Tokens):
    """
    Joins chapters in `spine` to one html string
    """
    res = ""
    for name, _ in spine:
        file_path = tokens["toc"].get(name)
        if file_path:
            res += process_xhtml(file_path, tokens)
    return html.unescape(res)
 def process_xhtml(path: str, tokens: DocumentTokens) -> str:
    """
    Converts the xml body stored under `path` into one html <section>.

    The markup in `tokens[path]` is parsed with the module-level `parser`,
    a root <body> tag is renamed to <div>, and the tree is rewritten in
    place by `process_content`.

    Returns the serialized tree wrapped in
    `<section id="b_<file name>">...</section>` as a string.
    """
    xml: etree.Element = etree.fromstring(tokens[path], parser=parser)
    # <body> cannot be nested inside the joined document, so demote it
    if xml.tag == "body":
        xml.tag = "div"
    process_content(xml, path, tokens)
    # the "b_" prefix matches the whole-file anchors built by process_a_element
    return (
        f'<section id="b_{path_to_name(path)}">{etree.tostring(xml).decode()}</section>'
    )
 def process_content(node: etree.Element, path: str, tokens: DocumentTokens) -> None:
    """
    Recursively rewrites an xml element tree in place into valid html.

    For every element:
    - the `epub:type` attribute is dropped;
    - `id` is prefixed with the file name taken from `path`;
    - <a> links are rewritten by `process_a_element`;
    - <hgroup> becomes <div>;
    - <img>/<source>/<video>/<audio> sources and svg <image> hrefs are
      replaced with the embedded data found in `tokens`;
    - <trigger> elements are removed from their parent.

    `path` is the file the element came from and `tokens` maps file paths
    to their content.
    """
    # Process universal tags
    if node.get("epub:type"):
        node.attrib.pop("epub:type")
    el_id = node.get("id")
    if el_id:
        node.set("id", f"{path_to_name(path)}_{el_id}")
    # Tag processing
    if node.tag == "a":
        process_a_element(node, path)
    elif node.tag == "hgroup":
        node.tag = "div"
    elif node.tag in ("img", "source", "video", "audio"):
        process_media_element(node, path, tokens)
    elif node.tag == IMAGE:
        href = node.get(HREF)
        # an <image> without xlink:href has nothing to resolve
        if href:
            media_path = rel_to_abs_path(path, href)
            if media_path in tokens.keys():
                node.set(HREF, tokens[media_path])
    elif node.tag == "trigger":
        parent = node.getparent()
        # a root element has no parent and cannot be detached
        if parent is not None:
            # remove() also discards the tail text of the removed element,
            # so hand it over to the preceding sibling (or the parent) first
            if node.tail:
                previous = node.getprevious()
                if previous is not None:
                    previous.tail = (previous.tail or "") + node.tail
                else:
                    parent.text = (parent.text or "") + node.tail
            parent.remove(node)
    # Recursively run for all children; iterate over a snapshot because
    # children may detach themselves from `node` while being processed
    for child in list(node):
        process_content(child, path, tokens)
 def process_a_element(node: etree.Element, path: str):
    """
    Converts file links in an <a> element to in-document ids.

    - `file.xhtml#id` / `file.html#id` becomes `#<file>_<id>`;
    - `file.xhtml` / `file.html` becomes `#b_<file>` (the whole-file anchor);
    - `#id` becomes `#<current file>_<id>`, where the current file name is
      taken from `path`.

    Elements without `href` and links of any other form are left untouched.
    """
    href = node.get("href")
    if not href:
        # <a> without href has nothing to rewrite
        return
    if href.count(".xhtml") or href.count(".html"):
        id_pos = href.rfind("#")
        if id_pos != -1:
            href_path, el_id = href[:id_pos], href[id_pos:]
            node.set("href", f"#{path_to_name(href_path)}_{el_id[1:]}")
        else:
            node.set("href", f"#b_{path_to_name(href)}")
    elif href.startswith("#"):
        # only a pure fragment refers to the current file; a link that merely
        # contains "#" somewhere must not lose its first character
        node.set("href", f"#{path_to_name(path)}_{href[1:]}")
 def process_media_element(node: etree.Element, path: str, tokens: DocumentTokens):
    """
    Swaps the file reference of a media element for its embedded data.

    Looks at `src` first and falls back to `srcset`; when the referenced
    file (resolved relative to `path`) is present in `tokens`, the attribute
    is overwritten with the stored value.
    """
    attr, src = "src", node.get("src")
    if not src:
        attr, src = "srcset", node.get("srcset")
    if not src:
        return
    media_path = rel_to_abs_path(path, src)
    if media_path in tokens.keys():
        node.set(attr, tokens[media_path])
 def rel_to_abs_path(parent: str, rel: str):
    """
    Resolves `rel`, given relative to the file `parent`, into a normalized
    path based on the directory of `parent`
    """
    base_dir = os.path.dirname(parent)
    joined = os.path.join(base_dir, rel)
    return os.path.normpath(joined)
@cache
 def path_to_name(path: str) -> str:
    """
    Returns the last component of `path` cut at its first dot
    """
    file_name = os.path.basename(path)
    stem, _, _ = file_name.partition(".")
    return stem
 def children_to_html(root: etree.Element) -> bytes:
    """
    Converts all xml children of element to string and joins them
    """
    res = b""
-    for child in root:
+    print(spine)
-        res += etree.tostring(child)
+
    for name, enabled in spine:
        if name in tokens.keys():
            res += process_xhtml(tokens[name], tokens)
    return res
 def process_xhtml(xhtml: bytes, tokens: Document_Tokens):
    # TODO: Add xhtml procession
    return xhtml
--- a/app/fb2.py
+++ b/app/fb2.py
@ -1,16 +1,11 @@
 """
 Module for FB2 file conversion to html
 """
 import html
 import xml.etree.ElementTree as ET
 from tempfile import SpooledTemporaryFile
-from typing import Optional
+import xml.etree.ElementTree as ET
 from xml.etree.ElementTree import Element
-
+from typing import Optional
 from fastapi import HTTPException
-from .utils import DocumentTokens, HTMLBook, strip_whitespace
+from .utils import Document_Tokens, strip_whitespace
 namespaces = {
    "": "http://www.gribuser.ru/xml/fictionbook/2.0",
@ -19,7 +14,7 @@ namespaces = {
 HREF = f"{{{namespaces['xlink']}}}href"
-async def fb22html(file: SpooledTemporaryFile) -> HTMLBook:
+async def fb22html(file: SpooledTemporaryFile) -> dict[str, str]:
    """
    Splits fb2 to tokens and joins them to one html file
@ -30,19 +25,17 @@ async def fb22html(file: SpooledTemporaryFile) -> HTMLBook:
        set_cover(tokens)
        html_content = fb2body2html(tokens)
-        return {
+        return {**(tokens["metadata"]), "content": strip_whitespace(html_content)}
            **(tokens["metadata"]),
            "content": html.unescape(html_content.decode()),
        }
-    except Exception as err:
+    except Exception as e:
        raise HTTPException(
-            status_code=500, detail="Error! Wrong fb2 file format: " + str(err)
+            status_code=500, detail="Error! Wrong fb2 file format: " + str(e)
-        ) from err
+        )
-def fb22tokens(file: SpooledTemporaryFile) -> DocumentTokens:
+def fb22tokens(file: SpooledTemporaryFile) -> Document_Tokens:
-    r"""
+
    """
    Parses fb2 file as xml document.
    It puts book metadata, its content and media to `tokens` dictionary and returns it.
@ -77,12 +70,11 @@ def fb22tokens(file: SpooledTemporaryFile) -> DocumentTokens:
        metadata = {}
        metadata["title"] = book_info.find("./book-title", namespaces).text
        metadata["author"] = get_author(book_info.find("./author", namespaces))
-        metadata["cover"] = get_cover(
+        metadata["cover"] = get_cover(book_info.find("./coverpage", namespaces))
            book_info.find("./coverpage", namespaces))
        if "cover" not in metadata.keys():
            metadata.pop("cover")
-        if len(metadata.keys()) != 0:
+        if len(metadata.keys()):
            tokens["metadata"] = metadata.copy()
    # Reading book content
@ -104,6 +96,7 @@ def fb22tokens(file: SpooledTemporaryFile) -> DocumentTokens:
 def get_author(author: Element) -> str:
    """
    Converts author xml structure to string
    """
@ -114,9 +107,9 @@ def get_author(author: Element) -> str:
        "middle-name",
        "last-name",
    ):
-        tag = author.find("./" + tag_name, namespaces)
+        el = author.find("./" + tag_name, namespaces)
-        if tag is not None and tag.text is not None:
+        if el is not None:
-            res.append(tag.text)
+            res.append(el.text)
    if len(res) == 0:
        res = author.find("./nickname", namespaces).text
    else:
@ -126,6 +119,7 @@ def get_author(author: Element) -> str:
 def get_cover(coverpage: Optional[Element]) -> Optional[str]:
    """
    Extracts cover image id if exists
    """
@ -133,11 +127,8 @@ def get_cover(coverpage: Optional[Element]) -> Optional[str]:
    if coverpage:
        return coverpage.find("./image", namespaces).get(HREF)
    return None
-
+def set_cover(tokens: Document_Tokens) -> None:
 def set_cover(tokens: DocumentTokens) -> None:
    """Gets cover from book and sets it in metadata"""
    cover = tokens["metadata"]["cover"]
    if cover is None:
        tokens["metadata"]["cover"] = "none"
@ -145,32 +136,34 @@ def set_cover(tokens: DocumentTokens) -> None:
        tokens["metadata"]["cover"] = tokens[cover[1:]]
-def fb2body2html(tokens: DocumentTokens) -> str:
+def fb2body2html(tokens: Document_Tokens) -> str:
    """
    Convert fb2 xml to html, joins bodies into one string
    """
    res = b""
-    xml_root = ET.fromstring(strip_whitespace(tokens["content"]))
+    xml_root = ET.fromstring(tokens["content"])
    for body in xml_root.iterfind("./body"):
        res += process_section(body, tokens)
    return res
-def process_section(body: Element, tokens: DocumentTokens) -> str:
+def process_section(body: Element, tokens: Document_Tokens) -> str:
    """
    Processes individual sections, recursively goes through the section tree
    """
    res = b"<section>\n"
-    for tag_name in ("title", "epigraph", "annotation"):
+    for tag in ("title", "epigraph", "annotation"):
-        tag = body.find("./" + tag_name)
+        el = body.find("./" + tag)
-        if tag:
+        if el:
-            process_content(tag, tokens)
+            process_content(el, tokens)
-            res += children_to_html(tag)
+            res += children_to_html(el)
    image = body.find("./image")
    if image:
        process_image(image, tokens)
@ -187,6 +180,7 @@ def process_section(body: Element, tokens: DocumentTokens) -> str:
 def children_to_html(root: Element) -> str:
    """
    Converts xml tag children to string
    """
@ -199,17 +193,18 @@ def children_to_html(root: Element) -> str:
    return res
-def process_image(element: Element, tokens: DocumentTokens) -> None:
+def process_image(el: Element, tokens: Document_Tokens) -> None:
-    r"""
+
    """
    Converts fb2 \<image /\> to html \<img /\>. Replaces xlink:href with src="\<base64_image_data\>"
    """
-    element.tag = "img"
+    el.tag = "img"
-    href = element.get(HREF)
+    href = el.get(HREF)
-    element.attrib.pop(HREF)
+    el.attrib.pop(HREF)
-    element.set("src", tokens[href[1:]] if href[0] == "#" else href)
+    el.set("src", tokens[href[1:]] if href[0] == "#" else href)
 tag_replacement = {
@ -224,12 +219,15 @@ tag_with_class = {
    "cite": "div",
    "poem": "div",
    "stanza": "div",
    "poem": "div",
    "poem": "div",
    "epigraph": "div",
    "text-author": "p",
 }
-def process_content(root: Element, tokens: DocumentTokens) -> None:
+def process_content(root: Element, tokens: Document_Tokens) -> None:
    """
    Converts fb2 xml tag names to html equivalents and my own styled elements.
    Resolves binary data dependencies
--- a/app/main.py
+++ b/app/main.py
@ -1,68 +1,23 @@
-"""Webserver for epub and fb2 files convertation to html"""
+from fastapi import FastAPI, File, UploadFile, HTTPException
 from datetime import datetime
 from fastapi import FastAPI, File, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel  # pylint: disable=no-name-in-module
 from .epub import epub2html
 from .fb2 import fb22html
-from .utils import HashedHTMLBook, add_hash
+from .utils import HTMLBook
 origins = (
    "*"
 )
 class DebugInfo(BaseModel):  # pylint: disable=too-few-public-methods
    """Main handler return types"""
    startup_time: str
 app = FastAPI()
 app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )
-start_time = datetime.now()
+@app.get("/")
@app.get("/", response_model=DebugInfo)
 def root():
-    """
+    return "Hello, World!"
    Test if server is running.
    Returns startup time
    """
    return {"startup_time": start_time.isoformat()}
-@app.post("/uploadfile/", response_model=HashedHTMLBook)
+@app.post("/uploadfile/", response_model=HTMLBook)
 async def create_upload_file(file: UploadFile = File(...)):
    """
    Main api handler:
    Accepts files with fb2 and epub extensions
    Returns HTTP 415 error if file has unsupported format
    Else returns object with book metadata and its html
    """
    if file.filename.endswith(".fb2"):
        content = await fb22html(file.file)
    elif file.filename.endswith(".epub"):
        content = await epub2html(file.file)
    else:
-        raise HTTPException(
+        raise HTTPException(status_code=415, detail="Error! Unsupported file type")
-            status_code=415, detail="Error! Unsupported file type")
+    return content
    h_content = add_hash(content)
    return h_content
--- a/app/utils.py
+++ b/app/utils.py
@ -1,58 +1,27 @@
-"""
+from typing import Union, Optional
-Utils for publite_backend module
+from pydantic import BaseModel
 """
 import re
 from hashlib import sha256
 from typing import Optional, Union
-from pydantic import BaseModel  # pylint: disable=no-name-in-module
+Document_Tokens = dict[str, Union[str, dict[str, str]]]
 DocumentTokens = dict[str, Union[str, dict[str, str]]]
-class HTMLBook(BaseModel):  # pylint: disable=too-few-public-methods
+class HTMLBook(BaseModel):
    """Transitional model for returned book data"""
    title: str
    author: str
-    cover: Optional[str] = None
+    cover: Optional[str]
    content: str
 class HashedHTMLBook(HTMLBook):  # pylint: disable=too-few-public-methods
    """Model for returned book data with content hash"""
    hash: str
 replacements = [
-    ("&#13;", ""),
+    ("&#13;", "\r"),
-    ("&#17;", ""),
+    (">\s+?<", "><"),
    (r">\s+?<", "><"),
 ]
-def strip_whitespace(string: bytes) -> str:
+def strip_whitespace(s: bytes) -> str:
-
+    res = s.decode()
    """Decodes the input bytes, applies the `replacements` substitutions and strips the result"""
    res = string.decode()
    for old, new in replacements:
        res = re.sub(old, new, res)
    return res.strip()
 def add_hash(content: HTMLBook) -> HashedHTMLBook:
    """
    Returns a copy of `content` extended with a "hash" entry holding the
    SHA-256 hex digest of its "content" value
    """
    digest = sha256(content["content"].encode()).hexdigest()
    hashed: HashedHTMLBook = content.copy()
    hashed["hash"] = digest
    return hashed
--- a/requirements.txt
+++ b/requirements.txt
@ -1 +1,23 @@
-r requirements/prod.txt
+aiofiles==0.7.0
 appdirs==1.4.4
 asgiref==3.4.0
 black==21.6b0
 click==8.0.1
 EbookLib==0.17.1
 fastapi==0.65.2
 flake8==3.9.2
 h11==0.12.0
 lxml==4.6.3
 mccabe==0.6.1
 mypy-extensions==0.4.3
 pathspec==0.8.1
 pycodestyle==2.7.0
 pydantic==1.8.2
 pyflakes==2.3.1
 python-multipart==0.0.5
 regex==2021.7.1
 six==1.16.0
 starlette==0.14.2
 toml==0.10.2
 typing-extensions==3.10.0.0
 uvicorn==0.14.0
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@ -1,4 +0,0 @@
 -r prod.txt
 pylint
 rope
 black
--- a/requirements/prod.txt
+++ b/requirements/prod.txt
@ -1,7 +0,0 @@
 fastapi
 uvicorn
 aiofiles
 ebooklib
 python-multipart
 lxml
 pydantic
--- a/run.py
+++ b/run.py
@ -1,4 +0,0 @@
 import uvicorn
 if __name__ == "__main__":
    uvicorn.run("app.main:app")
--- a/vercel.json
+++ b/vercel.json
@ -1,5 +0,0 @@
 {
    "rewrites": [
        { "source": "/(.*)", "destination": "/api/main"}
    ]
 }