initial commit: added html conversion, lost tokens.py

2022-05-25 11:37:01 +03:00
commit 3b2c7aa208
10 changed files with 342 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
--- a/fb2/init.py
+++ b/fb2/init.py
@ -0,0 +1 @@
+# from .main import
--- a/fb2/main.py
+++ b/fb2/main.py
@ -0,0 +1,3 @@
+from .main import main
+
+main()  # NOTE(review): runs unconditionally on import; presumably this file is the package's __main__.py - confirm
--- a/fb2/html.py
+++ b/fb2/html.py
@ -0,0 +1,113 @@
+import xml.etree.ElementTree as ET
+from xml.etree.ElementTree import Element
+
+from .types import DocumentTokens
+from .utils import HREF, strip_whitespace
+
+def join_tokens(tokens: DocumentTokens) -> str:
+    """
+    Convert the fb2 xml in tokens["content"] to html: every top-level <body> is rendered
+    with process_section and the results are concatenated into one string.
+    """
+
+    xml_root = ET.fromstring(strip_whitespace(tokens["content"]))
+    html = b"".join(process_section(body, tokens) for body in xml_root.iterfind("./body"))
+
+    # process_section yields bytes (ET.tostring's default); decode so callers get
+    # the declared str - main() writes the result to a text-mode file.
+    return html.decode("utf-8")
+
+
+def process_section(body: Element, tokens: DocumentTokens) -> bytes:
+    """
+    Renders a <body> or <section> as html bytes, recursively going through the section tree
+    """
+    # Lookups are tested with `is not None`: an Element without children is falsy,
+    # so a plain truth test would skip every childless <image/>.
+    parts = [b"<section>\n"]
+    for tag_name in ("title", "epigraph", "annotation"):
+        tag = body.find("./" + tag_name)
+        if tag is not None:
+            process_content(tag, tokens)
+            parts.append(children_to_html(tag))
+    image = body.find("./image")
+    if image is not None:
+        process_image(image, tokens)
+        parts.append(ET.tostring(image))
+
+    for section in body.findall("./section"):
+        if section.find("./section") is not None:
+            parts.append(process_section(section, tokens))
+        else:
+            process_content(section, tokens)
+            parts.append(b"<section>\n" + children_to_html(section) + b"</section>\n")
+
+    return b"".join(parts) + b"</section>\n"
+
+
+def children_to_html(root: Element) -> bytes:
+    """
+    Concatenates the serialized form of every direct child of root (root's own tag is dropped).
+
+    Returns bytes, the ET.tostring default; b"" when root has no children.
+    """
+
+    # ET.tostring(child) also emits the child's tail text, so text that sits
+    # between sibling tags is preserved in the output.
+
+    return b"".join(ET.tostring(child) for child in root)
+
+
+def process_image(element: Element, tokens: DocumentTokens) -> None:
+    r"""
+    Converts fb2 \<image /\> to html \<img /\> in place. xlink:href becomes src: "#id" is looked
+    up in tokens (KeyError if absent), any other value is copied; without an href no src is set.
+    """
+
+    element.tag = "img"
+
+    href = element.attrib.pop(HREF, None)  # default avoids KeyError on a bare <image/>
+    if href:  # None or "" would otherwise fail on href[0]
+        element.set("src", tokens[href[1:]] if href.startswith("#") else href)
+
+
+tag_replacement = {  # fb2 tag name -> html tag name; process_content renames these tags as-is
+    "empty-line": "br",
+    "emphasis": "em",
+    "strikethrough": "strike",
+    "v": "p",
+}
+
+tag_with_class = {  # fb2 tag name -> html tag name; process_content keeps the fb2 name as the class attribute
+    "subtitle": "p",
+    "cite": "div",
+    "poem": "div",
+    "stanza": "div",
+    "epigraph": "div",
+    "text-author": "p",
+}
+
+
+def process_content(root: Element, tokens: DocumentTokens) -> None:
+    """
+    Rewrites, in place and children first, every descendant of root: fb2 tag names become html
+    equivalents or a generic tag whose class is the fb2 name; links and images are resolved.
+    """
+
+    for child in root:
+        process_content(child, tokens)
+
+        if child.tag == "a":
+            # Popping with a default tolerates a link without xlink:href and never
+            # stores None as an attribute value, which ET.tostring rejects.
+            href = child.attrib.pop(HREF, None)
+            if href is not None:
+                child.set("href", href)
+        elif child.tag == "image":
+            process_image(child, tokens)
+        elif child.tag in tag_replacement:
+            child.tag = tag_replacement[child.tag]
+        elif child.tag in tag_with_class:
+            # Keep the fb2 name as a class before switching to the generic html tag.
+            child.set("class", child.tag)
+            child.tag = tag_with_class[child.tag]
--- a/fb2/main.py
+++ b/fb2/main.py
@ -0,0 +1,33 @@
+"""
+Module for FB2 file conversion to html or txt
+"""
+
+import sys
+import argparse
+from .txt import join_tokens_to_txt
+from .tokens import fb2_to_tokens
+from .html import join_tokens as join_tokens_to_html
+
+
+
+def main():
+    """Parse the command line, convert the input file and write the result to the output."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('file', help="file to process", type=argparse.FileType('rb'))
+    parser.add_argument('-o', '--output', help="path to output file", default=sys.stdout, type=argparse.FileType('w'))
+    parser.add_argument('-t', '--target', help='specify target format', choices=['html', 'txt', 'text'], default='html')
+    args = parser.parse_args()
+
+    with args.file as file:  # close the input as soon as it has been tokenized
+        tokens = fb2_to_tokens(file)
+
+    if args.target == 'html':
+        content = join_tokens_to_html(tokens)
+    else:
+        content = join_tokens_to_txt(tokens)
+
+    output_file = args.output
+    output_file.write(content)
+    if output_file is not sys.stdout:  # stdout is shared: neither close nor announce it
+        output_file.close()  # flush to disk before reporting success
+        print(f'Finished writing to {output_file.name}')
--- a/fb2/tokens.py
+++ b/fb2/tokens.py
@ -0,0 +1,10 @@
+from typing import IO
+
+from .types import DocumentTokens
+
+def fb2_to_tokens(file: IO) -> DocumentTokens:
+    """
+    TODO: implement. Currently a stub: the body is only `pass`, so it returns None, not DocumentTokens.
+    """
+
+    pass
--- a/fb2/txt.py
+++ b/fb2/txt.py
@ -0,0 +1,5 @@
+from .types import DocumentTokens
+
+
+def join_tokens_to_txt(tokens: DocumentTokens) -> str:
+    return ''  # tokens is ignored: every input currently yields an empty string (presumably a placeholder)
--- a/fb2/types.py
+++ b/fb2/types.py
@ -0,0 +1,3 @@
+from typing import Union
+
+DocumentTokens = dict[str, Union[str, dict[str, str], bytes]]  # html.py indexes it with "content" and with an <image> href minus its leading "#"
--- a/fb2/utils.py
+++ b/fb2/utils.py
@ -0,0 +1,14 @@
+import re
+
+namespaces = {  # XML namespace URIs keyed by prefix; "" is presumably the default (FictionBook) namespace
+    "": "http://www.gribuser.ru/xml/fictionbook/2.0",
+    "xlink": "http://www.w3.org/1999/xlink",
+}
+
+HREF = f"{{{namespaces['xlink']}}}href"  # "{uri}href", the form ElementTree uses for a namespaced attribute name
+
+replacements = [  # (pattern, replacement) pairs; NOTE(review): html.py imports strip_whitespace from here, but it is not defined in these lines
+    ("&#13;", ""),  # removes carriage-return (U+000D) character references
+    ("&#17;", ""),  # removes U+0011 control-character references
+    (r">\s+?<", "><"),  # removes whitespace-only runs between adjacent tags
+]
--- a/requirements.txt
+++ b/requirements.txt