commit 3b2c7aa208427a3ae51fd6e94d4e1dd36b757ebc Author: dm1sh Date: Wed May 25 11:37:01 2022 +0300 initial commit: added html conversion, lost tokens.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6769e21 --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/
\ No newline at end of file
diff --git a/fb2/__init__.py b/fb2/__init__.py
new file mode 100644
index 0000000..542d660
--- /dev/null
+++ b/fb2/__init__.py
@@ -0,0 +1 @@
+# from .main import
diff --git a/fb2/__main__.py b/fb2/__main__.py
new file mode 100644
index 0000000..5d6a810
--- /dev/null
+++ b/fb2/__main__.py
@@ -0,0 +1,3 @@
+from .main import main
+
+main()
diff --git a/fb2/html.py b/fb2/html.py
new file mode 100644
index 0000000..6a5c4c0
--- /dev/null
+++ b/fb2/html.py
@@ -0,0 +1,129 @@
+import xml.etree.ElementTree as ET
+from xml.etree.ElementTree import Element
+
+from .types import DocumentTokens
+# NOTE(review): strip_whitespace is not defined in fb2/utils.py by this patch
+from .utils import HREF, strip_whitespace
+
+
+def join_tokens(tokens: DocumentTokens) -> str:
+    """
+    Convert fb2 xml to html, joining all bodies into one string.
+
+    tokens["content"] is cleaned with strip_whitespace and parsed as xml;
+    every body child of the root is rendered with process_section
+    """
+
+    xml_root = ET.fromstring(strip_whitespace(tokens["content"]))
+
+    return "".join(
+        process_section(body, tokens) for body in xml_root.iterfind("./body")
+    )
+
+
+def process_section(body: Element, tokens: DocumentTokens) -> str:
+    """
+    Processes individual sections, recursively goes through sections tree.
+
+    Title, epigraph, annotation and a direct image child are rendered first,
+    then every direct section child: nested sections recursively, leaf
+    sections converted in place and serialized
+    """
+
+    # NOTE(review): the wrappers around a section are bare newlines here;
+    # confirm against the repository that no section markup is expected
+    parts = ["\n"]
+
+    for tag_name in ("title", "epigraph", "annotation"):
+        tag = body.find("./" + tag_name)
+        # An Element without children is falsy, so compare with None
+        if tag is not None:
+            process_content(tag, tokens)
+            parts.append(children_to_html(tag))
+
+    image = body.find("./image")
+    if image is not None:
+        process_image(image, tokens)
+        parts.append(ET.tostring(image, encoding="unicode"))
+
+    for section in body.findall("./section"):
+        if section.find("./section") is not None:
+            parts.append(process_section(section, tokens))
+        else:
+            process_content(section, tokens)
+            parts.append("\n" + children_to_html(section) + "\n")
+
+    parts.append("\n")
+
+    return "".join(parts)
+
+
+def children_to_html(root: Element) -> str:
+    """
+    Converts children of an xml tag (not the tag itself) to a string
+    """
+
+    return "".join(ET.tostring(child, encoding="unicode") for child in root)
+
+
+def process_image(element: Element, tokens: DocumentTokens) -> None:
+    """
+    Converts an fb2 image element to an html img element in place.
+
+    xlink:href is replaced with src: a "#name" reference is resolved to
+    tokens["name"], any other value is used as is. An element without
+    xlink:href is only renamed.
+
+    Raises KeyError if a "#name" reference has no entry in tokens
+    """
+
+    element.tag = "img"
+
+    href = element.attrib.pop(HREF, None)
+    if not href:
+        return
+
+    element.set("src", tokens[href[1:]] if href.startswith("#") else href)
+
+
+tag_replacement = {
+    "empty-line": "br",
+    "emphasis": "em",
+    "strikethrough": "strike",
+    "v": "p",
+}
+
+tag_with_class = {
+    "subtitle": "p",
+    "cite": "div",
+    "poem": "div",
+    "stanza": "div",
+    "epigraph": "div",
+    "text-author": "p",
+}
+
+
+def process_content(root: Element, tokens: DocumentTokens) -> None:
+    """
+    Converts fb2 xml tag names to html equivalents and my own styled elements
+    in place, children before parents. Resolves binary data dependencies
+    """
+
+    for child in root:
+        process_content(child, tokens)
+
+        if child.tag == "a":
+            # Links without xlink:href are left as they are
+            href = child.attrib.pop(HREF, None)
+            if href is not None:
+                child.set("href", href)
+
+        elif child.tag == "image":
+            process_image(child, tokens)
+
+        elif child.tag in tag_replacement:
+            child.tag = tag_replacement[child.tag]
+
+        elif child.tag in tag_with_class:
+            child.set("class", child.tag)
+            child.tag = tag_with_class[child.tag]
diff --git a/fb2/main.py b/fb2/main.py
new file mode 100644
index 0000000..3710f10
--- /dev/null
+++ b/fb2/main.py
@@ -0,0 +1,40 @@
+"""
+Module for FB2 file conversion to html or txt
+"""
+
+import sys
+import argparse
+from .txt import join_tokens_to_txt
+from .tokens import fb2_to_tokens
+from .html import join_tokens as join_tokens_to_html
+
+
+def main():
+    """
+    Command line entry point: parses arguments, converts the input file to
+    the target format and writes the result to the output (stdout by default)
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('file', help="file to process", type=argparse.FileType('rb'))
+    parser.add_argument('-o', '--output', help="path to output file", default=sys.stdout, type=argparse.FileType('w'))
+    parser.add_argument('-t', '--target',
+                        help='specify target format', choices=['html', 'txt', 'text'], default='html')
+    args = parser.parse_args()
+
+    # The input is only needed while tokenizing, close it right after
+    with args.file as file:
+        tokens = fb2_to_tokens(file)
+
+    if args.target == 'html':
+        content = join_tokens_to_html(tokens)
+    else:
+        content = join_tokens_to_txt(tokens)
+
+    output_file = args.output
+
+    output_file.write(content)
+    if output_file is not sys.stdout:
+        # Close first so the result is flushed before success is reported
+        output_file.close()
+        print(f'Finished writing to {output_file.name}')
diff --git a/fb2/tokens.py b/fb2/tokens.py
new file mode 100644
index 0000000..98f1486
--- /dev/null
+++ b/fb2/tokens.py
@@ -0,0 +1,13 @@
+from typing import IO
+
+from .types import DocumentTokens
+
+
+def fb2_to_tokens(file: IO) -> DocumentTokens:
+    """
+    TODO: implement
+
+    Fails explicitly until then instead of silently returning None
+    """
+
+    raise NotImplementedError("fb2_to_tokens is not implemented yet")
diff --git a/fb2/txt.py b/fb2/txt.py
new file mode 100644
index 0000000..e298b4b
--- /dev/null
+++ b/fb2/txt.py
@@ -0,0 +1,5 @@
+from .types import DocumentTokens
+
+
+def join_tokens_to_txt(tokens: DocumentTokens) -> str:
+    return ''
diff --git a/fb2/types.py b/fb2/types.py
new file mode 100644
index 0000000..567a2cd
--- /dev/null
+++ b/fb2/types.py
@@ -0,0 +1,3 @@
+from typing import Union
+
+DocumentTokens = dict[str, Union[str, dict[str, str], bytes]]
diff --git a/fb2/utils.py b/fb2/utils.py
new file mode 100644
index 0000000..21b75fc
--- /dev/null
+++ b/fb2/utils.py
@@ -0,0 +1,14 @@
+import re
+
+namespaces = {
+    "": "http://www.gribuser.ru/xml/fictionbook/2.0",
+    "xlink": "http://www.w3.org/1999/xlink",
+}
+
+HREF = f"{{{namespaces['xlink']}}}href"
+
+replacements = [
+    (" ", ""),
+    ("", ""),
+    (r">\s+?<", "><"),
+]
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e69de29