diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index e878f0b..11798e0 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -16,16 +16,19 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version-file: pyproject.toml + python-version-file: scraper/pyproject.toml architecture: x64 - name: Build packages + working-directory: scraper run: | pip install -U pip build python -m build --sdist --wheel - name: Upload to PyPI uses: pypa/gh-action-pypi-publish@release/v1.8 + with: + packages-dir: scraper/dist/ - name: Build and push Docker image uses: openzim/docker-publish-action@v10 diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index a9d2172..106548f 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -7,7 +7,7 @@ on: - main jobs: - check-qa: + check-scraper-qa: runs-on: ubuntu-22.04 steps: @@ -16,19 +16,49 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version-file: pyproject.toml + python-version-file: scraper/pyproject.toml architecture: x64 - - name: Install dependencies (and project) + - name: Install dependencies + working-directory: scraper run: | pip install -U pip pip install -e .[lint,check,scripts,test] - name: Check black formatting + working-directory: scraper run: inv lint-black - name: Check ruff + working-directory: scraper run: inv lint-ruff - name: Check pyright + working-directory: scraper run: inv check-pyright + + check-zimui-qa: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Node.js + uses: actions/setup-node@v3 + with: + node-version-file: zimui/.node-version + + - name: Install JS dependencies + working-directory: zimui + run: | + yarn install + + - name: Check prettier + working-directory: zimui + run: | + yarn format + + - name: Check eslint + working-directory: zimui + run: | + yarn lint diff --git a/.github/workflows/Tests.yml b/.github/workflows/Tests.yml index 
838269f..3c6276f 100644 --- a/.github/workflows/Tests.yml +++ b/.github/workflows/Tests.yml @@ -7,7 +7,7 @@ on: - main jobs: - run-tests: + test-scraper: runs-on: ubuntu-22.04 steps: @@ -16,15 +16,17 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version-file: pyproject.toml + python-version-file: scraper/pyproject.toml architecture: x64 - name: Install dependencies (and project) + working-directory: scraper run: | pip install -U pip pip install -e .[test,scripts] - name: Run the tests + working-directory: scraper run: inv coverage --args "-vvv" - name: Upload coverage report to codecov @@ -32,7 +34,7 @@ jobs: with: token: ${{ secrets.CODECOV_TOKEN }} - build_python: + build-scraper: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 @@ -40,15 +42,36 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version-file: pyproject.toml + python-version-file: scraper/pyproject.toml architecture: x64 - name: Ensure we can build Python targets + working-directory: scraper run: | pip install -U pip build python3 -m build --sdist --wheel - build_docker: + build-zimui: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + + - name: Set up Node.js + uses: actions/setup-node@v3 + with: + node-version-file: zimui/.node-version + + - name: Install dependencies + working-directory: zimui + run: | + yarn install + + - name: Build + working-directory: zimui + run: | + yarn build + + build-docker: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 diff --git a/.gitignore b/.gitignore index db971bd..ece9e81 100644 --- a/.gitignore +++ b/.gitignore @@ -368,16 +368,6 @@ pyrightconfig.json # assets that we download .dockerignore -src/kolibri2zim/templates/assets/bootstrap/ -src/kolibri2zim/templates/assets/pdfjs/ -src/kolibri2zim/templates/assets/videojs/ -src/kolibri2zim/templates/assets/jquery.min.js -src/kolibri2zim/templates/assets/ogvjs/ -src/kolibri2zim/templates/assets/videojs-ogvjs.js 
-src/kolibri2zim/templates/assets/epub.min.js -src/kolibri2zim/templates/assets/bootstrap-icons/ -src/kolibri2zim/templates/assets/jszip.min.js -src/kolibri2zim/templates/assets/perseus/ # output dir output diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 28396f6..70e7be2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,8 +4,24 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 hooks: - - id: trailing-whitespace - - id: end-of-file-fixer + - id: trailing-whitespace + - id: end-of-file-fixer +- repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.0.3 + hooks: + - id: prettier + files: zimui\/.*$ # files in zimui folder +- repo: https://github.com/pre-commit/mirrors-eslint + rev: v8.51.0 + hooks: + - id: eslint + types: [file] + files: zimui\/src\/.*(?:\.[jt]sx?|\.vue)$ # *.js, *.jsx, *.ts, *.tsx, *.vue in zimui/src folder + args: + - --ignore-path + - zimui/.eslintignore + - --config + - zimui/.eslintrc.cjs - repo: https://github.com/psf/black rev: "24.2.0" hooks: diff --git a/CHANGELOG.md b/CHANGELOG.md index 20fd120..7104eb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Enhance contribution guidelines: Refactored shell scripts, added additional steps. 
(#91) + ## [1.2.1] - 2024-02-29 ### Changed @@ -48,6 +50,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Default publisher is not correctly spelled (#78) - Adapt to hatchling v1.19.0 which mandates packages setting (#79) - Small fixes in invoke tasks +- Force Python version to 3.11 (3.12 is not yet ready in our dependencies) ### Changed @@ -55,6 +58,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Github workflow: publish `dev` tag on every push to `main` branch - Github workflow: build Docker image + test its startup - Github workflow: adopt new standard execution structure (`on` conditions) +- Scraper (Python code) has been moved to the scraper subfolder +- Vue.JS is now used as main UI framework + - all its code is in the zimui subfolder + - it is rendered with Vite to produce a static website + - developer instructions have been adapted +- QA and Tests workflows have been adapted + - to the new folder structure + - to also QA and Test the Vue.JS part +- precommit hooks have been configured for the Vue.JS part +- Dockerfile has been adapted to first build the Vue.JS part in a dedicated stage and then embed the generated files into the final Python-based image +- Topics are stored as JSON files in the ZIM + - JSON is generated by pydantic + - these files are consumed by the Vue.JS UI + - content (video, audio, pdf, epub, ...) 
is still rendered by Jinja2 as before +- URLs are meaningful slugs + - permalink based on Kolibri node title + 4 chars from node ID + - generated by Python slugify lib +- changes in the ZIM "folder" structure: + - files generated by Vite are placed in / + - thumbnails are placed in /thumbnails + - JSON files generated to render topics are placed in /topics + - most Kolibri content (video, audio, ePub, PDF) are placed in /files (some content is still placed at the root to not break some stuff which was found hard to fix for now, will be tackled in specific issues for each content type) +- legacy MANIFEST.in has been deleted (left-over from migration to hatch) +- is_front property has been adjusted when adding the item to the ZIM +- one new CLI argument --zimui-dist to specify the folder where zimui has been built (by Vite) + + ## [1.1.0] - 2023-07-25 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3d3cd75..17ffe80 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,9 @@ * Open issues, bug reports and send PRs [on github](https://github.com/openzim/kolibri2zim). * Make sure it's `py3.6+` compatible. +* Docker (which engine must run in the background). * Use [black](https://github.com/psf/black) code formatting. +* `pre-commit` must be activated before (code must pass all pre-commit checks to succeed in the CI) ## notes @@ -25,12 +27,44 @@ To add a new locale (`fr` in this example, use only ISO-639-1): 3. translate the PO file ([poedit](https://poedit.net/) is your friend) 4. compile updated translation `pybabel compile -d kolibri2zim/locale -l fr` -## releasing -* Update your dependencies: `pip install -U setuptools wheel twine` -* Make sure CHANGELOG is up-to-date -* Bump version on `kolibri2zim/VERSION` -* Build packages `python ./setup.py sdist bdist_wheel` -* Upload to PyPI `python -m twine upload dist/kolibri2zim-1.0.0*`. 
-* Commit your CHANGELOG + version bump changes -* Tag version on git `git tag -a v1.0.0` +## Developing the ZIM UI in Vue.JS + +Sometimes you need to alter something in the ZIM UI in Vue.JS but for this to work, you need assets which are generated by the scraper (e.g. channel.json, ...). + +To simplify this, it is possible to: +- run the scraper (with original code base or your modified one) +- extract assets from generated files and place them in a directory where ZIM UI will find them +- iterate on ZIM UI code + +To achieve this, first build the Docker image based on current code base. + +``` +docker build -t local-kolibri2zim . +``` + +Scrape a channel (here we use the minimal channel, but you could use any other one of interest for your UI developments). + +``` +docker run --rm -it -v "$PWD/output":/output local-kolibri2zim kolibri2zim --name "minimal_test" --title "Minimal Kolibri Channel Test" \ +  --description "This is a minimal Kolibri Channel, with new Kolibri UI" --channel-id "7f744ce8d28b471eaf663abd60c92267" --zim-file "Minimal_Test.zim" +``` + +Extract interesting ZIM content and move it to `public` folder. + +``` +find zimui/public/ -mindepth 1 -maxdepth 1 ! -name ".gitignore" -delete +docker run -it --rm -v $(pwd)/output:/data ghcr.io/openzim/zim-tools:latest zimdump dump --dir=/data/Minimal_Test /data/Minimal_Test.zim +sudo chown -R $(id -u -n):$(id -g -n) output/Minimal_Test +mv output/Minimal_Test/* zimui/public/ +rm -rf output/Minimal_Test +``` + +Start ZIM UI locally. + +``` +cd zimui +yarn dev +``` + +Do not forget to clean up the `public` folder before building the docker image again, otherwise all assets will be pushed to the ZIM. 
diff --git a/Dockerfile b/Dockerfile index f83a6d4..a60c038 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,3 +1,10 @@ +FROM node:20-alpine as zimui + +WORKDIR /src +COPY zimui /src +RUN yarn install --frozen-lockfile +RUN yarn build + FROM python:3.12-bookworm LABEL org.opencontainers.image.source https://github.com/openzim/kolibri @@ -12,22 +19,28 @@ RUN apt-get update \ pip # Copy pyproject.toml and its dependencies -COPY pyproject.toml openzim.toml README.md /src/ -COPY src/kolibri2zim/__about__.py /src/src/kolibri2zim/__about__.py +COPY README.md /src/ +COPY scraper/pyproject.toml scraper/openzim.toml /src/scraper/ +COPY scraper/src/kolibri2zim/__about__.py /src/scraper/src/kolibri2zim/__about__.py # Install Python dependencies -RUN pip install --no-cache-dir /src +RUN pip install --no-cache-dir /src/scraper # Copy code + associated artifacts -COPY src /src/src -COPY *.md LICENSE *.py /src/ +COPY scraper/src /src/scraper/src +COPY *.md LICENSE /src/ # Install + cleanup -RUN pip install --no-cache-dir /src \ - && rm -rf /src +RUN pip install --no-cache-dir /src/scraper \ + && rm -rf /src/scraper + +# Copy zimui build output +COPY --from=zimui /src/dist /src/zimui # default output directory RUN mkdir -p /output WORKDIR /output +ENV KOLIBRI_ZIMUI_DIST=/src/zimui + CMD ["kolibri2zim", "--help"] diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 15b313b..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include *.md -include get_js_deps.sh -include requirements.txt -recursive-include kolibri2zim * diff --git a/README.md b/README.md index d7ebc11..e1843d6 100644 --- a/README.md +++ b/README.md @@ -13,16 +13,23 @@ recompress them in lower-quality, smaller size), the thumbnails, the subtitles and the authors' profile pictures ; then, it create a static HTML files folder of it before creating a ZIM off of it. 
+> [!WARNING] +> This scraper is under heavy modifications to prepare a v2 including a brand new UI for navigating the tree of content and a move to Vue.JS. These changes +are already merged into `main` branch but not yet completed. Should you be interested in a stable version, please use published versions (PyPI or Docker). +We also have a `v1` branch for any urgent patch needed to current production version. + Requirements ------------ +* Node 20.x +* Python 3.12 * [`ffmpeg`](https://ffmpeg.org/) for video transcoding (only used with `--use-webm` or `--low-quality`). -* `curl` and `unzip` to install Javascript dependencies. See `get_js_deps.sh` if you want to do it manually. +* `curl` and `unzip` to install Javascript dependencies. See `get_web_deps.sh` if you want to do it manually. Installation ------------ -## Virtualenv +### Virtualenv `kolibri2zim` is a Python3 software. If you are not using the [Docker](https://docker.com) image, you are advised to use it in a @@ -37,9 +44,15 @@ kolibri2zim --help # Display kolibri2zim help Call `deactivate` to quit the virtual environment. -See `requirements.txt` for the list of python dependencies. +See `pyproject.toml` for the list of python dependencies. + +To test epubs and pdfs rendering, a potential useful command is: + +```bash +kolibri2zim --name "Biblioteca Elejandria" --output /output --tmp-dir /tmp --zim-file Biblioteca_Elejandria.zim --channel-id "fed29d60e4d84a1e8dcfc781d920b40e" --node-ids 'd92c07655128458f8248416154b18a68,89fe2f86ee3f4fbaa7fb2bf9bd56d088,75f99e6b97d14b14a4e74762ad77391f,89fe2f86ee3f4fbaa7fb2bf9bd56d088' +``` -## Docker +### Docker ```bash docker run -v my_dir:/output ghcr.io/openzim/kolibri kolibri2zim --help @@ -60,9 +73,34 @@ kolibri2zim has implemented openZIM's [Python bootstrap, conventions and policie Before contributing be sure to check out the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines. 
-To test epubs and pdfs rendering, a potential useful command is: -```bash -kolibri2zim --name "Biblioteca Elejandria" --output /output --tmp-dir /tmp --zim-file Biblioteca_Elejandria.zim --channel-id "fed29d60e4d84a1e8dcfc781d920b40e" --node-ids 'd92c07655128458f8248416154b18a68,89fe2f86ee3f4fbaa7fb2bf9bd56d088,75f99e6b97d14b14a4e74762ad77391f,89fe2f86ee3f4fbaa7fb2bf9bd56d088' +Some useful test channels: + +- 7f744ce8d28b471eaf663abd60c92267: a very minimal channel with all kinds of content +- 9f15f4e9aeaa48b5ae271e5749d6fe80: a small channel with significantly nested items and all kinds of content + +### Building and running scraper locally + +You have to: + +- build the `zimui` frontend which will be embedded inside the ZIM (and redo it every time you make modifications to the `zimui`) +- run the `scraper` to retrieve the Kolibri channel and build the ZIM + +Sample commands: + +``` +cd zimui +yarn install +yarn build +cd ../scraper +hatch run kolibri2zim --name "Biblioteca Elejandria" --output output --zim-file Biblioteca_Elejandria.zim --channel-id "fed29d60e4d84a1e8dcfc781d920b40e" --node-ids 'd92c07655128458f8248416154b18a68,89fe2f86ee3f4fbaa7fb2bf9bd56d088,75f99e6b97d14b14a4e74762ad77391f,89fe2f86ee3f4fbaa7fb2bf9bd56d088' +``` + +### Running scraper with Docker + +Run from official version (published on GHCR.io) ; ZIM will be available in the `output` sub-folder of current working directory. 
+ +``` +docker run --rm -it -v $(pwd)/output:/output ghcr.io/openzim/kolibri:latest kolibri2zim --name "Biblioteca Elejandria" --output /output --tmp-dir /tmp --zim-file Biblioteca_Elejandria.zim --channel-id "fed29d60e4d84a1e8dcfc781d920b40e" --node-ids 'd92c07655128458f8248416154b18a68,89fe2f86ee3f4fbaa7fb2bf9bd56d088,75f99e6b97d14b14a4e74762ad77391f,89fe2f86ee3f4fbaa7fb2bf9bd56d088' ``` License diff --git a/USAGE.md b/USAGE.md new file mode 100644 index 0000000..668111c --- /dev/null +++ b/USAGE.md @@ -0,0 +1,32 @@ +# Usage of Kolibri2zim +Docker +------ + +- Clone the kolibri2zim repository to your local machine + +- run the following command with the channel id and `name-of-the-zim` you are converting to .zim, `channel-id` is a 32-character-long ID that you can find in the URL of the channel you want, either from [Kolibri Studio](https://studio.learningequality.org) or the [Kolibri Catalog](https://kolibri-catalog-en.learningequality.org) + +```bash +docker run -v my_dir:/output ghcr.io/openzim/kolibri kolibri2zim --channel-id `channel-id` --name `name-of-the-zim` +``` + +- This will create a `.zim` file in the `/output` folder, which will be persisted in the my_dir Docker volume. + +- For getting this .zim file on to your local machine you can save it to your desktop by using `save` command. + +- For opening this `.zim` file, you need a ZIM reader, you could use a Kiwix one and you might use [kiwix-serve](https://kiwix.org/en/applications/). + +- now you can access that created `.zim` file from the `kiwix-serve ui` and start the server on the localhost. + +- Whenever you make code changes during development, you need to create a Docker image of your modified code using + +```bash +docker build -t `your-image-name`:`version` . +``` +- Here, "your-image-name" would be replaced with the name you choose for your Docker image, and "version" would be replaced with a version tag, like "latest," "v1.0," etc. 
this image is for local use only, and thus doesn't need to follow any standardized naming or versioning conventions. + +- You need to run that image going into the `images` section in docker. + +```bash +docker run -v my_dir:/output `your-image-name:version` kolibri2zim --channel-id `channel-id` --name `name-of-the-zim` +``` diff --git a/dump_channel_to_fs.py b/scraper/dump_channel_to_fs.py similarity index 100% rename from dump_channel_to_fs.py rename to scraper/dump_channel_to_fs.py diff --git a/openzim.toml b/scraper/openzim.toml similarity index 88% rename from openzim.toml rename to scraper/openzim.toml index bab94b8..7da472a 100644 --- a/openzim.toml +++ b/scraper/openzim.toml @@ -93,3 +93,15 @@ remove = ["*.txt","*.md",] execute_after=[ '''sed -i '1s/""/"assets\/perseus\/"/' perseus/build/frame-perseus.js''' ] + +[files.assets.actions."lato_ttf"] +action="get_file" +source="https://dev.kiwix.org/fonts/lato/lato-v24-latin-regular.ttf" +target_file="lato-v24-latin-regular.ttf" +target_dir="fonts" + +[files.assets.actions."lato_woff2"] +action="get_file" +source="https://dev.kiwix.org/fonts/lato/lato-v24-latin-regular.woff2" +target_file="lato-v24-latin-regular.woff2" +target_dir="fonts" diff --git a/pyproject.toml b/scraper/pyproject.toml similarity index 97% rename from pyproject.toml rename to scraper/pyproject.toml index e84bb5f..ad104a9 100644 --- a/pyproject.toml +++ b/scraper/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "hatchling.build" name = "kolibri2zim" requires-python = ">=3.12,<3.13" description = "Make ZIM file from Kolibri Channels" -readme = "README.md" +readme = "../README.md" dependencies = [ "zimscraperlib==3.3.1", "kiwixstorage==0.8.3", @@ -14,6 +14,9 @@ dependencies = [ "pif==0.8.2", "beautifulsoup4==4.9.3", "retrying==1.3.4", + "pydantic==2.4.2", + "python-slugify==8.0.1", + "pyhumps==3.8.0", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] @@ -100,6 +103,7 @@ all = "inv checkall --args '{args}'" 
[tool.black] line-length = 88 target-version = ['py312'] +exclude="(src/kolibri2zim/templates/.*|.hatch/.*)" [tool.ruff] target-version = "py312" diff --git a/src/kolibri2zim/__about__.py b/scraper/src/kolibri2zim/__about__.py similarity index 100% rename from src/kolibri2zim/__about__.py rename to scraper/src/kolibri2zim/__about__.py diff --git a/src/kolibri2zim/__init__.py b/scraper/src/kolibri2zim/__init__.py similarity index 100% rename from src/kolibri2zim/__init__.py rename to scraper/src/kolibri2zim/__init__.py diff --git a/src/kolibri2zim/constants.py b/scraper/src/kolibri2zim/constants.py similarity index 90% rename from src/kolibri2zim/constants.py rename to scraper/src/kolibri2zim/constants.py index 1f3de18..062c150 100644 --- a/src/kolibri2zim/constants.py +++ b/scraper/src/kolibri2zim/constants.py @@ -31,6 +31,8 @@ "jszip.min.js", "jquery.min.js", "videojs-ogvjs.js", + "fonts/lato-v24-latin-regular.woff2", + "fonts/lato-v24-latin-regular.ttf", ] diff --git a/src/kolibri2zim/database.py b/scraper/src/kolibri2zim/database.py similarity index 98% rename from src/kolibri2zim/database.py rename to scraper/src/kolibri2zim/database.py index 7ecfa6e..bdcc17d 100644 --- a/src/kolibri2zim/database.py +++ b/scraper/src/kolibri2zim/database.py @@ -102,7 +102,7 @@ def get_node_children(self, node_id, left=None, right=None): right = node["right"] for row in self.get_rows( - "SELECT id, title, kind " + "SELECT id, title, description, kind, lft as left, rght as right " "FROM content_contentnode WHERE lft > ? AND rght < ? " "AND parent_id=?" 
"ORDER BY level ASC", diff --git a/src/kolibri2zim/debug.py b/scraper/src/kolibri2zim/debug.py similarity index 100% rename from src/kolibri2zim/debug.py rename to scraper/src/kolibri2zim/debug.py diff --git a/src/kolibri2zim/entrypoint.py b/scraper/src/kolibri2zim/entrypoint.py similarity index 96% rename from src/kolibri2zim/entrypoint.py rename to scraper/src/kolibri2zim/entrypoint.py index f003403..7dfb3a5 100755 --- a/src/kolibri2zim/entrypoint.py +++ b/scraper/src/kolibri2zim/entrypoint.py @@ -2,6 +2,7 @@ # vim: ai ts=4 sts=4 et sw=4 nu import argparse +import os import sys from kolibri2zim.constants import NAME, SCRAPER, logger @@ -136,6 +137,15 @@ def parse_args(raw_args): "Receives all data (storage space)", ) + parser.add_argument( + "--zimui-dist", + type=str, + help=( + "Directory containing Vite build output from the Zim UI Vue.JS application" + ), + default=os.getenv("KOLIBRI_ZIMUI_DIST", "../zimui/dist"), + ) + parser.add_argument( "--zim-file", help="ZIM file name (based on --name if not provided)", diff --git a/src/kolibri2zim/nodes.py b/scraper/src/kolibri2zim/nodes.py similarity index 100% rename from src/kolibri2zim/nodes.py rename to scraper/src/kolibri2zim/nodes.py diff --git a/src/kolibri2zim/processing.py b/scraper/src/kolibri2zim/processing.py similarity index 100% rename from src/kolibri2zim/processing.py rename to scraper/src/kolibri2zim/processing.py diff --git a/scraper/src/kolibri2zim/schemas.py b/scraper/src/kolibri2zim/schemas.py new file mode 100644 index 0000000..c7ca5d5 --- /dev/null +++ b/scraper/src/kolibri2zim/schemas.py @@ -0,0 +1,59 @@ +from humps import camelize +from pydantic import BaseModel + + +class CamelModel(BaseModel): + """Model than transform Python snake_case into JSON camelCase""" + + class Config: + alias_generator = camelize + populate_by_name = True + + +class TopicSubSection(CamelModel): + """One subclass to serialize data about one Kolibri topic""" + + slug: str + title: str + description: str + kind: str + 
thumbnail: str | None + + +class TopicSection(CamelModel): + """Another subclass to serialize data about one Kolibri topic""" + + slug: str + title: str + description: str + kind: str + thumbnail: str | None + subsections: list[TopicSubSection] + + +class TopicParent(CamelModel): + """Information about a parent of one Kolibri topic""" + + slug: str + title: str + + +class Topic(CamelModel): + """Class to serialize data about one Kolibri topic + + One topic is composed of parents, sections and subsections. + This is already preprocessed information, closely adapted + to current UI needs + """ + + parents: list[TopicParent] + title: str + description: str + sections: list[TopicSection] + thumbnail: str | None + + +class Channel(CamelModel): + """Class to serialize data about the Kolibri channel""" + + root_slug: str diff --git a/src/kolibri2zim/scraper.py b/scraper/src/kolibri2zim/scraper.py similarity index 81% rename from src/kolibri2zim/scraper.py rename to scraper/src/kolibri2zim/scraper.py index 649704a..5d11539 100644 --- a/src/kolibri2zim/scraper.py +++ b/scraper/src/kolibri2zim/scraper.py @@ -20,17 +20,12 @@ from bs4 import BeautifulSoup from kiwixstorage import KiwixStorage from pif import get_public_ip -from zimscraperlib.constants import ( - MAXIMUM_DESCRIPTION_METADATA_LENGTH as MAX_DESC_LENGTH, -) -from zimscraperlib.constants import ( - MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH, -) +from slugify import slugify from zimscraperlib.filesystem import get_file_mimetype from zimscraperlib.i18n import find_language_names from zimscraperlib.image.convertion import convert_image, create_favicon from zimscraperlib.image.transformation import resize_image -from zimscraperlib.inputs import handle_user_provided_file +from zimscraperlib.inputs import compute_descriptions, handle_user_provided_file from zimscraperlib.video.presets import VideoMp4Low, VideoWebmHigh, VideoWebmLow from zimscraperlib.zim.creator import Creator from 
zimscraperlib.zim.items import StaticItem @@ -43,6 +38,13 @@ get_size_and_mime, safer_reencode, ) +from kolibri2zim.schemas import ( + Channel, + Topic, + TopicParent, + TopicSection, + TopicSubSection, +) options = [ "debug", @@ -93,10 +95,13 @@ def read_from_zip(ark, member): def wrap_failure_details(func): def wrapper(self, item): + node_id = kind = None try: node_id, kind = item return func(self, item) except Exception as exc: + if not node_id or not kind: + raise raise RuntimeError(f"Failed to process {kind} node {node_id}") from exc return wrapper @@ -145,6 +150,7 @@ def go(option): if go("tmp_dir"): Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True) # pyright: ignore self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir"))) + self.zimui_dist = Path(go("zimui_dist") or "../zimui/dist") # performances options self.nb_threads = int(go("threads") or 1) @@ -169,19 +175,20 @@ def go(option): loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True ) + # a dictionnary mapping node_id (keys) to slug (values) + self.nodes_ids_to_slugs: dict[str, str] = {} + @property def templates_dir(self): return ROOT_DIR.joinpath("templates") def add_local_files(self, root_path, folder): """recursively add local files from {folder} starting at {path}""" - non_front = ("viewer.html", "epub_embed.html") for fpath in folder.iterdir(): path = "/".join([root_path, fpath.name]) if fpath.is_file(): - mimetype = "text/html;raw=true" if fpath.name in non_front else None self.creator.add_item_for( - path=path, title="", fpath=fpath, mimetype=mimetype + path=path, title="", fpath=fpath, is_front=False ) logger.debug(f"Adding {path}") else: @@ -202,6 +209,61 @@ def schedule_node(item): if self.node_ids is None or node["id"] in self.node_ids: schedule_node((node["id"], node["kind"])) + def get_or_create_node_slug(self, node) -> str: + """Compute a unique slug to be used as URL for a given node""" + if node["id"] in self.nodes_ids_to_slugs: + return 
self.nodes_ids_to_slugs[node["id"]] + if "title" in node: + slug = f"{slugify(node['title'])}-{node['id'][:4]}" + else: + slug = node["id"] + if slug in self.nodes_ids_to_slugs.values(): + # detect extreme case where we have a conflict + conflicting_node_id = { + slug: node_id for node_id, slug in self.nodes_ids_to_slugs.items() + }[slug] + logger.error( + f"Slug conflict detected between node {conflicting_node_id} and node" + f" {node['id']}, both have same slug {slug}" + ) + raise Exception("Slug conflict, cannot proceed any further") + self.nodes_ids_to_slugs[node["id"]] = slug + return slug + + def get_node_with_slugs(self, node_id, *, with_parents=False, with_children=False): + node = self.db.get_node( + node_id=node_id, with_parents=with_parents, with_children=with_children + ) + + node["slug"] = self.get_or_create_node_slug(node) + if with_parents: + # transform generators into list so we can use them multiple times + node["parents"] = list(node["parents"]) + for parent in node["parents"]: + parent["slug"] = self.get_or_create_node_slug(parent) + if with_children: + # transform generators into list so we can use them multiple times + node["children"] = list(node["children"]) + for child in node["children"]: + child["slug"] = self.get_or_create_node_slug(child) + return node + + def add_channel_json(self): + node = self.get_node_with_slugs( + node_id=self.root_id, with_parents=True, with_children=True + ) + + with self.creator_lock: + self.creator.add_item_for( + path="channel.json", + title=node["title"], + content=Channel(root_slug=node["slug"]).model_dump_json( + by_alias=True, indent=2 + ), + mimetype="application/json", + is_front=False, + ) + @wrap_failure_details def add_node(self, item): """process a content node from the tuple in queue""" @@ -217,18 +279,18 @@ def add_node(self, item): # add thumbnail to zim if there's one for this node thumbnail = self.db.get_node_thumbnail(node_id) if thumbnail: - self.funnel_file(thumbnail["id"], thumbnail["ext"]) 
+ self.funnel_file(thumbnail["id"], thumbnail["ext"], "thumbnails/") # fire the add_{kind}_node() method which will actually process it handler(node_id) - def funnel_file(self, fid, fext): + def funnel_file(self, fid, fext, path_prefix=""): """directly add a Kolibri file to the ZIM using same name""" url, fname = get_kolibri_url_for(fid, fext) size, mimetype = get_size_and_mime(url) item_kw = { - "path": fname, + "path": path_prefix + fname, "title": "", "mimetype": mimetype, "delete_fpath": True, @@ -322,19 +384,56 @@ def add_topic_node(self, node_id): Topic nodes are used only for hierarchy and solely contains metadata""" # fetch details including parents for breadcrumb and children to link to - node = self.db.get_node(node_id, with_parents=True, with_children=True) - - html = self.jinja2_env.get_template("topic.html").render( - node_id=node_id, **node + node = self.get_node_with_slugs( + node_id=node_id, with_parents=True, with_children=True ) + with self.creator_lock: self.creator.add_item_for( - path=node_id, + path=f"topics/{node['slug']}.json", title=node["title"], - content=html, - mimetype="text/html", + content=Topic( + parents=[ + TopicParent( + slug=self.get_or_create_node_slug(parent), + title=parent["title"], + ) + for parent in node["parents"] + ], + title=node["title"], + description=node["description"], + sections=[ + TopicSection( + slug=self.get_or_create_node_slug(section), + title=section["title"], + description=section["description"], + kind=section["kind"], + thumbnail=self.db.get_thumbnail_name(section["id"]), + subsections=[ + TopicSubSection( + slug=self.get_or_create_node_slug(subsection), + title=subsection["title"], + description=subsection["description"], + kind=subsection["kind"], + thumbnail=self.db.get_thumbnail_name( + subsection["id"] + ), + ) + for subsection in self.db.get_node_children( + section["id"], + section["left"], + section["right"], + ) + ], + ) + for section in node["children"] + ], + 
thumbnail=self.db.get_thumbnail_name(node_id), + ).model_dump_json(by_alias=True, indent=2), + mimetype="application/json", + is_front=False, ) - logger.debug(f"Added topic #{node_id}") + logger.debug(f"Added topic #{node_id} - {node['slug']}") def add_video_node(self, node_id): """Add content from this `video` node to zim @@ -438,7 +537,7 @@ def add_video_node(self, node_id): } ) - node = self.db.get_node(node_id, with_parents=True) + node = self.get_node_with_slugs(node_id, with_parents=True) html = self.jinja2_env.get_template("video.html").render( node_id=node_id, video_filename=video_filename, @@ -450,12 +549,13 @@ def add_video_node(self, node_id): ) with self.creator_lock: self.creator.add_item_for( - path=node_id, + path=f"files/{node['slug']}/", title=node["title"], content=html, mimetype="text/html", + is_front=True, ) - logger.debug(f"Added video #{node_id}") + logger.debug(f"Added video #{node_id} - {node['slug']}") @contextmanager def cleanup_future_once_done(self, future): @@ -559,7 +659,7 @@ def add_audio_node(self, node_id): return self.funnel_file(file["id"], file["ext"]) - node = self.db.get_node(node_id, with_parents=True) + node = self.get_node_with_slugs(node_id, with_parents=True) html = self.jinja2_env.get_template("audio.html").render( node_id=node_id, filename=filename_for(file), @@ -570,12 +670,13 @@ def add_audio_node(self, node_id): ) with self.creator_lock: self.creator.add_item_for( - path=node_id, + path=f"files/{node['slug']}/", title=node["title"], content=html, mimetype="text/html", + is_front=True, ) - logger.debug(f"Added audio #{node_id}") + logger.debug(f"Added audio #{node_id} - {node['slug']}") def add_exercise_node(self, node_id): """Add content from this `exercise` node to zim @@ -622,22 +723,24 @@ def add_exercise_node(self, node_id): ) assessment_items.append(perseus_content) + node = self.get_node_with_slugs(node_id, with_parents=True, with_children=False) + # add all support files to ZIM for ark_member in 
zip_ark.namelist(): if ark_member == manifest_name: continue - path = f"{node_id}/{ark_member}" + path = f"files/{node_id}/{ark_member}" with self.creator_lock: self.creator.add_item_for( path=path, title="", content=read_from_zip(zip_ark, ark_member), + is_front=False, ) logger.debug(f"Added exercise support file {path}") # prepare and add exercise HTML article - node = self.db.get_node(node_id, with_parents=True, with_children=False) html = self.jinja2_env.get_template("perseus_exercise.html").render( node_id=node_id, perseus_content=f"[{', '.join(assessment_items)}]", @@ -646,12 +749,13 @@ def add_exercise_node(self, node_id): ) with self.creator_lock: self.creator.add_item_for( - path=node_id, + path=f"files/{node['slug']}/", title=node["title"], content=html, mimetype="text/html", + is_front=True, ) - logger.debug(f"Added exercise node #{node_id}") + logger.debug(f"Added exercise node #{node_id} - {node['slug']}") def add_document_node(self, node_id): """Add content from this `document` node to zim @@ -672,9 +776,12 @@ def add_document_node(self, node_id): def target_for(file): filename = filename_for(file) if file["ext"] == "pdf": - return f"./assets/pdfjs/web/viewer.html?file=../../../{filename}" - if file["ext"] == "epub": - return f"./assets/epub_embed.html?url=../{filename}" + return f"../assets/pdfjs/web/viewer.html?file=../../../files/{filename}" + if get_is_epub(file): + return f"../assets/epub_embed.html?url=../files/{filename}" + + def get_is_epub(file): + return file["ext"] == "epub" # record the actual document files = self.db.get_node_files(node_id, thumbnail=False) @@ -694,10 +801,11 @@ def target_for(file): alt_document = None for file in files: - self.funnel_file(file["id"], file["ext"]) + self.funnel_file(file["id"], file["ext"], path_prefix="files/") file["target"] = target_for(file) - node = self.db.get_node(node_id, with_parents=True) + node = self.get_node_with_slugs(node_id, with_parents=True) + # convert generator to list as we might 
read it twice node["parents"] = list(node["parents"]) @@ -710,16 +818,18 @@ def target_for(file): for is_alt in options: html = self.jinja2_env.get_template("document.html").render( node_id=node_id, + node_slug=node["slug"], main_document=filename_for(main_document), main_document_ext=main_document["ext"], alt_document=filename_for(alt_document) if alt_document else None, alt_document_ext=alt_document["ext"] if alt_document else None, target=target_for(alt_document if is_alt else main_document), is_alt=is_alt, + is_epub=get_is_epub(alt_document if is_alt else main_document), **node, ) with self.creator_lock: - path = node_id + path = f"files/{node['slug']}/" if is_alt: path += "_alt" self.creator.add_item_for( @@ -727,8 +837,9 @@ def target_for(file): title=node["title"], content=html, mimetype="text/html", + is_front=is_alt, ) - logger.debug(f"Added document #{node_id}") + logger.debug(f"Added document #{node_id} - {node['slug']}") def add_html5_node(self, node_id): """Add content from this `html5` node to zim @@ -745,6 +856,8 @@ def add_html5_node(self, node_id): if not file: return + node = self.get_node_with_slugs(node_id) + # download ZIP file to memory ark_url, ark_name = get_kolibri_url_for(file["id"], file["ext"]) ark_data = io.BytesIO() @@ -756,8 +869,13 @@ def add_html5_node(self, node_id): if not self.dedup_html_files: with self.creator_lock: self.creator.add_item_for( - path=f"{node_id}/{ark_member}", + path=( + f"files/{node['slug']}/{ark_member}" + if ark_member != "index.html" + else f"files/{node['slug']}/" + ), content=zip_ark.open(ark_member).read(), + is_front=(ark_member == "index.html"), ) continue @@ -771,16 +889,22 @@ def add_html5_node(self, node_id): self.creator.add_item_for( path=f"html5_files/{content_hash}", content=content, + is_front=False, ) # add redirect to the unique sum-based entry for that file's path with self.creator_lock: self.creator.add_redirect( - path=f"{node_id}/{ark_member}", + path=( + 
f"files/{node['slug']}/{ark_member}" + if ark_member != "index.html" + else f"files/{node['slug']}/" + ), target_path=f"html5_files/{content_hash}", + is_front=ark_member == "index.html", ) - logger.debug(f"Added HTML5 node #{node_id}") + logger.debug(f"Added HTML5 node #{node_id} - {node['slug']}") def run(self): if self.s3_url_with_credentials and not self.s3_credentials_ok(): @@ -844,7 +968,7 @@ def run(self): return 1 self.creator = Creator( filename=self.output_dir.joinpath(self.clean_fname), - main_path=self.root_id, + main_path="home", ignore_duplicates=True, ) self.creator.config_metadata( @@ -863,9 +987,11 @@ def run(self): succeeded = False try: self.add_favicon() + self.add_zimui() + self.add_custom_about_and_css() - # add static files + # add assets files logger.info("Adding local files (assets)") self.add_local_files("assets", self.templates_dir.joinpath("assets")) @@ -891,6 +1017,8 @@ def run(self): # only awaits future completion and doesn't include callbacks self.videos_executor.shutdown() + self.add_channel_json() + nb_done_with_failure = sum( 1 if future.exception() else 0 for future in futures.done ) @@ -973,26 +1101,11 @@ def sanitize_inputs(self): if not self.title: self.title = channel_meta["name"] self.title = self.title.strip() - - if self.description and len(self.description) > MAX_DESC_LENGTH: - raise ValueError( - f"Description too long ({len(self.description)}>{MAX_DESC_LENGTH})" - ) - if self.long_description and len(self.long_description) > MAX_LONG_DESC_LENGTH: - raise ValueError( - f"LongDescription too long ({len(self.long_description)}" - f">{MAX_LONG_DESC_LENGTH})" - ) - - kolibri_desc = channel_meta["description"].strip() - if not self.long_description and len(kolibri_desc) > MAX_DESC_LENGTH: - self.long_description = kolibri_desc[0:MAX_LONG_DESC_LENGTH] - if len(kolibri_desc) > MAX_LONG_DESC_LENGTH: - self.long_description = self.long_description[:-1] + "…" - if not self.description: - self.description = 
kolibri_desc[0:MAX_DESC_LENGTH] - if len(kolibri_desc) > MAX_DESC_LENGTH: - self.description = self.description[:-1] + "…" + (self.description, self.long_description) = compute_descriptions( + channel_meta["description"].strip(), + self.description, + self.long_description, + ) if not self.author: self.author = channel_meta["author"] or "Kolibri" @@ -1047,8 +1160,25 @@ def retrieve_favicon(self): def add_favicon(self): self.creator.add_illustration(96, self.favicon_96_fpath.read_bytes()) - self.creator.add_item_for("favicon.png", fpath=self.favicon_96_fpath) - self.creator.add_item_for("favicon.ico", fpath=self.favicon_ico_path) + self.creator.add_item_for( + "favicon.png", fpath=self.favicon_96_fpath, is_front=False + ) + self.creator.add_item_for( + "favicon.ico", fpath=self.favicon_ico_path, is_front=False + ) + + def add_zimui(self): + logger.info(f"Adding files in {self.zimui_dist}") + for file in self.zimui_dist.rglob("*"): + if file.is_dir(): + continue + path = str(Path(file).relative_to(self.zimui_dist)) + logger.debug(f"Adding {path} to ZIM") + self.creator.add_item_for( + path if path != "index.html" else "home", + fpath=file, + is_front=path == "index.html", + ) def add_custom_about_and_css(self): channel_meta = self.db.get_channel_metadata(self.channel_id) @@ -1080,10 +1210,11 @@ def add_custom_about_and_css(self): ) with self.creator_lock: self.creator.add_item_for( - path="about", + path="files/about", title=title, content=html, mimetype="text/html", + is_front=True, ) del html @@ -1100,13 +1231,15 @@ def add_custom_about_and_css(self): else: content = "" - self.creator.add_item_for("custom.css", content=content, mimetype="text/css") + self.creator.add_item_for( + "custom.css", content=content, mimetype="text/css", is_front=False + ) logger.debug("Added about page and custom CSS") def ensure_js_deps_are_present(self): for dep in JS_DEPS: if not self.templates_dir.joinpath(f"assets/{dep}").exists(): raise ValueError( - "It looks like JS deps have not 
been installed," + "It looks like web deps have not been installed," f" {dep} is missing" ) diff --git a/src/kolibri2zim/templates/about.html b/scraper/src/kolibri2zim/templates/about.html similarity index 85% rename from src/kolibri2zim/templates/about.html rename to scraper/src/kolibri2zim/templates/about.html index 046fc84..00d6622 100644 --- a/src/kolibri2zim/templates/about.html +++ b/scraper/src/kolibri2zim/templates/about.html @@ -8,7 +8,7 @@
{{ description }}
{% if author %}Created by {{ author }}
{% endif %} {% if last_updated %}Updated on {{ last_updated }}
{% endif %} - + {% endif %} {% endblock %} diff --git a/scraper/src/kolibri2zim/templates/assets/.gitignore b/scraper/src/kolibri2zim/templates/assets/.gitignore new file mode 100644 index 0000000..c36eb42 --- /dev/null +++ b/scraper/src/kolibri2zim/templates/assets/.gitignore @@ -0,0 +1,11 @@ +bootstrap/ +pdfjs/ +videojs/ +jquery.min.js +ogvjs/ +videojs-ogvjs.js +epub.min.js +bootstrap-icons/ +jszip.min.js +perseus/ +lato* diff --git a/src/kolibri2zim/templates/assets/document.js b/scraper/src/kolibri2zim/templates/assets/document.js similarity index 100% rename from src/kolibri2zim/templates/assets/document.js rename to scraper/src/kolibri2zim/templates/assets/document.js diff --git a/src/kolibri2zim/templates/assets/epub_embed.css b/scraper/src/kolibri2zim/templates/assets/epub_embed.css similarity index 100% rename from src/kolibri2zim/templates/assets/epub_embed.css rename to scraper/src/kolibri2zim/templates/assets/epub_embed.css diff --git a/src/kolibri2zim/templates/assets/epub_embed.html b/scraper/src/kolibri2zim/templates/assets/epub_embed.html similarity index 100% rename from src/kolibri2zim/templates/assets/epub_embed.html rename to scraper/src/kolibri2zim/templates/assets/epub_embed.html diff --git a/src/kolibri2zim/templates/assets/epub_embed.js b/scraper/src/kolibri2zim/templates/assets/epub_embed.js similarity index 100% rename from src/kolibri2zim/templates/assets/epub_embed.js rename to scraper/src/kolibri2zim/templates/assets/epub_embed.js diff --git a/src/kolibri2zim/templates/assets/perseus_exercise.js b/scraper/src/kolibri2zim/templates/assets/perseus_exercise.js similarity index 100% rename from src/kolibri2zim/templates/assets/perseus_exercise.js rename to scraper/src/kolibri2zim/templates/assets/perseus_exercise.js diff --git a/src/kolibri2zim/templates/audio.html b/scraper/src/kolibri2zim/templates/audio.html similarity index 71% rename from src/kolibri2zim/templates/audio.html rename to scraper/src/kolibri2zim/templates/audio.html index 
3153cbe..07e6ff3 100644 --- a/src/kolibri2zim/templates/audio.html +++ b/scraper/src/kolibri2zim/templates/audio.html @@ -1,7 +1,7 @@ {% extends "base.html" %} {% block head %} - + {% block head %}{% endblock %} - ++ {% endif %} + {% block content %}{% endblock %} {% block footer %} @@ -41,7 +49,7 @@ {% endblock %}