diff --git a/CHANGELOG b/CHANGELOG
index 0162aae..c736d63 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
 as of 1.3.0.
 
+## [Unreleased]
+
+### Added
+
+- Add urls support to collections. (#59)
+
 ## [1.1.1] - 2023-05-05
 
 ### Changed
@@ -41,9 +47,6 @@ as of 1.3.0.
 - Replaced zimwriterfs with zimscraperlib –using libzim 7 without namespaces– (#41)
 - Removed inline javascript to comply with some CSP (#34)
 
-## [1.2.1] - 2022-08-03
-
-
 ## [1.0.7] - 2022-01-04
 
 - removed inline JS in homepage (#34)
diff --git a/README.md b/README.md
index 4a47f26..cc6c314 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,19 @@ At the moment, the JSON file needs to provide the following fields for each item
         "description": "...",
         "authors": "...",
         "files": ["relative/path/to/file"]
-    }
+    },
+    {
+        "title": "...",
+        "description": "...",
+        "authors": "...",
+        "files": [
+            {
+                "archive-member": "01 BOOK for printing .pdf", // optional if `url` is set, member name inside archive (same as simpler format)
+                "url": "http://books.com/310398120.pdf", // optional if `archive-member` is set, takes precedence over `archive-member`, url to download file from
+                "filename": "My book.pdf" // optional, filename to use in ZIM, regardless of original one
+            }
+        ]
+    }
 ]
 ```
 
diff --git a/nautiluszim/scraper.py b/nautiluszim/scraper.py
index a206b8b..2c71816 100644
--- a/nautiluszim/scraper.py
+++ b/nautiluszim/scraper.py
@@ -8,11 +8,13 @@
 import os
 import pathlib
 import shutil
+import tempfile
 import unicodedata
+import urllib.parse
 import uuid
 import zipfile
 from pathlib import Path
-from typing import Optional
+from typing import Dict, Optional, Tuple, Union
 
 import jinja2
 from zimscraperlib.constants import (
@@ -338,16 +340,46 @@ def test_collection(self):
         nb_files = sum([len(i.get("files", [])) for i in self.json_collection])
         logger.info(f"Collection loaded. {nb_items} items, {nb_files} files")
 
+        self.test_files()
+
+    def test_files(self):
+        """Check that every file referenced by the collection is usable.
+
+        Archive members must be present in the archive and the names given to
+        files in the ZIM must be unique across the whole collection.
+
+        Raises:
+            ValueError: if an entry is invalid or missing from the archive,
+                or if several files would get the same name.
+        """
         with zipfile.ZipFile(self.archive_path, "r") as zh:
-            all_names = zh.namelist()
+            # a set, as membership is tested once per collection file
+            all_names = set(zh.namelist())
 
         missing_files = []
+        all_file_names = []
         for entry in self.json_collection:
             if not entry.get("files"):
                 continue
-            for relative_path in entry["files"]:
-                if relative_path not in all_names:
-                    missing_files.append(relative_path)
+            for file in entry["files"]:
+                try:
+                    uri, filename = self.get_file_entry_from(file)
+                except ValueError:
+                    # unusable entry: report the item it belongs to
+                    missing_files.append(entry.get("title") or repr(file))
+                    continue
+                all_file_names.append(filename)
+                is_url = uri.lower().startswith(("http://", "https://"))
+                if not is_url and uri not in all_names:
+                    missing_files.append(uri)
+
+        # single pass, instead of a list.count() scan for every name
+        seen = set()
+        duplicate_file_names = set()
+        for filename in all_file_names:
+            if filename in seen:
+                duplicate_file_names.add(filename)
+            seen.add(filename)
 
         if missing_files:
             raise ValueError(
@@ -355,16 +387,69 @@ def test_collection(self):
                 + "\n - ".join(missing_files)
             )
 
+        if duplicate_file_names:
+            raise ValueError(
+                "Files in collection are duplicate:\n - "
+                + "\n - ".join(sorted(duplicate_file_names))
+            )
+
+    def get_file_entry_from(self, file: Union[str, Dict[str, str]]) -> Tuple[str, str]:
+        """Convert a collection file entry to a ``(uri, filename)`` pair.
+
+        ``uri`` is where to get the file from: a URL or an archive member name.
+        ``filename`` is the name to store the file under in the ZIM.
+
+        Raises:
+            ValueError: if the entry has neither ``archive-member`` nor ``url``,
+                or if no filename can be found for it.
+        """
+        # old format: the entry is only the archive member name
+        if isinstance(file, str):
+            return (file, file)
+
+        archive_member = file.get("archive-member")
+        url = file.get("url")
+        if not archive_member and not url:
+            raise ValueError("archive_member and url are both missing")
+
+        if url:  # url has precedence over archive-member
+            uri = url
+            # name from the URL path only: no query string nor fragment
+            url_path = urllib.parse.urlsplit(url).path
+            filename = Path(url_path).name
+        else:
+            uri = archive_member
+            filename = archive_member
+
+        # `or` so that an empty or null "filename" uses the computed one
+        filename = file.get("filename") or filename
+        if not filename:
+            raise ValueError(f"unable to find a filename for {uri}")
+        return (uri, filename)
+
     def process_collection_entries(self):
         for entry in self.json_collection:
             if not entry.get("files"):
                 continue
 
-            for relative_path in entry["files"]:
-                logger.debug(f"> {relative_path}")
+            for file in entry["files"]:
+                uri, filename = self.get_file_entry_from(file)
+                logger.debug(f"> {uri}")
+
+                if uri.lower().startswith(("http://", "https://")):
+                    # only the unique path is needed: close the handle right
+                    # away so it is not left open while the file is downloaded
+                    with tempfile.NamedTemporaryFile(
+                        dir=self.build_dir, delete=False
+                    ) as tmp_file:
+                        fpath = pathlib.Path(tmp_file.name)
+                    save_large_file(uri, fpath)
+                else:
+                    fpath = self.extract_to_fs(uri)
+
                 self.zim_creator.add_item_for(
-                    path="files/" + normalized_path(relative_path),
-                    fpath=self.extract_to_fs(relative_path),
+                    path="files/" + normalized_path(filename),
+                    fpath=fpath,
                     delete_fpath=True,
                     is_front=False,
                 )
@@ -443,7 +528,8 @@ def add_ui(self):
                     "dsc": document.get("description") or "",
                     "aut": document.get("authors") or "",
                     "fp": [
-                        normalized_path(path) for path in document.get("files", [])
+                        normalized_path(self.get_file_entry_from(file)[1])
+                        for file in document.get("files", [])
                     ],
                 }
             )