openzim · rgaudin · Jul 21, 2023 · Jul 19, 2023 · Jul 19, 2023 · Jul 20, 2023
diff --git a/CHANGELOG b/CHANGELOG
@@ -11,6 +11,7 @@ as of 1.3.0.
 ### Added
 
 - Add urls support to collections. (#59)
+- Add archiveless collection.json support. (#60)
 
 ## [1.1.1] - 2023-05-05
 

diff --git a/README.md b/README.md
@@ -33,7 +33,7 @@ The user-interface only gives access to files referenced properly in the collect
 
 At the moment, the JSON file needs to provide the following fields for each item in an array:
 
-``` JSON
+```json
 [
     {
         "title": "...",
@@ -79,13 +79,13 @@ Either inside the archive ZIP as `/about.html` or elsewhere, specified via `--ab
 
 ## docker
 
-```
+```sh
 docker run -v my_dir:/output ghcr.io/openzim/nautilus nautiluszim --help
 ```
 
 ## pip
 
-```
+```sh
 pip install nautiluszim
 nautiluszim --help
 ```
@@ -101,9 +101,14 @@ nautiluszim --help
 
 # Usage
 
-```
+```sh
 nautiluszim --archive my-content.zip
 ```
+Or, to build from a collection file without an archive:
+```sh
+nautiluszim --collection https://example.com/to-your-collection-file
+# In this mode, every file entry must have a valid URL.
+```
 
 ## Notes
 

diff --git a/nautiluszim/entrypoint.py b/nautiluszim/entrypoint.py
@@ -16,7 +16,7 @@ def main():
     parser.add_argument(
         "--archive",
         help="Path or URL to a ZIP archive containing all the documents",
-        required=True,
+        required=False,
     )
     parser.add_argument(
         "--collection",

diff --git a/nautiluszim/scraper.py b/nautiluszim/scraper.py
@@ -13,7 +13,7 @@
 import uuid
 import zipfile
 from pathlib import Path
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict, List, Optional, Set, Tuple, Union
 
 import jinja2
 from zimscraperlib.constants import (
@@ -145,14 +145,18 @@ def run(self):
         self.check_branding_values()
 
         # download archive
-        self.download_archive()
+        if self.archive:
+            self.download_archive()
 
         if not self.collection:
             self.collection = self.extract_to_fs("collection.json")
             if not self.about:
                 self.extract_to_fs("about.html", failsafe=True)
 
-        self.test_collection()
+        if not self.archive:
+            self.test_archiveless_collection()
+        else:
+            self.test_archive_collection()
 
         logger.info("update general metadata")
         self.update_metadata()
@@ -332,52 +336,93 @@ def extract_to_fs(
                     return
                 raise exc
 
-    def test_collection(self):
+    def load_collection(self):
+        """Load the collection.json"""
         with open(self.collection, "r") as fp:
             self.json_collection = [i for i in json.load(fp) if i.get("files", [])]
         nb_items = len(self.json_collection)
         nb_files = sum([len(i.get("files", [])) for i in self.json_collection])
         logger.info(f"Collection loaded. {nb_items} items, {nb_files} files")
 
-        self.test_files()
+    def test_archiveless_collection(self):
+        """Test the collection.json without archive file"""
+        self.load_collection()
+
+        (
+            duplicate_filename,
+            missing_filenames,
+            all_uris,
+        ) = self.test_files()
+
+        if not all_uris:
+            raise ValueError("Collection is emtpy:\n")
+
+        for uri in all_uris:
+            if not uri.startswith("http"):
+                raise ValueError(
+                    f"File referenced in collection which are not urls:\n - {uri}\n "
+                )
+
+        self._ensure_no_missing_files(missing_filenames)
+        self._ensure_no_duplicate_filenames(duplicate_filename)
 
-    def test_files(self):
+    def test_archive_collection(self):
+        """Test the collection.json with the archive file"""
+        self.load_collection()
         with zipfile.ZipFile(self.archive_path, "r") as zh:
             all_names = zh.namelist()
+        duplicate_filename, missing_filenames, _ = self.test_files(all_names)
+
+        self._ensure_no_missing_files(missing_filenames)
+        self._ensure_no_duplicate_filenames(duplicate_filename)
+
+    def _ensure_no_missing_files(self, files):
+        if not files:
+            return
+        raise ValueError(
+            "File(s) referenced in collection but missing:\n - " + "\n - ".join(files)
+        )
+
+    def _ensure_no_duplicate_filenames(self, files):
+        if not files:
+            return
+        raise ValueError(
+            "Files in collection are duplicate:\n - " + "\n - ".join(files)
+        )
+
+    def test_files(
+        self, available_filenames: Optional[List[str]] = None
+    ) -> Tuple[Set[str], List[str], List[str]]:
+        """Tests the file entries and returns:
+        duplicate_filename: set of URIs that are present 2+ times in the collection
+        missing_filenames: non-http URIs not in available_filenames, or entry titles
+        all_uris: list of all URIs found in the collection (duplicates included)
+        """
+
+        duplicate_filename = set()
+        missing_filenames = []
+        all_uris = []
 
-        missing_files = []
-        all_file_names = []
         for entry in self.json_collection:
             if not entry.get("files"):
                 continue
             for file in entry["files"]:
                 try:
-                    uri, filename = self.get_file_entry_from(file)
-                    all_file_names.append(filename)
-                    if not uri.startswith("http") and uri not in all_names:
-                        missing_files.append(uri)
+                    uri, _ = self.get_file_entry_from(file)
+                    all_uris.append(uri)
+                    if (
+                        not uri.startswith("http")
+                        and available_filenames
+                        and uri not in available_filenames
+                    ):
+                        missing_filenames.append(uri)
                 except ValueError:
-                    missing_files.append(entry["title"])
+                    missing_filenames.append(entry["title"])
 
-        duplicate_file_names = set(
-            [
-                filename
-                for filename in all_file_names
-                if all_file_names.count(filename) > 1
-            ]
+        duplicate_filename = set(
+            [filename for filename in all_uris if all_uris.count(filename) > 1]
         )
-
-        if missing_files:
-            raise ValueError(
-                "File(s) referenced in collection but missing:\n - "
-                + "\n - ".join(missing_files)
-            )
-
-        if duplicate_file_names:
-            raise ValueError(
-                "Files in collection are duplicate:\n - "
-                + "\n - ".join(duplicate_file_names)
-            )
+        return (duplicate_filename, missing_filenames, all_uris)
 
     def get_file_entry_from(self, file: Union[str, Dict[str, str]]) -> Tuple[str, str]:
         """Converting a file entity to the (uri, filename)"""