Merge pull request #62 from openzim/add-urls-support

openzim · Jul 19, 2023 · 15ac800 · 15ac800
2 parents 0cdede1 + 3f46c2b
commit 15ac800
Show file tree

Hide file tree

Showing 3 changed files with 85 additions and 13 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
 as of 1.3.0.
 
+## [Unreleased]
+
+### Added
+
+- Add support for URLs in collection `files` entries. (#59)
+
 ## [1.1.1] - 2023-05-05
 
 ### Changed
@@ -41,9 +47,6 @@ as of 1.3.0.
 - Replaced zimwriterfs with zimscraperlib –using libzim 7 without namespaces– (#41)
 - Removed inline javascript to comply with some CSP (#34)
 
-## [1.2.1] - 2022-08-03
-
-
 ## [1.0.7] - 2022-01-04
 
 - removed inline JS in homepage (#34)

diff --git a/README.md b/README.md
@@ -40,7 +40,19 @@ At the moment, the JSON file needs to provide the following fields for each item
         "description": "...",
         "authors": "...",
         "files": ["relative/path/to/file"]
-     }
+    },
+    {
+        "title": "...",
+        "description": "...",
+        "authors": "...",
+        "files": [
+            {
+                "archive-member": "01 BOOK for printing .pdf",  // member name inside archive (same as simpler format); required unless `url` is set
+                "url": "http://books.com/310398120.pdf",  // URL to download the file from; takes precedence over `archive-member`; required unless `archive-member` is set
+                "filename": "My book.pdf"  // optional, filename to use in ZIM, regardless of original one
+            }
+        ]
+    }
 ]
 ```
 

diff --git a/nautiluszim/scraper.py b/nautiluszim/scraper.py
@@ -8,11 +8,12 @@
 import os
 import pathlib
 import shutil
+import tempfile
 import unicodedata
 import uuid
 import zipfile
 from pathlib import Path
-from typing import Optional
+from typing import Dict, Optional, Tuple, Union
 
 import jinja2
 from zimscraperlib.constants import (
@@ -338,33 +339,88 @@ def test_collection(self):
         nb_files = sum([len(i.get("files", [])) for i in self.json_collection])
         logger.info(f"Collection loaded. {nb_items} items, {nb_files} files")
 
+        self.test_files()
+
+    def test_files(self):
         with zipfile.ZipFile(self.archive_path, "r") as zh:
             all_names = zh.namelist()
 
         missing_files = []
+        all_file_names = []
         for entry in self.json_collection:
             if not entry.get("files"):
                 continue
-            for relative_path in entry["files"]:
-                if relative_path not in all_names:
-                    missing_files.append(relative_path)
+            for file in entry["files"]:
+                try:
+                    uri, filename = self.get_file_entry_from(file)
+                    all_file_names.append(filename)
+                    if not uri.startswith("http") and uri not in all_names:
+                        missing_files.append(uri)
+                except ValueError:
+                    missing_files.append(entry["title"])
+
+        duplicate_file_names = set(
+            [
+                filename
+                for filename in all_file_names
+                if all_file_names.count(filename) > 1
+            ]
+        )
 
         if missing_files:
             raise ValueError(
                 "File(s) referenced in collection but missing:\n - "
                 + "\n - ".join(missing_files)
             )
 
+        if duplicate_file_names:
+            raise ValueError(
+                "Files in collection are duplicate:\n - "
+                + "\n - ".join(duplicate_file_names)
+            )
+
+    def get_file_entry_from(self, file: Union[str, Dict[str, str]]) -> Tuple[str, str]:
+        """Converting a file entity to the (uri, filename)"""
+        # Old-format entries are plain path strings: the path is both uri and filename
+        if isinstance(file, str):
+            return (file, file)
+        archive_member = file.get("archive-member", None)
+        url = file.get("url", None)
+        uri = None
+        filename = None
+        if not archive_member and not url:
+            raise ValueError("archive_member and url are both missing")
+        if url:
+            uri = url
+            filename = Path(url).name
+        else:
+            uri = archive_member
+            filename = archive_member
+        filename = file.get("filename", filename)
+        return (uri, filename)
+
     def process_collection_entries(self):
         for entry in self.json_collection:
             if not entry.get("files"):
                 continue
 
-            for relative_path in entry["files"]:
-                logger.debug(f"> {relative_path}")
+            for file in entry["files"]:
+                uri, filename = self.get_file_entry_from(file)
+                logger.debug(f"> {uri}")
+
+                if uri.startswith("http"):
+                    fpath = pathlib.Path(
+                        tempfile.NamedTemporaryFile(
+                            dir=self.build_dir, delete=False
+                        ).name
+                    )
+                    save_large_file(uri, fpath)
+                else:
+                    fpath = self.extract_to_fs(uri)
+
                 self.zim_creator.add_item_for(
-                    path="files/" + normalized_path(relative_path),
-                    fpath=self.extract_to_fs(relative_path),
+                    path="files/" + normalized_path(filename),
+                    fpath=fpath,
                     delete_fpath=True,
                     is_front=False,
                 )
@@ -443,7 +499,8 @@ def add_ui(self):
                         "dsc": document.get("description") or "",
                         "aut": document.get("authors") or "",
                         "fp": [
-                            normalized_path(path) for path in document.get("files", [])
+                            normalized_path(self.get_file_entry_from(file)[1])
+                            for file in document.get("files", [])
                         ],
                     }
                 )