Skip to content

Commit

Permalink
Merge pull request #62 from openzim/add-urls-support
Browse files Browse the repository at this point in the history
  • Loading branch information
rgaudin authored Jul 19, 2023
2 parents 0cdede1 + 3f46c2b commit 15ac800
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 13 deletions.
9 changes: 6 additions & 3 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
as of 1.3.0.

## [Unreleased]

### Added

- Add support for downloading collection files from URLs. (#59)

## [1.1.1] - 2023-05-05

### Changed
Expand Down Expand Up @@ -41,9 +47,6 @@ as of 1.3.0.
- Replaced zimwriterfs with zimscraperlib –using libzim 7 without namespaces– (#41)
- Removed inline javascript to comply with some CSP (#34)

## [1.2.1] - 2022-08-03


## [1.0.7] - 2022-01-04

- removed inline JS in homepage (#34)
Expand Down
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,19 @@ At the moment, the JSON file needs to provide the following fields for each item
"description": "...",
"authors": "...",
"files": ["relative/path/to/file"]
}
},
{
"title": "...",
"description": "...",
"authors": "...",
"files": [
{
"archive-member": "01 BOOK for printing .pdf", // member name inside the archive (same as the simpler format); required unless `url` is set
"url": "http://books.com/310398120.pdf", // URL to download the file from; takes precedence over `archive-member`; required unless `archive-member` is set
"filename": "My book.pdf" // optional, filename to use in the ZIM, regardless of the original one
}
]
}
]
```

Expand Down
75 changes: 66 additions & 9 deletions nautiluszim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
import os
import pathlib
import shutil
import tempfile
import unicodedata
import uuid
import zipfile
from pathlib import Path
from typing import Optional
from typing import Dict, Optional, Tuple, Union

import jinja2
from zimscraperlib.constants import (
Expand Down Expand Up @@ -338,33 +339,88 @@ def test_collection(self):
nb_files = sum([len(i.get("files", [])) for i in self.json_collection])
logger.info(f"Collection loaded. {nb_items} items, {nb_files} files")

self.test_files()

def test_files(self):
    """Check that every file referenced by the collection can be used.

    Entries whose URI is an HTTP(S) URL are not looked up in the archive;
    every other URI must be a member of the ZIP at ``self.archive_path``.
    The filenames returned by ``get_file_entry_from`` must be unique across
    the whole collection.

    Raises:
        ValueError: if an archive member is missing or a file entry is
            invalid (the title of its collection entry is reported then),
            or if several files resolve to the same filename.
    """
    from collections import Counter

    with zipfile.ZipFile(self.archive_path, "r") as zh:
        # a set gives O(1) membership tests in the loop below
        all_names = set(zh.namelist())

    missing_files = []
    all_file_names = []
    for entry in self.json_collection:
        if not entry.get("files"):
            continue
        for file in entry["files"]:
            try:
                uri, filename = self.get_file_entry_from(file)
            except ValueError:
                # entry has neither an archive member nor a URL
                missing_files.append(entry["title"])
                continue
            all_file_names.append(filename)
            # match the full scheme so that an archive member merely named
            # "http…" is still checked against the archive
            is_remote = uri.startswith(("http://", "https://"))
            if not is_remote and uri not in all_names:
                missing_files.append(uri)

    # sorted so the error message is deterministic
    duplicate_file_names = sorted(
        name for name, count in Counter(all_file_names).items() if count > 1
    )

    if missing_files:
        raise ValueError(
            "File(s) referenced in collection but missing:\n - "
            + "\n - ".join(missing_files)
        )

    if duplicate_file_names:
        raise ValueError(
            "Files in collection are duplicate:\n - "
            + "\n - ".join(duplicate_file_names)
        )

def get_file_entry_from(self, file: Union[str, Dict[str, str]]) -> Tuple[str, str]:
    """Convert a collection file entry to a ``(uri, filename)`` tuple.

    Args:
        file: either a plain string (old format: path of the member inside
            the archive) or a dict with the optional keys ``archive-member``,
            ``url`` and ``filename``. ``url`` takes precedence over
            ``archive-member``.

    Returns:
        ``(uri, filename)`` where ``uri`` is the URL or the archive member
        name, and ``filename`` is the explicit ``filename`` if a non-empty
        one is given, otherwise a name derived from ``uri``.

    Raises:
        ValueError: if the dict provides neither ``archive-member`` nor
            ``url`` (or if the URL cannot be parsed).
    """
    from pathlib import PurePosixPath
    from urllib.parse import urlsplit

    # Old format: pathname-only entry
    if isinstance(file, str):
        return (file, file)

    archive_member = file.get("archive-member")
    url = file.get("url")
    if not archive_member and not url:
        raise ValueError("archive-member and url are both missing")

    if url:
        uri = url
        # Use only the path component so the query string and fragment do
        # not leak into the filename; fall back to the raw last segment
        # when the URL path carries no name at all.
        filename = PurePosixPath(urlsplit(url).path).name or Path(url).name
    else:
        uri = filename = archive_member

    # An empty/None "filename" must not override the derived name
    return (uri, file.get("filename") or filename)

def process_collection_entries(self):
    """Add every file referenced by the collection to the ZIM.

    For each file entry, ``get_file_entry_from`` yields its URI and target
    filename. HTTP(S) URIs are fetched with ``save_large_file`` into a fresh
    temporary file inside ``self.build_dir``; any other URI is handed to
    ``self.extract_to_fs`` (presumably extracting that archive member and
    returning its path — confirm against its definition). The resulting
    file is added under ``files/<normalized filename>`` with
    ``delete_fpath=True``.

    Raises:
        ValueError: propagated from ``get_file_entry_from`` for an entry
            with neither an archive member nor a URL.
    """
    for entry in self.json_collection:
        if not entry.get("files"):
            continue

        for file in entry["files"]:
            uri, filename = self.get_file_entry_from(file)
            logger.debug(f"> {uri}")

            # match the full scheme so that an archive member merely named
            # "http…" is not mistaken for a URL
            if uri.startswith(("http://", "https://")):
                # Reserve a unique path, then close the handle before the
                # download writes to it (delete=False keeps the file).
                with tempfile.NamedTemporaryFile(
                    dir=self.build_dir, delete=False
                ) as tmp:
                    fpath = pathlib.Path(tmp.name)
                save_large_file(uri, fpath)
            else:
                fpath = self.extract_to_fs(uri)

            self.zim_creator.add_item_for(
                path="files/" + normalized_path(filename),
                fpath=fpath,
                delete_fpath=True,
                is_front=False,
            )
Expand Down Expand Up @@ -443,7 +499,8 @@ def add_ui(self):
"dsc": document.get("description") or "",
"aut": document.get("authors") or "",
"fp": [
normalized_path(path) for path in document.get("files", [])
normalized_path(self.get_file_entry_from(file)[1])
for file in document.get("files", [])
],
}
)
Expand Down

0 comments on commit 15ac800

Please sign in to comment.