Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add urls support to collections. #62

Merged
merged 11 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
as of 1.3.0.

## [Unreleased]

### Added

- Add support for URLs in collection file entries. (#59)

## [1.1.1] - 2023-05-05

### Changed
Expand Down Expand Up @@ -41,9 +47,6 @@ as of 1.3.0.
- Replaced zimwriterfs with zimscraperlib –using libzim 7 without namespaces– (#41)
- Removed inline javascript to comply with some CSP (#34)

## [1.2.1] - 2022-08-03


## [1.0.7] - 2022-01-04

- removed inline JS in homepage (#34)
Expand Down
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,19 @@ At the moment, the JSON file needs to provide the following fields for each item
"description": "...",
"authors": "...",
"files": ["relative/path/to/file"]
}
},
{
"title": "...",
"description": "...",
"authors": "...",
"files": [
{
"archive-member": "01 BOOK for printing .pdf", // member name inside the archive (same as the simpler format); required unless `url` is set
"url": "http://books.com/310398120.pdf", // URL to download the file from; takes precedence over `archive-member`; required unless `archive-member` is set
"filename": "My book.pdf" // optional, filename to use in the ZIM, regardless of the original one
}
]
}
]
```

Expand Down
75 changes: 66 additions & 9 deletions nautiluszim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
import os
import pathlib
import shutil
import tempfile
import unicodedata
import uuid
import zipfile
from pathlib import Path
from typing import Optional
from typing import Dict, Optional, Tuple, Union

import jinja2
from zimscraperlib.constants import (
Expand Down Expand Up @@ -338,33 +339,88 @@ def test_collection(self):
nb_files = sum([len(i.get("files", [])) for i in self.json_collection])
logger.info(f"Collection loaded. {nb_items} items, {nb_files} files")

self.test_files()

def test_files(self):
    """Check that every file referenced by the collection can be resolved.

    Each item of an entry's ``files`` list is converted to a
    ``(uri, filename)`` pair with ``self.get_file_entry_from``. URIs that do
    not start with ``http`` must be member names of the ZIP archive at
    ``self.archive_path``. Target filenames must be unique across the whole
    collection.

    Raises:
        ValueError: if at least one referenced archive member is missing or
            an entry is invalid (reported first), or if several files share
            the same target filename.
    """
    from collections import Counter

    with zipfile.ZipFile(self.archive_path, "r") as zh:
        # a set gives constant-time membership tests for large archives
        archive_members = set(zh.namelist())

    missing_files = []
    all_file_names = []
    for entry in self.json_collection:
        if not entry.get("files"):
            continue
        for file in entry["files"]:
            try:
                uri, filename = self.get_file_entry_from(file)
            except ValueError:
                # invalid entry (neither archive member nor url):
                # report the owning item by its title
                missing_files.append(entry["title"])
                continue
            all_file_names.append(filename)
            # remote files are downloaded later; only local ones are checked
            if not uri.startswith("http") and uri not in archive_members:
                missing_files.append(uri)

    # Counter keeps first-seen order, so the report is deterministic
    duplicate_file_names = [
        filename
        for filename, occurrences in Counter(all_file_names).items()
        if occurrences > 1
    ]

    if missing_files:
        raise ValueError(
            "File(s) referenced in collection but missing:\n - "
            + "\n - ".join(missing_files)
        )

    if duplicate_file_names:
        raise ValueError(
            "Files in collection are duplicate:\n - "
            + "\n - ".join(duplicate_file_names)
        )

def get_file_entry_from(self, file: Union[str, Dict[str, str]]) -> Tuple[str, str]:
    """Convert a collection file entry to a ``(uri, filename)`` pair.

    Args:
        file: either a plain string (old format, a member name inside the
            archive) or a dict with the optional keys ``archive-member``,
            ``url`` and ``filename``. At least one of ``archive-member``
            and ``url`` must be set; ``url`` takes precedence.

    Returns:
        ``(uri, filename)`` where ``uri`` is the URL or the archive member
        name, and ``filename`` is the name to use for the file. A non-empty
        ``filename`` key overrides the name derived from the URI.

    Raises:
        ValueError: if a dict entry has neither ``archive-member`` nor
            ``url``.
    """
    from urllib.parse import urlsplit

    # Old format: pathname-only entries, the member name is also the filename
    if isinstance(file, str):
        return (file, file)

    archive_member = file.get("archive-member")
    url = file.get("url")
    if not archive_member and not url:
        raise ValueError("archive_member and url are both missing")

    if url:
        uri = url
        # Use only the path component so that query strings and fragments
        # do not end up in the filename; fall back to the raw URL's last
        # segment when the path carries no name (e.g. "http://host/").
        filename = Path(urlsplit(url).path).name or Path(url).name
    else:
        uri = archive_member
        filename = archive_member

    # An empty or null "filename" must not override the derived name
    return (uri, file.get("filename") or filename)

def process_collection_entries(self):
for entry in self.json_collection:
if not entry.get("files"):
continue

for relative_path in entry["files"]:
logger.debug(f"> {relative_path}")
for file in entry["files"]:
uri, filename = self.get_file_entry_from(file)
logger.debug(f"> {uri}")

if uri.startswith("http"):
fpath = pathlib.Path(
tempfile.NamedTemporaryFile(
dir=self.build_dir, delete=False
).name
)
save_large_file(uri, fpath)
else:
fpath = self.extract_to_fs(uri)

self.zim_creator.add_item_for(
path="files/" + normalized_path(relative_path),
fpath=self.extract_to_fs(relative_path),
path="files/" + normalized_path(filename),
fpath=fpath,
delete_fpath=True,
is_front=False,
)
Expand Down Expand Up @@ -443,7 +499,8 @@ def add_ui(self):
"dsc": document.get("description") or "",
"aut": document.get("authors") or "",
"fp": [
normalized_path(path) for path in document.get("files", [])
normalized_path(self.get_file_entry_from(file)[1])
for file in document.get("files", [])
],
}
)
Expand Down