Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add urls support to collections. #62

Merged
merged 11 commits into from
Jul 19, 2023
Merged
9 changes: 6 additions & 3 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
as of 1.3.0.

## [1.2.0] - 2023-07-19
FledgeXu marked this conversation as resolved.
Show resolved Hide resolved

### Added

- Add support for URLs in collection `files` entries.
FledgeXu marked this conversation as resolved.
Show resolved Hide resolved

## [1.1.1] - 2023-05-05

### Changed
Expand Down Expand Up @@ -41,9 +47,6 @@ as of 1.3.0.
- Replaced zimwriterfs with zimscraperlib –using libzim 7 without namespaces– (#41)
- Removed inline javascript to comply with some CSP (#34)

## [1.2.1] - 2022-08-03


## [1.0.7] - 2022-01-04

- removed inline JS in homepage (#34)
Expand Down
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,19 @@ At the moment, the JSON file needs to provide the following fields for each item
"description": "...",
"authors": "...",
"files": ["relative/path/to/file"]
}
},
{
"title": "...",
"description": "...",
"authors": "...",
"files": [
{
"archive-member": "01 BOOK for printing .pdf", // optional, name of the member inside the archive (equivalent to the plain-string form)
FledgeXu marked this conversation as resolved.
Show resolved Hide resolved
"url": "http://books.com/310398120.pdf", // optional, URL to download the file from; takes precedence over `archive-member` (at least one of the two is required)
"filename": "My book.pdf" // optional, filename to use in the ZIM, regardless of the original one
}
]
}
]
```

Expand Down
67 changes: 58 additions & 9 deletions nautiluszim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import uuid
import zipfile
from pathlib import Path
from typing import Optional
from typing import Dict, Optional, Tuple, Union

import jinja2
from zimscraperlib.constants import (
Expand Down Expand Up @@ -97,6 +97,7 @@ def __init__(
self.keep_build_dir = keep_build_dir

self.build_dir = self.output_dir.joinpath("build")
self.tempfiles_dir = self.build_dir.joinpath("build")
FledgeXu marked this conversation as resolved.
Show resolved Hide resolved

# set and record locale for translations
locale_name = (
Expand Down Expand Up @@ -199,6 +200,7 @@ def make_build_folder(self):

# create build folder
os.makedirs(self.build_dir, exist_ok=True)
os.makedirs(self.tempfiles_dir, exist_ok=True)
for fname in ("favicon.png", "main-logo.png"):
shutil.copy2(
self.templates_dir.joinpath(fname),
Expand Down Expand Up @@ -342,29 +344,75 @@ def test_collection(self):
all_names = zh.namelist()

missing_files = []
saved_file_paths = []
FledgeXu marked this conversation as resolved.
Show resolved Hide resolved
for entry in self.json_collection:
if not entry.get("files"):
continue
for relative_path in entry["files"]:
if relative_path not in all_names:
missing_files.append(relative_path)
for file in entry["files"]:
try:
path = self.get_file_entry_from(file)[0]
FledgeXu marked this conversation as resolved.
Show resolved Hide resolved
saved_file_paths.append(path)
if not path.startswith("http") and path not in all_names:
missing_files.append(path)
except ValueError:
missing_files.append(entry["title"])

if missing_files:
raise ValueError(
"File(s) referenced in collection but missing:\n - "
+ "\n - ".join(missing_files)
)
duplicate_file_paths = [
path for path in saved_file_paths if saved_file_paths.count(path) > 1
]
if duplicate_file_paths:
raise ValueError(
"Files in collection are duplicate:\n - "
+ "\n - ".join(duplicate_file_paths)
)

def get_file_entry_from(
    self, file: Union[str, Dict[str, str]]
) -> Tuple[str, str]:
    """Convert a collection ``files`` entry into a ``(path, filename)`` pair.

    Args:
        file: Either a plain string (old format: the member name inside the
            archive) or a dict with the optional keys ``"archive-member"``,
            ``"url"`` and ``"filename"``.

    Returns:
        ``(path, filename)`` where ``path`` is the URL when one is given
        (it takes precedence over ``"archive-member"``) or the archive
        member name otherwise, and ``filename`` is the explicit
        ``"filename"`` value when provided, else the last path component
        of the URL or the archive member name.

    Raises:
        ValueError: if neither ``"archive-member"`` nor ``"url"`` carries a
            usable (non-empty) value.
    """
    # Old format: pathname-only entries are both the source and the name.
    if isinstance(file, str):
        return (file, file)

    archive_member = file.get("archive-member")
    url = file.get("url")

    # Treat empty/null values like missing ones: returning ``None`` as the
    # path would make callers fail later on ``path.startswith(...)`` with an
    # AttributeError instead of the ValueError they handle.
    if not archive_member and not url:
        raise ValueError("archive_member and url are both missing")

    if url:
        path = url
        filename = Path(url).name
    else:
        path = archive_member
        filename = archive_member

    # An explicit, non-empty "filename" overrides the derived one.
    filename = file.get("filename") or filename
    return (path, filename)

def process_collection_entries(self):
for entry in self.json_collection:
if not entry.get("files"):
continue

for relative_path in entry["files"]:
logger.debug(f"> {relative_path}")
for file in entry["files"]:
file_entity = self.get_file_entry_from(file)
FledgeXu marked this conversation as resolved.
Show resolved Hide resolved
path = file_entity[0]
filename = file_entity[1]
logger.debug(f"> {path}")

if path.startswith("http"):
fpath = self.tempfiles_dir.joinpath(filename).resolve()
FledgeXu marked this conversation as resolved.
Show resolved Hide resolved
save_large_file(path, fpath)
else:
fpath = self.extract_to_fs(path)

self.zim_creator.add_item_for(
path="files/" + normalized_path(relative_path),
fpath=self.extract_to_fs(relative_path),
path="files/" + normalized_path(filename),
fpath=fpath,
delete_fpath=True,
is_front=False,
)
Expand Down Expand Up @@ -443,7 +491,8 @@ def add_ui(self):
"dsc": document.get("description") or "",
"aut": document.get("authors") or "",
"fp": [
normalized_path(path) for path in document.get("files", [])
normalized_path(self.get_file_entry_from(file)[1])
for file in document.get("files", [])
],
}
)
Expand Down