From 3f46c2b54618e92e05bd2c7635fb6e9343cdd960 Mon Sep 17 00:00:00 2001 From: Fledge Shiu Date: Wed, 19 Jul 2023 11:57:22 -0400 Subject: [PATCH] Refactor test_collections in scraper. --- nautiluszim/scraper.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/nautiluszim/scraper.py b/nautiluszim/scraper.py index ba42240..2c71816 100644 --- a/nautiluszim/scraper.py +++ b/nautiluszim/scraper.py @@ -339,6 +339,9 @@ def test_collection(self): nb_files = sum([len(i.get("files", [])) for i in self.json_collection]) logger.info(f"Collection loaded. {nb_items} items, {nb_files} files") + self.test_files() + + def test_files(self): with zipfile.ZipFile(self.archive_path, "r") as zh: all_names = zh.namelist() @@ -356,16 +359,20 @@ def test_collection(self): except ValueError: missing_files.append(entry["title"]) + duplicate_file_names = set( + [ + filename + for filename in all_file_names + if all_file_names.count(filename) > 1 + ] + ) + if missing_files: raise ValueError( "File(s) referenced in collection but missing:\n - " + "\n - ".join(missing_files) ) - duplicate_file_names = set([ - filename - for filename in all_file_names - if all_file_names.count(filename) > 1 - ]) + if duplicate_file_names: raise ValueError( "Files in collection are duplicate:\n - "