openfoodfacts · alexgarel · Dec 14, 2022 · May 4, 2020 · May 4, 2020 · May 4, 2020
@@ -1,6 +1,6 @@
 # Non-EU Packager Codes
 
-A Python application to download and manage non-EU packager codes, as listed on the official page: https://webgate.ec.europa.eu/sanco/traces/output/non_eu_listsPerCountry_en.htm.
+A Python application to download and manage non-EU packager codes, as listed on [the official page](https://webgate.ec.europa.eu/sanco/traces/output/non_eu_listsPerCountry_en.htm).
 
 ## Setup
 
@@ -24,17 +24,11 @@ Simply run `python packager_codes.py --help` to see the main help.
 To download or update packager code files in the default directory `packager_codes_data`:
 
 ```shell script
-python packager_codes.py sync packager_codes_data
+python packager_codes.py sync
 ```
 
 To display the status of the locally downloaded files as compared to the remote source:
 
 ````shell script
-python packager_codes.py status packager_codes_data
-````
-
-To extract codes
-````
-find . -name "*.pdf"
-python ./pdf_extraction.py    
+python packager_codes.py status
 ````
@@ -15,15 +15,19 @@
 
 JSONObject = Mapping[str, Any]
 
+SCRAPY_SPIDER_FILE_PATH = Path("non_eu_spider.py").absolute()
+
 
 def scrape_document_info() -> List[JSONObject]:
-    logger.info("Scraping remote document information")
-    # Try importing scrapy to check for dependency
-    import scrapy  # noqa
+    """Scrape official non-EU packager codes page and extract documents information.
 
-    spider_file_path = Path("non_eu_spider.py").absolute()
+    Returns:
+        list of JSONObject: List of document information as dictionaries with the keys:
+        country_name, title, url, publication_date, file_path, section.
+    """
+    logger.info("Scraping remote document information")
     cmd = "scrapy runspider --output - --output-format json --loglevel WARN".split(" ")
-    cmd.append(str(spider_file_path))
+    cmd.append(str(SCRAPY_SPIDER_FILE_PATH))
     cmd_res = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
     return json.loads(cmd_res.stdout.decode())
 
@@ -86,10 +90,12 @@ def main():
 
 @main.command(
     help="Show local data status as compared to remote source.\n\n"
-    "DATA_DIR is the path to the local packager code data storage.",
-    no_args_is_help=True,
+    "DATA_DIR is the path to the local directory containing packager code data. "
+    "Defaults to 'packager_codes_data'.",
+)
+@click.argument(
+    "data_dir", type=click.Path(file_okay=False), default="packager_codes_data"
 )
-@click.argument("data_dir", type=click.Path(file_okay=False))
 @click.option(
     "--output-format",
     "-f",
@@ -103,6 +109,7 @@ def status(data_dir: str, output_format: str) -> None:
 
     local_meta = load_local_meta(data_dir)
     scraped_info = scrape_document_info()
+    print(scraped_info)
     doc_diff = document_info_diff(scraped_info, local_meta["document_info"])
 
     if output_format == "json":
@@ -122,10 +129,12 @@ def status(data_dir: str, output_format: str) -> None:
 
 @main.command(
     help="Sync packager code files with remote.\n\n"
-    "DATA_DIR is the path of the local directory in which to sync data.",
-    no_args_is_help=True,
+    "DATA_DIR is the path of the local directory in which to sync data. Defaults to "
+    "'packager_codes_data'.",
+)
+@click.argument(
+    "data_dir", type=click.Path(file_okay=False), default="packager_codes_data"
 )
-@click.argument("data_dir", type=click.Path(file_okay=False))
 def sync(data_dir: str) -> None:
     data_dir = Path(data_dir)
     data_dir.mkdir(exist_ok=True)

@@ -1,4 +1,3 @@
 # Requires Python >= 3.5
 click
-scrapy
-pdfplumber
+scrapy==2.0