Skip to content

Commit

Permalink
feat: Non-EU packager codes download (#3364)
Browse files Browse the repository at this point in the history

Co-authored-by: Alexandre Marty <[email protected]>
Co-authored-by: Pierre Slamich <[email protected]>
  • Loading branch information
3 people authored Dec 14, 2022
1 parent 487c1bf commit b118d6e
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 22 deletions.
12 changes: 3 additions & 9 deletions scripts/packager-codes/non-eu/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Non-EU Packager Codes

A Python application to download and manage non-EU packager codes, as listed on the official page: https://webgate.ec.europa.eu/sanco/traces/output/non_eu_listsPerCountry_en.htm.
A Python application to download and manage non-EU packager codes, as listed on [the official page](https://webgate.ec.europa.eu/sanco/traces/output/non_eu_listsPerCountry_en.htm).

## Setup

Expand All @@ -24,17 +24,11 @@ Simply run `python packager_codes.py --help` to see the main help.
To download or update packager code files in the directory `packager_codes_data`:

```shell script
python packager_codes.py sync packager_codes_data
python packager_codes.py sync
```

To display the status of the locally downloaded files as compared to the remote:

```shell script
python packager_codes.py status packager_codes_data
```

To extract codes
```
find . -name "*.pdf"
python ./pdf_extraction.py
python packager_codes.py status
```
31 changes: 20 additions & 11 deletions scripts/packager-codes/non-eu/packager_codes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,19 @@

JSONObject = Mapping[str, Any]

SCRAPY_SPIDER_FILE_PATH = Path("non_eu_spider.py").absolute()


def scrape_document_info() -> List[JSONObject]:
logger.info("Scraping remote document information")
# Try importing scrapy to check for dependency
import scrapy # noqa
"""Scrape official non-EU packager codes page and extract documents information.
spider_file_path = Path("non_eu_spider.py").absolute()
Returns:
list of JSONObject: List of document information as dictionaries with the keys:
country_name, title, url, publication_date, file_path, section.
"""
logger.info("Scraping remote document information")
cmd = "scrapy runspider --output - --output-format json --loglevel WARN".split(" ")
cmd.append(str(spider_file_path))
cmd.append(str(SCRAPY_SPIDER_FILE_PATH))
cmd_res = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
return json.loads(cmd_res.stdout.decode())

Expand Down Expand Up @@ -86,10 +90,12 @@ def main():

@main.command(
help="Show local data status as compared to remote source.\n\n"
"DATA_DIR is the path to the local packager code data storage.",
no_args_is_help=True,
"DATA_DIR is the path to the local directory containing packager code data. "
"Defaults to 'packager_codes_data'.",
)
@click.argument(
"data_dir", type=click.Path(file_okay=False), default="packager_codes_data"
)
@click.argument("data_dir", type=click.Path(file_okay=False))
@click.option(
"--output-format",
"-f",
Expand All @@ -103,6 +109,7 @@ def status(data_dir: str, output_format: str) -> None:

local_meta = load_local_meta(data_dir)
scraped_info = scrape_document_info()
print(scraped_info)
doc_diff = document_info_diff(scraped_info, local_meta["document_info"])

if output_format == "json":
Expand All @@ -122,10 +129,12 @@ def status(data_dir: str, output_format: str) -> None:

@main.command(
help="Sync packager code files with remote.\n\n"
"DATA_DIR is the path of the local directory in which to sync data.",
no_args_is_help=True,
"DATA_DIR is the path of the local directory in which to sync data. Defaults to "
"'packager_codes_data'.",
)
@click.argument(
"data_dir", type=click.Path(file_okay=False), default="packager_codes_data"
)
@click.argument("data_dir", type=click.Path(file_okay=False))
def sync(data_dir: str) -> None:
data_dir = Path(data_dir)
data_dir.mkdir(exist_ok=True)
Expand Down
3 changes: 1 addition & 2 deletions scripts/packager-codes/non-eu/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# Requires Python >= 3.5
click
scrapy
pdfplumber
scrapy==2.0

0 comments on commit b118d6e

Please sign in to comment.