Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: packaging add Denmark #10388

Merged
merged 1 commit into from
Jun 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions lib/ProductOpener/Display.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3964,6 +3964,15 @@ HTML
;
}

if ($packager_codes{$canon_tagid}{cc} eq 'dk') {
$description .= <<HTML
<p>$packager_codes{$canon_tagid}{name}<br>
$packager_codes{$canon_tagid}{address} (Denmark)
</p>
HTML
;
}

if ($packager_codes{$canon_tagid}{cc} eq 'es') {
# Razón Social;Provincia/Localidad
$description .= <<HTML
Expand Down
1 change: 1 addition & 0 deletions lib/ProductOpener/PackagerCodes.pm
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ my %local_ec = (
CY => "EK",
CZ => "ES",
DE => "EG",
DK => "EF",
EE => "EÜ",
ES => "CE",
FI => "EY",
Expand Down
1,477 changes: 1,477 additions & 0 deletions packager-codes/DK-merge-UTF-8.csv

Large diffs are not rendered by default.

Binary file modified packager-codes/geocode_addresses.sto
Binary file not shown.
Binary file modified packager-codes/packager_codes.sto
Binary file not shown.
100 changes: 100 additions & 0 deletions scripts/packager-codes/dk_packagers_refresh_part1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
'''
This file is part of Product Opener.
Product Opener
Copyright (C) 2011-2023 Association Open Food Facts
Contact: [email protected]
Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
Product Opener is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

# PREREQUISITES
python3
update the api_key variable for geocode.maps.co (free account) in dk_packagers_refresh_part2.py
from the government website,
download an Excel file of all establishments:
https://foedevarestyrelsen.dk/kost-og-foedevarer/start-og-drift-af-foedevarevirksomhed/autorisation-og-registrering/registrerede-og-autorisede-foedevarevirksomheder

# INSTALLATION
## install virtual environment
sudo apt install python3.11-venv
python3 -m venv venv
source venv/bin/activate

## install needed packages
pip install pandas
pip install openpyxl
pip install xlsx2csv
pip install requests

# RUN
python3 dk_packagers_refresh_part1.py

This creates a CSV file DK-merge-UTF-8_no_coord.csv

python3 dk_packagers_refresh_part2.py

Note that the API sometimes returns status code 500 or other errors; try rerunning before debugging.
The file dk_packagers_refresh_part2_index_tmp.txt is used during processing, to resume processing only.

# POSTPROCESSING
- deactivate the virtual environment:
deactivate
- delete all temporary files
- update .sto file
'''

import pandas as pd


def read_input_file(file_name: str) -> pd.DataFrame:
    """Read the Danish packager-codes Excel export and return a cleaned DataFrame.

    The workbook has one tab per establishment category; the first two
    tabs are skipped (they are not data tabs). From each remaining tab
    only the first three columns are kept: approval code, name, address.

    :param file_name: path to the Excel file downloaded from the
        Danish food authority website
    :return: DataFrame with columns ['code', 'name', 'address'],
        one row per unique code, codes formatted as "DK <number> EF"
    """
    # could not use polars to read excel, used pandas instead
    # ignore two first tabs (sheet indices 0 and 1)
    sheet_ids = list(range(2, 21))
    # skip first few rows (~page header)
    # NOTE: was previously hard-coded to one filename, ignoring file_name
    sheets = pd.read_excel(file_name, sheet_name=sheet_ids, skiprows=5)
    # take only first three columns (code, name, address)
    trimmed = [sheet.iloc[:, :3] for sheet in sheets.values()]
    # combine all tabs into a single frame
    df = pd.concat(trimmed)

    # rename columns
    df.columns = ['code', 'name', 'address']

    # one approval number can have more than a single category;
    # this leads to fully-null rows in the df
    df.dropna(how='all', inplace=True)

    # some rows are missing the approval number
    df.dropna(subset=['code'], inplace=True)

    # some approval numbers were parsed as floats (e.g. 60.0);
    # removesuffix strips only a trailing ".0" — str.replace would also
    # corrupt a code containing ".0" anywhere inside it
    df['code'] = df['code'].apply(lambda x: str(x).removesuffix('.0'))

    # append prefix "DK" and suffix "EF"
    # to format the packaging codes as "DK <number> EF"
    df['code'] = df['code'].str.strip()
    df['code'] = df['code'].apply(lambda x: f"DK {x} EF")

    # remove duplicates
    df.drop_duplicates(subset="code", keep="first", inplace=True)

    return df



if __name__ == "__main__":
    # workbook downloaded from the Danish food authority website
    source_xlsx = 'Autoriserede_Foedevarevirksomheder_Excel(1).xlsx'
    # intermediate CSV without coordinates; part2 adds lat/lng
    target_csv = 'DK-merge-UTF-8_no_coord.csv'

    packagers = read_input_file(source_xlsx)
    packagers.to_csv(target_csv, sep=';', index=False)
198 changes: 198 additions & 0 deletions scripts/packager-codes/dk_packagers_refresh_part2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
'''
This file is part of Product Opener.
Product Opener
Copyright (C) 2011-2023 Association Open Food Facts
Contact: [email protected]
Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
Product Opener is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''

import csv
import requests
import sys
from time import sleep


def extract_address_components(address_to_convert):
    """Split a Danish address string into (street, postal_code, town).

    Addresses are expected as comma-separated parts, typically
    "street number, postcode town". Any component that cannot be
    identified is returned as None.

    :param address_to_convert: raw address string from the source CSV
    :return: tuple (street, postal_code, town), each a str or None
    """
    address_split = [part.strip() for part in address_to_convert.split(',')]

    street = None
    postal_code = None
    town = None

    if address_split == [""]:
        print("warning, missing address")
    elif len(address_split) < 2:
        print("info, address without comma")
    elif len(address_split) == 2:
        print("info, exactly 1 comma", address_split)
        street, post_and_town = address_split[0], address_split[1]
        # sometimes there is no postcode, just the town
        if post_and_town[:4].isdigit():
            postal_code = post_and_town.split()[0]
            town = " ".join(post_and_town.split()[1:])
        else:
            print("warning: could not extract postal code, set second element as town", address_split)
            town = post_and_town
    else:
        print("info, more than 1 comma", address_split)

    # derive the street only if it was not already found above;
    # each access is bounds-checked so short splits (0 or 1 comma)
    # can no longer raise IndexError
    if street is None:
        # first element of the address split by comma contains a digit
        if any(char.isdigit() for char in address_split[0]):
            print("info, first element")
            street = address_split[0]
        # street+number can also be the second or third element,
        # provided that element does not start with a 4-digit postcode
        elif (len(address_split) > 1
                and any(char.isdigit() for char in address_split[1])
                and not address_split[1][:4].isdigit()):
            print("info, second element")
            street = address_split[1]
        elif (len(address_split) > 2
                and any(char.isdigit() for char in address_split[2])
                and not address_split[2][:4].isdigit()):
            print("info, third element")
            street = address_split[2]
        else:
            print("warning, could not extract street", address_split)

    # look for "postcode town", scanning parts from the end
    for address_chunk in reversed(address_split):
        if address_chunk[:4].isdigit():
            print("info, at the beginning")
            postal_code = address_chunk[:4]
            # strip: the town follows the 4-digit postcode after a space
            town = address_chunk[4:].strip()
            break

    print(f"street: {street}, postal_code: {postal_code}, town: {town}")
    return street, postal_code, town


def convert_address_to_lat_lng(address_to_convert: str) -> list:
    """Geocode an address via geocode.maps.co and return [lat, lng].

    Tries up to three increasingly permissive queries:
      1. street + town + postcode + country,
      2. the same without the street (street names often fail to match),
      3. the same without the country (the address can be in Greenland).
    Exits the process on request errors or when all queries come back empty.

    Relies on the module-level names `country_name`, `country_code` and
    `api_key` assigned in the __main__ section.

    :param address_to_convert: raw address string
    :return: [lat, lng] as strings, as returned by the API
    """
    # free plan: 1 request per second
    sleep(1)

    print(f"\ninfo, address_to_convert: {address_to_convert}")

    street, postal_code, town = extract_address_components(address_to_convert)

    url = "https://geocode.maps.co/search?"
    if street:
        url += f"street={street}&"
    if town:
        url += f"city={town}&"
    if postal_code:
        url += f"postal_code={postal_code}&"
    url += f"country={country_name}&country_code={country_code}&api_key={api_key}"

    try:
        print(f"url_1 {url}")
        # timeout so an unresponsive API cannot hang the whole run
        response = requests.get(url, timeout=30)
        data = response.json()
        if data != []:
            lat, lng = data[0]['lat'], data[0]['lon']
        else:
            sleep(1)
            old_street = street
            old_town = town

            # drop a trailing 1-2 letter district suffix from the town
            # (example: "Aarhus C"); guarded — town can be None
            if town:
                town_split = town.split()
                if len(town_split[-1]) in (1, 2):
                    town = " ".join(town_split[:-1])
                    print("info, drop suffix of town")

                if old_town != town:
                    url = url.replace(old_town, town)

            # second attempt: without the street
            url_2 = url.replace(f"street={old_street}&", "")
            print("info, drop street")

            try:
                print("url_2", url_2)
                response = requests.get(url_2, timeout=30)
                data = response.json()
                if data != []:
                    lat, lng = data[0]['lat'], data[0]['lon']
                else:
                    sleep(1)

                    # can be in Greenland
                    # example: Fiskervej B 99, Postboks 69, 3921 Narsaq
                    url_3 = url_2.replace(f"country={country_name}&country_code={country_code}&", "")

                    try:
                        print("url_3", url_3)
                        response = requests.get(url_3, timeout=30)
                        data = response.json()
                        if data != []:
                            lat, lng = data[0]['lat'], data[0]['lon']
                        else:
                            print(f'Empty response for: {address_to_convert}" {url_3}')
                            sys.exit(1)
                    except (requests.exceptions.RequestException, KeyError, IndexError) as e:
                        print(f"Error: {e}, url: {url_3}")
                        sys.exit(1)
            except (requests.exceptions.RequestException, KeyError, IndexError) as e:
                print(f"Error: {e}, url: {url_2}")
                sys.exit(1)
    except (requests.exceptions.RequestException, KeyError, IndexError) as e:
        print(f"Error: {e}, url: {url}")
        sys.exit(1)

    return [lat, lng]



if __name__ == "__main__":
    source_file = 'DK-merge-UTF-8_no_coord.csv'
    target_file = "DK-merge-UTF-8.csv"
    index_last_line_processed = 'dk_packagers_refresh_part2_index_tmp.txt'
    api_key = ""  # TODO remove
    country_name = "Denmark"
    country_code = "DK"

    # resume support: the temp file stores the index of the last row
    # already geocoded by a previous (interrupted) run
    try:
        with open(index_last_line_processed, 'r') as f:
            index = int(f.read())
    except FileNotFoundError:
        print(f"info, create temporary file {index_last_line_processed}")
        index = 0

    print(f"info, index is set to {index}")

    line_num = 0
    with open(source_file, mode='r', newline='') as csv_file_read:
        # append mode so rows written by an earlier run are kept
        with open(target_file, mode='a', newline='') as csv_file_write:
            reader = csv.reader(csv_file_read, delimiter=";")
            writer = csv.writer(csv_file_write, delimiter=";")
            for row in reader:
                # continue a previous run: skip rows already processed
                if line_num <= index and index != 0:
                    line_num += 1
                    continue
                # header row: add the two new column names
                elif line_num == 0:
                    row += ['lat', 'lng']
                else:
                    # row[2] is the address column
                    row += convert_address_to_lat_lng(row[2])

                writer.writerow(row)

                # persist progress after every row so the run can resume
                with open(index_last_line_processed, 'w') as f:
                    f.write(str(line_num))
                line_num += 1
2 changes: 2 additions & 0 deletions scripts/update_packager_codes.pl
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ sub normalize_code {
"$code" when 'cy';
"$code" when 'cz';
"DE $code EC" when 'de';
"$code" when 'dk';
"EE $code EC" when 'ee';
"ES $code CE" when 'es';
"FI $code EC" when 'fi';
Expand Down Expand Up @@ -170,6 +171,7 @@ sub normalize_local_authority {
cy => 'code',
cz => 'code',
de => 'code',
dk => 'code',
ee => 'tunnusnumber',
es => 'n_rgseaa',
fi => 'numero',
Expand Down
Loading