openfoodfacts · stephanegigandet · Jun 3, 2024 · May 26, 2024 · May 26, 2024 · May 26, 2024
@@ -3955,6 +3955,15 @@ HTML
 						;
 				}
 
+				if ($packager_codes{$canon_tagid}{cc} eq 'cz') {
+					$description .= <<HTML
+<p>$packager_codes{$canon_tagid}{name}<br>
+$packager_codes{$canon_tagid}{address} (Czech)
+</p>
+HTML
+						;
+				}
+
 				if ($packager_codes{$canon_tagid}{cc} eq 'es') {
 					# Razón Social;Provincia/Localidad
 					$description .= <<HTML

@@ -186,6 +186,7 @@ sub normalize_packager_codes ($codes) {
 my %local_ec = (
 	AT => "EG",
 	CY => "EK",
+	CZ => "ES",
 	DE => "EG",
 	EE => "EÜ",
 	ES => "CE",

@@ -0,0 +1,99 @@
+'''
+This file is part of Product Opener.
+Product Opener
+Copyright (C) 2011-2023 Association Open Food Facts
+Contact: [email protected]
+Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
+Product Opener is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+# PREREQUISITES
+python3
+set the API key for geocode.maps.co (free account) in the api_key variable of xx_packagers_refresh_part2.py
+from the government website,
+    download a csv of all establishments:
+    https://en.svscr.cz/registered-subjects/lists-of-establishments/
+
+# INSTALLATION
+## install virtual environment
+sudo apt install python3.11-venv
+python3 -m venv venv
+source venv/bin/activate
+
+## install needed packages
+pip install pyarrow # to convert to polars
+pip install polars
+pip install requests
+
+# RUN
+python3 xx_packagers_refresh_part1.py
+
+This creates the csv file CZ-merge-UTF-8_no_coord.csv
+
+python3 xx_packagers_refresh_part2.py 
+
+Note that the API sometimes returns status code 500 or another error; try rerunning the script before debugging it.
+The file cz_packagers_refresh_part2_index_tmp.txt is written during processing and is only used to resume an interrupted run.
+
+# POSTPROCESSING
+- deactivate the virtual environment:
+deactivate
+- delete all temporary files
+- update .sto file
+'''
+
+import os
+import polars as pl
+
+
def read_input_file() -> pl.dataframe.frame.DataFrame:
    """Merge every input CSV of the current directory into one DataFrame.

    Every ``*.csv`` file found in the current directory is read, except the
    one named by the module-level ``output_file`` (set in the ``__main__``
    section), so that a previous result is not ingested again.  Only the first
    three columns of each file are kept; they are renamed to ``code``,
    ``name`` and ``address`` and the suffix " ES" is appended to ``code``.

    Returns:
        The concatenation of all per-file DataFrames.

    Exits:
        Calls ``sys.exit(1)`` when a file cannot be processed or when no
        input file is found.
    """
    # Local import: this script does not import sys at file level, so the
    # original error path failed with a NameError instead of exiting.
    import sys

    current_directory = '.'
    new_column_names = ['code', 'name', 'address']

    dfs = []
    # Sorted so that the merge order (and therefore which duplicate is seen
    # first by the caller) does not depend on the file system's listing order.
    for filename in sorted(os.listdir(current_directory)):
        if not filename.endswith('.csv') or filename == output_file:
            continue
        print(filename)

        file_path = os.path.join(current_directory, filename)

        try:
            df = pl.read_csv(file_path)

            df = df.select(df.columns[0:3])
            df = df.rename(dict(zip(df.columns, new_column_names)))

            # append suffix ES at the end of the packaging codes
            df = df.with_columns((pl.col('code') + " ES").alias('code'))

            dfs.append(df)

        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            sys.exit(1)

    if not dfs:
        # Nothing to concatenate: report it explicitly instead of letting
        # the concatenation of an empty list fail with an opaque error.
        print(f"Error: no input csv file found in {os.path.abspath(current_directory)}")
        sys.exit(1)

    # Concatenate all DataFrames into a single DataFrame
    return pl.concat(dfs)
+
+
+
if __name__ == "__main__":
    # Module-level name: read_input_file() reads it to skip the merged file
    # when it scans the directory for inputs.
    output_file = 'CZ-merge-UTF-8_no_coord.csv'

    establishments = read_input_file()

    # Keep a single row per code (first name and address seen), ordered by code.
    deduplicated = (
        establishments.lazy()
        .group_by('code')
        .agg(pl.first('name'), pl.first('address'))
        .sort('code')
        .collect()
    )

    deduplicated.write_csv(output_file, separator=';')
@@ -0,0 +1,264 @@
+'''
+This file is part of Product Opener.
+Product Opener
+Copyright (C) 2011-2023 Association Open Food Facts
+Contact: [email protected]
+Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
+Product Opener is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <https://www.gnu.org/licenses/>.
+'''
+
+import csv
+import re
+import requests
+import sys
+from time import sleep
+
+
def _starts_with_postcode(chunk):
    """Tell whether ``chunk`` begins like a postcode ("28163" or "281 63")."""
    return chunk[:5].isdigit() or chunk[:6].replace(" ", "").isdigit()


def extract_address_components(address_to_convert):
    """Split a free-form address into street, postal code and town.

    The address is split on commas and handled according to the number of
    parts:

    * empty address: nothing is extracted;
    * one part: either "postcode town" or a town name on its own, in which
      case leading all-lowercase words are removed
      ("průmyslová zóna Kožlany" -> "Kožlany");
    * two parts: "street, postcode town" (the postcode is optional);
    * more parts: the street is the first of the three leading parts that
      contains a digit without looking like a postcode, and the postcode/town
      are taken from the last part that starts or ends with a postcode.

    Args:
        address_to_convert: the raw address string.

    Returns:
        A tuple ``(street, postal_code, town)``; each element is a string or
        ``None`` (``town`` may also be an empty string) when it could not be
        extracted.
    """
    address_split = [x.strip() for x in address_to_convert.split(',')]

    street = None
    postal_code = None
    town = None

    if address_split == [""]:
        print("warning, missing address")
    elif len(address_split) < 2:
        single = address_split[0]
        # can be postal code and town without comma
        # example: 281 63 Přehvozdí 62/2
        if single[:5].isdigit():
            print("warning, address without street name (only postcode and town):", address_split)
            words = single.split()
            postal_code = words[0]
            town = " ".join(words[1:])
        elif single[:6].replace(" ", "").isdigit():
            print("warning, address without street name (only postcode and town):", address_split)
            words = single.split()
            postal_code = " ".join(words[:2])
            town = " ".join(words[2:])
        else:
            print("warning, address is missing comma, set as town:", address_split)
            # assume it is town name only
            # remove prefix words that contain no uppercase letter
            # Example: průmyslová zóna Kožlany
            # Nothing is kept when no word has an uppercase letter
            # example: parc.č. 164 - zemědělský areál
            updated_town = []
            found_title_word = False
            for w in single.split():
                if w.lower() != w:
                    # from the first capitalised word on, keep everything,
                    # including lowercase words ("Nová Ves nad Nisou")
                    found_title_word = True
                if found_title_word:
                    updated_town.append(w)

            if updated_town:
                # join back into a string: the clean-up below works on
                # characters, not on a list of words
                town = " ".join(updated_town)

    elif len(address_split) == 2:
        street, post_and_town = address_split
        # sometimes there are no postcode, just town
        if post_and_town[:5].isdigit():
            postal_code = post_and_town.split()[0]
            town = " ".join(post_and_town.split()[1:])
        elif post_and_town[:6].replace(" ", "").isdigit():
            postal_code = post_and_town[:6]
            town = post_and_town[6:]
        else:
            print("warning: could not extract postal code, set second element as town", address_split)
            town = post_and_town

    else:
        print("info: more than 2 comma", address_split)
        # first element of the address split by comma contains digit
        if any(char.isdigit() for char in address_split[0]):
            street = address_split[0]
        # other elements of the address split by comma contains digit and are not postcode
        # street and number can be second or third
        elif any(char.isdigit() for char in address_split[1]) and not _starts_with_postcode(address_split[1]):
            street = address_split[1]
        # the postcode test must look at the third element itself, otherwise
        # a "postcode town" chunk would be taken for the street
        elif any(char.isdigit() for char in address_split[2]) and not _starts_with_postcode(address_split[2]):
            street = address_split[2]
        else:
            print("warning, could not extract street", address_split)
            street = None

        # start from the end
        for address_chunk in reversed(address_split):
            if address_chunk[:5].isdigit():
                postal_code = address_chunk[:5]
                town = address_chunk[5:]
                break
            # can be at the end also
            # example: Podbořany 44101,Podbořany,Vroutecká 230
            elif address_chunk[-5:].isdigit():
                postal_code = address_chunk[-5:]
                town = address_chunk[:-5]
                break
            elif address_chunk[:6].replace(" ", "").isdigit():
                postal_code = address_chunk[:6]
                town = address_chunk[6:]
                break
            elif address_chunk[-6:].replace(" ", "").isdigit():
                postal_code = address_chunk[-6:]
                town = address_chunk[:-6]
                break

    # keep only letters and whitespace in the town name; this removes a
    # trailing district number ("Praha 20" -> "Praha") and, as a side effect,
    # any other non-letter character
    if town:
        town = "".join([c for c in town if c.isalpha() or c.isspace()]).strip()
    else:
        print("error, town undefined, lat and lng will be search for the country only.")

    print(f"street: {street}, postal_code: {postal_code}, town: {town}")
    return street, postal_code, town
+
+
def _simplify_street(street: str) -> str:
    """Return a simplified spelling of ``street`` for a second geocoding try."""
    # drop additional number or number (example: Podlanig 3 /1)
    # number can be Roman numeral (example: Jiráskovo předměstí 638/III)
    street_split = street.split("/")
    suffix = street_split[-1]
    is_roman_numeral = all(c.lower() in ('i', 'v', 'x') for c in suffix)
    # only when there really is a "/": otherwise the "suffix" is the whole
    # street and dropping it would leave an empty street
    if len(street_split) > 1 and (suffix.isdigit() or is_roman_numeral):
        street = " ".join(street_split[0:-1])

    # drop abbreviations (example: Příšovice č. p. 177)
    # (?<!^) -> prevent first word to be dropped
    if "." in street[1:-1]:
        pattern = r"(?<!^)(^|(?<=\s))\S+\. (?=|$)"
        street = re.sub(pattern, "", street)

    # drop duplicated house number
    # example: hotecká 1538 1538 -> hotecká 1538
    # whitespace between the two numbers is mandatory, otherwise a number
    # such as "11" would be shortened to "1"
    street = re.sub(r"\b(\d+)\s+\1\b", r"\1", street)

    return street


def _query_first_hit(label: str, query_url: str):
    """Run one geocoding request.

    Returns ``[lat, lng]`` of the first result, or ``None`` when the service
    answers with an empty list.  Prints the error and exits with status 1 when
    the request fails or the answer cannot be interpreted.
    """
    print(f"{label} {query_url}")
    try:
        # timeout so that a stalled connection cannot block the run forever
        response = requests.get(query_url, timeout=30)
        data = response.json()
        if data != []:
            return [data[0]['lat'], data[0]['lon']]
    # ValueError: JSON decoding errors of requests versions that do not wrap
    # them in a RequestException
    except (requests.exceptions.RequestException, KeyError, IndexError, ValueError) as e:
        print(f"Error: {e}, url: {query_url}")
        sys.exit(1)
    return None


def convert_address_to_lat_lng(address_to_convert: str) -> list:
    """Geocode an address with geocode.maps.co and return ``[lat, lng]``.

    Up to three requests are sent, one second apart:

    1. street, town and postal code as extracted from the address;
    2. the same with a simplified street (see ``_simplify_street``), or
       without the street when it cannot be simplified;
    3. without the street at all, when step 2 still contained one.

    The module-level ``api_key`` is appended to every request.

    Args:
        address_to_convert: the raw address string.

    Returns:
        ``[lat, lng]`` taken from the first result of the first request that
        returns something.

    Exits:
        Calls ``sys.exit(1)`` when a request fails or when every attempt
        returns an empty result.
    """
    # free plan: 1 request per second
    sleep(1)

    print(f"\ninfo, address_to_convert: {address_to_convert}")

    street, postal_code, town = extract_address_components(address_to_convert)

    # NOTE(review): values are not URL-escaped here, and the parameter names
    # "town" / "postal_code" should be checked against the API documentation.
    url = "https://geocode.maps.co/search?"
    if street:
        url += f"street={street}&"
    if town:
        url += f"town={town}&"
    if postal_code:
        url += f"postal_code={postal_code}&"
    url += f"country=Czechia&country_code=cz&api_key={api_key}"

    coordinates = _query_first_hit("url_1", url)
    if coordinates is not None:
        return coordinates

    if not street:
        # there is no street to simplify or drop, so no further attempt exists
        print(f'Empty response for: {address_to_convert}: {url}')
        sys.exit(1)

    sleep(1)
    simplified_street = _simplify_street(street)
    street_kept = bool(simplified_street.strip()) and simplified_street != street
    if street_kept:
        # replace the street parameter only, not other parts of the url that
        # may contain the same text
        url_2 = url.replace(f"street={street}&", f"street={simplified_street}&")
    # drop street
    else:
        url_2 = url.replace(f"street={street}&", "")

    coordinates = _query_first_hit("url_2", url_2)
    if coordinates is not None:
        return coordinates

    sleep(1)
    # drop street (example: Gabrovlje 14, 3214 Zreče)
    if not street_kept:
        print(f'Empty response for before url_3: {address_to_convert}: {url_2}')
        sys.exit(1)
    url_3 = url_2.replace(f"street={simplified_street}&", "")

    coordinates = _query_first_hit("url_3", url_3)
    if coordinates is None:
        print(f'Empty response for: {address_to_convert}: {url_3}')
        sys.exit(1)

    return coordinates
+
+
+
if __name__ == "__main__":
    source_file = 'CZ-merge-UTF-8_no_coord.csv'
    target_file = "CZ-merge-UTF-8.csv"
    # holds the number of the last source line written to target_file
    index_last_line_processed = 'cz_packagers_refresh_part2_index_tmp.txt'
    # read as a module-level name by convert_address_to_lat_lng()
    api_key = "" # TODO remove

    # A run is a resumed run only when the index file exists.  The flag is
    # kept apart from the index value because 0 is a valid stored index
    # (header written, no data row yet).
    try:
        with open(index_last_line_processed, 'r') as f:
            index = int(f.read())
        resuming = True
    except FileNotFoundError:
        print(f"info, create temporary file {index_last_line_processed}")
        index = 0
        resuming = False

    print(f"info, index is set to {index}")

    # Append when resuming; start from an empty file otherwise so that rows
    # left by an older run are not mixed with the new ones.
    target_mode = 'a' if resuming else 'w'

    with open(source_file, mode='r', newline='', encoding='utf-8') as csv_file_read:
        with open(target_file, mode=target_mode, newline='', encoding='utf-8') as csv_file_write:
            reader = csv.reader(csv_file_read, delimiter=";")
            writer = csv.writer(csv_file_write, delimiter=";")
            for line_number, row in enumerate(reader):
                # continue previous run: skip what is already in target_file
                if resuming and line_number <= index:
                    continue

                # header
                if line_number == 0:
                    row += ['lat', 'lng']
                else:
                    row += convert_address_to_lat_lng(row[2])

                writer.writerow(row)
                # make sure the row is in the file before recording it as done
                csv_file_write.flush()

                with open(index_last_line_processed, 'w') as f:
                    f.write(str(line_number))