Skip to content

Commit

Permalink
feat: add script to export products data and images for docker dev (#…
Browse files Browse the repository at this point in the history
…6010)

* feat: add script to export products data and images #6009

* add --sample-mod parameter to get a random sample of products

* Generate new random sample products data and images for docker dev

* Update scripts/export_products_data_and_images.pl

Co-authored-by: Alex Garel <[email protected]>

* Update scripts/export_products_data_and_images.pl

Co-authored-by: Alex Garel <[email protected]>

* Update scripts/export_products_data_and_images.pl

Co-authored-by: Alex Garel <[email protected]>

Co-authored-by: Alex Garel <[email protected]>
  • Loading branch information
stephanegigandet and alexgarel authored Oct 21, 2021
1 parent 40eb397 commit a3d1a55
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 6 deletions.
191 changes: 191 additions & 0 deletions scripts/export_products_data_and_images.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#!/usr/bin/perl -w

# This file is part of Product Opener.
#
# Product Opener
# Copyright (C) 2011-2021 Association Open Food Facts
# Contact: [email protected]
# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
#
# Product Opener is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

use Modern::Perl '2017';
use utf8;

use ProductOpener::Config qw/:all/;
use ProductOpener::Export qw/:all/;
use ProductOpener::Display qw/:all/;
use ProductOpener::Products qw/:all/;

use URI::Escape::XS;
use Storable qw/dclone/;
use Encode;
use JSON::PP;
use Time::Local;
use Data::Dumper;
use Getopt::Long;
use CGI qw(:cgi :cgi-lib);
use ProductOpener::Data qw/:all/;


binmode(STDOUT, ":encoding(UTF-8)");
binmode(STDERR, ":encoding(UTF-8)");


my $usage = <<TXT
export_products_data_and_images.pl exports product data and/or images for a sample of products,
with files in the native format used by Product Opener:
- a .tar.gz archive containing product data
- a .tar.gz archive containg product images
Those 2 files can be uncompressed in the "products" and "html/images/products" directories of
Product Opener.
The --query parameter allows to select only products that match a specific query.
The --query-codes-from-file parameter allows to specify a file containing barcodes (one barcode per line).
The --sample_mod [divisor],[remainder] parameter allows to get a sample of products,
based on a modulo of their creation timestamp.
e.g. --sample_mod 10000,0 will return about 1/10000th of the full database.
Usage:
export_products_data_and_images.pl --query field_name=field_value --query other_field_name=other_field_value
[--products-file=path to .tar.gz file] [--images-file=path to .tar.gz file]
TXT
;

my %query_fields_values = ();
my $query_codes_from_file;
my $products_file;
my $images_file;
my $sample_mod;

GetOptions (
"query=s%" => \%query_fields_values,
"query-codes-from-file=s" => \$query_codes_from_file,
"images-file=s" => \$images_file,
"products-file=s" => \$products_file,
"sample-mod=s" => \$sample_mod,

)
or die("Error in command line arguments:\n\n$usage");

print STDERR "export_products_data_and_images.pl
- query fields values:
";

my $query_ref = {};
my $request_ref = {};

foreach my $field (sort keys %query_fields_values) {
print STDERR "-- $field: $query_fields_values{$field}\n";
param($field, $query_fields_values{$field});
}

# Construct the MongoDB query

add_params_to_query($request_ref, $query_ref);

use boolean;

# Substitute values like null or exists to mongodb query values
foreach my $field (sort keys %{$query_ref}) {
if ($query_ref->{$field} eq 'null') {
# $query_ref->{$field} = { '$exists' => false };
$query_ref->{$field} = undef;
}
if ($query_ref->{$field} eq 'exists') {
$query_ref->{$field} = { '$exists' => true };
}
}

# transform file of code list to a mongodb query
if (defined $query_codes_from_file) {
my @codes = ();
open(my $in, "<", "$query_codes_from_file") or die ("Cannot read $query_codes_from_file: $!\n");
while (<$in>) {
if ($_ =~ /^(\d+)/) {
push @codes, $1;
}
}
close($in);
$query_ref->{"code"} = { '$in' => \@codes };
}

# Sample of products whose creation timestamp modulo a divisor is equal to a remainder
if (defined $sample_mod) {
if ($sample_mod =~ /^(\d+),(\d+)$/) {
my $divisor = $1 + 0; # add 0 to turn scalar into number
my $remainder = $2 + 0;
$query_ref->{"created_t"} = { '$mod' => [ $divisor, $remainder ] };
}
else {
die("--sample-mod argument must be of the form divisor],remainder (e.g. 10,0):\n\n$usage");
}
}

use Data::Dumper;
print STDERR "MongoDB query:\n" . Dumper($query_ref) . "\n";

# harvest products'code from mongo db
my $cursor = get_products_collection(3 * 60 * 60 * 1000)
->query($query_ref)
->fields({ "code" => 1})
->sort({code=>1});

$cursor->immortal(1);

# Create a list of directories to be exported

my $files = "";
my $i = 0;

while (my $product_ref = $cursor->next) {
$files .= product_path_from_id($product_ref->{code}) . "\n";
$i++;
}

print STDERR "$i products to export.\n";

# Save the list of directories to a tmp file so that we can pass it as a parameter to tar

my $tmp_file = "/tmp/export_products_data_and_images." . time() . ".txt";

open (my $out, ">$tmp_file") or die("Could not open $tmp_file for writing: $!\n");
print $out $files;
close($out);

if (defined $products_file) {
my $tar_cmd = "cvf";
if ($products_file =~ /\.gz$/) {
$tar_cmd = "cvfz";
}
print STDERR "Executing tar command: tar $tar_cmd $products_file -C $data_root/products -T $tmp_file\n";
system('tar', $tar_cmd, $products_file, "-C", "$data_root/products", "-T", $tmp_file);
}

if (defined $images_file) {
my $tar_cmd = "cvf";
# Probably not a good idea to compress images, but allow it anyway
if ($images_file =~ /\.gz$/) {
$tar_cmd = "cvfz";
}
print STDERR "Executing tar command: tar $tar_cmd $images_file -C $www_root/images/products -T $tmp_file\n";
system('tar', $tar_cmd, $images_file, "-C", "$www_root/images/products", "-T", $tmp_file);
}

print STDERR "$i products exported.\n";
5 changes: 5 additions & 0 deletions scripts/gen_feeds_daily_off.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ gzip < en.openfoodfacts.org.products.csv > en.openfoodfacts.org.products.csv.gz
gzip < fr.openfoodfacts.org.products.csv > fr.openfoodfacts.org.products.csv.gz

cd /srv/off/scripts

# Small products data and images export for Docker dev environments
# for about 1/10000th of the products contained in production.
./export_products_data_and_images.pl --sample-mod 10000,0 --products-file /srv/off/html/exports/products.random-modulo-10000.tar.gz --images-file /srv/off/html/exports/products.random-modulo-10000.images.tar.gz

./generate_dump_for_offline_apps_off.py
cd /srv2/off/html/data/offline
zip en.openfoodfacts.org.products.small.csv.zip en.openfoodfacts.org.products.small.csv
Expand Down
14 changes: 8 additions & 6 deletions scripts/import_sample_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@ set -e
cd /tmp

echo "\033[32m------------------ 1/ Retrieve products -----------------\033[0m";
wget https://static.openfoodfacts.org/exports/39-.tar.gz 2>&1
tar -xzvf 39-.tar.gz -C /mnt/podata/products
rm 39-.tar.gz
# explicitly specify the wget output file name so that wget does not append .1 if already present
# e.g. if the tar command failed and the script was stopped
wget -O products.tar.gz https://static.openfoodfacts.org/exports/products.random-modulo-10000.tar.gz 2>&1
tar -xzvf products.tar.gz -C /mnt/podata/products
rm products.tar.gz

echo "\033[32m------------------ 2/ Retrieve product images -------------------\033[0m";
wget https://static.openfoodfacts.org/exports/39-.images.tar.gz 2>&1
tar -xzvf 39-.images.tar.gz -C /mnt/podata/product_images
rm 39-.images.tar.gz
wget -O products.images.tar.gz https://static.openfoodfacts.org/exports/products.random-modulo-10000.images.tar.gz 2>&1
tar -xzvf products.images.tar.gz -C /mnt/podata/product_images
rm products.images.tar.gz

echo "\033[32m------------------ 3/ Import products -------------------\033[0m";
perl -I/opt/product-opener/lib /opt/product-opener/scripts/update_all_products_from_dir_in_mongodb.pl

0 comments on commit a3d1a55

Please sign in to comment.