Skip to content

Commit

Permalink
feat: Extract ingredients origins from labels and use them in Eco-Sco…
Browse files Browse the repository at this point in the history
…re (#6377)

* new test for ingredient labels

* Extract ingredients origins from labels and use them in Eco-Score #4905

* Update lib/ProductOpener/Ingredients.pm

Co-authored-by: Alex Garel <[email protected]>

* Update lib/ProductOpener/Ingredients.pm

Co-authored-by: Alex Garel <[email protected]>

* suggestions from code review

* more ingredients/origins for labels

* update tests

Co-authored-by: Alex Garel <[email protected]>
  • Loading branch information
stephanegigandet and alexgarel authored Feb 4, 2022
1 parent 78b21e9 commit d5bd976
Show file tree
Hide file tree
Showing 12 changed files with 1,422 additions and 46 deletions.
16 changes: 13 additions & 3 deletions lib/ProductOpener/Ecoscore.pm
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ use ProductOpener::Config qw/:all/;
use ProductOpener::Store qw/:all/;
use ProductOpener::Tags qw/:all/;
use ProductOpener::Packaging qw/:all/;
use ProductOpener::Ingredients qw/:all/;

use Storable qw(dclone freeze);
use Text::CSV();
Expand Down Expand Up @@ -1161,7 +1162,17 @@ sub compute_ecoscore_origins_of_ingredients_adjustment($) {
}
}

if (scalar @origins_from_origins_field == 0) {
# If we don't have ingredients, check if we have an origin for a specific ingredient
# (e.g. we have the label "French eggs" even though we don't have ingredients)
if ((scalar @origins_from_origins_field == 0)
and ((not defined $product_ref->{ingredients}) or (scalar @{$product_ref->{ingredients}} == 0))) {
my $origin_id = has_specific_ingredient_property($product_ref, undef, "origins");
if ((defined $origin_id) and (defined $ecoscore_data{origins}{$origin_id})) {
push @origins_from_origins_field, $origin_id;
}
}

if (scalar @origins_from_origins_field == 0) {
@origins_from_origins_field = ("en:unknown");
}

Expand All @@ -1177,6 +1188,7 @@ sub compute_ecoscore_origins_of_ingredients_adjustment($) {
else {
# If we don't have ingredients listed, apply the origins from the origins field
# using a dummy ingredient

aggregate_origins_of_ingredients(\@origins_from_origins_field, \%aggregated_origins , [ { percent_estimate => 100} ]);
}

Expand Down Expand Up @@ -1479,9 +1491,7 @@ sub localize_ecoscore ($$) {
$origin_ref->{transportation_score} = $ecoscore_data{origins}{$origin_id}{"transportation_score_" . $cc};
}
}

}

}
}

Expand Down
206 changes: 175 additions & 31 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ BEGIN
&add_milk
&estimate_milk_percent_from_ingredients
&has_specific_ingredient_property
); # symbols to export on request
%EXPORT_TAGS = (all => [@EXPORT_OK]);
}
Expand Down Expand Up @@ -863,13 +865,161 @@ my %ignore_strings_after_percent = (
);


=head2 has_specific_ingredient_property ( product_ref, searched_ingredient_id, property )
Check if the specific ingredients structure (extracted from the end of the ingredients list and product labels)
contains a property for an ingredient. (e.g. do we have an origin specified for a specific ingredient)
=head3 Arguments
=head4 product_ref
=head4 searched_ingredient_id
If the ingredient_id parameter is undef, then we return the value for any specific ingredient.
(useful for products for which we do not have ingredients, but for which we have a label like "French eggs":
we can still derive the origin of the ingredients, e.g. for the Eco-Score)
=head4 property
e.g. "origins"
=head3 Return values
=head4 value
- undef if we don't have a specific ingredient with the requested property matching the requested ingredient
- otherwise the value for the matching specific ingredient
=cut

sub has_specific_ingredient_property($$$) {

my $product_ref = shift;
my $searched_ingredient_id = shift;
my $property = shift;

my $value;

# If we have specific ingredients, check if we have a parent of searched ingredient
if (defined $product_ref->{specific_ingredients}) {
foreach my $specific_ingredient_ref (@{$product_ref->{specific_ingredients}}) {
my $specific_ingredient_id = $specific_ingredient_ref->{id};
if ((defined $specific_ingredient_ref->{$property}) # we have a value for the property for the specific ingredient
# and we did not target a specific ingredient, or this is equivalent to the searched ingredient
and ((not defined $searched_ingredient_id) or (is_a("ingredients", $searched_ingredient_id, $specific_ingredient_id)))) {

if (not defined $value) {
$value = $specific_ingredient_ref->{$property};
}
elsif ($specific_ingredient_ref->{$property} ne $value) {
$log->warn("has_specific_ingredient_property: different values for property", { searched_ingredient_id => $searched_ingredient_id, property => $property, current_value => $value, specific_ingredient_id => $specific_ingredient_id, new_value => $specific_ingredient_ref->{$property}}) if $log->is_warn();
}
}
}
}

return $value;
}


=head2 add_properties_from_specific_ingredients ( product_ref )
Go through the ingredients structure, and ad properties to ingredients that match specific ingredients
for which we have extra information (e.g. origins from a label).
=cut

sub add_properties_from_specific_ingredients($) {

my $product_ref = shift;

# Traverse the ingredients tree, breadth first

my @ingredients = @{$product_ref->{ingredients}};

while (@ingredients) {

# Remove and process the first ingredient
my $ingredient_ref = shift @ingredients;
my $ingredientid = $ingredient_ref->{id};

# Add sub-ingredients at the beginning of the ingredients array
if (defined $ingredient_ref->{ingredients}) {

unshift @ingredients, @{$ingredient_ref->{ingredients}};
}

foreach my $property (qw(origins)) {
my $property_value = has_specific_ingredient_property($product_ref, $ingredientid, "origins");
if ((defined $property_value) and (not defined $ingredient_ref->{$property})) {
$ingredient_ref->{$property} = $property_value;
}
}
}
}


=head2 parse_specific_ingredients_text ( product_ref, $text )
=head2 add_specific_ingredients_from_labels ( product_ref )
Check if the product has labels that indicate properties (e.g. origins) for specific ingredients.
e.g.
en:French pork
fr:Viande Porcine Française, VPF, viande de porc française, Le Porc Français, Porc Origine France, porc français, porc 100% France
origins:en: en:france
ingredients:en: en:pork
This function extracts those mentions and adds them to the specific_ingredients structure.
=head3 Return values
=head4 specific_ingredients structure
Array of specific ingredients.
=head4
=cut

sub add_specific_ingredients_from_labels($) {

my $product_ref = shift;

my $product_lc = $product_ref->{lc};

if (defined $product_ref->{labels_tags}) {
foreach my $labelid (@{$product_ref->{labels_tags}}) {
my $ingredients = get_property("labels", $labelid, "ingredients:en");
if (defined $ingredients) {
my $origins = get_property("labels", $labelid, "origins:en");

if (defined $origins) {

my $ingredient_id = canonicalize_taxonomy_tag("en", "ingredients", $ingredients);

my $specific_ingredients_ref = {
id => $ingredient_id,
ingredient => $ingredients,
label => $labelid,
origins => join(",", map {canonicalize_taxonomy_tag("en", "origins", $_)} split(/,/, $origins ))
};

push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref;
}
}
}
}
}


=head2 parse_specific_ingredients_from_text ( product_ref, $text )
Lists of ingredients sometime include extra mentions for specific ingredients
at the end of the ingredients list. e.g. "Prepared with 50g of fruits for 100g of finished product".
This function extracts those mentions and adds them to a special specific_ingredients structure.
This function extracts those mentions and adds them to the specific_ingredients structure.
=head3 Return values
Expand All @@ -881,16 +1031,14 @@ Array of specific ingredients.
=cut

sub parse_specific_ingredients_text($$$) {
sub parse_specific_ingredients_from_text($$$) {

my $product_ref = shift;
my $text = shift;
my $percent_regexp = shift;

my $product_lc = $product_ref->{lc};

$product_ref->{specific_ingredients} = [];

# Go through the ingredient lists multiple times
# as long as we have one match
my $ingredient = "start";
Expand All @@ -901,7 +1049,7 @@ sub parse_specific_ingredients_text($$$) {
$ingredient = undef;
my $matched_text;
my $percent;
my $origin;
my $origins;

# Note: in regular expressions below, use non-capturing groups (starting with (?: )
# for all groups, except groups that capture actual data: ingredient name, percent, origins
Expand All @@ -925,7 +1073,7 @@ sub parse_specific_ingredients_text($$$) {
# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
# in order to not overmatch something like "Origin of milk: UK, some other mention."
# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
$origin = $2;
$origins = $2;
$ingredient = $1;
$matched_text = $&;
# Remove the matched text
Expand Down Expand Up @@ -963,7 +1111,7 @@ sub parse_specific_ingredients_text($$$) {
# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
# in order to not overmatch something like "Origin of milk: UK, some other mention."
# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
$origin = $2;
$origins = $2;
$ingredient = $1;
$matched_text = $&;
# Remove the matched text
Expand All @@ -985,17 +1133,12 @@ sub parse_specific_ingredients_text($$$) {
};

defined $percent and $specific_ingredients_ref->{percent} = $percent;
defined $origin and $specific_ingredients_ref->{origin} = join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $origin ));
defined $origins and $specific_ingredients_ref->{origins} = join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $origins ));

push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref;
}
}

# Delete specific ingredients if empty
if (scalar @{$product_ref->{specific_ingredients}} == 0) {
delete $product_ref->{specific_ingredients};
}

return $text;
}

Expand Down Expand Up @@ -1049,6 +1192,7 @@ sub parse_ingredients_text($) {
# remove ending . and ending whitespaces
$text =~ s/(\s|\.)+$//;

# initialize the structure to store the parsed ingredients and specific ingredients
$product_ref->{ingredients} = [];

# farine (12%), chocolat (beurre de cacao (15%), sucre [10%], protéines de lait, oeuf 1%) - émulsifiants : E463, E432 et E472 - correcteurs d'acidité : E322/E333 E474-E475, acidifiant (acide citrique, acide phosphorique) - sel : 1% ...
Expand All @@ -1075,7 +1219,7 @@ sub parse_ingredients_text($) {
my $percent_regexp = '(?:<|' . $min_regexp . '|\s|\.|:)*(\d+(?:(?:\,|\.)\d+)?)\s*(?:\%|g)\s*(?:' . $min_regexp . '|' . $ignore_strings_after_percent . '|\s|\)|\]|\}|\*)*';

# Extract phrases related to specific ingredients at the end of the ingredients list
$text = parse_specific_ingredients_text($product_ref, $text, $percent_regexp);
$text = parse_specific_ingredients_from_text($product_ref, $text, $percent_regexp);

my $analyze_ingredients_function = sub($$$$) {

Expand Down Expand Up @@ -1930,12 +2074,7 @@ sub compute_ingredients_tags($) {

# Traverse the ingredients tree, breadth first

my @ingredients = ();

for (my $i = 0; $i < @{$product_ref->{ingredients}}; $i++) {

push @ingredients, $product_ref->{ingredients}[$i];
}
my @ingredients = @{$product_ref->{ingredients}};

while (@ingredients) {

Expand All @@ -1945,10 +2084,7 @@ sub compute_ingredients_tags($) {

if (defined $ingredient_ref->{ingredients}) {

for (my $i = 0; $i < @{$ingredient_ref->{ingredients}}; $i++) {

push @ingredients, $ingredient_ref->{ingredients}[$i];
}
push @ingredients, @{$ingredient_ref->{ingredients}};
}
else {
# Count specified percent only for ingredients that do not have sub ingredients
Expand Down Expand Up @@ -2028,10 +2164,18 @@ sub extract_ingredients_from_text($) {
# Parse the ingredients list to extract individual ingredients and sub-ingredients
# to create the ingredients array with nested sub-ingredients arrays

$product_ref->{specific_ingredients} = [];

parse_ingredients_text($product_ref);

# Add specific ingredients from labels
add_specific_ingredients_from_labels($product_ref);

if (defined $product_ref->{ingredients}) {

# Add properties like origins from specific ingredients extracted from labels or the end of the ingredients list
add_properties_from_specific_ingredients($product_ref);

# Compute minimum and maximum percent ranges for each ingredient and sub ingredient

if (compute_ingredients_percent_values(100, 100, $product_ref->{ingredients}) < 0) {
Expand Down Expand Up @@ -2060,6 +2204,11 @@ sub extract_ingredients_from_text($) {

analyze_ingredients($product_ref);

# Delete specific ingredients if empty
if ((exists $product_ref->{specific_ingredients}) and (scalar @{$product_ref->{specific_ingredients}} == 0)) {
delete $product_ref->{specific_ingredients};
}

return;
}

Expand Down Expand Up @@ -2522,12 +2671,7 @@ sub analyze_ingredients($) {

# Traverse the ingredients tree, breadth first

my @ingredients = ();

for (my $i = 0; $i < @{$product_ref->{ingredients}}; $i++) {

push @ingredients, $product_ref->{ingredients}[$i];
}
my @ingredients = @{$product_ref->{ingredients}};

while (@ingredients) {

Expand Down
Loading

0 comments on commit d5bd976

Please sign in to comment.