Skip to content

Commit

Permalink
feat: match synonyms for taxonomy suggestions #8002
Browse files Browse the repository at this point in the history
  • Loading branch information
stephanegigandet committed Mar 13, 2023
1 parent 9cb5ab6 commit 05c66e6
Show file tree
Hide file tree
Showing 20 changed files with 178 additions and 71 deletions.
106 changes: 73 additions & 33 deletions lib/ProductOpener/TaxonomySuggestions.pm
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,52 @@ sub add_sorted_entries_to_tags ($tags_ref, $seen_tags_ref, $entries_ref, $tagtyp
return;
}


# Match the normalized form of a tag synonym against the normalized input of a user.
#
# Parameters:
# - $stringid: normalized user input; it is matched as literal text
# - $fuzzystringid: pattern used for the fuzzy match; it is interpolated as a
#   regular expression, so it is deliberately NOT quoted
#   (NOTE(review): assumed to be a regex built by the caller - confirm)
# - $synonymid: normalized synonym to test
#
# Returns one of:
# - "start"  if $synonymid begins with $stringid
# - "inside" if $synonymid contains $stringid elsewhere
# - "fuzzy"  if $synonymid matches $fuzzystringid
# - "none"   otherwise

sub match_stringids($stringid, $fuzzystringid, $synonymid) {

	$log->debug("match string ids",
		{stringid => $stringid, fuzzystringid=>$fuzzystringid, synonymid=>$synonymid})
		if $log->is_debug();

	# \Q...\E quotes regex metacharacters so that the user input is always
	# compared literally and can never alter (or break) the pattern.

	# matching at start, best matches
	if ($synonymid =~ /^\Q$stringid\E/) {
		return "start";
	}
	# matching inside
	elsif ($synonymid =~ /\Q$stringid\E/) {
		return "inside";
	}
	# fuzzy match
	elsif ($synonymid =~ /$fuzzystringid/) {
		return "fuzzy";
	}

	return "none";
}

# best_match reports how well the best matching synonym of a list matches the input.
#
# Each synonym id in $synonyms_ids_ref is compared to $stringid / $fuzzystringid
# with match_stringids(), and the best result found is returned:
# "start" is better than "inside", which is better than "fuzzy", which is better than "none".

sub best_match($stringid, $fuzzystringid, $synonyms_ids_ref) {

	my $best_so_far = "none";

	foreach my $candidate_id (@$synonyms_ids_ref) {

		my $quality = match_stringids($stringid, $fuzzystringid, $candidate_id);

		# Nothing can beat a match at the start: stop looking at the other synonyms
		return "start" if $quality eq "start";

		if ($quality eq "inside") {
			# an inside match always replaces a previous "fuzzy" or "none"
			$best_so_far = "inside";
		}
		elsif (($quality eq "fuzzy") and ($best_so_far eq "none")) {
			# a fuzzy match must not downgrade an earlier inside match
			$best_so_far = "fuzzy";
		}
	}

	return $best_so_far;
}


=head2 filter_suggestions_matching_string ($tags_ref, $tagtype, $search_lc, $string, $options_ref)
Filter a list of potential taxonomy suggestions matching a string.
Expand Down Expand Up @@ -333,45 +379,39 @@ sub filter_suggestions_matching_string ($tags_ref, $tagtype, $search_lc, $string
# just_synonyms are not real entries
next if defined $just_synonyms{$tagtype}{$canon_tagid};

my $tag; # this is the content string
my $tagid; # this is the tag
# We will match synonyms in the search language, and in the wildcard xx: language
my $tag = display_taxonomy_tag ($search_lc, $tagtype, $canon_tagid);
my $tag_xx = display_taxonomy_tag ("xx", $tagtype, $canon_tagid);

# search if the tag exists in target language
if (defined $translations_to{$tagtype}{$canon_tagid}{$search_lc}) {
# Build a list of normalized synonyms in the search language and the wildcard xx: language
my @synonyms_ids = map { get_string_id_for_lang($search_lc, $_) }
(
@{deep_get(\%synonyms_for, $tagtype, $search_lc, get_string_id_for_lang($search_lc, $tag)) || []},
@{deep_get(\%synonyms_for, $tagtype, "xx", get_string_id_for_lang("xx", $tag_xx)) || []}
);

$tag = $translations_to{$tagtype}{$canon_tagid}{$search_lc};
# TODO: explain why $tagid can be different from $canon_tagid
$tagid = get_string_id_for_lang($search_lc, $tag);
# check how well the synonyms match the input string
my $best_match = best_match($stringid, $fuzzystringid, \@synonyms_ids);

# add language prefix if we are not searching current interface language
if (not($search_lc eq $original_lc)) {
$tag = $search_lc . ":" . $tag;
}
}
# also search for special language code "xx" which is universal
elsif (defined $translations_to{$tagtype}{$canon_tagid}{xx}) {
$tag = $translations_to{$tagtype}{$canon_tagid}{xx};
$tagid = get_string_id_for_lang("xx", $tag);
}
$log->debug("synonyms_ids for canon_tagid",
{tagtype => $tagtype, canon_tagid=>$canon_tagid, tag=>$tag, synonym_ids=>\@synonyms_ids, best_match => $best_match})
if $log->is_debug();

if (defined $tag) {
# matching at start, best matches
if ($tagid =~ /^$stringid/) {
push @suggestions, $tag;
# only matches at start are considered
$suggestions_count++;
}
# matching inside
elsif ($tagid =~ /$stringid/) {
push @suggestions_c, $tag;
}
# fuzzy match
elsif ($tagid =~ /$fuzzystringid/) {
push @suggestions_f, $tag;
}
# end as soon as we got enough
# matching at start, best matches
if ($best_match eq "start") {
push @suggestions, $tag;
# count matches at start so that we can return only if we have enough matches
$suggestions_count++;
last if $suggestions_count >= $limit;
}
# matching inside
elsif ($best_match eq "inside") {
push @suggestions_c, $tag;
}
# fuzzy match
elsif ($best_match eq "fuzzy") {
push @suggestions_f, $tag;
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion tests/integration/api_v3_taxonomy_suggestions.t
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ my $tests_ref = [
{
test_case => 'packaging-shapes-string-fr-po',
method => 'GET',
path => '/api/v3/taxonomy_suggestions?tagtype=packaging_shapes&string=po',
path => '/api/v3/taxonomy_suggestions?tagtype=packaging_shapes&string=po&lc=fr',
expected_status_code => 200,
},
# Packaging shape suggestions can be specific to a country and categories, and shape
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[
"Mint-flavoured syrup with sugar diluted in water",
"Strawberry and blueberry compotes",
"Strawberry applesauces",
"Strawberry biscuits",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
"Ail frais",
"Aliments à base de plantes frais",
"Ananas frais",
"Aneth fraîche",
"Artichauts frais",
"Barquettes à la fraise",
"Barre de chocolat au lait frais",
"Barre de chocolat au lait frais avec génoise",
"Barres de céréales aux fraises",
"Basilic fraîche",
"Biscuits à la fraise",
"Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé enrichie à la vitamine D",
"Boudin noir rayon frais",
"Brocolis frais"
"Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé enrichie à la vitamine D"
]
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"Barres de céréales aux fraises",
"Biscuits à la fraise",
"Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé enrichie à la vitamine D",
"Boisson préparée sucrée à partir de sirop à la menthe à diluer dans l'eau",
"Cheesecakes à la fraise",
"Compotes de fraise",
"Compotes fraise groseille",
Expand All @@ -22,6 +23,5 @@
"Coulis de fraise",
"Crêpes fourrées aux fraises",
"Cônes vanille fraise",
"Glaces à la fraise",
"Jus de fraise"
"Glaces à la fraise"
]
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[
"Mint-flavoured syrup with sugar diluted in water",
"Strawberry and blueberry compotes",
"Strawberry applesauces",
"Strawberry biscuits",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@
"Ail frais",
"Aliments à base de plantes frais",
"Ananas frais",
"Aneth fraîche",
"Artichauts frais",
"Barquettes à la fraise",
"Barre de chocolat au lait frais",
"Barre de chocolat au lait frais avec génoise",
"Barres de céréales aux fraises",
"Basilic fraîche",
"Biscuits à la fraise",
"Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé enrichie à la vitamine D",
"Boudin noir rayon frais",
"Brocolis frais"
"Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé enrichie à la vitamine D"
],
"warnings" : []
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"Barres de céréales aux fraises",
"Biscuits à la fraise",
"Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé enrichie à la vitamine D",
"Boisson préparée sucrée à partir de sirop à la menthe à diluer dans l'eau",
"Cheesecakes à la fraise",
"Compotes de fraise",
"Compotes fraise groseille",
Expand All @@ -25,8 +26,7 @@
"Coulis de fraise",
"Crêpes fourrées aux fraises",
"Cônes vanille fraise",
"Glaces à la fraise",
"Jus de fraise"
"Glaces à la fraise"
],
"warnings" : []
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"errors" : [],
"status" : "success",
"suggestions" : [
"Mint-flavoured syrup with sugar diluted in water",
"Strawberry and blueberry compotes",
"Strawberry applesauces",
"Strawberry biscuits",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"errors" : [],
"status" : "success",
"suggestions" : [
"Mint-flavoured syrup with sugar diluted in water",
"Strawberry and blueberry compotes",
"Strawberry applesauces",
"Strawberry biscuits",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
{
"errors" : [],
"status" : "success",
"suggestions" : [],
"suggestions" : [
"PET - Polyethylene terephthalate"
],
"warnings" : []
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
"errors" : [],
"status" : "success",
"suggestions" : [
"PET - Polyethylene terephthalate",
"Paper and plastic",
"81 C/PET",
"81 C/PETmet",
"81 C/rPET",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@
"errors" : [],
"status" : "success",
"suggestions" : [
"PET - Polyethylene terephthalate",
"Recycled cardboard",
"Recycled paper",
"Lithium battery",
"Nickel–cadmium battery",
"Nickel–metal hydride battery",
"Silver-oxide battery",
"Zinc–carbon battery",
"Aluminium",
"Non-corrugated cardboard",
"Cork",
"Paper and plastic",
"Green Glass",
"81 C/PAP",
"91 C/FE",
"81 C/PP",
Expand All @@ -13,20 +26,7 @@
"81 C/LLDPE",
"81 C/MDPE",
"81 C/OPP",
"81 C/PA",
"81 C/PC",
"81 C/PET",
"81 C/PETmet",
"81 C/PEmet",
"81 C/PLA",
"81 C/PVC",
"81 C/PVDC",
"81 C/rPET",
"91 C/ABS",
"91 C/CPP",
"91 C/EVOH",
"91 C/HDPE",
"91 C/LDPE"
"81 C/PA"
],
"warnings" : []
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
{
"errors" : [],
"status" : "success",
"suggestions" : [],
"suggestions" : [
"PET - Polyethylene terephthalate"
],
"warnings" : []
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
"Recycle in paper bin",
"Recycle with drink cartons",
"Recycle with plastics",
"Recycle with plastics - metal and bricks"
"Recycle with plastics - metal and bricks",
"Discard"
],
"warnings" : []
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,14 @@
"errors" : [],
"status" : "success",
"suggestions" : [
"Sachet",
"Pot",
"Pouch flask",
"Terrine pot",
"Individual pot",
"Spoon",
"Spout",
"Gourde",
"Pot individuel",
"Support",
"Flacon à pompe",
"Ampoule",
"Tablespoon",
"Teaspoon"
"Sac de transport"
],
"warnings" : []
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
"suggestions" : [
"Pot",
"Pouch flask",
"Spout",
"Terrine pot",
"Backing",
"Individual pot",
"Spoon",
"Spout",
"Ampoule",
"Tablespoon",
"Teaspoon"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"calcium_value" : "",
"carbohydrates_unit" : "g",
"carbohydrates_value" : "12",
"categories" : "Plant-based foods and beverages, Beverages, Plant-based beverages, Fruit-based beverages, Non-Alcoholic beverages, Aloe Vera drinks",
"categories" : "Plant-based foods and beverages, Beverages, Plant-based beverages, Fruit-based beverages, Non-alcoholic beverages, Aloe Vera drinks",
"cholesterol_unit" : "",
"cholesterol_value" : "",
"code" : "0850032917148",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"calcium_value" : "",
"carbohydrates_unit" : "g",
"carbohydrates_value" : "12",
"categories" : "Plant-based foods and beverages, Beverages, Plant-based beverages, Fruit-based beverages, Non-Alcoholic beverages, Aloe Vera drinks",
"categories" : "Plant-based foods and beverages, Beverages, Plant-based beverages, Fruit-based beverages, Non-alcoholic beverages, Aloe Vera drinks",
"categories_tags" : "en:plant-based-foods-and-beverages,en:beverages,en:plant-based-beverages,en:fruit-based-beverages,en:non-alcoholic-beverages,en:aloe-vera-drinks",
"cholesterol_unit" : "",
"cholesterol_value" : "",
Expand Down
Loading

0 comments on commit 05c66e6

Please sign in to comment.