feat: suggest fuzzy + limit as a param (#6887)

openfoodfacts · Jun 13, 2022 · a6622f8 · a6622f8
1 parent 7354ebf
commit a6622f8
Show file tree

Hide file tree

Showing 2 changed files with 105 additions and 21 deletions.
diff --git a/cgi/suggest.pl b/cgi/suggest.pl
@@ -42,84 +42,151 @@
 use Storable qw/dclone/;
 use Encode;
 use JSON::PP;
+use List::Util qw/min/;
 
 ProductOpener::Display::init();
 
+=head1 CGI script to auto-complete entries for tags
+
+=head2 Request parameters
+
+=head3 tagtype - the type of tag
+
+=head3 string - string to search
+
+=head3 term - term to search
+
+If string and term are passed together, they are concatenated as separate words
+
+=head3 limit - max number of suggestions
+
+Warning, we are currently doing a brute force search, so avoid setting it too high
+
+=cut
+
 my $tagtype = param('tagtype');
 my $string = decode utf8=>param('string');
+# searched term
 my $term = decode utf8=>param('term');
 
+# search language code
 my $search_lc = $lc;
-
+# superseded by request parameter
 if (defined param('lc')) {
 	$search_lc = param('lc');
 }
 
 my $original_lc = $search_lc;
 
+# if search begins with a language code, use it for search
 if ($term =~ /^(\w\w):/) {
 	$search_lc = $1;
 	$term = $';
 }
 
+# max results
+my $limit = 25;
+# superseded by request parameter
+if (defined param('limit')) {
+	# we put a hard limit however
+	$limit = min(int(param('limit')), 400);
+}
+
+
 my @suggestions = (); # Suggestions starting with the term
 my @suggestions_c = (); # Suggestions containing the term
+my @suggestions_f = (); # fuzzy suggestions
 
 my $cache_max_age = 0;
-my $limit = 25;
-my $i = 0;
+my $suggestions_count = 0;
+
+
+# search for emb codes
 if ($tagtype eq 'emb_codes') {
 	my $stringid = get_string_id_for_lang("no_language", normalize_packager_codes($term));
 	my @tags = sort keys %packager_codes;
 	foreach my $canon_tagid (@tags) {
 		next if $canon_tagid !~ /^$stringid/;
 		push @suggestions, normalize_packager_codes($canon_tagid);
-		last if ++$i >= $limit;
+		last if ++$suggestions_count >= $limit;
 	}
+	# add cache to request
 	$cache_max_age = 3600;
 }
 else {
-	my $stringid = get_string_id_for_lang($search_lc, $string) . get_string_id_for_lang($search_lc, $term);
+	# search for term in a taxonomy
+
+	# normalize string and term, joining them with "-" so they are treated as separate words
+	my $stringid = get_string_id_for_lang($search_lc, $string) . "-" . get_string_id_for_lang($search_lc, $term);
+	# remove a possible leading or trailing "-" (left over when one of the two parts is empty);
+	# otherwise the prefix / contains matches below would require a literal "-" next to the searched text
+	$stringid =~ s/^-//;
+	$stringid =~ s/-$//;
+	# fuzzy match: the words must appear in this order, possibly with other text between them
+	my $fuzzystringid = join(".*", split("-", $stringid));
+	# all tags can be retrieved from the $translations_to hash
 	my @tags = sort keys %{$translations_to{$tagtype}} ;
 	foreach my $canon_tagid (@tags) {
-
+		# just_synonyms are not real entries
 		next if defined $just_synonyms{$tagtype}{$canon_tagid};
-
-		my $tag;
-		my $tagid;
-
+
+		my $tag;  # this is the content string
+		my $tagid;  # this is the tag
+
+		# search if the tag exists in target language
 		if (defined $translations_to{$tagtype}{$canon_tagid}{$search_lc}) {
-		
+
 			$tag = $translations_to{$tagtype}{$canon_tagid}{$search_lc};
+			# TODO: explain why $tagid can be different from $canon_tagid
 			$tagid = get_string_id_for_lang($search_lc, $tag);
-
+
+			# add language prefix if we are not searching current interface language
 			if (not ($search_lc eq $original_lc)) {
 				$tag = $search_lc . ":" . $tag;
 			}
 		}
+		# also search for special language code "xx" which is universal
 		elsif (defined $translations_to{$tagtype}{$canon_tagid}{xx}) {
 			$tag = $translations_to{$tagtype}{$canon_tagid}{xx};
 			$tagid = get_string_id_for_lang("xx", $tag);
 		}
-		
+
 		if (defined $tag) {
-
-			next if $tagid !~ /$stringid/;
-
-			if ($tag =~ /^$stringid/i) {
+			# matching at start, best matches
+			if ($tagid =~ /^$stringid/) {
 				push @suggestions, $tag;
+				# only matches at start are considered
+				$suggestions_count++;
 			}
-			else {
+			# matching inside
+			elsif ($tagid =~ /$stringid/) {
 				push @suggestions_c, $tag;
 			}
-			last if ++$i >= $limit;
+			# fuzzy match
+			elsif ($tagid =~ /$fuzzystringid/) {
+				push @suggestions_f, $tag;
+			}
+			# end as soon as we got enough
+			last if $suggestions_count >= $limit;
 		}
 	}
+	# add cache to request
 	$cache_max_age = 3600;
 }
-push @suggestions, @suggestions_c;
-my $data =  encode_json(\@suggestions);
+# sort best suggestions
+@suggestions = sort @suggestions;
+# suggestions containing term
+my $contains_to_add = min($limit - (scalar @suggestions), scalar @suggestions_c) - 1;
+if ($contains_to_add >= 0) {
+	push @suggestions, @suggestions_c[0..$contains_to_add];
+}
+# Suggestions as fuzzy match
+my $fuzzy_to_add = min($limit - (scalar @suggestions), scalar @suggestions_f) - 1;
+if ($fuzzy_to_add >= 0) {
+    push @suggestions, @suggestions_f[0..$fuzzy_to_add];
+}
+my $data = encode_json(\@suggestions);
 
+# send response
 print header(
 	-type => 'application/json',
 	-charset => 'utf-8',

diff --git a/lib/ProductOpener/Tags.pm b/lib/ProductOpener/Tags.pm
@@ -783,6 +783,21 @@ sub remove_plurals($$) {
 
 
 
+=head2 build_tags_taxonomy( $tagtype, $file, $publish )
+
+Build taxonomy from the taxonomy file
+
+=head3 Arguments
+
+=head4 str $tagtype - the tagtype
+
+Like "categories", "ingredients"
+
+=head4 $file - name of the file to read in taxonomies folder
+
+=head4 $publish - if 1, store the result in sto
+
+=cut
 sub build_tags_taxonomy($$$) {
 
 	my $tagtype = shift;
@@ -807,6 +822,8 @@ sub build_tags_taxonomy($$$) {
 	$root_entries{$tagtype} = {};
 
 	$just_tags{$tagtype} = {};
+	# synonyms that are not real entries, but only enrich existing tags
+	# they correspond to synonyms: entries
 	$just_synonyms{$tagtype} = {};
 	$properties{$tagtype} = {};