diff --git a/lib/ProductOpener/Tags.pm b/lib/ProductOpener/Tags.pm index 8d4581db00bd6..1bb2af5eaa326 100644 --- a/lib/ProductOpener/Tags.pm +++ b/lib/ProductOpener/Tags.pm @@ -879,7 +879,14 @@ sub remove_stopwords ($tagtype, $lc, $tagid) { my $regexp = $stopwords_regexps{$tagtype . '.' . $lc}; - $tagid =~ s/(^|-)($regexp)(-($regexp))*(-|$)/-/g; + # In Japanese, do not require a word boundary, and do not introduce a hyphen + if ($lc eq 'ja') { + $tagid =~ s/$regexp//g; + } + # In other languages, require a word boundary, and replace stopwords with a hyphen + else { + $tagid =~ s/(^|-)($regexp)(-($regexp))*(-|$)/-/g; + } $tagid =~ tr/-/-/s; $tagid =~ s/^-//; diff --git a/lib/ProductOpener/Test.pm b/lib/ProductOpener/Test.pm index 8c5e3f4871d33..875412e736395 100644 --- a/lib/ProductOpener/Test.pm +++ b/lib/ProductOpener/Test.pm @@ -358,6 +358,7 @@ sub compare_to_expected_results ($object_ref, $expected_results_file, $update_ex my $pretty_json = $json->pretty->encode($object_ref); print $result $pretty_json; close($result); + ok(1, "Updated $expected_results_file"); } else { # Compare the result with the expected result diff --git a/taxonomies/additives.txt b/taxonomies/additives.txt index c51d62454108e..f2bd35e0b810e 100644 --- a/taxonomies/additives.txt +++ b/taxonomies/additives.txt @@ -24464,3 +24464,24 @@ hu:kálium-jodid wikipedia:en:https://en.wikipedia.org/wiki/Potassium_iodide wikidata:en:Q121874 +# Japanese additives can be listed only with their type (e.g. amino acids), without the specific additive name + +# Explanation of 「調味料」(flavoring?) https://www.hokeniryo.metro.tokyo.lg.jp/shokuhin/shokuten/chomiryo.html +# Flavors, as additives, consist of 4 categories: +# アミノ酸 (amino acids, e.g. sodium L-aspartate), +# 核酸 (nucleic acids, e.g. disodium inosinate), +# 有機酸 (organic acids, e.g. calcium citrate), +# 無機塩 (inorganic salts, e.g. potassium chloride). 
+# They are labeled in the form of 「調味料({category name})」, or 「調味料({dominant category name}等)」 if more than two categories are included. + +en:Amino acids +ja:アミノ酸, アミノ酸等 + +en:Nucleic acids +ja:核酸 + +en:Organic acids +ja:有機酸 + +en:Inorganic salts +ja:無機塩 \ No newline at end of file diff --git a/taxonomies/ingredients.txt b/taxonomies/ingredients.txt index 8c2d9defb74f1..69f92387ff4a4 100644 --- a/taxonomies/ingredients.txt +++ b/taxonomies/ingredients.txt @@ -87,6 +87,7 @@ stopwords:hu:tartalmaz, változó arányban, min, zsírtartalom, összetevő, ö stopwords:id:mengandung stopwords:is:úr stopwords:it:contiene, nella +# Japanese stopwords are matched without word boundaries, so do not add as stopwords any characters or words that could be part of an actual ingredient entry stopwords:ja:等, その他 stopwords:lt:iš, su, su pridėtiniu, mažiausiai, įskaitant stopwords:lv:no @@ -13796,7 +13797,7 @@ vegetarian:en:yes # usage:fr:fécules (dont blé) "each capsule contains: paracetamol 500 m 5 060198 790 0 mg.", } ], - + # Japanese additives + [ + "ja-additives", + { + lc => "ja", + ingredients_text => "増粘剤(加工デンプン、キサンタン)、酢酸Na、トレハロース、加工デンプン、グリシン、調味料(アミノ酸等)、酸化防止剤(V.C,V.E)、着色料(野菜色素)", + }, + ], # 148g per 100g [ "en-quantity-per-100g", @@ -588,6 +595,7 @@ foreach my $test_ref (@tests) { # Run the test if (defined $product_ref->{labels}) { + compute_field_tags($product_ref, $product_ref->{lc}, "labels"); }