From 6e05eb81a72dc9d6865bca5c4394516e823c55b9 Mon Sep 17 00:00:00 2001 From: benbenben2 <110821832+benbenben2@users.noreply.github.com> Date: Tue, 18 Jul 2023 09:24:10 +0200 Subject: [PATCH] fix: update middle dot to keep catalan words (#8690) update middle dot to keep catalan words --- lib/ProductOpener/Ingredients.pm | 12 +- .../ingredients/ca-middle-dot.json | 155 ++++++++++++++++++ tests/unit/ingredients.t | 9 + 3 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 tests/unit/expected_test_results/ingredients/ca-middle-dot.json diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm index 525e16b3bd55d..e4439b3fcbd26 100644 --- a/lib/ProductOpener/Ingredients.pm +++ b/lib/ProductOpener/Ingredients.pm @@ -126,8 +126,18 @@ use List::MoreUtils qw(uniq); use Test::More; # MIDDLE DOT with common substitutes (BULLET variants, BULLET OPERATOR and DOT OPERATOR (multiplication)) +# U+00B7 "·" (Middle Dot) is a common character inside Catalan words (e.g. "cel·lulosa"). To avoid breaking such ingredients, +# it is only matched as a separator when it has a literal space before and after it (see the pattern below). 
+# U+2022 "•" (Bullet) +# U+2023 "‣" (Triangular Bullet) +# U+25E6 "◦" (White Bullet) +# U+2043 "⁃" (Hyphen Bullet) +# U+204C "⁌" (Black Leftwards Bullet) +# U+204D "⁍" (Black Rightwards Bullet) +# U+2219 "∙" (Bullet Operator) +# U+22C5 "⋅" (Dot Operator) my $middle_dot - = qr/(?:\N{U+00B7}|\N{U+2022}|\N{U+2023}|\N{U+25E6}|\N{U+2043}|\N{U+204C}|\N{U+204D}|\N{U+2219}|\N{U+22C5})/i; + = qr/(?: \N{U+00B7} |\N{U+2022}|\N{U+2023}|\N{U+25E6}|\N{U+2043}|\N{U+204C}|\N{U+204D}|\N{U+2219}|\N{U+22C5})/i; # Unicode category 'Punctuation, Dash', SWUNG DASH and MINUS SIGN my $dashes = qr/(?:\p{Pd}|\N{U+2053}|\N{U+2212})/i; diff --git a/tests/unit/expected_test_results/ingredients/ca-middle-dot.json b/tests/unit/expected_test_results/ingredients/ca-middle-dot.json new file mode 100644 index 0000000000000..e7e47785feaa2 --- /dev/null +++ b/tests/unit/expected_test_results/ingredients/ca-middle-dot.json @@ -0,0 +1,155 @@ +{ + "ingredients" : [ + { + "id" : "en:mozzarella", + "ingredients" : [ + { + "id" : "en:pasteurised-cow-s-milk", + "percent_estimate" : 43.75, + "percent_max" : 100, + "percent_min" : 12.5, + "text" : "llet de vaca pasteuritzada", + "vegan" : "no", + "vegetarian" : "yes" + }, + { + "ciqual_food_code" : "11058", + "id" : "en:salt", + "percent_estimate" : 15.625, + "percent_max" : 50, + "percent_min" : 0, + "text" : "sal", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:lactic-ferments", + "percent_estimate" : 7.8125, + "percent_max" : 33.3333333333333, + "percent_min" : 0, + "text" : "ferments làctics", + "vegan" : "maybe", + "vegetarian" : "yes" + }, + { + "id" : "en:rennet", + "percent_estimate" : 7.8125, + "percent_max" : 25, + "percent_min" : 0, + "text" : "quall", + "vegan" : "maybe", + "vegetarian" : "maybe" + } + ], + "percent_estimate" : 75, + "percent_max" : 100, + "percent_min" : 50, + "text" : "Formatge mozzarella", + "vegan" : "no", + "vegetarian" : "maybe" + }, + { + "id" : "en:anti-caking-agent", + "ingredients" : [ + { + "id" : 
"en:e460", + "percent_estimate" : 25, + "percent_max" : 50, + "percent_min" : 0, + "text" : "cel·lulosa", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], + "percent_estimate" : 25, + "percent_max" : 50, + "percent_min" : 0, + "text" : "antiaglomerant" + } + ], + "ingredients_analysis" : { + "en:maybe-vegetarian" : [ + "en:mozzarella", + "en:rennet" + ], + "en:non-vegan" : [ + "en:mozzarella", + "en:pasteurised-cow-s-milk" + ] + }, + "ingredients_analysis_tags" : [ + "en:palm-oil-free", + "en:non-vegan", + "en:maybe-vegetarian" + ], + "ingredients_hierarchy" : [ + "en:mozzarella", + "en:dairy", + "en:cheese", + "en:anti-caking-agent", + "en:pasteurised-cow-s-milk", + "en:milk", + "en:pasteurised-milk", + "en:cow-s-milk", + "en:salt", + "en:lactic-ferments", + "en:ferment", + "en:microbial-culture", + "en:rennet", + "en:enzyme", + "en:coagulating-enzyme", + "en:e460" + ], + "ingredients_n" : 7, + "ingredients_n_tags" : [ + "7", + "1-10" + ], + "ingredients_original_tags" : [ + "en:mozzarella", + "en:anti-caking-agent", + "en:pasteurised-cow-s-milk", + "en:salt", + "en:lactic-ferments", + "en:rennet", + "en:e460" + ], + "ingredients_percent_analysis" : 1, + "ingredients_tags" : [ + "en:mozzarella", + "en:dairy", + "en:cheese", + "en:anti-caking-agent", + "en:pasteurised-cow-s-milk", + "en:milk", + "en:pasteurised-milk", + "en:cow-s-milk", + "en:salt", + "en:lactic-ferments", + "en:ferment", + "en:microbial-culture", + "en:rennet", + "en:enzyme", + "en:coagulating-enzyme", + "en:e460" + ], + "ingredients_text" : "Formatge mozzarella (llet de vaca pasteuritzada, sal, ferments làctics i quall) i antiaglomerant (cel·lulosa).", + "ingredients_with_specified_percent_n" : 0, + "ingredients_with_specified_percent_sum" : 0, + "ingredients_with_unspecified_percent_n" : 5, + "ingredients_with_unspecified_percent_sum" : 100, + "ingredients_without_ciqual_codes" : [ + "en:e460", + "en:lactic-ferments", + "en:pasteurised-cow-s-milk", + "en:rennet" + ], + 
"ingredients_without_ciqual_codes_n" : 4, + "known_ingredients_n" : 16, + "lc" : "ca", + "nutriments" : { + "fruits-vegetables-nuts-estimate-from-ingredients_100g" : 0, + "fruits-vegetables-nuts-estimate-from-ingredients_serving" : 0 + }, + "unknown_ingredients_n" : 0 +} diff --git a/tests/unit/ingredients.t b/tests/unit/ingredients.t index a31e4668efb8e..50b336f964276 100755 --- a/tests/unit/ingredients.t +++ b/tests/unit/ingredients.t @@ -530,6 +530,15 @@ Origin of peaches: Spain. Origin of some unknown ingredient: France. origin of A ingredients_text => "砂糖、小麦粉、全粉乳、カカオマス、ショートニング、植物油脂、ココアバター、小麦全粒粉、小麦ふすま、食塩、小麦胚芽 / 加工デンプン、乳化剤(大豆由来)、膨脹剤、香料", } ], + # U+00B7 "·" (Middle Dot) is a character found in ingredients for some languages (e.g. Catalan) + [ + "ca-middle-dot", + { + lc => "ca", + ingredients_text => + "Formatge mozzarella (llet de vaca pasteuritzada, sal, ferments làctics i quall) i antiaglomerant (cel·lulosa).", + } + ], ); my $json = JSON->new->allow_nonref->canonical;