Skip to content

Commit

Permalink
fix: update middle dot to keep catalan words (#8690)
Browse files Browse the repository at this point in the history
update middle dot to keep catalan words
  • Loading branch information
benbenben2 authored Jul 18, 2023
1 parent 586c85d commit 6e05eb8
Show file tree
Hide file tree
Showing 3 changed files with 175 additions and 1 deletion.
12 changes: 11 additions & 1 deletion lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,18 @@ use List::MoreUtils qw(uniq);
use Test::More;

# MIDDLE DOT with common substitutes (BULLET variants, BULLET OPERATOR and DOT OPERATOR (multiplication))
# U+00B7 "·" (Middle Dot). This is a common character inside Catalan words (e.g. "cel·lulosa").
# To avoid breaking such ingredients apart, it is only matched below when it has a space before and after it.
# U+2022 "•" (Bullet)
# U+2023 "‣" (Triangular Bullet)
# U+25E6 "◦" (White Bullet)
# U+2043 "⁃" (Hyphen Bullet)
# U+204C "⁌" (Black Leftwards Bullet)
# U+204D "⁍" (Black Rightwards Bullet)
# U+2219 "∙" (Bullet Operator)
# U+22C5 "⋅" (Dot Operator)
my $middle_dot
= qr/(?:\N{U+00B7}|\N{U+2022}|\N{U+2023}|\N{U+25E6}|\N{U+2043}|\N{U+204C}|\N{U+204D}|\N{U+2219}|\N{U+22C5})/i;
= qr/(?: \N{U+00B7} |\N{U+2022}|\N{U+2023}|\N{U+25E6}|\N{U+2043}|\N{U+204C}|\N{U+204D}|\N{U+2219}|\N{U+22C5})/i;

# Unicode category 'Punctuation, Dash', SWUNG DASH and MINUS SIGN
my $dashes = qr/(?:\p{Pd}|\N{U+2053}|\N{U+2212})/i;
Expand Down
155 changes: 155 additions & 0 deletions tests/unit/expected_test_results/ingredients/ca-middle-dot.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
{
"ingredients" : [
{
"id" : "en:mozzarella",
"ingredients" : [
{
"id" : "en:pasteurised-cow-s-milk",
"percent_estimate" : 43.75,
"percent_max" : 100,
"percent_min" : 12.5,
"text" : "llet de vaca pasteuritzada",
"vegan" : "no",
"vegetarian" : "yes"
},
{
"ciqual_food_code" : "11058",
"id" : "en:salt",
"percent_estimate" : 15.625,
"percent_max" : 50,
"percent_min" : 0,
"text" : "sal",
"vegan" : "yes",
"vegetarian" : "yes"
},
{
"id" : "en:lactic-ferments",
"percent_estimate" : 7.8125,
"percent_max" : 33.3333333333333,
"percent_min" : 0,
"text" : "ferments làctics",
"vegan" : "maybe",
"vegetarian" : "yes"
},
{
"id" : "en:rennet",
"percent_estimate" : 7.8125,
"percent_max" : 25,
"percent_min" : 0,
"text" : "quall",
"vegan" : "maybe",
"vegetarian" : "maybe"
}
],
"percent_estimate" : 75,
"percent_max" : 100,
"percent_min" : 50,
"text" : "Formatge mozzarella",
"vegan" : "no",
"vegetarian" : "maybe"
},
{
"id" : "en:anti-caking-agent",
"ingredients" : [
{
"id" : "en:e460",
"percent_estimate" : 25,
"percent_max" : 50,
"percent_min" : 0,
"text" : "cel·lulosa",
"vegan" : "yes",
"vegetarian" : "yes"
}
],
"percent_estimate" : 25,
"percent_max" : 50,
"percent_min" : 0,
"text" : "antiaglomerant"
}
],
"ingredients_analysis" : {
"en:maybe-vegetarian" : [
"en:mozzarella",
"en:rennet"
],
"en:non-vegan" : [
"en:mozzarella",
"en:pasteurised-cow-s-milk"
]
},
"ingredients_analysis_tags" : [
"en:palm-oil-free",
"en:non-vegan",
"en:maybe-vegetarian"
],
"ingredients_hierarchy" : [
"en:mozzarella",
"en:dairy",
"en:cheese",
"en:anti-caking-agent",
"en:pasteurised-cow-s-milk",
"en:milk",
"en:pasteurised-milk",
"en:cow-s-milk",
"en:salt",
"en:lactic-ferments",
"en:ferment",
"en:microbial-culture",
"en:rennet",
"en:enzyme",
"en:coagulating-enzyme",
"en:e460"
],
"ingredients_n" : 7,
"ingredients_n_tags" : [
"7",
"1-10"
],
"ingredients_original_tags" : [
"en:mozzarella",
"en:anti-caking-agent",
"en:pasteurised-cow-s-milk",
"en:salt",
"en:lactic-ferments",
"en:rennet",
"en:e460"
],
"ingredients_percent_analysis" : 1,
"ingredients_tags" : [
"en:mozzarella",
"en:dairy",
"en:cheese",
"en:anti-caking-agent",
"en:pasteurised-cow-s-milk",
"en:milk",
"en:pasteurised-milk",
"en:cow-s-milk",
"en:salt",
"en:lactic-ferments",
"en:ferment",
"en:microbial-culture",
"en:rennet",
"en:enzyme",
"en:coagulating-enzyme",
"en:e460"
],
"ingredients_text" : "Formatge mozzarella (llet de vaca pasteuritzada, sal, ferments làctics i quall) i antiaglomerant (cel·lulosa).",
"ingredients_with_specified_percent_n" : 0,
"ingredients_with_specified_percent_sum" : 0,
"ingredients_with_unspecified_percent_n" : 5,
"ingredients_with_unspecified_percent_sum" : 100,
"ingredients_without_ciqual_codes" : [
"en:e460",
"en:lactic-ferments",
"en:pasteurised-cow-s-milk",
"en:rennet"
],
"ingredients_without_ciqual_codes_n" : 4,
"known_ingredients_n" : 16,
"lc" : "ca",
"nutriments" : {
"fruits-vegetables-nuts-estimate-from-ingredients_100g" : 0,
"fruits-vegetables-nuts-estimate-from-ingredients_serving" : 0
},
"unknown_ingredients_n" : 0
}
9 changes: 9 additions & 0 deletions tests/unit/ingredients.t
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,15 @@ Origin of peaches: Spain. Origin of some unknown ingredient: France. origin of A
ingredients_text => "砂糖、小麦粉、全粉乳、カカオマス、ショートニング、植物油脂、ココアバター、小麦全粒粉、小麦ふすま、食塩、小麦胚芽 / 加工デンプン、乳化剤(大豆由来)、膨脹剤、香料",
}
],
# U+00B7 "·" (Middle Dot) is a character found inside ingredient names in some languages (e.g. Catalan)
[
"ca-middle-dot",
{
lc => "ca",
ingredients_text =>
"Formatge mozzarella (llet de vaca pasteuritzada, sal, ferments làctics i quall) i antiaglomerant (cel·lulosa).",
}
],
);

my $json = JSON->new->allow_nonref->canonical;
Expand Down

0 comments on commit 6e05eb8

Please sign in to comment.