diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm index c0fdd1d160e7e..dc7c768f59f03 100644 --- a/lib/ProductOpener/Ingredients.pm +++ b/lib/ProductOpener/Ingredients.pm @@ -778,6 +778,12 @@ my %min_regexp = ( hr => "min|min\.|mini|minimum", ); +my %max_regexp = ( + en => "max|max\.|maximum", + fr => "max|max\.|maxi|maximum", + hr => "max|max\.|maxi|maximum", +); + # Words that can be ignored after a percent # e.g. 50% du poids total, 30% of the total weight # groups need to be non-capturing: prefixed with (?: @@ -1553,15 +1559,6 @@ sub parse_processing_from_ingredient ($ingredients_lc, $ingredient) { ingredient_recognized => $ingredient_recognized } ) if $log->is_debug(); - $log->debug( - "processing - return", - { - processings => \@processings, - ingredient => $ingredient, - ingredient_id => $ingredient_id, - ingredient_recognized => $ingredient_recognized - } - ) if $log->is_debug(); return (\@processings, $ingredient, $ingredient_id, $ingredient_recognized); } @@ -1801,15 +1798,17 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) { my $min_regexp = $min_regexp{$ingredients_lc} || ''; + my $max_regexp = $max_regexp{$ingredients_lc} || ''; + my $ignore_strings_after_percent = $ignore_strings_after_percent{$ingredients_lc} || ''; # Regular expression to find percent or quantities # $percent_or_quantity_regexp has 2 capturing group: one for the number, and one for the % sign or the unit my $percent_or_quantity_regexp = '(?:' . "(?:$prepared_with )" . ' )?' # optional produced with - . '(?:<|' . $min_regexp . '|\s|\.|:)*' # optional minimum, and separators + . '(?:>|' . $max_regexp . '|<|' . $min_regexp . '|\s|\.|:)*' # optional maximum, minimum, and separators . '(\d+(?:(?:\,|\.)\d+)?)\s*' # number, possibly with a dot or comma . '(\%|g|gr|mg|kg|ml|cl|dl|l)\s*' # % or unit - . '(?:' . $min_regexp . '|' # optional minimum + . '(?:' . $min_regexp . '|' . $max_regexp . 
'|' # optional minimum, optional maximum . $ignore_strings_after_percent . '|\s|\)|\]|\}|\*)*'; # strings that can be ignored my $per = $per{$ingredients_lc} || ' per '; @@ -2527,13 +2526,16 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) { 'hr' => [ '^u tragovima$', # in traces + 'čokolada sadrži biljne masnoće uz kakaov maslac' + , # Chocolate contains vegetable fats along with cocoa butter 'označene podebljano', # marked in bold 'savjet kod alergije', # allergy advice + 'u čokoladi kakaovi dijelovi' + , # Cocoa parts in chocolate 48%. Usually at the end of the ingredients list. Chocolate can contain many sub-ingredients (cacao, milk, sugar, etc.) 'u promjenjivim omjerima|u promjenjivim udjelima|u promijenljivom udjelu' , # in variable proportions 'uključujući žitarice koje sadrže gluten', # including grains containing gluten 'za alergene', # for allergens - 'u promjenjivim udjelima' # in variable proportions ], 'it' => ['^in proporzion[ei] variabil[ei]$',], @@ -4042,7 +4044,16 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { my $a_of_b; if (($lc eq "en") or ($lc eq "hr")) { - $a_of_b = $b . " " . $a; + # When $b starts with "with" (example: "mlijeko (s 1.0% mliječne masti)"), $b is appended after $a instead of being placed before it; only languages with an entry in %with (currently hr) can take that branch + # NOTE(review): in '(s | sa )' the second alternative starts with a space, so under the ^ anchor it only matches when $b itself begins with a space; confirm whether '(s |sa )' was intended + my %with = (hr => '(s | sa )',); + my $with = $with{$lc} || " will not match "; + if ($b =~ /^$with/i) { + $a_of_b = $a . " " . $b; + } + else { + $a_of_b = $b . " " . $a; + } } elsif ($lc eq "es") { $a_of_b = $a . " de " . 
$b; @@ -5284,6 +5295,11 @@ my %ingredients_categories_and_types = ( categories => ["slad",], types => ["ječmeni", "pšenični",] }, + # milk + { + categories => ["mlijeko",], + types => ["s 1.0% mliječne masti",] + }, ], pl => [ @@ -5421,18 +5437,18 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { or ($ingredients_lc eq "ru") or ($ingredients_lc eq "pl")) { - # vegetable oil (palm, sunflower and olive) + # vegetable oil (palm, sunflower and olive) -> palm vegetable oil, sunflower vegetable oil, olive vegetable oil $text =~ s/($category_regexp)(?::|\(|\[| | $of )+((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, |$and|$of|$and_of|$and_or)+)+($type_regexp)($symbols_regexp|\s)*)\b(\s?(\)|\]))?/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names})/ieg; - # vegetable oil (palm) + # vegetable oil (palm) -> palm vegetable oil $text =~ s/($category_regexp)\s?(?:\(|\[)\s?($type_regexp)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool,$categories_and_types_ref->{alternate_names})/ieg; # vegetable oil: palm $text =~ s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|.|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool,$categories_and_types_ref->{alternate_names})/ieg; - # ječmeni i pšenični slad (barley and wheat malt) + # ječmeni i pšenični slad (barley and wheat malt) -> ječmeni slad, pšenični slad $text =~ s/((?:(?:$type_regexp)(?: |\/| \/ | - |,|, |$and|$of|$and_of|$and_or)+)+(?:$type_regexp))\s*($category_regexp)/normalize_enumeration($ingredients_lc,$2,$1,$of_bool,$categories_and_types_ref->{alternate_names})/ieg; } diff --git a/taxonomies/additives.txt b/taxonomies/additives.txt index e6d06765bdeed..ce3dd547b6d4c 100644 --- a/taxonomies/additives.txt +++ b/taxonomies/additives.txt @@ -6706,7 +6706,7 @@ es:E300, Ácido ascórbico, Ácido l-ascórbico, Ácido L-ascórbico et:E300, Askorbiinhape fi:E300, Askorbiinihappo, L-askorbiinihappo, Askorbiinihappoa, 
L-askorbiinihappoa, c-vitamiini fr:E300, Acide ascorbique, Acide L-ascorbique, Acide ascorbique (L-), Acide L(+)-ascorbique, Ascorbate, vitamine c -hr:E300, askorbinska kiselina, l-askorbinska kiselina, askrobinska kiselina +hr:E300, askorbinska kiselina, l-askorbinska kiselina, askrobinska kiselina, askorbinska hu:E300, Aszkorbinsav, l-aszkorbinsav it:E300, Acido ascorbico, acido l-ascorbico, Ascorbato lt:E300, Askorbo rūgštis, l-askorbo rūgštis, Askorbinas @@ -12060,7 +12060,7 @@ es:E433, Monooleato de sorbitán polioxietilenado, Polioxietilen sorbitan monool et:E433, Polüoksüetüleen sorbitaanmonooleaat, Polüsorbaat 80 fi:E433, Polyoksyetyleenisorbitaanimono-oleaatti, Polysorbaatti 80, Polyoksyetyleenisorbitaanimono-oleaattia fr:E433, Monooléate de polyoxyéthylène de sorbitane, polysorbate 80, Polyoxyethylene sorbitan monooleate (polysorbate 80) -hr:E433 +hr:E433, polioksietilen sorbitan monooleat hu:E433, Polioxietilén-szorbitan-monooleát, Poliszorbát 80, Polioxietilén(20)-szorbitán-oleát it:E433, Monoleato di poliossietilene sorbitano, Polisorbato 80 lt:E433, Polioksietileno sorbitano monooleatas, Polisorbatas 80 @@ -14289,7 +14289,7 @@ es:E471, mono- y diglicéridos de ácidos grasos, Monoglicéridos y diglicérido et:E471, Rasvhapete mono- ja diglütseriidid, Glütserüülmonostearaat, glütserüülmonopalmitaat, glütserüülmonooleaat, monosteariin, monopalmitiin fi:E471, Rasvahappojen mono- ja diglyseridit, Glyseryylimonostearaatti, Glyseryylimonopalmitaatti, Glyseryylimono-oleaatti, Monosteariini, Monopalmitiini, Mono-oleiini fr:E471, Mono- et diglycérides d'acides gras, Mono- et diglycérides d'acides gras alimentaires, Monoglycérides et diglycérides d'acides gras, Mono et diglycérides d'acides gras, Monostéarate de glycérine, Monopalmitate de glycérine, Monooléate de glycérine, Monostéarine, monopalmitine, monooléine, Mono and diglycerides of fatty acids, glyceryl monostearate, glyceryl distearate , Monostéarine -hr:E471, mono- i digliceridi masnih kiselina, mono - i 
digliceridi masnih kiselina, emulgator mono - i digliceridi masnih kiselina, emuglator e471, emulgator mono i digliceridi masnih kiselina, monogliceridi i digliceridi masnih kiselina e471 +hr:E471, mono- i digliceridi masnih kiselina, mono - i digliceridi masnih kiselina, emulgator mono - i digliceridi masnih kiselina, emuglator e471, emulgator mono i digliceridi masnih kiselina, monogliceridi i digliceridi masnih kiselina e471, mono- i diglicerida masnih kiselina, mono - i diglicerida masnih kiselina hu:E471, Zsírsavak mono- és digliceridjei, Gliceril-monosztearát, Gliceril-monopalmitát, Gliceril-monooleát, Monosztearin, monopalmitin, monoolein it:E471, Mono- e digliceridi degli acidi grassi, Monostearato di glicerile, monopalmitato di glicerile, monooleato di glicerile, monostearina, monopalmitina, monooleina, mono- e digliceridi degli acidi grassi alimentari lt:E471, Riebalų rūgščių mono- ir digliceridai, Glicerilmonostearatas, glicerilmonopalmitatas, glicerilmonooleatas, monostearinas, monopalmitinas, monooleinas diff --git a/taxonomies/ingredients.txt b/taxonomies/ingredients.txt index 38c13372554ed..cf9dd1bed1bae 100644 --- a/taxonomies/ingredients.txt +++ b/taxonomies/ingredients.txt @@ -3173,6 +3173,10 @@ uk:Коров'яче молоко zh:牛奶 wikipedia:fr:https://fr.wikipedia.org/wiki/Lait_de_vache + "hr", ingredients_text => "Pasterizirano mlijeko (s 1.0% mliječne masti)"}, + [ + { + 'id' => 'en:milk-with-1-0-milk-fat', + 'processing' => 'en:pasteurised', + 'text' => 'mlijeko s 1.0% mliječne masti' + } + ] + ], ################################################################## #