From fee8d3ddd4fe4675e0ed1d423d57b19e6ba78c76 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Thu, 7 Mar 2024 20:00:51 +0100 Subject: [PATCH] feat: revamped taxonomy sort script (#9818) to really sort everything (within an entry) --- .github/labeler.yml | 1 - Makefile | 18 +- scripts/sort_languages_in_taxonomy.pl | 62 ------- .../taxonomies/sort_each_taxonomy_entry.pl | 171 ++++++++++++++++++ .../taxonomies/sort_each_taxonomy_entry.sh | 43 +++++ 5 files changed, 230 insertions(+), 65 deletions(-) delete mode 100755 scripts/sort_languages_in_taxonomy.pl create mode 100755 scripts/taxonomies/sort_each_taxonomy_entry.pl create mode 100755 scripts/taxonomies/sort_each_taxonomy_entry.sh diff --git a/.github/labeler.yml b/.github/labeler.yml index 4993315f92167..f1b19f4535c3f 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -35,7 +35,6 @@ Stores: 🧬 Taxonomies: - taxonomies/**/* - scripts/taxonomies/**/* -- scripts/sort_languages_in_taxonomy.pl - cgi/translate_taxonomy.pl 🧬 Taxonomies - Rebuild: diff --git a/Makefile b/Makefile index e3188337e040e..6647053171534 100644 --- a/Makefile +++ b/Makefile @@ -241,8 +241,10 @@ front_build: checks: front_build front_lint check_perltidy check_perl_fast check_critic +# TODO: add check_taxonomies when taxonomies ready lint: lint_perltidy +# TODO: add lint_taxonomies when taxonomies ready tests: build_lang_test unit_test integration_test @@ -321,7 +323,7 @@ bash_test: # the ls at the end is to avoid removed files. # the first commad is to check we have git (to avoid trying to run this line inside the container on check_perl*) # We have to finally filter out "." 
as this will the output if we have no file -TO_CHECK=$(shell [ -x "`which git 2>/dev/null`" ] && git diff origin/main --name-only | grep '.*\.\(pl\|pm\|t\)$$' | grep -v "scripts/obsolete" | xargs ls -d 2>/dev/null | grep -v "^.$$" ) +TO_CHECK := $(shell [ -x "`which git 2>/dev/null`" ] && git diff origin/main --name-only | grep '.*\.\(pl\|pm\|t\)$$' | grep -v "scripts/obsolete" | xargs ls -d 2>/dev/null | grep -v "^.$$" ) check_perl_fast: @echo "🥫 Checking ${TO_CHECK}" @@ -341,7 +343,7 @@ check_perl: # check with perltidy # we exclude files that are in .perltidy_excludes -TO_TIDY_CHECK = $(shell echo ${TO_CHECK}| tr " " "\n" | grep -vFf .perltidy_excludes) +TO_TIDY_CHECK := $(shell echo ${TO_CHECK}| tr " " "\n" | grep -vFf .perltidy_excludes) check_perltidy: @echo "🥫 Checking with perltidy ${TO_TIDY_CHECK}" test -z "${TO_TIDY_CHECK}" || ${DOCKER_COMPOSE} run --rm --no-deps backend perltidy --assert-tidy -opath=/tmp/ --standard-error-output ${TO_TIDY_CHECK} @@ -358,6 +360,19 @@ check_critic: @echo "🥫 Checking with perlcritic" test -z "${TO_CHECK}" || ${DOCKER_COMPOSE} run --rm --no-deps backend perlcritic ${TO_CHECK} +# taxonomy files (.txt below a taxonomies directory) changed since origin/main; grep takes a regular expression, not a glob, and generated .result.txt files are excluded +TAXONOMIES_TO_CHECK := $(shell [ -x "`which git 2>/dev/null`" ] && git diff origin/main --name-only | grep 'taxonomies.*/.*\.txt$$' | grep -v '\.result\.txt$$' | xargs ls -d 2>/dev/null | grep -v "^.$$") + +check_taxonomies: + @echo "🥫 Checking taxonomies" + test -z "${TAXONOMIES_TO_CHECK}" || \ + ${DOCKER_COMPOSE} run --rm --no-deps backend scripts/taxonomies/sort_each_taxonomy_entry.sh --check ${TAXONOMIES_TO_CHECK} + +lint_taxonomies: + @echo "🥫 Linting taxonomies" + test -z "${TAXONOMIES_TO_CHECK}" || \ + ${DOCKER_COMPOSE} run --rm --no-deps backend scripts/taxonomies/sort_each_taxonomy_entry.sh ${TAXONOMIES_TO_CHECK} + check_openapi_v2: docker run --rm \ diff --git a/scripts/sort_languages_in_taxonomy.pl b/scripts/sort_languages_in_taxonomy.pl deleted file mode 100755 index 235b8ee48a9d1..0000000000000 --- a/scripts/sort_languages_in_taxonomy.pl +++ 
/dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/perl -w - -# This file is part of Product Opener. -# -# Product Opener -# Copyright (C) 2011-2023 Association Open Food Facts -# Contact: contact@openfoodfacts.org -# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France -# -# Product Opener is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -use Modern::Perl '2017'; -use utf8; - -use ProductOpener::Store qw/:all/; -use ProductOpener::Config qw/:all/; - -binmode(STDIN, ":encoding(UTF-8)"); -binmode(STDOUT, ":encoding(UTF-8)"); -binmode(STDERR, ":encoding(UTF-8)"); - -my $english = ""; -my $others = ""; - -while () { - - my $line = $_; - - if ($line =~ /^(\w\w):(.*)$/) { - my $lc = $1; - if ($lc eq "en") { - $english = $line; - } - else { - $others .= $line; - } - } - else { - - print $english; - print $others; - print $line; - $english = ""; - $others = ""; - } - -} - -print $english; -print $others; - diff --git a/scripts/taxonomies/sort_each_taxonomy_entry.pl b/scripts/taxonomies/sort_each_taxonomy_entry.pl new file mode 100755 index 0000000000000..a4aa0121f05ba --- /dev/null +++ b/scripts/taxonomies/sort_each_taxonomy_entry.pl @@ -0,0 +1,171 @@ +#!/usr/bin/perl -w + +# This file is part of Product Opener. 
+#
+# Product Opener
+# Copyright (C) 2011-2023 Association Open Food Facts
+# Contact: contact@openfoodfacts.org
+# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
+#
+# Product Opener is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+# Sort the lines of each entry of a taxonomy read on STDIN.
+#
+# Entries are separated by blank lines. The lines of an entry are written in this order:
+# - parents, sorted alphabetically
+# - the entry id line (the first synonym line of the entry)
+# - the other synonyms, sorted by language code
+# - the properties, sorted by property name and then by language code
+# Non meaningful lines (comments...) stay attached to the meaningful line that follows them.
+#
+# Usage: sort_each_taxonomy_entry.pl [--check] [-v] < taxonomy.txt
+#   without option: print the sorted taxonomy on STDOUT
+#   --check: do not print the taxonomy, exit with status 1 if at least one entry is not sorted
+#   -v: with --check, print the entries that are not sorted
+
+use Modern::Perl '2017';
+use utf8;
+
+use ProductOpener::Store qw/:all/;
+use ProductOpener::Config qw/:all/;
+
+my $is_check = grep {$_ eq "--check"} @ARGV;
+my $is_verbose = grep {$_ eq "-v"} @ARGV;
+my $has_changes = 0;
+
+binmode(STDIN, ":encoding(UTF-8)");
+binmode(STDOUT, ":encoding(UTF-8)");
+binmode(STDERR, ":encoding(UTF-8)");
+
+# Comparison function for sort, on keys that are either a language code ("fr")
+# or a property name followed by a language code ("description:fr").
+#
+# Keys are compared case insensitively: on the property name first (if both keys have one),
+# then on the language code, where "xx" comes first, then "en", then the other codes
+# in alphabetical order.
+#
+# The ($$) prototype is required so that sort passes the two elements in @_
+# instead of the $a and $b globals.
+#
+# Returns -1, 0 or 1, as cmp does.
+sub cmp_on_language ($$) {
+	my ($left, $right) = @_;
+	# undefined values are compared as empty strings
+	if ((!defined $left) || (!defined $right)) {
+		return ($left // "") cmp ($right // "");
+	}
+	$left = lc($left);
+	$right = lc($right);
+	my $left_prefix = undef;
+	my $right_prefix = undef;
+	if ($left =~ /^(\w+):(\w+)$/) {
+		($left_prefix, $left) = ($1, $2);
+	}
+	if ($right =~ /^(\w+):(\w+)$/) {
+		($right_prefix, $right) = ($1, $2);
+	}
+	# different properties: the property name decides
+	if ((defined $left_prefix) && (defined $right_prefix) && ($left_prefix ne $right_prefix)) {
+		return $left_prefix cmp $right_prefix;
+	}
+	return 0 if ($left eq $right);
+	# xx and en take precedence over all others
+	return -1 if ($left eq "xx");
+	return 1 if ($right eq "xx");
+	return -1 if ($left eq "en");    # because of lines above, $right ne xx
+	return 1 if ($right eq "en");    # because of lines above, $left ne xx
+	return $left cmp $right;
+}
+
+# read all in memory to take care of last line in a simple way
+my @lines = <STDIN>;
+
+# nothing to normalize (and nothing to print) for an empty input
+if (@lines) {
+	# the last line may lack its end of line: add it,
+	# otherwise it would be glued to the next line if the sort moves it
+	$lines[-1] .= "\n" unless $lines[-1] =~ /\n\z/;
+	# be sure to end with a blank line
+	push @lines, "\n" unless $lines[-1] =~ /^\s*$/;
+}
+
+# structures for one entry
+# each item is a hash with the meaningful line (line)
+# and the non meaningful lines found just above it (previous)
+my @parents = ();    # lines defining parents
+my $entry_id_line = undef;    # line defining entry id, we don't want to change its position
+# lines defining synonyms, by language code
+# we keep a list of lines for each key, so that a duplicated language is not lost
+my %entries = ();
+# lines defining properties, by "property:language code", also a list of lines for each key
+my %props = ();
+my @original_lines = ();
+# non meaningful lines above a meaningful line (entry, parent or property)
+my @previous_lines = ();
+my $line_num = 0;
+my $entry_start_line = 1;    # tracking line number of the first line of an entry
+foreach my $line (@lines) {
+	$line_num += 1;
+	push @original_lines, $line;    # collect lines for comparison
+
+	# blank line means we are changing entry, so let's print collected lines
+	if ($line =~ /^\s*$/) {
+		# sorted items: parents, line id, synonyms, props
+		my @sorted_items = sort {$a->{line} cmp $b->{line}} @parents;
+		if (defined $entry_id_line) {
+			push @sorted_items, $entry_id_line;
+		}
+		for my $key (sort cmp_on_language (keys %entries)) {
+			# lines sharing the same language keep their original order
+			push @sorted_items, @{$entries{$key}};
+		}
+		for my $key (sort cmp_on_language (keys %props)) {
+			push @sorted_items, @{$props{$key}};
+		}
+		my @output_lines = ();
+		for my $item (@sorted_items) {
+			push @output_lines, @{$item->{previous}};
+			push @output_lines, $item->{line};
+		}
+		# print remaining previous_lines (if any)
+		push @output_lines, @previous_lines;
+		# print this blank line
+		push @output_lines, $line;
+		my $original = join("", @original_lines);
+		my $output = join("", @output_lines);
+		if ($is_check) {
+			# compare with original lines
+			if ($original ne $output) {
+				$has_changes = 1;
+				if ($is_verbose) {
+					print "Error: output is not the same as original, line $entry_start_line..$line_num\n";
+					print "Original --------------------\n";
+					print "$original\n";
+					print "Sorted --------------------\n";
+					print "$output\n";
+				}
+			}
+		}
+		else {
+			print "$output";
+		}
+		# re-init
+		$entry_id_line = undef;
+		@parents = ();
+		%entries = ();
+		%props = ();
+		@previous_lines = ();
+		@original_lines = ();
+		# the next entry starts on the line after this blank line
+		$entry_start_line = $line_num + 1;
+	}
+	# parents
+	elsif ($line =~ /^</) {
+		push @parents, {line => $line, previous => [@previous_lines]};
+		@previous_lines = ();
+	}
+	# synonym
+	elsif ($line =~ /^(\w+):[^:]*(,.*)*$/) {
+		my $lc = $1;
+		if (!defined $entry_id_line) {
+			$entry_id_line = {line => $line, previous => [@previous_lines]};
+		}
+		else {
+			push @{$entries{$lc}}, {line => $line, previous => [@previous_lines]};
+		}
+		@previous_lines = ();
+	}
+	# property
+	elsif ($line =~ /^(\w+):(\w+):(.*)$/) {
+		my $prop = $1;
+		my $lc = $2;
+		push @{$props{"$prop:$lc"}}, {line => $line, previous => [@previous_lines]};
+		@previous_lines = ();
+	}
+	# comments or undefined
+	else {
+		push @previous_lines, $line;
+	}
+}
+
+# exit status is 1 only if we are checking and at least one entry is not sorted
+exit(($is_check && $has_changes) ? 1 : 0);
diff --git a/scripts/taxonomies/sort_each_taxonomy_entry.sh b/scripts/taxonomies/sort_each_taxonomy_entry.sh
new file mode 100755
index 0000000000000..2497bd87effd4
--- /dev/null
+++ b/scripts/taxonomies/sort_each_taxonomy_entry.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+
+# Sort (or only check, with --check) each entry of the given taxonomy files
+# using sort_each_taxonomy_entry.pl
+#
+# Usage: sort_each_taxonomy_entry.sh [--check] [-v] taxonomy_file...
+# Arguments starting with "-" are passed to the perl script,
+# other arguments are the taxonomy files to process.
+# Exit status is not 0 if the perl script failed on at least one file.
+
+ARGS=();
+FILES=();
+IS_CHECK="";
+
+ACTION="Linting"
+# options are passed as arguments to the script
+for arg in "$@"
+do
+    if [[ "$arg" = -* ]]
+    then
+        ARGS+=( "$arg" );
+    else
+        FILES+=( "$arg" );
+    fi
+    if [[ "$arg" = "--check" ]]
+    then
+        IS_CHECK=1;
+        ACTION="Checking"
+    fi
+done
+
+script="$(dirname "$0")/sort_each_taxonomy_entry.pl"
+FINAL_EXIT=0;
+for taxonomy in "${FILES[@]}"
+do
+    echo "$ACTION $taxonomy ==============="
+    rm -f "$taxonomy.tmp"
+    # redirect output only if we're not checking
+    ( \
+        [[ -z "$IS_CHECK" ]] && exec >"$taxonomy.tmp"; \
+        "$script" "${ARGS[@]}" <"$taxonomy"; \
+    )
+    # get the exit status of the perl script before any other command changes $?
+    EXIT=$?
+    # replace the taxonomy only if the script succeeded and produced a result
+    if [[ $EXIT -eq 0 && -s "$taxonomy.tmp" ]]
+    then
+        mv "$taxonomy.tmp" "$taxonomy"
+    else
+        rm -f "$taxonomy.tmp"
+    fi
+    if [[ $EXIT -ne 0 ]]
+    then
+        echo "=> Error in $taxonomy"
+        FINAL_EXIT=$EXIT;
+    fi
+done
+
+exit $FINAL_EXIT;
\ No newline at end of file