From aabeabccb430e70018956d58015dec4e9c4003c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Wed, 2 Aug 2023 10:20:02 +0200 Subject: [PATCH] fix: don't allow bot crawlers to index unsupported lc for cc ex: fr-es.openfoodfacts.org shouldn't be indexable by web crawlers See https://github.com/openfoodfacts/openfoodfacts-server/issues/8779 for more context --- lib/ProductOpener/Display.pm | 18 ++++++++++++++- lib/ProductOpener/Routing.pm | 6 +---- tests/integration/facet_page_crawler.t | 31 ++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm index 504fa85ea0354..21168d4f76bf1 100644 --- a/lib/ProductOpener/Display.pm +++ b/lib/ProductOpener/Display.pm @@ -566,6 +566,14 @@ sub init_request ($request_ref = {}) { # remove the / to normalize the query string, as we use it to build some redirect urls $request_ref->{original_query_string} =~ s/^\///; + # Set $request_ref->{is_crawl_bot} + set_user_agent_request_ref_attributes($request_ref); + + # `no_index` specifies whether we send an empty HTML page with a + # <meta name="robots" content="noindex"> in the HTML headers. This is only done for known web crawlers (Google, Bing, Yandex,...) on webpages that + # trigger heavy DB aggregation queries and overload our server. + $request_ref->{no_index} = 0; + # TODO: global variables should be moved to $request_ref $styles = ''; $scripts = ''; @@ -614,6 +622,7 @@ sub init_request ($request_ref = {}) { ($cc, $country, $lc) = ('world', 'en:world', 'en'); } elsif (defined $country_codes{$subdomain}) { + # subdomain is the country code: fr.openfoodfacts.org, uk.openfoodfacts.org,... local $log->context->{subdomain_format} = 1; $cc = $subdomain; @@ -633,6 +642,7 @@ sub init_request ($request_ref = {}) { } elsif ($subdomain =~ /(.*?)-(.*)/) { + # subdomain contains the country code and the language code: world-fr.openfoodfacts.org, ch-it.openfoodfacts.org,... 
local $log->context->{subdomain_format} = 2; $log->debug("subdomain in cc-lc format - checking values", {subdomain => $subdomain, lc => $lc, cc => $cc, country => $country}) @@ -733,6 +743,13 @@ sub init_request ($request_ref = {}) { } } + # If lc is not one of the official languages of the country and if the request comes from + # a bot crawler, don't index the webpage (return an empty noindex HTML page) + # It also disables every world-{lc} combination for lc != 'en' for web crawlers + if (!($lc ~~ $country_languages{$cc}) and ($request_ref->{is_crawl_bot} eq 1)) { + $request_ref->{no_index} = 1; + } + # select the nutriment table format according to the country $nutriment_table = $cc_nutriment_table{default}; if (exists $cc_nutriment_table{$cc}) { @@ -883,7 +900,6 @@ CSS $request_ref->{cc} = $cc; $request_ref->{country} = $country; $request_ref->{lcs} = \@lcs; - set_user_agent_request_ref_attributes($request_ref); return $request_ref; } diff --git a/lib/ProductOpener/Routing.pm b/lib/ProductOpener/Routing.pm index a988ecc56e6f5..4fb57fcaf2d6e 100644 --- a/lib/ProductOpener/Routing.pm +++ b/lib/ProductOpener/Routing.pm @@ -94,11 +94,6 @@ sub analyze_request ($request_ref) { $request_ref->{query_string} = $request_ref->{original_query_string}; - # `no_index` specifies whether we send an empty HTML page with a - # <meta name="robots" content="noindex"> in the HTML headers. This is only done for known web crawlers (Google, Bing, Yandex,...) on webpages that - # trigger heavy DB aggregation queries and overload our server. 
- $request_ref->{no_index} = 0; - $log->debug("analyzing query_string, step 0 - unmodified", {query_string => $request_ref->{query_string}}) if $log->is_debug(); @@ -576,6 +571,7 @@ sub analyze_request ($request_ref) { # Return noindex empty HTML page for web crawlers that crawl specific facet pages if ($request_ref->{is_crawl_bot} eq 1) { if (defined $request_ref->{groupby_tagtype}) { + # $request_ref->{no_index} is set to 0 by default in init_request() $request_ref->{no_index} = 1; } elsif (defined $request_ref->{tagtype}) { diff --git a/tests/integration/facet_page_crawler.t b/tests/integration/facet_page_crawler.t index 47a6f63a3e7dd..d5e1b9b49d03f 100644 --- a/tests/integration/facet_page_crawler.t +++ b/tests/integration/facet_page_crawler.t @@ -156,6 +156,37 @@ my $tests_ref = [ expected_type => 'html', response_content_must_not_match => 'Fetching facet knowledge panel' }, + # Normal user should get access to every possible cc-lc combination + { + test_case => 'normal-user-get-non-official-cc-lc', + method => 'GET', + path => '/?cc=ch&lc=es', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_not_match => '

NOINDEX

' + }, + # Crawling bot should not have access to non official cc-lc combination + # Here lc=es is not an official language of cc=ch + { + test_case => 'crawler-get-non-official-cc-lc', + method => 'GET', + path => '/?cc=ch&lc=es', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => '

NOINDEX

' + }, + # Crawling bot should not have access to world-{lc} where lc != en + { + test_case => 'crawler-get-world-non-en-lc', + method => 'GET', + path => '/?cc=world&lc=es', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => '

NOINDEX

' + }, ]; execute_api_tests(__FILE__, $tests_ref);