From aabeabccb430e70018956d58015dec4e9c4003c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= <raphael@bournhonesque.eu>
Date: Wed, 2 Aug 2023 10:20:02 +0200
Subject: [PATCH] fix: don't allow bot crawlers to index unsupported lc for cc

e.g. fr-es.openfoodfacts.org should not be indexable by web crawlers.
See https://github.com/openfoodfacts/openfoodfacts-server/issues/8779
for more context.
---
 lib/ProductOpener/Display.pm           | 18 ++++++++++++++-
 lib/ProductOpener/Routing.pm           |  6 +----
 tests/integration/facet_page_crawler.t | 31 ++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm
index 504fa85ea0354..21168d4f76bf1 100644
--- a/lib/ProductOpener/Display.pm
+++ b/lib/ProductOpener/Display.pm
@@ -566,6 +566,14 @@ sub init_request ($request_ref = {}) {
 	# remove the / to normalize the query string, as we use it to build some redirect urls
 	$request_ref->{original_query_string} =~ s/^\///;
 
+	# Set $request_ref->{is_crawl_bot}
+	set_user_agent_request_ref_attributes($request_ref);
+
+	# `no_index` specifies whether we send an empty HTML page with a <meta name="robots" content="noindex">
+	# in the HTML headers. This is only done for known web crawlers (Google, Bing, Yandex,...) on webpages that
+	# trigger heavy DB aggregation queries and overload our server.
+	$request_ref->{no_index} = 0;
+
 	# TODO: global variables should be moved to $request_ref
 	$styles = '';
 	$scripts = '';
@@ -614,6 +622,7 @@ sub init_request ($request_ref = {}) {
 		($cc, $country, $lc) = ('world', 'en:world', 'en');
 	}
 	elsif (defined $country_codes{$subdomain}) {
+		# subdomain is the country code: fr.openfoodfacts.org, uk.openfoodfacts.org,...
 		local $log->context->{subdomain_format} = 1;
 
 		$cc = $subdomain;
@@ -633,6 +642,7 @@ sub init_request ($request_ref = {}) {
 
 	}
 	elsif ($subdomain =~ /(.*?)-(.*)/) {
+		# subdomain contains the country code and the language code: world-fr.openfoodfacts.org, ch-it.openfoodfacts.org,...
 		local $log->context->{subdomain_format} = 2;
 		$log->debug("subdomain in cc-lc format - checking values",
 			{subdomain => $subdomain, lc => $lc, cc => $cc, country => $country})
@@ -733,6 +743,13 @@ sub init_request ($request_ref = {}) {
 		}
 	}
 
+	# If lc is not one of the official languages of the country and if the request comes from
+	# a bot crawler, don't index the webpage (return an empty noindex HTML page)
+	# This also disables every world-{lc} combination with lc != 'en' for web crawlers
+	if (!($lc ~~ $country_languages{$cc}) and ($request_ref->{is_crawl_bot} eq 1)) {
+		$request_ref->{no_index} = 1;
+	}
+
 	# select the nutriment table format according to the country
 	$nutriment_table = $cc_nutriment_table{default};
 	if (exists $cc_nutriment_table{$cc}) {
@@ -883,7 +900,6 @@ CSS
 	$request_ref->{cc} = $cc;
 	$request_ref->{country} = $country;
 	$request_ref->{lcs} = \@lcs;
-	set_user_agent_request_ref_attributes($request_ref);
 
 	return $request_ref;
 }
diff --git a/lib/ProductOpener/Routing.pm b/lib/ProductOpener/Routing.pm
index a988ecc56e6f5..4fb57fcaf2d6e 100644
--- a/lib/ProductOpener/Routing.pm
+++ b/lib/ProductOpener/Routing.pm
@@ -94,11 +94,6 @@ sub analyze_request ($request_ref) {
 
 	$request_ref->{query_string} = $request_ref->{original_query_string};
 
-	# `no_index` specifies whether we send an empty HTML page with a <meta name="robots" content="noindex">
-	# in the HTML headers. This is only done for known web crawlers (Google, Bing, Yandex,...) on webpages that
-	# trigger heavy DB aggregation queries and overload our server.
-	$request_ref->{no_index} = 0;
-
 	$log->debug("analyzing query_string, step 0 - unmodified", {query_string => $request_ref->{query_string}})
 		if $log->is_debug();
 
@@ -576,6 +571,7 @@ sub analyze_request ($request_ref) {
 	# Return noindex empty HTML page for web crawlers that crawl specific facet pages
 	if ($request_ref->{is_crawl_bot} eq 1) {
 		if (defined $request_ref->{groupby_tagtype}) {
+			# $request_ref->{no_index} is set to 0 by default in init_request()
 			$request_ref->{no_index} = 1;
 		}
 		elsif (defined $request_ref->{tagtype}) {
diff --git a/tests/integration/facet_page_crawler.t b/tests/integration/facet_page_crawler.t
index 47a6f63a3e7dd..d5e1b9b49d03f 100644
--- a/tests/integration/facet_page_crawler.t
+++ b/tests/integration/facet_page_crawler.t
@@ -156,6 +156,37 @@ my $tests_ref = [
 		expected_type => 'html',
 		response_content_must_not_match => 'Fetching facet knowledge panel'
 	},
+	# Normal user should get access to every possible cc-lc combination
+	{
+		test_case => 'normal-user-get-non-official-cc-lc',
+		method => 'GET',
+		path => '/?cc=ch&lc=es',
+		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
+		expected_status_code => 200,
+		expected_type => 'html',
+		response_content_must_not_match => '<h1>NOINDEX</h1>'
+	},
+	# Crawling bots should not have access to non-official cc-lc combinations
+	# Here lc=es is not an official language of cc=ch
+	{
+		test_case => 'crawler-get-non-official-cc-lc',
+		method => 'GET',
+		path => '/?cc=ch&lc=es',
+		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
+		expected_status_code => 200,
+		expected_type => 'html',
+		response_content_must_match => '<h1>NOINDEX</h1>'
+	},
+	# Crawling bot should not have access to world-{lc} where lc != en (here cc=world, lc=es)
+	{
+		test_case => 'crawler-get-world-non-en-lc',
+		method => 'GET',
+		path => '/?cc=world&lc=es',
+		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
+		expected_status_code => 200,
+		expected_type => 'html',
+		response_content_must_match => '<h1>NOINDEX</h1>'
+	},
 ];
 
 execute_api_tests(__FILE__, $tests_ref);