From d763210a911bbce6b00b2ce28da3b4f9fc299d72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 28 Jul 2023 16:16:19 +0200 Subject: [PATCH] fix: return empty noindex webpage when crawlers hit specific pages (#8744) * fix: return empty noindex webpage when crawlers hit specific pages when a crawler hit nested facets (ex: /category/popcorn-with-caramel/data-quality-error/nutrition-value-total-over-105) we return a blank HTML page with a noindex directive to prevent the crawler from overloading our servers. * fix: fix Seekport bot directive in robots.txt The User-Agent has changed: see https://bot.seekport.com/ * fix: fix unit tests * fix: don't let bot index most facet pages Crawling bots can't visit all page and crawl OFF continuously. We want to limit crawlers on interesting pages, so we return a noindex html page on most facet pages (except most interesting ones such as brand, category,...) --- cgi/display.pl | 7 ++ html/robots.txt | 2 +- lib/ProductOpener/Display.pm | 60 ++++++++++++ lib/ProductOpener/Routing.pm | 23 +++++ stop_words.txt | 2 + tests/integration/facet_page_crawler.t | 122 +++++++++++++++++++++++++ tests/unit/routing.t | 15 ++- 7 files changed, 225 insertions(+), 6 deletions(-) create mode 100644 tests/integration/facet_page_crawler.t diff --git a/cgi/display.pl b/cgi/display.pl index c0ae3c24aeb27..281c0badac447 100755 --- a/cgi/display.pl +++ b/cgi/display.pl @@ -110,6 +110,13 @@ return Apache2::Const::OK; } +if ($request_ref->{no_index} eq 1) { + # The request is made from a known web crawler and the web-page shouldn't be indexed: + # return directly a "noindex" empty HTML page + display_no_index_page_and_exit(); + return Apache2::Const::OK; +} + $log->debug( "after analyze_request", { diff --git a/html/robots.txt b/html/robots.txt index 48990e897a5d4..236673a1b61cf 100644 --- a/html/robots.txt +++ b/html/robots.txt @@ -43,7 +43,7 @@ User-agent: Cliqzbot/3.0 Disallow: / User-agent: Cliqzbot Disallow: / -User-agent: Seekport Crawler +User-agent: SeekportBot Disallow: / User-agent: Paracrawl Disallow: / diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm index 78ddd6dcec169..2237f09c83d8b 100644 --- a/lib/ProductOpener/Display.pm +++ b/lib/ProductOpener/Display.pm @@ -55,6 +55,7 @@ BEGIN { &display_icon &display_structured_response + &display_no_index_page_and_exit &display_page &display_text &display_points @@ -882,10 +883,37 @@ CSS $request_ref->{cc} = $cc; $request_ref->{country} = $country; $request_ref->{lcs} = \@lcs; + set_user_agent_request_ref_attributes($request_ref); return $request_ref; } +=head2 set_user_agent_request_ref_attributes ($request_ref) + +Set two attributes to `request_ref`: + +- `user_agent`: the request User-Agent +- `is_crawl_bot`: a flag (0 or 1) that indicates whether the request comes + from a known web crawler (Google, Bing,...). We only use User-Agent value + to set this flag. 
+ +=cut + +sub set_user_agent_request_ref_attributes ($request_ref) { + my $user_agent = user_agent(); + $request_ref->{user_agent} = $user_agent; + + my $is_crawl_bot = 0; + if ($user_agent + =~ /Googlebot|Googlebot-Image|bingbot|Applebot|YandexBot|YandexRenderResourcesBot|DuckDuckBot|DotBot|SeekportBot|AhrefsBot|DataForSeoBot|SeznamBot|ZoomBot|MojeekBot|QRbot|www\.qwant\.com|facebookexternalhit/ + ) + { + $is_crawl_bot = 1; + } + $request_ref->{is_crawl_bot} = $is_crawl_bot; + return; +} + sub _get_date ($t) { if (defined $t) { @@ -1009,6 +1037,38 @@ sub display_error_and_exit ($error_message, $status_code) { exit(); } +=head2 display_no_index_page_and_exit () + +Return an empty HTML page with a '' directive +in the HTML header. + +This is useful to prevent web crawlers to overload our servers by querying webpages +that require a lot of resources (especially aggregation queries). + +=cut + +sub display_no_index_page_and_exit () { + my $html + = '

<html><head><meta name="robots" content="noindex"></head>
<body><h1>NOINDEX</h1>
<p>We detected that your browser is a web crawling bot, and this page should not be indexed by web crawlers. If this is unexpected, contact us on Slack or write us an email at contact@openfoodfacts.org.</p>
</body></html>
'; + my $http_headers_ref = { + '-status' => 200, + '-expires' => '-1d', + '-charset' => 'UTF-8', + }; + + print header(%$http_headers_ref); + + my $r = Apache2::RequestUtil->request(); + $r->rflush; + # Setting the status makes mod_perl append a default error to the body + # Send 200 instead. + $r->status(200); + binmode(STDOUT, ":encoding(UTF-8)"); + print $html; + return; + exit(); +} + # Specific index for producer on the platform for producers sub display_index_for_producer ($request_ref) { diff --git a/lib/ProductOpener/Routing.pm b/lib/ProductOpener/Routing.pm index 1e0d9975cb53f..49cf5da2b45ed 100644 --- a/lib/ProductOpener/Routing.pm +++ b/lib/ProductOpener/Routing.pm @@ -94,6 +94,11 @@ sub analyze_request ($request_ref) { $request_ref->{query_string} = $request_ref->{original_query_string}; + # `no_index` specifies whether we send an empty HTML page with a + # in the HTML headers. This is only done for known web crawlers (Google, Bing, Yandex,...) on webpages that + # trigger heavy DB aggregation queries and overload our server. + $request_ref->{no_index} = 0; + $log->debug("analyzing query_string, step 0 - unmodified", {query_string => $request_ref->{query_string}}) if $log->is_debug(); @@ -564,6 +569,24 @@ sub analyze_request ($request_ref) { $request_ref->{text} = 'index-pro'; } + # Return noindex empty HTML page for web crawlers that crawl specific facet pages + if (($request_ref->{is_crawl_bot} eq 1) and (defined $request_ref->{tagtype})) { + if ($request_ref->{tagtype} !~ /^brands|categories|labels|additives|nova-groups|ecoscore|nutrition-grades$/) { + # Only allow indexation of a selected number of facets + # Ingredients were left out because of the number of possible ingredients (1.2M) + $request_ref->{no_index} = 1; + } + elsif ($request_ref->{page} >= 2) { + # Don't index facet pages with page number > 1 (we want only 1 index page per facet value) + $request_ref->{no_index} = 1; + } + elsif (defined $request_ref->{tagtype2}) { + # Don't index web pages with 2 nested tags: as an example, there are billions of combinations for + # category x ingredient alone + $request_ref->{no_index} = 1; + } + } + $log->debug("request analyzed", {lc => $lc, lang => $lang, request_ref => $request_ref}) if $log->is_debug(); return 1; diff --git a/stop_words.txt b/stop_words.txt index 7e178fd9d27d2..32a37106f6db0 100644 --- a/stop_words.txt +++ b/stop_words.txt @@ -223,3 +223,5 @@ weighers www xml gzipped +webpages +bing \ No newline at end of file diff --git a/tests/integration/facet_page_crawler.t b/tests/integration/facet_page_crawler.t new file mode 100644 index 0000000000000..1fa89631d385c --- /dev/null +++ b/tests/integration/facet_page_crawler.t @@ -0,0 +1,122 @@ +#!/usr/bin/perl -w + +use ProductOpener::PerlStandards; + +use Test::More; +use ProductOpener::APITest qw/:all/; +use ProductOpener::Test qw/:all/; +use ProductOpener::TestDefaults qw/:all/; + +use File::Basename "dirname"; + +use Storable qw(dclone); + +remove_all_users(); + +remove_all_products(); + +wait_application_ready(); + +my $ua = new_client(); + +my $CRAWLING_BOT_USER_AGENT + = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/'; +my $NORMAL_USER_USER_AGENT + = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0'; + +my %product_form = ( + %{dclone(\%default_product_form)}, + ( + code => '0200000000235', + product_name => "Only-Product", + ) 
+); + +edit_product($ua, \%product_form); + +my $tests_ref = [ + # Normal user should have access to product page + { + test_case => 'normal-user-access-product-page', + method => 'GET', + path => '/product/0200000000235/only-product', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => 'Only-Product - 100 g' + }, + # Crawling bot should have access to product page + { + test_case => 'crawler-access-product-page', + method => 'GET', + path => '/product/0200000000235/only-product', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => 'Only-Product - 100 g' + }, + # Crawling bot should receive a noindex page for nested facets + { + test_case => 'crawler-access-nested-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads/brand/nutella', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => '

<h1>NOINDEX</h1>
' + }, + # Normal user should have access to nested facets + { + test_case => 'normal-user-access-nested-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads/brand/nutella', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_not_match => '

<h1>NOINDEX</h1>
' + }, + # Crawling bot should have access to specific facet pages (such as category) + { + test_case => 'crawler-access-category-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_not_match => '

<h1>NOINDEX</h1>
' + }, + # Normal user should have access to facet pages + { + test_case => 'normal-user-access-category-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_not_match => '

<h1>NOINDEX</h1>
' + }, + # Crawling bot should receive a noindex page for most facet pages (including editor facet) + { + test_case => 'crawler-access-editor-facet-page', + method => 'GET', + path => '/editor/unknown-user', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => '

<h1>NOINDEX</h1>
' + }, + # Normal user should have access to editor facet + { + test_case => 'normal-user-access-editor-facet-page', + method => 'GET', + path => '/editor/unknown-user', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => 'Unknown user.' + }, +]; + +execute_api_tests(__FILE__, $tests_ref); + +done_testing(); diff --git a/tests/unit/routing.t b/tests/unit/routing.t index b49d45fbea8ba..9c9bfd9c8a250 100644 --- a/tests/unit/routing.t +++ b/tests/unit/routing.t @@ -32,7 +32,8 @@ my @tests = ( 'lc' => 'en', 'original_query_string' => 'api/v0/attribute_groups', 'page' => 1, - 'query_string' => 'api/v0/attribute_groups' + 'query_string' => 'api/v0/attribute_groups', + 'no_index' => '0' }, }, { @@ -56,7 +57,8 @@ my @tests = ( 'tag' => 'en:breads', 'tag_prefix' => '', 'tagid' => 'en:breads', - 'tagtype' => 'categories' + 'tagtype' => 'categories', + 'no_index' => '0' }, }, { @@ -78,7 +80,8 @@ my @tests = ( 'tag' => 'en:breads', 'tag_prefix' => '', 'tagid' => 'en:breads', - 'tagtype' => 'categories' + 'tagtype' => 'categories', + 'no_index' => '0' }, }, { @@ -100,7 +103,8 @@ my @tests = ( 'tag' => 'en:breads', 'tag_prefix' => '', 'tagid' => 'en:breads', - 'tagtype' => 'categories' + 'tagtype' => 'categories', + 'no_index' => '0' }, }, { @@ -122,7 +126,8 @@ my @tests = ( 'tag' => 'en:bread', 'tag_prefix' => '', 'tagid' => 'en:bread', - 'tagtype' => 'categories' + 'tagtype' => 'categories', + 'no_index' => '0' }, },
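
As a simplified, standalone sketch of the policy this patch implements: the snippet below restates the crawler detection (Display.pm) and the facet noindex decision (Routing.pm) as two small subs that can be run and poked at in isolation. The helper names (is_crawl_bot, should_noindex), the trimmed User-Agent list and the grouped facet-type alternation are illustrative assumptions, not the exact ProductOpener code.

#!/usr/bin/perl
# Standalone illustration (not part of the patch) of the crawler/noindex policy.
use strict;
use warnings;

# Roughly what set_user_agent_request_ref_attributes() does: flag known crawlers
# based solely on the User-Agent header (list shortened here for readability).
sub is_crawl_bot {
	my ($user_agent) = @_;
	return ($user_agent
			=~ /Googlebot|bingbot|Applebot|YandexBot|DuckDuckBot|SeekportBot|AhrefsBot|SeznamBot|MojeekBot|QRbot|facebookexternalhit/
	) ? 1 : 0;
}

# Roughly what the Routing.pm hunk does: crawlers get the noindex page on every
# facet page except page 1 of a short allow-list of facet types, and always on
# nested (2-tag) facets.
sub should_noindex {
	my ($request_ref) = @_;
	return 0 unless $request_ref->{is_crawl_bot};
	return 0 unless defined $request_ref->{tagtype};
	return 1
		if $request_ref->{tagtype}
		!~ /^(?:brands|categories|labels|additives|nova-groups|ecoscore|nutrition-grades)$/;
	return 1 if ($request_ref->{page} // 1) >= 2;
	return 1 if defined $request_ref->{tagtype2};
	return 0;
}

my $bot_ua = 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)';

# /category/hazelnut-spreads/brand/nutella -> nested facet, noindex for a crawler
print should_noindex({is_crawl_bot => is_crawl_bot($bot_ua), tagtype => 'categories', tagtype2 => 'brands', page => 1}), "\n";    # 1
# /category/hazelnut-spreads (page 1) -> stays indexable
print should_noindex({is_crawl_bot => is_crawl_bot($bot_ua), tagtype => 'categories', page => 1}), "\n";    # 0
# /editor/unknown-user -> facet type not in the allow-list, noindex for a crawler
print should_noindex({is_crawl_bot => is_crawl_bot($bot_ua), tagtype => 'editors', page => 1}), "\n";    # 1

One design detail the sketch makes explicit: in the patch itself the check is written as /^brands|categories|...|nutrition-grades$/, where the ^ and $ anchors bind only to the first and last alternatives, so the other facet types are matched as substrings; the grouped (?:...) form above spells out the intended exact-match semantics.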