Skip to content

Commit

Permalink
fix: don't let bot index most facet pages
Browse files Browse the repository at this point in the history
Crawling bots can't visit all pages and crawl OFF continuously.
We want to limit crawlers on interesting pages, so we return a noindex
html page on most facet pages (except most interesting ones such as
brand, category,...)
  • Loading branch information
raphael0202 committed Jul 28, 2023
1 parent 3f20822 commit 0e82a6f
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 7 deletions.
2 changes: 1 addition & 1 deletion lib/ProductOpener/Display.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1049,7 +1049,7 @@ that require a lot of resources (especially aggregation queries).

sub display_no_index_page_and_exit () {
my $html
= '<html><head><meta name="robots" content="noindex"></head><body><h1>NOINDEX</h1><p>We detected that your browser is a web crawling bot, and this page should not be indexed by web crawlers. If this is is unexpected, contact us on Slack or write us an email at <a href="mailto:[email protected]">[email protected]</a>.</p></body></html>';
= '<!DOCTYPE html><html><head><meta name="robots" content="noindex"></head><body><h1>NOINDEX</h1><p>We detected that your browser is a web crawling bot, and this page should not be indexed by web crawlers. If this is unexpected, contact us on Slack or write us an email at <a href="mailto:[email protected]">[email protected]</a>.</p></body></html>';
my $http_headers_ref = {
'-status' => 200,
'-expires' => '-1d',
Expand Down
24 changes: 18 additions & 6 deletions lib/ProductOpener/Routing.pm
Original file line number Diff line number Diff line change
Expand Up @@ -484,12 +484,6 @@ sub analyze_request ($request_ref) {
}
my $tagtype = $request_ref->{tagtype2};

if ($request_ref->{is_crawl_bot} eq 1) {
# Don't index web pages with 2 nested tags: as an example, there are billions of combinations for
# category x ingredient alone
$request_ref->{no_index} = 1;
}

if (($#components >= 0)) {
$request_ref->{tag2} = shift @components;

Expand Down Expand Up @@ -575,6 +569,24 @@ sub analyze_request ($request_ref) {
$request_ref->{text} = 'index-pro';
}

# Return noindex empty HTML page for web crawlers that crawl specific facet pages
if (($request_ref->{is_crawl_bot} eq 1) and (defined $request_ref->{tagtype})) {
if ($request_ref->{tagtype} !~ /^brands|categories|labels|additives|nova-groups|ecoscore|nutrition-grades$/) {
# Only allow indexation of a selected number of facets
# Ingredients were left out because of the number of possible ingredients (1.2M)
$request_ref->{no_index} = 1;
}
elsif ($request_ref->{page} >= 2) {
# Don't index facet pages with page number > 1 (we want only 1 index page per facet value)
$request_ref->{no_index} = 1;
}
elsif (defined $request_ref->{tagtype2}) {
# Don't index web pages with 2 nested tags: as an example, there are billions of combinations for
# category x ingredient alone
$request_ref->{no_index} = 1;
}
}

$log->debug("request analyzed", {lc => $lc, lang => $lang, request_ref => $request_ref}) if $log->is_debug();

return 1;
Expand Down
122 changes: 122 additions & 0 deletions tests/integration/facet_page_crawler.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/perl -w

# Integration tests for crawler handling of facet pages: crawl bots must
# receive a noindex HTML page on most facet pages (nested facets, editor
# facet, ...), while normal users — and crawlers on explicitly allowed
# facets such as category — get the regular page.

use ProductOpener::PerlStandards;

use Test::More;
use ProductOpener::APITest qw/:all/;
use ProductOpener::Test qw/:all/;
use ProductOpener::TestDefaults qw/:all/;

use File::Basename "dirname";

use Storable qw(dclone);

# Start from a clean state so the product created below is the only one.
remove_all_users();

remove_all_products();

wait_application_ready();

my $ua = new_client();

# User-Agent of a known crawler (bingbot): requests sent with it should be
# detected as a crawl bot server-side.
my $CRAWLING_BOT_USER_AGENT
	= 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/';
# User-Agent of a regular desktop browser.
# Fixed: the original value accidentally contained the Firefox UA string
# twice (copy-paste duplication); a single copy is equivalent for detection.
my $NORMAL_USER_USER_AGENT
	= 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0';

# Create a single product so that product and facet pages have content.
my %product_form = (
	%{dclone(\%default_product_form)},
	(
		code => '0200000000235',
		product_name => "Only-Product",
	)
);

edit_product($ua, \%product_form);

my $tests_ref = [
	# Normal user should have access to product page
	# Fixed: the user agents of this test and the next one were swapped,
	# so neither test exercised the scenario its name describes.
	{
		test_case => 'normal-user-access-product-page',
		method => 'GET',
		path => '/product/0200000000235/only-product',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<title>Only-Product - 100 g</title>'
	},
	# Crawling bot should have access to product page (product pages stay indexable)
	{
		test_case => 'crawler-access-product-page',
		method => 'GET',
		path => '/product/0200000000235/only-product',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<title>Only-Product - 100 g</title>'
	},
	# Crawling bot should receive a noindex page for nested facets
	{
		test_case => 'crawler-access-nested-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads/brand/nutella',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to nested facets
	{
		test_case => 'normal-user-access-nested-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads/brand/nutella',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Crawling bot should have access to specific facet pages (such as category)
	{
		test_case => 'crawler-access-category-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to facet pages
	{
		test_case => 'normal-user-access-category-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Crawling bot should receive a noindex page for most facet pages (including editor facet)
	{
		test_case => 'crawler-access-editor-facet-page',
		method => 'GET',
		path => '/editor/unknown-user',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to editor facet
	{
		test_case => 'normal-user-access-editor-facet-page',
		method => 'GET',
		path => '/editor/unknown-user',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => 'Unknown user.'
	},
];

execute_api_tests(__FILE__, $tests_ref);

done_testing();

0 comments on commit 0e82a6f

Please sign in to comment.