From d763210a911bbce6b00b2ce28da3b4f9fc299d72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 28 Jul 2023 16:16:19 +0200 Subject: [PATCH] fix: return empty noindex webpage when crawlers hit specific pages (#8744) * fix: return empty noindex webpage when crawlers hit specific pages when a crawler hit nested facets (ex: /category/popcorn-with-caramel/data-quality-error/nutrition-value-total-over-105) we return a blank HTML page with a noindex directive to prevent the crawler from overloading our servers. * fix: fix Seekport bot directive in robots.txt The User-Agent has changed: see https://bot.seekport.com/ * fix: fix unit tests * fix: don't let bot index most facet pages Crawling bots can't visit all page and crawl OFF continuously. We want to limit crawlers on interesting pages, so we return a noindex html page on most facet pages (except most interesting ones such as brand, category,...) --- cgi/display.pl | 7 ++ html/robots.txt | 2 +- lib/ProductOpener/Display.pm | 60 ++++++++++++ lib/ProductOpener/Routing.pm | 23 +++++ stop_words.txt | 2 + tests/integration/facet_page_crawler.t | 122 +++++++++++++++++++++++++ tests/unit/routing.t | 15 ++- 7 files changed, 225 insertions(+), 6 deletions(-) create mode 100644 tests/integration/facet_page_crawler.t diff --git a/cgi/display.pl b/cgi/display.pl index c0ae3c24aeb27..281c0badac447 100755 --- a/cgi/display.pl +++ b/cgi/display.pl @@ -110,6 +110,13 @@ return Apache2::Const::OK; } +if ($request_ref->{no_index} eq 1) { + # The request is made from a known web crawler and the web-page shouldn't be indexed: + # return directly a "noindex" empty HTML page + display_no_index_page_and_exit(); + return Apache2::Const::OK; +} + $log->debug( "after analyze_request", { diff --git a/html/robots.txt b/html/robots.txt index 48990e897a5d4..236673a1b61cf 100644 --- a/html/robots.txt +++ b/html/robots.txt @@ -43,7 +43,7 @@ User-agent: Cliqzbot/3.0 Disallow: / User-agent: Cliqzbot Disallow: / -User-agent: Seekport Crawler +User-agent: SeekportBot Disallow: / User-agent: Paracrawl Disallow: / diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm index 78ddd6dcec169..2237f09c83d8b 100644 --- a/lib/ProductOpener/Display.pm +++ b/lib/ProductOpener/Display.pm @@ -55,6 +55,7 @@ BEGIN { &display_icon &display_structured_response + &display_no_index_page_and_exit &display_page &display_text &display_points @@ -882,10 +883,37 @@ CSS $request_ref->{cc} = $cc; $request_ref->{country} = $country; $request_ref->{lcs} = \@lcs; + set_user_agent_request_ref_attributes($request_ref); return $request_ref; } +=head2 set_user_agent_request_ref_attributes ($request_ref) + +Set two attributes to `request_ref`: + +- `user_agent`: the request User-Agent +- `is_crawl_bot`: a flag (0 or 1) that indicates whether the request comes + from a known web crawler (Google, Bing,...). We only use User-Agent value + to set this flag. 
+ +=cut + +sub set_user_agent_request_ref_attributes ($request_ref) { + my $user_agent = user_agent(); + $request_ref->{user_agent} = $user_agent; + + my $is_crawl_bot = 0; + if ($user_agent + =~ /Googlebot|Googlebot-Image|bingbot|Applebot|YandexBot|YandexRenderResourcesBot|DuckDuckBot|DotBot|SeekportBot|AhrefsBot|DataForSeoBot|SeznamBot|ZoomBot|MojeekBot|QRbot|www\.qwant\.com|facebookexternalhit/ + ) + { + $is_crawl_bot = 1; + } + $request_ref->{is_crawl_bot} = $is_crawl_bot; + return; +} + sub _get_date ($t) { if (defined $t) { @@ -1009,6 +1037,38 @@ sub display_error_and_exit ($error_message, $status_code) { exit(); } +=head2 display_no_index_page_and_exit () + +Return an empty HTML page with a '' directive +in the HTML header. + +This is useful to prevent web crawlers to overload our servers by querying webpages +that require a lot of resources (especially aggregation queries). + +=cut + +sub display_no_index_page_and_exit () { + my $html + = '

<html><head><meta name="robots" content="noindex"></head>
<body><h1>NOINDEX</h1>
<p>We detected that your browser is a web crawling bot, and this page should not be indexed by web crawlers. If this is unexpected, contact us on Slack or write us an email at contact@openfoodfacts.org.</p>
</body></html>
'; + my $http_headers_ref = { + '-status' => 200, + '-expires' => '-1d', + '-charset' => 'UTF-8', + }; + + print header(%$http_headers_ref); + + my $r = Apache2::RequestUtil->request(); + $r->rflush; + # Setting the status makes mod_perl append a default error to the body + # Send 200 instead. + $r->status(200); + binmode(STDOUT, ":encoding(UTF-8)"); + print $html; + return; + exit(); +} + # Specific index for producer on the platform for producers sub display_index_for_producer ($request_ref) { diff --git a/lib/ProductOpener/Routing.pm b/lib/ProductOpener/Routing.pm index 1e0d9975cb53f..49cf5da2b45ed 100644 --- a/lib/ProductOpener/Routing.pm +++ b/lib/ProductOpener/Routing.pm @@ -94,6 +94,11 @@ sub analyze_request ($request_ref) { $request_ref->{query_string} = $request_ref->{original_query_string}; + # `no_index` specifies whether we send an empty HTML page with a + # in the HTML headers. This is only done for known web crawlers (Google, Bing, Yandex,...) on webpages that + # trigger heavy DB aggregation queries and overload our server. + $request_ref->{no_index} = 0; + $log->debug("analyzing query_string, step 0 - unmodified", {query_string => $request_ref->{query_string}}) if $log->is_debug(); @@ -564,6 +569,24 @@ sub analyze_request ($request_ref) { $request_ref->{text} = 'index-pro'; } + # Return noindex empty HTML page for web crawlers that crawl specific facet pages + if (($request_ref->{is_crawl_bot} eq 1) and (defined $request_ref->{tagtype})) { + if ($request_ref->{tagtype} !~ /^brands|categories|labels|additives|nova-groups|ecoscore|nutrition-grades$/) { + # Only allow indexation of a selected number of facets + # Ingredients were left out because of the number of possible ingredients (1.2M) + $request_ref->{no_index} = 1; + } + elsif ($request_ref->{page} >= 2) { + # Don't index facet pages with page number > 1 (we want only 1 index page per facet value) + $request_ref->{no_index} = 1; + } + elsif (defined $request_ref->{tagtype2}) { + # Don't index web pages with 2 nested tags: as an example, there are billions of combinations for + # category x ingredient alone + $request_ref->{no_index} = 1; + } + } + $log->debug("request analyzed", {lc => $lc, lang => $lang, request_ref => $request_ref}) if $log->is_debug(); return 1; diff --git a/stop_words.txt b/stop_words.txt index 7e178fd9d27d2..32a37106f6db0 100644 --- a/stop_words.txt +++ b/stop_words.txt @@ -223,3 +223,5 @@ weighers www xml gzipped +webpages +bing \ No newline at end of file diff --git a/tests/integration/facet_page_crawler.t b/tests/integration/facet_page_crawler.t new file mode 100644 index 0000000000000..1fa89631d385c --- /dev/null +++ b/tests/integration/facet_page_crawler.t @@ -0,0 +1,122 @@ +#!/usr/bin/perl -w + +use ProductOpener::PerlStandards; + +use Test::More; +use ProductOpener::APITest qw/:all/; +use ProductOpener::Test qw/:all/; +use ProductOpener::TestDefaults qw/:all/; + +use File::Basename "dirname"; + +use Storable qw(dclone); + +remove_all_users(); + +remove_all_products(); + +wait_application_ready(); + +my $ua = new_client(); + +my $CRAWLING_BOT_USER_AGENT + = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/'; +my $NORMAL_USER_USER_AGENT + = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0'; + +my %product_form = ( + %{dclone(\%default_product_form)}, + ( + code => '0200000000235', + product_name => "Only-Product", + ) 
+); + +edit_product($ua, \%product_form); + +my $tests_ref = [ + # Normal user should have access to product page + { + test_case => 'normal-user-access-product-page', + method => 'GET', + path => '/product/0200000000235/only-product', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => 'Only-Product - 100 g' + }, + # Crawling bot should have access to product page + { + test_case => 'crawler-access-product-page', + method => 'GET', + path => '/product/0200000000235/only-product', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => 'Only-Product - 100 g' + }, + # Crawling bot should receive a noindex page for nested facets + { + test_case => 'crawler-access-nested-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads/brand/nutella', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => '

<h1>NOINDEX</h1>
' + }, + # Normal user should have access to nested facets + { + test_case => 'normal-user-access-nested-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads/brand/nutella', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_not_match => '

<h1>NOINDEX</h1>
' + }, + # Crawling bot should have access to specific facet pages (such as category) + { + test_case => 'crawler-access-category-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_not_match => '

<h1>NOINDEX</h1>
' + }, + # Normal user should have access to facet pages + { + test_case => 'normal-user-access-category-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_not_match => '

<h1>NOINDEX</h1>
' + }, + # Crawling bot should receive a noindex page for most facet pages (including editor facet) + { + test_case => 'crawler-access-editor-facet-page', + method => 'GET', + path => '/editor/unknown-user', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => '

<h1>NOINDEX</h1>
' + }, + # Normal user should have access to editor facet + { + test_case => 'normal-user-access-editor-facet-page', + method => 'GET', + path => '/editor/unknown-user', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => 'Unknown user.' + }, +]; + +execute_api_tests(__FILE__, $tests_ref); + +done_testing(); diff --git a/tests/unit/routing.t b/tests/unit/routing.t index b49d45fbea8ba..9c9bfd9c8a250 100644 --- a/tests/unit/routing.t +++ b/tests/unit/routing.t @@ -32,7 +32,8 @@ my @tests = ( 'lc' => 'en', 'original_query_string' => 'api/v0/attribute_groups', 'page' => 1, - 'query_string' => 'api/v0/attribute_groups' + 'query_string' => 'api/v0/attribute_groups', + 'no_index' => '0' }, }, { @@ -56,7 +57,8 @@ my @tests = ( 'tag' => 'en:breads', 'tag_prefix' => '', 'tagid' => 'en:breads', - 'tagtype' => 'categories' + 'tagtype' => 'categories', + 'no_index' => '0' }, }, { @@ -78,7 +80,8 @@ my @tests = ( 'tag' => 'en:breads', 'tag_prefix' => '', 'tagid' => 'en:breads', - 'tagtype' => 'categories' + 'tagtype' => 'categories', + 'no_index' => '0' }, }, { @@ -100,7 +103,8 @@ my @tests = ( 'tag' => 'en:breads', 'tag_prefix' => '', 'tagid' => 'en:breads', - 'tagtype' => 'categories' + 'tagtype' => 'categories', + 'no_index' => '0' }, }, { @@ -122,7 +126,8 @@ my @tests = ( 'tag' => 'en:bread', 'tag_prefix' => '', 'tagid' => 'en:bread', - 'tagtype' => 'categories' + 'tagtype' => 'categories', + 'no_index' => '0' }, },
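
As a simplified, standalone sketch of the policy this patch implements: the snippet below restates the crawler detection (Display.pm) and the facet noindex decision (Routing.pm) as two small subs that can be run and poked at in isolation. The helper names (is_crawl_bot, should_noindex), the trimmed User-Agent list and the grouped facet-type alternation are illustrative assumptions, not the exact ProductOpener code.

#!/usr/bin/perl
# Standalone illustration (not part of the patch) of the crawler/noindex policy.
use strict;
use warnings;

# Roughly what set_user_agent_request_ref_attributes() does: flag known crawlers
# based solely on the User-Agent header (list shortened here for readability).
sub is_crawl_bot {
	my ($user_agent) = @_;
	return ($user_agent
			=~ /Googlebot|bingbot|Applebot|YandexBot|DuckDuckBot|SeekportBot|AhrefsBot|SeznamBot|MojeekBot|QRbot|facebookexternalhit/
	) ? 1 : 0;
}

# Roughly what the Routing.pm hunk does: crawlers get the noindex page on every
# facet page except page 1 of a short allow-list of facet types, and always on
# nested (2-tag) facets.
sub should_noindex {
	my ($request_ref) = @_;
	return 0 unless $request_ref->{is_crawl_bot};
	return 0 unless defined $request_ref->{tagtype};
	return 1
		if $request_ref->{tagtype}
		!~ /^(?:brands|categories|labels|additives|nova-groups|ecoscore|nutrition-grades)$/;
	return 1 if ($request_ref->{page} // 1) >= 2;
	return 1 if defined $request_ref->{tagtype2};
	return 0;
}

my $bot_ua = 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)';

# /category/hazelnut-spreads/brand/nutella -> nested facet, noindex for a crawler
print should_noindex({is_crawl_bot => is_crawl_bot($bot_ua), tagtype => 'categories', tagtype2 => 'brands', page => 1}), "\n";    # 1
# /category/hazelnut-spreads (page 1) -> stays indexable
print should_noindex({is_crawl_bot => is_crawl_bot($bot_ua), tagtype => 'categories', page => 1}), "\n";    # 0
# /editor/unknown-user -> facet type not in the allow-list, noindex for a crawler
print should_noindex({is_crawl_bot => is_crawl_bot($bot_ua), tagtype => 'editors', page => 1}), "\n";    # 1

One design detail the sketch makes explicit: in the patch itself the check is written as /^brands|categories|...|nutrition-grades$/, where the ^ and $ anchors bind only to the first and last alternatives, so the other facet types are matched as substrings; the grouped (?:...) form above spells out the intended exact-match semantics.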