fix: return empty noindex webpage when crawlers hit specific pages (#8744)

* fix: return empty noindex webpage when crawlers hit specific pages

When a crawler hits nested facets (e.g.
/category/popcorn-with-caramel/data-quality-error/nutrition-value-total-over-105),
we return a blank HTML page with a noindex directive to prevent the
crawler from overloading our servers.
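
For reference, a quick manual check of this behaviour (a minimal sketch, not part of the commit; run it against a test instance rather than production — the facet path is just the example above):

#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;

# Pretend to be Bingbot and fetch a nested facet page
my $ua = LWP::UserAgent->new(
    agent => 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)');
my $res = $ua->get(
    'https://world.openfoodfacts.org/category/popcorn-with-caramel/data-quality-error/nutrition-value-total-over-105'
);

# After this change, the body should be the blank noindex page
print $res->code, "\n";
print $res->decoded_content =~ /<meta name="robots" content="noindex">/
    ? "noindex directive present\n"
    : "noindex directive absent\n";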

* fix: fix Seekport bot directive in robots.txt

The User-Agent has changed: see https://bot.seekport.com/

* fix: fix unit tests

* fix: don't let bots index most facet pages

Crawling bots can't visit every page, and they crawl OFF continuously.
We want to limit crawlers to the interesting pages, so we return a noindex
HTML page on most facet pages (except the most interesting ones, such as
brand, category, ...).
raphael0202 authored Jul 28, 2023
1 parent 35a1666 commit d763210
Show file tree
Hide file tree
Showing 7 changed files with 225 additions and 6 deletions.
7 changes: 7 additions & 0 deletions cgi/display.pl
@@ -110,6 +110,13 @@
return Apache2::Const::OK;
}

if ($request_ref->{no_index} eq 1) {
# The request comes from a known web crawler and the page shouldn't be indexed:
# directly return an empty "noindex" HTML page
display_no_index_page_and_exit();
return Apache2::Const::OK;
}

$log->debug(
"after analyze_request",
{
2 changes: 1 addition & 1 deletion html/robots.txt
@@ -43,7 +43,7 @@ User-agent: Cliqzbot/3.0
Disallow: /
User-agent: Cliqzbot
Disallow: /
-User-agent: Seekport Crawler
+User-agent: SeekportBot
Disallow: /
User-agent: Paracrawl
Disallow: /
60 changes: 60 additions & 0 deletions lib/ProductOpener/Display.pm
@@ -55,6 +55,7 @@ BEGIN {
&display_icon
&display_structured_response
&display_no_index_page_and_exit
&display_page
&display_text
&display_points
@@ -882,10 +883,37 @@ CSS
$request_ref->{cc} = $cc;
$request_ref->{country} = $country;
$request_ref->{lcs} = \@lcs;
set_user_agent_request_ref_attributes($request_ref);

return $request_ref;
}

=head2 set_user_agent_request_ref_attributes ($request_ref)

Set two attributes on `request_ref`:

- `user_agent`: the request User-Agent
- `is_crawl_bot`: a flag (0 or 1) indicating whether the request comes
from a known web crawler (Google, Bing, ...). We only use the User-Agent
value to set this flag.

=cut

sub set_user_agent_request_ref_attributes ($request_ref) {
my $user_agent = user_agent();
$request_ref->{user_agent} = $user_agent;

my $is_crawl_bot = 0;
if ($user_agent
=~ /Googlebot|Googlebot-Image|bingbot|Applebot|YandexBot|YandexRenderResourcesBot|DuckDuckBot|DotBot|SeekportBot|AhrefsBot|DataForSeoBot|SeznamBot|ZoomBot|MojeekBot|QRbot|www\.qwant\.com|facebookexternalhit/
)
{
$is_crawl_bot = 1;
}
$request_ref->{is_crawl_bot} = $is_crawl_bot;
return;
}
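
As a rough illustration, a standalone sketch of how this flag is derived (it reuses the pattern from the sub above; the sample User-Agent strings are illustrative):

use strict;
use warnings;

# Same pattern as in set_user_agent_request_ref_attributes() above
my $crawler_pattern
    = qr/Googlebot|Googlebot-Image|bingbot|Applebot|YandexBot|YandexRenderResourcesBot|DuckDuckBot|DotBot|SeekportBot|AhrefsBot|DataForSeoBot|SeznamBot|ZoomBot|MojeekBot|QRbot|www\.qwant\.com|facebookexternalhit/;

foreach my $user_agent (
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Mozilla/5.0 (compatible; SeekportBot; +https://bot.seekport.com)',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0',
    )
{
    # Prints 1 for the two crawler strings, 0 for the browser one
    printf "%d  %s\n", ($user_agent =~ $crawler_pattern ? 1 : 0), $user_agent;
}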

sub _get_date ($t) {

if (defined $t) {
@@ -1009,6 +1037,38 @@ sub display_error_and_exit ($error_message, $status_code) {
exit();
}

=head2 display_no_index_page_and_exit ()

Return an empty HTML page with a '<meta name="robots" content="noindex">' directive
in the HTML head.

This is useful to prevent web crawlers from overloading our servers by querying webpages
that require a lot of resources (especially aggregation queries).

=cut

sub display_no_index_page_and_exit () {
my $html
= '<!DOCTYPE html><html><head><meta name="robots" content="noindex"></head><body><h1>NOINDEX</h1><p>We detected that your browser is a web crawling bot, and this page should not be indexed by web crawlers. If this is unexpected, contact us on Slack or write us an email at <a href="mailto:[email protected]">[email protected]</a>.</p></body></html>';
my $http_headers_ref = {
'-status' => 200,
'-expires' => '-1d',
'-charset' => 'UTF-8',
};

print header(%$http_headers_ref);

my $r = Apache2::RequestUtil->request();
$r->rflush;
# Setting the status makes mod_perl append a default error to the body
# Send 200 instead.
$r->status(200);
binmode(STDOUT, ":encoding(UTF-8)");
print $html;
exit();
}
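
Note that the page is deliberately served with HTTP 200 plus a noindex meta tag rather than an error status (per the comment above, a non-200 status would make mod_perl append a default error page to the body). A standalone sketch of the headers CGI.pm's header() emits for the hashref above (output shape is approximate):

use strict;
use warnings;
use CGI qw(:standard);

# Same header arguments as $http_headers_ref above; prints roughly:
#   Status: 200 OK
#   Expires: <a date one day in the past>
#   Date: <current date>
#   Content-Type: text/html; charset=UTF-8
print header(
    '-status'  => 200,
    '-expires' => '-1d',
    '-charset' => 'UTF-8',
);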

# Specific index for producer on the platform for producers
sub display_index_for_producer ($request_ref) {

23 changes: 23 additions & 0 deletions lib/ProductOpener/Routing.pm
@@ -94,6 +94,11 @@ sub analyze_request ($request_ref) {

$request_ref->{query_string} = $request_ref->{original_query_string};

# `no_index` specifies whether we send an empty HTML page with a <meta name="robots" content="noindex">
# directive in the HTML head. This is only done for known web crawlers (Google, Bing, Yandex, ...) on webpages
# that trigger heavy DB aggregation queries and overload our server.
$request_ref->{no_index} = 0;

$log->debug("analyzing query_string, step 0 - unmodified", {query_string => $request_ref->{query_string}})
if $log->is_debug();

@@ -564,6 +569,24 @@ sub analyze_request ($request_ref) {
$request_ref->{text} = 'index-pro';
}

# Return noindex empty HTML page for web crawlers that crawl specific facet pages
if (($request_ref->{is_crawl_bot} eq 1) and (defined $request_ref->{tagtype})) {
if ($request_ref->{tagtype} !~ /^(?:brands|categories|labels|additives|nova-groups|ecoscore|nutrition-grades)$/) {
# Only allow indexation of a selected number of facets
# Ingredients were left out because of the number of possible ingredients (1.2M)
$request_ref->{no_index} = 1;
}
elsif ($request_ref->{page} >= 2) {
# Don't index facet pages with page number > 1 (we want only 1 index page per facet value)
$request_ref->{no_index} = 1;
}
elsif (defined $request_ref->{tagtype2}) {
# Don't index web pages with 2 nested tags: as an example, there are billions of combinations for
# category x ingredient alone
$request_ref->{no_index} = 1;
}
}
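
To summarise the branch above, a minimal standalone re-statement of the policy (not the actual routing code; the helper and the request hashes are hypothetical):

use strict;
use warnings;

sub no_index_for {
    my ($req) = @_;
    return 0 unless $req->{is_crawl_bot} and defined $req->{tagtype};
    # Facets outside the whitelist are never indexed
    return 1
        if $req->{tagtype} !~ /^(?:brands|categories|labels|additives|nova-groups|ecoscore|nutrition-grades)$/;
    # Only the first page of each facet value is indexed
    return 1 if ($req->{page} // 1) >= 2;
    # Nested (two-tag) facets are never indexed
    return 1 if defined $req->{tagtype2};
    return 0;
}

print no_index_for({is_crawl_bot => 1, tagtype => 'categories', page => 1}), "\n";    # 0: indexable
print no_index_for({is_crawl_bot => 1, tagtype => 'categories', page => 2}), "\n";    # 1: page >= 2
print no_index_for({is_crawl_bot => 1, tagtype => 'categories', tagtype2 => 'brands'}), "\n";    # 1: nested facet
print no_index_for({is_crawl_bot => 1, tagtype => 'editors', page => 1}), "\n";    # 1: not whitelisted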

$log->debug("request analyzed", {lc => $lc, lang => $lang, request_ref => $request_ref}) if $log->is_debug();

return 1;
2 changes: 2 additions & 0 deletions stop_words.txt
@@ -223,3 +223,5 @@ weighers
www
xml
gzipped
webpages
bing
122 changes: 122 additions & 0 deletions tests/integration/facet_page_crawler.t
@@ -0,0 +1,122 @@
#!/usr/bin/perl -w

use ProductOpener::PerlStandards;

use Test::More;
use ProductOpener::APITest qw/:all/;
use ProductOpener::Test qw/:all/;
use ProductOpener::TestDefaults qw/:all/;

use File::Basename "dirname";

use Storable qw(dclone);

remove_all_users();

remove_all_products();

wait_application_ready();

my $ua = new_client();

my $CRAWLING_BOT_USER_AGENT
= 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/';
my $NORMAL_USER_USER_AGENT
= 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0';

my %product_form = (
%{dclone(\%default_product_form)},
(
code => '0200000000235',
product_name => "Only-Product",
)
);

edit_product($ua, \%product_form);

my $tests_ref = [
# Normal user should have access to product page
{
test_case => 'normal-user-access-product-page',
method => 'GET',
path => '/product/0200000000235/only-product',
headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_match => '<title>Only-Product - 100 g</title>'
},
# Crawling bot should have access to product page
{
test_case => 'crawler-access-product-page',
method => 'GET',
path => '/product/0200000000235/only-product',
headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_match => '<title>Only-Product - 100 g</title>'
},
# Crawling bot should receive a noindex page for nested facets
{
test_case => 'crawler-access-nested-facet-page',
method => 'GET',
path => '/category/hazelnut-spreads/brand/nutella',
headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_match => '<h1>NOINDEX</h1>'
},
# Normal user should have access to nested facets
{
test_case => 'normal-user-access-nested-facet-page',
method => 'GET',
path => '/category/hazelnut-spreads/brand/nutella',
headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_not_match => '<h1>NOINDEX</h1>'
},
# Crawling bot should have access to specific facet pages (such as category)
{
test_case => 'crawler-access-category-facet-page',
method => 'GET',
path => '/category/hazelnut-spreads',
headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_not_match => '<h1>NOINDEX</h1>'
},
# Normal user should have access to facet pages
{
test_case => 'normal-user-access-category-facet-page',
method => 'GET',
path => '/category/hazelnut-spreads',
headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_not_match => '<h1>NOINDEX</h1>'
},
# Crawling bot should receive a noindex page for most facet pages (including editor facet)
{
test_case => 'crawler-access-editor-facet-page',
method => 'GET',
path => '/editor/unknown-user',
headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_match => '<h1>NOINDEX</h1>'
},
# Normal user should have access to editor facet
{
test_case => 'normal-user-access-editor-facet-page',
method => 'GET',
path => '/editor/unknown-user',
headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_match => 'Unknown user.'
},
];

execute_api_tests(__FILE__, $tests_ref);

done_testing();
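
execute_api_tests(__FILE__, $tests_ref) runs each case above against the test instance, checking the status code, content type, and the must-match / must-not-match patterns against the response body. Product Opener's integration tests are typically run inside the project's docker test environment rather than directly with prove.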
15 changes: 10 additions & 5 deletions tests/unit/routing.t
@@ -32,7 +32,8 @@ my @tests = (
'lc' => 'en',
'original_query_string' => 'api/v0/attribute_groups',
'page' => 1,
-'query_string' => 'api/v0/attribute_groups'
+'query_string' => 'api/v0/attribute_groups',
+'no_index' => '0'
},
},
{
@@ -56,7 +57,8 @@ my @tests = (
'tag' => 'en:breads',
'tag_prefix' => '',
'tagid' => 'en:breads',
-'tagtype' => 'categories'
+'tagtype' => 'categories',
+'no_index' => '0'
},
},
{
@@ -78,7 +80,8 @@ my @tests = (
'tag' => 'en:breads',
'tag_prefix' => '',
'tagid' => 'en:breads',
-'tagtype' => 'categories'
+'tagtype' => 'categories',
+'no_index' => '0'
},
},
{
@@ -100,7 +103,8 @@ my @tests = (
'tag' => 'en:breads',
'tag_prefix' => '',
'tagid' => 'en:breads',
-'tagtype' => 'categories'
+'tagtype' => 'categories',
+'no_index' => '0'
},
},
{
@@ -122,7 +126,8 @@ my @tests = (
'tag' => 'en:bread',
'tag_prefix' => '',
'tagid' => 'en:bread',
-'tagtype' => 'categories'
+'tagtype' => 'categories',
+'no_index' => '0'
},
},

