-
-
Notifications
You must be signed in to change notification settings - Fork 383
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: return empty noindex webpage when crawlers hit specific pages (#…
…8744) * fix: return empty noindex webpage when crawlers hit specific pages when a crawler hits nested facets (ex: /category/popcorn-with-caramel/data-quality-error/nutrition-value-total-over-105) we return a blank HTML page with a noindex directive to prevent the crawler from overloading our servers. * fix: fix Seekport bot directive in robots.txt The User-Agent has changed: see https://bot.seekport.com/ * fix: fix unit tests * fix: don't let bots index most facet pages Crawling bots can't visit all pages and crawl OFF continuously. We want to limit crawlers to interesting pages, so we return a noindex HTML page on most facet pages (except the most interesting ones such as brand, category,...)
- Loading branch information
1 parent
35a1666
commit d763210
Showing
7 changed files
with
225 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,6 +55,7 @@ BEGIN { | |
&display_icon | ||
&display_structured_response | ||
&display_no_index_page_and_exit | ||
&display_page | ||
&display_text | ||
&display_points | ||
|
@@ -882,10 +883,37 @@ CSS | |
$request_ref->{cc} = $cc; | ||
$request_ref->{country} = $country; | ||
$request_ref->{lcs} = \@lcs; | ||
set_user_agent_request_ref_attributes($request_ref); | ||
|
||
return $request_ref; | ||
} | ||
|
||
=head2 set_user_agent_request_ref_attributes ($request_ref)

Set two attributes to `request_ref`:

- `user_agent`: the request User-Agent
- `is_crawl_bot`: a flag (0 or 1) that indicates whether the request comes
from a known web crawler (Google, Bing,...). We only use User-Agent value
to set this flag.

=cut

sub set_user_agent_request_ref_attributes ($request_ref) {
	# Record the raw User-Agent header on the request.
	my $ua_string = user_agent();
	$request_ref->{user_agent} = $ua_string;

	# Known crawler signatures; matching is done on the User-Agent value only.
	my $crawler_re
		= qr/Googlebot|Googlebot-Image|bingbot|Applebot|YandexBot|YandexRenderResourcesBot|DuckDuckBot|DotBot|SeekportBot|AhrefsBot|DataForSeoBot|SeznamBot|ZoomBot|MojeekBot|QRbot|www\.qwant\.com|facebookexternalhit/;

	# Normalize the match result to an explicit 0/1 flag.
	$request_ref->{is_crawl_bot} = ($ua_string =~ $crawler_re) ? 1 : 0;

	return;
}
|
||
sub _get_date ($t) { | ||
|
||
if (defined $t) { | ||
|
@@ -1009,6 +1037,38 @@ sub display_error_and_exit ($error_message, $status_code) { | |
exit(); | ||
} | ||
|
||
=head2 display_no_index_page_and_exit ()

Return an empty HTML page with a '<meta name="robots" content="noindex">' directive
in the HTML header.

This is useful to prevent web crawlers to overload our servers by querying webpages
that require a lot of resources (especially aggregation queries).

=cut

sub display_no_index_page_and_exit () {
	# Minimal static page: noindex directive plus a short human-readable note
	# in case a real user is misdetected as a bot.
	my $html
		= '<!DOCTYPE html><html><head><meta name="robots" content="noindex"></head><body><h1>NOINDEX</h1><p>We detected that your browser is a web crawling bot, and this page should not be indexed by web crawlers. If this is unexpected, contact us on Slack or write us an email at <a href="mailto:[email protected]">[email protected]</a>.</p></body></html>';
	my $http_headers_ref = {
		'-status' => 200,
		'-expires' => '-1d',
		'-charset' => 'UTF-8',
	};

	print header(%$http_headers_ref);

	my $r = Apache2::RequestUtil->request();
	$r->rflush;
	# Setting the status makes mod_perl append a default error to the body
	# Send 200 instead.
	$r->status(200);
	binmode(STDOUT, ":encoding(UTF-8)");
	print $html;
	# Terminate request handling here: this sub must not return to its caller
	# (the stray "return;" before exit() previously made exit() unreachable).
	exit();
}
|
||
# Specific index for producer on the platform for producers | ||
sub display_index_for_producer ($request_ref) { | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -223,3 +223,5 @@ weighers | |
www | ||
xml | ||
gzipped | ||
webpages | ||
bing |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
#!/usr/bin/perl -w

use ProductOpener::PerlStandards;

use Test::More;
use ProductOpener::APITest qw/:all/;
use ProductOpener::Test qw/:all/;
use ProductOpener::TestDefaults qw/:all/;

use File::Basename "dirname";

use Storable qw(dclone);

# Start from a clean state so test results are deterministic.
remove_all_users();

remove_all_products();

wait_application_ready();

my $ua = new_client();

# User-Agent strings used to simulate a crawler vs. a regular browser.
my $CRAWLING_BOT_USER_AGENT
	= 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/';
# FIX: the Firefox UA string was accidentally duplicated (pasted twice).
my $NORMAL_USER_USER_AGENT
	= 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0';

# Create a single product so product pages have content to match against.
my %product_form = (
	%{dclone(\%default_product_form)},
	(
		code => '0200000000235',
		product_name => "Only-Product",
	)
);

edit_product($ua, \%product_form);

my $tests_ref = [
	# Normal user should have access to product page
	# FIX: the User-Agent headers of the first two cases were swapped
	# relative to their test_case names/comments.
	{
		test_case => 'normal-user-access-product-page',
		method => 'GET',
		path => '/product/0200000000235/only-product',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<title>Only-Product - 100 g</title>'
	},
	# Crawling bot should have access to product page
	{
		test_case => 'crawler-access-product-page',
		method => 'GET',
		path => '/product/0200000000235/only-product',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<title>Only-Product - 100 g</title>'
	},
	# Crawling bot should receive a noindex page for nested facets
	{
		test_case => 'crawler-access-nested-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads/brand/nutella',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to nested facets
	{
		test_case => 'normal-user-access-nested-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads/brand/nutella',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Crawling bot should have access to specific facet pages (such as category)
	{
		test_case => 'crawler-access-category-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to facet pages
	{
		test_case => 'normal-user-access-category-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Crawling bot should receive a noindex page for most facet pages (including editor facet)
	{
		test_case => 'crawler-access-editor-facet-page',
		method => 'GET',
		path => '/editor/unknown-user',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to editor facet
	{
		test_case => 'normal-user-access-editor-facet-page',
		method => 'GET',
		path => '/editor/unknown-user',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => 'Unknown user.'
	},
];

execute_api_tests(__FILE__, $tests_ref);

done_testing();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters