Skip to content

Commit

Permalink
fix: don't let bot index most facet pages
Browse files Browse the repository at this point in the history
Crawling bots can't visit all pages and crawl OFF continuously.
We want to limit crawlers on interesting pages, so we return a noindex
html page on most facet pages (except most interesting ones such as
brand, category,...)
  • Loading branch information
raphael0202 committed Jul 28, 2023
1 parent 3f20822 commit 0e82a6f
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 7 deletions.
2 changes: 1 addition & 1 deletion lib/ProductOpener/Display.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1049,7 +1049,7 @@ that require a lot of resources (especially aggregation queries).

sub display_no_index_page_and_exit () {
my $html
= '<html><head><meta name="robots" content="noindex"></head><body><h1>NOINDEX</h1><p>We detected that your browser is a web crawling bot, and this page should not be indexed by web crawlers. If this is is unexpected, contact us on Slack or write us an email at <a href="mailto:[email protected]">[email protected]</a>.</p></body></html>';
= '<!DOCTYPE html><html><head><meta name="robots" content="noindex"></head><body><h1>NOINDEX</h1><p>We detected that your browser is a web crawling bot, and this page should not be indexed by web crawlers. If this is unexpected, contact us on Slack or write us an email at <a href="mailto:[email protected]">[email protected]</a>.</p></body></html>';
my $http_headers_ref = {
'-status' => 200,
'-expires' => '-1d',
Expand Down
24 changes: 18 additions & 6 deletions lib/ProductOpener/Routing.pm
Original file line number Diff line number Diff line change
Expand Up @@ -484,12 +484,6 @@ sub analyze_request ($request_ref) {
}
my $tagtype = $request_ref->{tagtype2};

if ($request_ref->{is_crawl_bot} eq 1) {
# Don't index web pages with 2 nested tags: as an example, there are billions of combinations for
# category x ingredient alone
$request_ref->{no_index} = 1;
}

if (($#components >= 0)) {
$request_ref->{tag2} = shift @components;

Expand Down Expand Up @@ -575,6 +569,24 @@ sub analyze_request ($request_ref) {
$request_ref->{text} = 'index-pro';
}

# Return noindex empty HTML page for web crawlers that crawl specific facet pages
if (($request_ref->{is_crawl_bot} eq 1) and (defined $request_ref->{tagtype})) {
if ($request_ref->{tagtype} !~ /^brands|categories|labels|additives|nova-groups|ecoscore|nutrition-grades$/) {
# Only allow indexation of a selected number of facets
# Ingredients were left out because of the number of possible ingredients (1.2M)
$request_ref->{no_index} = 1;
}
elsif ($request_ref->{page} >= 2) {
# Don't index facet pages with page number > 1 (we want only 1 index page per facet value)
$request_ref->{no_index} = 1;
}
elsif (defined $request_ref->{tagtype2}) {
# Don't index web pages with 2 nested tags: as an example, there are billions of combinations for
# category x ingredient alone
$request_ref->{no_index} = 1;
}
}

$log->debug("request analyzed", {lc => $lc, lang => $lang, request_ref => $request_ref}) if $log->is_debug();

return 1;
Expand Down
122 changes: 122 additions & 0 deletions tests/integration/facet_page_crawler.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/perl -w

# Integration tests for crawler handling of facet pages: crawl bots must
# receive a noindex HTML page on most facet pages (nested facets, editor
# facet, ...), while normal users — and crawlers on explicitly allowed
# facets such as category — get the regular page.

use ProductOpener::PerlStandards;

use Test::More;
use ProductOpener::APITest qw/:all/;
use ProductOpener::Test qw/:all/;
use ProductOpener::TestDefaults qw/:all/;

use File::Basename "dirname";

use Storable qw(dclone);

# Start from a clean state so the product created below is the only one.
remove_all_users();

remove_all_products();

wait_application_ready();

my $ua = new_client();

# User-Agent of a known crawler (bingbot): requests sent with it should be
# detected as a crawl bot server-side.
my $CRAWLING_BOT_USER_AGENT
	= 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/';
# User-Agent of a regular desktop browser.
# Fixed: the original value accidentally contained the Firefox UA string
# twice (copy-paste duplication); a single copy is equivalent for detection.
my $NORMAL_USER_USER_AGENT
	= 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0';

# Create a single product so that product and facet pages have content.
my %product_form = (
	%{dclone(\%default_product_form)},
	(
		code => '0200000000235',
		product_name => "Only-Product",
	)
);

edit_product($ua, \%product_form);

my $tests_ref = [
	# Normal user should have access to product page
	# Fixed: the user agents of this test and the next one were swapped,
	# so neither test exercised the scenario its name describes.
	{
		test_case => 'normal-user-access-product-page',
		method => 'GET',
		path => '/product/0200000000235/only-product',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<title>Only-Product - 100 g</title>'
	},
	# Crawling bot should have access to product page (product pages stay indexable)
	{
		test_case => 'crawler-access-product-page',
		method => 'GET',
		path => '/product/0200000000235/only-product',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<title>Only-Product - 100 g</title>'
	},
	# Crawling bot should receive a noindex page for nested facets
	{
		test_case => 'crawler-access-nested-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads/brand/nutella',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to nested facets
	{
		test_case => 'normal-user-access-nested-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads/brand/nutella',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Crawling bot should have access to specific facet pages (such as category)
	{
		test_case => 'crawler-access-category-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to facet pages
	{
		test_case => 'normal-user-access-category-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Crawling bot should receive a noindex page for most facet pages (including editor facet)
	{
		test_case => 'crawler-access-editor-facet-page',
		method => 'GET',
		path => '/editor/unknown-user',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to editor facet
	{
		test_case => 'normal-user-access-editor-facet-page',
		method => 'GET',
		path => '/editor/unknown-user',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => 'Unknown user.'
	},
];

execute_api_tests(__FILE__, $tests_ref);

done_testing();

0 comments on commit 0e82a6f

Please sign in to comment.