Skip to content

Commit

Permalink
fix: don't let bot index most facet pages
Browse files Browse the repository at this point in the history
Crawling bots can't visit every page, yet they crawl OFF continuously.
We want to steer crawlers towards the interesting pages, so we return a
noindex HTML page on most facet pages (except the most interesting ones,
such as brand, category, ...).
  • Loading branch information
raphael0202 committed Jul 28, 2023
1 parent 3f20822 commit a88949f
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 0 deletions.
6 changes: 6 additions & 0 deletions lib/ProductOpener/Routing.pm
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,12 @@ sub analyze_request ($request_ref) {

my $tagtype = $request_ref->{tagtype};

# Crawling bots may only index a short allow-list of facet types; for every
# other facet type the request is flagged as no_index.
# The alternation MUST be grouped: without (?: ... ) the "^" anchor binds only
# to "brands" and the "$" anchor only to "additives", so any tagtype merely
# containing "categories", "labels" or "ingredients" would also be exempted.
# The "//" defaults keep an undefined tagtype / is_crawl_bot from raising
# "uninitialized value" warnings without changing the outcome.
if (    (($tagtype // '') !~ /\A(?:brands|categories|labels|ingredients|additives)\z/)
	and (($request_ref->{is_crawl_bot} // 0) eq 1))
{
	$request_ref->{no_index} = 1;
}

if (($#components >= 0)) {
$request_ref->{tag} = shift @components;

Expand Down
122 changes: 122 additions & 0 deletions tests/integration/facet_page_crawler.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/perl -w

use ProductOpener::PerlStandards;

use Test::More;
use ProductOpener::APITest qw/:all/;
use ProductOpener::Test qw/:all/;
use ProductOpener::TestDefaults qw/:all/;

use File::Basename "dirname";

use Storable qw(dclone);

# Fixture setup. The helpers below come from the ProductOpener test modules
# imported above (their implementation is not visible in this file).

# Presumably wipes existing users so the run starts from a known state -- confirm in the helper.
remove_all_users();

# Presumably wipes existing products, so that the product created below is the only one -- confirm in the helper.
remove_all_products();

# NOTE(review): by its name, blocks until the application under test answers requests -- confirm.
wait_application_ready();

# Client handle passed to edit_product() below to create the test product.
my $ua = new_client();

# User-Agent header sent to impersonate a search-engine crawler (Bingbot).
my $CRAWLING_BOT_USER_AGENT
	= 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/';
# User-Agent header sent to impersonate a regular visitor (desktop Firefox).
# It must be a single, well-formed UA string (it was previously pasted twice
# back-to-back, yielding "...Firefox/114.0Mozilla/5.0 ...").
my $NORMAL_USER_USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0';

# Build the form for the single test product: start from a deep copy of the
# default form (so the shared defaults are never modified), then override the
# fields that identify this product.
my %product_form = %{dclone(\%default_product_form)};
$product_form{code} = '0200000000235';
$product_form{product_name} = "Only-Product";

# Submit the form through the client so the product pages requested by the
# test cases refer to this product.
edit_product($ua, \%product_form);

# Test cases handed to execute_api_tests(). Each case issues one GET request
# with either the crawler or the regular-browser User-Agent and checks the
# status code plus a pattern the HTML body must (or must not) contain.
# Pages flagged as not indexable are recognised by the '<h1>NOINDEX</h1>'
# marker in the response body.
my $tests_ref = [
	# Normal user should have access to product page
	{
		test_case => 'normal-user-access-product-page',
		method => 'GET',
		path => '/product/0200000000235/only-product',
		# Fixed: this case used to send the crawler User-Agent
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<title>Only-Product - 100 g</title>'
	},
	# Crawling bot should have access to product page
	{
		test_case => 'crawler-access-product-page',
		method => 'GET',
		path => '/product/0200000000235/only-product',
		# Fixed: this case used to send the normal-user User-Agent
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<title>Only-Product - 100 g</title>'
	},
	# Crawling bot should receive a noindex page for nested facets
	{
		test_case => 'crawler-access-nested-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads/brand/nutella',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to nested facets
	{
		test_case => 'normal-user-access-nested-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads/brand/nutella',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Crawling bot should have access to specific facet pages (such as category)
	{
		test_case => 'crawler-access-category-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to facet pages
	{
		test_case => 'normal-user-access-category-facet-page',
		method => 'GET',
		path => '/category/hazelnut-spreads',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_not_match => '<h1>NOINDEX</h1>'
	},
	# Crawling bot should receive a noindex page for most facet pages (including editor facet)
	{
		test_case => 'crawler-access-editor-facet-page',
		method => 'GET',
		path => '/editor/unknown-user',
		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => '<h1>NOINDEX</h1>'
	},
	# Normal user should have access to editor facet
	{
		test_case => 'normal-user-access-editor-facet-page',
		method => 'GET',
		path => '/editor/unknown-user',
		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
		expected_status_code => 200,
		expected_type => 'html',
		response_content_must_match => 'Unknown user.'
	},
];

# Run every case above; __FILE__ is passed so the helper knows which test
# script the cases belong to.
execute_api_tests(__FILE__, $tests_ref);

done_testing();

0 comments on commit a88949f

Please sign in to comment.