From a88949f234bff68baefa380d1291ff12878c274c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?=
Date: Fri, 28 Jul 2023 12:01:47 +0200
Subject: [PATCH] fix: don't let bot index most facet pages

Crawling bots can't visit all pages, yet they crawl OFF continuously.
We want to limit crawlers to interesting pages, so we return a noindex
HTML page on most facet pages (except the most interesting ones such as
brand, category, ...).
---
 lib/ProductOpener/Routing.pm           |  10 ++
 tests/integration/facet_page_crawler.t | 131 +++++++++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 tests/integration/facet_page_crawler.t

diff --git a/lib/ProductOpener/Routing.pm b/lib/ProductOpener/Routing.pm
index 089dee67ae63d..81a01cc72906f 100644
--- a/lib/ProductOpener/Routing.pm
+++ b/lib/ProductOpener/Routing.pm
@@ -429,6 +429,16 @@ sub analyze_request ($request_ref) {
 
 		my $tagtype = $request_ref->{tagtype};
 
+		# Crawling bots may only index the most useful facets; any other tag
+		# type is flagged with no_index. The alternation is grouped so that
+		# both anchors apply to every tag type (ungrouped, only "brands" was
+		# start-anchored and only "additives" was end-anchored).
+		if (    (($tagtype // '') !~ /\A(?:brands|categories|labels|ingredients|additives)\z/)
+			and (($request_ref->{is_crawl_bot} // 0) == 1))
+		{
+			$request_ref->{no_index} = 1;
+		}
+
 		if (($#components >= 0)) {
 			$request_ref->{tag} = shift @components;
 
diff --git a/tests/integration/facet_page_crawler.t b/tests/integration/facet_page_crawler.t
new file mode 100644
index 0000000000000..1fa89631d385c
--- /dev/null
+++ b/tests/integration/facet_page_crawler.t
@@ -0,0 +1,131 @@
+#!/usr/bin/perl -w
+
+# Integration test: crawling bots must get the NOINDEX page on most facet
+# pages, while normal users (and bots on product or category pages) get the
+# regular page.
+
+use ProductOpener::PerlStandards;
+
+use Test::More;
+use ProductOpener::APITest qw/:all/;
+use ProductOpener::Test qw/:all/;
+use ProductOpener::TestDefaults qw/:all/;
+
+use File::Basename "dirname";
+
+use Storable qw(dclone);
+
+remove_all_users();
+
+remove_all_products();
+
+wait_application_ready();
+
+my $ua = new_client();
+
+# User-Agent of a search engine crawler (bingbot)
+my $CRAWLING_BOT_USER_AGENT
+	= 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/';
+# User-Agent of a regular browser (Firefox on Ubuntu)
+my $NORMAL_USER_USER_AGENT
+	= 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0';
+
+# Product created for the product page tests below
+my %product_form = (
+	%{dclone(\%default_product_form)},
+	(
+		code => '0200000000235',
+		product_name => "Only-Product",
+	)
+);
+
+edit_product($ua, \%product_form);
+
+# NOTE(review): the markup around NOINDEX was stripped from the reviewed copy
+# of this patch; <h1>...</h1> is assumed - confirm against the noindex page.
+my $tests_ref = [
+	# Normal user should have access to product page
+	{
+		test_case => 'normal-user-access-product-page',
+		method => 'GET',
+		path => '/product/0200000000235/only-product',
+		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
+		expected_status_code => 200,
+		expected_type => 'html',
+		response_content_must_match => 'Only-Product - 100 g'
+	},
+	# Crawling bot should have access to product page
+	{
+		test_case => 'crawler-access-product-page',
+		method => 'GET',
+		path => '/product/0200000000235/only-product',
+		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
+		expected_status_code => 200,
+		expected_type => 'html',
+		response_content_must_match => 'Only-Product - 100 g'
+	},
+	# Crawling bot should receive a noindex page for nested facets
+	{
+		test_case => 'crawler-access-nested-facet-page',
+		method => 'GET',
+		path => '/category/hazelnut-spreads/brand/nutella',
+		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
+		expected_status_code => 200,
+		expected_type => 'html',
+		response_content_must_match => '<h1>NOINDEX</h1>'
+	},
+	# Normal user should have access to nested facets
+	{
+		test_case => 'normal-user-access-nested-facet-page',
+		method => 'GET',
+		path => '/category/hazelnut-spreads/brand/nutella',
+		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
+		expected_status_code => 200,
+		expected_type => 'html',
+		response_content_must_not_match => '<h1>NOINDEX</h1>'
+	},
+	# Crawling bot should have access to specific facet pages (such as category)
+	{
+		test_case => 'crawler-access-category-facet-page',
+		method => 'GET',
+		path => '/category/hazelnut-spreads',
+		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
+		expected_status_code => 200,
+		expected_type => 'html',
+		response_content_must_not_match => '<h1>NOINDEX</h1>'
+	},
+	# Normal user should have access to facet pages
+	{
+		test_case => 'normal-user-access-category-facet-page',
+		method => 'GET',
+		path => '/category/hazelnut-spreads',
+		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
+		expected_status_code => 200,
+		expected_type => 'html',
+		response_content_must_not_match => '<h1>NOINDEX</h1>'
+	},
+	# Crawling bot should receive a noindex page for most facet pages (including editor facet)
+	{
+		test_case => 'crawler-access-editor-facet-page',
+		method => 'GET',
+		path => '/editor/unknown-user',
+		headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
+		expected_status_code => 200,
+		expected_type => 'html',
+		response_content_must_match => '<h1>NOINDEX</h1>'
+	},
+	# Normal user should have access to editor facet
+	{
+		test_case => 'normal-user-access-editor-facet-page',
+		method => 'GET',
+		path => '/editor/unknown-user',
+		headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
+		expected_status_code => 200,
+		expected_type => 'html',
+		response_content_must_match => 'Unknown user.'
+	},
+];
+
+execute_api_tests(__FILE__, $tests_ref);
+
+done_testing();