diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm index 8d5f631b9473c..2237f09c83d8b 100644 --- a/lib/ProductOpener/Display.pm +++ b/lib/ProductOpener/Display.pm @@ -1049,7 +1049,7 @@ that require a lot of resources (especially aggregation queries). sub display_no_index_page_and_exit () { my $html - = '

NOINDEX

We detected that your browser is a web crawling bot, and this page should not be indexed by web crawlers. If this is is unexpected, contact us on Slack or write us an email at contact@openfoodfacts.org.

'; + = '

NOINDEX

We detected that your browser is a web crawling bot, and this page should not be indexed by web crawlers. If this is unexpected, contact us on Slack or write us an email at contact@openfoodfacts.org.

'; my $http_headers_ref = { '-status' => 200, '-expires' => '-1d', diff --git a/lib/ProductOpener/Routing.pm b/lib/ProductOpener/Routing.pm index 089dee67ae63d..49cf5da2b45ed 100644 --- a/lib/ProductOpener/Routing.pm +++ b/lib/ProductOpener/Routing.pm @@ -484,12 +484,6 @@ sub analyze_request ($request_ref) { } my $tagtype = $request_ref->{tagtype2}; - if ($request_ref->{is_crawl_bot} eq 1) { - # Don't index web pages with 2 nested tags: as an example, there are billions of combinations for - # category x ingredient alone - $request_ref->{no_index} = 1; - } - if (($#components >= 0)) { $request_ref->{tag2} = shift @components; @@ -575,6 +569,24 @@ sub analyze_request ($request_ref) { $request_ref->{text} = 'index-pro'; } + # Return noindex empty HTML page for web crawlers that crawl specific facet pages + if (($request_ref->{is_crawl_bot} eq 1) and (defined $request_ref->{tagtype})) { + if ($request_ref->{tagtype} !~ /^(?:brands|categories|labels|additives|nova-groups|ecoscore|nutrition-grades)$/) { + # Only allow indexation of a selected number of facets (the alternation is grouped so that ^ and $ anchor every facet name, not just the first and last) + # Ingredients were left out because of the number of possible ingredients (1.2M) + $request_ref->{no_index} = 1; + } + elsif ($request_ref->{page} >= 2) { + # Don't index facet pages with page number > 1 (we want only 1 index page per facet value) + $request_ref->{no_index} = 1; + } + elsif (defined $request_ref->{tagtype2}) { + # Don't index web pages with 2 nested tags: as an example, there are billions of combinations for + # category x ingredient alone + $request_ref->{no_index} = 1; + } + } + $log->debug("request analyzed", {lc => $lc, lang => $lang, request_ref => $request_ref}) if $log->is_debug(); return 1; diff --git a/tests/integration/facet_page_crawler.t b/tests/integration/facet_page_crawler.t new file mode 100644 index 0000000000000..1fa89631d385c --- /dev/null +++ b/tests/integration/facet_page_crawler.t @@ -0,0 +1,122 @@ +#!/usr/bin/perl -w + +use ProductOpener::PerlStandards; + +use Test::More; +use
ProductOpener::APITest qw/:all/; +use ProductOpener::Test qw/:all/; +use ProductOpener::TestDefaults qw/:all/; + +use File::Basename "dirname"; + +use Storable qw(dclone); + +remove_all_users(); + +remove_all_products(); + +wait_application_ready(); + +my $ua = new_client(); + +my $CRAWLING_BOT_USER_AGENT + = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/'; +my $NORMAL_USER_USER_AGENT + = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0'; + +my %product_form = ( + %{dclone(\%default_product_form)}, + ( + code => '0200000000235', + product_name => "Only-Product", + ) +); + +edit_product($ua, \%product_form); + +my $tests_ref = [ + # Normal user (regular browser User-Agent) should have access to product page + { + test_case => 'normal-user-access-product-page', + method => 'GET', + path => '/product/0200000000235/only-product', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => 'Only-Product - 100 g' + }, + # Crawling bot (bingbot User-Agent) should have access to product page + { + test_case => 'crawler-access-product-page', + method => 'GET', + path => '/product/0200000000235/only-product', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => 'Only-Product - 100 g' + }, + # Crawling bot should receive a noindex page for nested facets + { + test_case => 'crawler-access-nested-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads/brand/nutella', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => '

NOINDEX

' + }, + # Normal user should have access to nested facets + { + test_case => 'normal-user-access-nested-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads/brand/nutella', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_not_match => '

NOINDEX

' + }, + # Crawling bot should have access to specific facet pages (such as category) + { + test_case => 'crawler-access-category-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_not_match => '

NOINDEX

' + }, + # Normal user should have access to facet pages + { + test_case => 'normal-user-access-category-facet-page', + method => 'GET', + path => '/category/hazelnut-spreads', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_not_match => '

NOINDEX

' + }, + # Crawling bot should receive a noindex page for most facet pages (including editor facet) + { + test_case => 'crawler-access-editor-facet-page', + method => 'GET', + path => '/editor/unknown-user', + headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => '

NOINDEX

' + }, + # Normal user should have access to editor facet + { + test_case => 'normal-user-access-editor-facet-page', + method => 'GET', + path => '/editor/unknown-user', + headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT}, + expected_status_code => 200, + expected_type => 'html', + response_content_must_match => 'Unknown user.' + }, +]; + +execute_api_tests(__FILE__, $tests_ref); + +done_testing();