Commit aabeabc
fix: don't allow bot crawlers to index unsupported lc for cc
e.g. fr-es.openfoodfacts.org shouldn't be indexable by web crawlers.
See #8779 for more context.
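
To illustrate the intended behaviour, here is a minimal external check — a sketch only, assuming the production subdomain is reachable; the Googlebot User-Agent string below is just one example of a UA the server recognizes as a crawler:

#!/usr/bin/env perl
use strict;
use warnings;
use LWP::UserAgent;

# Illustrative crawler User-Agent; any UA recognized as a known bot should do
my $bot_agent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';

my $ua  = LWP::UserAgent->new(agent => $bot_agent);
my $res = $ua->get('https://fr-es.openfoodfacts.org/');

# The server still answers 200, but the body should be an empty page
# carrying a robots noindex directive
if ($res->is_success and $res->decoded_content =~ /noindex/i) {
    print "crawler was served a noindex page, as intended\n";
}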
raphael0202 committed Aug 8, 2023
1 parent d9ef66b commit aabeabc
Showing 3 changed files with 49 additions and 6 deletions.
18 changes: 17 additions & 1 deletion lib/ProductOpener/Display.pm
@@ -566,6 +566,14 @@ sub init_request ($request_ref = {}) {
# remove the / to normalize the query string, as we use it to build some redirect urls
$request_ref->{original_query_string} =~ s/^\///;

# Set $request_ref->{is_crawl_bot}
set_user_agent_request_ref_attributes($request_ref);

# `no_index` specifies whether we send an empty HTML page with a <meta name="robots" content="noindex">
# tag in the HTML <head>. This is only done for known web crawlers (Google, Bing, Yandex,...) on webpages that
# trigger heavy DB aggregation queries and overload our server.
$request_ref->{no_index} = 0;

# TODO: global variables should be moved to $request_ref
$styles = '';
$scripts = '';
@@ -614,6 +622,7 @@ sub init_request ($request_ref = {}) {
($cc, $country, $lc) = ('world', 'en:world', 'en');
}
elsif (defined $country_codes{$subdomain}) {
# subdomain is the country code: fr.openfoodfacts.org, uk.openfoodfacts.org,...
local $log->context->{subdomain_format} = 1;

$cc = $subdomain;
@@ -633,6 +642,7 @@

}
elsif ($subdomain =~ /(.*?)-(.*)/) {
# subdomain contains the country code and the language code: world-fr.openfoodfacts.org, ch-it.openfoodfacts.org,...
local $log->context->{subdomain_format} = 2;
$log->debug("subdomain in cc-lc format - checking values",
{subdomain => $subdomain, lc => $lc, cc => $cc, country => $country})
@@ -733,6 +743,13 @@ sub init_request ($request_ref = {}) {
}
}

# If lc is not one of the official languages of the country and the request comes from
# a bot crawler, don't index the webpage (return an empty noindex HTML page)
# This also disables every world-{lc} combination for lc != 'en' for web crawlers
if (!($lc ~~ $country_languages{$cc}) and ($request_ref->{is_crawl_bot} eq 1)) {
$request_ref->{no_index} = 1;
}

# select the nutriment table format according to the country
$nutriment_table = $cc_nutriment_table{default};
if (exists $cc_nutriment_table{$cc}) {
@@ -883,7 +900,6 @@ CSS
$request_ref->{cc} = $cc;
$request_ref->{country} = $country;
$request_ref->{lcs} = \@lcs;
set_user_agent_request_ref_attributes($request_ref);

return $request_ref;
}
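For context, the new check in init_request() is a plain membership test: lc must be one of the official languages configured for cc. Below is a self-contained sketch of the same logic, using List::Util's any in place of the smartmatch operator the commit uses; the %country_languages subset and the crawler_should_noindex helper are illustrative, not the real configuration or API:

use strict;
use warnings;
use List::Util qw(any);

# Illustrative subset; the real %country_languages hash comes from
# Product Opener's country configuration
my %country_languages = (
    ch    => ['de', 'fr', 'it'],
    fr    => ['fr'],
    world => ['en'],
);

sub crawler_should_noindex {
    my ($cc, $lc) = @_;
    my $is_official = any { $_ eq $lc } @{ $country_languages{$cc} // [] };
    return $is_official ? 0 : 1;
}

print crawler_should_noindex('ch', 'es'), "\n";    # 1: es is not official in ch
print crawler_should_noindex('ch', 'fr'), "\n";    # 0: fr is official in ch
print crawler_should_noindex('world', 'es'), "\n"; # 1: world-{lc} blocked for lc ne 'en'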
6 changes: 1 addition & 5 deletions lib/ProductOpener/Routing.pm
@@ -94,11 +94,6 @@ sub analyze_request ($request_ref) {

$request_ref->{query_string} = $request_ref->{original_query_string};

# `no_index` specifies whether we send an empty HTML page with a <meta name="robots" content="noindex">
# tag in the HTML <head>. This is only done for known web crawlers (Google, Bing, Yandex,...) on webpages that
# trigger heavy DB aggregation queries and overload our server.
$request_ref->{no_index} = 0;

$log->debug("analyzing query_string, step 0 - unmodified", {query_string => $request_ref->{query_string}})
if $log->is_debug();

@@ -576,6 +571,7 @@ sub analyze_request ($request_ref) {
# Return noindex empty HTML page for web crawlers that crawl specific facet pages
if ($request_ref->{is_crawl_bot} eq 1) {
if (defined $request_ref->{groupby_tagtype}) {
# $request_ref->{no_index} is set to 0 by default in init_request()
$request_ref->{no_index} = 1;
}
elsif (defined $request_ref->{tagtype}) {
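The deletion above matters for ordering: no_index is now initialized once in init_request(), which runs first and may already set it for an unsupported cc-lc, so analyze_request() must not reset it before adding its facet-page checks. A condensed, illustrative sketch of the resulting flow — the process_request and lc_official_for_cc names are hypothetical stand-ins, not the real functions:

use strict;
use warnings;

# Hypothetical stand-in for the real official-language lookup
sub lc_official_for_cc { my ($r) = @_; return $r->{lc_is_official} }

sub process_request {
    my ($request_ref) = @_;

    # init_request(): single initialization point for no_index
    $request_ref->{no_index} = 0;
    $request_ref->{no_index} = 1
        if $request_ref->{is_crawl_bot} and not lc_official_for_cc($request_ref);

    # analyze_request(): may only escalate no_index for facet pages, never reset it
    $request_ref->{no_index} = 1
        if $request_ref->{is_crawl_bot} and defined $request_ref->{groupby_tagtype};

    return $request_ref;
}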
31 changes: 31 additions & 0 deletions tests/integration/facet_page_crawler.t
@@ -156,6 +156,37 @@ my $tests_ref = [
expected_type => 'html',
response_content_must_not_match => 'Fetching facet knowledge panel'
},
# A normal user should get access to every possible cc-lc combination
{
test_case => 'normal-user-get-non-official-cc-lc',
method => 'GET',
path => '/?cc=ch&lc=es',
headers_in => {'User-Agent' => $NORMAL_USER_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_not_match => '<h1>NOINDEX</h1>'
},
# A crawling bot should not have access to a non-official cc-lc combination
# Here lc=es is not an official language of cc=ch
{
test_case => 'crawler-get-non-official-cc-lc',
method => 'GET',
path => '/?cc=ch&lc=es',
headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_match => '<h1>NOINDEX</h1>'
},
# A crawling bot should not have access to world-{lc} where lc != 'en'
{
test_case => 'crawler-get-world-non-en-lc',
method => 'GET',
path => '/?cc=world&lc=es',
headers_in => {'User-Agent' => $CRAWLING_BOT_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_match => '<h1>NOINDEX</h1>'
},
];

execute_api_tests(__FILE__, $tests_ref);
