From 5f1906e5d52dde8141217e0173624632264cdd49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 23 Aug 2024 11:35:18 +0200 Subject: [PATCH 1/3] fix: ban more AI bots in robots.txt --- lib/ProductOpener/Display.pm | 4 ++-- templates/web/pages/robots/robots.tt.txt | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm index 7ae6121d1d477..d6aaf629986ee 100644 --- a/lib/ProductOpener/Display.pm +++ b/lib/ProductOpener/Display.pm @@ -1007,12 +1007,12 @@ sub set_user_agent_request_ref_attributes ($request_ref) { my $is_crawl_bot = 0; my $is_denied_crawl_bot = 0; if ($user_agent_str - =~ /\b(Googlebot|Googlebot-Image|Google-InspectionTool|bingbot|Applebot|Yandex|DuckDuck|DotBot|Seekport|Ahrefs|DataForSeo|Seznam|ZoomBot|Mojeek|QRbot|Qwant|facebookexternalhit|Bytespider|GPTBot|ClaudeBot|SEOkicks|Searchmetrics|MJ12|SurveyBot|SEOdiver|wotbox|Cliqz|Paracrawl|Scrapy|VelenPublicWebCrawler|Semrush|MegaIndex\.ru|Amazon|aiohttp|python-request)/i + =~ /\b(Googlebot|Googlebot-Image|Google-InspectionTool|bingbot|Applebot|Yandex|DuckDuck|DotBot|Seekport|Ahrefs|DataForSeo|Seznam|ZoomBot|Mojeek|QRbot|Qwant|facebookexternalhit|Bytespider|GPTBot|cohere-ai|anthropic-ai|PerplexityBot|ClaudeBot|Claude-Web|SEOkicks|Searchmetrics|MJ12|SurveyBot|SEOdiver|wotbox|Cliqz|Paracrawl|Scrapy|VelenPublicWebCrawler|Semrush|MegaIndex\.ru|Amazon|aiohttp|python-request)/i ) { $is_crawl_bot = 1; if ($user_agent_str - =~ /\b(bingbot|Seekport|Ahrefs|DataForSeo|Seznam|ZoomBot|Mojeek|QRbot|Bytespider|SEOkicks|Searchmetrics|MJ12|SurveyBot|SEOdiver|wotbox|Cliqz|Paracrawl|Scrapy|VelenPublicWebCrawler|Semrush|MegaIndex\.ru|YandexMarket|Amazon|ClaudeBot)/ + =~ /\b(bingbot|Seekport|Ahrefs|DataForSeo|Seznam|ZoomBot|Mojeek|QRbot|Bytespider|SEOkicks|Searchmetrics|MJ12|SurveyBot|SEOdiver|wotbox|Cliqz|Paracrawl|Scrapy|VelenPublicWebCrawler|Semrush|MegaIndex\.ru|YandexMarket|Amazon|GPTBot|PerplexityBot|ClaudeBot|Claude-Web|cohere-ai|anthropic-ai)/ ) { $is_denied_crawl_bot = 1; diff --git a/templates/web/pages/robots/robots.tt.txt b/templates/web/pages/robots/robots.tt.txt index 3337338da8e32..b69bc635a753f 100644 --- a/templates/web/pages/robots/robots.tt.txt +++ b/templates/web/pages/robots/robots.tt.txt @@ -90,4 +90,17 @@ Disallow: / User-agent: AhrefsBot Disallow: / + +User-agent: GPTBot +Disallow: / +User-agent: cohere-ai +Disallow: / +User-agent: anthropic-ai +Disallow: / +User-agent: ClaudeBot +Disallow: / +User-agent: Claude-Web +Disallow: / +User-agent: PerplexityBot +Disallow: / [% END %] \ No newline at end of file From 5afad1ee6083ba01a85006f3c89bd5da3987e920 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 23 Aug 2024 12:31:45 +0200 Subject: [PATCH 2/3] Update lib/ProductOpener/Display.pm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Stéphane Gigandet --- lib/ProductOpener/Display.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm index d6aaf629986ee..b6a19d57dc990 100644 --- a/lib/ProductOpener/Display.pm +++ b/lib/ProductOpener/Display.pm @@ -1012,7 +1012,7 @@ sub set_user_agent_request_ref_attributes ($request_ref) { { $is_crawl_bot = 1; if ($user_agent_str - =~ /\b(bingbot|Seekport|Ahrefs|DataForSeo|Seznam|ZoomBot|Mojeek|QRbot|Bytespider|SEOkicks|Searchmetrics|MJ12|SurveyBot|SEOdiver|wotbox|Cliqz|Paracrawl|Scrapy|VelenPublicWebCrawler|Semrush|MegaIndex\.ru|YandexMarket|Amazon|GPTBot|PerplexityBot|ClaudeBot|Claude-Web|cohere-ai|anthropic-ai)/ + =~ /\b(bingbot|Seekport|Ahrefs|DataForSeo|Seznam|ZoomBot|Mojeek|QRbot|Bytespider|SEOkicks|Searchmetrics|MJ12|SurveyBot|SEOdiver|wotbox|Cliqz|Paracrawl|Scrapy|VelenPublicWebCrawler|Semrush|MegaIndex\.ru|YandexMarket|Amazon|GPTBot|PerplexityBot|ClaudeBot|Claude-Web|cohere-ai|anthropic-ai)/i ) { $is_denied_crawl_bot = 1; From c5d7678370d8f61abfeece7daf917455dca7e818 Mon Sep 17 00:00:00 2001 From: Open Food Facts Bot Date: Mon, 26 Aug 2024 11:40:11 +0000 Subject: [PATCH 3/3] test: Update tests results --- .../get-product-auth-good-password.json | 2 +- .../get-existing-product-gs1-fnc1.json | 2 +- .../api_v3_product_read/get-existing-product.json | 2 +- .../page_crawler/get-robots-txt-ch-it.text | 13 +++++++++++++ .../get-robots-txt-fr-pro-platform.text | 13 +++++++++++++ .../page_crawler/get-robots-txt-fr.text | 13 +++++++++++++ .../get-robots-txt-world-pro-platform.text | 13 +++++++++++++ .../page_crawler/get-robots-txt-world.text | 13 +++++++++++++ 8 files changed, 68 insertions(+), 3 deletions(-) diff --git a/tests/integration/expected_test_results/api_v2_product_write/get-product-auth-good-password.json b/tests/integration/expected_test_results/api_v2_product_write/get-product-auth-good-password.json index 905b94375de9c..bbc069a2636fa 100644 --- a/tests/integration/expected_test_results/api_v2_product_write/get-product-auth-good-password.json +++ b/tests/integration/expected_test_results/api_v2_product_write/get-product-auth-good-password.json @@ -111,7 +111,7 @@ "origins_of_ingredients" : { "aggregated_origins" : [ { - "epi_score" : 0, + "epi_score" : "0", "origin" : "en:unknown", "percent" : 100, "transportation_score" : null diff --git a/tests/integration/expected_test_results/api_v3_product_read/get-existing-product-gs1-fnc1.json b/tests/integration/expected_test_results/api_v3_product_read/get-existing-product-gs1-fnc1.json index 80dd2095d858e..dda39bc5518a7 100644 --- a/tests/integration/expected_test_results/api_v3_product_read/get-existing-product-gs1-fnc1.json +++ b/tests/integration/expected_test_results/api_v3_product_read/get-existing-product-gs1-fnc1.json @@ -106,7 +106,7 @@ "origins_of_ingredients" : { "aggregated_origins" : [ { - "epi_score" : "0", + "epi_score" : 0, "origin" : "en:unknown", "percent" : 100, "transportation_score" : null diff --git a/tests/integration/expected_test_results/api_v3_product_read/get-existing-product.json b/tests/integration/expected_test_results/api_v3_product_read/get-existing-product.json index 06f6dda1ef743..1d726f24acc1c 100644 --- a/tests/integration/expected_test_results/api_v3_product_read/get-existing-product.json +++ b/tests/integration/expected_test_results/api_v3_product_read/get-existing-product.json @@ -106,7 +106,7 @@ "origins_of_ingredients" : { "aggregated_origins" : [ { - "epi_score" : "0", + "epi_score" : 0, "origin" : "en:unknown", "percent" : 100, "transportation_score" : null diff --git a/tests/integration/expected_test_results/page_crawler/get-robots-txt-ch-it.text b/tests/integration/expected_test_results/page_crawler/get-robots-txt-ch-it.text index 27353b85292de..747316e7e8391 100644 --- a/tests/integration/expected_test_results/page_crawler/get-robots-txt-ch-it.text +++ b/tests/integration/expected_test_results/page_crawler/get-robots-txt-ch-it.text @@ -208,3 +208,16 @@ Disallow: / User-agent: AhrefsBot Disallow: / + +User-agent: GPTBot +Disallow: / +User-agent: cohere-ai +Disallow: / +User-agent: anthropic-ai +Disallow: / +User-agent: ClaudeBot +Disallow: / +User-agent: Claude-Web +Disallow: / +User-agent: PerplexityBot +Disallow: / diff --git a/tests/integration/expected_test_results/page_crawler/get-robots-txt-fr-pro-platform.text b/tests/integration/expected_test_results/page_crawler/get-robots-txt-fr-pro-platform.text index 09fba6e0c3fcf..bc4c465474b58 100644 --- a/tests/integration/expected_test_results/page_crawler/get-robots-txt-fr-pro-platform.text +++ b/tests/integration/expected_test_results/page_crawler/get-robots-txt-fr-pro-platform.text @@ -297,3 +297,16 @@ Disallow: / User-agent: AhrefsBot Disallow: / + +User-agent: GPTBot +Disallow: / +User-agent: cohere-ai +Disallow: / +User-agent: anthropic-ai +Disallow: / +User-agent: ClaudeBot +Disallow: / +User-agent: Claude-Web +Disallow: / +User-agent: PerplexityBot +Disallow: / diff --git a/tests/integration/expected_test_results/page_crawler/get-robots-txt-fr.text b/tests/integration/expected_test_results/page_crawler/get-robots-txt-fr.text index 09fba6e0c3fcf..bc4c465474b58 100644 --- a/tests/integration/expected_test_results/page_crawler/get-robots-txt-fr.text +++ b/tests/integration/expected_test_results/page_crawler/get-robots-txt-fr.text @@ -297,3 +297,16 @@ Disallow: / User-agent: AhrefsBot Disallow: / + +User-agent: GPTBot +Disallow: / +User-agent: cohere-ai +Disallow: / +User-agent: anthropic-ai +Disallow: / +User-agent: ClaudeBot +Disallow: / +User-agent: Claude-Web +Disallow: / +User-agent: PerplexityBot +Disallow: / diff --git a/tests/integration/expected_test_results/page_crawler/get-robots-txt-world-pro-platform.text b/tests/integration/expected_test_results/page_crawler/get-robots-txt-world-pro-platform.text index 27353b85292de..747316e7e8391 100644 --- a/tests/integration/expected_test_results/page_crawler/get-robots-txt-world-pro-platform.text +++ b/tests/integration/expected_test_results/page_crawler/get-robots-txt-world-pro-platform.text @@ -208,3 +208,16 @@ Disallow: / User-agent: AhrefsBot Disallow: / + +User-agent: GPTBot +Disallow: / +User-agent: cohere-ai +Disallow: / +User-agent: anthropic-ai +Disallow: / +User-agent: ClaudeBot +Disallow: / +User-agent: Claude-Web +Disallow: / +User-agent: PerplexityBot +Disallow: / diff --git a/tests/integration/expected_test_results/page_crawler/get-robots-txt-world.text b/tests/integration/expected_test_results/page_crawler/get-robots-txt-world.text index 27353b85292de..747316e7e8391 100644 --- a/tests/integration/expected_test_results/page_crawler/get-robots-txt-world.text +++ b/tests/integration/expected_test_results/page_crawler/get-robots-txt-world.text @@ -208,3 +208,16 @@ Disallow: / User-agent: AhrefsBot Disallow: / + +User-agent: GPTBot +Disallow: / +User-agent: cohere-ai +Disallow: / +User-agent: anthropic-ai +Disallow: / +User-agent: ClaudeBot +Disallow: / +User-agent: Claude-Web +Disallow: / +User-agent: PerplexityBot +Disallow: /