From 6afb7dc2b06977687bd41a5d6fde67f3e2f9cdd1 Mon Sep 17 00:00:00 2001
From: oldkid
Date: Sat, 26 Apr 2025 06:57:32 +0200
Subject: [PATCH] update ai.robots.txt add robots.txt add apache-badbots.conf

---
Review notes (this area is not part of the commit message):
 * apache-badbots.conf: the alternation no longer ends with "|". A trailing
   "|" adds an empty alternative, and an empty alternative matches every
   User-Agent string.
 * apache-badbots.conf: the first entry is "AI2Bot" (was "I2Bot"), matching
   the robots.txt, .htaccess and nginx lists in this same patch.
 * apache-badbots.conf: the second "VelenPublicWebCrawler" entry was dropped,
   and the appended entries now escape "-" and " " the same way the rest of
   the line does.
 * The blob id on the apache-badbots.conf "index" line predates these edits;
   regenerate the patch after applying if exact ids matter.

 apache-badbots.conf                           |  1 +
 robots.txt                                    | 80 +++++++++++++++++++
 source/ai.robots.txt /.htaccess               |  3 +
 .../ai.robots.txt /nginx-block-ai-bots.conf   |  3 +
 source/ai.robots.txt /robots.txt              |  2 +
 5 files changed, 89 insertions(+)
 create mode 100644 apache-badbots.conf
 create mode 100644 robots.txt
 create mode 100644 source/ai.robots.txt /.htaccess
 create mode 100644 source/ai.robots.txt /nginx-block-ai-bots.conf

diff --git a/apache-badbots.conf b/apache-badbots.conf
new file mode 100644
index 0000000..4ac760c
--- /dev/null
+++ b/apache-badbots.conf
@@ -0,0 +1 @@
+AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|AhrefsBot|Baiduspider|Barkrowler|Bingbot|BLEXBot|Bytedance|DotBot|EmailCollector|facebookcatalog|facebookexternalhit|fidget\-spinner\-bot|Franck\ the\ Fediverse\ Graph\ Crawler|Googlebot|Livelapbot|Mediapartners\-Google|MJ12bot|SemrushBot|SeznamBot|WebEMailExtrac|YandexBot|YisouSpider
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
new file mode 100644
index 0000000..67fcc46
--- /dev/null
+++ b/robots.txt
@@ -0,0 +1,80 @@
+User-agent: AI2Bot
+User-agent: Ai2Bot-Dolma
+User-agent: aiHitBot
+User-agent: Amazonbot
+User-agent: anthropic-ai
+User-agent: Applebot
+User-agent: Applebot-Extended
+User-agent: Brightbot 1.0
+User-agent: Bytespider
+User-agent: CCBot
+User-agent: ChatGPT-User
+User-agent: Claude-Web
+User-agent: ClaudeBot
+User-agent: cohere-ai
+User-agent: cohere-training-data-crawler
+User-agent: Cotoyogi
+User-agent: Crawlspace
+User-agent: Diffbot
+User-agent: DuckAssistBot
+User-agent: FacebookBot
+User-agent: Factset_spyderbot
+User-agent: FirecrawlAgent
+User-agent: FriendlyCrawler
+User-agent: Google-Extended
+User-agent: GoogleOther
+User-agent: GoogleOther-Image
+User-agent: GoogleOther-Video
+User-agent: GPTBot
+User-agent: iaskspider/2.0
+User-agent: ICC-Crawler
+User-agent: ImagesiftBot
+User-agent: img2dataset
+User-agent: imgproxy
+User-agent: ISSCyberRiskCrawler
+User-agent: Kangaroo Bot
+User-agent: meta-externalagent
+User-agent: Meta-ExternalAgent
+User-agent: meta-externalfetcher
+User-agent: Meta-ExternalFetcher
+User-agent: NovaAct
+User-agent: OAI-SearchBot
+User-agent: omgili
+User-agent: omgilibot
+User-agent: Operator
+User-agent: PanguBot
+User-agent: Perplexity-User
+User-agent: PerplexityBot
+User-agent: PetalBot
+User-agent: Scrapy
+User-agent: SemrushBot-OCOB
+User-agent: SemrushBot-SWA
+User-agent: Sidetrade indexer bot
+User-agent: TikTokSpider
+User-agent: Timpibot
+User-agent: VelenPublicWebCrawler
+User-agent: Webzio-Extended
+User-agent: YouBot
+User-agent: AhrefsBot
+User-agent: Baiduspider
+User-agent: Barkrowler
+User-agent: Bingbot
+User-agent: BLEXBot
+User-agent: Bytedance
+User-agent: DotBot
+User-agent: EmailCollector
+User-agent: facebookcatalog
+User-agent: facebookexternalhit
+User-agent: fidget-spinner-bot
+User-agent: Franck the Fediverse Graph Crawler
+User-agent: Googlebot
+User-agent: Livelapbot
+User-agent: Mediapartners-Google
+User-agent: MJ12bot
+User-agent: SemrushBot
+User-agent: SeznamBot
+User-agent: VelenPublicWebCrawler
+User-agent: WebEMailExtrac
+User-agent: YandexBot
+User-agent: YisouSpider
+Disallow: /
diff --git a/source/ai.robots.txt /.htaccess b/source/ai.robots.txt /.htaccess
new file mode 100644
index 0000000..586adab
--- /dev/null
+++ b/source/ai.robots.txt /.htaccess
@@ -0,0 +1,3 @@
+RewriteEngine On
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/source/ai.robots.txt /nginx-block-ai-bots.conf b/source/ai.robots.txt /nginx-block-ai-bots.conf
new file mode 100644
index 0000000..fc58d61
--- /dev/null
+++ b/source/ai.robots.txt /nginx-block-ai-bots.conf
@@ -0,0 +1,3 @@
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+    return 403;
+}
\ No newline at end of file
diff --git a/source/ai.robots.txt /robots.txt b/source/ai.robots.txt /robots.txt
index 53291ca..232e119 100644
--- a/source/ai.robots.txt /robots.txt
+++ b/source/ai.robots.txt /robots.txt
@@ -34,7 +34,9 @@ User-agent: imgproxy
 User-agent: ISSCyberRiskCrawler
 User-agent: Kangaroo Bot
 User-agent: meta-externalagent
+User-agent: Meta-ExternalAgent
 User-agent: meta-externalfetcher
+User-agent: Meta-ExternalFetcher
 User-agent: NovaAct
 User-agent: OAI-SearchBot
 User-agent: omgili