From 566042670094831d8f6177c801be1d55a8fd361e Mon Sep 17 00:00:00 2001
From: Xe Iaso
Date: Sat, 6 Jun 2026 10:31:24 -0400
Subject: [PATCH] chore: ban x.ai (#1673)

* chore: ban x.ai

Signed-off-by: Xe Iaso

* chore: spelling

Signed-off-by: Xe Iaso

* chore: spelling

Signed-off-by: Xe Iaso

---------

Signed-off-by: Xe Iaso
---
 .github/actions/spelling/allow.txt |  2 ++
 data/botPolicies.yaml              |  3 +++
 data/crawlers/xai.yaml             | 14 ++++++++++++++
 data/meta/default-config.yaml      |  3 +++
 docs/docs/CHANGELOG.md             |  1 +
 5 files changed, 23 insertions(+)
 create mode 100644 data/crawlers/xai.yaml

diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt
index 2279616f..e5d21c7e 100644
--- a/.github/actions/spelling/allow.txt
+++ b/.github/actions/spelling/allow.txt
@@ -45,3 +45,5 @@ AWOO
 firewalls
 bindhosts
 handrolled
+xai
+gitlab
diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml
index d6d3671e..33c91df6 100644
--- a/data/botPolicies.yaml
+++ b/data/botPolicies.yaml
@@ -41,6 +41,9 @@ bots:
   # Challenge Firefox AI previews
   - import: (data)/clients/x-firefox-ai.yaml
 
+  # x.ai has a scraper that is killing gitlab instances
+  - import: (data)/crawlers/xai.yaml
+
   # Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
   - import: (data)/common/keep-internet-working.yaml
 
diff --git a/data/crawlers/xai.yaml b/data/crawlers/xai.yaml
new file mode 100644
index 00000000..bb7b7269
--- /dev/null
+++ b/data/crawlers/xai.yaml
@@ -0,0 +1,14 @@
+# Deny x.ai's "code-review-sourcing" scraper.
+# NOTE(review): if a rule with both user_agent_regex and remote_addresses
+# needs both to match, the first rule is a subset of the second -- confirm.
+- name: xai-crawler-and-asn
+  action: DENY
+  user_agent_regex: code-review-sourcing.*\+xai-research
+  remote_addresses:
+    # 69.12.56.0 is a network address only at /21 or longer; a shorter
+    # prefix such as /12 would also cover unrelated networks.
+    # TODO: confirm against x.ai's announced ranges.
+    - 69.12.56.0/21
+- name: xai-crawler-user-agent
+  action: DENY
+  user_agent_regex: code-review-sourcing.*\+xai-research
diff --git a/data/meta/default-config.yaml b/data/meta/default-config.yaml
index 73e8a0d2..c9357613 100644
--- a/data/meta/default-config.yaml
+++ b/data/meta/default-config.yaml
@@ -25,6 +25,9 @@
 # Challenge Firefox AI previews
 - import: (data)/clients/x-firefox-ai.yaml
 
+# x.ai has a scraper that is killing gitlab instances
+- import: (data)/crawlers/xai.yaml
+
 # Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
 - import: (data)/common/keep-internet-working.yaml
 
diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md
index 0647316c..3c0b72c8 100644
--- a/docs/docs/CHANGELOG.md
+++ b/docs/docs/CHANGELOG.md
@@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fix a bug in the dataset poisoning maze that could allow denial of service [#1580](https://github.com/TecharoHQ/anubis/issues/1580).
 - Add config option to add ASN to logs/metrics.
 - Log weight when issuing challenge.
+- Block x.ai's crawler for code review training.
 - Gate pprof endpoints behind `metrics.debug` in the policy file.
 - Limit naive honeypot r9k delay to one second.
 - Fix an obscure case where adding query values to a subrequest match could cause an invalid rule match when using path based matching for protected resources.