mirror of
https://github.com/TecharoHQ/anubis.git
synced 2026-04-05 08:18:17 +00:00
Compare commits
6 Commits
Xe/less-pa
...
Xe/even-le
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0af1917a10 | ||
|
|
a12b4bb755 | ||
|
|
4dfc73abd1 | ||
|
|
ffbbdce3da | ||
|
|
c09c86778d | ||
|
|
9c47c180d0 |
1
.github/workflows/smoke-tests.yml
vendored
1
.github/workflows/smoke-tests.yml
vendored
@@ -14,6 +14,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
test:
|
||||
- default-config-macro
|
||||
- double_slash
|
||||
- forced-language
|
||||
- git-clone
|
||||
|
||||
@@ -11,6 +11,9 @@
|
||||
## /usr/share/docs/anubis/data or in the tarball you extracted Anubis from.
|
||||
|
||||
bots:
|
||||
# You can import the entire default config with this macro:
|
||||
# - import: (data)/meta/default-config.yaml
|
||||
|
||||
# Pathological bots to deny
|
||||
- # This correlates to data/bots/_deny-pathological.yaml in the source tree
|
||||
# https://github.com/TecharoHQ/anubis/blob/main/data/bots/_deny-pathological.yaml
|
||||
@@ -93,6 +96,50 @@ bots:
|
||||
# weight:
|
||||
# adjust: -10
|
||||
|
||||
# Assert behaviour that only genuine browsers display. This ensures that Chrome
|
||||
# or Firefox versions
|
||||
- name: realistic-browser-catchall
|
||||
expression:
|
||||
all:
|
||||
- '"User-Agent" in headers'
|
||||
- '( userAgent.contains("Firefox") ) || ( userAgent.contains("Chrome") ) || ( userAgent.contains("Safari") )'
|
||||
- '"Accept" in headers'
|
||||
- '"Sec-Fetch-Dest" in headers'
|
||||
- '"Sec-Fetch-Mode" in headers'
|
||||
- '"Sec-Fetch-Site" in headers'
|
||||
- '"Accept-Encoding" in headers'
|
||||
- '( headers["Accept-Encoding"].contains("zstd") || headers["Accept-Encoding"].contains("br") )'
|
||||
- '"Accept-Language" in headers'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: -10
|
||||
|
||||
# The Upgrade-Insecure-Requests header is typically sent by browsers, but not always
|
||||
- name: upgrade-insecure-requests
|
||||
expression: '"Upgrade-Insecure-Requests" in headers'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: -2
|
||||
|
||||
# Chrome should behave like Chrome
|
||||
- name: chrome-is-proper
|
||||
expression:
|
||||
all:
|
||||
- userAgent.contains("Chrome")
|
||||
- '"Sec-Ch-Ua" in headers'
|
||||
- 'headers["Sec-Ch-Ua"].contains("Chromium")'
|
||||
- '"Sec-Ch-Ua-Mobile" in headers'
|
||||
- '"Sec-Ch-Ua-Platform" in headers'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: -5
|
||||
|
||||
- name: should-have-accept
|
||||
expression: '!("Accept" in headers)'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: 5
|
||||
|
||||
# Generic catchall rule
|
||||
- name: generic-browser
|
||||
user_agent_regex: >-
|
||||
@@ -212,14 +259,10 @@ thresholds:
|
||||
- weight < 20
|
||||
action: CHALLENGE
|
||||
challenge:
|
||||
# https://anubis.techaro.lol/docs/admin/configuration/challenges/preact
|
||||
#
|
||||
# This challenge proves the client can run a webapp written with Preact.
|
||||
# The preact webapp simply loads, calculates the SHA-256 checksum of the
|
||||
# challenge data, and forwards that to the client.
|
||||
algorithm: preact
|
||||
difficulty: 1
|
||||
report_as: 1
|
||||
# https://anubis.techaro.lol/docs/admin/configuration/challenges/proof-of-work
|
||||
algorithm: fast
|
||||
difficulty: 2 # two leading zeros, very fast for most clients
|
||||
report_as: 2
|
||||
- name: mild-proof-of-work
|
||||
expression:
|
||||
all:
|
||||
@@ -229,8 +272,8 @@ thresholds:
|
||||
challenge:
|
||||
# https://anubis.techaro.lol/docs/admin/configuration/challenges/proof-of-work
|
||||
algorithm: fast
|
||||
difficulty: 2 # two leading zeros, very fast for most clients
|
||||
report_as: 2
|
||||
difficulty: 4
|
||||
report_as: 4
|
||||
# For clients that are browser like and have gained many points from custom rules
|
||||
- name: extreme-suspicion
|
||||
expression: weight >= 30
|
||||
@@ -238,5 +281,5 @@ thresholds:
|
||||
challenge:
|
||||
# https://anubis.techaro.lol/docs/admin/configuration/challenges/proof-of-work
|
||||
algorithm: fast
|
||||
difficulty: 4
|
||||
report_as: 4
|
||||
difficulty: 6
|
||||
report_as: 6
|
||||
|
||||
@@ -2,13 +2,19 @@
|
||||
action: ALLOW
|
||||
expression:
|
||||
all:
|
||||
- >
|
||||
(
|
||||
userAgent.startsWith("git/") ||
|
||||
userAgent.contains("libgit") ||
|
||||
userAgent.startsWith("go-git") ||
|
||||
userAgent.startsWith("JGit/") ||
|
||||
userAgent.startsWith("JGit-")
|
||||
)
|
||||
- '"Git-Protocol" in headers'
|
||||
- headers["Git-Protocol"] == "version=2"
|
||||
- >
|
||||
(
|
||||
userAgent.startsWith("git/") ||
|
||||
userAgent.contains("libgit") ||
|
||||
userAgent.startsWith("go-git") ||
|
||||
userAgent.startsWith("JGit/") ||
|
||||
userAgent.startsWith("JGit-")
|
||||
)
|
||||
- '"Accept" in headers'
|
||||
- headers["Accept"] == "*/*"
|
||||
- '"Cache-Control" in headers'
|
||||
- headers["Cache-Control"] == "no-cache"
|
||||
- '"Pragma" in headers'
|
||||
- headers["Pragma"] == "no-cache"
|
||||
- '"Accept-Encoding" in headers'
|
||||
- headers["Accept-Encoding"].contains("gzip")
|
||||
|
||||
133
data/meta/default-config.yaml
Normal file
133
data/meta/default-config.yaml
Normal file
@@ -0,0 +1,133 @@
|
||||
- # Pathological bots to deny
|
||||
# This correlates to data/bots/_deny-pathological.yaml in the source tree
|
||||
# https://github.com/TecharoHQ/anubis/blob/main/data/bots/_deny-pathological.yaml
|
||||
import: (data)/bots/_deny-pathological.yaml
|
||||
- import: (data)/bots/aggressive-brazilian-scrapers.yaml
|
||||
|
||||
# Aggressively block AI/LLM related bots/agents by default
|
||||
- import: (data)/meta/ai-block-aggressive.yaml
|
||||
|
||||
# Consider replacing the aggressive AI policy with more selective policies:
|
||||
# - import: (data)/meta/ai-block-moderate.yaml
|
||||
# - import: (data)/meta/ai-block-permissive.yaml
|
||||
|
||||
# Search engine crawlers to allow, defaults to:
|
||||
# - Google (so they don't try to bypass Anubis)
|
||||
# - Apple
|
||||
# - Bing
|
||||
# - DuckDuckGo
|
||||
# - Qwant
|
||||
# - The Internet Archive
|
||||
# - Kagi
|
||||
# - Marginalia
|
||||
# - Mojeek
|
||||
- import: (data)/crawlers/_allow-good.yaml
|
||||
# Challenge Firefox AI previews
|
||||
- import: (data)/clients/x-firefox-ai.yaml
|
||||
|
||||
# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
|
||||
- import: (data)/common/keep-internet-working.yaml
|
||||
|
||||
# # Punish any bot with "bot" in the user-agent string
|
||||
# # This is known to have a high false-positive rate, use at your own risk
|
||||
# - name: generic-bot-catchall
|
||||
# user_agent_regex: (?i:bot|crawler)
|
||||
# action: CHALLENGE
|
||||
# challenge:
|
||||
# difficulty: 16 # impossible
|
||||
# report_as: 4 # lie to the operator
|
||||
# algorithm: slow # intentionally waste CPU cycles and time
|
||||
|
||||
# Requires a subscription to Thoth to use, see
|
||||
# https://anubis.techaro.lol/docs/admin/thoth#geoip-based-filtering
|
||||
- name: countries-with-aggressive-scrapers
|
||||
action: WEIGH
|
||||
geoip:
|
||||
countries:
|
||||
- BR
|
||||
- CN
|
||||
weight:
|
||||
adjust: 10
|
||||
|
||||
# Requires a subscription to Thoth to use, see
|
||||
# https://anubis.techaro.lol/docs/admin/thoth#asn-based-filtering
|
||||
- name: aggressive-asns-without-functional-abuse-contact
|
||||
action: WEIGH
|
||||
asns:
|
||||
match:
|
||||
- 13335 # Cloudflare
|
||||
- 136907 # Huawei Cloud
|
||||
- 45102 # Alibaba Cloud
|
||||
weight:
|
||||
adjust: 10
|
||||
|
||||
# ## System load based checks.
|
||||
# # If the system is under high load, add weight.
|
||||
# - name: high-load-average
|
||||
# action: WEIGH
|
||||
# expression: load_1m >= 10.0 # make sure to end the load comparison in a .0
|
||||
# weight:
|
||||
# adjust: 20
|
||||
|
||||
## If your backend service is running on the same operating system as Anubis,
|
||||
## you can uncomment this rule to make the challenge easier when the system is
|
||||
## under low load.
|
||||
##
|
||||
## If it is not, remove weight.
|
||||
# - name: low-load-average
|
||||
# action: WEIGH
|
||||
# expression: load_15m <= 4.0 # make sure to end the load comparison in a .0
|
||||
# weight:
|
||||
# adjust: -10
|
||||
|
||||
# Assert behaviour that only genuine browsers display. This ensures that Chrome
|
||||
# or Firefox versions
|
||||
- name: realistic-browser-catchall
|
||||
expression:
|
||||
all:
|
||||
- '"User-Agent" in headers'
|
||||
- '( userAgent.contains("Firefox") ) || ( userAgent.contains("Chrome") ) || ( userAgent.contains("Safari") )'
|
||||
- '"Accept" in headers'
|
||||
- '"Sec-Fetch-Dest" in headers'
|
||||
- '"Sec-Fetch-Mode" in headers'
|
||||
- '"Sec-Fetch-Site" in headers'
|
||||
- '"Accept-Encoding" in headers'
|
||||
- '( headers["Accept-Encoding"].contains("zstd") || headers["Accept-Encoding"].contains("br") )'
|
||||
- '"Accept-Language" in headers'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: -10
|
||||
|
||||
# The Upgrade-Insecure-Requests header is typically sent by browsers, but not always
|
||||
- name: upgrade-insecure-requests
|
||||
expression: '"Upgrade-Insecure-Requests" in headers'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: -2
|
||||
|
||||
# Chrome should behave like Chrome
|
||||
- name: chrome-is-proper
|
||||
expression:
|
||||
all:
|
||||
- userAgent.contains("Chrome")
|
||||
- '"Sec-Ch-Ua" in headers'
|
||||
- 'headers["Sec-Ch-Ua"].contains("Chromium")'
|
||||
- '"Sec-Ch-Ua-Mobile" in headers'
|
||||
- '"Sec-Ch-Ua-Platform" in headers'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: -5
|
||||
|
||||
- name: should-have-accept
|
||||
expression: '!("Accept" in headers)'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: 5
|
||||
|
||||
# Generic catchall rule
|
||||
- name: generic-browser
|
||||
user_agent_regex: >-
|
||||
Mozilla|Opera
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: 10
|
||||
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
<!-- This changes the project to: -->
|
||||
|
||||
- Added `(data)/meta/default-config.yaml` for importing the entire default configuration at once.
|
||||
- Add `-custom-real-ip-header` flag to get the original request IP from a different header than `x-real-ip`.
|
||||
- Add `contentLength` variable to bot expressions.
|
||||
- Add `COOKIE_SAME_SITE_MODE` to force anubis cookies SameSite value, and downgrade automatically from `None` to `Lax` if cookie is insecure.
|
||||
@@ -29,8 +30,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Fixes concurrency problems with very old browsers ([#1082](https://github.com/TecharoHQ/anubis/issues/1082)).
|
||||
- Randomly use the Refresh header instead of the meta refresh tag in the metarefresh challenge.
|
||||
- Update OpenRC service to truncate the runtime directory before starting Anubis.
|
||||
- Make the git client profile more strictly match how the git client behaves.
|
||||
- Make the default configuration reward users using normal browsers.
|
||||
- Allow multiple consecutive slashes in a row in application paths ([#754](https://github.com/TecharoHQ/anubis/issues/754)).
|
||||
- Add option to set `targetSNI` to special keyword 'auto' to indicate that it should be automatically set to the request Host name ([424](https://github.com/TecharoHQ/anubis/issues/424)).
|
||||
- The Preact challenge has been removed from the default configuration. It will be deprecated in the future.
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ sequenceDiagram
|
||||
participant Validation
|
||||
participant Evil Site
|
||||
|
||||
Hacker->>+User: Click on yoursite.com with this solution
|
||||
Hacker->>+User: Click on example.org with this solution
|
||||
User->>+Validation: Here's a solution, send me to evilsite.com
|
||||
Validation->>+User: Here's a cookie, go to evilsite.com
|
||||
User->>+Evil Site: GET evilsite.com
|
||||
@@ -46,11 +46,14 @@ Redirect domain not allowed
|
||||
|
||||
## Configuring allowed redirect domains
|
||||
|
||||
By default, Anubis will limit redirects to be on the same HTTP Host that Anubis is running on (EG: requests to yoursite.com cannot redirect outside of yoursite.com). If you need to set more than one domain, fill the `REDIRECT_DOMAINS` environment variable with a comma-separated list of domain names that Anubis should allow redirects to.
|
||||
By default, Anubis may redirect to any domain which could cause security issues in the unlikely case that an attacker passes a challenge for your browser and then tricks you into clicking a link to your domain.
|
||||
One can restrict the domains that Anubis can redirect to when passing a challenge by setting up `REDIRECT_DOMAINS` environment variable.
|
||||
If you need to set more than one domain, fill the environment variable with a comma-separated list of domain names.
|
||||
There is also glob matching support. You can pass `*.bugs.techaro.lol` to allow redirecting to anything ending with `.bugs.techaro.lol`. There is a limit of 4 wildcards.
|
||||
|
||||
:::note
|
||||
|
||||
These domains are _an exact string match_, they do not support wildcard matches.
|
||||
If you are hosting Anubis on a non-standard port (`https://example.com:8443`, `http://www.example.net:8080`, etc.), you must also include the port number here.
|
||||
|
||||
:::
|
||||
|
||||
@@ -60,7 +63,7 @@ These domains are _an exact string match_, they do not support wildcard matches.
|
||||
```shell
|
||||
# anubis.env
|
||||
|
||||
REDIRECT_DOMAINS="yoursite.com,secretplans.yoursite.com"
|
||||
REDIRECT_DOMAINS="example.org,secretplans.example.org,*.test.example.org"
|
||||
# ...
|
||||
```
|
||||
|
||||
@@ -72,7 +75,7 @@ services:
|
||||
anubis-nginx:
|
||||
image: ghcr.io/techarohq/anubis:latest
|
||||
environment:
|
||||
REDIRECT_DOMAINS: "yoursite.com,secretplans.yoursite.com"
|
||||
REDIRECT_DOMAINS: "example.org,secretplans.example.org,*.test.example.org"
|
||||
# ...
|
||||
```
|
||||
|
||||
@@ -86,7 +89,7 @@ Inside your Deployment, StatefulSet, or Pod:
|
||||
image: ghcr.io/techarohq/anubis:latest
|
||||
env:
|
||||
- name: REDIRECT_DOMAINS
|
||||
value: "yoursite.com,secretplans.yoursite.com"
|
||||
value: "example.org,secretplans.example.org,*.test.example.org"
|
||||
# ...
|
||||
```
|
||||
|
||||
|
||||
@@ -95,7 +95,7 @@ Anubis uses these environment variables for configuration:
|
||||
| `OVERLAY_FOLDER` | unset | <EO /> If set, treat the given path as an [overlay folder](./botstopper.mdx#custom-images-and-css), allowing you to customize CSS, fonts, images, and add other assets to BotStopper deployments. |
|
||||
| `POLICY_FNAME` | unset | The file containing [bot policy configuration](./policies.mdx). See the bot policy documentation for more details. If unset, the default bot policy configuration is used. |
|
||||
| `PUBLIC_URL` | unset | The externally accessible URL for this Anubis instance, used for constructing redirect URLs (e.g., for Traefik forwardAuth). |
|
||||
| `REDIRECT_DOMAINS` | unset | If set, restrict the domains that Anubis can redirect to when passing a challenge.<br/><br/>If this is unset, Anubis may redirect to any domain which could cause security issues in the unlikely case that an attacker passes a challenge for your browser and then tricks you into clicking a link to your domain.<br/><br/>Note that if you are hosting Anubis on a non-standard port (`https://example:com:8443`, `http://www.example.net:8080`, etc.), you must also include the port number here. |
|
||||
| `REDIRECT_DOMAINS` | unset | Comma-separated list of domain names that Anubis should allow redirects to when passing a challenge. See [Redirect Domain Configuration](./configuration/redirect-domains) for more details. |
|
||||
| `SERVE_ROBOTS_TXT` | `false` | If set `true`, Anubis will serve a default `robots.txt` file that disallows all known AI scrapers by name and then additionally disallows every scraper. This is useful if facts and circumstances make it difficult to change the underlying service to serve such a `robots.txt` file. |
|
||||
| `SLOG_LEVEL` | `INFO` | The log level for structured logging. Valid values are `DEBUG`, `INFO`, `WARN`, and `ERROR`. Set to `DEBUG` to see all requests, evaluations, and detailed diagnostic information. |
|
||||
| `SOCKET_MODE` | `0770` | _Only used when at least one of the `*_BIND_NETWORK` variables are set to `unix`._ The socket mode (permissions) for Unix domain sockets. |
|
||||
|
||||
@@ -194,6 +194,7 @@ func (u *userAgentRoundTripper) RoundTrip(req *http.Request) (*http.Response, er
|
||||
// Only set if not already present
|
||||
req = req.Clone(req.Context()) // avoid mutating original request
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0")
|
||||
req.Header.Set("Accept-Encoding", "gzip")
|
||||
return u.rt.RoundTrip(req)
|
||||
}
|
||||
|
||||
|
||||
82
test/default-config-macro/compare_bots.py
Normal file
82
test/default-config-macro/compare_bots.py
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to verify that the 'bots' field in data/botPolicies.yaml
|
||||
has the same semantic contents as data/meta/default-config.yaml.
|
||||
|
||||
CW: generated by AI
|
||||
"""
|
||||
|
||||
import yaml
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import difflib
|
||||
|
||||
def load_yaml(file_path):
    """Load a YAML document from *file_path* and return the parsed data.

    Prints the error and exits the process with status 1 if the file cannot
    be read or parsed -- this is a CLI helper, not a library function, so a
    hard exit is the desired failure mode.
    """
    try:
        # Explicit encoding so parsing does not depend on the platform's
        # locale default (YAML config files in this repo are UTF-8).
        with open(file_path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        sys.exit(1)
|
||||
|
||||
def normalize_yaml(data):
    """Return *data* unchanged.

    yaml.safe_load already strips comments and canonicalizes scalars, so no
    further normalization is needed today; this hook exists so the comparison
    can grow extra massaging later without touching the call sites.
    """
    # Identity for now -- safe_load output is directly comparable.
    return data
|
||||
|
||||
def get_repo_root():
    """Return the absolute path of the enclosing git repository's root.

    Exits with status 1 when the current working directory is not inside a
    git repository.
    """
    cmd = ['git', 'rev-parse', '--show-toplevel']
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        print("Error: Not in a git repository")
        sys.exit(1)
    # git prints the path followed by a newline; trim it off.
    return proc.stdout.strip()
|
||||
|
||||
def main():
    """Compare the 'bots' section of botPolicies.yaml with default-config.yaml.

    Exits 0 when the two are semantically identical; otherwise prints a
    unified diff of their YAML dumps and exits 1.
    """
    root = get_repo_root()

    # Both paths are fixed relative to the repository root.
    policies_path = os.path.join(root, 'data', 'botPolicies.yaml')
    default_path = os.path.join(root, 'data', 'meta', 'default-config.yaml')

    policies = load_yaml(policies_path)
    default_cfg = load_yaml(default_path)

    if 'bots' not in policies:
        print("Error: 'bots' field not found in botPolicies.yaml")
        sys.exit(1)

    # default-config.yaml is a bare list of bot rules, so it is compared
    # directly against the 'bots' list extracted from the policy file.
    lhs = normalize_yaml(policies['bots'])
    rhs = normalize_yaml(default_cfg)

    if lhs == rhs:
        print("SUCCESS: The 'bots' field in botPolicies.yaml matches the contents of default-config.yaml")
        sys.exit(0)

    print("FAILURE: The 'bots' field in botPolicies.yaml does not match the contents of default-config.yaml")
    print("\nDiff:")
    # Dump both sides back to YAML text so the diff is human-readable.
    lhs_text = yaml.dump(lhs, default_flow_style=False)
    rhs_text = yaml.dump(rhs, default_flow_style=False)
    delta = difflib.unified_diff(
        lhs_text.splitlines(keepends=True),
        rhs_text.splitlines(keepends=True),
        fromfile='bots field in botPolicies.yaml',
        tofile='default-config.yaml',
    )
    print(''.join(delta))
    sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
7
test/default-config-macro/test.sh
Executable file
7
test/default-config-macro/test.sh
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env bash

set -euo pipefail

# Run from this script's directory so the relative path below resolves.
cd "$(dirname "$0")"

# Probe for PyYAML first so a missing dependency fails with a clear
# ImportError instead of a confusing traceback from the comparison script.
python3 -c 'import yaml'
python3 ./compare_bots.py
|
||||
Reference in New Issue
Block a user