mirror of
https://github.com/TecharoHQ/anubis.git
synced 2026-04-06 00:38:18 +00:00
Compare commits
6 Commits
Xe/default
...
Xe/fix-web
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5756f51495 | ||
|
|
b345e0069b | ||
|
|
00261d049e | ||
|
|
a12b4bb755 | ||
|
|
4dfc73abd1 | ||
|
|
ffbbdce3da |
1
.github/workflows/smoke-tests.yml
vendored
1
.github/workflows/smoke-tests.yml
vendored
@@ -14,6 +14,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
test:
|
||||
- default-config-macro
|
||||
- double_slash
|
||||
- forced-language
|
||||
- git-clone
|
||||
|
||||
@@ -11,6 +11,9 @@
|
||||
## /usr/share/docs/anubis/data or in the tarball you extracted Anubis from.
|
||||
|
||||
bots:
|
||||
# You can import the entire default config with this macro:
|
||||
# - import: (data)/meta/default-config.yaml
|
||||
|
||||
# Pathological bots to deny
|
||||
- # This correlates to data/bots/_deny-pathological.yaml in the source tree
|
||||
# https://github.com/TecharoHQ/anubis/blob/main/data/bots/_deny-pathological.yaml
|
||||
@@ -104,7 +107,6 @@ bots:
|
||||
- '"Sec-Fetch-Dest" in headers'
|
||||
- '"Sec-Fetch-Mode" in headers'
|
||||
- '"Sec-Fetch-Site" in headers'
|
||||
- '"Upgrade-Insecure-Requests" in headers'
|
||||
- '"Accept-Encoding" in headers'
|
||||
- '( headers["Accept-Encoding"].contains("zstd") || headers["Accept-Encoding"].contains("br") )'
|
||||
- '"Accept-Language" in headers'
|
||||
@@ -112,6 +114,13 @@ bots:
|
||||
weight:
|
||||
adjust: -10
|
||||
|
||||
# The Upgrade-Insecure-Requests header is typically sent by browsers, but not always
|
||||
- name: upgrade-insecure-requests
|
||||
expression: '"Upgrade-Insecure-Requests" in headers'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: -2
|
||||
|
||||
# Chrome should behave like Chrome
|
||||
- name: chrome-is-proper
|
||||
expression:
|
||||
|
||||
133
data/meta/default-config.yaml
Normal file
133
data/meta/default-config.yaml
Normal file
@@ -0,0 +1,133 @@
|
||||
- # Pathological bots to deny
|
||||
# This correlates to data/bots/_deny-pathological.yaml in the source tree
|
||||
# https://github.com/TecharoHQ/anubis/blob/main/data/bots/_deny-pathological.yaml
|
||||
import: (data)/bots/_deny-pathological.yaml
|
||||
- import: (data)/bots/aggressive-brazilian-scrapers.yaml
|
||||
|
||||
# Aggressively block AI/LLM related bots/agents by default
|
||||
- import: (data)/meta/ai-block-aggressive.yaml
|
||||
|
||||
# Consider replacing the aggressive AI policy with more selective policies:
|
||||
# - import: (data)/meta/ai-block-moderate.yaml
|
||||
# - import: (data)/meta/ai-block-permissive.yaml
|
||||
|
||||
# Search engine crawlers to allow, defaults to:
|
||||
# - Google (so they don't try to bypass Anubis)
|
||||
# - Apple
|
||||
# - Bing
|
||||
# - DuckDuckGo
|
||||
# - Qwant
|
||||
# - The Internet Archive
|
||||
# - Kagi
|
||||
# - Marginalia
|
||||
# - Mojeek
|
||||
- import: (data)/crawlers/_allow-good.yaml
|
||||
# Challenge Firefox AI previews
|
||||
- import: (data)/clients/x-firefox-ai.yaml
|
||||
|
||||
# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
|
||||
- import: (data)/common/keep-internet-working.yaml
|
||||
|
||||
# # Punish any bot with "bot" in the user-agent string
|
||||
# # This is known to have a high false-positive rate, use at your own risk
|
||||
# - name: generic-bot-catchall
|
||||
# user_agent_regex: (?i:bot|crawler)
|
||||
# action: CHALLENGE
|
||||
# challenge:
|
||||
# difficulty: 16 # impossible
|
||||
# report_as: 4 # lie to the operator
|
||||
# algorithm: slow # intentionally waste CPU cycles and time
|
||||
|
||||
# Requires a subscription to Thoth to use, see
|
||||
# https://anubis.techaro.lol/docs/admin/thoth#geoip-based-filtering
|
||||
- name: countries-with-aggressive-scrapers
|
||||
action: WEIGH
|
||||
geoip:
|
||||
countries:
|
||||
- BR
|
||||
- CN
|
||||
weight:
|
||||
adjust: 10
|
||||
|
||||
# Requires a subscription to Thoth to use, see
|
||||
# https://anubis.techaro.lol/docs/admin/thoth#asn-based-filtering
|
||||
- name: aggressive-asns-without-functional-abuse-contact
|
||||
action: WEIGH
|
||||
asns:
|
||||
match:
|
||||
- 13335 # Cloudflare
|
||||
- 136907 # Huawei Cloud
|
||||
- 45102 # Alibaba Cloud
|
||||
weight:
|
||||
adjust: 10
|
||||
|
||||
# ## System load based checks.
|
||||
# # If the system is under high load, add weight.
|
||||
# - name: high-load-average
|
||||
# action: WEIGH
|
||||
# expression: load_1m >= 10.0 # make sure to end the load comparison in a .0
|
||||
# weight:
|
||||
# adjust: 20
|
||||
|
||||
## If your backend service is running on the same operating system as Anubis,
|
||||
## you can uncomment this rule to make the challenge easier when the system is
|
||||
## under low load.
|
||||
##
|
||||
## If it is not, remove weight.
|
||||
# - name: low-load-average
|
||||
# action: WEIGH
|
||||
# expression: load_15m <= 4.0 # make sure to end the load comparison in a .0
|
||||
# weight:
|
||||
# adjust: -10
|
||||
|
||||
# Assert behaviour that only genuine browsers display. This ensures that Chrome
|
||||
# or Firefox versions
|
||||
- name: realistic-browser-catchall
|
||||
expression:
|
||||
all:
|
||||
- '"User-Agent" in headers'
|
||||
- '( userAgent.contains("Firefox") ) || ( userAgent.contains("Chrome") ) || ( userAgent.contains("Safari") )'
|
||||
- '"Accept" in headers'
|
||||
- '"Sec-Fetch-Dest" in headers'
|
||||
- '"Sec-Fetch-Mode" in headers'
|
||||
- '"Sec-Fetch-Site" in headers'
|
||||
- '"Accept-Encoding" in headers'
|
||||
- '( headers["Accept-Encoding"].contains("zstd") || headers["Accept-Encoding"].contains("br") )'
|
||||
- '"Accept-Language" in headers'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: -10
|
||||
|
||||
# The Upgrade-Insecure-Requests header is typically sent by browsers, but not always
|
||||
- name: upgrade-insecure-requests
|
||||
expression: '"Upgrade-Insecure-Requests" in headers'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: -2
|
||||
|
||||
# Chrome should behave like Chrome
|
||||
- name: chrome-is-proper
|
||||
expression:
|
||||
all:
|
||||
- userAgent.contains("Chrome")
|
||||
- '"Sec-Ch-Ua" in headers'
|
||||
- 'headers["Sec-Ch-Ua"].contains("Chromium")'
|
||||
- '"Sec-Ch-Ua-Mobile" in headers'
|
||||
- '"Sec-Ch-Ua-Platform" in headers'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: -5
|
||||
|
||||
- name: should-have-accept
|
||||
expression: '!("Accept" in headers)'
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: 5
|
||||
|
||||
# Generic catchall rule
|
||||
- name: generic-browser
|
||||
user_agent_regex: >-
|
||||
Mozilla|Opera
|
||||
action: WEIGH
|
||||
weight:
|
||||
adjust: 10
|
||||
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
<!-- This changes the project to: -->
|
||||
|
||||
- Added `(data)/meta/default-config.yaml` for importing the entire default configuration at once.
|
||||
- Add `-custom-real-ip-header` flag to get the original request IP from a different header than `x-real-ip`.
|
||||
- Add `contentLength` variable to bot expressions.
|
||||
- Add `COOKIE_SAME_SITE_MODE` to force anubis cookies SameSite value, and downgrade automatically from `None` to `Lax` if cookie is insecure.
|
||||
@@ -21,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Document missing environment variables in installation guide: `SLOG_LEVEL`, `COOKIE_PREFIX`, `FORCED_LANGUAGE`, and `TARGET_DISABLE_KEEPALIVE` ([#1086](https://github.com/TecharoHQ/anubis/pull/1086)).
|
||||
- Add validation warning when persistent storage is used without setting signing keys.
|
||||
- Fixed `robots2policy` to properly group consecutive user agents into `any:` instead of only processing the last one ([#925](https://github.com/TecharoHQ/anubis/pull/925)).
|
||||
- Make the `fast` algorithm prefer purejs when running in an insecure context.
|
||||
- Add the [`s3api` storage backend](./admin/policies.mdx#s3api) to allow Anubis to use S3 API compatible object storage as its storage backend.
|
||||
- Fix a "stutter" in the cookie name prefix so the auth cookie is named `techaro.lol-anubis-auth` instead of `techaro.lol-anubis-auth-auth`.
|
||||
- Make `cmd/containerbuild` support commas for separating elements of the `--docker-tags` argument as well as newlines.
|
||||
|
||||
@@ -32,7 +32,7 @@ sequenceDiagram
|
||||
participant Validation
|
||||
participant Evil Site
|
||||
|
||||
Hacker->>+User: Click on yoursite.com with this solution
|
||||
Hacker->>+User: Click on example.org with this solution
|
||||
User->>+Validation: Here's a solution, send me to evilsite.com
|
||||
Validation->>+User: Here's a cookie, go to evilsite.com
|
||||
User->>+Evil Site: GET evilsite.com
|
||||
@@ -46,11 +46,14 @@ Redirect domain not allowed
|
||||
|
||||
## Configuring allowed redirect domains
|
||||
|
||||
By default, Anubis will limit redirects to be on the same HTTP Host that Anubis is running on (EG: requests to yoursite.com cannot redirect outside of yoursite.com). If you need to set more than one domain, fill the `REDIRECT_DOMAINS` environment variable with a comma-separated list of domain names that Anubis should allow redirects to.
|
||||
By default, Anubis may redirect to any domain which could cause security issues in the unlikely case that an attacker passes a challenge for your browser and then tricks you into clicking a link to your domain.
|
||||
One can restrict the domains that Anubis can redirect to when passing a challenge by setting up `REDIRECT_DOMAINS` environment variable.
|
||||
If you need to set more than one domain, fill the environment variable with a comma-separated list of domain names.
|
||||
There is also glob matching support. You can pass `*.bugs.techaro.lol` to allow redirecting to anything ending with `.bugs.techaro.lol`. There is a limit of 4 wildcards.
|
||||
|
||||
:::note
|
||||
|
||||
These domains are _an exact string match_, they do not support wildcard matches.
|
||||
If you are hosting Anubis on a non-standard port (`https://example:com:8443`, `http://www.example.net:8080`, etc.), you must also include the port number here.
|
||||
|
||||
:::
|
||||
|
||||
@@ -60,7 +63,7 @@ These domains are _an exact string match_, they do not support wildcard matches.
|
||||
```shell
|
||||
# anubis.env
|
||||
|
||||
REDIRECT_DOMAINS="yoursite.com,secretplans.yoursite.com"
|
||||
REDIRECT_DOMAINS="example.org,secretplans.example.org,*.test.example.org"
|
||||
# ...
|
||||
```
|
||||
|
||||
@@ -72,7 +75,7 @@ services:
|
||||
anubis-nginx:
|
||||
image: ghcr.io/techarohq/anubis:latest
|
||||
environment:
|
||||
REDIRECT_DOMAINS: "yoursite.com,secretplans.yoursite.com"
|
||||
REDIRECT_DOMAINS: "example.org,secretplans.example.org,*.test.example.org"
|
||||
# ...
|
||||
```
|
||||
|
||||
@@ -86,7 +89,7 @@ Inside your Deployment, StatefulSet, or Pod:
|
||||
image: ghcr.io/techarohq/anubis:latest
|
||||
env:
|
||||
- name: REDIRECT_DOMAINS
|
||||
value: "yoursite.com,secretplans.yoursite.com"
|
||||
value: "example.org,secretplans.example.org,*.test.example.org"
|
||||
# ...
|
||||
```
|
||||
|
||||
|
||||
@@ -95,7 +95,7 @@ Anubis uses these environment variables for configuration:
|
||||
| `OVERLAY_FOLDER` | unset | <EO /> If set, treat the given path as an [overlay folder](./botstopper.mdx#custom-images-and-css), allowing you to customize CSS, fonts, images, and add other assets to BotStopper deployments. |
|
||||
| `POLICY_FNAME` | unset | The file containing [bot policy configuration](./policies.mdx). See the bot policy documentation for more details. If unset, the default bot policy configuration is used. |
|
||||
| `PUBLIC_URL` | unset | The externally accessible URL for this Anubis instance, used for constructing redirect URLs (e.g., for Traefik forwardAuth). |
|
||||
| `REDIRECT_DOMAINS` | unset | If set, restrict the domains that Anubis can redirect to when passing a challenge.<br/><br/>If this is unset, Anubis may redirect to any domain which could cause security issues in the unlikely case that an attacker passes a challenge for your browser and then tricks you into clicking a link to your domain.<br/><br/>Note that if you are hosting Anubis on a non-standard port (`https://example:com:8443`, `http://www.example.net:8080`, etc.), you must also include the port number here. |
|
||||
| `REDIRECT_DOMAINS` | unset | Comma-separated list of domain names that Anubis should allow redirects to when passing a challenge. See [Redirect Domain Configuration](./configuration/redirect-domains) for more details. |
|
||||
| `SERVE_ROBOTS_TXT` | `false` | If set `true`, Anubis will serve a default `robots.txt` file that disallows all known AI scrapers by name and then additionally disallows every scraper. This is useful if facts and circumstances make it difficult to change the underlying service to serve such a `robots.txt` file. |
|
||||
| `SLOG_LEVEL` | `INFO` | The log level for structured logging. Valid values are `DEBUG`, `INFO`, `WARN`, and `ERROR`. Set to `DEBUG` to see all requests, evaluations, and detailed diagnostic information. |
|
||||
| `SOCKET_MODE` | `0770` | _Only used when at least one of the `*_BIND_NETWORK` variables are set to `unix`._ The socket mode (permissions) for Unix domain sockets. |
|
||||
|
||||
@@ -194,6 +194,7 @@ func (u *userAgentRoundTripper) RoundTrip(req *http.Request) (*http.Response, er
|
||||
// Only set if not already present
|
||||
req = req.Clone(req.Context()) // avoid mutating original request
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0")
|
||||
req.Header.Set("Accept-Encoding", "gzip")
|
||||
return u.rt.RoundTrip(req)
|
||||
}
|
||||
|
||||
|
||||
82
test/default-config-macro/compare_bots.py
Normal file
82
test/default-config-macro/compare_bots.py
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to verify that the 'bots' field in data/botPolicies.yaml
|
||||
has the same semantic contents as data/meta/default-config.yaml.
|
||||
|
||||
CW: generated by AI
|
||||
"""
|
||||
|
||||
import yaml
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import difflib
|
||||
|
||||
def load_yaml(file_path):
|
||||
"""Load YAML file and return the data."""
|
||||
try:
|
||||
with open(file_path, 'r') as f:
|
||||
return yaml.safe_load(f)
|
||||
except Exception as e:
|
||||
print(f"Error loading {file_path}: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
def normalize_yaml(data):
|
||||
"""Normalize YAML data by removing comments and standardizing structure."""
|
||||
# For lists, just return as is, since YAML comments are stripped by safe_load
|
||||
return data
|
||||
|
||||
def get_repo_root():
|
||||
"""Get the root directory of the git repository."""
|
||||
try:
|
||||
result = subprocess.run(['git', 'rev-parse', '--show-toplevel'], capture_output=True, text=True, check=True)
|
||||
return result.stdout.strip()
|
||||
except subprocess.CalledProcessError:
|
||||
print("Error: Not in a git repository")
|
||||
sys.exit(1)
|
||||
|
||||
def main():
|
||||
# Get the git repository root
|
||||
repo_root = get_repo_root()
|
||||
|
||||
# Paths relative to the repo root
|
||||
bot_policies_path = os.path.join(repo_root, 'data', 'botPolicies.yaml')
|
||||
default_config_path = os.path.join(repo_root, 'data', 'meta', 'default-config.yaml')
|
||||
|
||||
# Load the files
|
||||
bot_policies = load_yaml(bot_policies_path)
|
||||
default_config = load_yaml(default_config_path)
|
||||
|
||||
# Extract the 'bots' field from botPolicies.yaml
|
||||
if 'bots' not in bot_policies:
|
||||
print("Error: 'bots' field not found in botPolicies.yaml")
|
||||
sys.exit(1)
|
||||
bots_field = bot_policies['bots']
|
||||
|
||||
# The default-config.yaml is a list directly
|
||||
default_bots = default_config
|
||||
|
||||
# Normalize both
|
||||
normalized_bots = normalize_yaml(bots_field)
|
||||
normalized_default = normalize_yaml(default_bots)
|
||||
|
||||
# Compare
|
||||
if normalized_bots == normalized_default:
|
||||
print("SUCCESS: The 'bots' field in botPolicies.yaml matches the contents of default-config.yaml")
|
||||
sys.exit(0)
|
||||
else:
|
||||
print("FAILURE: The 'bots' field in botPolicies.yaml does not match the contents of default-config.yaml")
|
||||
print("\nDiff:")
|
||||
bots_yaml = yaml.dump(normalized_bots, default_flow_style=False)
|
||||
default_yaml = yaml.dump(normalized_default, default_flow_style=False)
|
||||
diff = difflib.unified_diff(
|
||||
bots_yaml.splitlines(keepends=True),
|
||||
default_yaml.splitlines(keepends=True),
|
||||
fromfile='bots field in botPolicies.yaml',
|
||||
tofile='default-config.yaml'
|
||||
)
|
||||
print(''.join(diff))
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
7
test/default-config-macro/test.sh
Executable file
7
test/default-config-macro/test.sh
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
cd "$(dirname "$0")"
|
||||
python3 -c 'import yaml'
|
||||
python3 ./compare_bots.py
|
||||
@@ -18,7 +18,12 @@ export default function process(
|
||||
): Promise<string> {
|
||||
console.debug("fast algo");
|
||||
|
||||
let workerMethod = window.crypto !== undefined ? "webcrypto" : "purejs";
|
||||
// Choose worker based on secure context.
|
||||
// Use the WebCrypto worker if the page is a secure context; otherwise fall back to pure‑JS.
|
||||
let workerMethod: "webcrypto" | "purejs" = "purejs";
|
||||
if (window.isSecureContext) {
|
||||
workerMethod = "webcrypto";
|
||||
}
|
||||
|
||||
if (navigator.userAgent.includes("Firefox") || navigator.userAgent.includes("Goanna")) {
|
||||
console.log("Firefox detected, using pure-JS fallback");
|
||||
@@ -82,4 +87,4 @@ export default function process(
|
||||
workers.push(worker);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user