Compare commits

...

6 Commits

Author SHA1 Message Date
Xe Iaso
a5b82bd37b fix(lib): de-flake package lib tests
Signed-off-by: Xe Iaso <me@xeiaso.net>
2025-10-13 15:44:32 +00:00
Xe Iaso
ffbbdce3da feat: default config macro (#1186)
* feat(data): add default-config macro

Closes #1152

Signed-off-by: Xe Iaso <me@xeiaso.net>

* docs: update CHANGELOG

Signed-off-by: Xe Iaso <me@xeiaso.net>

* test: add default-config-macro smoke test

This uses an AI generated python script to diff the contents of the bots
field of the default configuration file and the
data/meta/default-config.yaml file. It emits a patch showing what needs
to be changed.

Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>
2025-10-13 11:33:16 -04:00
Xe Iaso
c09c86778d fix(default-config): remove preact challenge (#1184)
* fix(default-config): remove the preact challenge from the default config

Signed-off-by: Xe Iaso <me@xeiaso.net>

* docs: update CHANGELOG

Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>
2025-10-11 09:22:07 -04:00
Xe Iaso
9c47c180d0 fix(default-config): make the default config far less paranoid (#1179)
* test: add httpdebug tool

Signed-off-by: Xe Iaso <me@xeiaso.net>

* fix(data/clients/git): more strictly match the git client

Signed-off-by: Xe Iaso <me@xeiaso.net>

* fix(default-config): make the default config far less paranoid

This uses a variety of heuristics to make sure that clients that claim
to be browsers are more likely to behave like browsers. Most of these
are based on the results of a lot of reverse engineering and data
collection from honeypot servers.

Signed-off-by: Xe Iaso <me@xeiaso.net>

* docs: update CHANGELOG

Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Xe Iaso <xe.iaso@techaro.lol>
2025-10-11 08:48:12 -04:00
Xe Iaso
d51d32726c fix(lib): serve CSS properly (#1158)
Signed-off-by: Xe Iaso <me@xeiaso.net>
2025-09-27 22:16:23 -04:00
dependabot[bot]
ff33982ee9 build(deps): bump github.com/docker/docker (#1131)
Bumps [github.com/docker/docker](https://github.com/docker/docker) from 28.3.2+incompatible to 28.3.3+incompatible.
- [Release notes](https://github.com/docker/docker/releases)
- [Commits](https://github.com/docker/docker/compare/v28.3.2...v28.3.3)

---
updated-dependencies:
- dependency-name: github.com/docker/docker
  dependency-version: 28.3.3+incompatible
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-27 14:29:07 -04:00
12 changed files with 296 additions and 26 deletions

View File

@@ -14,6 +14,7 @@ jobs:
strategy:
matrix:
test:
- default-config-macro
- double_slash
- forced-language
- git-clone

View File

@@ -11,6 +11,9 @@
## /usr/share/docs/anubis/data or in the tarball you extracted Anubis from.
bots:
# You can import the entire default config with this macro:
# - import: (data)/meta/default-config.yaml
# Pathological bots to deny
- # This correlates to data/bots/_deny-pathological.yaml in the source tree
# https://github.com/TecharoHQ/anubis/blob/main/data/bots/_deny-pathological.yaml
@@ -93,6 +96,44 @@ bots:
# weight:
# adjust: -10
# Assert behaviour that only genuine browsers display. This ensures that Chrome
# or Firefox versions
- name: realistic-browser-catchall
expression:
all:
- '"User-Agent" in headers'
- '( userAgent.contains("Firefox") ) || ( userAgent.contains("Chrome") ) || ( userAgent.contains("Safari") )'
- '"Accept" in headers'
- '"Sec-Fetch-Dest" in headers'
- '"Sec-Fetch-Mode" in headers'
- '"Sec-Fetch-Site" in headers'
- '"Upgrade-Insecure-Requests" in headers'
- '"Accept-Encoding" in headers'
- '( headers["Accept-Encoding"].contains("zstd") || headers["Accept-Encoding"].contains("br") )'
- '"Accept-Language" in headers'
action: WEIGH
weight:
adjust: -10
# Chrome should behave like Chrome
- name: chrome-is-proper
expression:
all:
- userAgent.contains("Chrome")
- '"Sec-Ch-Ua" in headers'
- 'headers["Sec-Ch-Ua"].contains("Chromium")'
- '"Sec-Ch-Ua-Mobile" in headers'
- '"Sec-Ch-Ua-Platform" in headers'
action: WEIGH
weight:
adjust: -5
- name: should-have-accept
expression: '!("Accept" in headers)'
action: WEIGH
weight:
adjust: 5
# Generic catchall rule
- name: generic-browser
user_agent_regex: >-
@@ -212,14 +253,10 @@ thresholds:
- weight < 20
action: CHALLENGE
challenge:
# https://anubis.techaro.lol/docs/admin/configuration/challenges/preact
#
# This challenge proves the client can run a webapp written with Preact.
# The preact webapp simply loads, calculates the SHA-256 checksum of the
# challenge data, and forwards that to the client.
algorithm: preact
difficulty: 1
report_as: 1
# https://anubis.techaro.lol/docs/admin/configuration/challenges/proof-of-work
algorithm: fast
difficulty: 2 # two leading zeros, very fast for most clients
report_as: 2
- name: mild-proof-of-work
expression:
all:
@@ -229,8 +266,8 @@ thresholds:
challenge:
# https://anubis.techaro.lol/docs/admin/configuration/challenges/proof-of-work
algorithm: fast
difficulty: 2 # two leading zeros, very fast for most clients
report_as: 2
difficulty: 4
report_as: 4
# For clients that are browser like and have gained many points from custom rules
- name: extreme-suspicion
expression: weight >= 30
@@ -238,5 +275,5 @@ thresholds:
challenge:
# https://anubis.techaro.lol/docs/admin/configuration/challenges/proof-of-work
algorithm: fast
difficulty: 4
report_as: 4
difficulty: 6
report_as: 6

View File

@@ -2,13 +2,19 @@
action: ALLOW
expression:
all:
- >
(
userAgent.startsWith("git/") ||
userAgent.contains("libgit") ||
userAgent.startsWith("go-git") ||
userAgent.startsWith("JGit/") ||
userAgent.startsWith("JGit-")
)
- '"Git-Protocol" in headers'
- headers["Git-Protocol"] == "version=2"
- >
(
userAgent.startsWith("git/") ||
userAgent.contains("libgit") ||
userAgent.startsWith("go-git") ||
userAgent.startsWith("JGit/") ||
userAgent.startsWith("JGit-")
)
- '"Accept" in headers'
- headers["Accept"] == "*/*"
- '"Cache-Control" in headers'
- headers["Cache-Control"] == "no-cache"
- '"Pragma" in headers'
- headers["Pragma"] == "no-cache"
- '"Accept-Encoding" in headers'
- headers["Accept-Encoding"].contains("gzip")

View File

@@ -0,0 +1,127 @@
- # Pathological bots to deny
# This correlates to data/bots/_deny-pathological.yaml in the source tree
# https://github.com/TecharoHQ/anubis/blob/main/data/bots/_deny-pathological.yaml
import: (data)/bots/_deny-pathological.yaml
- import: (data)/bots/aggressive-brazilian-scrapers.yaml
# Aggressively block AI/LLM related bots/agents by default
- import: (data)/meta/ai-block-aggressive.yaml
# Consider replacing the aggressive AI policy with more selective policies:
# - import: (data)/meta/ai-block-moderate.yaml
# - import: (data)/meta/ai-block-permissive.yaml
# Search engine crawlers to allow, defaults to:
# - Google (so they don't try to bypass Anubis)
# - Apple
# - Bing
# - DuckDuckGo
# - Qwant
# - The Internet Archive
# - Kagi
# - Marginalia
# - Mojeek
- import: (data)/crawlers/_allow-good.yaml
# Challenge Firefox AI previews
- import: (data)/clients/x-firefox-ai.yaml
# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
- import: (data)/common/keep-internet-working.yaml
# # Punish any bot with "bot" in the user-agent string
# # This is known to have a high false-positive rate, use at your own risk
# - name: generic-bot-catchall
# user_agent_regex: (?i:bot|crawler)
# action: CHALLENGE
# challenge:
# difficulty: 16 # impossible
# report_as: 4 # lie to the operator
# algorithm: slow # intentionally waste CPU cycles and time
# Requires a subscription to Thoth to use, see
# https://anubis.techaro.lol/docs/admin/thoth#geoip-based-filtering
- name: countries-with-aggressive-scrapers
action: WEIGH
geoip:
countries:
- BR
- CN
weight:
adjust: 10
# Requires a subscription to Thoth to use, see
# https://anubis.techaro.lol/docs/admin/thoth#asn-based-filtering
- name: aggressive-asns-without-functional-abuse-contact
action: WEIGH
asns:
match:
- 13335 # Cloudflare
- 136907 # Huawei Cloud
- 45102 # Alibaba Cloud
weight:
adjust: 10
# ## System load based checks.
# # If the system is under high load, add weight.
# - name: high-load-average
# action: WEIGH
# expression: load_1m >= 10.0 # make sure to end the load comparison in a .0
# weight:
# adjust: 20
## If your backend service is running on the same operating system as Anubis,
## you can uncomment this rule to make the challenge easier when the system is
## under low load.
##
## If it is not, remove weight.
# - name: low-load-average
# action: WEIGH
# expression: load_15m <= 4.0 # make sure to end the load comparison in a .0
# weight:
# adjust: -10
# Assert behaviour that only genuine browsers display. This ensures that Chrome
# or Firefox versions
- name: realistic-browser-catchall
expression:
all:
- '"User-Agent" in headers'
- '( userAgent.contains("Firefox") ) || ( userAgent.contains("Chrome") ) || ( userAgent.contains("Safari") )'
- '"Accept" in headers'
- '"Sec-Fetch-Dest" in headers'
- '"Sec-Fetch-Mode" in headers'
- '"Sec-Fetch-Site" in headers'
- '"Upgrade-Insecure-Requests" in headers'
- '"Accept-Encoding" in headers'
- '( headers["Accept-Encoding"].contains("zstd") || headers["Accept-Encoding"].contains("br") )'
- '"Accept-Language" in headers'
action: WEIGH
weight:
adjust: -10
# Chrome should behave like Chrome
- name: chrome-is-proper
expression:
all:
- userAgent.contains("Chrome")
- '"Sec-Ch-Ua" in headers'
- 'headers["Sec-Ch-Ua"].contains("Chromium")'
- '"Sec-Ch-Ua-Mobile" in headers'
- '"Sec-Ch-Ua-Platform" in headers'
action: WEIGH
weight:
adjust: -5
- name: should-have-accept
expression: '!("Accept" in headers)'
action: WEIGH
weight:
adjust: 5
# Generic catchall rule
- name: generic-browser
user_agent_regex: >-
Mozilla|Opera
action: WEIGH
weight:
adjust: 10

View File

@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
<!-- This changes the project to: -->
- Added `(data)/meta/default-config.yaml` for importing the entire default configuration at once.
- Add `-custom-real-ip-header` flag to get the original request IP from a different header than `x-real-ip`.
- Add `contentLength` variable to bot expressions.
- Add `COOKIE_SAME_SITE_MODE` to force anubis cookies SameSite value, and downgrade automatically from `None` to `Lax` if cookie is insecure.
@@ -29,8 +30,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixes concurrency problems with very old browsers ([#1082](https://github.com/TecharoHQ/anubis/issues/1082)).
- Randomly use the Refresh header instead of the meta refresh tag in the metarefresh challenge.
- Update OpenRC service to truncate the runtime directory before starting Anubis.
- Make the git client profile more strictly match how the git client behaves.
- Make the default configuration reward users using normal browsers.
- Allow multiple consecutive slashes in a row in application paths ([#754](https://github.com/TecharoHQ/anubis/issues/754)).
- Add option to set `targetSNI` to special keyword 'auto' to indicate that it should be automatically set to the request Host name ([424](https://github.com/TecharoHQ/anubis/issues/424)).
- The Preact challenge has been removed from the default configuration. It will be deprecated in the future.
### Bug Fixes

2
go.mod
View File

@@ -87,7 +87,7 @@ require (
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
github.com/distribution/reference v0.6.0 // indirect
github.com/dlclark/regexp2 v1.11.5 // indirect
github.com/docker/docker v28.3.2+incompatible // indirect
github.com/docker/docker v28.3.3+incompatible // indirect
github.com/docker/go-connections v0.5.0 // indirect
github.com/docker/go-units v0.5.0 // indirect
github.com/dop251/goja v0.0.0-20250630131328-58d95d85e994 // indirect

4
go.sum
View File

@@ -141,8 +141,8 @@ github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5Qvfr
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZQ=
github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
github.com/docker/docker v28.3.2+incompatible h1:wn66NJ6pWB1vBZIilP8G3qQPqHy5XymfYn5vsqeA5oA=
github.com/docker/docker v28.3.2+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
github.com/docker/docker v28.3.3+incompatible h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI=
github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c=
github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc=
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=

View File

@@ -194,6 +194,7 @@ func (u *userAgentRoundTripper) RoundTrip(req *http.Request) (*http.Response, er
// Only set if not already present
req = req.Clone(req.Context()) // avoid mutating original request
req.Header.Set("User-Agent", "Mozilla/5.0")
req.Header.Set("Accept-Encoding", "gzip")
return u.rt.RoundTrip(req)
}

View File

@@ -17,6 +17,7 @@ import (
"github.com/TecharoHQ/anubis/lib/localization"
"github.com/TecharoHQ/anubis/lib/policy"
"github.com/TecharoHQ/anubis/web"
"github.com/TecharoHQ/anubis/xess"
"github.com/a-h/templ"
"github.com/golang-jwt/jwt/v5"
"golang.org/x/net/publicsuffix"
@@ -282,6 +283,9 @@ func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
if strings.HasPrefix(r.URL.Path, anubis.BasePrefix+anubis.StaticPath) {
s.mux.ServeHTTP(w, r)
return
} else if strings.HasPrefix(r.URL.Path, anubis.BasePrefix+xess.BasePrefix) {
s.mux.ServeHTTP(w, r)
return
}
s.maybeReverseProxyOrPage(w, r)

View File

@@ -0,0 +1,82 @@
#!/usr/bin/env python3
"""
Script to verify that the 'bots' field in data/botPolicies.yaml
has the same semantic contents as data/meta/default-config.yaml.
CW: generated by AI
"""
import yaml
import sys
import os
import subprocess
import difflib
def load_yaml(file_path):
"""Load YAML file and return the data."""
try:
with open(file_path, 'r') as f:
return yaml.safe_load(f)
except Exception as e:
print(f"Error loading {file_path}: {e}")
sys.exit(1)
def normalize_yaml(data):
"""Normalize YAML data by removing comments and standardizing structure."""
# For lists, just return as is, since YAML comments are stripped by safe_load
return data
def get_repo_root():
"""Get the root directory of the git repository."""
try:
result = subprocess.run(['git', 'rev-parse', '--show-toplevel'], capture_output=True, text=True, check=True)
return result.stdout.strip()
except subprocess.CalledProcessError:
print("Error: Not in a git repository")
sys.exit(1)
def main():
# Get the git repository root
repo_root = get_repo_root()
# Paths relative to the repo root
bot_policies_path = os.path.join(repo_root, 'data', 'botPolicies.yaml')
default_config_path = os.path.join(repo_root, 'data', 'meta', 'default-config.yaml')
# Load the files
bot_policies = load_yaml(bot_policies_path)
default_config = load_yaml(default_config_path)
# Extract the 'bots' field from botPolicies.yaml
if 'bots' not in bot_policies:
print("Error: 'bots' field not found in botPolicies.yaml")
sys.exit(1)
bots_field = bot_policies['bots']
# The default-config.yaml is a list directly
default_bots = default_config
# Normalize both
normalized_bots = normalize_yaml(bots_field)
normalized_default = normalize_yaml(default_bots)
# Compare
if normalized_bots == normalized_default:
print("SUCCESS: The 'bots' field in botPolicies.yaml matches the contents of default-config.yaml")
sys.exit(0)
else:
print("FAILURE: The 'bots' field in botPolicies.yaml does not match the contents of default-config.yaml")
print("\nDiff:")
bots_yaml = yaml.dump(normalized_bots, default_flow_style=False)
default_yaml = yaml.dump(normalized_default, default_flow_style=False)
diff = difflib.unified_diff(
bots_yaml.splitlines(keepends=True),
default_yaml.splitlines(keepends=True),
fromfile='bots field in botPolicies.yaml',
tofile='default-config.yaml'
)
print(''.join(diff))
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")"
python3 -c 'import yaml'
python3 ./compare_bots.py

View File

@@ -16,7 +16,8 @@ var (
//go:embed *.css static
Static embed.FS
URL = "/.within.website/x/xess/xess.css"
BasePrefix = "/.within.website/x/xess/"
URL = "/.within.website/x/xess/xess.css"
)
func init() {