Mirror of https://github.com/TecharoHQ/anubis.git, synced 2026-04-07 01:08:19 +00:00.

Compare commits: json/docs...fix/multib (8 commits)
| Author | SHA1 | Date |
|---|---|---|
| | de6b4de967 | |
| | ecc0704b77 | |
| | afb1e8b79a | |
| | f0199d014f | |
| | 2e7f37215c | |
| | 61db9a618d | |
| | e51b4bd965 | |
| | 291ed2a084 | |
```diff
@@ -29,7 +29,7 @@ var (
 )

 type RobotsRule struct {
-	UserAgent  string
+	UserAgents []string
 	Disallows  []string
 	Allows     []string
 	CrawlDelay int
```
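This one-line struct change carries the whole fix: a rule now records every user agent that shares a directive block, rather than only the last `User-agent:` line parsed before the directives.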
```diff
@@ -130,10 +130,26 @@ func main() {
 	}
 }

+func createRuleFromAccumulated(userAgents, disallows, allows []string, crawlDelay int) RobotsRule {
+	rule := RobotsRule{
+		UserAgents: make([]string, len(userAgents)),
+		Disallows:  make([]string, len(disallows)),
+		Allows:     make([]string, len(allows)),
+		CrawlDelay: crawlDelay,
+	}
+	copy(rule.UserAgents, userAgents)
+	copy(rule.Disallows, disallows)
+	copy(rule.Allows, allows)
+	return rule
+}
+
 func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
 	scanner := bufio.NewScanner(input)
 	var rules []RobotsRule
-	var currentRule *RobotsRule
+	var currentUserAgents []string
+	var currentDisallows []string
+	var currentAllows []string
+	var currentCrawlDelay int

 	for scanner.Scan() {
 		line := strings.TrimSpace(scanner.Text())
```
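The `make`-plus-`copy` dance in `createRuleFromAccumulated` is defensive: the stored rule gets its own backing arrays, so nothing that later happens to the accumulator slices can reach into an already-flushed rule. (The parser resets with `= nil`, which would also be safe, so the copies are belt and braces.) A standalone sketch of the aliasing hazard this guards against, not code from this repo:

```go
package main

import "fmt"

func main() {
	acc := []string{"BadBot", "SpamBot"}

	aliased := acc               // shares acc's backing array
	acc = acc[:0]                // a "reset" that reuses the array in place
	acc = append(acc, "GoodBot") // overwrites the shared storage

	fmt.Println(aliased[0]) // "GoodBot": the stored slice changed underneath us

	// With make+copy the stored slice is independent of the accumulator:
	safe := make([]string, 2)
	copy(safe, []string{"BadBot", "SpamBot"})
	_ = safe // unaffected by any later reuse of acc
}
```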
```diff
@@ -154,38 +170,42 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {

 		switch directive {
 		case "user-agent":
-			// Start a new rule section
-			if currentRule != nil {
-				rules = append(rules, *currentRule)
-			}
-			currentRule = &RobotsRule{
-				UserAgent: value,
-				Disallows: make([]string, 0),
-				Allows:    make([]string, 0),
+			// If we have accumulated rules with directives and encounter a new user-agent,
+			// flush the current rules
+			if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
+				rule := createRuleFromAccumulated(currentUserAgents, currentDisallows, currentAllows, currentCrawlDelay)
+				rules = append(rules, rule)
+				// Reset for next group
+				currentUserAgents = nil
+				currentDisallows = nil
+				currentAllows = nil
+				currentCrawlDelay = 0
 			}
+			currentUserAgents = append(currentUserAgents, value)

 		case "disallow":
-			if currentRule != nil && value != "" {
-				currentRule.Disallows = append(currentRule.Disallows, value)
+			if len(currentUserAgents) > 0 && value != "" {
+				currentDisallows = append(currentDisallows, value)
 			}

 		case "allow":
-			if currentRule != nil && value != "" {
-				currentRule.Allows = append(currentRule.Allows, value)
+			if len(currentUserAgents) > 0 && value != "" {
+				currentAllows = append(currentAllows, value)
 			}

 		case "crawl-delay":
-			if currentRule != nil {
+			if len(currentUserAgents) > 0 {
 				if delay, err := parseIntSafe(value); err == nil {
-					currentRule.CrawlDelay = delay
+					currentCrawlDelay = delay
 				}
 			}
 		}
 	}

-	// Don't forget the last rule
-	if currentRule != nil {
-		rules = append(rules, *currentRule)
+	// Don't forget the last group of rules
+	if len(currentUserAgents) > 0 {
+		rule := createRuleFromAccumulated(currentUserAgents, currentDisallows, currentAllows, currentCrawlDelay)
+		rules = append(rules, rule)
 	}

 	// Mark blacklisted user agents (those with "Disallow: /")
```
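The accumulate-and-flush pattern above is the core of the fix: consecutive `User-agent:` lines pile up in `currentUserAgents`, and the group is flushed only when a new `User-agent:` arrives after directives have been seen, or at end of input. A trimmed-down, self-contained sketch of the same control flow (simplified types and flush condition, not the tool's actual code):

```go
package main

import (
	"bufio"
	"fmt"
	"strings"
)

type group struct {
	agents    []string
	disallows []string
}

func main() {
	input := `User-agent: BadBot
User-agent: SpamBot
Disallow: /
User-agent: GoodBot
Disallow: /private`

	var groups []group
	var agents, disallows []string

	sc := bufio.NewScanner(strings.NewReader(input))
	for sc.Scan() {
		k, v, _ := strings.Cut(sc.Text(), ":")
		v = strings.TrimSpace(v)
		switch strings.ToLower(k) {
		case "user-agent":
			// A new user-agent after directives closes the current group.
			if len(agents) > 0 && len(disallows) > 0 {
				groups = append(groups, group{agents, disallows})
				agents, disallows = nil, nil
			}
			agents = append(agents, v)
		case "disallow":
			disallows = append(disallows, v)
		}
	}
	if len(agents) > 0 { // flush the final group at EOF
		groups = append(groups, group{agents, disallows})
	}
	fmt.Printf("%+v\n", groups)
	// [{agents:[BadBot SpamBot] disallows:[/]} {agents:[GoodBot] disallows:[/private]}]
}
```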
```diff
@@ -211,10 +231,11 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 	var anubisRules []AnubisRule
 	ruleCounter := 0

 	// Process each robots rule individually
 	for _, robotsRule := range robotsRules {
-		userAgent := robotsRule.UserAgent
+		userAgents := robotsRule.UserAgents

-		// Handle crawl delay as weight adjustment (do this first before any continues)
+		// Handle crawl delay
 		if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
 			ruleCounter++
 			rule := AnubisRule{
```
```diff
@@ -223,20 +244,32 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 				Weight: &config.Weight{Adjust: *crawlDelay},
 			}

-			if userAgent == "*" {
+			if len(userAgents) == 1 && userAgents[0] == "*" {
 				rule.Expression = &config.ExpressionOrList{
 					All: []string{"true"}, // Always applies
 				}
-			} else {
+			} else if len(userAgents) == 1 {
 				rule.Expression = &config.ExpressionOrList{
-					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
 				}
+			} else {
+				// Multiple user agents - use any block
+				var expressions []string
+				for _, ua := range userAgents {
+					if ua == "*" {
+						expressions = append(expressions, "true")
+					} else {
+						expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+					}
+				}
+				rule.Expression = &config.ExpressionOrList{
+					Any: expressions,
+				}
 			}

 			anubisRules = append(anubisRules, rule)
 		}

-		// Handle blacklisted user agents (complete deny/challenge)
+		// Handle blacklisted user agents
 		if robotsRule.IsBlacklist {
 			ruleCounter++
 			rule := AnubisRule{
```
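The new `else` branch is where a grouped rule becomes one policy entry: each agent turns into a `userAgent.contains(...)` CEL clause, and the clauses are OR-ed together under `any`. A minimal sketch of just that step, using a stand-in for the real `config.ExpressionOrList` type:

```go
package main

import "fmt"

// Stand-in for config.ExpressionOrList: a rule matches when
// all clauses hold (AND) or any clause holds (OR).
type ExpressionOrList struct {
	All []string
	Any []string
}

func uaExpression(userAgents []string) ExpressionOrList {
	if len(userAgents) == 1 && userAgents[0] == "*" {
		return ExpressionOrList{All: []string{"true"}} // wildcard: always applies
	}
	if len(userAgents) == 1 {
		return ExpressionOrList{All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])}}
	}
	var exprs []string // multiple user agents: OR them together
	for _, ua := range userAgents {
		if ua == "*" {
			exprs = append(exprs, "true")
			continue
		}
		exprs = append(exprs, fmt.Sprintf("userAgent.contains(%q)", ua))
	}
	return ExpressionOrList{Any: exprs}
}

func main() {
	fmt.Printf("%+v\n", uaExpression([]string{"BadBot", "SpamBot", "EvilBot"}))
	// {All:[] Any:[userAgent.contains("BadBot") userAgent.contains("SpamBot") userAgent.contains("EvilBot")]}
}
```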
```diff
@@ -244,21 +277,36 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 				Action: *userAgentDeny,
 			}

-			if userAgent == "*" {
-				// This would block everything - convert to a weight adjustment instead
-				rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
-				rule.Action = "WEIGH"
-				rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
-				rule.Expression = &config.ExpressionOrList{
-					All: []string{"true"}, // Always applies
+			if len(userAgents) == 1 {
+				userAgent := userAgents[0]
+				if userAgent == "*" {
+					// This would block everything - convert to a weight adjustment instead
+					rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
+					rule.Action = "WEIGH"
+					rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
+					rule.Expression = &config.ExpressionOrList{
+						All: []string{"true"}, // Always applies
+					}
+				} else {
+					rule.Expression = &config.ExpressionOrList{
+						All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+					}
 				}
 			} else {
+				// Multiple user agents - use any block
+				var expressions []string
+				for _, ua := range userAgents {
+					if ua == "*" {
+						expressions = append(expressions, "true")
+					} else {
+						expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+					}
+				}
 				rule.Expression = &config.ExpressionOrList{
-					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+					Any: expressions,
 				}
 			}
 			anubisRules = append(anubisRules, rule)
 			continue
 		}

 		// Handle specific disallow rules
```
```diff
@@ -276,9 +324,33 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 			// Build CEL expression
 			var conditions []string

-			// Add user agent condition if not wildcard
-			if userAgent != "*" {
-				conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgent))
+			// Add user agent conditions
+			if len(userAgents) == 1 && userAgents[0] == "*" {
+				// Wildcard user agent - no user agent condition needed
+			} else if len(userAgents) == 1 {
+				conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
+			} else {
+				// For multiple user agents, we need to use a more complex expression
+				// This is a limitation - we can't easily combine any for user agents with all for path
+				// So we'll create separate rules for each user agent
+				for _, ua := range userAgents {
+					if ua == "*" {
+						continue // Skip wildcard as it's handled separately
+					}
+					ruleCounter++
+					subRule := AnubisRule{
+						Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+						Action: *baseAction,
+						Expression: &config.ExpressionOrList{
+							All: []string{
+								fmt.Sprintf("userAgent.contains(%q)", ua),
+								buildPathCondition(disallow),
+							},
+						},
+					}
+					anubisRules = append(anubisRules, subRule)
+				}
+				continue
 			}

 			// Add path condition
```
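The per-agent expansion is the flip side of that flat expression model: a single `ExpressionOrList` can OR user agents (`any`) or AND an agent with a path (`all`), but it cannot express `(UA1 or UA2) and path` in one rule. Each agent therefore gets its own `all:` rule pairing its `userAgent.contains(...)` clause with the path condition, which is exactly the shape the `consecutive.yaml` fixture below shows for SearchBot1 through SearchBot3.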
```diff
@@ -291,7 +363,6 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {

 			anubisRules = append(anubisRules, rule)
 		}
-
 	}

 	return anubisRules
```
```diff
@@ -78,6 +78,12 @@ func TestDataFileConversion(t *testing.T) {
 			expectedFile: "complex.yaml",
 			options:      TestOptions{format: "yaml", crawlDelayWeight: 5},
 		},
+		{
+			name:         "consecutive_user_agents",
+			robotsFile:   "consecutive.robots.txt",
+			expectedFile: "consecutive.yaml",
+			options:      TestOptions{format: "yaml", crawlDelayWeight: 3},
+		},
 	}

 	for _, tc := range testCases {
```
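The new case slots into the existing golden-file table test: each entry names a robots.txt fixture and the converted output it must match. A generic sketch of that pattern (the `convert` helper here is hypothetical, standing in for the tool's real conversion entry point):

```go
package main

import (
	"os"
	"path/filepath"
	"testing"
)

// convert is a hypothetical stand-in for robots2policy's conversion;
// the real test drives the actual tool with the case's options.
func convert(robotsTxt []byte, crawlDelayWeight int) []byte { return nil }

func TestGolden(t *testing.T) {
	cases := []struct{ name, robotsFile, expectedFile string }{
		{"consecutive_user_agents", "consecutive.robots.txt", "consecutive.yaml"},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			in, err := os.ReadFile(filepath.Join("testdata", tc.robotsFile))
			if err != nil {
				t.Fatal(err)
			}
			want, err := os.ReadFile(filepath.Join("testdata", tc.expectedFile))
			if err != nil {
				t.Fatal(err)
			}
			if got := convert(in, 3); string(got) != string(want) {
				t.Errorf("got:\n%s\nwant:\n%s", got, want)
			}
		})
	}
}
```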
cmd/robots2policy/testdata/blacklist.yaml (vendored, 6 lines changed)

```diff
@@ -25,6 +25,6 @@
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("Googlebot")
-    - path.startsWith("/search")
-  name: robots-txt-policy-disallow-7
+      - userAgent.contains("Googlebot")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-7
```
cmd/robots2policy/testdata/complex.yaml (vendored, 24 lines changed)

```diff
@@ -20,8 +20,8 @@
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("Googlebot")
-    - path.startsWith("/search/")
+      - userAgent.contains("Googlebot")
+      - path.startsWith("/search/")
   name: robots-txt-policy-disallow-6
 - action: WEIGH
   expression: userAgent.contains("Bingbot")
@@ -31,14 +31,14 @@
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("Bingbot")
-    - path.startsWith("/search/")
+      - userAgent.contains("Bingbot")
+      - path.startsWith("/search/")
   name: robots-txt-policy-disallow-8
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("Bingbot")
-    - path.startsWith("/admin/")
+      - userAgent.contains("Bingbot")
+      - path.startsWith("/admin/")
   name: robots-txt-policy-disallow-9
 - action: DENY
   expression: userAgent.contains("BadBot")
@@ -54,18 +54,18 @@
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("TestBot")
-    - path.matches("^/.*/admin")
+      - userAgent.contains("TestBot")
+      - path.matches("^/.*/admin")
   name: robots-txt-policy-disallow-13
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("TestBot")
-    - path.matches("^/temp.*\\.html")
+      - userAgent.contains("TestBot")
+      - path.matches("^/temp.*\\.html")
   name: robots-txt-policy-disallow-14
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("TestBot")
-    - path.matches("^/file.\\.log")
+      - userAgent.contains("TestBot")
+      - path.matches("^/file.\\.log")
   name: robots-txt-policy-disallow-15
```
cmd/robots2policy/testdata/consecutive.robots.txt (vendored, new file, 25 lines)

```diff
@@ -0,0 +1,25 @@
+# Test consecutive user agents that should be grouped into any: blocks
+User-agent: *
+Disallow: /admin
+Crawl-delay: 10
+
+# Multiple consecutive user agents - should be grouped
+User-agent: BadBot
+User-agent: SpamBot
+User-agent: EvilBot
+Disallow: /
+
+# Single user agent - should be separate
+User-agent: GoodBot
+Disallow: /private
+
+# Multiple consecutive user agents with crawl delay
+User-agent: SlowBot1
+User-agent: SlowBot2
+Crawl-delay: 5
+
+# Multiple consecutive user agents with specific path
+User-agent: SearchBot1
+User-agent: SearchBot2
+User-agent: SearchBot3
+Disallow: /search
```
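The fixture walks through each grouping shape the parser now distinguishes: a wildcard group with a crawl delay, a multi-agent blacklist, a single-agent group, a multi-agent group with only a crawl delay, and a multi-agent group with a path disallow. `consecutive.yaml` below is its expected conversion.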
cmd/robots2policy/testdata/consecutive.yaml (vendored, new file, 47 lines)

```diff
@@ -0,0 +1,47 @@
+- action: WEIGH
+  expression: "true"
+  name: robots-txt-policy-crawl-delay-1
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-2
+- action: DENY
+  expression:
+    any:
+      - userAgent.contains("BadBot")
+      - userAgent.contains("SpamBot")
+      - userAgent.contains("EvilBot")
+  name: robots-txt-policy-blacklist-3
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("GoodBot")
+      - path.startsWith("/private")
+  name: robots-txt-policy-disallow-4
+- action: WEIGH
+  expression:
+    any:
+      - userAgent.contains("SlowBot1")
+      - userAgent.contains("SlowBot2")
+  name: robots-txt-policy-crawl-delay-5
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot1")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-7
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot2")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-8
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot3")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-9
```
cmd/robots2policy/testdata/simple.json (vendored, 8 lines changed)

```diff
@@ -1,12 +1,12 @@
 [
   {
-    "action": "CHALLENGE",
     "expression": "path.startsWith(\"/admin/\")",
-    "name": "robots-txt-policy-disallow-1"
+    "name": "robots-txt-policy-disallow-1",
+    "action": "CHALLENGE"
   },
   {
-    "action": "CHALLENGE",
     "expression": "path.startsWith(\"/private\")",
-    "name": "robots-txt-policy-disallow-2"
+    "name": "robots-txt-policy-disallow-2",
+    "action": "CHALLENGE"
   }
 ]
```
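Note that the `simple.json` hunk only reorders keys (`action` now serializes last); the two rules themselves are semantically unchanged.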
```diff
@@ -11,7 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

-- Document `SLOG_LEVEL` environment variable in installation guide ([#1086](https://github.com/TecharoHQ/anubis/pull/1086))
+- Document missing environment variables in installation guide: `SLOG_LEVEL`, `COOKIE_PREFIX`, `FORCED_LANGUAGE`, and `TARGET_DISABLE_KEEPALIVE` ([#1086](https://github.com/TecharoHQ/anubis/pull/1086))
+- Fixed `robots2policy` to properly group consecutive user agents into `any:` instead of only processing the last one ([#925](https://github.com/TecharoHQ/anubis/pull/925))

 <!-- This changes the project to: -->
```
```diff
@@ -67,6 +67,7 @@ Anubis uses these environment variables for configuration:
 | `COOKIE_DYNAMIC_DOMAIN` | false | If set to true, automatically set cookie domain fields based on the hostname of the request. EG: if you are making a request to `anubis.techaro.lol`, the Anubis cookie will be valid for any subdomain of `techaro.lol`. |
 | `COOKIE_EXPIRATION_TIME` | `168h` | The amount of time the authorization cookie is valid for. |
 | `COOKIE_PARTITIONED` | `false` | If set to `true`, enables the [partitioned (CHIPS) flag](https://developers.google.com/privacy-sandbox/cookies/chips), meaning that Anubis inside an iframe has a different set of cookies than the domain hosting the iframe. |
+| `COOKIE_PREFIX` | `anubis-cookie` | The prefix used for browser cookies created by Anubis. Useful for customization or avoiding conflicts with other applications. |
 | `COOKIE_SECURE` | `true` | If set to `true`, enables the [Secure flag](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/Cookies#block_access_to_your_cookies), meaning that the cookies will only be transmitted over HTTPS. If Anubis is used in an insecure context (plain HTTP), this will need to be set to `false`. |
 | `DIFFICULTY` | `4` | The difficulty of the challenge, or the number of leading zeroes that must be in successful responses. |
 | `ED25519_PRIVATE_KEY_HEX` | unset | The hex-encoded ed25519 private key used to sign Anubis responses. If this is not set, Anubis will generate one for you. This should be exactly 64 characters long. When running multiple instances on the same base domain, the key must be the same across all instances. See below for details. |
```
```diff
@@ -101,10 +102,12 @@ If you don't know or understand what these settings mean, ignore them. These are

 | Environment Variable | Default value | Explanation |
 | :---------------------------- | :------------ | :------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `TARGET_SNI` | unset | If set, overrides the TLS handshake hostname in requests forwarded to `TARGET`. |
+| `FORCED_LANGUAGE` | unset | If set, forces Anubis to display challenge pages in the specified language instead of using the browser's Accept-Language header. Use ISO 639-1 language codes (e.g., `de` for German, `fr` for French). |
+| `HS512_SECRET` | unset | Secret string for JWT HS512 algorithm. If this is not set, Anubis will use ED25519 as defined via the variables above. The longer the better; 128 chars should suffice. |
+| `TARGET_DISABLE_KEEPALIVE` | `false` | If `true`, disables HTTP keep-alive for connections to the target backend. Useful for backends that don't handle keep-alive properly. |
 | `TARGET_HOST` | unset | If set, overrides the Host header in requests forwarded to `TARGET`. |
 | `TARGET_INSECURE_SKIP_VERIFY` | `false` | If `true`, skip TLS certificate validation for targets that listen over `https`. If your backend does not listen over `https`, ignore this setting. |
-| `HS512_SECRET` | unset | Secret string for JWT HS512 algorithm. If this is not set, Anubis will use ED25519 as defined via the variables above. The longer the better; 128 chars should suffice. |
+| `TARGET_SNI` | unset | If set, overrides the TLS handshake hostname in requests forwarded to `TARGET`. |

 </details>
```