Merge branch 'main' into json/requireED25519

Signed-off-by: Jason Cameron <git@jasoncameron.dev>
fix(robots2policy): handle multiple user agents under one block (#925)
2026-05-21 21:47:48 +00:00 · 2025-09-06 22:35:45 -04:00 · 2025-09-06 22:35:19 -04:00 · 2025-09-06 22:30:43 -04:00 · 2025-09-06 22:07:14 -04:00 · 2025-09-06 22:01:38 -04:00
25 changed files with 285 additions and 98 deletions
@@ -15,7 +15,7 @@ jobs:
    runs-on: ubuntu-24.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          fetch-tags: true
          fetch-depth: 0
@@ -25,7 +25,7 @@ jobs:
        uses: Homebrew/actions/setup-homebrew@main

      - name: Setup Homebrew cellar cache
-        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+        uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
        with:
          path: |
            /home/linuxbrew/.linuxbrew/Cellar
@@ -47,7 +47,7 @@ jobs:

      - name: Docker meta
        id: meta
-        uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0
+        uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5.8.0
        with:
          images: ghcr.io/${{ github.repository }}

@@ -21,7 +21,7 @@ jobs:
    runs-on: ubuntu-24.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          fetch-tags: true
          fetch-depth: 0
@@ -35,7 +35,7 @@ jobs:
        uses: Homebrew/actions/setup-homebrew@main

      - name: Setup Homebrew cellar cache
-        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+        uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
        with:
          path: |
            /home/linuxbrew/.linuxbrew/Cellar
@@ -56,7 +56,7 @@ jobs:
          brew bundle

      - name: Log into registry
-        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
+        uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
@@ -64,7 +64,7 @@ jobs:

      - name: Docker meta
        id: meta
-        uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0
+        uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5.8.0
        with:
          images: ${{ env.IMAGE }}

@@ -78,7 +78,7 @@ jobs:
          SLOG_LEVEL: debug

      - name: Generate artifact attestation
-        uses: actions/attest-build-provenance@e8998f949152b193b063cb0ec769d69d929409be # v2.4.0
+        uses: actions/attest-build-provenance@977bb373ede98d70efdf65b84cb5f73e068dcc2a # v3.0.0
        with:
          subject-name: ${{ env.IMAGE }}
          subject-digest: ${{ steps.build.outputs.digest }}
@@ -17,7 +17,7 @@ jobs:
    runs-on: ubuntu-24.04

    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          persist-credentials: false

@@ -25,7 +25,7 @@ jobs:
        uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1

      - name: Log into registry
-        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
+        uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
        with:
          registry: ghcr.io
          username: techarohq
@@ -33,7 +33,7 @@ jobs:

      - name: Docker meta
        id: meta
-        uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0
+        uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5.8.0
        with:
          images: ghcr.io/techarohq/anubis/docs
          tags: |
@@ -53,14 +53,14 @@ jobs:
          push: true

      - name: Apply k8s manifests to limsa lominsa
-        uses: actions-hub/kubectl@b5b19eeb6a0ffde16637e398f8b96ef01eb8fdb7 # v1.33.3
+        uses: actions-hub/kubectl@af345ed727f0268738e65be48422e463cc67c220 # v1.34.0
        env:
          KUBE_CONFIG: ${{ secrets.LIMSA_LOMINSA_KUBECONFIG }}
        with:
          args: apply -k docs/manifest

      - name: Apply k8s manifests to limsa lominsa
-        uses: actions-hub/kubectl@b5b19eeb6a0ffde16637e398f8b96ef01eb8fdb7 # v1.33.3
+        uses: actions-hub/kubectl@af345ed727f0268738e65be48422e463cc67c220 # v1.34.0
        env:
          KUBE_CONFIG: ${{ secrets.LIMSA_LOMINSA_KUBECONFIG }}
        with:
@@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-24.04

    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          persist-credentials: false

@@ -22,7 +22,7 @@ jobs:

      - name: Docker meta
        id: meta
-        uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0
+        uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5.8.0
        with:
          images: ghcr.io/techarohq/anubis/docs
          tags: |
@@ -15,7 +15,7 @@ jobs:
    #runs-on: alrest-techarohq
    runs-on: ubuntu-24.04
    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
      with:
        persist-credentials: false

@@ -28,7 +28,7 @@ jobs:
      uses: Homebrew/actions/setup-homebrew@main

    - name: Setup Homebrew cellar cache
-      uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+      uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
      with:
        path: |
          /home/linuxbrew/.linuxbrew/Cellar
@@ -49,7 +49,7 @@ jobs:
        brew bundle

    - name: Setup Golang caches
-      uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+      uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
      with:
        path: |
          ~/.cache/go-build
@@ -59,7 +59,7 @@ jobs:
          ${{ runner.os }}-golang-

    - name: Cache playwright binaries
-      uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+      uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
      id: playwright-cache
      with:
        path: |
@@ -14,7 +14,7 @@ jobs:
    #runs-on: alrest-techarohq
    runs-on: ubuntu-24.04
    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          persist-credentials: false
          fetch-tags: true
@@ -29,7 +29,7 @@ jobs:
        uses: Homebrew/actions/setup-homebrew@main

      - name: Setup Homebrew cellar cache
-        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+        uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
        with:
          path: |
            /home/linuxbrew/.linuxbrew/Cellar
@@ -50,7 +50,7 @@ jobs:
          brew bundle

      - name: Setup Golang caches
-        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+        uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
        with:
          path: |
            ~/.cache/go-build
@@ -15,7 +15,7 @@ jobs:
    #runs-on: alrest-techarohq
    runs-on: ubuntu-24.04
    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
      with:
        persist-credentials: false
        fetch-tags: true
@@ -30,7 +30,7 @@ jobs:
      uses: Homebrew/actions/setup-homebrew@main

    - name: Setup Homebrew cellar cache
-      uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+      uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
      with:
        path: |
          /home/linuxbrew/.linuxbrew/Cellar
@@ -51,7 +51,7 @@ jobs:
        brew bundle

    - name: Setup Golang caches
-      uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+      uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
      with:
        path: |
          ~/.cache/go-build
@@ -24,7 +24,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          persist-credentials: false

@@ -18,13 +18,13 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          fetch-tags: true
          fetch-depth: 0
          persist-credentials: false
      - name: Log into registry
-        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
+        uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
@@ -20,7 +20,7 @@ jobs:
          - ci@ppc64le.techaro.lol
    steps:
      - name: Checkout code
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          fetch-tags: true
          fetch-depth: 0
@@ -16,12 +16,12 @@ jobs:
      security-events: write
    steps:
      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          persist-credentials: false

      - name: Install the latest version of uv
-        uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6.4.3
+        uses: astral-sh/setup-uv@4959332f0f014c5280e7eac8b70c90cb574c9f9b # v6.6.0

      - name: Run zizmor 🌈
        run: uvx zizmor --format sarif . > results.sarif 
@@ -29,7 +29,7 @@ jobs:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 

      - name: Upload SARIF file
-        uses: github/codeql-action/upload-sarif@4e828ff8d448a8a6e532957b1811f387a63867e8 # v3.29.4
+        uses: github/codeql-action/upload-sarif@3c3833e0f8c1c83d449a7478aa59c036a9165498 # v3.29.11
        with:
          sarif_file: results.sarif
          category: zizmor
@@ -317,6 +317,16 @@ func main() {
 		log.Fatalf("can't parse policy file: %v", err)
 	}

+	// Warn if persistent storage is used without a configured signing key
+	if policy.Store.IsPersistent() {
+		if *hs512Secret == "" && *ed25519PrivateKeyHex == "" && *ed25519PrivateKeyHexFile == "" {
+			slog.Warn("[misconfiguration] persistent storage backend is configured, but no private key is set. " +
+				"Challenges will be invalidated when Anubis restarts. " +
+				"Set HS512_SECRET, ED25519_PRIVATE_KEY_HEX, or ED25519_PRIVATE_KEY_HEX_FILE to ensure challenges survive service restarts. " +
+				"See: https://anubis.techaro.lol/docs/admin/installation#key-generation")
+		}
+	}
+
 	ruleErrorIDs := make(map[string]string)
 	for _, rule := range policy.Bots {
 		if rule.Action != config.RuleDeny {
@@ -29,7 +29,7 @@ var (
 )

 type RobotsRule struct {
-	UserAgent   string
+	UserAgents  []string
 	Disallows   []string
 	Allows      []string
 	CrawlDelay  int
@@ -130,10 +130,26 @@ func main() {
 	}
 }

+func createRuleFromAccumulated(userAgents, disallows, allows []string, crawlDelay int) RobotsRule { // snapshots the accumulated group state into a standalone RobotsRule
+	rule := RobotsRule{
+		UserAgents: make([]string, len(userAgents)), // fresh slice sized to the input; a nil input yields an empty, non-nil slice
+		Disallows:  make([]string, len(disallows)),
+		Allows:     make([]string, len(allows)),
+		CrawlDelay: crawlDelay, // passed through unchanged
+	}
+	copy(rule.UserAgents, userAgents) // copy element-wise so the rule does not share backing arrays with the caller's slices
+	copy(rule.Disallows, disallows)
+	copy(rule.Allows, allows)
+	return rule // returned by value; the remaining RobotsRule fields keep their zero values
+}
+
 func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
 	scanner := bufio.NewScanner(input)
 	var rules []RobotsRule
-	var currentRule *RobotsRule
+	var currentUserAgents []string
+	var currentDisallows []string
+	var currentAllows []string
+	var currentCrawlDelay int

 	for scanner.Scan() {
 		line := strings.TrimSpace(scanner.Text())
@@ -154,38 +170,42 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {

 		switch directive {
 		case "user-agent":
-			// Start a new rule section
-			if currentRule != nil {
-				rules = append(rules, *currentRule)
-			}
-			currentRule = &RobotsRule{
-				UserAgent: value,
-				Disallows: make([]string, 0),
-				Allows:    make([]string, 0),
+			// If we have accumulated rules with directives and encounter a new user-agent,
+			// flush the current rules
+			if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
+				rule := createRuleFromAccumulated(currentUserAgents, currentDisallows, currentAllows, currentCrawlDelay)
+				rules = append(rules, rule)
+				// Reset for next group
+				currentUserAgents = nil
+				currentDisallows = nil
+				currentAllows = nil
+				currentCrawlDelay = 0
 			}
+			currentUserAgents = append(currentUserAgents, value)

 		case "disallow":
-			if currentRule != nil && value != "" {
-				currentRule.Disallows = append(currentRule.Disallows, value)
+			if len(currentUserAgents) > 0 && value != "" {
+				currentDisallows = append(currentDisallows, value)
 			}

 		case "allow":
-			if currentRule != nil && value != "" {
-				currentRule.Allows = append(currentRule.Allows, value)
+			if len(currentUserAgents) > 0 && value != "" {
+				currentAllows = append(currentAllows, value)
 			}

 		case "crawl-delay":
-			if currentRule != nil {
+			if len(currentUserAgents) > 0 {
 				if delay, err := parseIntSafe(value); err == nil {
-					currentRule.CrawlDelay = delay
+					currentCrawlDelay = delay
 				}
 			}
 		}
 	}

-	// Don't forget the last rule
-	if currentRule != nil {
-		rules = append(rules, *currentRule)
+	// Don't forget the last group of rules
+	if len(currentUserAgents) > 0 {
+		rule := createRuleFromAccumulated(currentUserAgents, currentDisallows, currentAllows, currentCrawlDelay)
+		rules = append(rules, rule)
 	}

 	// Mark blacklisted user agents (those with "Disallow: /")
@@ -211,10 +231,11 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 	var anubisRules []AnubisRule
 	ruleCounter := 0

+	// Process each robots rule individually
 	for _, robotsRule := range robotsRules {
-		userAgent := robotsRule.UserAgent
+		userAgents := robotsRule.UserAgents

-		// Handle crawl delay as weight adjustment (do this first before any continues)
+		// Handle crawl delay
 		if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
 			ruleCounter++
 			rule := AnubisRule{
@@ -223,20 +244,32 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 				Weight: &config.Weight{Adjust: *crawlDelay},
 			}

-			if userAgent == "*" {
+			if len(userAgents) == 1 && userAgents[0] == "*" {
 				rule.Expression = &config.ExpressionOrList{
 					All: []string{"true"}, // Always applies
 				}
-			} else {
+			} else if len(userAgents) == 1 {
 				rule.Expression = &config.ExpressionOrList{
-					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
+				}
+			} else {
+				// Multiple user agents - use any block
+				var expressions []string
+				for _, ua := range userAgents {
+					if ua == "*" {
+						expressions = append(expressions, "true")
+					} else {
+						expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+					}
+				}
+				rule.Expression = &config.ExpressionOrList{
+					Any: expressions,
 				}
 			}
-
 			anubisRules = append(anubisRules, rule)
 		}

-		// Handle blacklisted user agents (complete deny/challenge)
+		// Handle blacklisted user agents
 		if robotsRule.IsBlacklist {
 			ruleCounter++
 			rule := AnubisRule{
@@ -244,21 +277,36 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 				Action: *userAgentDeny,
 			}

-			if userAgent == "*" {
-				// This would block everything - convert to a weight adjustment instead
-				rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
-				rule.Action = "WEIGH"
-				rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
-				rule.Expression = &config.ExpressionOrList{
-					All: []string{"true"}, // Always applies
+			if len(userAgents) == 1 {
+				userAgent := userAgents[0]
+				if userAgent == "*" {
+					// This would block everything - convert to a weight adjustment instead
+					rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
+					rule.Action = "WEIGH"
+					rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
+					rule.Expression = &config.ExpressionOrList{
+						All: []string{"true"}, // Always applies
+					}
+				} else {
+					rule.Expression = &config.ExpressionOrList{
+						All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+					}
 				}
 			} else {
+				// Multiple user agents - use any block
+				var expressions []string
+				for _, ua := range userAgents {
+					if ua == "*" {
+						expressions = append(expressions, "true")
+					} else {
+						expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+					}
+				}
 				rule.Expression = &config.ExpressionOrList{
-					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+					Any: expressions,
 				}
 			}
 			anubisRules = append(anubisRules, rule)
-			continue
 		}

 		// Handle specific disallow rules
@@ -276,9 +324,33 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 			// Build CEL expression
 			var conditions []string

-			// Add user agent condition if not wildcard
-			if userAgent != "*" {
-				conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgent))
+			// Add user agent conditions
+			if len(userAgents) == 1 && userAgents[0] == "*" {
+				// Wildcard user agent - no user agent condition needed
+			} else if len(userAgents) == 1 {
+				conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
+			} else {
+				// For multiple user agents, we need to use a more complex expression
+				// This is a limitation - we can't easily combine any for user agents with all for path
+				// So we'll create separate rules for each user agent
+				for _, ua := range userAgents {
+					if ua == "*" {
+						continue // Skip wildcard as it's handled separately
+					}
+					ruleCounter++
+					subRule := AnubisRule{
+						Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+						Action: *baseAction,
+						Expression: &config.ExpressionOrList{
+							All: []string{
+								fmt.Sprintf("userAgent.contains(%q)", ua),
+								buildPathCondition(disallow),
+							},
+						},
+					}
+					anubisRules = append(anubisRules, subRule)
+				}
+				continue
 			}

 			// Add path condition
@@ -291,7 +363,6 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {

 			anubisRules = append(anubisRules, rule)
 		}
-
 	}

 	return anubisRules
@@ -78,6 +78,12 @@ func TestDataFileConversion(t *testing.T) {
 			expectedFile: "complex.yaml",
 			options:      TestOptions{format: "yaml", crawlDelayWeight: 5},
 		},
+		{
+			name:         "consecutive_user_agents",
+			robotsFile:   "consecutive.robots.txt",
+			expectedFile: "consecutive.yaml",
+			options:      TestOptions{format: "yaml", crawlDelayWeight: 3},
+		},
 	}

 	for _, tc := range testCases {
@@ -25,6 +25,6 @@
 - action: CHALLENGE
  expression:
    all:
-        - userAgent.contains("Googlebot")
-        - path.startsWith("/search")
-  name: robots-txt-policy-disallow-7
+    - userAgent.contains("Googlebot")
+    - path.startsWith("/search")
+  name: robots-txt-policy-disallow-7
@@ -20,8 +20,8 @@
 - action: CHALLENGE
  expression:
    all:
-        - userAgent.contains("Googlebot")
-        - path.startsWith("/search/")
+    - userAgent.contains("Googlebot")
+    - path.startsWith("/search/")
  name: robots-txt-policy-disallow-6
 - action: WEIGH
  expression: userAgent.contains("Bingbot")
@@ -31,14 +31,14 @@
 - action: CHALLENGE
  expression:
    all:
-        - userAgent.contains("Bingbot")
-        - path.startsWith("/search/")
+    - userAgent.contains("Bingbot")
+    - path.startsWith("/search/")
  name: robots-txt-policy-disallow-8
 - action: CHALLENGE
  expression:
    all:
-        - userAgent.contains("Bingbot")
-        - path.startsWith("/admin/")
+    - userAgent.contains("Bingbot")
+    - path.startsWith("/admin/")
  name: robots-txt-policy-disallow-9
 - action: DENY
  expression: userAgent.contains("BadBot")
@@ -54,18 +54,18 @@
 - action: CHALLENGE
  expression:
    all:
-        - userAgent.contains("TestBot")
-        - path.matches("^/.*/admin")
+    - userAgent.contains("TestBot")
+    - path.matches("^/.*/admin")
  name: robots-txt-policy-disallow-13
 - action: CHALLENGE
  expression:
    all:
-        - userAgent.contains("TestBot")
-        - path.matches("^/temp.*\\.html")
+    - userAgent.contains("TestBot")
+    - path.matches("^/temp.*\\.html")
  name: robots-txt-policy-disallow-14
 - action: CHALLENGE
  expression:
    all:
-        - userAgent.contains("TestBot")
-        - path.matches("^/file.\\.log")
+    - userAgent.contains("TestBot")
+    - path.matches("^/file.\\.log")
  name: robots-txt-policy-disallow-15
@@ -0,0 +1,25 @@
+# Test consecutive user agents that should be grouped into any: blocks
+User-agent: *
+Disallow: /admin
+Crawl-delay: 10
+
+# Multiple consecutive user agents - should be grouped
+User-agent: BadBot
+User-agent: SpamBot
+User-agent: EvilBot
+Disallow: /
+
+# Single user agent - should be separate
+User-agent: GoodBot
+Disallow: /private
+
+# Multiple consecutive user agents with crawl delay
+User-agent: SlowBot1
+User-agent: SlowBot2
+Crawl-delay: 5
+
+# Multiple consecutive user agents with specific path
+User-agent: SearchBot1
+User-agent: SearchBot2
+User-agent: SearchBot3
+Disallow: /search 
@@ -0,0 +1,47 @@
+- action: WEIGH
+  expression: "true"
+  name: robots-txt-policy-crawl-delay-1
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-2
+- action: DENY
+  expression:
+    any:
+    - userAgent.contains("BadBot")
+    - userAgent.contains("SpamBot")
+    - userAgent.contains("EvilBot")
+  name: robots-txt-policy-blacklist-3
+- action: CHALLENGE
+  expression:
+    all:
+    - userAgent.contains("GoodBot")
+    - path.startsWith("/private")
+  name: robots-txt-policy-disallow-4
+- action: WEIGH
+  expression:
+    any:
+    - userAgent.contains("SlowBot1")
+    - userAgent.contains("SlowBot2")
+  name: robots-txt-policy-crawl-delay-5
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression:
+    all:
+    - userAgent.contains("SearchBot1")
+    - path.startsWith("/search")
+  name: robots-txt-policy-disallow-7
+- action: CHALLENGE
+  expression:
+    all:
+    - userAgent.contains("SearchBot2")
+    - path.startsWith("/search")
+  name: robots-txt-policy-disallow-8
+- action: CHALLENGE
+  expression:
+    all:
+    - userAgent.contains("SearchBot3")
+    - path.startsWith("/search")
+  name: robots-txt-policy-disallow-9
@@ -1,12 +1,12 @@
 [
  {
-    "action": "CHALLENGE",
    "expression": "path.startsWith(\"/admin/\")",
-    "name": "robots-txt-policy-disallow-1"
+    "name": "robots-txt-policy-disallow-1",
+    "action": "CHALLENGE"
  },
  {
-    "action": "CHALLENGE",
    "expression": "path.startsWith(\"/private\")",
-    "name": "robots-txt-policy-disallow-2"
+    "name": "robots-txt-policy-disallow-2",
+    "action": "CHALLENGE"
  }
 ]
@@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+- Document missing environment variables in installation guide: `SLOG_LEVEL`, `COOKIE_PREFIX`, `FORCED_LANGUAGE`, and `TARGET_DISABLE_KEEPALIVE` ([#1086](https://github.com/TecharoHQ/anubis/pull/1086))
+- Add validation warning when persistent storage is used without setting signing keys
+- Fixed `robots2policy` to properly group consecutive user agents into `any:` instead of only processing the last one ([#925](https://github.com/TecharoHQ/anubis/pull/925))
+
 <!-- This changes the project to: -->

 ## v1.22.0: Yda Hext
@@ -59,7 +59,7 @@ Currently the following settings are configurable via the policy file:
 Anubis uses these environment variables for configuration:

 | Environment Variable           | Default value           | Explanation                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| :----------------------------- | :---------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+|:-------------------------------|:------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `BASE_PREFIX`                  | unset                   | If set, adds a global prefix to all Anubis endpoints (everything starting with `/.within.website/x/anubis/`). For example, setting this to `/myapp` would make Anubis accessible at `/myapp/` instead of `/`. This is useful when running Anubis behind a reverse proxy that routes based on path prefixes.                                                                                                                                                                                             |
 | `BIND`                         | `:8923`                 | The network address that Anubis listens on. For `unix`, set this to a path: `/run/anubis/instance.sock`                                                                                                                                                                                                                                                                                                                                                                                                 |
 | `BIND_NETWORK`                 | `tcp`                   | The address family that Anubis listens on. Accepts `tcp`, `unix` and anything Go's [`net.Listen`](https://pkg.go.dev/net#Listen) supports.                                                                                                                                                                                                                                                                                                                                                              |
@@ -67,10 +67,11 @@ Anubis uses these environment variables for configuration:
 | `COOKIE_DYNAMIC_DOMAIN`        | false                   | If set to true, automatically set cookie domain fields based on the hostname of the request. EG: if you are making a request to `anubis.techaro.lol`, the Anubis cookie will be valid for any subdomain of `techaro.lol`.                                                                                                                                                                                                                                                                               |
 | `COOKIE_EXPIRATION_TIME`       | `168h`                  | The amount of time the authorization cookie is valid for.                                                                                                                                                                                                                                                                                                                                                                                                                                               |
 | `COOKIE_PARTITIONED`           | `false`                 | If set to `true`, enables the [partitioned (CHIPS) flag](https://developers.google.com/privacy-sandbox/cookies/chips), meaning that Anubis inside an iframe has a different set of cookies than the domain hosting the iframe.                                                                                                                                                                                                                                                                          |
+| `COOKIE_PREFIX`                | `anubis-cookie`         | The prefix used for browser cookies created by Anubis. Useful for customization or avoiding conflicts with other applications.                                                                                                                                                                                                                                                                                                                                                                          |
 | `COOKIE_SECURE`                | `true`                  | If set to `true`, enables the [Secure flag](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/Cookies#block_access_to_your_cookies), meaning that the cookies will only be transmitted over HTTPS. If Anubis is used in an insecure context (plain HTTP), this will need to be set to `false`.                                                                                                                                                                                                   |
 | `DIFFICULTY`                   | `4`                     | The difficulty of the challenge, or the number of leading zeroes that must be in successful responses.                                                                                                                                                                                                                                                                                                                                                                                                  |
-| `ED25519_PRIVATE_KEY_HEX`      | unset                   | The hex-encoded ed25519 private key used to sign Anubis responses. If this is not set, Anubis will generate one for you. This should be exactly 64 characters long. When running multiple instances on the same base domain, the key must be the same across all instances. See below for details.                                                                                                                                                                                                      |
-| `ED25519_PRIVATE_KEY_HEX_FILE` | unset                   | Path to a file containing the hex-encoded ed25519 private key. Only one of this or its sister option may be set.                                                                                                                                                                                                                                                                                                                                                                                        |
+| `ED25519_PRIVATE_KEY_HEX`      | unset                   | The hex-encoded ed25519 private key used to sign Anubis responses. If this is not set, Anubis will generate one for you. This should be exactly 64 characters long. **Required when using persistent storage backends** (like bbolt) to ensure challenges survive service restarts. When running multiple instances on the same base domain, the key must be the same across all instances. See below for details.                                                                                      |
+| `ED25519_PRIVATE_KEY_HEX_FILE` | unset                   | Path to a file containing the hex-encoded ed25519 private key. Only one of this or its sister option may be set. **Required when using persistent storage backends** (like bbolt) to ensure challenges survive service restarts. When running multiple instances on the same base domain, the key must be the same across all instances.                                                                                                                                                                |
 | `JWT_RESTRICTION_HEADER`       | `X-Real-IP`             | If set, the JWT is only valid if the current value of this header matches the value when the JWT was created. You can use it e.g. to restrict a JWT to the source IP of the user using `X-Real-IP`.                                                                                                                                                                                                                                                                                                     |
 | `METRICS_BIND`                 | `:9090`                 | The network address that Anubis serves Prometheus metrics on. See `BIND` for more information.                                                                                                                                                                                                                                                                                                                                                                                                          |
 | `METRICS_BIND_NETWORK`         | `tcp`                   | The address family that the Anubis metrics server listens on. See `BIND_NETWORK` for more information.                                                                                                                                                                                                                                                                                                                                                                                                  |
@@ -81,6 +82,7 @@ Anubis uses these environment variables for configuration:
 | `PUBLIC_URL`                   | unset                   | The externally accessible URL for this Anubis instance, used for constructing redirect URLs (e.g., for Traefik forwardAuth).                                                                                                                                                                                                                                                                                                                                                                            |
 | `REDIRECT_DOMAINS`             | unset                   | If set, restrict the domains that Anubis can redirect to when passing a challenge.<br/><br/>If this is unset, Anubis may redirect to any domain which could cause security issues in the unlikely case that an attacker passes a challenge for your browser and then tricks you into clicking a link to your domain.<br/><br/>Note that if you are hosting Anubis on a non-standard port (`https://example.com:8443`, `http://www.example.net:8080`, etc.), you must also include the port number here. |
 | `SERVE_ROBOTS_TXT`             | `false`                 | If set `true`, Anubis will serve a default `robots.txt` file that disallows all known AI scrapers by name and then additionally disallows every scraper. This is useful if facts and circumstances make it difficult to change the underlying service to serve such a `robots.txt` file.                                                                                                                                                                                                                |
+| `SLOG_LEVEL`                   | `INFO`                  | The log level for structured logging. Valid values are `DEBUG`, `INFO`, `WARN`, and `ERROR`. Set to `DEBUG` to see all requests, evaluations, and detailed diagnostic information.                                                                                                                                                                                                                                                                                                                      |
 | `SOCKET_MODE`                  | `0770`                  | _Only used when at least one of the `*_BIND_NETWORK` variables are set to `unix`._ The socket mode (permissions) for Unix domain sockets.                                                                                                                                                                                                                                                                                                                                                               |
 | `STRIP_BASE_PREFIX`            | `false`                 | If set to `true`, strips the base prefix from request paths when forwarding to the target server. This is useful when your target service expects to receive requests without the base prefix. For example, with `BASE_PREFIX=/foo` and `STRIP_BASE_PREFIX=true`, a request to `/foo/bar` would be forwarded to the target as `/bar`.                                                                                                                                                                   |
 | `TARGET`                       | `http://localhost:3923` | The URL of the service that Anubis should forward valid requests to. Supports Unix domain sockets, set this to a URI like so: `unix:///path/to/socket.sock`.                                                                                                                                                                                                                                                                                                                                            |
@@ -100,10 +102,12 @@ If you don't know or understand what these settings mean, ignore them. These are

 | Environment Variable          | Default value | Explanation                                                                                                                                                             |
 | :---------------------------- | :------------ | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `TARGET_SNI`                  | unset         | If set, overrides the TLS handshake hostname in requests forwarded to `TARGET`.                                                                                         |
+| `FORCED_LANGUAGE`             | unset         | If set, forces Anubis to display challenge pages in the specified language instead of using the browser's Accept-Language header. Use ISO 639-1 language codes (e.g., `de` for German, `fr` for French). |
+| `HS512_SECRET`                | unset         | Secret string for JWT HS512 algorithm. If this is not set, Anubis will use ED25519 as defined via the variables above. The longer the better; 128 chars should suffice. **Required when using persistent storage backends** (like bbolt) to ensure challenges survive service restarts. When running multiple instances on the same base domain, the key must be the same across all instances. |
+| `TARGET_DISABLE_KEEPALIVE`    | `false`       | If `true`, disables HTTP keep-alive for connections to the target backend. Useful for backends that don't handle keep-alive properly.                                   |
 | `TARGET_HOST`                 | unset         | If set, overrides the Host header in requests forwarded to `TARGET`.                                                                                                    |
 | `TARGET_INSECURE_SKIP_VERIFY` | `false`       | If `true`, skip TLS certificate validation for targets that listen over `https`. If your backend does not listen over `https`, ignore this setting.                     |
-| `HS512_SECRET`                | unset         | Secret string for JWT HS512 algorithm. If this is not set, Anubis will use ED25519 as defined via the variables above. The longer the better; 128 chars should suffice. |
+| `TARGET_SNI`                  | unset         | If set, overrides the TLS handshake hostname in requests forwarded to `TARGET`.                                                                                         |

 </details>

@@ -11,10 +11,9 @@ import (
 	"go.etcd.io/bbolt"
 )

-// Sentinel error values used for testing and in admin-visible error messages.
+// Sentinel error value used for testing and in admin-visible error messages.
 var (
-	ErrBucketDoesNotExist = errors.New("bbolt: bucket does not exist")
-	ErrNotExists          = errors.New("bbolt: value does not exist in store")
+	ErrNotExists = errors.New("bbolt: value does not exist in store")
 )

 // Store implements store.Interface backed by bbolt[1].
@@ -150,6 +149,10 @@ func (s *Store) cleanup(ctx context.Context) error {
 	})
 }

+func (s *Store) IsPersistent() bool {
+	return true
+}
+
 func (s *Store) cleanupThread(ctx context.Context) {
 	t := time.NewTicker(time.Hour)
 	defer t.Stop()
@@ -37,6 +37,11 @@ type Interface interface {

 	// Set puts a value into the store that expires according to its expiry.
 	Set(ctx context.Context, key string, value []byte, expiry time.Duration) error
+
+	// IsPersistent returns true if this storage backend persists data across
+	// service restarts (e.g., bbolt, valkey). Returns false for volatile storage
+	// like in-memory backends.
+	IsPersistent() bool
 }

 func z[T any]() T { return *new(T) }
@@ -88,3 +93,7 @@ func (j *JSON[T]) Set(ctx context.Context, key string, value T, expiry time.Dura

 	return nil
 }
+
+func (j *JSON[T]) IsPersistent() bool {
+	return j.Underlying.IsPersistent()
+}
@@ -48,6 +48,10 @@ func (i *impl) Set(_ context.Context, key string, value []byte, expiry time.Dura
 	return nil
 }

+func (i *impl) IsPersistent() bool {
+	return false
+}
+
 func (i *impl) cleanupThread(ctx context.Context) {
 	t := time.NewTicker(5 * time.Minute)
 	defer t.Stop()
@@ -47,3 +47,7 @@ func (s *Store) Set(ctx context.Context, key string, value []byte, expiry time.D

 	return nil
 }
+
+func (s *Store) IsPersistent() bool {
+	return true
+}
Author	SHA1	Message	Date
Jason Cameron	49a7e4b7e6	Merge branch 'main' into json/requireED25519 Signed-off-by: Jason Cameron <git@jasoncameron.dev>	2025-09-06 22:35:45 -04:00
Jason Cameron	82099d9e05	fix(robots2policy): handle multiple user agents under one block (#925 )	2025-09-06 22:35:19 -04:00
dependabot[bot]	87c2f1e0e6	build(deps): bump the github-actions group across 1 directory with 8 updates (#1071 ) Co-authored-by: Jason Cameron <git@jasoncameron.dev>	2025-09-06 22:30:43 -04:00
Jason Cameron	e1ef8c050e	feat(storage): add IsPersistent method and validation warning for signing keys	2025-09-06 22:07:14 -04:00
Jason Cameron	e53dd5a37a	feat(storage): add IsPersistent method and validation warning for signing keys	2025-09-06 22:01:38 -04:00
Jason Cameron	9e8e643c7f	Merge branch 'main' of https://github.com/TecharoHQ/anubis into json/requireED25519	2025-09-06 21:45:49 -04:00
Jason Cameron	f0199d014f	docs: document some missing env vars (#1087 )	2025-09-07 01:34:42 +00:00
Jason Cameron	74c85bb971	docs(installation): document missing environment variables in installation guide	2025-09-06 21:24:37 -04:00
Jason Cameron	75109f6b73	docs(installation): add SLOG_LEVEL environment variable to configuration (#1086 ) * docs(installation): add SLOG_LEVEL environment variable to configuration * docs(installation): add SLOG_LEVEL environment variable to configuration	2025-09-06 20:59:02 -04:00
Jason Cameron	4a527a304b	docs(installation): add SLOG_LEVEL environment variable to configuration	2025-09-06 20:42:34 -04:00
Jason Cameron	c2ead79823	docs(installation): add SLOG_LEVEL environment variable to configuration	2025-09-06 20:40:11 -04:00