feat: add various robots.txt and YAML configurations for user agent handling and crawl delays

This commit is contained in:
Jason Cameron
2025-06-13 13:31:47 -04:00
parent 5a85cd41a2
commit 714496e317
14 changed files with 231 additions and 356 deletions

80
cmd/robots2policy/testdata/complex.yaml vendored Normal file
View File

@@ -0,0 +1,80 @@
# Unconditional WEIGH rule: the expression is the literal string 'true'
# (presumably a CEL expression that matches every request - TODO confirm).
# The weight adjustment is 3.
- name: robots-txt-policy-crawl-delay-1
  action: WEIGH
  expression:
    single: 'true'
  weight:
    adjust: 3
# CHALLENGE any request whose path starts with "/admin/"; the expression
# has no user-agent condition.
- name: robots-txt-policy-disallow-2
  action: CHALLENGE
  expression:
    single: 'path.startsWith("/admin/")'
# CHALLENGE any request whose path starts with "/private/"; the expression
# has no user-agent condition.
- name: robots-txt-policy-disallow-3
  action: CHALLENGE
  expression:
    single: 'path.startsWith("/private/")'
# CHALLENGE any request whose path starts with "/api/internal/"; the
# expression has no user-agent condition.
- name: robots-txt-policy-disallow-4
  action: CHALLENGE
  expression:
    single: 'path.startsWith("/api/internal/")'
# WEIGH rule for user agents containing "Googlebot"; weight adjustment 3.
- name: robots-txt-policy-crawl-delay-5
  action: WEIGH
  expression:
    single: 'userAgent.contains("Googlebot")'
  weight:
    adjust: 3
# CHALLENGE rule combining two conditions under `all` (presumably both must
# hold - TODO confirm): user agent contains "Googlebot" and the path starts
# with "/search/".
- name: robots-txt-policy-disallow-6
  action: CHALLENGE
  expression:
    all:
      - 'userAgent.contains("Googlebot")'
      - 'path.startsWith("/search/")'
# WEIGH rule for user agents containing "Bingbot"; weight adjustment 3.
- name: robots-txt-policy-crawl-delay-7
  action: WEIGH
  expression:
    single: 'userAgent.contains("Bingbot")'
  weight:
    adjust: 3
# CHALLENGE rule combining two conditions under `all` (presumably both must
# hold - TODO confirm): user agent contains "Bingbot" and the path starts
# with "/search/".
- name: robots-txt-policy-disallow-8
  action: CHALLENGE
  expression:
    all:
      - 'userAgent.contains("Bingbot")'
      - 'path.startsWith("/search/")'
# CHALLENGE rule for user agents containing "Bingbot" on paths starting
# with "/admin/".
# NOTE(review): robots-txt-policy-disallow-2 already challenges the same
# "/admin/" prefix without a user-agent condition, so this rule looks like
# a subset of it - confirm whether the overlap is intentional.
- name: robots-txt-policy-disallow-9
  action: CHALLENGE
  expression:
    all:
      - 'userAgent.contains("Bingbot")'
      - 'path.startsWith("/admin/")'
# DENY rule for user agents containing "BadBot"; no path condition.
- name: robots-txt-policy-blacklist-10
  action: DENY
  expression:
    single: 'userAgent.contains("BadBot")'
# WEIGH rule for user agents containing "SeoBot"; weight adjustment 3.
- name: robots-txt-policy-crawl-delay-11
  action: WEIGH
  expression:
    single: 'userAgent.contains("SeoBot")'
  weight:
    adjust: 3
# DENY rule for user agents containing "SeoBot"; no path condition.
# NOTE(review): robots-txt-policy-crawl-delay-11 uses the identical
# expression with action WEIGH; which of the two takes effect presumably
# depends on the consumer's evaluation order - confirm.
- name: robots-txt-policy-blacklist-12
  action: DENY
  expression:
    single: 'userAgent.contains("SeoBot")'
# CHALLENGE rule for user agents containing "TestBot" whose path matches
# the pattern ^/.*/admin (presumably a regular expression - TODO confirm).
- name: robots-txt-policy-disallow-13
  action: CHALLENGE
  expression:
    all:
      - 'userAgent.contains("TestBot")'
      - 'path.matches("^/.*/admin")'
# CHALLENGE rule for user agents containing "TestBot" whose path matches
# the pattern below. Single quotes keep both backslashes literal, so the
# stored string is the same as the unquoted form.
- name: robots-txt-policy-disallow-14
  action: CHALLENGE
  expression:
    all:
      - 'userAgent.contains("TestBot")'
      - 'path.matches("^/temp.*\\.html")'
# CHALLENGE rule for user agents containing "TestBot" whose path matches
# the pattern below. Single quotes keep both backslashes literal, so the
# stored string is the same as the unquoted form.
# NOTE(review): assuming matches() takes a regular expression, the
# unescaped "." after "file" matches any single character - confirm that
# is the intended wildcard.
- name: robots-txt-policy-disallow-15
  action: CHALLENGE
  expression:
    all:
      - 'userAgent.contains("TestBot")'
      - 'path.matches("^/file.\\.log")'