feat: add various robots.txt and YAML configurations for user agent handling and crawl delays

This commit is contained in:
Jason Cameron
2025-06-13 13:31:47 -04:00
parent 5a85cd41a2
commit 714496e317
14 changed files with 231 additions and 356 deletions

View File

@@ -0,0 +1,15 @@
# Test with blacklisted user agents (BadBot and SpamBot are disallowed from the entire site)
User-agent: *
Disallow: /admin
Crawl-delay: 10
User-agent: BadBot
Disallow: /
User-agent: SpamBot
Disallow: /
Crawl-delay: 60
User-agent: Googlebot
Disallow: /search
Crawl-delay: 5

View File

@@ -0,0 +1,36 @@
- name: robots-txt-policy-crawl-delay-1
action: WEIGH
expression:
single: "true"
weight:
adjust: 3
- name: robots-txt-policy-disallow-2
action: CHALLENGE
expression:
single: path.startsWith("/admin")
- name: robots-txt-policy-blacklist-3
action: DENY
expression:
single: userAgent.contains("BadBot")
- name: robots-txt-policy-crawl-delay-4
action: WEIGH
expression:
single: userAgent.contains("SpamBot")
weight:
adjust: 3
- name: robots-txt-policy-blacklist-5
action: DENY
expression:
single: userAgent.contains("SpamBot")
- name: robots-txt-policy-crawl-delay-6
action: WEIGH
expression:
single: userAgent.contains("Googlebot")
weight:
adjust: 3
- name: robots-txt-policy-disallow-7
action: CHALLENGE
expression:
all:
- userAgent.contains("Googlebot")
- path.startsWith("/search")

View File

@@ -0,0 +1,30 @@
# Complex real-world example
User-agent: *
Disallow: /admin/
Disallow: /private/
Disallow: /api/internal/
Allow: /api/public/
Crawl-delay: 5
User-agent: Googlebot
Disallow: /search/
Allow: /api/
Crawl-delay: 2
User-agent: Bingbot
Disallow: /search/
Disallow: /admin/
Crawl-delay: 10
User-agent: BadBot
Disallow: /
User-agent: SeoBot
Disallow: /
Crawl-delay: 300
# Test with wildcard patterns ('*' and '?') in Disallow paths
User-agent: TestBot
Disallow: /*/admin
Disallow: /temp*.html
Disallow: /file?.log

80
cmd/robots2policy/testdata/complex.yaml vendored Normal file
View File

@@ -0,0 +1,80 @@
- name: robots-txt-policy-crawl-delay-1
action: WEIGH
expression:
single: "true"
weight:
adjust: 3
- name: robots-txt-policy-disallow-2
action: CHALLENGE
expression:
single: path.startsWith("/admin/")
- name: robots-txt-policy-disallow-3
action: CHALLENGE
expression:
single: path.startsWith("/private/")
- name: robots-txt-policy-disallow-4
action: CHALLENGE
expression:
single: path.startsWith("/api/internal/")
- name: robots-txt-policy-crawl-delay-5
action: WEIGH
expression:
single: userAgent.contains("Googlebot")
weight:
adjust: 3
- name: robots-txt-policy-disallow-6
action: CHALLENGE
expression:
all:
- userAgent.contains("Googlebot")
- path.startsWith("/search/")
- name: robots-txt-policy-crawl-delay-7
action: WEIGH
expression:
single: userAgent.contains("Bingbot")
weight:
adjust: 3
- name: robots-txt-policy-disallow-8
action: CHALLENGE
expression:
all:
- userAgent.contains("Bingbot")
- path.startsWith("/search/")
- name: robots-txt-policy-disallow-9
action: CHALLENGE
expression:
all:
- userAgent.contains("Bingbot")
- path.startsWith("/admin/")
- name: robots-txt-policy-blacklist-10
action: DENY
expression:
single: userAgent.contains("BadBot")
- name: robots-txt-policy-crawl-delay-11
action: WEIGH
expression:
single: userAgent.contains("SeoBot")
weight:
adjust: 3
- name: robots-txt-policy-blacklist-12
action: DENY
expression:
single: userAgent.contains("SeoBot")
- name: robots-txt-policy-disallow-13
action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/.*/admin")
- name: robots-txt-policy-disallow-14
action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/temp.*\\.html")
- name: robots-txt-policy-disallow-15
action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/file.\\.log")

View File

@@ -0,0 +1,8 @@
- name: my-custom-policy-disallow-1
action: CHALLENGE
expression:
single: path.startsWith("/admin/")
- name: my-custom-policy-disallow-2
action: CHALLENGE
expression:
single: path.startsWith("/private")

View File

@@ -0,0 +1,8 @@
- name: robots-txt-policy-disallow-1
action: DENY
expression:
single: path.startsWith("/admin/")
- name: robots-txt-policy-disallow-2
action: DENY
expression:
single: path.startsWith("/private")

View File

@@ -0,0 +1,2 @@
# Empty robots.txt (comments only)
# No actual rules

1
cmd/robots2policy/testdata/empty.yaml vendored Normal file
View File

@@ -0,0 +1 @@
[]

16
cmd/robots2policy/testdata/simple.json vendored Normal file
View File

@@ -0,0 +1,16 @@
[
{
"name": "robots-txt-policy-disallow-1",
"action": "CHALLENGE",
"expression": {
"single": "path.startsWith(\"/admin/\")"
}
},
{
"name": "robots-txt-policy-disallow-2",
"action": "CHALLENGE",
"expression": {
"single": "path.startsWith(\"/private\")"
}
}
]

View File

@@ -0,0 +1,5 @@
# Simple robots.txt test
User-agent: *
Disallow: /admin/
Disallow: /private
Allow: /public

View File

@@ -0,0 +1,8 @@
- name: robots-txt-policy-disallow-1
action: CHALLENGE
expression:
single: path.startsWith("/admin/")
- name: robots-txt-policy-disallow-2
action: CHALLENGE
expression:
single: path.startsWith("/private")

View File

@@ -0,0 +1,6 @@
# Test wildcard patterns
User-agent: *
Disallow: /search*
Disallow: /*/private
Disallow: /file?.txt
Disallow: /admin/*?action=delete

View File

@@ -0,0 +1,16 @@
- name: robots-txt-policy-disallow-1
action: CHALLENGE
expression:
single: path.matches("^/search.*")
- name: robots-txt-policy-disallow-2
action: CHALLENGE
expression:
single: path.matches("^/.*/private")
- name: robots-txt-policy-disallow-3
action: CHALLENGE
expression:
single: path.matches("^/file.\\.txt")
- name: robots-txt-policy-disallow-4
action: CHALLENGE
expression:
single: path.matches("^/admin/.*.action=delete")