Mirror of https://github.com/TecharoHQ/anubis.git (synced 2026-04-13 03:58:45 +00:00)

feat: add various robots.txt and YAML configurations for user agent handling and crawl delays

cmd/robots2policy/testdata/blacklist.robots.txt (vendored, new file)
@@ -0,0 +1,15 @@
# Test with blacklisted user agents
User-agent: *
Disallow: /admin
Crawl-delay: 10

User-agent: BadBot
Disallow: /

User-agent: SpamBot
Disallow: /
Crawl-delay: 60

User-agent: Googlebot
Disallow: /search
Crawl-delay: 5

cmd/robots2policy/testdata/blacklist.yaml (vendored, new file)
@@ -0,0 +1,36 @@
- name: robots-txt-policy-crawl-delay-1
  action: WEIGH
  expression:
    single: "true"
  weight:
    adjust: 3
- name: robots-txt-policy-disallow-2
  action: CHALLENGE
  expression:
    single: path.startsWith("/admin")
- name: robots-txt-policy-blacklist-3
  action: DENY
  expression:
    single: userAgent.contains("BadBot")
- name: robots-txt-policy-crawl-delay-4
  action: WEIGH
  expression:
    single: userAgent.contains("SpamBot")
  weight:
    adjust: 3
- name: robots-txt-policy-blacklist-5
  action: DENY
  expression:
    single: userAgent.contains("SpamBot")
- name: robots-txt-policy-crawl-delay-6
  action: WEIGH
  expression:
    single: userAgent.contains("Googlebot")
  weight:
    adjust: 3
- name: robots-txt-policy-disallow-7
  action: CHALLENGE
  expression:
    all:
      - userAgent.contains("Googlebot")
      - path.startsWith("/search")
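
Read side by side, the blacklist pair pins down the conversion rules the rest of the testdata repeats: a Disallow under the wildcard agent becomes a CHALLENGE on the path prefix, every Crawl-delay becomes a WEIGH rule with a fixed adjust: 3 (the delay value itself is discarded; 10, 60, and 5 all produce the same weight), and an agent disallowed from / outright becomes a DENY keyed on its user-agent string. A minimal Go sketch of that mapping, with hypothetical types and names since this is not the converter's actual source:

package main

import "fmt"

// robotsGroup is a hypothetical parsed form of one User-agent block.
type robotsGroup struct {
	UserAgent  string
	Disallows  []string
	CrawlDelay int
}

// rulesFor reproduces the rule ordering the fixture shows: the crawl-delay
// weight first, then one rule per Disallow line.
func rulesFor(g robotsGroup) []string {
	var out []string
	if g.CrawlDelay > 0 {
		// Every crawl delay collapses to the same fixed weight bump (adjust: 3).
		cond := "true"
		if g.UserAgent != "*" {
			cond = fmt.Sprintf("userAgent.contains(%q)", g.UserAgent)
		}
		out = append(out, "WEIGH(adjust: 3): "+cond)
	}
	for _, p := range g.Disallows {
		if p == "/" && g.UserAgent != "*" {
			// An agent banned from everything is denied on its name alone.
			out = append(out, fmt.Sprintf("DENY: userAgent.contains(%q)", g.UserAgent))
			continue
		}
		expr := fmt.Sprintf("path.startsWith(%q)", p)
		if g.UserAgent != "*" {
			// Scope the path check to the specific agent (the "all:" form).
			expr = fmt.Sprintf("userAgent.contains(%q) && %s", g.UserAgent, expr)
		}
		out = append(out, "CHALLENGE: "+expr)
	}
	return out
}

func main() {
	g := robotsGroup{UserAgent: "SpamBot", Disallows: []string{"/"}, CrawlDelay: 60}
	for _, r := range rulesFor(g) {
		fmt.Println(r) // WEIGH(adjust: 3): ..., then DENY: ...
	}
}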

cmd/robots2policy/testdata/complex.robots.txt (vendored, new file)
@@ -0,0 +1,30 @@
# Complex real-world example
User-agent: *
Disallow: /admin/
Disallow: /private/
Disallow: /api/internal/
Allow: /api/public/
Crawl-delay: 5

User-agent: Googlebot
Disallow: /search/
Allow: /api/
Crawl-delay: 2

User-agent: Bingbot
Disallow: /search/
Disallow: /admin/
Crawl-delay: 10

User-agent: BadBot
Disallow: /

User-agent: SeoBot
Disallow: /
Crawl-delay: 300

# Test with various patterns
User-agent: TestBot
Disallow: /*/admin
Disallow: /temp*.html
Disallow: /file?.log

cmd/robots2policy/testdata/complex.yaml (vendored, new file)
@@ -0,0 +1,80 @@
- name: robots-txt-policy-crawl-delay-1
  action: WEIGH
  expression:
    single: "true"
  weight:
    adjust: 3
- name: robots-txt-policy-disallow-2
  action: CHALLENGE
  expression:
    single: path.startsWith("/admin/")
- name: robots-txt-policy-disallow-3
  action: CHALLENGE
  expression:
    single: path.startsWith("/private/")
- name: robots-txt-policy-disallow-4
  action: CHALLENGE
  expression:
    single: path.startsWith("/api/internal/")
- name: robots-txt-policy-crawl-delay-5
  action: WEIGH
  expression:
    single: userAgent.contains("Googlebot")
  weight:
    adjust: 3
- name: robots-txt-policy-disallow-6
  action: CHALLENGE
  expression:
    all:
      - userAgent.contains("Googlebot")
      - path.startsWith("/search/")
- name: robots-txt-policy-crawl-delay-7
  action: WEIGH
  expression:
    single: userAgent.contains("Bingbot")
  weight:
    adjust: 3
- name: robots-txt-policy-disallow-8
  action: CHALLENGE
  expression:
    all:
      - userAgent.contains("Bingbot")
      - path.startsWith("/search/")
- name: robots-txt-policy-disallow-9
  action: CHALLENGE
  expression:
    all:
      - userAgent.contains("Bingbot")
      - path.startsWith("/admin/")
- name: robots-txt-policy-blacklist-10
  action: DENY
  expression:
    single: userAgent.contains("BadBot")
- name: robots-txt-policy-crawl-delay-11
  action: WEIGH
  expression:
    single: userAgent.contains("SeoBot")
  weight:
    adjust: 3
- name: robots-txt-policy-blacklist-12
  action: DENY
  expression:
    single: userAgent.contains("SeoBot")
- name: robots-txt-policy-disallow-13
  action: CHALLENGE
  expression:
    all:
      - userAgent.contains("TestBot")
      - path.matches("^/.*/admin")
- name: robots-txt-policy-disallow-14
  action: CHALLENGE
  expression:
    all:
      - userAgent.contains("TestBot")
      - path.matches("^/temp.*\\.html")
- name: robots-txt-policy-disallow-15
  action: CHALLENGE
  expression:
    all:
      - userAgent.contains("TestBot")
      - path.matches("^/file.\\.log")
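
The complex pair also shows what the converter drops: the Allow lines (/api/public/ and /api/) produce no YAML at all, and rule numbering runs sequentially across every group. The generated expressions are CEL, which is what Anubis's policy engine evaluates, so a single generated rule can be sanity-checked with cel-go; the string bindings for path and userAgent below are assumptions made for this example:

package main

import (
	"fmt"

	"github.com/google/cel-go/cel"
)

func main() {
	// Declare the two variables the generated expressions reference.
	env, err := cel.NewEnv(
		cel.Variable("path", cel.StringType),
		cel.Variable("userAgent", cel.StringType),
	)
	if err != nil {
		panic(err)
	}
	// The "all:" list in robots-txt-policy-disallow-8 is an implicit AND.
	ast, iss := env.Compile(`userAgent.contains("Bingbot") && path.startsWith("/search/")`)
	if iss != nil && iss.Err() != nil {
		panic(iss.Err())
	}
	prg, err := env.Program(ast)
	if err != nil {
		panic(err)
	}
	out, _, err := prg.Eval(map[string]any{
		"path":      "/search/results",
		"userAgent": "Mozilla/5.0 (compatible; Bingbot/2.0)",
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(out) // true
}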

cmd/robots2policy/testdata/custom-name.yaml (vendored, new file)
@@ -0,0 +1,8 @@
- name: my-custom-policy-disallow-1
  action: CHALLENGE
  expression:
    single: path.startsWith("/admin/")
- name: my-custom-policy-disallow-2
  action: CHALLENGE
  expression:
    single: path.startsWith("/private")

cmd/robots2policy/testdata/deny-action.yaml (vendored, new file)
@@ -0,0 +1,8 @@
- name: robots-txt-policy-disallow-1
  action: DENY
  expression:
    single: path.startsWith("/admin/")
- name: robots-txt-policy-disallow-2
  action: DENY
  expression:
    single: path.startsWith("/private")
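
custom-name.yaml and deny-action.yaml are the simple fixture converted twice more: once with the rule-name prefix swapped from robots-txt-policy to my-custom-policy, and once with every Disallow emitted as DENY instead of the default CHALLENGE. That implies the converter exposes both knobs on the command line; the invocations below are guesses for illustration, with flag names that are assumptions rather than verified options:

# Hypothetical flags -- consult the robots2policy usage text for the real ones.
robots2policy -input simple.robots.txt -name my-custom-policy
robots2policy -input simple.robots.txt -action DENY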

cmd/robots2policy/testdata/empty.robots.txt (vendored, new file)
@@ -0,0 +1,2 @@
# Empty robots.txt (comments only)
# No actual rules

cmd/robots2policy/testdata/empty.yaml (vendored, new file)
@@ -0,0 +1 @@
[]

cmd/robots2policy/testdata/simple.json (vendored, new file)
@@ -0,0 +1,16 @@
[
  {
    "name": "robots-txt-policy-disallow-1",
    "action": "CHALLENGE",
    "expression": {
      "single": "path.startsWith(\"/admin/\")"
    }
  },
  {
    "name": "robots-txt-policy-disallow-2",
    "action": "CHALLENGE",
    "expression": {
      "single": "path.startsWith(\"/private\")"
    }
  }
]
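
simple.json carries the same two rules as simple.yaml, so the JSON mode changes only the serialization, not the rule shape, which makes the output easy to consume programmatically. A self-contained Go sketch that decodes a fixture of this shape; the Rule struct follows the JSON keys visible in the testdata and is an assumption, not Anubis's internal type:

package main

import (
	"encoding/json"
	"fmt"
)

// Rule mirrors the keys in simple.json; "all" is included because other
// fixtures (e.g. complex.yaml) use it for AND-ed expressions.
type Rule struct {
	Name       string `json:"name"`
	Action     string `json:"action"`
	Expression struct {
		Single string   `json:"single,omitempty"`
		All    []string `json:"all,omitempty"`
	} `json:"expression"`
}

func main() {
	data := []byte(`[{"name":"robots-txt-policy-disallow-1","action":"CHALLENGE",
		"expression":{"single":"path.startsWith(\"/admin/\")"}}]`)
	var rules []Rule
	if err := json.Unmarshal(data, &rules); err != nil {
		panic(err)
	}
	// Prints: robots-txt-policy-disallow-1 CHALLENGE path.startsWith("/admin/")
	fmt.Println(rules[0].Name, rules[0].Action, rules[0].Expression.Single)
}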

cmd/robots2policy/testdata/simple.robots.txt (vendored, new file)
@@ -0,0 +1,5 @@
# Simple robots.txt test
User-agent: *
Disallow: /admin/
Disallow: /private
Allow: /public

cmd/robots2policy/testdata/simple.yaml (vendored, new file)
@@ -0,0 +1,8 @@
- name: robots-txt-policy-disallow-1
  action: CHALLENGE
  expression:
    single: path.startsWith("/admin/")
- name: robots-txt-policy-disallow-2
  action: CHALLENGE
  expression:
    single: path.startsWith("/private")

cmd/robots2policy/testdata/wildcards.robots.txt (vendored, new file)
@@ -0,0 +1,6 @@
# Test wildcard patterns
User-agent: *
Disallow: /search*
Disallow: /*/private
Disallow: /file?.txt
Disallow: /admin/*?action=delete

cmd/robots2policy/testdata/wildcards.yaml (vendored, new file)
@@ -0,0 +1,16 @@
- name: robots-txt-policy-disallow-1
  action: CHALLENGE
  expression:
    single: path.matches("^/search.*")
- name: robots-txt-policy-disallow-2
  action: CHALLENGE
  expression:
    single: path.matches("^/.*/private")
- name: robots-txt-policy-disallow-3
  action: CHALLENGE
  expression:
    single: path.matches("^/file.\\.txt")
- name: robots-txt-policy-disallow-4
  action: CHALLENGE
  expression:
    single: path.matches("^/admin/.*.action=delete")
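
The wildcard pair pins down the pattern translation: a path containing * or ? is emitted as path.matches() instead of path.startsWith(), with * becoming .*, ? becoming . (a single-character wildcard, as the /file?.txt fixture shows), every other character regex-escaped, and the result anchored at the start of the path. A Go sketch of that translation, illustrative rather than the converter's actual source:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// robotsPatternToRegex reproduces the translation the wildcard fixtures imply:
// '*' -> ".*", '?' -> "." (single character), everything else escaped with
// regexp.QuoteMeta, and the whole pattern anchored at the beginning.
func robotsPatternToRegex(pattern string) string {
	var b strings.Builder
	b.WriteString("^")
	for _, r := range pattern {
		switch r {
		case '*':
			b.WriteString(".*")
		case '?':
			b.WriteString(".")
		default:
			b.WriteString(regexp.QuoteMeta(string(r)))
		}
	}
	return b.String()
}

func main() {
	for _, p := range []string{"/search*", "/*/private", "/file?.txt", "/admin/*?action=delete"} {
		fmt.Printf("%-24s -> %s\n", p, robotsPatternToRegex(p))
	}
	// Output matches wildcards.yaml, e.g. /file?.txt -> ^/file.\.txt
}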