/*
Batch process robots.txt files from archives like
https://github.com/nrjones8/robots-dot-txt-archive-bot/tree/master/data/cleaned
into Anubis CEL policies.

Usage: go run batch_process.go <directory>

Every regular file found under <directory> is handed to the converter via
"go run main.go", and the resulting YAML policies are written to the
"generated_policies" directory in the current working directory.
*/
package main

import (
	"fmt"
	"io/fs"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
)

// policyNameFor derives a policy name from the location of path relative to
// root: path separators become "-", a trailing "-robots.txt" is dropped, and
// every remaining "." becomes "-". For example, with root "cleaned" the file
// "cleaned/example.com/robots.txt" yields "example-com".
//
// It returns an error when path cannot be expressed relative to root.
func policyNameFor(root, path string) (string, error) {
	rel, err := filepath.Rel(root, path)
	if err != nil {
		return "", fmt.Errorf("resolving %q relative to %q: %w", path, root, err)
	}

	// Normalise to forward slashes first so the OS-specific separator never
	// leaks into the policy name (and from there into the output file name).
	name := strings.ReplaceAll(filepath.ToSlash(rel), "/", "-")
	name = strings.TrimSuffix(name, "-robots.txt")
	return strings.ReplaceAll(name, ".", "-"), nil
}

// main walks the directory given as the first command-line argument and
// converts every regular file in it into a YAML policy by invoking
// "go run main.go" once per file. A file that fails to convert is reported
// as a warning and skipped; an error while walking the tree aborts the run.
func main() {
	if len(os.Args) < 2 {
		fmt.Println("Usage: go run batch_process.go <directory>")
		fmt.Println("Example: go run batch_process.go ./cleaned")
		os.Exit(1)
	}

	cleanedDir := os.Args[1]
	outputDir := "generated_policies"

	// Create output directory
	if err := os.MkdirAll(outputDir, 0755); err != nil {
		log.Fatalf("Failed to create output directory: %v", err)
	}

	count := 0
	failed := 0
	err := filepath.WalkDir(cleanedDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}

		// Skip directories
		if d.IsDir() {
			return nil
		}

		// Generate policy name from file path
		policyName, nameErr := policyNameFor(cleanedDir, path)
		if nameErr != nil {
			fmt.Printf("Warning: Failed to process %s: %v\n", path, nameErr)
			failed++
			return nil // Continue processing other files
		}

		outputFile := filepath.Join(outputDir, policyName+".yaml")

		// NOTE(review): "main.go" is resolved against the current working
		// directory, so this presumably has to be started from the directory
		// holding the converter's main.go — confirm against the repo layout.
		cmd := exec.Command("go", "run", "main.go",
			"-input", path,
			"-output", outputFile,
			"-name", policyName,
			"-format", "yaml")

		// Capture the child's output so a failure can be diagnosed instead of
		// only surfacing the bare exit status.
		out, runErr := cmd.CombinedOutput()
		if runErr != nil {
			fmt.Printf("Warning: Failed to process %s: %v\n", path, runErr)
			if msg := strings.TrimSpace(string(out)); msg != "" {
				fmt.Println(msg)
			}
			failed++
			return nil // Continue processing other files
		}

		count++
		if count%100 == 0 {
			fmt.Printf("Processed %d files...\n", count)
		} else if count%10 == 0 {
			fmt.Print(".")
		}

		return nil
	})

	// Progress dots are written without a newline; terminate that line so the
	// messages below do not get appended to it.
	if count%100 >= 10 {
		fmt.Println()
	}

	if err != nil {
		log.Fatalf("Error walking directory: %v", err)
	}

	fmt.Printf("Successfully processed %d robots.txt files\n", count)
	if failed > 0 {
		fmt.Printf("Failed to process %d files\n", failed)
	}
	fmt.Printf("Generated policies saved to: %s/\n", outputDir)
}