# robots.txt for https://rozz.certain.com
# GEO-optimized content subdomain for certain.com
# This file applies to: https://rozz.certain.com/* (NOT the main certain.com site)

# Default group: allow all crawlers (content is public).
# A crawler that is named in a more specific User-agent group elsewhere in
# this file follows that group instead of this one.
User-agent: *
Allow: /

# =============================================================================
# TRADITIONAL SEARCH ENGINES
# =============================================================================

# Googlebot (Google), Bingbot (Bing), Slurp (Yahoo), DuckDuckBot (DuckDuckGo)
# and Yandex share one group: every path may be crawled.
User-agent: Googlebot
User-agent: Bingbot
User-agent: Slurp
User-agent: DuckDuckBot
User-agent: Yandex
Allow: /

# =============================================================================
# LLM CRAWLERS - Welcome! See https://rozz.certain.com/llms.txt and https://rozz.certain.com/llms-full.txt
# =============================================================================

# --- Anthropic (Claude) ---
# Crawlers with an explicit zero crawl delay:
#   ClaudeBot        - training data collection
#   Claude-SearchBot - search index for Claude web search citations
User-agent: ClaudeBot
User-agent: Claude-SearchBot
Allow: /
Crawl-delay: 0

# On-demand and legacy agents (no crawl delay specified):
#   Claude-User  - user-triggered URL fetches during conversations
#   Claude-Web   - legacy Anthropic crawler
#   anthropic-ai - generic Anthropic bot identifier
User-agent: Claude-User
User-agent: Claude-Web
User-agent: anthropic-ai
Allow: /

# --- OpenAI (ChatGPT/SearchGPT) ---
# Agents with no crawl delay specified:
#   GPTBot       - training data collection
#   ChatGPT-User - user-triggered URL fetches during conversations
User-agent: GPTBot
User-agent: ChatGPT-User
Allow: /

# OAI-SearchBot - search index for SearchGPT citations; zero crawl delay
User-agent: OAI-SearchBot
Allow: /
Crawl-delay: 0

# --- Perplexity ---
# PerplexityBot: Crawls and indexes pages for Perplexity search results and citations
User-agent: PerplexityBot
Allow: /
Crawl-delay: 0

# Perplexity-User: User-triggered fetches during conversations.
# The published user-agent token is hyphenated. The earlier "PerplexityUser"
# spelling matched no crawler, so that group was never applied.
User-agent: Perplexity-User
Allow: /

# --- Google AI ---
# Google-Extended: control token for Gemini (formerly Bard) AI use of this content.
# NOTE(review): per Google's crawler documentation this is a product token only,
# not a separate crawler - fetching is still done by Googlebot. Confirm before
# relying on it for anything other than AI-usage opt-in/opt-out.
User-agent: Google-Extended
Allow: /

# --- Meta (Facebook/Instagram) ---
#   Meta-ExternalAgent   - Meta AI training and search
#   Meta-ExternalFetcher - Meta content fetching
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
Allow: /

# --- Other LLM Providers ---
#   Amazonbot  - Amazon Alexa/AI training
#   YouBot     - You.com AI search
#   cohere-ai  - Cohere AI training
#   Bytespider - ByteDance/TikTok AI
#   CCBot      - Common Crawl (used by many AI companies)
User-agent: Amazonbot
User-agent: YouBot
User-agent: cohere-ai
User-agent: Bytespider
User-agent: CCBot
Allow: /

# =============================================================================
# DISCOVERY FILES & APIs
# =============================================================================

# Sitemap location
Sitemap: https://rozz.certain.com/sitemap.xml

# LLM discovery files (https://llmstxt.org/)
# https://rozz.certain.com/llms.txt - Concise content index with links
# https://rozz.certain.com/llms-full.txt - Complete Q&A content inline (recommended for LLMs)

# Structured API endpoints (JSON)
# https://rozz.certain.com/api/qna.json - All Q&As with answers and metadata
# https://rozz.certain.com/api/pages.json - All pages index
# https://rozz.certain.com/api/topics.json - Topic taxonomy
# https://rozz.certain.com/api/search.json - Lightweight search index

# Rate limiting recommendation (advisory only; crawlers do not parse comments)
# Please limit requests to at most 10 per second per crawler

# Contact
# Questions: support@rozz.site