# robots.txt para oposiciones.de — propuesta auditoría 2026
# Política: permitimos crawlers de búsqueda y de IA por defecto. Bloqueamos endpoints debug y rutas internas.
# Última revisión: 2026-05-21

# -----------------------------------------------------------------------------
# Shared policy group: every crawler not matched by a later group ("*"), plus
# the search-engine and AI crawlers that are declared explicitly for clarity.
#
# Why one stacked group instead of one group per crawler:
#   A crawler obeys ONLY the most specific group matching its user-agent and
#   never merges that group with "*" (RFC 9309, section 2.2.1). A standalone
#   "User-agent: Googlebot" + "Allow: /" group would therefore cancel every
#   Disallow below for Googlebot. Stacking the User-agent lines over a single
#   rule set keeps the explicit declarations AND applies the same blocks.
#
# Editing rules:
#   - Do not insert blank lines inside this group; some parsers end a group at
#     the first blank line. Use "#" comment lines as separators instead.
#   - This file is public and is not access control. Anything listed under
#     Disallow must also be protected (or removed) on the server.
# -----------------------------------------------------------------------------
User-agent: *
#
# --- Traditional search-engine crawlers ---
User-agent: Googlebot
# Googlebot-Image previously had only "Allow: /img/", which restricted nothing
# because the group had no Disallow. It now shares the common policy.
User-agent: Googlebot-Image
User-agent: Bingbot
User-agent: DuckDuckBot
User-agent: Slurp
User-agent: YandexBot
#
# --- Generative-AI crawlers: policy is ALLOW (citability is prioritised) ---
# If a "no training / yes citation" split is wanted later, move the training
# user-agents into their own group with their own rules.
#
# OpenAI
User-agent: GPTBot
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
# Google AI (Gemini, Vertex)
User-agent: Google-Extended
# Anthropic (Claude)
User-agent: ClaudeBot
User-agent: Claude-Web
User-agent: anthropic-ai
User-agent: Claude-SearchBot
User-agent: Claude-User
# Perplexity
User-agent: PerplexityBot
User-agent: Perplexity-User
# Apple Intelligence
User-agent: Applebot
User-agent: Applebot-Extended
# Meta (Llama / Meta AI)
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
User-agent: FacebookBot
# Common Crawl (training source for many LLMs)
User-agent: CCBot
# ByteDance (Doubao / TikTok AI)
User-agent: Bytespider
# Amazon (Alexa, Nova)
User-agent: Amazonbot
# Cohere
User-agent: cohere-ai
User-agent: cohere-training-data-crawler
# Mistral
User-agent: MistralAI-User
# You.com
User-agent: YouBot
# Diffbot (also used by LLMs)
User-agent: Diffbot
# Omgili (LLM training datasets)
User-agent: Omgilibot
User-agent: Omgili
# Webz.io
User-agent: Webzio-Extended
# Timpi
User-agent: Timpibot
# Kagi
User-agent: KagiBot
# Phind
User-agent: PhindBot
# DuckDuckGo AI
User-agent: DuckAssistBot
#
# --- Rules shared by every user-agent listed above ---
# Internal code, includes and logs
Disallow: /php/
Disallow: /consulta/php/
Disallow: /inc/
Disallow: /logs/
# Debug / diagnostic endpoints
Disallow: /test.php
Disallow: /diagnostico-busqueda.php
Disallow: /dashboard.html
# Dumps, backups and raw data exports ("$" anchors the end of the URL)
Disallow: /*.sql$
Disallow: /*.bak$
Disallow: /*.csv$
# Internal search results: avoids duplicates and indexing of arbitrary queries
# NOTE(review): only matches "q" as the FIRST query parameter; if search URLs
# can carry it later (e.g. "?page=2&q=..."), consider adding "Disallow: /*&q=".
Disallow: /?q=
Disallow: /*?q=
# Everything else is crawlable. Kept AFTER the Disallow lines so that parsers
# applying the first matching rule (rather than the longest match) still
# honour the blocks above.
Allow: /

# -----------------------------------------------------------------------------
# SEO-tool and scraping crawlers: rate-limited or blocked outright
# -----------------------------------------------------------------------------
# Rate-limited crawlers. A crawler that matches a named group ignores the
# "User-agent: *" group entirely, so the internal-path blocks must be repeated
# here; a group holding only Crawl-delay would leave every path open to them.
# Keep this list in sync with the Disallow rules of the "User-agent: *" group.
# Crawl-delay is a non-standard directive (not part of RFC 9309); it is listed
# here because these two bots document support for it.
User-agent: AhrefsBot
User-agent: SemrushBot
Disallow: /php/
Disallow: /consulta/php/
Disallow: /inc/
Disallow: /logs/
Disallow: /test.php
Disallow: /diagnostico-busqueda.php
Disallow: /dashboard.html
Disallow: /*.sql$
Disallow: /*.bak$
Disallow: /*.csv$
Disallow: /?q=
Disallow: /*?q=
Crawl-delay: 10

# Fully blocked crawlers: nothing on the site may be fetched.
User-agent: MJ12bot
User-agent: DotBot
User-agent: BLEXBot
Disallow: /

# -----------------------------------------------------------------------------
# Sitemaps
# Sitemap lines are not tied to any User-agent group; they apply to every
# crawler that reads this file. URLs must be absolute.
# NOTE(review): presumably sitemap-index.xml already references sitemap.xml;
# if so the second line is redundant (harmless) — confirm against the index.
# -----------------------------------------------------------------------------
Sitemap: https://www.oposiciones.de/sitemap-index.xml
Sitemap: https://www.oposiciones.de/sitemap.xml