# =============================================================================
# robots.txt — WellVersed  (https://wellversed.eu)
# Last updated: 2026-05-01
#
# INDEXING POLICY
#   Public pages open for crawling, indexing, and user-facing discovery:
#
#     /              Marketing landing page
#     /pricing       Public pricing & plans page
#     /contact       Contact & support page
#     /documentation FAQ / documentation page
#     /blogs         Blog index
#     /blogs/*       Individual published blog posts
#     /cases         Use-case index
#     /cases/*       Individual use-case pages
#     /terms         Terms of Service
#     /privacy       Privacy Policy
#
#   All other routes (authenticated app, API, uploads) are off-limits.
#
#   Note: wellversed.eu is the EU / EEA / UK canonical domain.
#         The global canonical is https://wellversed.tech.
#         Both serve identical public content. Crawlers are directed
#         to their geo-appropriate entry point by HTTP 302 redirect.
#
# COPYRIGHT & AI-TRAINING OPT-OUT
#   All content on this site is © WellVersed 2026. All rights reserved.
#
#   PERMITTED:
#     - Index and surface public pages in search results or AI-generated
#       recommendations, provided the original URL / attribution is shown.
#
#   PROHIBITED (without prior written consent):
#     - Reproduce, republish, or redistribute any content.
#     - Use any content — in whole or in part — to train, fine-tune, or
#       evaluate ML/AI models for any purpose, commercial or otherwise.
#       This includes large-scale pre-training, RLHF datasets, instruction
#       tuning, retrieval-augmented generation corpora, and benchmarking.
#     - Scrape, mirror, or create derivative works from any content.
#     - Store or cache content beyond what is necessary to render a single
#       page for a single user.
#
#   These prohibitions are enforceable under copyright law, the Computer
#   Fraud and Abuse Act (18 U.S.C. § 1030), the EU Database Directive,
#   and the UK Database Right.
#
#   Terms of Service:      https://wellversed.eu/terms
#   Privacy Policy:        https://wellversed.eu/privacy
#   Licensing enquiries:   legal@wellversed.tech
#
# REFERENCES
#   https://www.robotstxt.org/
#   https://developers.google.com/search/docs/crawling-indexing/robots/intro
#   https://www.rfc-editor.org/rfc/rfc9309  (Robots Exclusion Protocol, RFC 9309)
#   https://darkvisitors.com/agents
#   https://github.com/ai-robots-txt/ai.robots.txt
# =============================================================================


# =============================================================================
# SECTION 1 — Major search-engine crawlers (index, follow)
# These crawlers are permitted to discover and index public pages only.
# Authenticated app routes are explicitly disallowed.
# =============================================================================
User-agent: Googlebot
User-agent: Googlebot-Image
User-agent: Googlebot-Video
User-agent: Googlebot-News
User-agent: Bingbot
User-agent: BingPreview
User-agent: Slurp
User-agent: DuckDuckBot
User-agent: Ecosia
User-agent: Applebot
User-agent: SeznamBot
User-agent: MojeekBot
User-agent: BraveBot
User-agent: Baidu
# Baidu's crawler identifies itself with the product token "Baiduspider";
# parsers that compare tokens exactly never match the shorter "Baidu".
User-agent: Baiduspider
User-agent: Perplexity-User
#
# NOTE: this group deliberately contains no blank lines. Parsers that follow
# the original robots.txt convention (e.g. Python's urllib.robotparser) treat
# a blank line as the end of a group, which detaches the rules and the
# Crawl-delay below from the User-agent lines above. Use "#" lines as visual
# separators instead.
#
# -- Editorial-only CMS routes (auth-protected, disallow crawlers too) ---------
# Rules are prefix matches. A bare "/blogs/new" would also block published
# posts whose slug merely starts with "new" (e.g. /blogs/new-feature), because
# it is longer than "Allow: /blogs/". Anchor the exact path and list the
# sub-path and query-string forms separately.
Disallow: /blogs/new$
Disallow: /blogs/new/
Disallow: /blogs/new?
Disallow: /blogs/*/edit
#
# -- Public content — explicitly allowed ----------------------------------------
Allow: /$
Allow: /pricing
Allow: /contact
Allow: /documentation
Allow: /terms
Allow: /privacy
Allow: /blogs
Allow: /blogs/
Allow: /cases
Allow: /cases/
#
# Allow hashed CSS/JS bundles so renderers can score Core Web Vitals
Allow: /assets/
#
# -- Authenticated application routes -------------------------------------------
Disallow: /dashboard
Disallow: /workspace
Disallow: /projects
Disallow: /resources
Disallow: /settings
Disallow: /profile
Disallow: /team
Disallow: /candidates
Disallow: /interviews
Disallow: /interview
Disallow: /interactive
Disallow: /workflows
Disallow: /tasks
Disallow: /task-library
Disallow: /payments
Disallow: /practice
Disallow: /skill-quests
Disallow: /leaderboard
Disallow: /sysdesign
Disallow: /take-home
Disallow: /code-review
Disallow: /learner-home
Disallow: /learning-profile
Disallow: /study-plan
Disallow: /achievements
Disallow: /notifications
Disallow: /billing
Disallow: /newsletter
#
# -- Auth flow (no SEO value) ---------------------------------------------------
Disallow: /login
Disallow: /register
#
# -- Server-side / infrastructure paths ----------------------------------------
Disallow: /admin/
Disallow: /api/
Disallow: /uploads/
Disallow: /build/
Disallow: /.env
Disallow: /.git/
Disallow: /certs/
Disallow: /server/
#
# Crawl-delay is a non-standard extension; crawlers that do not support it
# ignore the line.
Crawl-delay: 10


# =============================================================================
# SECTION 1b — AI-search crawlers (public pages only · Crawl-delay 15)
# These bots power AI-assisted search and user-triggered web fetching.
# They are NOT used for training data collection per each provider's policy.
# Server middleware verifies legitimacy via each provider's published IP ranges.
#
#   OAI-SearchBot    https://openai.com/searchbot.json
#   ChatGPT-User     https://openai.com/chatgpt-user.json
#   Claude-User      https://claude.com/crawling/bots.json
#   Claude-SearchBot https://claude.com/crawling/bots.json
# =============================================================================
User-agent: OAI-SearchBot
User-agent: ChatGPT-User
User-agent: Claude-User
User-agent: Claude-SearchBot
#
# NOTE: this group deliberately contains no blank lines. Parsers that treat a
# blank line as the end of a group (e.g. Python's urllib.robotparser) would
# otherwise detach the rules and the Crawl-delay from the User-agent lines.
#
# -- Editorial-only CMS routes --------------------------------------------------
# Anchored so that published posts whose slug merely starts with "new"
# (e.g. /blogs/new-feature) are not caught by a bare prefix match.
Disallow: /blogs/new$
Disallow: /blogs/new/
Disallow: /blogs/new?
Disallow: /blogs/*/edit
#
# -- Public content and static assets -------------------------------------------
Allow: /$
Allow: /pricing
Allow: /contact
Allow: /documentation
Allow: /terms
Allow: /privacy
Allow: /blogs
Allow: /blogs/
Allow: /cases
Allow: /cases/
Allow: /assets/
#
# -- Authenticated app, auth flow, and infrastructure paths ---------------------
Disallow: /dashboard
Disallow: /workspace
Disallow: /projects
Disallow: /resources
Disallow: /settings
Disallow: /profile
Disallow: /team
Disallow: /candidates
Disallow: /interviews
Disallow: /interview
Disallow: /interactive
Disallow: /workflows
Disallow: /tasks
Disallow: /task-library
Disallow: /payments
Disallow: /practice
Disallow: /skill-quests
Disallow: /leaderboard
Disallow: /sysdesign
Disallow: /take-home
Disallow: /code-review
Disallow: /learner-home
Disallow: /learning-profile
Disallow: /study-plan
Disallow: /achievements
Disallow: /notifications
Disallow: /billing
Disallow: /newsletter
Disallow: /login
Disallow: /register
Disallow: /admin/
Disallow: /api/
Disallow: /uploads/
Disallow: /build/
Disallow: /.env
Disallow: /.git/
Disallow: /certs/
Disallow: /server/
#
# Crawl-delay is a non-standard extension; crawlers that do not support it
# ignore the line.
Crawl-delay: 15


# =============================================================================
# SECTION 2 — SEO-analysis & competitive-intelligence tools
# Permitted to audit public pages for SEO purposes only.
# Access does NOT grant any licence to publish, sell, or share scraped data.
# =============================================================================
User-agent: AhrefsBot
User-agent: SemrushBot
User-agent: SiteAuditBot
User-agent: MJ12bot
User-agent: DotBot
User-agent: BLEXBot
User-agent: SEOkicks
User-agent: Screaming Frog SEO Spider
User-agent: Rogerbot
#
# NOTE: this group deliberately contains no blank lines. Parsers that treat a
# blank line as the end of a group (e.g. Python's urllib.robotparser) would
# otherwise detach the rules and the Crawl-delay from the User-agent lines.
#
# Rule order matters for first-match parsers: specific disallows first, then
# the public allow-list, then the blanket "Disallow: /" last.
#
# Editorial routes. Anchored so that published posts whose slug merely starts
# with "new" (e.g. /blogs/new-feature) are not caught by a bare prefix match.
Disallow: /blogs/new$
Disallow: /blogs/new/
Disallow: /blogs/new?
Disallow: /blogs/*/edit
# Public pages these tools may audit.
Allow: /$
Allow: /pricing
Allow: /contact
Allow: /documentation
Allow: /terms
Allow: /privacy
Allow: /blogs
Allow: /blogs/
Allow: /cases
Allow: /cases/
# Everything else is off-limits for this group.
Disallow: /
#
# Crawl-delay is a non-standard extension; crawlers that do not support it
# ignore the line.
Crawl-delay: 30


# =============================================================================
# SECTION 3 — AI model-training, LLM-corpus harvesting, and large-scale
#              data-collection crawlers.
#
# Crawling this site for any of these purposes is CATEGORICALLY PROHIBITED
# under our Terms of Service and the copyright notice above.
#
# Disallow: / applies to the ENTIRE site — no exceptions, no partial access.
# =============================================================================

# -- Perplexity training (autonomous crawler) ---------------------------------
User-agent: PerplexityBot
# NOTE: Perplexity-User (user-triggered) is listed in Section 1 (allowed).
#
# NOTE: this group deliberately contains no blank lines between the first
# User-agent line and the final "Disallow: /". Parsers that treat a blank
# line as the end of a group (e.g. Python's urllib.robotparser) discard
# User-agent lines that are followed by a blank line before any rule, which
# would silently void the ban for every token listed here. Use "#" lines as
# visual separators instead.
#
# -- OpenAI -------------------------------------------------------------------
User-agent: GPTBot
# NOTE: OAI-SearchBot and ChatGPT-User moved to Section 1b (search/user-triggered only).
#
# -- Anthropic ----------------------------------------------------------------
User-agent: anthropic-ai
User-agent: ClaudeBot
#
# -- Google (AI-training variants, NOT the regular search bot) ----------------
User-agent: Google-Extended
User-agent: GoogleOther
User-agent: GoogleOther-Image
User-agent: GoogleOther-Video
#
# -- Apple AI training --------------------------------------------------------
User-agent: Applebot-Extended
#
# -- Meta / Facebook ----------------------------------------------------------
User-agent: meta-externalagent
User-agent: FacebookBot
# NOTE(review): facebookexternalhit is, as far as we know, the fetcher that
# builds link previews when a URL is shared on Meta apps; blocking it likely
# removes those previews — confirm this is intended.
User-agent: facebookexternalhit
#
# -- ByteDance / TikTok -------------------------------------------------------
User-agent: Bytespider
#
# -- Common Crawl (primary source for LLM training corpora) -------------------
# (archive.org_bot and ia_archiver are web-archive crawlers rather than
# Common Crawl itself; they are grouped here as bulk-collection crawlers.)
User-agent: CCBot
User-agent: archive.org_bot
User-agent: ia_archiver
#
# -- Cohere -------------------------------------------------------------------
User-agent: cohere-ai
#
# -- MistralAI ----------------------------------------------------------------
User-agent: MistralAI
#
# -- Amazon Alexa data collection ---------------------------------------------
User-agent: Amazonbot
#
# -- AI2 / Allen Institute for AI ---------------------------------------------
User-agent: AI2Bot
User-agent: Ai2Bot-Dolma
#
# -- Huawei -------------------------------------------------------------------
User-agent: PetalBot
#
# -- Diffbot (entity extraction / training data) ------------------------------
User-agent: Diffbot
#
# -- Data brokers / content aggregators / bulk scrapers -----------------------
User-agent: magpie-crawler
User-agent: Omgilibot
User-agent: Omgili
User-agent: ImagesiftBot
User-agent: DataForSeoBot
User-agent: Brightbot
User-agent: webz.io
User-agent: TurnitinBot
User-agent: AwarioRssBot
User-agent: AwarioSmartBot
User-agent: Meltwater
User-agent: Kangaroo Bot
User-agent: Barkrowler
User-agent: NetNewsWire
User-agent: YouBot
User-agent: Timpibot
User-agent: iaskspider
User-agent: proximic
#
# -- xAI / Grok --------------------------------------------------------------
User-agent: xAIBot
#
# -- DeepSeek ----------------------------------------------------------------
User-agent: DeepSeekBot
#
# -- OpenAI additional agents (ChatGPT Agent, Operator computer-use) ----------
User-agent: ChatGPT Agent
User-agent: OpenAI
User-agent: Operator
#
# -- Anthropic additional agents (Claude Web) ----------------------------------
# NOTE: Claude-SearchBot and Claude-User moved to Section 1b (search/user-triggered only).
User-agent: Claude-Web
#
# -- Google AI additional agents (Gemini, NotebookLM, Mariner, Firebase) ------
User-agent: CloudVertexBot
User-agent: Gemini-Deep-Research
User-agent: Google-Agent
User-agent: Google-CloudVertexBot
User-agent: Google-Firebase
User-agent: Google-NotebookLM
User-agent: GoogleAgent-Mariner
User-agent: NotebookLM
#
# -- Amazon / AWS AI (Bedrock, Kendra, Buy-for-Me, Nova Act) ------------------
User-agent: amazon-kendra
User-agent: AmazonBuyForMe
User-agent: Amzn-SearchBot
User-agent: Amzn-User
User-agent: bedrockbot
User-agent: NovaAct
#
# -- Meta AI additional agents ------------------------------------------------
# RFC 9309 parsers match tokens case-insensitively, so the mixed-case entries
# duplicate the lower-case ones; both spellings are kept for parsers that
# compare case-sensitively.
User-agent: Meta-ExternalAgent
User-agent: meta-externalfetcher
User-agent: Meta-ExternalFetcher
User-agent: meta-webindexer
#
# -- Mistral additional agents ------------------------------------------------
User-agent: MistralAI-User
User-agent: MistralAI-User/1.0
#
# -- Cohere additional agents -------------------------------------------------
User-agent: cohere-training-data-crawler
#
# -- Chinese AI (Zhipu/ChatGLM, Huawei Pangu) ---------------------------------
User-agent: ChatGLM-Spider
User-agent: PanguBot
#
# -- LAION / HuggingFace dataset harvest tools --------------------------------
User-agent: img2dataset
User-agent: LAIONDownloader
User-agent: laion-huggingface-processor
#
# -- Cloudflare / Microsoft Azure AI ------------------------------------------
User-agent: AzureAI-SearchBot
User-agent: Cloudflare-AutoRAG
#
# -- AI agentic frameworks and computer-use tools -----------------------------
User-agent: ApifyBot
User-agent: ApifyWebsiteContentCrawler
User-agent: Crawl4AI
User-agent: Devin
User-agent: FirecrawlAgent
User-agent: Manus-User
User-agent: TwinAgent
#
# -- AI search assistants (Kagi, DuckDuckGo AI, Phind, Exa, Andi) ------------
User-agent: Andibot
User-agent: DuckAssistBot
User-agent: ExaBot
User-agent: iAskBot
User-agent: iaskspider/2.0
User-agent: kagi-fetcher
User-agent: LinerBot
User-agent: LinkupBot
User-agent: PhindBot
User-agent: TavilyBot
#
# -- Atlassian enterprise AI --------------------------------------------------
User-agent: atlassian-bot
#
# -- Semrush AI-feature variants (not the standard SEO auditor in §2) ---------
User-agent: SemrushBot-OCOB
User-agent: SemrushBot-SWA
#
# -- AI2 / Allen Institute additional variants --------------------------------
User-agent: AI2Bot-DeepResearchEval
#
# -- Awario additional variant ------------------------------------------------
User-agent: Awario
#
# -- Corpus and bulk-harvest crawlers (2025-2026) -----------------------------
User-agent: AddSearchBot
User-agent: aiHitBot
User-agent: Anomura
User-agent: Aranet-SearchBot
User-agent: bigsur.ai
User-agent: BuddyBot
User-agent: Channel3Bot
User-agent: Cotoyogi
User-agent: Crawlspace
User-agent: Datenbank Crawler
User-agent: Echobot Bot
User-agent: EchoboxBot
User-agent: Factset_spyderbot
User-agent: FriendlyCrawler
User-agent: IbouBot
User-agent: ICC-Crawler
User-agent: imageSpider
User-agent: ISSCyberRiskCrawler
User-agent: KlaviyoAIBot
User-agent: KunatoCrawler
User-agent: LCC
User-agent: Linguee Bot
User-agent: MyCentralAIScraperBot
User-agent: NagetBot
User-agent: netEstate Imprint Crawler
User-agent: newsai
User-agent: Panscient
User-agent: panscient.com
User-agent: Poggio-Citations
User-agent: Poseidon Research Crawler
User-agent: QualifiedBot
User-agent: QuillBot
User-agent: quillbot.com
User-agent: SBIntuitionsBot
User-agent: Scrapy
User-agent: ShapBot
User-agent: Sidetrade indexer bot
User-agent: Spider
User-agent: TerraCotta
User-agent: Thinkbot
User-agent: TikTokSpider
User-agent: VelenPublicWebCrawler
User-agent: WARDBot
User-agent: webzio-extended
User-agent: Webzio-Extended
User-agent: wpbot
User-agent: WRTNBot
User-agent: YaK
User-agent: YandexAdditional
User-agent: YandexAdditionalBot
User-agent: ZanistaBot
#
# Single rule shared by every User-agent token listed in this group.
Disallow: /


# =============================================================================
# SECTION 4 — All other crawlers (catch-all default)
# Same public-only boundary as Section 1.
# Unknown crawlers are assumed to have no legitimate need for app pages.
# =============================================================================
User-agent: *
#
# NOTE: this group deliberately contains no blank lines. Parsers that treat a
# blank line as the end of a group (e.g. Python's urllib.robotparser) would
# otherwise detach the rules and the Crawl-delay from the User-agent line,
# leaving unknown crawlers with no restrictions at all.
#
# -- Editorial-only CMS routes --------------------------------------------------
# Anchored so that published posts whose slug merely starts with "new"
# (e.g. /blogs/new-feature) are not caught by a bare prefix match.
Disallow: /blogs/new$
Disallow: /blogs/new/
Disallow: /blogs/new?
Disallow: /blogs/*/edit
#
# -- Public content and static assets -------------------------------------------
Allow: /$
Allow: /pricing
Allow: /contact
Allow: /documentation
Allow: /terms
Allow: /privacy
Allow: /blogs
Allow: /blogs/
Allow: /cases
Allow: /cases/
Allow: /assets/
#
# -- Authenticated app, auth flow, and infrastructure paths ---------------------
Disallow: /dashboard
Disallow: /workspace
Disallow: /projects
Disallow: /resources
Disallow: /settings
Disallow: /profile
Disallow: /team
Disallow: /candidates
Disallow: /interviews
Disallow: /interview
Disallow: /interactive
Disallow: /workflows
Disallow: /tasks
Disallow: /task-library
Disallow: /payments
Disallow: /practice
Disallow: /skill-quests
Disallow: /leaderboard
Disallow: /sysdesign
Disallow: /take-home
Disallow: /code-review
Disallow: /learner-home
Disallow: /learning-profile
Disallow: /study-plan
Disallow: /achievements
Disallow: /notifications
Disallow: /billing
Disallow: /newsletter
Disallow: /login
Disallow: /register
Disallow: /admin/
Disallow: /api/
Disallow: /uploads/
Disallow: /build/
Disallow: /.env
Disallow: /.git/
Disallow: /certs/
Disallow: /server/
#
# Crawl-delay is a non-standard extension; crawlers that do not support it
# ignore the line.
Crawl-delay: 10


# =============================================================================
# Sitemap
# Absolute URL of the XML sitemap for this host (wellversed.eu).
# =============================================================================
Sitemap: https://wellversed.eu/sitemap.xml