深度解析

Cloudflare Turnstile 实施检测指南

在解决 Cloudflare Turnstile 挑战之前,您需要在页面上检测它并提取站点密钥。 Turnstile 可以通过 HTML 属性、JavaScript API 调用嵌入,或在页面渲染后动态加载。本指南涵盖了每种检测方法——从简单的 HTML 解析到运行时 JavaScript 分析。


Turnstile 实现方式

站点以三种方式嵌入 Turnstile,每种方式都需要不同的检测方法:

方法 它是如何运作的 检测难度
HTML 隐式 页面源代码中的 <div class="cf-turnstile" data-sitekey="..."> 简单(静态 HTML)
JavaScript 显式 在脚本中调用 turnstile.render() 中等(需解析 JS)
动态加载 用户操作或 XHR 后加载的小部件 困难(需要执行 JS)

方法一:静态HTML检测

最简单的 Turnstile 集成使用 cf-turnstile 类和 data-sitekey 属性:

import re
import requests

def detect_turnstile_html(url):
    """Fetch ``url`` and look for a Turnstile widget in the static HTML.

    Only the markup returned by a plain GET is scanned; nothing is rendered
    or executed, so widgets injected later by JavaScript are not seen here.

    Args:
        url: Address of the page to download and scan.

    Returns:
        dict with the keys:
            turnstile_found: True when the text ``cf-turnstile`` occurs.
            sitekey: first ``data-sitekey`` value matching the pattern,
                otherwise None.
            mode: "invisible", "non-interactive" or "managed" when the widget
                marker was found, otherwise None.
            theme: ``data-theme`` value, otherwise None.
            action: ``data-action`` value, otherwise None.
            script_loaded: True when the Turnstile script URL fragment occurs.

    Any exception raised by ``requests.get`` (for example on timeout)
    propagates to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 Chrome/120.0.0.0",
        "Accept": "text/html,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }

    response = requests.get(url, headers=headers, timeout=15)
    html = response.text

    result = {
        "turnstile_found": False,
        "sitekey": None,
        "mode": None,
        "theme": None,
        "action": None,
        "script_loaded": False,
    }

    # The script tag and the widget container are reported independently.
    if "challenges.cloudflare.com/turnstile" in html:
        result["script_loaded"] = True

    # Without the container marker none of the widget attributes are read.
    if "cf-turnstile" not in html:
        return result

    result["turnstile_found"] = True

    sitekey_match = re.search(
        r'data-sitekey=["\']([0-9x][A-Za-z0-9_-]+)["\']', html
    )
    if sitekey_match:
        result["sitekey"] = sitekey_match.group(1)

    # Accept single- or double-quoted attribute values, exactly like the
    # sitekey/theme/action patterns do; previously only double quotes matched
    # and single-quoted markup was misreported as "managed".
    if re.search(r'data-size=["\']invisible["\']', html):
        result["mode"] = "invisible"
    elif re.search(r'data-appearance=["\']interaction-only["\']', html):
        result["mode"] = "non-interactive"
    else:
        result["mode"] = "managed"

    theme_match = re.search(r'data-theme=["\'](\w+)["\']', html)
    if theme_match:
        result["theme"] = theme_match.group(1)

    action_match = re.search(r'data-action=["\']([^"\']+)["\']', html)
    if action_match:
        result["action"] = action_match.group(1)

    return result


# Usage: performs a live GET against the sample URL, then prints the sitekey
# and mode only when the widget marker was found in the returned HTML.
info = detect_turnstile_html("https://staging.example.com/qa-login")
if info["turnstile_found"]:
    print(f"Sitekey: {info['sitekey']}")
    print(f"Mode: {info['mode']}")

方法二:JavaScript API检测

有些网站使用 turnstile.render() 而不是 HTML 属性:

import re

def _extract_braced_body(text, open_index):
    """Return the text between the ``{`` at ``open_index`` and its matching ``}``.

    Nested braces are counted, and braces inside quoted strings (single,
    double or backtick) are ignored. Returns None when no matching close
    brace exists.
    """
    depth = 0
    quote = None
    i = open_index
    length = len(text)
    while i < length:
        ch = text[i]
        if quote is not None:
            if ch == "\\":
                # Skip the escaped character so \" or \' does not end the string.
                i += 2
                continue
            if ch == quote:
                quote = None
        elif ch in "'\"`":
            quote = ch
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[open_index + 1:i]
        i += 1
    return None


def detect_turnstile_js_api(html):
    """Detect Turnstile from JavaScript ``turnstile.render(...)`` calls.

    The first render call whose second argument is an object literal is
    inspected. The whole object literal is examined, including text after
    nested blocks such as an inline ``callback: function(t){...}``; the
    previous implementation stopped at the first ``}`` and lost every key
    that followed it.

    Args:
        html: Page source (HTML and/or inline JavaScript) as a string.

    Returns:
        ``{"found": False, "method": None}`` when no render call with an
        object literal is found. Otherwise a dict with ``found`` True,
        ``method`` "javascript_api" and ``sitekey`` / ``callback`` /
        ``action`` / ``appearance``, each None when not present as a literal
        in the config text.
    """
    # Same call shapes as before, but each pattern now ends at the opening
    # brace; the object body is extracted separately with brace matching.
    prefixes = [
        # turnstile.render('#element', {sitekey: '...'})
        r"turnstile\.render\s*\(\s*['\"][^'\"]+['\"]\s*,\s*\{",
        # turnstile.render(element, {sitekey: '...'})
        r"turnstile\.render\s*\([^,]+,\s*\{",
    ]

    for prefix in prefixes:
        for match in re.finditer(prefix, html):
            open_index = match.end() - 1
            config_text = _extract_braced_body(html, open_index)
            if config_text is None:
                # Unbalanced braces/quotes: fall back to the historical
                # behaviour of cutting at the first closing brace.
                close_index = html.find("}", open_index)
                if close_index == -1:
                    continue
                config_text = html[open_index + 1:close_index]
            if not config_text:
                # An empty ``{}`` never matched before either; keep scanning.
                continue

            sitekey_match = re.search(
                r"sitekey\s*:\s*['\"]([0-9x][A-Za-z0-9_-]+)['\"]", config_text
            )
            callback_match = re.search(
                r"callback\s*:\s*(\w+|function)", config_text
            )
            action_match = re.search(
                r"action\s*:\s*['\"]([^'\"]+)['\"]", config_text
            )
            appearance_match = re.search(
                r"appearance\s*:\s*['\"]([^'\"]+)['\"]", config_text
            )

            return {
                "found": True,
                "method": "javascript_api",
                "sitekey": sitekey_match.group(1) if sitekey_match else None,
                "callback": callback_match.group(1) if callback_match else None,
                "action": action_match.group(1) if action_match else None,
                "appearance": appearance_match.group(1) if appearance_match else None,
            }

    return {"found": False, "method": None}

方法三:动态加载检测(Selenium/Puppeteer)

当 Turnstile 在页面交互后动态加载时:

Python(Selenium)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

def detect_turnstile_dynamic(url):
    """Load ``url`` in Chrome via Selenium and probe the rendered DOM.

    Args:
        url: Page to open in the browser.

    Returns:
        dict with ``turnstile_found``, ``sitekey``, ``iframe_present`` and
        ``response_field``. The sitekey comes from a ``data-sitekey``
        attribute when one is present (the last non-empty one wins),
        otherwise from a ``sitekey: '...'`` literal in the page source.

    The driver is always shut down, even when a step raises.
    """
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=chrome_opts)

    def _document_complete(drv):
        return drv.execute_script("return document.readyState") == "complete"

    try:
        driver.get(url)

        # Block (up to 10 s) until the document reports it has finished loading.
        WebDriverWait(driver, 10).until(_document_complete)

        # Widget iframe served from the Cloudflare challenge host.
        iframe_present = bool(
            driver.find_elements(
                By.CSS_SELECTOR, "iframe[src*='challenges.cloudflare.com']"
            )
        )

        # Widget containers; every element is inspected and a later non-empty
        # data-sitekey replaces an earlier one.
        sitekey = None
        for node in driver.find_elements(
            By.CSS_SELECTOR, ".cf-turnstile, [data-sitekey]"
        ):
            candidate = node.get_attribute("data-sitekey")
            if candidate:
                sitekey = candidate

        # Hidden token inputs.
        response_field = bool(
            driver.find_elements(
                By.CSS_SELECTOR,
                "[name='cf-turnstile-response'], [name='g-recaptcha-response']",
            )
        )

        found = iframe_present or sitekey is not None

        # Last resort: a sitekey literal inside inline JavaScript.
        literal = re.search(
            r"sitekey\s*:\s*['\"]([0-9x][A-Za-z0-9_-]+)['\"]", driver.page_source
        )
        if literal and not sitekey:
            sitekey = literal.group(1)
            found = True

        return {
            "turnstile_found": found,
            "sitekey": sitekey,
            "iframe_present": iframe_present,
            "response_field": response_field,
        }

    finally:
        driver.quit()

Node.js(Puppeteer)

const puppeteer = require("puppeteer");

/**
 * Open `url` in headless Chrome via Puppeteer and probe the rendered page
 * for a Turnstile widget.
 *
 * @param {string} url Page to load.
 * @returns {Promise<{turnstileFound: boolean, sitekey: (string|null),
 *   iframePresent: boolean, responseField: boolean, scriptUrl: (string|null)}>}
 *   Detection summary. `scriptUrl` is the URL of the last network response
 *   whose address contains the Turnstile script path, or null.
 *
 * The browser is closed in a `finally` block so that a failed navigation or
 * evaluation does not leave a Chrome process running.
 */
async function detectTurnstileDynamic(url) {
  const browser = await puppeteer.launch({
    headless: "new",
    args: ["--no-sandbox"],
  });

  try {
    const page = await browser.newPage();

    const result = {
      turnstileFound: false,
      sitekey: null,
      iframePresent: false,
      responseField: false,
      scriptUrl: null,
    };

    // Monitor network for Turnstile script; registered before navigation so
    // responses during the initial load are seen.
    page.on("response", (response) => {
      if (response.url().includes("challenges.cloudflare.com/turnstile")) {
        result.scriptUrl = response.url();
      }
    });

    await page.goto(url, { waitUntil: "networkidle2" });

    // Check for Turnstile container (first matching element only).
    const sitekey = await page.evaluate(() => {
      const el = document.querySelector(
        ".cf-turnstile, [data-sitekey]"
      );
      return el ? el.getAttribute("data-sitekey") : null;
    });

    if (sitekey) {
      result.turnstileFound = true;
      result.sitekey = sitekey;
    }

    // Check for Turnstile iframe
    const iframes = await page.$$("iframe[src*='challenges.cloudflare.com']");
    if (iframes.length > 0) {
      result.turnstileFound = true;
      result.iframePresent = true;
    }

    // Check for response field
    const responseField = await page.$(
      "[name='cf-turnstile-response']"
    );
    result.responseField = !!responseField;

    return result;
  } finally {
    // Runs on success and on any thrown error alike.
    await browser.close();
  }
}

// Example run: launches the browser against the sample URL and logs the
// resolved detection object.
detectTurnstileDynamic("https://staging.example.com/qa-login").then(console.log);

综合检测类

import re
import requests

class TurnstileDetector:
    """Detect Cloudflare Turnstile across all implementation methods.

    Every check is a substring or regular-expression scan over one HTML
    string; no JavaScript is executed. The HTML is either supplied by the
    caller or downloaded once in the constructor.
    """

    # URL fragment that identifies the Turnstile script.
    TURNSTILE_SCRIPT = "challenges.cloudflare.com/turnstile"
    # Tried in order by extract_sitekey(); the first pattern that matches wins.
    SITEKEY_PATTERNS = [
        r'data-sitekey=["\']([0-9x][A-Za-z0-9_-]+)["\']',
        r"sitekey\s*:\s*['\"]([0-9x][A-Za-z0-9_-]+)['\"]",
        r"siteKey\s*[=:]\s*['\"]([0-9x][A-Za-z0-9_-]+)['\"]",
        r"TURNSTILE_SITE_KEY\s*[=:]\s*['\"]([0-9x][A-Za-z0-9_-]+)['\"]",
    ]
    # Mode markers, accepted as an HTML attribute or a JS config entry and
    # with either quote style, mirroring SITEKEY_PATTERNS.
    _INVISIBLE_RE = re.compile(
        r'data-size=["\']invisible["\']|size\s*:\s*["\']invisible["\']'
    )
    _INTERACTION_ONLY_RE = re.compile(
        r'data-appearance=["\']interaction-only["\']'
        r'|appearance\s*:\s*["\']interaction-only["\']'
    )

    def __init__(self, url, html=None):
        """Store the URL and HTML, downloading the page when no HTML is given.

        Args:
            url: Page address; echoed back in ``detect()`` and used for the
                download.
            html: Already-fetched page source. When falsy (None or empty)
                the page is fetched from ``url``.
        """
        self.url = url
        self.html = html
        if not self.html:
            self._fetch()

    def _fetch(self):
        """Download ``self.url`` with browser-like headers into ``self.html``."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/120.0.0.0",
            "Accept": "text/html,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
        }
        response = requests.get(self.url, headers=headers, timeout=15)
        self.html = response.text

    def detect(self):
        """Run all detection methods and return results as one dict."""
        return {
            "url": self.url,
            "turnstile_present": self.has_turnstile(),
            "sitekey": self.extract_sitekey(),
            "mode": self.detect_mode(),
            "implementation": self.detect_implementation(),
            "script_loaded": self.has_script(),
            "response_field": self.has_response_field(),
            "action": self.extract_action(),
            "theme": self.extract_theme(),
        }

    def has_turnstile(self):
        """Return True when the script, the container marker or a sitekey is found."""
        return (
            self.has_script()
            or "cf-turnstile" in self.html
            or self.extract_sitekey() is not None
        )

    def has_script(self):
        """Return True when the Turnstile script URL fragment occurs in the HTML."""
        return self.TURNSTILE_SCRIPT in self.html

    def has_response_field(self):
        """Return True when the text ``cf-turnstile-response`` occurs in the HTML."""
        return "cf-turnstile-response" in self.html

    def extract_sitekey(self):
        """Return the first sitekey matched by SITEKEY_PATTERNS, or None."""
        for pattern in self.SITEKEY_PATTERNS:
            match = re.search(pattern, self.html)
            if match:
                return match.group(1)
        return None

    def detect_mode(self):
        """Classify the widget as invisible / non-interactive / managed / unknown.

        The invisible marker is checked first, then interaction-only; any
        other page containing ``cf-turnstile`` is reported as "managed".
        Markers are recognised in single- or double-quoted form, both as HTML
        attributes and as JS config entries. Previously only the exact
        spellings ``data-size="invisible"``, ``size: 'invisible'`` and
        ``data-appearance="interaction-only"`` were recognised.
        """
        if self._INVISIBLE_RE.search(self.html):
            return "invisible"
        if self._INTERACTION_ONLY_RE.search(self.html):
            return "non-interactive"
        if "cf-turnstile" in self.html:
            return "managed"
        return "unknown"

    def detect_implementation(self):
        """Return how the widget appears to be embedded.

        "html_implicit" when the container marker and a ``data-sitekey=``
        attribute are both present, "javascript_explicit" when
        ``turnstile.render`` occurs, "dynamic_loading" when only the script
        is present, otherwise "unknown".
        """
        has_container = "cf-turnstile" in self.html
        if has_container and "data-sitekey=" in self.html:
            return "html_implicit"
        if "turnstile.render" in self.html:
            return "javascript_explicit"
        if self.has_script() and not has_container:
            return "dynamic_loading"
        return "unknown"

    def extract_action(self):
        """Return the ``data-action`` value, else a JS ``action: '...'`` value, else None."""
        match = re.search(r'data-action=["\']([^"\']+)["\']', self.html)
        if match:
            return match.group(1)
        match = re.search(r"action\s*:\s*['\"]([^'\"]+)['\"]", self.html)
        return match.group(1) if match else None

    def extract_theme(self):
        """Return the ``data-theme`` value, or "auto" when the attribute is absent."""
        match = re.search(r'data-theme=["\'](\w+)["\']', self.html)
        return match.group(1) if match else "auto"


# Usage: no html is passed, so the constructor downloads the page itself;
# the summary fields are printed only when something Turnstile-like was found.
detector = TurnstileDetector("https://staging.example.com/qa-login")
info = detector.detect()

if info["turnstile_present"]:
    print(f"Sitekey: {info['sitekey']}")
    print(f"Mode: {info['mode']}")
    print(f"Implementation: {info['implementation']}")

检测后解决

一旦检测到,用 CaptchaAI 解决:

import requests
import time

API_KEY = "YOUR_API_KEY"

def solve_detected_turnstile(detection_result):
    """Submit a detected Turnstile to the solver API and wait for the token.

    Args:
        detection_result: Mapping with at least ``turnstile_present``,
            ``sitekey`` and ``url``; an optional truthy ``action`` is
            forwarded to the solver.

    Returns:
        The ``request`` field of the first poll response whose ``status`` is 1.

    Raises:
        ValueError: No Turnstile was detected, or no sitekey is available.
        RuntimeError: The API rejected the submission, or a poll returned an
            ``ERROR...`` code. Previously such failures were polled until the
            timeout expired.
        TimeoutError: No result after 60 polls spaced 5 seconds apart.
    """
    if not detection_result["turnstile_present"]:
        raise ValueError("No Turnstile detected")

    if not detection_result["sitekey"]:
        raise ValueError("Sitekey not found — may need browser-based extraction")

    params = {
        "key": API_KEY,
        "method": "turnstile",
        "sitekey": detection_result["sitekey"],
        "pageurl": detection_result["url"],
        "json": 1,
    }

    # Include action if present
    if detection_result.get("action"):
        params["action"] = detection_result["action"]

    # A timeout keeps a stalled connection from blocking forever.
    submit = requests.post(
        "https://ocr.captchaai.com/in.php", data=params, timeout=30
    )
    submitted = submit.json()
    # On failure "request" carries an error code, not a task id; polling
    # with it could never succeed.
    if submitted.get("status") != 1:
        raise RuntimeError(
            f"Turnstile task submission failed: {submitted.get('request')}"
        )
    task_id = submitted["request"]

    for _ in range(60):
        time.sleep(5)
        result = requests.get("https://ocr.captchaai.com/res.php", params={
            "key": API_KEY,
            "action": "get",
            "id": task_id,
            "json": 1,
        }, timeout=30).json()

        if result.get("status") == 1:
            return result["request"]

        # Anything that is not an explicit error is treated as "not ready
        # yet" and polled again; explicit errors are terminal.
        if str(result.get("request", "")).startswith("ERROR"):
            raise RuntimeError(f"Turnstile solve failed: {result['request']}")

    raise TimeoutError("Turnstile solve timed out")


# Full workflow: detect on the signup page, then hand the detection dict to
# the solver; only the first 50 characters of the returned token are printed.
detector = TurnstileDetector("https://example.com/signup")
info = detector.detect()

if info["turnstile_present"]:
    token = solve_detected_turnstile(info)
    print(f"Token: {token[:50]}...")

边缘情况

场景 挑战 解决方案
外部 JS 文件中的 Sitekey 不在页面 HTML 中 解析链接的 JavaScript 文件以获取 sitekey 模式
来自 API 响应的 Sitekey XHR 调用后加载 监控 JSON 响应中 sitekey 的网络请求
多个Turnstile小部件 同一页面上不同的sitekey 将 sitekey 与您提交的特定表单相匹配
影子 DOM 中的Turnstile 无法通过常规选择器访问 在浏览器上下文中使用 shadowRoot.querySelector
服务器端呈现的 sitekey 嵌入模板变量 检查配置对象的 <script> 标签
需登录后才出现的 Turnstile 在公共页面上不可见 先认证,后检测

故障排除

症状 原因 处理方式
找到脚本标签但没有站点密钥 JS API 使用其他来源的配置进行渲染 检查所有链接的 JS 文件和 XHR 响应
提取了错误的站点密钥 页面上有多个验证码小部件 将站点键与周围的表单元素相匹配
检测成功但求解失败 站点在服务器端校验 action 参数 在求解请求中包含 data-action 的值
小部件不在初始 HTML 中 用户交互后动态加载 使用Selenium/Puppeteer进行整页渲染
cf-turnstile-response 字段为空 小部件尚未完成 等待小部件完成加载

常见问题

Turnstile 站点密钥可以更改吗?

是的。站点运营商可以随时轮换站点密钥。始终从页面中提取新鲜的站点密钥,而不是对其进行硬编码。

我需要操作参数吗?

仅当站点在服务器端对其进行验证时。如果 HTML 中存在 data-action,请将其包含在您的解决请求中以获得最佳结果。

如果找不到站点密钥怎么办?

sitekey 可能位于外部 JavaScript 文件、API 响应中或动态生成。使用浏览器 DevTools(网络选项卡)找到它,或者在全页渲染后使用 Selenium/Puppeteer 来提取它。

检测方法是否影响求解?

不会。无论小部件如何实现,CaptchaAI 的 Turnstile 解算器的工作原理都是相同的。您只需要站点密钥和页面 URL。


概括

检测 Cloudflare Turnstile 需要检查 Turnstile 脚本标记、cf-turnstile 容器、data-sitekey 属性和 turnstile.render() 调用。对于简单集成,使用静态 HTML 解析;对于动态加载的小部件,使用 Selenium/Puppeteer。检测到之后,将提取的站点密钥交给 CaptchaAI 的 Turnstile 求解器即可——所有模式的处理方式相同,成功率为 100%。

相关文章

该文章已禁用评论。