集成指南

Colly + CaptchaAI:使用验证码解决方案进行基于 Go 的抓取

Colly 是一个流行的 Go 网络抓取框架。以下是如何集成 CaptchaAI 来处理 Go 抓取工具中的验证码。


CaptchaAI Go 客户端

package captchaai

import (
    "encoding/json"
    "errors"
    "fmt"
    "net/http"
    "net/url"
    "strings"
    "time"
)

// Client talks to the CaptchaAI HTTP API.
type Client struct {
    // APIKey is sent as the "key" parameter on every API request.
    APIKey     string
    // HTTPClient performs all API calls. NewClient sets one with a
    // 30-second timeout.
    HTTPClient *http.Client
}

// apiResponse is the JSON envelope decoded from both the submit (in.php)
// and the poll (res.php) endpoints.
type apiResponse struct {
    // Status is 1 on success; any other value is treated as "not successful".
    Status  int    `json:"status"`
    // Request carries the payload: the task ID after a successful submit,
    // the solved token after a successful poll, and otherwise a status or
    // error code such as "CAPCHA_NOT_READY".
    Request string `json:"request"`
}

// NewClient builds a Client for the given API key. The returned Client owns
// a dedicated http.Client whose requests time out after 30 seconds.
func NewClient(apiKey string) *Client {
    client := new(Client)
    client.APIKey = apiKey
    client.HTTPClient = &http.Client{Timeout: 30 * time.Second}
    return client
}

// SolveRecaptchaV2 submits a reCAPTCHA v2 task to CaptchaAI and polls until
// the solution token is available.
//
// sitekey is the reCAPTCHA site key of the target page and pageurl is the
// URL of the page that shows the CAPTCHA. The call blocks: it waits 15
// seconds after a successful submit, then polls up to 24 times, 5 seconds
// apart.
//
// It returns the solved token, or an error when the submit request fails,
// a response cannot be decoded, the API reports an error code, or no
// solution arrives within the polling budget ("solve timeout").
func (c *Client) SolveRecaptchaV2(sitekey, pageurl string) (string, error) {
    // Tolerate a Client built as a struct literal without an HTTP client;
    // the fallback mirrors the timeout used by NewClient.
    httpClient := c.HTTPClient
    if httpClient == nil {
        httpClient = &http.Client{Timeout: 30 * time.Second}
    }

    // Submit task
    data := url.Values{
        "key":       {c.APIKey},
        "method":    {"userrecaptcha"},
        "googlekey": {sitekey},
        "pageurl":   {pageurl},
        "json":      {"1"},
    }

    resp, err := httpClient.PostForm("https://ocr.captchaai.com/in.php", data)
    if err != nil {
        return "", fmt.Errorf("submit error: %w", err)
    }

    // Close the submit body right after decoding instead of deferring, so
    // the connection is not pinned for the whole polling phase.
    var result apiResponse
    err = json.NewDecoder(resp.Body).Decode(&result)
    resp.Body.Close()
    if err != nil {
        return "", fmt.Errorf("decode error: %w", err)
    }

    if result.Status != 1 {
        return "", fmt.Errorf("submit failed: %s", result.Request)
    }

    taskID := result.Request
    if taskID == "" {
        // Polling with an empty ID can never succeed; fail fast.
        return "", errors.New("submit failed: empty task id")
    }

    // Build the poll URL once. url.Values escapes the key and task ID, which
    // plain string formatting would not.
    query := url.Values{
        "key":    {c.APIKey},
        "action": {"get"},
        "id":     {taskID},
        "json":   {"1"},
    }
    pollURL := "https://ocr.captchaai.com/res.php?" + query.Encode()

    // Poll for result
    time.Sleep(15 * time.Second)

    for i := 0; i < 24; i++ {
        resp, err := httpClient.Get(pollURL)
        if err != nil {
            // Network failures are treated as transient: best-effort retry.
            time.Sleep(5 * time.Second)
            continue
        }

        var pollResult apiResponse
        err = json.NewDecoder(resp.Body).Decode(&pollResult)
        resp.Body.Close()
        if err != nil {
            // Report the real cause instead of an empty "solve error: ".
            return "", fmt.Errorf("poll decode error: %w", err)
        }

        if pollResult.Status == 1 {
            return pollResult.Request, nil
        }
        // "CAPCHA_NOT_READY" is the literal code compared against here
        // (spelling as sent by the API); any other code is a hard failure.
        if pollResult.Request != "CAPCHA_NOT_READY" {
            return "", fmt.Errorf("solve error: %s", pollResult.Request)
        }

        time.Sleep(5 * time.Second)
    }

    return "", errors.New("solve timeout")
}

Colly 集成

package main

import (
    "fmt"
    "log"
    "os"
    "strings"

    "github.com/gocolly/colly/v2"
)

// main crawls https://example.com/data with Colly. When a page contains an
// element with a data-sitekey attribute, it solves the CAPTCHA through
// CaptchaAI and re-submits the page with the token; table rows found on any
// page are printed to stdout.
//
// NOTE(review): captchaai is referenced below but does not appear in this
// snippet's import block; import the client package from wherever it lives
// in your module.
func main() {
    apiKey := os.Getenv("CAPTCHAAI_API_KEY")
    if apiKey == "" {
        // Fail fast instead of sending unauthenticated solve requests.
        log.Fatal("CAPTCHAAI_API_KEY is not set")
    }
    solver := captchaai.NewClient(apiKey)

    c := colly.NewCollector(
        colly.AllowedDomains("example.com"),
        colly.MaxDepth(2),
    )

    // Detect CAPTCHA pages
    c.OnHTML("[data-sitekey]", func(e *colly.HTMLElement) {
        sitekey := e.Attr("data-sitekey")
        pageURL := e.Request.URL.String()

        log.Printf("CAPTCHA detected on %s, solving...", pageURL)

        // Blocks this callback until the solver returns a token or gives up.
        token, err := solver.SolveRecaptchaV2(sitekey, pageURL)
        if err != nil {
            log.Printf("Solve failed: %v", err)
            return
        }

        log.Printf("CAPTCHA solved, token length: %d", len(token))

        // Post form with token. Going through e.Request (rather than c.Post)
        // carries the crawl depth forward, so MaxDepth stops the
        // solve-and-resubmit chain if the site keeps answering with a
        // CAPTCHA; c.Post would restart at depth 1 every time.
        err = e.Request.Post(pageURL, map[string]string{
            "g-recaptcha-response": token,
        })
        if err != nil {
            log.Printf("Form submit failed: %v", err)
        }
    })

    // Extract data
    c.OnHTML("table tr", func(e *colly.HTMLElement) {
        var cols []string
        e.ForEach("td", func(_ int, td *colly.HTMLElement) {
            cols = append(cols, strings.TrimSpace(td.Text))
        })
        // Rows without <td> cells (e.g. header rows) are skipped.
        if len(cols) > 0 {
            fmt.Printf("Row: %s\n", strings.Join(cols, " | "))
        }
    })

    c.OnError(func(r *colly.Response, err error) {
        log.Printf("Error %s: %v", r.Request.URL, err)
    })

    if err := c.Visit("https://example.com/data"); err != nil {
        log.Fatalf("Visit failed: %v", err)
    }
}

Colly 与速率限制

package main

import (
    "log"
    "time"

    "github.com/gocolly/colly/v2"
)

// main shows how to throttle a Colly collector before visiting a site.
func main() {
    c := colly.NewCollector()

    // Rate limit: one request at a time per matching domain, with a pause
    // of 3 seconds plus up to 2 seconds of random jitter between requests.
    err := c.Limit(&colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: 1,
        Delay:       3 * time.Second,
        RandomDelay: 2 * time.Second,
    })
    if err != nil {
        // Without the rule the crawl would run unthrottled, so stop here.
        log.Fatalf("Limit rule rejected: %v", err)
    }

    // ... CAPTCHA handling as above ...

    if err := c.Visit("https://example.com"); err != nil {
        log.Fatalf("Visit failed: %v", err)
    }
}

Go 中的 Turnstile 解决方案

// SolveTurnstile submits a Turnstile task to CaptchaAI and polls until the
// solution token is available.
//
// sitekey is the Turnstile site key of the target page and pageurl is the
// URL of the page that shows the challenge. The call blocks: it waits 5
// seconds after a successful submit, then polls up to 20 times, 3 seconds
// apart.
//
// It returns the solved token, or an error when the submit request fails,
// a response cannot be decoded, the API reports an error code, or no
// solution arrives within the polling budget ("timeout").
func (c *Client) SolveTurnstile(sitekey, pageurl string) (string, error) {
    // Tolerate a Client built as a struct literal without an HTTP client.
    httpClient := c.HTTPClient
    if httpClient == nil {
        httpClient = &http.Client{Timeout: 30 * time.Second}
    }

    data := url.Values{
        "key":     {c.APIKey},
        "method":  {"turnstile"},
        "sitekey": {sitekey},
        "pageurl": {pageurl},
        "json":    {"1"},
    }

    resp, err := httpClient.PostForm("https://ocr.captchaai.com/in.php", data)
    if err != nil {
        return "", fmt.Errorf("submit error: %w", err)
    }

    // Decode, then close immediately so the submit connection is not held
    // open during polling. A body that is not valid JSON is reported as a
    // decode error rather than as an empty "submit failed: ".
    var result apiResponse
    err = json.NewDecoder(resp.Body).Decode(&result)
    resp.Body.Close()
    if err != nil {
        return "", fmt.Errorf("decode error: %w", err)
    }

    if result.Status != 1 {
        return "", fmt.Errorf("submit failed: %s", result.Request)
    }
    if result.Request == "" {
        // Polling with an empty ID can never succeed; fail fast.
        return "", errors.New("submit failed: empty task id")
    }

    // Build the poll URL once; url.Values escapes the key and task ID.
    query := url.Values{
        "key":    {c.APIKey},
        "action": {"get"},
        "id":     {result.Request},
        "json":   {"1"},
    }
    pollURL := "https://ocr.captchaai.com/res.php?" + query.Encode()

    // Poll for result
    time.Sleep(5 * time.Second)
    for i := 0; i < 20; i++ {
        resp, err := httpClient.Get(pollURL)
        if err != nil {
            // Network failures are treated as transient: best-effort retry.
            time.Sleep(3 * time.Second)
            continue
        }

        var pr apiResponse
        err = json.NewDecoder(resp.Body).Decode(&pr)
        resp.Body.Close()
        if err != nil {
            return "", fmt.Errorf("poll decode error: %w", err)
        }

        if pr.Status == 1 {
            return pr.Request, nil
        }
        // "CAPCHA_NOT_READY" is the literal code compared against here
        // (spelling as sent by the API); any other code is a hard failure.
        if pr.Request != "CAPCHA_NOT_READY" {
            return "", fmt.Errorf("error: %s", pr.Request)
        }
        time.Sleep(3 * time.Second)
    }

    return "", errors.New("timeout")
}

常见问题

为什么使用 Colly 而不是其他 Go 抓取工具?

Colly 是最流行的 Go 抓取框架,具有内置缓存、速率限制和并发请求处理功能。它与 CaptchaAI 的 HTTP API 配合良好。

我可以将 Colly 与无头浏览器一起使用吗?

对于需要 JavaScript 渲染的页面,请将 chromedp 与 Colly 搭配使用。对静态页面使用 Colly,对受验证码保护的动态页面使用无头浏览器。

CaptchaAI 的 API 与 Go 兼容吗?

是的。CaptchaAI 提供标准的 HTTP 端点,可直接通过 Go 的 net/http 包调用,无需 SDK。


相关指南


为您的 Go 抓取工具添加验证码解决功能 — 获取 CaptchaAI。

该文章已禁用评论。