Colly 是一个流行的 Go 网络抓取框架。以下是如何集成 CaptchaAI 来处理 Go 抓取工具中的验证码。
CaptchaAI Go 客户端
package captchaai
import (
"encoding/json"
"errors"
"fmt"
"net/http"
"net/url"
"strings"
"time"
)
// Client is a CaptchaAI API client. It authenticates requests with APIKey and
// sends them through HTTPClient.
type Client struct {
// APIKey is sent as the "key" parameter of every API request.
APIKey string
// HTTPClient performs the HTTP calls; NewClient sets one with a 30-second timeout.
HTTPClient *http.Client
}
// apiResponse is the JSON envelope decoded from both the submit (in.php) and
// poll (res.php) endpoints.
type apiResponse struct {
// Status is 1 when the call succeeded; the solvers treat any other value as
// "not successful".
Status int `json:"status"`
// Request carries the payload: the task ID after a successful submit, the
// solved token after a successful poll, or an error/status code string such
// as "CAPCHA_NOT_READY" otherwise.
Request string `json:"request"`
}
// NewClient returns a Client that authenticates with apiKey and issues its
// requests through a dedicated http.Client limited to 30 seconds per request.
func NewClient(apiKey string) *Client {
	client := new(Client)
	client.APIKey = apiKey
	client.HTTPClient = &http.Client{Timeout: 30 * time.Second}
	return client
}
// SolveRecaptchaV2 submits a reCAPTCHA v2 task for the widget identified by
// sitekey on pageurl, then polls the service until it returns a token.
//
// The call blocks while waiting: 15 seconds before the first poll, then up to
// 24 polls spaced 5 seconds apart. It returns the solved token, or an error
// when the submission fails or is rejected, the service reports a solve
// error, or no result arrives within the polling budget. Failed or
// undecodable poll responses are retried; the most recent such failure is
// wrapped into the timeout error.
func (c *Client) SolveRecaptchaV2(sitekey, pageurl string) (string, error) {
	// Submit task
	data := url.Values{
		"key":       {c.APIKey},
		"method":    {"userrecaptcha"},
		"googlekey": {sitekey},
		"pageurl":   {pageurl},
		"json":      {"1"},
	}
	resp, err := c.HTTPClient.PostForm("https://ocr.captchaai.com/in.php", data)
	if err != nil {
		return "", fmt.Errorf("submit error: %w", err)
	}
	var result apiResponse
	err = json.NewDecoder(resp.Body).Decode(&result)
	// Close right away instead of deferring: a deferred Close would keep the
	// submit response open for the entire polling phase below.
	resp.Body.Close()
	if err != nil {
		return "", fmt.Errorf("decode error: %w", err)
	}
	if result.Status != 1 {
		return "", fmt.Errorf("submit failed: %s", result.Request)
	}
	taskID := result.Request
	if taskID == "" {
		// Without a task ID there is nothing to poll for.
		return "", errors.New("submit failed: empty task id")
	}

	// Build the poll URL once; url.Values escapes the key and task ID.
	query := url.Values{
		"key":    {c.APIKey},
		"action": {"get"},
		"id":     {taskID},
		"json":   {"1"},
	}
	pollURL := "https://ocr.captchaai.com/res.php?" + query.Encode()

	// Poll for result
	var lastErr error
	time.Sleep(15 * time.Second)
	for i := 0; i < 24; i++ {
		resp, err := c.HTTPClient.Get(pollURL)
		if err != nil {
			lastErr = fmt.Errorf("poll error: %w", err)
			time.Sleep(5 * time.Second)
			continue
		}
		var pollResult apiResponse
		err = json.NewDecoder(resp.Body).Decode(&pollResult)
		resp.Body.Close()
		if err != nil {
			// An unreadable reply says nothing about the task itself, so
			// retry rather than reporting an empty "solve error".
			lastErr = fmt.Errorf("poll decode error: %w", err)
			time.Sleep(5 * time.Second)
			continue
		}
		if pollResult.Status == 1 {
			return pollResult.Request, nil
		}
		if pollResult.Request != "CAPCHA_NOT_READY" {
			return "", fmt.Errorf("solve error: %s", pollResult.Request)
		}
		time.Sleep(5 * time.Second)
	}
	if lastErr != nil {
		return "", fmt.Errorf("solve timeout: %w", lastErr)
	}
	return "", errors.New("solve timeout")
}
Colly 集成
package main
import (
"fmt"
"log"
"os"
"strings"
"github.com/gocolly/colly/v2"
)
// main crawls https://example.com/data with Colly and prints every table row
// it finds. When a page contains an element with a data-sitekey attribute, it
// solves the reCAPTCHA through CaptchaAI and posts the token back to the page.
// The API key is read from the CAPTCHAAI_API_KEY environment variable.
func main() {
	apiKey := os.Getenv("CAPTCHAAI_API_KEY")
	if apiKey == "" {
		// Fail fast: every solve request would be rejected without a key.
		log.Fatal("CAPTCHAAI_API_KEY is not set")
	}
	solver := captchaai.NewClient(apiKey)

	c := colly.NewCollector(
		colly.AllowedDomains("example.com"),
		colly.MaxDepth(2),
	)

	// Detect CAPTCHA pages
	c.OnHTML("[data-sitekey]", func(e *colly.HTMLElement) {
		sitekey := e.Attr("data-sitekey")
		pageURL := e.Request.URL.String()
		log.Printf("CAPTCHA detected on %s, solving...", pageURL)

		token, err := solver.SolveRecaptchaV2(sitekey, pageURL)
		if err != nil {
			log.Printf("Solve failed: %v", err)
			return
		}
		log.Printf("CAPTCHA solved, token length: %d", len(token))

		// Post form with token. Posting through the current request (rather
		// than the collector) continues the same request chain, so MaxDepth
		// limits how many times a page that keeps showing a CAPTCHA is
		// solved and resubmitted.
		err = e.Request.Post(pageURL, map[string]string{
			"g-recaptcha-response": token,
		})
		if err != nil {
			log.Printf("Form submit failed: %v", err)
		}
	})

	// Extract data
	c.OnHTML("table tr", func(e *colly.HTMLElement) {
		var cols []string
		e.ForEach("td", func(_ int, td *colly.HTMLElement) {
			cols = append(cols, strings.TrimSpace(td.Text))
		})
		// Rows without <td> cells (e.g. header rows) produce no output.
		if len(cols) > 0 {
			fmt.Printf("Row: %s\n", strings.Join(cols, " | "))
		}
	})

	c.OnError(func(r *colly.Response, err error) {
		log.Printf("Error %s: %v", r.Request.URL, err)
	})

	if err := c.Visit("https://example.com/data"); err != nil {
		log.Fatalf("Visit failed: %v", err)
	}
}
Colly 与速率限制
package main
import (
	"log"
	"time"

	"github.com/gocolly/colly/v2"
)
// main shows how to throttle a Colly collector before visiting a site.
func main() {
	c := colly.NewCollector()

	// Rate limit: one request at a time per matching domain, waiting 3
	// seconds plus a random extra of up to 2 seconds between requests.
	err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 1,
		Delay:       3 * time.Second,
		RandomDelay: 2 * time.Second,
	})
	if err != nil {
		// An invalid rule would otherwise leave the crawl unthrottled.
		log.Fatalf("Limit failed: %v", err)
	}

	// ... CAPTCHA handling as above ...

	if err := c.Visit("https://example.com"); err != nil {
		log.Fatalf("Visit failed: %v", err)
	}
}
Go 中的 Turnstile 解决方案
// SolveTurnstile submits a Cloudflare Turnstile task for the widget
// identified by sitekey on pageurl, then polls the service until it returns a
// token.
//
// The call blocks while waiting: 5 seconds before the first poll, then up to
// 20 polls spaced 3 seconds apart. It returns the solved token, or an error
// when the submission fails or is rejected, the service reports an error, or
// no result arrives within the polling budget. Failed or undecodable poll
// responses are retried; the most recent such failure is wrapped into the
// timeout error.
func (c *Client) SolveTurnstile(sitekey, pageurl string) (string, error) {
	data := url.Values{
		"key":     {c.APIKey},
		"method":  {"turnstile"},
		"sitekey": {sitekey},
		"pageurl": {pageurl},
		"json":    {"1"},
	}
	resp, err := c.HTTPClient.PostForm("https://ocr.captchaai.com/in.php", data)
	if err != nil {
		return "", fmt.Errorf("submit error: %w", err)
	}
	var result apiResponse
	err = json.NewDecoder(resp.Body).Decode(&result)
	// Close right away instead of deferring: a deferred Close would keep the
	// submit response open for the entire polling phase below.
	resp.Body.Close()
	if err != nil {
		// Report the real cause instead of "submit failed: " with an empty
		// message.
		return "", fmt.Errorf("decode error: %w", err)
	}
	if result.Status != 1 {
		return "", fmt.Errorf("submit failed: %s", result.Request)
	}
	if result.Request == "" {
		// Without a task ID there is nothing to poll for.
		return "", errors.New("submit failed: empty task id")
	}

	// Build the poll URL once; url.Values escapes the key and task ID.
	query := url.Values{
		"key":    {c.APIKey},
		"action": {"get"},
		"id":     {result.Request},
		"json":   {"1"},
	}
	pollURL := "https://ocr.captchaai.com/res.php?" + query.Encode()

	// Poll (same protocol as reCAPTCHA, with shorter waits)
	var lastErr error
	time.Sleep(5 * time.Second)
	for i := 0; i < 20; i++ {
		resp, err := c.HTTPClient.Get(pollURL)
		if err != nil {
			lastErr = fmt.Errorf("poll error: %w", err)
			time.Sleep(3 * time.Second)
			continue
		}
		var pr apiResponse
		err = json.NewDecoder(resp.Body).Decode(&pr)
		resp.Body.Close()
		if err != nil {
			// An unreadable reply says nothing about the task itself, so
			// retry rather than reporting an empty "error".
			lastErr = fmt.Errorf("poll decode error: %w", err)
			time.Sleep(3 * time.Second)
			continue
		}
		if pr.Status == 1 {
			return pr.Request, nil
		}
		if pr.Request != "CAPCHA_NOT_READY" {
			return "", fmt.Errorf("error: %s", pr.Request)
		}
		time.Sleep(3 * time.Second)
	}
	if lastErr != nil {
		return "", fmt.Errorf("timeout: %w", lastErr)
	}
	return "", errors.New("timeout")
}
常见问题
为什么使用 Colly 而不是其他 Go 抓取工具?
Colly 是最流行的 Go 抓取框架,具有内置缓存、速率限制和并发请求处理功能。它与 CaptchaAI 的 HTTP API 配合良好。
我可以将 Colly 与无头浏览器一起使用吗?
对于需要 JavaScript 渲染的页面,请与 Colly 一起使用 chromedp 或 rod。对静态页面使用 Colly,对受验证码保护的动态页面使用无头浏览器。
CaptchaAI 的 API 与 Go 兼容吗?
是的。CaptchaAI 提供标准的 HTTP 端点,可以直接通过 Go 的 net/http 包调用,无需 SDK。
相关指南
- Crawlee + CaptchaAI 集成
- 构建自定义抓取框架
为您的 Go 抓取工具添加验证码求解功能 —— 获取 CaptchaAI。