Node.js 擅长 I/O 密集型的抓取任务。当目标站点出现验证码时,CaptchaAI 的 API 负责求解验证码,而您的脚本继续处理 HTTP 请求。本教程介绍使用 axios 和 Cheerio 的完整工作流程。
要求
| 要求 | 细节 |
|---|---|
| Node.js 16+ | 附带 npm |
| axios | npm install axios |
| cheerio | npm install cheerio |
| CaptchaAI API 密钥 | 从 CaptchaAI 网站获取 |
CaptchaAI 求解器模块
// captcha-solver.js
const axios = require("axios");
/**
 * Small client for the CaptchaAI HTTP API: a task is submitted to `in.php`
 * and its result is polled from `res.php`.
 */
class CaptchaSolver {
  /**
   * @param {string} apiKey - API key sent as the `key` parameter of every request.
   */
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.baseUrl = "https://ocr.captchaai.com";
  }

  /**
   * Converts a response body to text so the `OK|...` checks never run on a
   * non-string value (axios hands back parsed objects for JSON bodies).
   *
   * @param {*} data - `resp.data` as returned by axios.
   * @returns {string} The body itself when it is a string, otherwise its JSON
   *   (or `String()`) representation.
   */
  static _asText(data) {
    if (typeof data === "string") return data;
    const json = JSON.stringify(data);
    return json === undefined ? String(data) : json;
  }

  /**
   * Submits a task to `in.php`.
   *
   * @param {Object} params - Task parameters (method, site key, page URL, ...).
   *   The object is copied, never modified.
   * @returns {Promise<string>} Everything after the `OK|` prefix (the task id).
   * @throws {Error} `Submit error: <body>` when the response does not start with `OK|`.
   */
  async _submit(params) {
    // Copy instead of assigning `key` onto the caller's object.
    const query = { ...params, key: this.apiKey };
    const resp = await axios.get(`${this.baseUrl}/in.php`, { params: query });
    const body = CaptchaSolver._asText(resp.data);
    if (!body.startsWith("OK|")) {
      throw new Error(`Submit error: ${body}`);
    }
    // slice() keeps the whole payload even if it contains another "|".
    return body.slice("OK|".length);
  }

  /**
   * Polls `res.php` until the task is solved, fails, or the deadline passes.
   * Each iteration waits `interval` ms before asking, so the first status
   * request is sent one interval after the call.
   *
   * @param {string} taskId - Id returned by `_submit`.
   * @param {number} [timeout=300000] - Maximum time to keep polling, in ms.
   * @param {number} [interval=5000] - Delay before each status request, in ms.
   * @returns {Promise<string>} Everything after the `OK|` prefix (the solution).
   * @throws {Error} `Solve error: <body>` on any reply other than
   *   `CAPCHA_NOT_READY` or `OK|...`; `Solve timed out` once the deadline passes.
   */
  async _poll(taskId, timeout = 300000, interval = 5000) {
    const deadline = Date.now() + timeout;
    while (Date.now() < deadline) {
      await new Promise((resolve) => setTimeout(resolve, interval));
      const resp = await axios.get(`${this.baseUrl}/res.php`, {
        params: { key: this.apiKey, action: "get", id: taskId },
      });
      const body = CaptchaSolver._asText(resp.data);
      // "CAPCHA_NOT_READY" (sic) is the literal status string compared against.
      if (body === "CAPCHA_NOT_READY") continue;
      if (body.startsWith("OK|")) return body.slice("OK|".length);
      throw new Error(`Solve error: ${body}`);
    }
    throw new Error("Solve timed out");
  }

  /**
   * Solves a reCAPTCHA v2 challenge.
   *
   * @param {string} siteKey - Value sent as `googlekey`.
   * @param {string} pageUrl - Value sent as `pageurl`.
   * @returns {Promise<string>} The solution token.
   */
  async solveRecaptchaV2(siteKey, pageUrl) {
    const taskId = await this._submit({
      method: "userrecaptcha",
      googlekey: siteKey,
      pageurl: pageUrl,
    });
    return this._poll(taskId);
  }

  /**
   * Solves a reCAPTCHA v3 challenge.
   *
   * @param {string} siteKey - Value sent as `googlekey`.
   * @param {string} pageUrl - Value sent as `pageurl`.
   * @param {string} [action="verify"] - Value sent as `action`.
   * @returns {Promise<string>} The solution token.
   */
  async solveRecaptchaV3(siteKey, pageUrl, action = "verify") {
    const taskId = await this._submit({
      method: "userrecaptcha",
      googlekey: siteKey,
      pageurl: pageUrl,
      version: "v3",
      action,
    });
    return this._poll(taskId);
  }

  /**
   * Solves a Turnstile challenge.
   *
   * @param {string} siteKey - Value sent as `sitekey`.
   * @param {string} pageUrl - Value sent as `pageurl`.
   * @returns {Promise<string>} The solution token.
   */
  async solveTurnstile(siteKey, pageUrl) {
    const taskId = await this._submit({
      method: "turnstile",
      sitekey: siteKey,
      pageurl: pageUrl,
    });
    return this._poll(taskId);
  }
}
module.exports = CaptchaSolver;
抓取受 reCAPTCHA 保护的页面
const axios = require("axios");
const cheerio = require("cheerio");
const CaptchaSolver = require("./captcha-solver");
// Module-level solver instance; "YOUR_API_KEY" is a placeholder for a real API key.
const solver = new CaptchaSolver("YOUR_API_KEY");
/**
 * Loads a page, and if it embeds a reCAPTCHA widget (`.g-recaptcha` with a
 * `data-sitekey` attribute), solves it and re-submits the page as a form POST.
 *
 * @param {string} url - Page to fetch and, if needed, to post the form to.
 * @param {Object<string, string>} [formFields={ q: "search query" }] - Extra
 *   form fields posted together with the CAPTCHA token.
 * @returns {Promise<*>} The HTML of the first response when no site key is
 *   found, otherwise the body of the POST response.
 */
async function scrapeProtectedPage(url, formFields = { q: "search query" }) {
  // One header set shared by the GET and the POST.
  const browserHeaders = {
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  };

  // Step 1: Load the page
  const { data: html } = await axios.get(url, { headers: browserHeaders });
  const $ = cheerio.load(html);

  // Step 2: Extract site key
  const siteKey = $(".g-recaptcha").attr("data-sitekey");
  if (!siteKey) {
    console.log("No CAPTCHA found, page loaded directly");
    return html;
  }
  console.log("Site key found:", siteKey);

  // Step 3: Solve the CAPTCHA
  const token = await solver.solveRecaptchaV2(siteKey, url);
  console.log("Token received:", token.substring(0, 50));

  // Step 4: Submit with the token
  const form = new URLSearchParams({
    "g-recaptcha-response": token,
    ...formFields,
  });
  // Re-set the token so a same-named entry in formFields cannot replace it.
  form.set("g-recaptcha-response", token);

  const result = await axios.post(url, form, {
    headers: {
      "Content-Type": "application/x-www-form-urlencoded",
      ...browserHeaders,
    },
  });
  return result.data;
}
同时抓取多个页面
/**
 * Solves a reCAPTCHA v2 challenge for each URL and posts the token to it,
 * running up to `concurrency` workers over a shared list of pending URLs.
 *
 * @param {Iterable<string>} urls - Pages to process; the input is copied, not consumed.
 * @param {string} siteKey - Site key passed to the solver for every URL.
 * @param {number} [concurrency=3] - Number of workers started.
 * @returns {Promise<Array<Object>>} One record per URL, in completion order:
 *   `{ url, data, success: true }` or `{ url, error, success: false }`.
 */
async function scrapePages(urls, siteKey, concurrency = 3) {
  const outcomes = [];
  const pending = [...urls];
  let cursor = 0;

  // Each worker claims the next unclaimed URL until none are left.
  async function drain() {
    while (cursor < pending.length) {
      const target = pending[cursor];
      cursor += 1;
      try {
        const captchaToken = await solver.solveRecaptchaV2(siteKey, target);
        const form = new URLSearchParams({
          "g-recaptcha-response": captchaToken,
        });
        const requestOptions = {
          headers: {
            "User-Agent":
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
          },
        };
        const { data } = await axios.post(target, form, requestOptions);
        outcomes.push({ url: target, data, success: true });
        console.log(`Scraped: ${target}`);
      } catch (failure) {
        // A failed URL is recorded and the worker moves on to the next one.
        outcomes.push({ url: target, error: failure.message, success: false });
        console.error(`Failed: ${target} - ${failure.message}`);
      }
    }
  }

  // Run workers concurrently
  const pool = Array.from(new Array(concurrency), () => drain());
  await Promise.all(pool);
  return outcomes;
}
// Usage
const urls = [
  "https://example.com/page/1",
  "https://example.com/page/2",
  "https://example.com/page/3",
];

// Top-level `await` is a syntax error in a CommonJS module (these snippets
// load their dependencies with `require`), so the call runs inside an async
// entry point instead.
async function main() {
  const results = await scrapePages(urls, "6Le-wvkS...", 3);
  const succeeded = results.filter((entry) => entry.success).length;
  console.log(`Scraped ${succeeded}/${results.length} pages`);
  return results;
}

// Report an unexpected rejection and signal failure through the exit code.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
处理 Cookie 和会话
对于需要会话 cookie 的站点,使用具有 cookie 持久性的 axios:
const { wrapper } = require("axios-cookiejar-support");
const { CookieJar } = require("tough-cookie");
// Cookie jar handed to the axios instance below.
const jar = new CookieJar();

// Instance defaults: the jar plus a browser-like User-Agent header.
const sessionConfig = {
  jar,
  headers: {
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  },
};

// axios instance passed through the axios-cookiejar-support wrapper.
const client = wrapper(axios.create(sessionConfig));
/**
 * Fetches `url` through the shared `client`, solves the reCAPTCHA v2
 * challenge, then posts the token back to the same URL through that client.
 *
 * @param {string} url - Page to load and to post the token to.
 * @param {string} siteKey - Site key passed to the solver.
 * @returns {Promise<*>} Body of the POST response.
 */
async function scrapeWithSession(url, siteKey) {
  // The response of the first request is not used; it is made so that any
  // cookies the site sets are in place before the POST.
  await client.get(url);

  const captchaToken = await solver.solveRecaptchaV2(siteKey, url);

  const form = new URLSearchParams();
  form.append("g-recaptcha-response", captchaToken);

  // Same client as the initial GET.
  const { data } = await client.post(url, form);
  return data;
}
使用 Cheerio 解析结果
/**
 * Extracts one record per `.result-item` element from an HTML document.
 *
 * @param {string} html - Markup to parse with Cheerio.
 * @returns {Array<{title: string, url: (string|undefined), description: string}>}
 *   Trimmed `.title` text, the `href` of the `a` match, and trimmed
 *   `.description` text for each item, in document order.
 */
function parseResults(html) {
  const $ = cheerio.load(html);
  return $(".result-item")
    .toArray()
    .map((node) => {
      const item = $(node);
      return {
        title: item.find(".title").text().trim(),
        url: item.find("a").attr("href"),
        description: item.find(".description").text().trim(),
      };
    });
}
故障排除
| 问题 | 原因 | 处理方式 |
|---|---|---|
| CAPCHA_NOT_READY 无限循环 | 站点密钥错误或解决速度慢 | 验证站点密钥;增加超时时间 |
| POST 请求返回 403 Forbidden | 缺少 cookie 或标头 | 使用会话 cookie;添加 Referer 标头 |
| Cheerio 找不到元素 | 动态内容 | 将 Puppeteer 用于 JS 渲染的网站 |
| ECONNREFUSED | 速率受目标站点限制 | 添加延迟;QA 测试会话 |
常见问题
我什么时候应该使用 Puppeteer 而不是 axios?
当目标站点返回带有标准表单提交的 HTML 时,请使用 axios + Cheerio。当网站需要 JavaScript 执行、动态渲染或复杂的用户交互时,请使用 Puppeteer。
我可以同时解决多个验证码吗?
是的。同时向 CaptchaAI 提交多个验证码任务并轮询每个结果。上面的并发抓取示例演示了这种模式。
如何处理受 Cloudflare 保护的网站?
如果站点使用 Turnstile,请使用 solver.solveTurnstile()。对于完整的 Cloudflare 挑战页面,请使用 Cloudflare 验证流程来解决,该流程会返回 qa_session_cookie cookie。
相关指南
- 使用 Node.js 和 Puppeteer 解决验证码
- 使用 Python 进行验证码抓取
- 用于验证码抓取的代理轮换