HTTPX 是一个现代的 Python HTTP 客户端,支持异步和 HTTP/2。本指南演示如何将其与 CaptchaAI 配合使用,以同步和异步两种方式解决验证码。
要求
| 要求 | 细节 |
|---|---|
| Python | 3.8+ |
| httpx | 0.24+ |
| CaptchaAI API 密钥 | 在这里买一个 |
pip install httpx
同步客户端
import httpx
import time
import os
class CaptchaAISync:
    """Blocking CaptchaAI client built on ``httpx.Client``.

    A task is submitted to ``/in.php`` and then ``/res.php`` is polled until
    the service answers with a solution, an error, or the timeout elapses.
    The instance can be used as a context manager so the underlying HTTP
    client is closed even when an exception escapes.
    """

    def __init__(self, api_key):
        """Remember ``api_key`` and create the HTTP client (30 s timeout)."""
        self.api_key = api_key
        self.base_url = "https://ocr.captchaai.com"
        self.client = httpx.Client(timeout=30)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def solve(self, params, timeout=300, poll_interval=5):
        """Submit a captcha task and block until it is solved.

        Args:
            params: Task parameters sent to ``/in.php``. The dict is not
                modified; the API key is added to a copy.
            timeout: Maximum number of seconds to keep polling.
            poll_interval: Seconds to sleep before each poll.

        Returns:
            Everything after the first ``|`` of the ``OK|...`` response.

        Raises:
            Exception: The submit or poll response was neither ``OK|...``
                nor ``CAPCHA_NOT_READY``.
            TimeoutError: No final answer arrived within ``timeout``.
        """
        # Copy so the caller's dict is not polluted with the API key.
        submit_params = {**params, "key": self.api_key}

        # Submit
        resp = self.client.get(f"{self.base_url}/in.php", params=submit_params)
        text = resp.text.strip()
        if not text.startswith("OK|"):
            raise Exception(f"Submit failed: {text}")
        task_id = text.split("|")[1]

        # Poll; a monotonic clock keeps the deadline immune to wall-clock jumps.
        deadline = time.monotonic() + timeout
        poll_params = {"key": self.api_key, "action": "get", "id": task_id}
        while time.monotonic() < deadline:
            time.sleep(poll_interval)
            result = self.client.get(
                f"{self.base_url}/res.php", params=poll_params
            )
            answer = result.text.strip()
            if answer == "CAPCHA_NOT_READY":
                continue
            if answer.startswith("OK|"):
                return answer.split("|", 1)[1]
            raise Exception(f"Solve failed: {answer}")
        raise TimeoutError(f"Task {task_id} timed out")

    def get_balance(self):
        """Return the ``getbalance`` response parsed as a float.

        Raises:
            ValueError: The response body is not a number.
        """
        resp = self.client.get(f"{self.base_url}/res.php", params={
            "key": self.api_key, "action": "getbalance"
        })
        return float(resp.text)

    def close(self):
        """Close the underlying HTTP client."""
        self.client.close()
# Usage
solver = CaptchaAISync(os.environ["CAPTCHAAI_API_KEY"])
try:
    token = solver.solve({
        "method": "userrecaptcha",
        "googlekey": "6Le-wvkS...",
        "pageurl": "https://example.com",
    })
    print(f"Token: {token[:50]}...")
finally:
    # Close the HTTP client even when solving raises.
    solver.close()
异步客户端
import httpx
import asyncio
import os
class CaptchaAIAsync:
    """Asynchronous CaptchaAI client built on ``httpx.AsyncClient``.

    A task is submitted to ``/in.php`` and then ``/res.php`` is polled until
    the service answers with a solution, an error, or the timeout elapses.
    The instance can be used with ``async with`` so the underlying HTTP
    client is closed even when an exception escapes.
    """

    def __init__(self, api_key):
        """Remember ``api_key`` and create the HTTP client (30 s timeout)."""
        self.api_key = api_key
        self.base_url = "https://ocr.captchaai.com"
        self.client = httpx.AsyncClient(timeout=30)

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.close()

    async def solve(self, params, timeout=300, poll_interval=5):
        """Submit a captcha task and wait, without blocking, for the answer.

        Args:
            params: Task parameters sent to ``/in.php``. The dict is not
                modified; the API key is added to a copy.
            timeout: Maximum number of seconds to keep polling.
            poll_interval: Seconds to sleep before each poll.

        Returns:
            Everything after the first ``|`` of the ``OK|...`` response.

        Raises:
            Exception: The submit or poll response was neither ``OK|...``
                nor ``CAPCHA_NOT_READY``.
            TimeoutError: No final answer arrived within ``timeout``.
        """
        # Copy so the caller's dict is not polluted with the API key.
        submit_params = {**params, "key": self.api_key}

        # Submit
        resp = await self.client.get(
            f"{self.base_url}/in.php", params=submit_params
        )
        text = resp.text.strip()
        if not text.startswith("OK|"):
            raise Exception(f"Submit failed: {text}")
        task_id = text.split("|")[1]

        # Poll; this coroutine always runs inside a loop, so ask for that one.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout
        poll_params = {"key": self.api_key, "action": "get", "id": task_id}
        while loop.time() < deadline:
            await asyncio.sleep(poll_interval)
            result = await self.client.get(
                f"{self.base_url}/res.php", params=poll_params
            )
            answer = result.text.strip()
            if answer == "CAPCHA_NOT_READY":
                continue
            if answer.startswith("OK|"):
                return answer.split("|", 1)[1]
            raise Exception(f"Solve failed: {answer}")
        raise TimeoutError(f"Task {task_id} timed out")

    async def get_balance(self):
        """Return the ``getbalance`` response parsed as a float.

        Raises:
            ValueError: The response body is not a number.
        """
        resp = await self.client.get(f"{self.base_url}/res.php", params={
            "key": self.api_key, "action": "getbalance"
        })
        return float(resp.text)

    async def close(self):
        """Close the underlying async HTTP client."""
        await self.client.aclose()
# Usage
async def main():
    """Solve five reCAPTCHA tasks concurrently and print each outcome."""
    solver = CaptchaAIAsync(os.environ["CAPTCHAAI_API_KEY"])
    try:
        # Solve multiple concurrently
        tasks = [
            solver.solve({
                "method": "userrecaptcha",
                "googlekey": "6Le-wvkS...",
                "pageurl": f"https://example.com/page{i}",
            })
            for i in range(5)
        ]
        # return_exceptions=True hands failures back as values, so every
        # task gets a line in the report below.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for i, r in enumerate(results):
            if isinstance(r, Exception):
                print(f"Page {i}: FAILED - {r}")
            else:
                print(f"Page {i}: solved ({len(r)} chars)")
    finally:
        # Close the HTTP client even if the run is cancelled or fails.
        await solver.close()


asyncio.run(main())
HTTP/2 支持
HTTPX 支持 HTTP/2,减少连接开销:
pip install httpx[http2]
# Async client with HTTP/2 enabled and a 30-second timeout.
client = httpx.AsyncClient(
    timeout=30,
    http2=True,
)
HTTP/2 通过单个连接复用请求,从而提高提交和轮询多个验证码时的性能。
带验证码处理的抓取示例
import asyncio
import os
import re

import httpx
async def scrape_with_captcha(url, solver):
    """Fetch ``url`` and, if it embeds a reCAPTCHA site key, solve and re-submit.

    Args:
        url: Page to download; also used as the POST target.
        solver: Object with an awaitable ``solve(params)`` method that
            returns the captcha token.

    Returns:
        The original page HTML when no ``data-sitekey`` attribute is found,
        otherwise the body of the POST response.
    """
    sitekey_pattern = r'data-sitekey=["\']([A-Za-z0-9_-]+)["\']'

    async with httpx.AsyncClient() as session:
        # Fetch page
        page = await session.get(url)
        markup = page.text

        # Check for reCAPTCHA; nothing to solve if no site key is present.
        found = re.search(sitekey_pattern, markup)
        if found is None:
            return markup

        captcha_request = {
            "method": "userrecaptcha",
            "googlekey": found.group(1),
            "pageurl": url,
        }
        token = await solver.solve(captcha_request)

        # Submit form with token
        form_fields = {"g-recaptcha-response": token}
        submitted = await session.post(url, data=form_fields)
        return submitted.text
async def main():
    """Scrape one page through the captcha-aware helper and print its size."""
    solver = CaptchaAIAsync(os.environ["CAPTCHAAI_API_KEY"])
    try:
        content = await scrape_with_captcha("https://example.com", solver)
        print(f"Got {len(content)} chars")
    finally:
        # Scraping or solving may raise; close the solver's client regardless.
        await solver.close()


asyncio.run(main())
比较:httpx、requests、aiohttp
| 特性 | httpx(同步) | httpx(异步) | requests | aiohttp |
|---|---|---|---|---|
| 异步支持 | ❌ | ✅ | ❌ | ✅ |
| HTTP/2 | ✅ | ✅ | ❌ | ❌ |
| 连接池 | ✅ | ✅ | ✅ | ✅ |
| API 兼容性 | 类似 requests | 类似 requests | —— | 不同 |
| 最适合 | 直接替换 requests | 现代异步代码 | 快速脚本 | 高并发 |
常见问题
我应该用 httpx 代替 requests 吗?
对于新项目来说,是的。httpx 具有与 requests 兼容的 API,并支持异步和 HTTP/2。对于使用 requests 的现有代码,迁移非常简单。
httpx 比 aiohttp 快吗?
aiohttp 对于纯异步工作负载的开销略低。httpx 对于 HTTP/2 连接来说更快,对于同步/异步混合的代码也更方便。
我可以在 Scrapy 中使用 httpx 吗?
不能直接使用——Scrapy 使用 Twisted 的事件循环。请在独立脚本中使用 httpx,或与基于 asyncio 的框架(如 FastAPI)搭配使用。