广告验证需要访问数千个网页来检查广告投放、品牌安全和合规性。许多发布商网站使用验证码来阻止自动检查。 CaptchaAI 让您的验证管道保持运行。
广告验证检查哪些内容
| 查看 | 描述 | 为什么验证码会阻止它 |
|---|---|---|
| 广告投放 | 广告是否显示在首屏? | 自动页面访问触发机器人检测 |
| 品牌安全 | 有害内容旁边没有广告 | 批量 URL 检查类似于抓取 |
| 可见度 | 广告确实可见吗? | Cloudflare 标记的无头浏览器 |
| 地理定位 | 正确的广告在正确的区域 | 代理流量触发验证码 |
| 竞争对手监控 | 竞争对手展示什么广告? | 大容量广告查找 |
执行
import requests
import time
import re
import json
import os
from datetime import datetime
API_KEY = os.environ["CAPTCHAAI_API_KEY"]
def solve_captcha(method, params):
params["key"] = API_KEY
params["method"] = method
resp = requests.get("https://ocr.captchaai.com/in.php", params=params)
if not resp.text.startswith("OK|"):
raise Exception(resp.text)
task_id = resp.text.split("|")[1]
for _ in range(60):
time.sleep(5)
result = requests.get("https://ocr.captchaai.com/res.php", params={
"key": API_KEY, "action": "get", "id": task_id,
})
if result.text == "CAPCHA_NOT_READY":
continue
if result.text.startswith("OK|"):
return result.text.split("|", 1)[1]
raise Exception(result.text)
raise TimeoutError()
def verify_ad_placement(url, session):
"""Verify ad placement on a publisher page."""
resp = session.get(url)
# Solve CAPTCHA if present
match = re.search(r'data-sitekey=["\']([A-Za-z0-9_-]+)["\']', resp.text)
if match:
token = solve_captcha("userrecaptcha", {
"googlekey": match.group(1),
"pageurl": url,
})
resp = session.post(url, data={"g-recaptcha-response": token})
html = resp.text
# Check for ad elements
result = {
"url": url,
"timestamp": datetime.utcnow().isoformat(),
"ads_found": [],
"brand_safety": True,
"captcha_solved": match is not None,
}
# Detect ad tags
ad_patterns = [
(r'googletag\.pubads', "Google Ad Manager"),
(r'doubleclick\.net', "DFP/DoubleClick"),
(r'ad\.doubleclick', "DoubleClick"),
(r'amazon-adsystem', "Amazon Ads"),
(r'criteo\.com/.*\.js', "Criteo"),
]
for pattern, name in ad_patterns:
if re.search(pattern, html):
result["ads_found"].append(name)
# Brand safety check — flag problematic content
safety_keywords = [
"violence", "hate speech", "explicit",
"gambling", "illegal",
]
page_text = re.sub(r'<[^>]+>', '', html).lower()
for keyword in safety_keywords:
if keyword in page_text:
result["brand_safety"] = False
break
return result
def run_verification(urls, output_file="verification_report.json"):
"""Run ad verification across multiple publisher URLs."""
session = requests.Session()
session.headers["User-Agent"] = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 Chrome/120.0.0.0"
)
results = []
for i, url in enumerate(urls):
try:
result = verify_ad_placement(url, session)
results.append(result)
ads = ", ".join(result["ads_found"]) or "None"
safe = "SAFE" if result["brand_safety"] else "UNSAFE"
print(f" [{i+1}/{len(urls)}] {url}: {ads} [{safe}]")
except Exception as e:
results.append({
"url": url,
"error": str(e),
"timestamp": datetime.utcnow().isoformat(),
})
print(f" [{i+1}/{len(urls)}] {url}: ERROR - {e}")
time.sleep(2)
with open(output_file, "w") as f:
json.dump(results, f, indent=2)
# Summary
total = len(results)
safe = sum(1 for r in results if r.get("brand_safety"))
captchas = sum(1 for r in results if r.get("captcha_solved"))
errors = sum(1 for r in results if "error" in r)
print(f"\n Total: {total} | Safe: {safe} | CAPTCHAs solved: {captchas} | Errors: {errors}")
return results
# Publisher URLs to verify
publisher_urls = [
"https://publisher1.com/article/tech-news",
"https://publisher2.com/sports/latest",
"https://publisher3.com/finance/markets",
]
run_verification(publisher_urls)
使用受 Cloudflare 保护的发布商进行扩展
许多优质发布商都使用 Cloudflare。应对Turnstile和全面挑战:
def handle_cloudflare(url, session):
"""Handle Cloudflare-protected publisher pages."""
resp = session.get(url)
if "cf-turnstile" in resp.text:
match = re.search(r'data-sitekey=["\']([^"\']+)', resp.text)
if match:
token = solve_captcha("turnstile", {
"sitekey": match.group(1),
"pageurl": url,
})
return session.post(url, data={
"cf-turnstile-response": token,
})
if resp.status_code == 403 and "cf-browser-verification" in resp.text:
data = solve_captcha("cloudflare_challenge", {
"pageurl": url,
"proxy": "user:pass@proxy:port",
"proxytype": "HTTP",
})
# Parse qa_session_cookie and use same proxy
return data
return resp
常问问题
我每小时可以验证多少页?
使用 CaptchaAI,您每小时可以验证 200-500 页,具体取决于验证码频率和解决时间。
这适用于视频广告验证吗?
此方法适用于展示广告和原生广告。视频广告验证通常需要使用 Selenium 或 Playwright 进行浏览器渲染。
不同地区如何处理?
使用目标地区的代理。 CaptchaAI 支持代理参数,因此求解上下文与您的地理定位相匹配。
相关指南
- 抓取受保护的网站
- 代理设置最佳实践
- 无头浏览器验证码问题