您的验证码解决管道的目标是 95% 的成功率。上周为 94.2%。这是一个问题吗?如果没有错误预算,您就无法定量地回答这个问题。错误预算准确地告诉您在可靠性低于 SLO 之前可以容忍多少次失败,以及预算用完时该怎么办。
错误预算基础知识
| 概念 | 定义 | 例子 |
|---|---|---|
| SLO | 目标成功率 | 95%成功解决 |
| 错误预算 | 允许的失败率 | 总解决次数的 5% 可以失败 |
| 燃烧率 | 预算消耗的速度有多快 | 2× 表示预算在半个窗口内耗尽 |
| 窗口 | 测量周期 | 24 小时或 7 天滚动窗口 |
如果您的 SLO 为 95%,且 24 小时窗口内共解决 10,000 次,那么您的错误预算就是 500 次失败。一旦失败达到 500 次,就应该停止新的部署或有风险的更改。
Python:错误预算跟踪器
import threading
import time
from collections import deque
from dataclasses import dataclass, field
from enum import Enum
from typing import Callable, Optional
# Placeholder CaptchaAI API key; replace it with a real key before running.
# solve_with_budget sends it as the "key" parameter of every request.
API_KEY = "YOUR_API_KEY"
class BudgetStatus(Enum):
    """Health level of the error budget, ordered from best to worst.

    The percentages in the member comments are the default thresholds from
    SLOConfig (warning at 50%, critical at 10% of the budget remaining).
    """

    # At least half of the budget is still available.
    HEALTHY = "healthy"
    # Between 10% and 50% of the budget is left.
    WARNING = "warning"
    # Less than 10% of the budget is left.
    CRITICAL = "critical"
    # Nothing is left: the allowed failures have all been used.
    EXHAUSTED = "exhausted"
@dataclass
class SLOConfig:
    """Service Level Objective configuration.

    Attributes are fractions in ``[0, 1]`` (0.95 means 95%), except
    ``window_seconds``, which is the length of the rolling window.

    Raises:
        ValueError: if a rate or threshold is outside ``[0, 1]``, the window
            is not positive, or ``critical_threshold`` exceeds
            ``warning_threshold``.
    """

    target_success_rate: float = 0.95  # 95% of attempts must succeed
    window_seconds: int = 86400  # 24 hours
    warning_threshold: float = 0.50  # alert below 50% budget remaining
    critical_threshold: float = 0.10  # alert below 10% budget remaining

    def __post_init__(self):
        # Reject values that would otherwise yield a silently meaningless
        # budget, e.g. a target passed as a percentage (95) instead of 0.95.
        if not 0.0 <= self.target_success_rate <= 1.0:
            raise ValueError(
                "target_success_rate must be between 0 and 1, "
                f"got {self.target_success_rate!r}"
            )
        if self.window_seconds <= 0:
            raise ValueError(
                f"window_seconds must be positive, got {self.window_seconds!r}"
            )
        # A critical threshold above the warning threshold would make the
        # WARNING status unreachable.
        if not 0.0 <= self.critical_threshold <= self.warning_threshold <= 1.0:
            raise ValueError(
                "thresholds must satisfy "
                "0 <= critical_threshold <= warning_threshold <= 1, got "
                f"critical={self.critical_threshold!r}, "
                f"warning={self.warning_threshold!r}"
            )
@dataclass
class ErrorBudgetEvent:
    """A single recorded solve attempt."""

    # Time the attempt was recorded; the tracker stores a time.monotonic()
    # reading here and compares it against the window cutoff.
    timestamp: float
    # True when the attempt succeeded, False when it counted as a failure.
    success: bool
class ErrorBudgetTracker:
    """Track error-budget consumption for CAPTCHA solving over a rolling window.

    Every solve attempt is recorded as a success or a failure. The budget is
    the number of failures the SLO allows for the attempts currently inside
    the window: ``total_events * (1 - target_success_rate)``. Whenever a
    recorded attempt moves the status away from the last notified one, the
    callbacks registered for the new status are invoked.

    A single re-entrant lock guards the event window, so the tracker can be
    shared between threads. Reads prune expired events first, so metrics do
    not go stale while nothing is being recorded.
    """

    # Remaining budget at or below this is treated as zero, so binary
    # floating-point noise (100 * (1 - 0.95) == 5.000000000000004) cannot
    # keep an exactly spent budget from reading as exhausted.
    _EPSILON = 1e-9

    def __init__(self, config: Optional[SLOConfig] = None):
        """Create a tracker.

        Args:
            config: SLO settings. A fresh ``SLOConfig()`` is created when
                omitted, so trackers never share one mutable default object.
        """
        self.config = config if config is not None else SLOConfig()
        self._events: deque[ErrorBudgetEvent] = deque()
        # Running number of failed events in ``_events``; keeps reads O(1).
        self._failures = 0
        # Re-entrant so internal helpers can nest under one acquisition.
        self._lock = threading.RLock()
        self._callbacks: dict[BudgetStatus, list[Callable[[dict], None]]] = {
            status: [] for status in BudgetStatus
        }
        # Last status for which callbacks were fired.
        self._last_status = BudgetStatus.HEALTHY

    def on_status_change(
        self, status: BudgetStatus, callback: Callable[[dict], None]
    ):
        """Register ``callback`` to run when the tracker enters ``status``.

        The callback receives its own copy of the report dict (see
        ``get_report``) taken at the moment of the transition.
        """
        with self._lock:
            self._callbacks[status].append(callback)

    def record(self, success: bool):
        """Record a solve attempt and fire callbacks on a status transition.

        Args:
            success: True if the attempt succeeded, False otherwise.
        """
        with self._lock:
            # Timestamp inside the lock so the deque stays time-ordered.
            now = time.monotonic()
            self._events.append(ErrorBudgetEvent(timestamp=now, success=success))
            if not success:
                self._failures += 1
            self._prune(now)

            total, failures = len(self._events), self._failures
            new_status = self._status_for(total, failures)
            if new_status == self._last_status:
                return
            self._last_status = new_status
            # Snapshot everything the callbacks need while the state is
            # consistent, then release the lock before calling out.
            report = self._report_for(total, failures)
            listeners = list(self._callbacks.get(new_status, []))

        # Callbacks run outside the lock so a slow or re-entrant callback
        # cannot block other threads that are recording attempts.
        for cb in listeners:
            try:
                cb(dict(report))
            except Exception as e:
                print(f"[BUDGET] Callback error: {e}")

    def _prune(self, now: float):
        """Remove events older than the window, keeping the failure count in sync."""
        cutoff = now - self.config.window_seconds
        with self._lock:
            while self._events and self._events[0].timestamp < cutoff:
                if not self._events.popleft().success:
                    self._failures -= 1

    def _counts(self) -> tuple:
        """Return ``(total, failures)`` for the pruned window as one snapshot."""
        with self._lock:
            self._prune(time.monotonic())
            return len(self._events), self._failures

    # --- Pure helpers: derive each metric from a (total, failures) pair ---

    def _budget_for(self, total: int) -> float:
        """Allowed failures for ``total`` events."""
        if total == 0:
            return 0
        return total * (1 - self.config.target_success_rate)

    def _remaining_for(self, total: int, failures: int) -> float:
        """Failures still allowed; never negative."""
        remaining = self._budget_for(total) - failures
        return remaining if remaining > self._EPSILON else 0

    def _fraction_for(self, total: int, failures: int) -> float:
        """Share of the budget still unspent, in ``[0, 1]``."""
        budget = self._budget_for(total)
        if budget <= 0:
            # No budget at all: fine until the first failure, then spent.
            return 1.0 if failures == 0 else 0.0
        return self._remaining_for(total, failures) / budget

    def _burn_for(self, total: int, failures: int) -> float:
        """Actual failures divided by the failures the SLO allows."""
        expected_failures = self._budget_for(total)
        if expected_failures == 0:
            return 0.0
        return failures / expected_failures

    def _status_for(self, total: int, failures: int) -> BudgetStatus:
        """Map the remaining budget fraction to a status."""
        remaining = self._fraction_for(total, failures)
        if remaining <= 0:
            return BudgetStatus.EXHAUSTED
        if remaining < self.config.critical_threshold:
            return BudgetStatus.CRITICAL
        if remaining < self.config.warning_threshold:
            return BudgetStatus.WARNING
        return BudgetStatus.HEALTHY

    def _report_for(self, total: int, failures: int) -> dict:
        """Build the report dict from one consistent snapshot."""
        rate = (total - failures) / total if total > 0 else 1.0
        return {
            "status": self._status_for(total, failures).value,
            "slo_target": self.config.target_success_rate,
            "current_rate": round(rate, 4),
            "total_events": total,
            "successes": total - failures,
            "failures": failures,
            "budget_total": round(self._budget_for(total), 1),
            "budget_remaining": round(self._remaining_for(total, failures), 1),
            "budget_remaining_pct": round(
                self._fraction_for(total, failures) * 100, 1
            ),
            "burn_rate": round(self._burn_for(total, failures), 2),
        }

    def _compute_status(self) -> BudgetStatus:
        """Return the status of the current (pruned) window."""
        return self._status_for(*self._counts())

    @property
    def total_events(self) -> int:
        """Number of attempts inside the window."""
        return self._counts()[0]

    @property
    def success_count(self) -> int:
        """Number of successful attempts inside the window."""
        total, failures = self._counts()
        return total - failures

    @property
    def failure_count(self) -> int:
        """Number of failed attempts inside the window."""
        return self._counts()[1]

    @property
    def current_success_rate(self) -> float:
        """Observed success rate; 1.0 when the window is empty."""
        total, failures = self._counts()
        return (total - failures) / total if total > 0 else 1.0

    @property
    def error_budget_total(self) -> float:
        """Total allowed failures in the window."""
        return self._budget_for(self._counts()[0])

    @property
    def error_budget_remaining(self) -> float:
        """Remaining failure allowance."""
        return self._remaining_for(*self._counts())

    @property
    def remaining_fraction(self) -> float:
        """Fraction of error budget remaining (0.0 to 1.0)."""
        return self._fraction_for(*self._counts())

    @property
    def burn_rate(self) -> float:
        """How fast the budget is being consumed (1.0 = normal, 2.0 = 2× faster)."""
        return self._burn_for(*self._counts())

    def get_report(self) -> dict:
        """Return all metrics for the current window as one consistent snapshot.

        ``status`` is computed from the pruned window at call time, so it
        recovers on its own once old failures age out of the window.
        """
        return self._report_for(*self._counts())
# --- Integration with solver ---
# Shared tracker used by the solver below: 95% target over a one-hour
# window (short on purpose, for the demo).
budget = ErrorBudgetTracker(
    SLOConfig(target_success_rate=0.95, window_seconds=3600)
)


def _alert_warning(report):
    """Print the remaining budget when the WARNING status is entered."""
    print(f"[ALERT] Budget warning: {report['budget_remaining_pct']}% remaining")


def _alert_critical(report):
    """Print the remaining budget when the CRITICAL status is entered."""
    print(f"[ALERT] Budget critical: {report['budget_remaining_pct']}% remaining")


def _alert_exhausted(report):
    """Announce that the budget is spent; the report itself is not used."""
    print("[ALERT] Budget EXHAUSTED — throttle new requests")


# Register alerts
budget.on_status_change(BudgetStatus.WARNING, _alert_warning)
budget.on_status_change(BudgetStatus.CRITICAL, _alert_critical)
budget.on_status_change(BudgetStatus.EXHAUSTED, _alert_exhausted)
def solve_with_budget(params: dict) -> str:
    """Solve a CAPTCHA through the CaptchaAI API while tracking the error budget.

    Each attempt is recorded on ``budget`` exactly once: as a success when a
    token is returned, as a failure when anything inside the attempt raises.

    Args:
        params: Task parameters for the submit request (method, sitekey,
            pageurl, ...). ``key`` and ``json`` are added automatically.

    Returns:
        The solution token from the API's ``request`` field.

    Raises:
        RuntimeError: if the budget is exhausted (nothing is sent or
            recorded), the submit is rejected, the solve fails, or no result
            arrives within 180 seconds.
        Exception: any error raised by ``requests`` or by response parsing
            is recorded as a failure and re-raised unchanged.
    """
    import requests

    # Use the public report rather than reaching into the tracker's private
    # state. A refused call is not an attempt, so it is not recorded.
    if budget.get_report()["status"] == BudgetStatus.EXHAUSTED.value:
        raise RuntimeError("Error budget exhausted — solving paused")

    try:
        submit_params = {**params, "key": API_KEY, "json": 1}
        resp = requests.post(
            "https://ocr.captchaai.com/in.php", data=submit_params, timeout=30
        ).json()

        if resp.get("status") != 1:
            raise RuntimeError(f"Submit: {resp.get('request')}")

        task_id = resp["request"]
        start = time.monotonic()

        while time.monotonic() - start < 180:
            time.sleep(5)
            poll = requests.get("https://ocr.captchaai.com/res.php", params={
                "key": API_KEY, "action": "get", "id": task_id, "json": 1,
            }, timeout=15).json()

            if poll.get("request") == "CAPCHA_NOT_READY":
                continue

            if poll.get("status") == 1:
                budget.record(True)
                return poll["request"]

            raise RuntimeError(f"Solve: {poll.get('request')}")

        raise RuntimeError("Timeout")

    except Exception:
        # Single place where failures are recorded. The failure paths above
        # only raise, so each failed attempt is counted once, not twice.
        budget.record(False)
        raise
# Usage
# Demo task submitted on every iteration; solve_with_budget copies it before
# adding the API key, so one dict can be reused.
demo_params = {
    "method": "turnstile",
    "sitekey": "0x4XXXXXXXXXXXXXXXXX",
    "pageurl": "https://example.com",
}

for i in range(100):
    try:
        token = solve_with_budget(demo_params)
    except RuntimeError as e:
        # Other solve errors are skipped; only an exhausted budget ends the run.
        if "exhausted" not in str(e):
            continue
        print(f"Stopped at iteration {i}")
        break

print(budget.get_report())
JavaScript:错误预算跟踪器
/**
 * Rolling-window error-budget tracker for CAPTCHA solving.
 *
 * Every attempt is recorded as a success or failure. The budget is the
 * number of failures the target rate allows for the attempts currently in
 * the window: total * (1 - targetRate). When a recorded attempt changes the
 * status, the callbacks registered for the new status are invoked.
 */
class ErrorBudgetTracker {
  // Remaining budget at or below this counts as zero, so floating-point
  // noise (100 * (1 - 0.95) === 5.000000000000004) cannot keep an exactly
  // spent budget from reading as exhausted.
  static #EPSILON = 1e-9;

  #events = [];
  #config;
  #callbacks = {};

  /**
   * @param {object} [config]
   * @param {number} [config.targetRate=0.95] Target success rate (0..1).
   * @param {number} [config.windowMs=3600000] Window length in milliseconds.
   * @param {number} [config.warningThreshold=0.5] Remaining fraction below which status is "warning".
   * @param {number} [config.criticalThreshold=0.1] Remaining fraction below which status is "critical".
   */
  constructor(config = {}) {
    // `??` rather than `||`: an explicit 0 (e.g. a threshold of 0 to
    // disable that level) is a valid setting and must not be replaced.
    this.#config = {
      targetRate: config.targetRate ?? 0.95,
      windowMs: config.windowMs ?? 3600_000,
      warningThreshold: config.warningThreshold ?? 0.5,
      criticalThreshold: config.criticalThreshold ?? 0.1,
    };
    this.lastStatus = "healthy";
  }

  /** Register `callback(report)` to run when the tracker enters `status`. */
  on(status, callback) {
    this.#callbacks[status] = this.#callbacks[status] || [];
    this.#callbacks[status].push(callback);
  }

  /**
   * Record one attempt and fire callbacks if the status changed.
   * @param {boolean} success Whether the attempt succeeded.
   */
  record(success) {
    const now = Date.now();
    this.#events.push({ time: now, success });
    this.#prune(now);

    const newStatus = this.#computeStatus();
    if (newStatus !== this.lastStatus) {
      this.lastStatus = newStatus;
      for (const cb of this.#callbacks[newStatus] ?? []) {
        // A failing alert handler must not break recording or stop the
        // remaining handlers from running.
        try {
          cb(this.report());
        } catch (err) {
          console.error(`[BUDGET] Callback error: ${err}`);
        }
      }
    }
  }

  /** Drop events older than the window. */
  #prune(now) {
    const cutoff = now - this.#config.windowMs;
    while (this.#events.length && this.#events[0].time < cutoff) {
      this.#events.shift();
    }
  }

  /** Map the remaining budget fraction to a status name. */
  #computeStatus() {
    const frac = this.remainingFraction;
    if (frac <= 0) return "exhausted";
    if (frac < this.#config.criticalThreshold) return "critical";
    if (frac < this.#config.warningThreshold) return "warning";
    return "healthy";
  }

  get total() { return this.#events.length; }
  get successes() { return this.#events.filter((e) => e.success).length; }
  get failures() { return this.#events.filter((e) => !e.success).length; }
  get currentRate() { return this.total ? this.successes / this.total : 1; }

  /** Failures allowed for the events currently in the window. */
  get budgetTotal() {
    return this.total * (1 - this.#config.targetRate);
  }

  /** Failures still allowed; never negative. */
  get budgetRemaining() {
    const remaining = this.budgetTotal - this.failures;
    return remaining > ErrorBudgetTracker.#EPSILON ? remaining : 0;
  }

  /** Share of the budget still unspent, in [0, 1]. */
  get remainingFraction() {
    const bt = this.budgetTotal;
    // No budget at all: fine until the first failure, then spent.
    if (bt <= 0) return this.failures === 0 ? 1 : 0;
    return Math.max(0, this.budgetRemaining / bt);
  }

  /** Actual failures divided by the failures the target rate allows. */
  get burnRate() {
    const expected = this.total * (1 - this.#config.targetRate);
    return expected > 0 ? this.failures / expected : 0;
  }

  /** Snapshot of the current metrics, rounded for display. */
  report() {
    return {
      status: this.lastStatus,
      currentRate: Math.round(this.currentRate * 10000) / 10000,
      total: this.total,
      failures: this.failures,
      budgetRemainingPct: Math.round(this.remainingFraction * 1000) / 10,
      burnRate: Math.round(this.burnRate * 100) / 100,
    };
  }
}
// Usage
const budget = new ErrorBudgetTracker({ targetRate: 0.95, windowMs: 3600_000 });

// Alert handlers, invoked with the report when the matching status is entered.
const onWarning = (report) => console.log(`[WARN] ${report.budgetRemainingPct}% budget left`);
const onExhausted = () => console.log("[ALERT] Budget exhausted!");
budget.on("warning", onWarning);
budget.on("exhausted", onExhausted);

// Record results from your solver
budget.record(true); // success
budget.record(false); // failure

console.log(budget.report());
燃烧率警报
| 燃烧率 | 意义 | 行动 |
|---|---|---|
| < 1.0 | 消耗速度比预期慢 | 无需采取任何行动 |
| 1.0 | 预算恰好在窗口结束时耗尽 | 密切监控 |
| 2.0 | 预算在半个窗口期就耗尽了 | 调查并放慢速度 |
| 5.0+ | 预算消耗过快 | 暂停非关键的解决任务 |
故障排除
| 问题 | 原因 | 处理方式 |
|---|---|---|
| 预算用得太快 | SLO 对于实际情况来说太紧 | 根据历史数据设置现实的 SLO |
| 预算从未消耗过 | SLO 过于慷慨 | 收紧 SLO 以推动可靠性改进 |
| 状态在各级别之间来回波动 | 窗口太短 | 使用更长的测量窗口(例如用 24 小时代替 1 小时) |
| 低容量时的燃烧率会产生误导 | 事件太少,少数几次结果就会左右计算 | 在计算燃烧率之前要求达到最小事件数 |
| 预算跟踪器内存增长 | 未修剪的事件 | 验证 _prune 在每个 record() 调用上运行 |
常问问题
验证码解决的实际 SLO 是什么?
取决于验证码类型。 reCAPTCHA v2 通常可以达到 90-95% 的解决率。Turnstile可能会更高。图像验证码各不相同。首先测量当前的成功率,然后将 SLO 设置为低于该基线 2-3%,以创建有意义的错误预算。
当错误预算耗尽时会发生什么?
按从最温和到最激进的顺序,可选措施有:提醒团队、限制新请求、暂停非必要的解决任务、切换到手动验证码处理。永远不要默默地忽视耗尽的预算。
如何处理跨多种验证码类型的错误预算?
跟踪每种类型的单独预算。 reCAPTCHA 的 SLO 可能为 93%,而 Turnstile 的 SLO 为 97%。将它们汇总到一份预算中隐藏了特定类型的问题。
相关文章
下一步
定量跟踪您的验证码解决可靠性——获取您的 CaptchaAI API 密钥并实施错误预算跟踪。
相关指南:
- CAPTCHA API 调用的断路器模式
- CAPTCHA 工作人员的健康检查端点
- 使用 Prometheus 和 Grafana 监控验证码解决率