凌晨 3 点发生的验证码求解中断可能导致数小时的数据丢失。PagerDuty 集成可确保合适的人员立即收到通知,并获得足够的上下文来诊断和解决问题,而无需翻查日志。
警报策略
| 严重性 | 条件 | PagerDuty 操作 |
|---|---|---|
| 严重 | 余额 < 2 美元 | 呼叫值班工程师 |
| 严重 | 所有 worker 宕机 | 呼叫值班工程师 |
| 高 | 错误率 > 20%,持续 5 分钟 | 创建高紧急度事件 |
| 警告 | 余额 < 10 美元 | 创建低紧急度事件 |
| 警告 | 队列深度 > 100,持续 10 分钟 | 创建低紧急度事件 |
| 信息 | 求解延迟 p95 > 120s | 添加到现有事件或记录日志 |
Python – PagerDuty 事件 API v2
import hashlib
import logging
import os
import time
from datetime import datetime, timezone

import requests
# CaptchaAI API key; a missing variable raises KeyError when this module loads.
API_KEY = os.environ["CAPTCHAAI_API_KEY"]
# Routing key passed to CaptchaPagerDuty and sent with every PagerDuty event.
PAGERDUTY_ROUTING_KEY = os.environ["PAGERDUTY_ROUTING_KEY"]
# Shared HTTP session, used by CaptchaMonitor.check_balance.
session = requests.Session()
class CaptchaPagerDuty:
    """Thin client for the PagerDuty Events API v2.

    Each public method builds one event, POSTs it to ``EVENTS_URL`` and
    returns the decoded JSON response.
    """

    EVENTS_URL = "https://events.pagerduty.com/v2/enqueue"
    REQUEST_TIMEOUT = 10  # seconds allowed for each POST

    def __init__(self, routing_key):
        """Store the routing key that is sent with every event."""
        self.routing_key = routing_key

    def _send(self, event):
        """POST *event* to PagerDuty and return the decoded JSON body.

        Raises:
            requests.HTTPError: if the response status is 4xx/5xx.
        """
        resp = requests.post(
            self.EVENTS_URL, json=event, timeout=self.REQUEST_TIMEOUT
        )
        resp.raise_for_status()
        return resp.json()

    def _update(self, event_action, dedup_key):
        """Send an ``acknowledge``/``resolve`` event for *dedup_key*.

        Raises:
            ValueError: if *dedup_key* is empty; the request is not sent.
        """
        # Fail fast instead of spending a network round trip on an event
        # that cannot identify any incident.
        if not dedup_key:
            raise ValueError(
                f"dedup_key is required for the {event_action!r} action"
            )
        return self._send({
            "routing_key": self.routing_key,
            "event_action": event_action,
            "dedup_key": dedup_key,
        })

    def trigger(self, summary, severity="error", source="captcha-pipeline",
                details=None, dedup_key=None):
        """Trigger a new PagerDuty incident.

        Args:
            summary: Short human-readable description of the problem.
            severity: One of ``critical``, ``error``, ``warning``, ``info``.
            source: Name of the component reporting the problem.
            details: Optional dict sent as ``custom_details``.
            dedup_key: Optional key identifying the incident; when falsy it
                is left out of the event.

        Returns:
            The decoded JSON response.
        """
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # the "+00:00" suffix is rewritten so the wire format keeps its "Z".
        timestamp = (
            datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
        )
        event = {
            "routing_key": self.routing_key,
            "event_action": "trigger",
            "payload": {
                "summary": summary,
                "severity": severity,
                "source": source,
                "timestamp": timestamp,
                "custom_details": details or {},
            },
        }
        if dedup_key:
            event["dedup_key"] = dedup_key
        return self._send(event)

    def resolve(self, dedup_key):
        """Resolve the incident identified by *dedup_key*."""
        return self._update("resolve", dedup_key)

    def acknowledge(self, dedup_key):
        """Acknowledge the incident identified by *dedup_key*."""
        return self._update("acknowledge", dedup_key)
# Module-level PagerDuty client; CaptchaMonitor.run_checks sends its alerts through it.
pagerduty = CaptchaPagerDuty(PAGERDUTY_ROUTING_KEY)
class CaptchaMonitor:
    """Track recent solve outcomes and raise or resolve PagerDuty incidents.

    Solve results are kept in a sliding time window (``window_size``
    seconds). ``run_checks`` compares the account balance and the window's
    error rate against fixed thresholds and sends the matching events
    through the module-level ``pagerduty`` client.
    """

    BALANCE_URL = "https://ocr.captchaai.com/res.php"
    BALANCE_TIMEOUT = 10        # seconds allowed for the balance request
    BALANCE_CRITICAL = 2        # dollars
    BALANCE_WARNING = 10        # dollars
    ERROR_RATE_TRIGGER = 0.20
    ERROR_RATE_RESOLVE = 0.05
    MIN_SAMPLES = 10            # window must hold more than this many results

    _log = logging.getLogger(__name__)

    def __init__(self):
        self.error_window = []  # (timestamp, is_error)
        self.window_size = 300  # 5 minutes in seconds

    def _prune(self, now=None):
        """Drop window entries that are ``window_size`` seconds old or older."""
        if now is None:
            now = time.time()
        self.error_window = [
            (t, e) for t, e in self.error_window
            if now - t < self.window_size
        ]

    def record_solve(self, success):
        """Record the outcome of one solve attempt."""
        now = time.time()
        self.error_window.append((now, not success))
        self._prune(now)

    @property
    def error_rate(self):
        """Fraction of failed solves in the current window (0.0 when empty)."""
        # Prune here as well: if solves stop arriving, record_solve never
        # runs and the window would otherwise keep reporting a stale rate.
        self._prune()
        if not self.error_window:
            return 0.0
        errors = sum(1 for _, e in self.error_window if e)
        return errors / len(self.error_window)

    def check_balance(self):
        """Return the account balance as a float, or None if unavailable.

        Network failures, HTTP errors, and malformed responses are logged
        and reported as None instead of being raised.
        """
        try:
            resp = session.get(
                self.BALANCE_URL,
                params={"key": API_KEY, "action": "getbalance", "json": 1},
                timeout=self.BALANCE_TIMEOUT,
            )
            resp.raise_for_status()
            data = resp.json()
            if data.get("status") != 1:
                return None
            return float(data["request"])
        except (requests.RequestException, ValueError, KeyError,
                TypeError, AttributeError) as exc:
            # Log only the exception type: request errors can embed the
            # full URL, and the URL carries the API key.
            self._log.warning(
                "CaptchaAI balance check failed: %s", type(exc).__name__
            )
            return None

    def _resolve_quietly(self, dedup_key):
        """Resolve *dedup_key*, logging instead of raising on failure."""
        try:
            pagerduty.resolve(dedup_key)
        except Exception as exc:  # best effort; retried on the next run
            self._log.warning(
                "Could not resolve %s: %s", dedup_key, type(exc).__name__
            )

    def _check_balance_alerts(self):
        """Trigger or resolve the two balance incidents."""
        balance = self.check_balance()
        if balance is None:
            return
        if balance < self.BALANCE_CRITICAL:
            pagerduty.trigger(
                summary=f"CaptchaAI balance critically low: ${balance:.2f}",
                severity="critical",
                dedup_key="captcha-balance-critical",
                details={"balance": balance,
                         "threshold": self.BALANCE_CRITICAL},
            )
        elif balance < self.BALANCE_WARNING:
            # The balance is back above the critical threshold, so the
            # critical incident must not stay open.
            self._resolve_quietly("captcha-balance-critical")
            pagerduty.trigger(
                summary=f"CaptchaAI balance low: ${balance:.2f}",
                severity="warning",
                dedup_key="captcha-balance-warning",
                details={"balance": balance,
                         "threshold": self.BALANCE_WARNING},
            )
        else:
            # Resolved independently so one failure does not skip the other.
            self._resolve_quietly("captcha-balance-critical")
            self._resolve_quietly("captcha-balance-warning")

    def _check_error_rate_alerts(self):
        """Trigger or resolve the error-rate incident."""
        rate = self.error_rate
        total = len(self.error_window)
        # Too few results make the rate meaningless (one failure out of one
        # would be 100%), so neither trigger nor resolve on them.
        if total <= self.MIN_SAMPLES:
            return
        if rate > self.ERROR_RATE_TRIGGER:
            errors = sum(1 for _, e in self.error_window if e)
            pagerduty.trigger(
                summary=f"CaptchaAI error rate {rate:.0%} "
                        f"({errors}/{total} in 5 min)",
                severity="error",
                dedup_key="captcha-error-rate-high",
                details={
                    "error_rate": round(rate, 3),
                    "total_tasks": total,
                    "failed_tasks": errors,
                    "window_seconds": self.window_size,
                },
            )
        elif rate < self.ERROR_RATE_RESOLVE:
            self._resolve_quietly("captcha-error-rate-high")

    def run_checks(self):
        """Run all monitoring checks and trigger alerts.

        Failures while resolving incidents are logged and swallowed;
        exceptions raised while triggering an incident propagate.
        """
        self._check_balance_alerts()
        self._check_error_rate_alerts()
# Shared monitor instance; the pipeline reports every solve outcome to it.
monitor = CaptchaMonitor()
# Usage sketch (left commented out so that loading this module does not block):
# After each solve:
#     monitor.record_solve(success=True)
# Run checks every 60 seconds:
#     while True:
#         monitor.run_checks()
#         time.sleep(60)
JavaScript – PagerDuty 集成
const axios = require("axios");
// CaptchaAI API key, read from the environment (undefined when unset).
const API_KEY = process.env.CAPTCHAAI_API_KEY;
// Routing key sent with every PagerDuty event.
const PD_ROUTING_KEY = process.env.PAGERDUTY_ROUTING_KEY;
// Endpoint that every PagerDuty event is POSTed to.
const PD_EVENTS_URL = "https://events.pagerduty.com/v2/enqueue";
/**
 * Thin client for the PagerDuty Events API v2.
 * Every method POSTs one event to PD_EVENTS_URL and resolves with the
 * response body.
 */
class PagerDutyAlerter {
  /**
   * @param {string} routingKey Routing key sent with every event.
   */
  constructor(routingKey) {
    this.routingKey = routingKey;
  }

  /**
   * POST one event and return the response body.
   * Rejects when the request fails or the status is not 2xx.
   * @param {object} event Event fields; the routing key is added here.
   * @returns {Promise<object>}
   */
  async _post(event) {
    const resp = await axios.post(
      PD_EVENTS_URL,
      { routing_key: this.routingKey, ...event },
      { timeout: 10000 }
    );
    return resp.data;
  }

  /**
   * Trigger a new incident.
   * @param {string} summary Short description of the problem.
   * @param {string} [severity="error"] critical, error, warning or info.
   * @param {object} [details={}] Sent as custom_details.
   * @param {?string} [dedupKey=null] Incident key; omitted when falsy.
   * @returns {Promise<object>}
   */
  async trigger(summary, severity = "error", details = {}, dedupKey = null) {
    const event = {
      event_action: "trigger",
      payload: {
        summary,
        severity,
        source: "captcha-pipeline",
        timestamp: new Date().toISOString(),
        custom_details: details,
      },
    };
    if (dedupKey) event.dedup_key = dedupKey;
    return this._post(event);
  }

  /**
   * Resolve the incident identified by dedupKey.
   * Returns the response body, matching trigger().
   * @param {string} dedupKey
   * @returns {Promise<object>}
   */
  async resolve(dedupKey) {
    return this._post({ event_action: "resolve", dedup_key: dedupKey });
  }

  /**
   * Acknowledge the incident identified by dedupKey.
   * @param {string} dedupKey
   * @returns {Promise<object>}
   */
  async acknowledge(dedupKey) {
    return this._post({ event_action: "acknowledge", dedup_key: dedupKey });
  }
}
// Shared alerter; CaptchaHealthMonitor.checkAndAlert sends its events through it.
const alerter = new PagerDutyAlerter(PD_ROUTING_KEY);
/**
 * Tracks solve outcomes in a sliding time window and raises or resolves
 * PagerDuty incidents for low balance and high error rate.
 */
class CaptchaHealthMonitor {
  /**
   * @param {number} [windowMs=300000] Length of the sliding window in ms.
   */
  constructor(windowMs = 300000) {
    this.results = [];
    this.windowMs = windowMs;
  }

  /** Drop results that have fallen out of the window. */
  _prune(now = Date.now()) {
    const cutoff = now - this.windowMs;
    this.results = this.results.filter((r) => r.time > cutoff);
  }

  /**
   * Record the outcome of one solve attempt.
   * @param {boolean} success
   */
  record(success) {
    const now = Date.now();
    this.results.push({ time: now, success });
    this._prune(now);
  }

  /** Fraction of failed solves in the current window (0 when empty). */
  get errorRate() {
    // Prune here too: when solves stop arriving, record() never runs and
    // the window would otherwise keep reporting a stale rate.
    this._prune();
    if (this.results.length === 0) return 0;
    const errors = this.results.filter((r) => !r.success).length;
    return errors / this.results.length;
  }

  /** Trigger or resolve the two balance incidents; never rejects. */
  async _checkBalance() {
    try {
      const resp = await axios.get("https://ocr.captchaai.com/res.php", {
        params: { key: API_KEY, action: "getbalance", json: 1 },
        timeout: 10000,
      });
      if (resp.data.status !== 1) return;
      const balance = parseFloat(resp.data.request);
      // NaN fails both "<" tests below and would otherwise fall through to
      // the "recovered" branch and resolve open incidents.
      if (!Number.isFinite(balance)) {
        console.error("Balance check failed:", "unparseable balance");
        return;
      }
      if (balance < 2) {
        await alerter.trigger(
          `CaptchaAI balance critically low: $${balance.toFixed(2)}`,
          "critical",
          { balance },
          "captcha-balance-critical"
        );
      } else if (balance < 10) {
        // Back above the critical threshold: close the critical incident.
        await alerter.resolve("captcha-balance-critical").catch(() => {});
        await alerter.trigger(
          `CaptchaAI balance low: $${balance.toFixed(2)}`,
          "warning",
          { balance },
          "captcha-balance-warning"
        );
      } else {
        await alerter.resolve("captcha-balance-critical").catch(() => {});
        await alerter.resolve("captcha-balance-warning").catch(() => {});
      }
    } catch (err) {
      console.error("Balance check failed:", err.message);
    }
  }

  /** Trigger or resolve the error-rate incident; never rejects. */
  async _checkErrorRate() {
    try {
      const rate = this.errorRate;
      const total = this.results.length;
      // Too few results make the rate meaningless.
      if (total <= 10) return;
      if (rate > 0.2) {
        await alerter.trigger(
          `CaptchaAI error rate: ${(rate * 100).toFixed(1)}%`,
          "error",
          { errorRate: rate, totalTasks: total },
          "captcha-error-rate"
        );
      } else if (rate < 0.05) {
        await alerter.resolve("captcha-error-rate").catch(() => {});
      }
    } catch (err) {
      // Without this catch a failed trigger would reject checkAndAlert().
      console.error("Error rate check failed:", err.message);
    }
  }

  /**
   * Run the balance and error-rate checks and send the matching alerts.
   * Failures are logged; the returned promise does not reject because of
   * them.
   */
  async checkAndAlert() {
    await this._checkBalance();
    await this._checkErrorRate();
  }
}
// Shared monitor instance; callers report solve outcomes via monitor.record().
const monitor = new CaptchaHealthMonitor();
// Run checks every 60 seconds. The promise is handled explicitly so that a
// failed check is logged instead of becoming an unhandled rejection.
setInterval(() => {
  monitor.checkAndAlert().catch((err) => {
    console.error("Health check failed:", err.message);
  });
}, 60000);
module.exports = { monitor, alerter };
PagerDuty 设置清单
| 步骤 | 操作 |
|---|---|
| 1 | 在 PagerDuty 中为“CaptchaAI Pipeline”创建服务 |
| 2 | 将事件 API v2 集成添加到服务中 |
| 3 | 将路由密钥复制到 PAGERDUTY_ROUTING_KEY env var |
| 4 | 设置升级策略(值班人员 → 团队负责人 → 经理) |
| 5 | 配置通知规则(推送、短信、电话) |
| 6 | 添加计划停机维护时段 |
故障排除
| 问题 | 原因 | 处理方式 |
|---|---|---|
| 警报未触发 | 错误的路由密钥 | 验证密钥与服务的事件 API 集成匹配 |
| 重复事件 | 缺少 dedup_key | 始终为每种警报类型设置一致的去重键(dedup_key) |
| 警报泛滥 | 触发之间没有冷却时间 | PagerDuty 的 dedup_key 会抑制重复警报;请确保使用它 |
| 自动解决不起作用 | dedup_key 不匹配 | 确保 resolve 使用与 trigger 完全相同的 dedup_key |
常见问题
如何避免告警疲劳?
使用去重键(dedup_key)将相关警报归并到同一个事件中。将警告级别的警报设置为低紧急度(不呼叫值班人员)。仅在余额 < 2 美元或所有 worker 宕机时使用严重/高紧急度。
我可以将 PagerDuty 与 Datadog/New Relic 集成吗?
是的。 Datadog 和 New Relic 都具有原生 PagerDuty 集成。如果您已经将指标发送到可观察性平台,请使用这些。当您需要自定义控制时,直接 API 集成(本指南)是最好的选择。
触发、确认和解决之间有什么区别?
触发(trigger)会创建一个新事件。确认(acknowledge)会停止通知,但事件保持打开状态(表示有人正在处理)。解决(resolve)则彻底关闭事件。
相关文章
- 构建客户端验证码管道 Captchaai
- 构建负责任的自动化 Captchaai
- Captchaai 监控 Datadog 指标警报
下一步
在验证码管道出现问题时及时收到警报——先获取 CaptchaAI API 密钥,然后接入 PagerDuty。
相关指南:
- Datadog 监控
- New Relic APM
- 错误代码参考