法律数据库、法院归档系统和判例法存储库都使用验证码保护其数据。律师事务所、法律科技公司和合规团队需要通过自动化方式访问这些系统,以检索判例法、监控备案并汇总监管数据。CaptchaAI 可处理这些门户上的验证码挑战。
法律数据源和验证码
| 来源 | 验证码类型 | 数据 | 用户 |
|---|---|---|---|
| PACER | reCAPTCHA v2 | 联邦法院文件 | 诉讼团队 |
| 州法院系统 | 图像验证码 / reCAPTCHA | 州案件记录 | 律师 |
| SEC EDGAR | reCAPTCHA v2 | 公司备案 | 合规团队 |
| 专利数据库 | reCAPTCHA v2 | 专利记录 | 知识产权研究人员 |
| 监管门户网站 | 图像验证码 | 规则、指导 | 合规团队 |
| 法律引文数据库 | reCAPTCHA v2 | 案例引用 | 法律科技 |
| 律师协会名录 | reCAPTCHA v2 | 律师记录 | 尽职调查 |
判例法搜索引擎
import base64
import csv
import re
import time
from urllib.parse import urlencode, urljoin

import requests
from bs4 import BeautifulSoup
# CaptchaAI API key, sent as the "key" field of every solver request.
# "YOUR_API_KEY" is a placeholder and must be replaced with a real key.
CAPTCHAAI_KEY = "YOUR_API_KEY"
# Base URL of the CaptchaAI service; the solver functions append
# "/in.php" (submit a task) and "/res.php" (fetch the result) to it.
CAPTCHAAI_URL = "https://ocr.captchaai.com"
def solve_recaptcha(sitekey, pageurl):
    """Solve a reCAPTCHA v2 challenge through the CaptchaAI API.

    Submits the site key and page URL to ``in.php`` and then polls
    ``res.php`` every 5 seconds, for at most 60 attempts, until the
    service stops answering ``CAPCHA_NOT_READY``.

    Args:
        sitekey: Value of the page's ``data-sitekey`` attribute.
        pageurl: URL of the page that shows the challenge.

    Returns:
        The solution token reported by the service.

    Raises:
        RuntimeError: If the service rejects the task or reports an error
            instead of a solution.
        TimeoutError: If no result is ready after all polling attempts.
    """
    submit = requests.post(
        f"{CAPTCHAAI_URL}/in.php",
        data={
            "key": CAPTCHAAI_KEY,
            "method": "userrecaptcha",
            "googlekey": sitekey,
            "pageurl": pageurl,
            "json": 1,
        },
        timeout=30,
    )
    submitted = submit.json()
    # With json=1 the service answers {"status": 1, "request": <task id>} on
    # success; any other status carries an error code in "request". Without
    # this check the error code would be polled as if it were a task id.
    if submitted.get("status") != 1:
        raise RuntimeError(
            f"CaptchaAI rejected the task: {submitted.get('request')}"
        )
    task_id = submitted["request"]

    for _ in range(60):
        time.sleep(5)
        poll = requests.get(
            f"{CAPTCHAAI_URL}/res.php",
            params={
                "key": CAPTCHAAI_KEY,
                "action": "get",
                "id": task_id,
                "json": 1,
            },
            timeout=30,
        )
        data = poll.json()
        answer = data["request"]
        if answer == "CAPCHA_NOT_READY":
            continue
        # A finished-but-failed task (e.g. an ERROR_* code) must not be
        # handed back to the caller as if it were a valid token.
        if data.get("status") != 1:
            raise RuntimeError(f"CaptchaAI failed to solve the task: {answer}")
        return answer
    raise TimeoutError("Timeout")
def solve_image_captcha(image_bytes):
    """Solve an image CAPTCHA through the CaptchaAI API.

    The image is base64-encoded and submitted to ``in.php`` with
    ``method=base64``; ``res.php`` is then polled every 3 seconds, for at
    most 20 attempts, until the service stops answering
    ``CAPCHA_NOT_READY``.

    Args:
        image_bytes: Raw bytes of the CAPTCHA image.

    Returns:
        The answer reported by the service.

    Raises:
        RuntimeError: If the service rejects the task or reports an error
            instead of an answer.
        TimeoutError: If no result is ready after all polling attempts.
    """
    img_b64 = base64.b64encode(image_bytes).decode()
    submit = requests.post(
        f"{CAPTCHAAI_URL}/in.php",
        data={
            "key": CAPTCHAAI_KEY,
            "method": "base64",
            "body": img_b64,
            "json": 1,
        },
        timeout=30,
    )
    submitted = submit.json()
    # With json=1 the service answers {"status": 1, "request": <task id>} on
    # success; any other status carries an error code in "request". Without
    # this check the error code would be polled as if it were a task id.
    if submitted.get("status") != 1:
        raise RuntimeError(
            f"CaptchaAI rejected the task: {submitted.get('request')}"
        )
    task_id = submitted["request"]

    for _ in range(20):
        time.sleep(3)
        poll = requests.get(
            f"{CAPTCHAAI_URL}/res.php",
            params={
                "key": CAPTCHAAI_KEY,
                "action": "get",
                "id": task_id,
                "json": 1,
            },
            timeout=30,
        )
        data = poll.json()
        answer = data["request"]
        if answer == "CAPCHA_NOT_READY":
            continue
        # A finished-but-failed task (e.g. an ERROR_* code) must not be
        # submitted to the target site as if it were the CAPTCHA text.
        if data.get("status") != 1:
            raise RuntimeError(f"CaptchaAI failed to solve the task: {answer}")
        return answer
    raise TimeoutError("Timeout")
class LegalResearchScraper:
    """Scrape case-law search results, case pages and dockets.

    All HTTP traffic goes through one ``requests.Session`` so cookies, the
    optional proxy and the browser-like User-Agent are shared. When a fetched
    page looks like a CAPTCHA page, the challenge is solved with the
    module-level ``solve_recaptcha`` / ``solve_image_captcha`` helpers and the
    request is retried as a POST carrying the solution.
    """

    # Timeout in seconds applied to every request made through the session.
    REQUEST_TIMEOUT = 30

    def __init__(self, proxy=None):
        """Create the HTTP session.

        Args:
            proxy: Optional proxy URL used for both http and https traffic.
        """
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
        })

    def search_cases(self, search_url, query, sitekey=None, max_pages=5):
        """Search a case-law database and collect results page by page.

        Args:
            search_url: URL of the search endpoint.
            query: Search terms, sent as the ``q`` parameter.
            sitekey: reCAPTCHA site key. When omitted, the key is looked up in
                the page itself; if none is found the page is handled as an
                image CAPTCHA.
            max_pages: Maximum number of result pages to fetch.

        Returns:
            A list of case dicts (title, url, citation, date, court).
            Paging stops early at the first page that yields no cases.
        """
        all_cases = []
        for page in range(max_pages):
            url = self._build_search_url(search_url, query, page + 1)
            resp = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            if self._has_captcha(resp.text):
                # Fall back to the key embedded in the page, as
                # get_case_details does, so callers need not know it.
                page_sitekey = sitekey or self._extract_sitekey(resp.text)
                if page_sitekey:
                    token = solve_recaptcha(page_sitekey, url)
                    resp = self.session.post(
                        url,
                        data={"q": query, "g-recaptcha-response": token},
                        timeout=self.REQUEST_TIMEOUT,
                    )
                else:
                    resp = self._solve_image_and_retry(resp.text, url, query)
            cases = self._parse_cases(resp.text, base_url=url)
            if not cases:
                break
            all_cases.extend(cases)
            print(f"Page {page + 1}: {len(cases)} cases")
            # Pause between result pages.
            time.sleep(5)
        return all_cases

    def get_case_details(self, case_url):
        """Fetch full case details.

        If the page looks like a CAPTCHA page and a site key can be extracted
        from it, the reCAPTCHA is solved and the page is re-requested via POST.

        Args:
            case_url: Absolute URL of the case page.

        Returns:
            A dict with title, citation, court, date, judge, summary and url;
            fields whose selector matches nothing are empty strings.
        """
        resp = self.session.get(case_url, timeout=self.REQUEST_TIMEOUT)
        if self._has_captcha(resp.text):
            sitekey = self._extract_sitekey(resp.text)
            if sitekey:
                token = solve_recaptcha(sitekey, case_url)
                resp = self.session.post(
                    case_url,
                    data={"g-recaptcha-response": token},
                    timeout=self.REQUEST_TIMEOUT,
                )
        soup = BeautifulSoup(resp.text, "html.parser")
        return {
            "title": self._text(soup, "h1, .case-title"),
            "citation": self._text(soup, ".citation, .case-cite"),
            "court": self._text(soup, ".court, .jurisdiction"),
            "date": self._text(soup, ".decision-date, .date-decided"),
            "judge": self._text(soup, ".judge, .authored-by"),
            "summary": self._text(soup, ".summary, .headnote"),
            "url": case_url,
        }

    def monitor_docket(self, docket_url, case_number, sitekey=None):
        """Monitor a specific case docket for new filings.

        Args:
            docket_url: URL of the docket page.
            case_number: Case number, posted as ``case_number``.
            sitekey: Optional reCAPTCHA site key; a token is added to the POST
                only when it is given and the page looks like a CAPTCHA page.

        Returns:
            A list of docket-entry dicts (date, entry, filed_by).
        """
        resp = self.session.get(docket_url, timeout=self.REQUEST_TIMEOUT)
        data = {"case_number": case_number}
        if sitekey and self._has_captcha(resp.text):
            token = solve_recaptcha(sitekey, docket_url)
            data["g-recaptcha-response"] = token
        # NOTE(review): the POST is assumed to run whether or not a CAPTCHA
        # was shown (the form data is built unconditionally) -- confirm.
        resp = self.session.post(
            docket_url, data=data, timeout=self.REQUEST_TIMEOUT
        )
        return self._parse_docket(resp.text)

    def export_results(self, cases, filename):
        """Export case results to CSV.

        The header row is taken from the keys of the first case. Nothing is
        written when ``cases`` is empty.
        """
        if not cases:
            return
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(cases[0]))
            writer.writeheader()
            writer.writerows(cases)

    @staticmethod
    def _build_search_url(search_url, query, page):
        """Return the search URL for ``page`` with the query properly encoded."""
        separator = "&" if "?" in search_url else "?"
        return f"{search_url}{separator}{urlencode({'q': query, 'page': page})}"

    @staticmethod
    def _resolve_url(base_url, ref):
        """Resolve ``ref`` (relative or absolute) against ``base_url``."""
        return urljoin(base_url, ref)

    def _has_captcha(self, html):
        """Return True when the HTML contains any known CAPTCHA marker."""
        lowered = html.lower()  # lower-case once instead of once per marker
        return any(
            marker in lowered
            for marker in ("data-sitekey", "g-recaptcha", "captcha")
        )

    def _extract_sitekey(self, html):
        """Return the value of the first ``data-sitekey="..."`` or None."""
        match = re.search(r'data-sitekey="([^"]+)"', html)
        return match.group(1) if match else None

    def _solve_image_and_retry(self, html, url, query):
        """Solve an image CAPTCHA found in ``html`` and resubmit the search.

        Looks for an image whose ``src`` starts with ``/captcha``. If one is
        found, it is downloaded and solved, and the query is POSTed with the
        answer; otherwise the URL is simply fetched again.

        Returns:
            The response of the retry request.
        """
        match = re.search(r'src="(/captcha[^"]+)"', html)
        if match:
            # The src is root-relative, so it must be resolved against the
            # site origin rather than appended to the search path.
            img_url = self._resolve_url(url, match.group(1))
            img = self.session.get(img_url, timeout=self.REQUEST_TIMEOUT)
            answer = solve_image_captcha(img.content)
            return self.session.post(
                url,
                data={"q": query, "captcha": answer},
                timeout=self.REQUEST_TIMEOUT,
            )
        return self.session.get(url, timeout=self.REQUEST_TIMEOUT)

    def _parse_cases(self, html, base_url=None):
        """Extract case dicts from a search-results page.

        Args:
            html: HTML of the results page.
            base_url: When given, non-empty result links are resolved against
                it so the returned ``url`` values can be fetched directly.

        Returns:
            A list of dicts with title, url, citation, date and court.
        """
        soup = BeautifulSoup(html, "html.parser")
        cases = []
        for item in soup.select(".case-result, .search-result, tr.result"):
            title_el = item.select_one("a, .case-name")
            if not title_el:
                continue
            href = title_el.get("href", "")
            if href and base_url:
                href = self._resolve_url(base_url, href)
            cases.append({
                "title": title_el.get_text(strip=True),
                "url": href,
                "citation": self._text(item, ".citation, .cite"),
                "date": self._text(item, ".date"),
                "court": self._text(item, ".court"),
            })
        return cases

    def _parse_docket(self, html):
        """Extract docket entries (date, entry, filed_by) from a docket page."""
        soup = BeautifulSoup(html, "html.parser")
        return [
            {
                "date": self._text(row, ".date, td:first-child"),
                "entry": self._text(row, ".description, td:nth-child(2)"),
                "filed_by": self._text(row, ".filer, td:nth-child(3)"),
            }
            for row in soup.select(".docket-entry, tr.filing")
        ]

    def _text(self, el, selector):
        """Return the stripped text of the first match, or "" if none."""
        found = el.select_one(selector)
        return found.get_text(strip=True) if found else ""
# Usage: create a scraper whose traffic is routed through a proxy.
scraper = LegalResearchScraper(proxy="http://user:pass@residential.proxy.com:5000")

# Search case law (at most five result pages).
cases = scraper.search_cases(
    "https://caselaw.example.com/search",
    "data privacy GDPR",
    max_pages=5,
)

# Get details for the first ten results that carry a link.
for case in cases[:10]:
    if not case["url"]:
        continue
    details = scraper.get_case_details(case["url"])
    print(f"{details['citation']}: {details['title']}")
    time.sleep(3)

# Export the search results to CSV.
scraper.export_results(cases, "gdpr_cases.csv")
监管监控
class RegulatoryMonitor:
    """Poll search feeds and report results that have not been seen before."""

    def __init__(self, proxy=None):
        """Create the underlying scraper and an empty set of seen keys."""
        self.scraper = LegalResearchScraper(proxy=proxy)
        self.seen_entries = set()

    def check_new_filings(self, feeds):
        """Check regulatory portals for new filings.

        Each feed is a dict read for "url", "query", "name" and an optional
        "sitekey". A result is new when its citation (or, failing that, its
        title) has not been recorded yet; new results are tagged with the
        feed name under "source". Errors raised while processing a feed are
        printed and do not stop the remaining feeds.

        Returns:
            The list of newly seen case dicts across all feeds.
        """
        fresh = []
        for feed in feeds:
            try:
                results = self.scraper.search_cases(
                    feed["url"],
                    feed["query"],
                    sitekey=feed.get("sitekey"),
                    max_pages=2,
                )
                for item in results:
                    dedup_key = item.get("citation") or item.get("title")
                    if not dedup_key or dedup_key in self.seen_entries:
                        continue
                    self.seen_entries.add(dedup_key)
                    item["source"] = feed["name"]
                    fresh.append(item)
            except Exception as exc:
                print(f"Error checking {feed['name']}: {exc}")
            # Pause between feeds.
            time.sleep(5)
        return fresh
故障排除
| 问题 | 原因 | 处理方式 |
|---|---|---|
| 图像验证码多次失败 | 文字扭曲 | 报告错误并使用新图像重试 |
| PACER 阻止访问 | 超出速率限制 | 等待 30 分钟,降低请求频率 |
| 案件详情不完整 | 内容位于付费墙之后 | 根据需要支付按页费用 |
| 搜索没有返回结果 | 返回验证码页面 | 解析前检查验证码 |
| 案卷监控遗漏了备案 | 检查间隔太长 | 增加检查频率 |
常见问题
自动访问法庭记录合法吗?
公共法庭记录通常是可以查阅的。无论访问方式如何,PACER 都会按页收取费用。始终遵守系统速率限制和使用条款。
我如何处理 PACER 费用?
PACER 按页收费。规划批量数据收集时的费用预算。考虑使用 RECAP(免费存档)来获取已下载的文档。
哪些法律数据库拥有最多的验证码?
州法院系统最差——许多使用较旧的图像验证码。联邦法院 (PACER) 使用 reCAPTCHA v2。专利数据库也经常触发验证码。
相关指南
简化法律研究——获取您的 CaptchaAI 密钥并自动化判例法和档案搜索。