应用场景

使用验证码处理进行法律研究网络抓取

法律数据库、法院归档系统和判例法存储库使用验证码保护其数据。律师事务所、法律科技公司和合规团队需要通过自动化访问来检索判例法、监控备案并汇总监管数据。CaptchaAI 可处理这些门户网站上的验证码挑战。


法律数据源和验证码

来源 验证码类型 数据 用户
PACER reCAPTCHA v2 联邦法院文件 诉讼团队
州法院系统 图像验证码 / reCAPTCHA 州案件记录 律师
SEC EDGAR reCAPTCHA v2 公司备案 合规团队
专利数据库 reCAPTCHA v2 专利记录 知识产权研究人员
监管门户网站 图像验证码 规则、指导 合规团队
法律引文数据库 reCAPTCHA v2 案例引用 法律科技
律师协会名录 reCAPTCHA v2 律师记录 尽职调查

判例法搜索引擎

import base64
import csv
import re
import time
from urllib.parse import urlencode, urljoin

import requests
from bs4 import BeautifulSoup

# CaptchaAI account key; sent as the "key" field of every solver request.
# The value here is a placeholder and must be replaced before running.
CAPTCHAAI_KEY = "YOUR_API_KEY"
# Base URL of the solver API; "/in.php" (submit a task) and "/res.php"
# (poll for the result) are appended to it by the solver functions.
CAPTCHAAI_URL = "https://ocr.captchaai.com"


def solve_recaptcha(sitekey, pageurl):
    """Solve a reCAPTCHA through the CaptchaAI HTTP API and return the token.

    The task is submitted to ``/in.php`` and ``/res.php`` is then polled
    every 5 seconds, at most 60 times, until the solver reports a result.

    Args:
        sitekey: Value of the target page's ``data-sitekey`` attribute.
        pageurl: URL of the page that displays the challenge.

    Returns:
        The solved token, to be submitted as ``g-recaptcha-response``.

    Raises:
        RuntimeError: If the API rejects the task or reports a solving error
            (any answer whose ``status`` is not 1, other than "not ready").
        TimeoutError: If no result is available after the last poll.
        requests.RequestException: On network or HTTP-level failures.
    """
    resp = requests.post(f"{CAPTCHAAI_URL}/in.php", data={
        "key": CAPTCHAAI_KEY, "method": "userrecaptcha",
        "googlekey": sitekey, "pageurl": pageurl, "json": 1,
    }, timeout=30)
    resp.raise_for_status()
    submitted = resp.json()
    # With json=1 the API answers {"status": 1, "request": <task id>} on
    # success and {"status": 0, "request": <error code>} on failure. Without
    # this check an error code would be polled as if it were a task id.
    if str(submitted.get("status")) != "1":
        raise RuntimeError(
            f"CaptchaAI rejected the task: {submitted.get('request')}"
        )
    task_id = submitted["request"]

    for _ in range(60):
        time.sleep(5)
        result = requests.get(f"{CAPTCHAAI_URL}/res.php", params={
            "key": CAPTCHAAI_KEY, "action": "get",
            "id": task_id, "json": 1,
        }, timeout=30)
        result.raise_for_status()
        data = result.json()
        if data.get("request") == "CAPCHA_NOT_READY":
            continue
        # Anything that is not "not ready" is final: either the token
        # (status 1) or an error code that must not be returned as a token.
        if str(data.get("status")) != "1":
            raise RuntimeError(
                f"CaptchaAI failed to solve the task: {data.get('request')}"
            )
        return data["request"]
    raise TimeoutError("Timeout")


def solve_image_captcha(image_bytes):
    """Solve an image CAPTCHA through the CaptchaAI HTTP API.

    The image is base64-encoded and submitted to ``/in.php``; ``/res.php`` is
    then polled every 3 seconds, at most 20 times, until a result is reported.

    Args:
        image_bytes: Raw bytes of the CAPTCHA image.

    Returns:
        The text the solver read from the image.

    Raises:
        RuntimeError: If the API rejects the task or reports a solving error
            (any answer whose ``status`` is not 1, other than "not ready").
        TimeoutError: If no result is available after the last poll.
        requests.RequestException: On network or HTTP-level failures.
    """
    img_b64 = base64.b64encode(image_bytes).decode()
    resp = requests.post(f"{CAPTCHAAI_URL}/in.php", data={
        "key": CAPTCHAAI_KEY, "method": "base64",
        "body": img_b64, "json": 1,
    }, timeout=30)
    resp.raise_for_status()
    submitted = resp.json()
    # With json=1 the API answers {"status": 1, "request": <task id>} on
    # success and {"status": 0, "request": <error code>} on failure. Without
    # this check an error code would be polled as if it were a task id.
    if str(submitted.get("status")) != "1":
        raise RuntimeError(
            f"CaptchaAI rejected the task: {submitted.get('request')}"
        )
    task_id = submitted["request"]

    for _ in range(20):
        time.sleep(3)
        result = requests.get(f"{CAPTCHAAI_URL}/res.php", params={
            "key": CAPTCHAAI_KEY, "action": "get",
            "id": task_id, "json": 1,
        }, timeout=30)
        result.raise_for_status()
        data = result.json()
        if data.get("request") == "CAPCHA_NOT_READY":
            continue
        # Anything that is not "not ready" is final: either the answer
        # (status 1) or an error code that must not be typed into the form.
        if str(data.get("status")) != "1":
            raise RuntimeError(
                f"CaptchaAI failed to solve the task: {data.get('request')}"
            )
        return data["request"]
    raise TimeoutError("Timeout")


class LegalResearchScraper:
    """Scrape case-law search results, case pages and dockets.

    All requests go through one ``requests.Session`` (optionally proxied).
    When a response looks like a CAPTCHA page, the module-level
    ``solve_recaptcha`` / ``solve_image_captcha`` helpers are used to obtain
    an answer and the request is re-submitted as a POST.
    """

    # Seconds to wait for any single HTTP response from a target site.
    REQUEST_TIMEOUT = 30

    def __init__(self, proxy=None):
        """Create the HTTP session.

        Args:
            proxy: Optional proxy URL used for both HTTP and HTTPS traffic.
        """
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
        })

    def search_cases(self, search_url, query, sitekey=None, max_pages=5):
        """Search a case-law database and collect the result rows.

        Pages are requested one after another (with a 5 second pause between
        them) until ``max_pages`` is reached or a page yields no results.

        Args:
            search_url: Search endpoint; ``q`` and ``page`` are appended.
            query: Search terms; URL-encoded before being sent.
            sitekey: reCAPTCHA site key. When omitted, a ``data-sitekey``
                found in the challenge page is used; if there is none the
                challenge is treated as an image CAPTCHA.
            max_pages: Maximum number of result pages to fetch.

        Returns:
            A list of dicts with ``title``, ``url``, ``citation``, ``date``
            and ``court`` keys.
        """
        all_cases = []

        for page in range(max_pages):
            url = self._page_url(search_url, query, page + 1)
            resp = self.session.get(url, timeout=self.REQUEST_TIMEOUT)

            if self._has_captcha(resp.text):
                # Fall back to the key embedded in the page, the same way
                # get_case_details does, before assuming an image CAPTCHA.
                page_sitekey = sitekey or self._extract_sitekey(resp.text)
                if page_sitekey:
                    token = solve_recaptcha(page_sitekey, url)
                    resp = self.session.post(url, data={
                        "q": query,
                        "g-recaptcha-response": token,
                    }, timeout=self.REQUEST_TIMEOUT)
                else:
                    resp = self._solve_image_and_retry(resp.text, url, query)

            cases = self._parse_cases(resp.text, base_url=url)
            if not cases:
                break

            all_cases.extend(cases)
            print(f"Page {page + 1}: {len(cases)} cases")
            time.sleep(5)

        return all_cases

    def get_case_details(self, case_url):
        """Fetch one case page and extract its main fields.

        If the page looks like a CAPTCHA page and carries a ``data-sitekey``,
        the reCAPTCHA is solved and the token is POSTed back to ``case_url``.

        Args:
            case_url: Absolute URL of the case page.

        Returns:
            A dict with ``title``, ``citation``, ``court``, ``date``,
            ``judge``, ``summary`` and ``url``; fields not found are ``""``.
        """
        resp = self.session.get(case_url, timeout=self.REQUEST_TIMEOUT)

        if self._has_captcha(resp.text):
            sitekey = self._extract_sitekey(resp.text)
            if sitekey:
                token = solve_recaptcha(sitekey, case_url)
                resp = self.session.post(case_url, data={
                    "g-recaptcha-response": token,
                }, timeout=self.REQUEST_TIMEOUT)

        soup = BeautifulSoup(resp.text, "html.parser")
        return {
            "title": self._text(soup, "h1, .case-title"),
            "citation": self._text(soup, ".citation, .case-cite"),
            "court": self._text(soup, ".court, .jurisdiction"),
            "date": self._text(soup, ".decision-date, .date-decided"),
            "judge": self._text(soup, ".judge, .authored-by"),
            "summary": self._text(soup, ".summary, .headnote"),
            "url": case_url,
        }

    def monitor_docket(self, docket_url, case_number, sitekey=None):
        """Look up a case docket and return its entries.

        Args:
            docket_url: URL of the docket lookup form.
            case_number: Case number submitted as ``case_number``.
            sitekey: reCAPTCHA site key; only used when the form page looks
                like it contains a CAPTCHA.

        Returns:
            A list of dicts with ``date``, ``entry`` and ``filed_by`` keys.
        """
        resp = self.session.get(docket_url, timeout=self.REQUEST_TIMEOUT)

        data = {"case_number": case_number}
        if sitekey and self._has_captcha(resp.text):
            token = solve_recaptcha(sitekey, docket_url)
            data["g-recaptcha-response"] = token

        resp = self.session.post(
            docket_url, data=data, timeout=self.REQUEST_TIMEOUT,
        )
        return self._parse_docket(resp.text)

    def export_results(self, cases, filename):
        """Write case dicts to a UTF-8 CSV file; do nothing if ``cases`` is empty.

        The header is the ordered union of the keys of all rows, so rows that
        carry extra keys do not make the writer fail; missing values are
        written as empty cells.
        """
        if not cases:
            return
        fieldnames = list(dict.fromkeys(key for case in cases for key in case))
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(cases)

    def _page_url(self, search_url, query, page_number):
        """Return ``search_url`` with URL-encoded ``q`` and ``page`` parameters."""
        separator = "&" if "?" in search_url else "?"
        return search_url + separator + urlencode(
            {"q": query, "page": page_number}
        )

    def _has_captcha(self, html):
        """Return True if the HTML mentions any known CAPTCHA marker."""
        lowered = html.lower()  # lower-case once instead of once per marker
        return any(tag in lowered for tag in (
            'data-sitekey', 'g-recaptcha', 'captcha',
        ))

    def _extract_sitekey(self, html):
        """Return the first ``data-sitekey`` attribute value, or None."""
        match = re.search(r'data-sitekey=["\']([^"\']+)["\']', html)
        return match.group(1) if match else None

    def _captcha_image_url(self, html, page_url):
        """Return the absolute URL of the ``/captcha...`` image, or None.

        The ``src`` captured here is root-relative, so it is resolved against
        the page URL rather than appended to the page's own path.
        """
        match = re.search(r'src="(/captcha[^"]+)"', html)
        return urljoin(page_url, match.group(1)) if match else None

    def _solve_image_and_retry(self, html, url, query):
        """Solve the image CAPTCHA in ``html`` and re-submit the search.

        When no CAPTCHA image is found the page is simply requested again.
        """
        img_url = self._captcha_image_url(html, url)
        if img_url:
            img = self.session.get(img_url, timeout=self.REQUEST_TIMEOUT)
            answer = solve_image_captcha(img.content)
            return self.session.post(url, data={
                "q": query,
                "captcha": answer,
            }, timeout=self.REQUEST_TIMEOUT)
        return self.session.get(url, timeout=self.REQUEST_TIMEOUT)

    def _parse_cases(self, html, base_url=None):
        """Extract result rows from a search page.

        Args:
            html: Search result page markup.
            base_url: When given, relative links are resolved against it so
                the stored ``url`` can be requested directly.
        """
        soup = BeautifulSoup(html, "html.parser")
        cases = []
        for item in soup.select(".case-result, .search-result, tr.result"):
            title_el = item.select_one("a, .case-name")
            if title_el:
                href = title_el.get("href", "")
                if base_url and href:
                    href = urljoin(base_url, href)
                cases.append({
                    "title": title_el.get_text(strip=True),
                    "url": href,
                    "citation": self._text(item, ".citation, .cite"),
                    "date": self._text(item, ".date"),
                    "court": self._text(item, ".court"),
                })
        return cases

    def _parse_docket(self, html):
        """Extract docket entries (date, description, filer) from a page."""
        soup = BeautifulSoup(html, "html.parser")
        entries = []
        for row in soup.select(".docket-entry, tr.filing"):
            entries.append({
                "date": self._text(row, ".date, td:first-child"),
                "entry": self._text(row, ".description, td:nth-child(2)"),
                "filed_by": self._text(row, ".filer, td:nth-child(3)"),
            })
        return entries

    def _text(self, el, selector):
        """Return the stripped text of the first match, or "" if none."""
        found = el.select_one(selector)
        return found.get_text(strip=True) if found else ""


# Usage
scraper = LegalResearchScraper(proxy="http://user:pass@residential.proxy.com:5000")

# Run the search (up to five result pages)
cases = scraper.search_cases(
    "https://caselaw.example.com/search",
    "data privacy GDPR",
    max_pages=5,
)

# Pull the full record for the first ten hits that carry a link
for case in cases[:10]:
    if not case["url"]:
        continue
    details = scraper.get_case_details(case["url"])
    print(f"{details['citation']}: {details['title']}")
    time.sleep(3)  # pause between detail requests

# Save the search hits as CSV
scraper.export_results(cases, "gdpr_cases.csv")

监管监控

class RegulatoryMonitor:
    """Poll a set of search feeds and report entries not seen before.

    Entries are identified by their citation, or by their title when the
    citation is empty; identifiers already reported are kept in
    ``seen_entries`` for the lifetime of the instance.
    """

    def __init__(self, proxy=None):
        """Create the underlying scraper and an empty set of seen entries."""
        self.scraper = LegalResearchScraper(proxy=proxy)
        self.seen_entries = set()

    def check_new_filings(self, feeds):
        """Search every feed and return the entries that are new.

        Args:
            feeds: Iterable of dicts with ``url``, ``query`` and ``name``
                keys and an optional ``sitekey`` key.

        Returns:
            A list of case dicts, each tagged with a ``source`` key holding
            the feed name. A feed that raises is reported on stdout and
            skipped; a 5 second pause follows every feed.
        """
        fresh = []

        for feed in feeds:
            try:
                results = self.scraper.search_cases(
                    feed["url"],
                    feed["query"],
                    sitekey=feed.get("sitekey"),
                    max_pages=2,
                )

                for case in results:
                    identifier = case.get("citation") or case.get("title")
                    # Skip entries without an identifier or already reported.
                    if not identifier or identifier in self.seen_entries:
                        continue
                    self.seen_entries.add(identifier)
                    case["source"] = feed["name"]
                    fresh.append(case)

            except Exception as exc:
                # Best effort: one failing feed must not stop the others.
                print(f"Error checking {feed['name']}: {exc}")

            time.sleep(5)

        return fresh

故障排除

问题 原因 处理方式
图像验证码多次失败 文字扭曲 报告错误并使用新图像重试
PACER 阻止访问 超出速率限制 等待 30 分钟,降低请求频率
案件详情不完整 内容位于付费墙之后 根据需要支付按页费用
搜索没有返回结果 返回验证码页面 解析前检查验证码
案卷监控遗漏了备案 检查间隔太长 增加检查频率

常见问题

自动访问法庭记录合法吗?

公共法庭记录通常是可以查阅的。无论访问方式如何,PACER 都会按页收取费用。始终遵守系统速率限制和使用条款。

我如何处理 PACER 费用?

PACER 按页收费。规划批量数据收集时的费用预算。考虑使用 RECAP(免费存档)来获取已下载的文档。

哪些法律数据库拥有最多的验证码?

州法院系统的情况最严重——许多仍在使用较旧的图像验证码。联邦法院 (PACER) 使用 reCAPTCHA v2。专利数据库也经常触发验证码。


相关指南


简化法律研究——获取您的 CaptchaAI 密钥并自动化判例法和档案搜索。

该文章已禁用评论。