应用场景

GIS 和地图数据提取的验证码处理

政府 GIS 门户、县评估系统和地图平台通过图像和 OCR 验证码保护地理空间查询。这些门户提供地块边界、分区指定、洪泛区和财产评估等数据,这些数据对于房地产分析、城市规划和环境研究非常有价值。以下是处理验证码的方法。

GIS 门户上的验证码模式

门户型 验证码类型 扳机
县 GIS/assessor 图片文字验证码 包裹搜索查询
国家地理空间门户 自定义验证码 数据下载请求
美国地质调查局数据门户 reCAPTCHA v2 批量数据访问
市政分区图 图片验证码 重复查找属性
环境数据库 数学验证码 报告生成
洪泛区查找 图片文字验证码 地址查询

地理信息系统数据提取器

import requests
import base64
import time
import re

class GISDataExtractor:
    def __init__(self, api_key):
        self.api_key = api_key
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    def lookup_parcel(self, portal_url, parcel_id):
        """Look up parcel data by ID, solving CAPTCHAs as needed."""
        response = self.session.get(
            f"{portal_url}/parcel", params={"id": parcel_id}
        )

        if self._has_image_captcha(response.text):
            captcha_url = self._extract_captcha_url(response.text, portal_url)
            captcha_text = self._solve_captcha(captcha_url)

            # Re-submit with solved CAPTCHA
            response = self.session.post(f"{portal_url}/parcel", data={
                "id": parcel_id,
                "captcha": captcha_text,
                **self._extract_hidden_fields(response.text)
            })

        return self._parse_parcel_data(response.text)

    def search_by_address(self, portal_url, address):
        """Search GIS records by street address."""
        response = self.session.get(
            f"{portal_url}/search", params={"address": address}
        )

        if self._has_image_captcha(response.text):
            captcha_url = self._extract_captcha_url(response.text, portal_url)
            captcha_text = self._solve_captcha(captcha_url)

            response = self.session.post(f"{portal_url}/search", data={
                "address": address,
                "captcha": captcha_text,
                **self._extract_hidden_fields(response.text)
            })

        return self._parse_search_results(response.text)

    def bulk_extract(self, portal_url, parcel_ids, delay=3):
        """Extract data for multiple parcels with rate limiting."""
        results = {}

        for parcel_id in parcel_ids:
            try:
                results[parcel_id] = self.lookup_parcel(portal_url, parcel_id)
            except Exception as e:
                results[parcel_id] = {"error": str(e)}
            time.sleep(delay)

        return results

    def _has_image_captcha(self, html):
        return bool(re.search(
            r'captcha|verification.?image|security.?code',
            html, re.IGNORECASE
        ))

    def _extract_captcha_url(self, html, base_url):
        from bs4 import BeautifulSoup
        from urllib.parse import urljoin
        soup = BeautifulSoup(html, "html.parser")

        img = (
            soup.find("img", attrs={"src": lambda s: s and "captcha" in s.lower()}) or
            soup.find("img", {"id": re.compile(r"captcha", re.I)}) or
            soup.find("img", {"class": re.compile(r"captcha", re.I)})
        )

        if img and img.get("src"):
            return urljoin(base_url, img["src"])
        raise ValueError("CAPTCHA image not found")

    def _solve_captcha(self, captcha_url):
        """Download and solve image CAPTCHA."""
        img_response = self.session.get(captcha_url)
        img_base64 = base64.b64encode(img_response.content).decode("utf-8")

        resp = requests.post("https://ocr.captchaai.com/in.php", data={
            "key": self.api_key,
            "method": "base64",
            "body": img_base64,
            "json": 1
        })
        task_id = resp.json()["request"]

        for _ in range(30):
            time.sleep(3)
            result = requests.get("https://ocr.captchaai.com/res.php", params={
                "key": self.api_key,
                "action": "get",
                "id": task_id,
                "json": 1
            })
            data = result.json()
            if data["status"] == 1:
                return data["request"]

        raise TimeoutError("CAPTCHA solve timed out")

    def _extract_hidden_fields(self, html):
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        fields = {}
        for inp in soup.select("input[type='hidden']"):
            name = inp.get("name")
            if name:
                fields[name] = inp.get("value", "")
        return fields

    def _parse_parcel_data(self, html):
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")

        def text_or_none(node):
            return node.text.strip() if node and node.text else None

        return {
            "parcel_id": text_or_none(soup.select_one(".parcel-id, #parcelId")),
            "owner": text_or_none(soup.select_one(".owner, .owner-name")),
            "address": text_or_none(soup.select_one(".address, .situs")),
            "zoning": text_or_none(soup.select_one(".zoning, .zone-code")),
            "acreage": text_or_none(soup.select_one(".acreage, .area")),
            "assessed_value": text_or_none(soup.select_one(".assessed, .value")),
            "land_use": text_or_none(soup.select_one(".land-use, .use-code"))
        }

    def _parse_search_results(self, html):
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")

        def text_or_none(node):
            return node.text.strip() if node and node.text else None

        results = []
        for row in soup.select(".result-row, tr.parcel"):
            results.append({
                "parcel_id": text_or_none(row.select_one(".parcel-id")),
                "address": text_or_none(row.select_one(".address")),
                "owner": text_or_none(row.select_one(".owner"))
            })
        return results


# Usage
extractor = GISDataExtractor("YOUR_API_KEY")

# Single parcel lookup
parcel = extractor.lookup_parcel(
    "https://gis.county.example.gov",
    "12-34-567-890"
)
print(f"Owner: {parcel['owner']}, Zoning: {parcel['zoning']}")

# Bulk extraction
parcels = extractor.bulk_extract(
    "https://gis.county.example.gov",
    ["12-34-567-890", "12-34-567-891", "12-34-567-892"]
)

基于坐标的提取 (JavaScript)

class GISExtractor {
  constructor(apiKey) {
    this.apiKey = apiKey;
  }

  async extractByCoordinates(portalUrl, lat, lng) {
    const url = `${portalUrl}/identify?lat=${lat}&lng=${lng}`;
    const response = await fetch(url);
    const html = await response.text();

    if (this.hasCaptcha(html)) {
      return this.solveAndExtract(portalUrl, html, { lat, lng });
    }

    return this.parseGISData(html);
  }

  async extractRegion(portalUrl, bounds, gridSize = 0.01) {
    const results = [];
    const { north, south, east, west } = bounds;

    for (let lat = south; lat <= north; lat += gridSize) {
      for (let lng = west; lng <= east; lng += gridSize) {
        try {
          const data = await this.extractByCoordinates(portalUrl, lat, lng);
          if (data.parcelId) results.push(data);
        } catch (error) {
          console.error(`Failed at ${lat},${lng}: ${error.message}`);
        }
        // Rate limit
        await new Promise(r => setTimeout(r, 2000));
      }
    }

    return results;
  }

  hasCaptcha(html) {
    return /captcha|verification.?image|security.?code/i.test(html);
  }

  async solveAndExtract(portalUrl, html, params) {
    const imgMatch = html.match(/src="([^"]*captcha[^"]*)"/i);
    if (!imgMatch) throw new Error('CAPTCHA image not found');

    const imgUrl = new URL(imgMatch[1], portalUrl).href;
    const imgResp = await fetch(imgUrl);
    const buffer = await imgResp.arrayBuffer();
    const base64 = Buffer.from(buffer).toString('base64');

    const submitResp = await fetch('https://ocr.captchaai.com/in.php', {
      method: 'POST',
      body: new URLSearchParams({
        key: this.apiKey,
        method: 'base64',
        body: base64,
        json: '1'
      })
    });
    const { request: taskId } = await submitResp.json();

    for (let i = 0; i < 30; i++) {
      await new Promise(r => setTimeout(r, 3000));
      const result = await fetch(
        `https://ocr.captchaai.com/res.php?key=${this.apiKey}&action=get&id=${taskId}&json=1`
      );
      const data = await result.json();
      if (data.status === 1) {
        const response = await fetch(portalUrl, {
          method: 'POST',
          body: new URLSearchParams({
            ...params,
            captcha: data.request
          })
        });
        return this.parseGISData(await response.text());
      }
    }
    throw new Error('CAPTCHA solve timed out');
  }

  parseGISData(html) {
    return {
      parcelId: html.match(/parcel.?id[^>]*>([^<]+)/i)?.[1]?.trim(),
      zoning: html.match(/zon(?:e|ing)[^>]*>([^<]+)/i)?.[1]?.trim(),
      acreage: html.match(/acreage|area[^>]*>([^<]+)/i)?.[1]?.trim(),
      landUse: html.match(/land.?use[^>]*>([^<]+)/i)?.[1]?.trim()
    };
  }
}

// Usage
const gis = new GISExtractor('YOUR_API_KEY');

// Single coordinate lookup
const data = await gis.extractByCoordinates(
  'https://gis.county.example.gov',
  34.0522, -118.2437
);

// Extract entire region
const region = await gis.extractRegion('https://gis.county.example.gov', {
  north: 34.10, south: 34.00, east: -118.20, west: -118.30
});

GIS 门户的验证码参数

范围 价值 使用案例
method base64 标准图片验证码
numeric 1 纯数字验证码
min_len 4 当字符数已知时
max_len 6 当字符数已知时
language 0 英文/Latin字符
textinstructions 风俗 数学验证码或格式化代码

预批提取检查表

  • 在开始大型收集运行之前验证地图视口、区域过滤器和分页控件。
  • 存储标准化坐标有效负载和原始目标响应,以便提取错误保持可调试性。
  • 当验证码密度激增时暂停批次,而不是让重试隐藏目标端行为更改。

故障排除

问题 原因 处理方式
验证码图像加载损坏 需要会话 cookie 首先加载搜索页面
已解决的文本被拒绝 区分大小写 添加case_sensitive=1参数
门户返回不同的验证码 特定于会话的验证码 在同一会话中下载并解决
验证码后无包裹数据 缺少隐藏表单字段 提交前提取所有隐藏输入

常问问题

为什么 GIS 门户使用旧式图像验证码?

政府 GIS 系统通常构建在现代验证码服务出现之前的遗留平台上。预算限制和漫长的采购周期意味着这些旧的验证码仍然存在。

我应该如何处理特定于县的验证码格式?

每个县可能使用不同的验证码实施方式。使用CaptchaAI的textinstructions参数来描述具体格式——例如“5个大写字母”或“求解数学方程”。

我可以提取验证码后面的 shapefile 或 GeoJSON 数据吗?

如果门户在验证码后面提供可下载的空间数据,请解决验证码以访问下载链接。 CaptchaAI 处理验证码;然后就可以正常下载文件了。

下一步

可靠地提取 GIS 数据 —获取您的 CaptchaAI API 密钥并自动处理政府门户验证码。


后续阅读

该文章已禁用评论。