Indeed、LinkedIn 和 Glassdoor 等招聘网站在检测到自动访问模式时会部署验证码。招聘平台、市场研究人员和人力资源分析工具需要可靠的验证码解决方案来大规模收集职位列表数据。
主要求职板上的验证码
| 平台 | 验证码类型 | 触发条件 | 可用数据 |
|---|---|---|---|
| Indeed | reCAPTCHA v2 | 高请求量 | 职位列表、薪资 |
| 领英 | Cloudflare 验证流程 | 机器人检测 | 职位、公司数据 |
| Glassdoor | reCAPTCHA v2 | 抓取检测 | 评论、薪资、职位 |
| ZipRecruiter | Cloudflare Turnstile | 自动访问 | 职位列表 |
| Monster | reCAPTCHA v2 | 搜索页面 | 职位列表 |
| CareerBuilder | reCAPTCHA v3 | 登录、搜索 | 职位列表、简历搜索 |
具有验证码处理功能的求职板抓取工具
import requests
import time
import re
from bs4 import BeautifulSoup
# CaptchaAI API key sent with every solver request; the value here is a
# placeholder and must be replaced before running.
CAPTCHAAI_KEY = "YOUR_API_KEY"
# Base URL of the solver API; "/in.php" (submit) and "/res.php" (poll) are
# appended to it by solve_captcha.
CAPTCHAAI_URL = "https://ocr.captchaai.com"
def solve_captcha(method, sitekey, pageurl, **kwargs):
    """Submit a CAPTCHA task to the solver API and poll until it is solved.

    Args:
        method: Solver method name, sent as the ``method`` form field.
        sitekey: Site key taken from the challenge page, sent as
            ``googlekey``.
        pageurl: URL of the page that served the challenge.
        **kwargs: Extra form fields merged into the submit request; they
            override the default fields on a key collision.

    Returns:
        The solved token string reported by the API.

    Raises:
        RuntimeError: If the API rejects the task or reports a terminal
            error while polling.
        TimeoutError: If no answer arrives within 60 polls spaced 5 seconds
            apart.
        requests.RequestException: On network failures or HTTP error
            statuses.
    """
    # NOTE(review): every method is submitted with the site key under
    # "googlekey"; confirm against the API docs whether "turnstile" expects
    # a different field name (callers can override via **kwargs).
    data = {
        "key": CAPTCHAAI_KEY,
        "method": method,
        "googlekey": sitekey,
        "pageurl": pageurl,
        "json": 1,
    }
    data.update(kwargs)

    resp = requests.post(f"{CAPTCHAAI_URL}/in.php", data=data, timeout=30)
    resp.raise_for_status()
    submitted = resp.json()
    # A failed submit carries an error code in "request"; using it as a task
    # id would make every subsequent poll meaningless.
    if str(submitted.get("status")) != "1":
        raise RuntimeError(
            f"CAPTCHA submit failed: {submitted.get('request')}"
        )
    task_id = submitted["request"]

    for _ in range(60):
        time.sleep(5)
        result = requests.get(
            f"{CAPTCHAAI_URL}/res.php",
            params={
                "key": CAPTCHAAI_KEY,
                "action": "get",
                "id": task_id,
                "json": 1,
            },
            timeout=30,
        )
        result.raise_for_status()
        r = result.json()
        if str(r.get("status")) == "1":
            return r["request"]
        # Anything other than "not ready yet" is a terminal error code and
        # must not be handed back to the caller as if it were a token.
        if r.get("request") != "CAPCHA_NOT_READY":
            raise RuntimeError(f"CAPTCHA solve failed: {r.get('request')}")
    raise TimeoutError("Solve timeout")
class JobBoardScraper:
    """Scrape paginated job-search results, solving CAPTCHAs when they appear.

    All requests go through one ``requests.Session`` so cookies, headers and
    the optional proxy are shared across pages.
    """

    def __init__(self, proxy=None):
        """Create the HTTP session.

        Args:
            proxy: Optional proxy URL applied to both HTTP and HTTPS traffic.
        """
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        })

    def search_jobs(self, base_url, query, location, pages=5):
        """Search job listings across multiple pages.

        Args:
            base_url: Site root; ``/jobs`` and the query string are appended.
            query: Search keywords.
            location: Location filter.
            pages: Number of result pages to fetch (10 results per page
                offset).

        Returns:
            A list of job dicts as produced by ``_parse_listings``.
        """
        from urllib.parse import urlencode

        all_jobs = []
        for page in range(pages):
            # Encode the parameters so characters such as "&", "#" or "+"
            # in the query/location cannot corrupt the URL.
            params = urlencode({"q": query, "l": location, "start": page * 10})
            url = f"{base_url}/jobs?{params}"
            resp = self.session.get(url, timeout=30)

            # Check for CAPTCHA
            if self._has_captcha(resp.text):
                resp = self._solve_and_retry(resp.text, url)

            if resp.status_code == 200:
                jobs = self._parse_listings(resp.text)
                all_jobs.extend(jobs)
                print(f"Page {page + 1}: {len(jobs)} jobs found")
            else:
                print(f"Page {page + 1}: Request failed ({resp.status_code})")

            time.sleep(3)  # Rate limit
        return all_jobs

    def _has_captcha(self, html):
        """Return True if the HTML contains any known CAPTCHA marker.

        Matching is case-insensitive.
        """
        indicators = [
            'data-sitekey=',
            'g-recaptcha',
            'cf-turnstile',
            'captcha-delivery',
        ]
        # Lower-case once instead of once per indicator.
        lowered = html.lower()
        return any(ind in lowered for ind in indicators)

    def _solve_and_retry(self, html, url):
        """Solve the challenge found in ``html`` and re-request ``url``.

        If a site key is found, the solved token is POSTed back to ``url``;
        otherwise the page is simply fetched again.

        Returns:
            The ``requests.Response`` of the retry.
        """
        # Match case-insensitively and accept either quote style so this
        # agrees with the case-insensitive detection in _has_captcha.
        match = re.search(
            r'data-sitekey=["\']([^"\']+)["\']', html, re.IGNORECASE
        )
        if match:
            sitekey = match.group(1)

            # Detect Turnstile vs reCAPTCHA
            if 'cf-turnstile' in html.lower():
                token = solve_captcha("turnstile", sitekey, url)
                field = "cf-turnstile-response"
            else:
                token = solve_captcha("userrecaptcha", sitekey, url)
                field = "g-recaptcha-response"

            return self.session.post(url, data={field: token}, timeout=30)

        # No site key to solve: retry the plain request, with the same
        # timeout as the first attempt so it cannot hang indefinitely.
        return self.session.get(url, timeout=30)

    def _parse_listings(self, html):
        """Extract job cards from a results page.

        Returns:
            A list of dicts with ``title``, ``company``, ``location``,
            ``salary`` and ``url`` keys; missing optional fields are ``""``.
        """
        soup = BeautifulSoup(html, "html.parser")
        jobs = []
        # The two card selectors can match nested elements of the same
        # listing; track title elements already emitted to avoid duplicates.
        seen_titles = set()

        for card in soup.select(".job_seen_beacon, .jobsearch-ResultsList > li"):
            title_el = card.select_one("h2 a, .jobTitle a")
            if title_el is None or id(title_el) in seen_titles:
                continue
            seen_titles.add(id(title_el))

            company_el = card.select_one(".companyName, [data-testid='company-name']")
            location_el = card.select_one(".companyLocation, [data-testid='text-location']")
            salary_el = card.select_one(".salary-snippet, .estimated-salary")

            jobs.append({
                "title": title_el.get_text(strip=True),
                "company": company_el.get_text(strip=True) if company_el else "",
                "location": location_el.get_text(strip=True) if location_el else "",
                "salary": salary_el.get_text(strip=True) if salary_el else "",
                "url": title_el.get("href", ""),
            })
        return jobs
# Usage: build a proxied scraper, fetch ten result pages for one search,
# then report how many listings were collected.
scraper = JobBoardScraper(proxy="http://user:pass@residential.proxy.com:5000")
jobs = scraper.search_jobs(
    "https://jobs.example.com",
    "python developer",
    "New York",
    pages=10,
)
print(f"Total jobs collected: {len(jobs)}")
薪资数据收集
import csv
def collect_salary_data(
    titles,
    locations,
    output_file,
    base_url="https://jobs.example.com",
    proxy="http://user:pass@residential.proxy.com:5000",
):
    """Collect salary data across job titles and locations.

    Runs one three-page search per (title, location) pair and writes a
    summary row for each pair to a CSV file.

    Args:
        titles: Job titles to search for.
        locations: Locations to combine with every title.
        output_file: Path of the CSV file to write.
        base_url: Job board root URL passed to the scraper.
        proxy: Proxy URL used for the scraper session.

    Returns:
        The list of row dicts that was written. Successful rows carry
        ``listings``, ``with_salary`` and ``salary_samples``; failed rows
        carry ``error`` instead.
    """
    scraper = JobBoardScraper(proxy=proxy)
    results = []

    for title in titles:
        for location in locations:
            # Best-effort batch: one failing search is recorded as an error
            # row so the remaining combinations still run.
            try:
                jobs = scraper.search_jobs(base_url, title, location, pages=3)
                salaries = [j["salary"] for j in jobs if j["salary"]]
                results.append({
                    "title": title,
                    "location": location,
                    "listings": len(jobs),
                    "with_salary": len(salaries),
                    "salary_samples": "; ".join(salaries[:5]),
                })
                time.sleep(5)
            except Exception as e:
                results.append({
                    "title": title,
                    "location": location,
                    "error": str(e),
                })

    # Explicit UTF-8 so scraped text (currency symbols, non-ASCII titles)
    # does not depend on the platform's default encoding.
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=[
                "title",
                "location",
                "listings",
                "with_salary",
                "salary_samples",
                "error",
            ],
        )
        writer.writeheader()
        writer.writerows(results)
    return results
# Collect salary data for market analysis: three titles across four
# locations, written to a single CSV file.
collect_salary_data(
    ["Data Engineer", "ML Engineer", "DevOps Engineer"],
    ["San Francisco", "New York", "Austin", "Remote"],
    "salary_data.csv",
)
求职板的标准配置建议
| 技术 | 为什么它有帮助 |
|---|---|
| 轮换自有服务器基础设施 | 跨真实 IP 分发请求 |
| 页面之间有 3-5 秒的延迟 | 模仿人类浏览速度 |
| 每个会话一致的用户代理 | 避免特征不一致 |
| 接受 Cookie | 求职板通过 Cookie 跟踪会话 |
| 随机化搜索顺序 | 避免连续页面模式 |
| 每个域限制为 200 页/天 | 保持在检测阈值以下 |
故障排除
| 问题 | 原因 | 处理方式 |
|---|---|---|
| 每次搜索时都会显示验证码 | IP 已标记或速率超出 | 切换IP,增加更长的延迟 |
| 结果页面为空 | 返回验证码块 | 解析前检测验证码 |
| “请确认你是人类” | 机器人检测已触发 | 使用自有服务器基础设施+真实UA |
| 薪资数据需要登录 | 平台门控内容 | 实施经过身份验证的会话 |
| 与浏览器的结果不同 | 位置/cookie差异 | 匹配接受语言和地理代理 |
常见问题
我每天可以抓取多少个职位列表?
通过轮换自有服务器基础设施和适当的延迟,每个域可以实现 500-2000 个页面,而无需遇到持久的验证码。
求职板会阻止抓取吗?
大多数招聘网站都有不鼓励自动访问的条款,但执行情况各不相同。验证码是他们的主要防御,由 CaptchaAI 处理。
哪种代理类型最适合招聘网站?
轮换自有服务器基础设施是成本和成功率的最佳平衡。数据中心 IP 经常被 LinkedIn 和 Glassdoor 屏蔽。
相关指南
- 轮换自有服务器基础设施
- 代理质量影响解决率
- 粘性会话与轮换会话
大规模收集就业市场数据——获取您的 CaptchaAI 密钥用于自动验证码解决。