# [Asm] plain-text view / copy-code (forum paste artifact, kept as a comment)
import requests
import time
import base64
import os
import sys
import io
import shutil # 彻底清理文件夹
import json
from PIL import Image, ImageStat # 引入清晰度计算模块
import ddddocr
# 终端输出防乱码
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# 初始化 OCR
ocr = ddddocr.DdddOcr(show_ad=False)
# 保持会话
session = requests.Session()
# ==========================================
# 获取当前 demo.py 所在的真实目录路径
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# 动态拼接子文件夹路径
TEST_DIR = os.path.join(BASE_DIR, "test")
BG_IMG_DIR = os.path.join(BASE_DIR, "bg_img")
def dissect_and_recognize(gif_bytes):
"""
既清空文件夹、保存每一帧,又自动评分锁定最清晰帧!
"""
img = Image.open(io.BytesIO(gif_bytes))
# 🌟 使用 os.path 动态路径
if os.path.exists(TEST_DIR):
shutil.rmtree(TEST_DIR)
print("🧹 已彻底清空旧的切片文件...")
os.makedirs(TEST_DIR, exist_ok=True)
print(f"🔪 开始解剖 GIF 并进行清晰度扫描... 共发现 {img.n_frames} 帧!")
best_frame = None
max_stddev = 0
best_index = 0
# 遍历每一帧:保存 + 打分
for i in range(img.n_frames):
img.seek(i)
# 1. 提取当前这一帧的真实面目 (白底防黑块)
raw_frame = img.copy().convert('RGBA')
white_bg = Image.new('RGBA', img.size, (255, 255, 255, 255))
white_bg.paste(raw_frame, (0, 0), mask=raw_frame)
final_frame = white_bg.convert('RGB')
# 🌟 下载功能:用 os.path.join 拼接文件名
final_frame.save(os.path.join(TEST_DIR, f"raw_frame_{i}.png"))
# 2. 核心算法:计算“像素标准差”作为清晰度评分
stat = ImageStat.Stat(final_frame.convert('L'))
stddev = stat.stddev[0]
print(f" ✅ 已保存第 {i} 帧切片, 清晰度评分: {stddev:.2f}")
# 3. 记录目前为止得分最高的那一帧
if stddev > max_stddev:
max_stddev = stddev
best_frame = final_frame
best_index = i
print(f"🎯 自动锁定最清晰的 [第 {best_index} 帧] 送给 OCR!")
# 贴心小功能:单独存一个特别的名字,方便你直接看!
best_frame.save(os.path.join(TEST_DIR, f"00_BEST_FRAME_is_{best_index}.png"))
# 转成二进制喂给 ddddocr
img_byte_arr = io.BytesIO()
best_frame.save(img_byte_arr, format='PNG')
text = ocr.classification(img_byte_arr.getvalue())
# 防误判逻辑:如果没认出或者太短,返回 None 触发重试
if not text or len(text) < 3:
print(f"⚠️ 抓到了最佳帧,但 OCR 没看懂 (结果: '{text}'),准备重新请求...")
return None
return text
def get_and_clean_captcha():
headers = {
"accept": "*/*",
"referer": "https://www.spiderdemo.cn/captcha/cap2_challenge/?challenge_type=cap2_challenge",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
cookies = {
"sessionid": "jitlmeznodbjh707v7y3uwqqpz23zm7d" # <-- 如果总是识别失败,记得去网页换个新的
}
url = "https://www.spiderdemo.cn/captcha/api/cap2_challenge/captcha_image/"
params = {"t": str(int(time.time() * 1000))}
response = session.get(url, headers=headers, cookies=cookies, params=params)
data = response.json()
t_raw = data.get('T', '')
f_raw = data.get('F', '')
t_b64 = t_raw.split(',')[-1] if ',' in t_raw else t_raw
f_b64 = f_raw.split(',')[-1] if ',' in f_raw else f_raw
# 挑出体积更大的真图
if len(t_b64) > len(f_b64):
real_base64 = t_b64
else:
real_base64 = f_b64
img_bytes = base64.b64decode(real_base64)
# [调试用] 🌟 使用动态路径保存原始动图
os.makedirs(BG_IMG_DIR, exist_ok=True)
with open(os.path.join(BG_IMG_DIR, "original.gif"), "wb") as f:
f.write(img_bytes)
return dissect_and_recognize(img_bytes)
def get_verify_Api(page, code):
headers = {
"accept": "application/json, text/javascript, */*; q=0.01",
"accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
"cache-control": "no-cache",
"content-type": "application/json",
"origin": "https://www.spiderdemo.cn",
"pragma": "no-cache",
"priority": "u=1, i",
"referer": "https://www.spiderdemo.cn/captcha/cap2_challenge/?challenge_type=cap2_challenge",
"sec-ch-ua": "\"Chromium\";v=\"146\", \"Not-A.Brand\";v=\"24\", \"Google Chrome\";v=\"146\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36",
"x-requested-with": "XMLHttpRequest"
}
cookies = {
"sessionid": "jitlmeznodbjh707v7y3uwqqpz23zm7d"
}
url = "https://www.spiderdemo.cn/captcha/api/cap2_challenge/page/"
data = {
"captcha_input": code,
"page_num": page,
"challenge_type": "cap2_challenge"
}
data = json.dumps(data, separators=(',', ':'))
response = requests.post(url, headers=headers, cookies=cookies, data=data).text
return json.loads(response)
if __name__ == '__main__':
total_num = 0
for page in range(1, 101):
max_retries = 10
final_code = None
# --- 验证码识别环节 ---
for i in range(max_retries):
print(f"\n--- 🚀 [第 {page} 页] 第 {i+1} 次尝试获取并识别验证码 ---")
code = get_and_clean_captcha()
if code:
final_code = code
break
else:
print("🔄 识别不佳,休息 1 秒后重试...")
time.sleep(1)
# --- 数据抓取环节 (只有拿到 final_code 才进来) ---
if final_code:
print("\n" + "=" * 40)
print(f"🏆 验证码通过: {final_code}")
print("=" * 40)
api_res = get_verify_Api(page, final_code)
if api_res and 'page_data' in api_res:
page_data = api_res.get('page_data')
# 进行求和
once_sum = sum(page_data)
total_num += once_sum
print(f"✅ 第 {page} 页抓取成功!本页和: {once_sum}")
print(f"💰 目前累计总分: {total_num}")
else:
print(f"❌ 第 {page} 页数据获取失败,服务器返回: {api_res}")
else:
# 走到这里说明 10 次都没认出来,为了不崩溃,我们只能跳过这一页
print(f"\n严重警告: 第 {page} 页连续 10 次识别失败,已跳过。")
continue
time.sleep(1)
print("\n" + "★" * 50)
print(f"🏁 100页全自动化收割完毕!")
print(f"🔥 最终全站总得分: {total_num}")
print("★" * 50)