# QQ Pinyin recommended-dictionary downloader (forum paste header removed)
import os
import time
import random
import json
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
# Print the program banner/usage notes and wait for the user to confirm start.
def show_intro():
    """Show the intro banner, then block until the user presses Enter."""
    divider = "=" * 60
    banner = [
        divider,
        " QQ输入法(官方推荐词库)下载工具 v1.0 By:52pojie_as8686886",
        divider,
        "功能说明:",
        "- 自动下载QQ输入法官方推荐的各类词库",
        "- 自动创建文件夹并保存词库文件",
        "- 内置错误重试机制,提高下载成功率",
        "- 模拟人类操作行为,降低被反爬虫机制拦截的概率",
        divider,
        "注意事项:",
        "- 下载过程可能需要较长时间,请保持网络连接稳定",
        "- 程序会自动处理Windows文件名中的非法字符",
        "- 如果遇到临时网络问题,程序会自动重试",
        divider,
    ]
    # One joined print produces the same stdout bytes as the original
    # line-by-line prints.
    print("\n".join(banner))
    input("按回车键开始下载,或按Ctrl+C退出程序...")
    print()
# Total number of listing pages to crawl.
TOTAL_PAGES = 51
sets = ['/', '\\', ':', '*', '?', '"', '<', '>', '|']  # characters not allowed in Windows file names; stripped from titles
# Pool of realistic browser User-Agent strings; one is chosen at random per
# request to make the traffic look less bot-like.
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
]
# Install a process-wide urllib opener with a random User-Agent.
def create_session():
    """Install a global opener so module-level urllib helpers
    (e.g. urllib.request.urlretrieve) send a browser-like User-Agent."""
    ua_header = ('User-agent', random.choice(user_agents))
    session_opener = urllib.request.build_opener()
    session_opener.addheaders = [ua_header]
    urllib.request.install_opener(session_opener)
# Request helper with retry/backoff
def request_with_retry(url, max_retries=3, delay=5):
    """Open *url* and return the urllib response object.

    Retries up to *max_retries* times with linearly growing backoff
    (delay * attempt_number seconds). A random 1-3 s pause precedes each
    attempt to mimic human pacing.

    Raises:
        urllib.error.URLError (or the last unexpected exception) once all
        retries are exhausted.
    """
    for attempt in range(max_retries):
        try:
            # Random user agent per attempt, browser-like headers.
            headers = {
                'User-Agent': random.choice(user_agents),
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                # BUG FIX: previously advertised 'gzip, deflate'. urllib does
                # not transparently decompress, and callers decode the raw
                # body with .read().decode('utf-8'), which crashes on a
                # gzipped response. Ask for an uncompressed body instead.
                'Accept-Encoding': 'identity',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
            }
            req = Request(url, headers=headers)
            # Random delay to mimic human behavior / avoid rate limits.
            time.sleep(random.uniform(1, 3))
            response = urlopen(req, timeout=30)
            return response
        except urllib.error.URLError as e:
            print(f"请求失败 (尝试 {attempt+1}/{max_retries}): {e.reason}")
            if attempt < max_retries - 1:
                wait_time = delay * (attempt + 1)
                print(f"等待 {wait_time} 秒后重试...")
                time.sleep(wait_time)
            else:
                print(f"已达到最大重试次数,放弃请求: {url}")
                raise
        except Exception as e:
            print(f"未知错误 (尝试 {attempt+1}/{max_retries}): {str(e)}")
            if attempt < max_retries - 1:
                wait_time = delay * (attempt + 1)
                print(f"等待 {wait_time} 秒后重试...")
                time.sleep(wait_time)
            else:
                print(f"已达到最大重试次数,放弃请求: {url}")
                raise
# Parse a QQ Pinyin dictionary listing page into structured entries.
def parse_qq_dict_page(html_content):
    """Extract dictionary entries from one listing page.

    Returns a list of dicts with keys 'name', 'download_url' and
    'dict_id' ('dict_id' is None when the link has no dict_id parameter).
    Items missing a title or download link are skipped silently; items
    that raise during parsing are reported and skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    results = []
    for block in soup.find_all('div', class_='summary'):
        try:
            # Dictionary title lives in <li class="title"><a>...</a></li>.
            title = block.find('li', class_='title')
            anchor = title.find('a') if title else None
            if anchor is None:
                continue
            # Download link: <a class="downloadICO" href="...">.
            link = block.find('a', class_='downloadICO')
            href = link.get('href') if link is not None else None
            if not href:
                continue
            # Relative download links are rooted at the cdict host.
            if href.startswith('/download?'):
                full_url = f"https://cdict.qq.pinyin.cn{href}"
            else:
                full_url = href
            # Pull dict_id out of the query string, if present.
            entry_id = None
            if 'dict_id=' in full_url:
                entry_id = full_url.split('dict_id=')[1].split('&')[0]
            results.append({
                'name': anchor.get_text(strip=True),
                'download_url': full_url,
                'dict_id': entry_id
            })
        except Exception as e:
            print(f"解析词库项时出错: {str(e)}")
            continue
    return results
# Report per-page download progress.
def show_progress(page, total_pages, count):
    """Print a one-line progress summary for the current page."""
    print(f"第{page}/{total_pages}页 - 已下载{count}个词库")
# Main program
def main():
    """Crawl all listing pages and download every recommended dictionary.

    Side effects: creates ./QQ输入法官方推荐词库/, writes .qcel files there,
    prints progress, and blocks on input() at start and end.
    """
    # Show the intro banner and wait for user confirmation.
    show_intro()
    # Initialize the global urllib opener (random User-Agent).
    create_session()
    # Create the download directory (idempotent).
    dir_name = "./QQ输入法官方推荐词库"
    os.makedirs(dir_name, exist_ok=True)
    total_count = 0
    print(f"\n开始下载QQ输入法官方推荐词库...")
    print(f"总共需要处理 {TOTAL_PAGES} 页")
    for page in range(1, TOTAL_PAGES + 1):
        try:
            # Build the page URL; page 1 has no &page= parameter.
            if page == 1:
                url = "https://cdict.qq.pinyin.cn/list?key=%E5%AE%98%E6%96%B9%E6%8E%A8%E8%8D%90"
            else:
                url = f"https://cdict.qq.pinyin.cn/list?key=%E5%AE%98%E6%96%B9%E6%8E%A8%E8%8D%90&page={page}"
            print(f"\n正在处理第 {page} 页...")
            # Fetch with retry/backoff.
            response = request_with_retry(url)
            html_content = response.read().decode('utf-8')
            # Parse the page into dictionary entries.
            dict_items = parse_qq_dict_page(html_content)
            if not dict_items:
                print(f"第 {page} 页未找到词库信息")
                # On the first page only, dump partial HTML for debugging
                # (helps diagnose site layout changes).
                if page == 1:
                    print("调试信息:检查页面结构...")
                    soup = BeautifulSoup(html_content, 'html.parser')
                    summaries = soup.find_all('div', class_='summary')
                    print(f"找到 {len(summaries)} 个summary元素")
                    if summaries:
                        print("第一个summary的内容:")
                        print(str(summaries[0])[:500] + "...")
                continue
            print(f"第 {page} 页找到 {len(dict_items)} 个词库")
            # Shuffle processing order to avoid a detectable access pattern.
            random.shuffle(dict_items)
            page_count = 0
            for item in dict_items:
                try:
                    # Strip characters illegal in Windows file names.
                    file_name = item['name']
                    for char in sets:
                        file_name = file_name.replace(char, "")
                    # Prefix with dict_id (when known) to avoid name clashes.
                    if item['dict_id']:
                        file_name = f"{item['dict_id']}_{file_name}"
                    # Target path for the .qcel dictionary file.
                    file_path = os.path.join(dir_name, f"{file_name}.qcel")
                    # Skip files that were already downloaded.
                    if os.path.exists(file_path):
                        print(f"✓ 文件已存在,跳过: {file_name}")
                        continue
                    # Download with up to 3 attempts.
                    download_success = False
                    for attempt in range(3):
                        try:
                            print(f"正在下载: {file_name}")
                            # The download link redirects; follow it first to
                            # obtain the real .qcel file URL.
                            download_response = request_with_retry(item['download_url'])
                            final_url = download_response.geturl()
                            if final_url.endswith('.qcel'):
                                urllib.request.urlretrieve(final_url, file_path)
                                download_success = True
                                break
                            else:
                                # Unexpected final URL — don't retry, it
                                # would resolve the same way again.
                                print(f"获取到的最终链接不是.qcel文件: {final_url}")
                                break
                        except Exception as e:
                            print(f"下载失败 (尝试 {attempt+1}/3): {str(e)}")
                            time.sleep(random.uniform(2, 5))
                    if download_success:
                        page_count += 1
                        total_count += 1
                        print(f"✓ 成功下载: {file_name}")
                    else:
                        print(f"✗ 下载失败,跳过: {file_name}")
                    # Random pause between downloads.
                    time.sleep(random.uniform(1, 3))
                except Exception as e:
                    print(f"处理词库时发生错误: {str(e)}")
                    continue
            # Report progress for this page.
            show_progress(page, TOTAL_PAGES, page_count)
        except Exception as e:
            print(f"处理页面时发生错误第{page}页: {str(e)}")
            # Back off longer after a page-level failure.
            time.sleep(random.uniform(5, 10))
            continue
    print("\n" + "=" * 60)
    print(f"所有下载任务完成!总共下载了 {total_count} 个词库")
    print("=" * 60)
    print("感谢使用QQ输入法(官方推荐词库)下载工具 v1.0")
    input("按回车键退出程序...")
# Script entry point: run main(), report Ctrl+C and unexpected errors.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n用户中断程序,退出下载...")
    except Exception as e:
        print(f"\n程序发生未预期错误: {str(e)}")