Last edited by pnnhnjh on 2025-4-23 22:42
The scraper for 弟子小说网 no longer works, so this version downloads from 新笔趣阁 (https://www.xbqg06.com) instead. It can be set to use up to 100 download threads, but that is not recommended; be careful not to get your IP banned! After starting the program, open the site in a browser, pick a novel you like, open its table-of-contents page (the chapter index), copy its URL (e.g. https://www.xbqg06.com/373303/), paste it at the input prompt and press Enter. Note: pressing Enter without typing anything starts a download of the example novel.
(3.22 update: added the GUI source code and a compiled executable for download)
(4.23 update: updated the GUI source code and executables, now covering three sites, and moved the delay/wait and data-cleaning settings into a configuration file)
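A typical run looks roughly like this (the book title, chapter names and timing are placeholders; the prompts match the ones printed by the script below):

Enter the novel's table-of-contents URL (default https://www.xbqg06.com/373303/):
Number of download threads (1-100, default 20): 20

Starting download of: 《……》

Finished chapter: 第一章 ……
Finished chapter: 第二章 ……
……
《……》 download finished
Total time: …… seconds
Done. The novel is saved in the “我的小说” folder. Press Enter to exit!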
The code below is really a template: anyone who knows a little Python can scrape novels from other sites by changing just the following lines (a hypothetical example follows the list):
default_url = 'https://www.xbqg06.com/373303/'  # first page of the novel's table of contents
book_name_xpath = '//h1/text()'  # book title
chapter_links_xpath = '(//ul[@class="section-list fix"])[2]/li/a/@href'  # chapter links on the table-of-contents page
chapter_links_start_number = 0  # index of the first chapter link to keep (used to skip any leading non-chapter links)
title_elements_xpath = '//h1/text()'  # chapter title on the content page
contents_xpath = '//div[@id="content"]/p/text()'  # chapter body text on the content page
directory_pages_xpath = '//option'  # links to the other table-of-contents pages; set to '' if the site has none
current_page_option_xpath = '//option[@selected="selected"]'  # currently selected table-of-contents page; set to '' if the site has none
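For example, for a hypothetical site that lists every chapter on a single table-of-contents page inside a div with class "listmain", the settings might look roughly like this (the URL and XPath expressions are invented for illustration; always check the actual HTML of the site you are targeting):

default_url = 'https://www.example-novels.com/book/12345/'  # hypothetical table-of-contents page
book_name_xpath = '//h1/text()'
chapter_links_xpath = '//div[@class="listmain"]//dd/a/@href'
chapter_links_start_number = 0
title_elements_xpath = '//h1/text()'
contents_xpath = '//div[@id="chaptercontent"]/text()'
directory_pages_xpath = ''  # single-page table of contents, so pagination detection is disabled
current_page_option_xpath = ''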
[Python]
import os
import re
import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin
from lxml import etree
from requests.adapters import HTTPAdapter
import chardet
import threading
USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
]
MAX_RETRIES = 10  # maximum number of retries per request
TIMEOUT = 5  # request timeout in seconds
def get_random_user_agent():
    """Pick a User-Agent string (rotates with the current second)"""
    return USER_AGENTS[int(time.time()) % len(USER_AGENTS)]
# One threading.local() object shared by all threads; each thread lazily stores its own Session on it.
_thread_local = threading.local()

def get_session():
    """Create an independent Session object for each thread (reused on later calls)"""
    if not hasattr(_thread_local, "session"):
        _thread_local.session = requests.Session()
        adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=MAX_RETRIES)
        _thread_local.session.mount('http://', adapter)
        _thread_local.session.mount('https://', adapter)
    return _thread_local.session
def decode_content(response):
"""统一处理响应内容的编码"""
detected = chardet.detect(response.content)
encodings = ['utf-8', 'gbk', 'gb2312', 'big5', 'gb18030']
if detected['confidence'] >= 0.7:
try:
return response.content.decode(detected['encoding'], errors='strict')
except UnicodeDecodeError:
pass
for enc in encodings:
try:
return response.content.decode(enc, errors='strict')
except UnicodeDecodeError:
continue
return response.content.decode(detected['encoding'], errors='replace')
def fetch_url(url, headers):
"""带有重试机制的请求函数"""
session = get_session()
for attempt in range(MAX_RETRIES):
try:
response = session.get(url, headers=headers, timeout=TIMEOUT)
            response.raise_for_status()  # raise for non-2xx HTTP status codes
return response
except requests.exceptions.RequestException as e:
if attempt == MAX_RETRIES - 1:
raise e
            time.sleep(1)  # wait a moment before retrying
def get_chaptercontent(chapter_url, index):
"""获取章节内容"""
headers = {
'User-Agent': get_random_user_agent(),
'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'Referer': chapter_url
}
all_content = []
title = ""
while chapter_url:
try:
response = fetch_url(chapter_url, headers)
html = decode_content(response)
selector = etree.HTML(html)
            # extract the chapter title and body paragraphs
title_elements = selector.xpath(title_elements_xpath)
contents = selector.xpath(contents_xpath)
all_content.extend([content.strip() for content in contents if content.strip()])
title = title_elements[0].strip() if title_elements else ""
            # strip a "(X / X)" page marker from the title
            title = re.sub(r'[((]?\s*\d+\s*/\s*\d+\s*[))]?', '', title).strip()
            # check whether the chapter continues on a "next page" ("下一页") link
next_page = selector.xpath('//a[contains(text(), "下一页")]/@href')
if next_page and next_page[0] != "javascript:":
chapter_url = urljoin(chapter_url, next_page[0])
else:
chapter_url = None
except Exception as e:
print(f"获取章节 {title} 时发生错误: {e}")
break
if not title or not all_content:
print(f"章节 {index} 获取失败")
return (index, None, "")
chaptercontent = "\n ".join(all_content)
    # Optional data cleaning: uncomment and adapt these patterns to strip
    # site-specific ads, watermarks or repeated headings from the chapter text.
    # chaptercontent = re.sub(r'一秒记住\s*.*?\s*无弹窗免费阅读!', '', chaptercontent, flags=re.S)
    # chaptercontent = re.sub(r'\(https.*?html\)', '', chaptercontent, flags=re.S)
    # chaptercontent = re.sub(r'[\s ]{0,6}第.{1,10}[部分章节卷页]{1,2}.{0,30}[\s \n]{0,6}', '', chaptercontent)
    # chaptercontent = re.sub(r'[\s ]{0,6}\d{1,5}.{1,30}[\s \n]{0,6}', '', chaptercontent)
    # chaptercontent = re.sub(r'[ ]{1,}', '', chaptercontent)
return (index, title, chaptercontent.strip())
def download_chapters(base_url, max_threads):
"""下载小说所有章节"""
headers = {'User-Agent': get_random_user_agent()}
all_chapter_links = []
    book_name = None  # will hold the novel's title once the first page is parsed
    first_directory_page = True  # True while we are still on the first table-of-contents page
while base_url:
try:
response = fetch_url(base_url, headers)
html = decode_content(response)
selector = etree.HTML(html)
if first_directory_page:
book_name = selector.xpath(book_name_xpath)[0].strip()
                print(f'\nStarting download of: 《{book_name}》\n')
first_directory_page = False
            # collect the chapter links on this table-of-contents page
chapter_links = selector.xpath(chapter_links_xpath)[chapter_links_start_number:]
all_chapter_links.extend(urljoin(base_url, href) for href in chapter_links)
            # some sites split the table of contents across several pages exposed as an <option> dropdown
            if directory_pages_xpath and current_page_option_xpath:  # skip pagination handling when these are left empty
directory_pages = [(urljoin(base_url, option.attrib['value']), option.text) for option in
selector.xpath(directory_pages_xpath)]
                # find the currently selected table-of-contents page
current_page_option = selector.xpath(current_page_option_xpath)
if current_page_option:
current_page_value = urljoin(base_url, current_page_option[0].attrib['value'])
current_page_text = current_page_option[0].text
                    print(f'Table-of-contents page: {current_page_text}')
                    # move on to the next table-of-contents page, if there is one
current_page_index = [page[0] for page in directory_pages].index(current_page_value)
if current_page_index + 1 < len(directory_pages):
base_url = directory_pages[current_page_index + 1][0]
else:
base_url = None
else:
print("未找到当前选中的目录页,停止抓取。")
break
else:
# print("目录页的xpath表达式为空,跳过目录检测。")
break
except Exception as e:
print(f"获取目录页时发生错误: {e}")
break
if not book_name:
print("无法获取书名,请检查URL和网页结构。")
return False
    save_dir = os.path.join(os.getcwd(), '我的小说')  # output folder ("我的小说" = "My Novels")
os.makedirs(save_dir, exist_ok=True)
output_path = os.path.join(save_dir, f'{book_name}.txt')
chapters = []
failed_chapters = []
def write_to_file():
chapters.sort(key=lambda x: x[0])
try:
with open(output_path, 'w', encoding='utf-8') as f:
                f.write(f'\n\n书名:{book_name}\n\n网址:{input_url}\n\n\n')  # file header: book title (书名) and source URL (网址)
for idx, title, content in chapters:
f.write(f"{title}\n\n{content}\n\n")
            if failed_chapters:
                print(f"\nThe following chapters failed to download: {failed_chapters}")
            print(f'\n《{book_name}》 download finished')
return True
except Exception as e:
print(f"写入文件时发生错误: {e}")
return False
success = True
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        # map each future back to its chapter index so failures can always be reported
        future_to_index = {executor.submit(get_chaptercontent, link, idx): idx
                           for idx, link in enumerate(all_chapter_links, 1)}
        for future in as_completed(future_to_index):
            idx = future_to_index[future]
            try:
                index, title, content = future.result()
                if title and content:
                    chapters.append((index, title, content))
                    print(f"Finished chapter: {title}")
                else:
                    failed_chapters.append(index)
            except Exception as e:
                print(f"Error while processing chapter {idx}: {e}")
                failed_chapters.append(idx)
                success = False
if not write_to_file():
success = False
return success
if __name__ == "__main__":
    default_url = 'https://www.xbqg06.com/373303/'  # first page of the novel's table of contents
    book_name_xpath = '//h1/text()'  # book title
    chapter_links_xpath = '(//ul[@class="section-list fix"])[2]/li/a/@href'  # chapter links on the table-of-contents page
    chapter_links_start_number = 0  # index of the first chapter link to keep (used to skip any leading non-chapter links)
    title_elements_xpath = '//h1/text()'  # chapter title on the content page
    contents_xpath = '//div[@id="content"]/p/text()'  # chapter body text on the content page
    directory_pages_xpath = '//option'  # links to the other table-of-contents pages; set to '' if the site has none
    current_page_option_xpath = '//option[@selected="selected"]'  # currently selected table-of-contents page; set to '' if the site has none
input_url = input(f"请输入小说目录页地址(默认 {default_url}): ") or default_url
while True:
threads_input = input("请输入并发线程数(1-100,默认20): ") or "20"
if threads_input.isdigit() and 1 <= int(threads_input) <= 100:
max_threads = int(threads_input)
break
print("输入无效,请输入1-100之间的整数")
start_time = time.time()
success = download_chapters(base_url=input_url, max_threads=max_threads)
elapsed = time.time() - start_time
if success:
print(f"总耗时: {elapsed:.2f}秒")
else:
print("下载过程中发生错误")
input("下载完成,小说保存在“我的小说”文件夹内,回车退出!")
Download link for the compiled command-line version:
Link: https://pan.baidu.com/s/1B00FRJS8yv4SNWRO9tvDEg
Extraction code: 52pj