好友
阅读权限10
听众
最后登录1970-1-1
|
我帮你优化了一下,主要改动如下。全局 Session + 自动重试:遇到网络波动或超时不会直接崩溃。更稳的正文抓取:同时匹配 id 和 class,支持更多网站。自动去广告:
内置广告过滤,正文更干净。
防重复机制增强
连续 3 章重复就自动停止,不会无限死循环。
空章节自动跳过
不会把空白内容写进文件。
防封 IP
可配置爬取间隔,默认 1.5 秒,非常安全。
编码更稳
不乱码、不报错。
下一章匹配更宽松
更容易找到下一章,不会半路断更。
import requests
from bs4 import BeautifulSoup
import re
import difflib
import time
from urllib.parse import urlparse, urljoin
from requests.adapters import HTTPAdapter
# ===================== Settings (adjust as needed) =====================
TIMEOUT = 15  # request timeout passed to every session.get() call
REQUEST_DELAY = 1.5  # seconds to sleep between page fetches (throttling)
MAX_RETRY = 3  # retry budget given to each HTTPAdapter mounted below
SIMILAR_THRESHOLD = 0.95  # similarity ratio at/above which content counts as a repeat
# ========================================================================

# Headers sent with every request: a desktop Chrome User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

# One shared Session for the whole script; a separate retrying adapter is
# mounted for each URL scheme.
session = requests.Session()
for _scheme_prefix in ("http://", "https://"):
    session.mount(_scheme_prefix, HTTPAdapter(max_retries=MAX_RETRY))
del _scheme_prefix  # keep the module namespace identical to the two-line form
# Ad-filter rules, listed as (regex source, flags) and compiled once at import.
# NOTE(review): a trailing lazy `.*?` can always match the empty string, so the
# keyword rules remove only the keyword span itself, not the rest of the line.
ad_patterns = [
    re.compile(source, flags)
    for source, flags in (
        (r'请收藏.*?网址.*?', re.I),
        (r'最新网址.*?', re.I),
        (r'手机版.*?访问.*?', re.I),
        (r'广告.*?', re.I),
        (r'vip会员.*?充值.*?', re.I),
        (r'『.*?』', re.U),
        (r'【.*?】', re.U),
    )
]
def clean_ads(text):
    """Return ``text`` with every match of every ad pattern removed.

    The patterns in ``ad_patterns`` are applied one after another, each on
    the output of the previous one.
    """
    cleaned = text
    for pattern in ad_patterns:
        cleaned = pattern.sub('', cleaned)
    return cleaned
# Content-similarity check
def is_content_similar(content1, content2, threshold=SIMILAR_THRESHOLD):
    """Return True when two texts are near-duplicates, ignoring whitespace.

    Args:
        content1: First text.
        content2: Second text.
        threshold: Minimum ``difflib.SequenceMatcher`` ratio for the two
            texts to count as similar.

    Returns:
        False if either text is empty once all whitespace is removed;
        otherwise whether the similarity ratio reaches ``threshold``.
    """
    clean1 = re.sub(r'\s+', '', content1)
    clean2 = re.sub(r'\s+', '', content2)
    if not clean1 or not clean2:
        return False
    if clean1 == clean2:
        # Identical text has ratio 1.0 by definition; skip the costly diff.
        return 1.0 >= threshold
    matcher = difflib.SequenceMatcher(None, clean1, clean2)
    # real_quick_ratio() is a length-only upper bound on ratio(). Using it as
    # the cheap pre-filter keeps the early exit consistent with whatever
    # threshold the caller passed, instead of a fixed 10% length difference
    # that wrongly rejected pairs when a lower threshold was requested.
    if matcher.real_quick_ratio() < threshold:
        return False
    return matcher.ratio() >= threshold
# Extract the chapter title
# Heading pattern: "第" + Arabic or Chinese numerals + a unit character, then
# the remainder of that same line. The numeral class includes 零/〇/两/万 so
# headings such as "第一百零一章" are recognised.
_CHAPTER_TITLE_RE = re.compile(
    r"第[0-9〇零一二两三四五六七八九十百千万]+[章节回页][^\n]*"
)


def get_chapter_title(page_text):
    """Return the first chapter heading found in ``page_text``.

    The match never crosses a line break, so a heading that stands alone on
    its line does not pull the following line of body text into the title.

    Args:
        page_text: Plain text of the page.

    Returns:
        The matched heading with surrounding whitespace stripped, or
        "未知章节" when no heading is found.
    """
    match = _CHAPTER_TITLE_RE.search(page_text)
    if match:
        return match.group().strip()
    return "未知章节"
# Fetch a single chapter page
# A <div> whose id (preferred) or class matches this is taken as the body.
_CONTENT_HINT_RE = re.compile(r"content|chapter|text|booktext|novel", re.I)
# Link texts that indicate "next", tried in this priority order.
_NEXT_LINK_WORDS = ("下一章", "下一节", "下一页", "后一页", "下页", "→", ">>", "》", "下")


def _extract_content(soup):
    """Return the cleaned chapter text from the first matching <div>, or ""."""
    content_div = soup.find("div", attrs={"id": _CONTENT_HINT_RE})
    if content_div is None:
        content_div = soup.find("div", attrs={"class": _CONTENT_HINT_RE})
    if content_div is None:
        return ""
    text = content_div.get_text(separator="\n", strip=True)
    text = clean_ads(text)
    # Collapse any run of newlines into exactly one blank line.
    return re.sub(r'\n+', '\n\n', text).strip()


def _find_next_url(soup, page_url):
    """Return the absolute URL of the first usable "next" link, or None."""
    for word in _NEXT_LINK_WORDS:
        candidates = soup.find_all("a", string=lambda s: s and word in str(s).strip())
        for a_tag in candidates:
            href = (a_tag.get("href") or "").strip()
            # Empty, fragment-only and javascript: links cannot be fetched.
            if not href or href.startswith("#") or href.lower().startswith("javascript:"):
                continue
            # Resolve against the page itself so relative links such as
            # "2.html" stay inside the book's directory.
            resolved = urljoin(page_url, href)
            if resolved == page_url:
                continue  # a link back to the same page is not a next page
            return resolved
    return None


def get_chapter(url, base_url):
    """Download one chapter page and extract its title, text and next link.

    Args:
        url: Address of the chapter page.
        base_url: Retained for backward compatibility and no longer used;
            links are resolved against the URL of the fetched page.

    Returns:
        A ``(chapter_title, content, next_url)`` tuple. ``content`` is ""
        when no body container is found and ``next_url`` is None when no
        usable next link is found. If anything raises (including an HTTP
        error status), the error is printed and ``("异常章节", "", None)``
        is returned.
    """
    try:
        resp = session.get(url, headers=headers, timeout=TIMEOUT)
        # Treat 4xx/5xx responses as failures instead of parsing error pages.
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding or "utf-8"
        soup = BeautifulSoup(resp.text, "html.parser")
        chapter_title = get_chapter_title(soup.get_text())
        content = _extract_content(soup)
        # resp.url is the final address after any redirects.
        next_url = _find_next_url(soup, resp.url or url)
        return chapter_title, content, next_url
    except Exception as e:
        print(f"抓取异常:{str(e)}")
        return "异常章节", "", None
# Locate the first chapter link
# Keyword rules tried in priority order against each link text. The preface
# rule is anchored at the start of the text because a bare "序" is also a
# substring of sort controls ("倒序", "排序") and of unrelated chapter titles.
_FIRST_CHAPTER_RULES = (
    re.compile(r"第一章"),
    re.compile(r"第一节"),
    re.compile(r"楔子"),
    re.compile(r"^(?:自|代|总)?序"),
    re.compile(r"前言"),
    re.compile(r"第1章"),
)
# Fallback: any link that looks like a numbered chapter or section.
_ANY_CHAPTER_RE = re.compile(r"第[0-9一二三四五六七八九十]+[章节]", re.I)


def find_first_chapter(soup, base_url):
    """Return the absolute URL of the first chapter linked from a catalog page.

    Links without a usable ``href`` are ignored. Each keyword rule is tried
    over all links in document order before moving to the next rule; if none
    matches, the first link that looks like a numbered chapter is used.

    Args:
        soup: Parsed catalog page; must provide ``find_all("a")``.
        base_url: URL that link hrefs are resolved against. Relative hrefs
            only resolve correctly if this is the catalog page's own URL.

    Returns:
        The resolved URL, or None when no candidate link is found.
    """
    links = []
    for a in soup.find_all("a"):
        href = (a.get("href") or "").strip()
        if href:
            links.append((a.get_text(strip=True), href))

    for rule in _FIRST_CHAPTER_RULES:
        for text, href in links:
            if text and rule.search(text):
                return urljoin(base_url, href)

    for text, href in links:
        if _ANY_CHAPTER_RE.search(text):
            return urljoin(base_url, href)
    return None
def main():
    """Interactively download a novel into ``<book title>.txt``.

    Prompts for the catalog page URL, locates the first chapter, then follows
    "next" links page by page, appending each chapter to the output file.
    The crawl stops when there is no next link, when a URL comes up a second
    time, or after three consecutive near-duplicate pages. Empty and
    duplicate pages are skipped rather than written.
    """
    index_url = input("请输入小说目录页URL:").strip()
    if not index_url:
        print("URL不能为空")
        return
    try:
        resp = session.get(index_url, headers=headers, timeout=TIMEOUT)
        resp.encoding = resp.apparent_encoding or "utf-8"
        soup = BeautifulSoup(resp.text, "html.parser")
    except Exception as e:
        print(f"目录页获取失败:{e}")
        return

    # Build a filesystem-safe book title; fall back to the default name when
    # nothing is left after removing forbidden and control characters.
    raw_title = soup.title.get_text(strip=True) if soup.title else "小说"
    book_title = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "", raw_title).strip() or "小说"
    print("书名:", book_title)

    # Resolve catalog links against the catalog page itself (after redirects)
    # so relative hrefs such as "1.html" stay inside the book's directory.
    current_url = find_first_chapter(soup, resp.url or index_url)
    if not current_url:
        print("未找到第一章")
        return

    filename = f"{book_title}.txt"
    max_repeats = 3  # consecutive duplicate pages tolerated before stopping
    last_title = ""
    last_content = ""
    repeat_stop_count = 0  # consecutive duplicate counter
    visited = set()  # URLs already fetched, to break link cycles

    with open(filename, "w", encoding="utf-8") as f:
        f.write(book_title + "\n\n")
        print("开始下载...\n")
        while current_url:
            if current_url in visited:
                print("🛑 检测到循环链接,自动停止")
                break
            visited.add(current_url)

            # The page's own URL is passed as the base so that relative
            # "next" links resolve correctly.
            chapter_title, content, next_url = get_chapter(current_url, current_url)
            current_url = next_url

            if not content:
                # Empty-content guard: nothing to write for this page.
                print(f"【跳过空内容】{chapter_title}")
            elif is_content_similar(last_content, content):
                # Duplicate of the last written page: count it and skip it.
                repeat_stop_count += 1
                print(f"⚠️ 检测到重复内容,连续重复 {repeat_stop_count}/{max_repeats}")
                if repeat_stop_count >= max_repeats:
                    print("🛑 连续重复过多,自动停止")
                    break
            else:
                repeat_stop_count = 0
                print(f"已下载:{chapter_title}")
                # A chapter split over several pages keeps a single heading.
                if chapter_title != last_title:
                    f.write(chapter_title + "\n\n")
                f.write(content + "\n\n\n")
                f.flush()  # keep finished chapters on disk if the run is interrupted
                last_title = chapter_title
                last_content = content

            # Throttle between requests; no need to wait after the last page.
            if current_url:
                time.sleep(REQUEST_DELAY)
    print("\n✅ 全部下载完成!")
# Script entry point: run the downloader only when executed directly.
if __name__ == "__main__":
    main()
|