吾爱破解 - 52pojie.cn

Views: 274 | Replies: 14

[Solved] How should I change this Python code to download a novel from a website?

tvu2003 posted on 2025-3-15 10:25
This post was last edited by tvu2003 on 2025-3-16 22:36

The novel is at https://www.paozww.com/biquge/414243/. Here is the Python code:
[Python]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
import requests
import os
import random
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

# Target book URL
BOOK_URL = "https://www.paozww.com/biquge/414243/"

# Enhanced request headers
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Referer": BOOK_URL,
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache"
}

DOWNLOAD_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "一夕千悟")

def get_book_info():
    """Fetch book metadata, with retries."""
    session = requests.Session()
    try:
        # Retry up to 3 times
        for _ in range(3):
            response = session.get(BOOK_URL, headers=HEADERS, timeout=20)
            if response.status_code == 200:
                break
            time.sleep(2)
        else:
            raise Exception(f"Failed to fetch page, status code: {response.status_code}")

        response.encoding = 'gb18030'  # use a more comprehensive Chinese encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract metadata
        return {
            "title": soup.find('meta', {'property': 'og:title'})['content'].split('_')[0],
            "author": soup.find('meta', {'property': 'og:novel:author'})['content'],
            "session": session,
            "chapter_list": soup.select("#list > dl > dd > a")[12:]  # skip the first 12 pseudo-chapters
        }
    except Exception as e:
        print(f"Failed to get book info: {e}")
        return None

def enhanced_downloader(args):
    """Download a single chapter."""
    chapter, session, index = args
    title = chapter.text.strip()
    url = chapter['href'] if chapter['href'].startswith('http') else f"https://www.paozww.com{chapter['href']}"

    # Anti-scraping mitigation: exponentially distributed delay
    time.sleep(random.expovariate(1/1.5))

    try:
        # Vary the request headers slightly per request
        dynamic_headers = HEADERS.copy()
        dynamic_headers.update({
            "Referer": BOOK_URL,
            "X-Requested-With": "XMLHttpRequest" if random.random() > 0.7 else ""
        })

        # Request with status validation
        response = session.get(url, headers=dynamic_headers, timeout=25)
        if response.status_code != 200:
            return (index, title, f"【HTTP error: {response.status_code}】")

        response.encoding = 'gb18030'
        soup = BeautifulSoup(response.text, 'html.parser')

        # Content selector
        content_div = soup.find('div', id='content')
        if not content_div:
            return (index, title, "【content structure error】")

        # Content cleanup
        cleaned = []
        for elem in content_div.contents:
            if elem.name == 'div' and 'alert' in elem.get('class', []):
                continue  # skip ad divs
            text = elem.get_text(strip=True)
            if text and len(text) > 10:  # filter out short-text ads
                cleaned.append(f"  {text}\n")

        return (index, title, ''.join(cleaned))

    except Exception as e:
        return (index, title, f"【system error: {e}】")

def main():
    """Run the download job."""
    print("Initializing...")
    book_info = get_book_info()
    if not book_info:
        return

    # Create the download directory
    os.makedirs(DOWNLOAD_PATH, exist_ok=True)
    filename = f"{book_info['title']} - {book_info['author']}.txt"
    filepath = os.path.join(DOWNLOAD_PATH, filename)

    print(f"\nDownloading: {book_info['title']}")
    print(f"Total chapters: {len(book_info['chapter_list'])}")

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"《{book_info['title']}》\nAuthor: {book_info['author']}\n\n")

        # Thread pool with low concurrency for stability
        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = []
            for idx, ch in enumerate(book_info['chapter_list']):
                if idx % 10 == 0:
                    time.sleep(5)  # rest every 10 chapters
                futures.append(executor.submit(enhanced_downloader, (ch, book_info['session'], idx)))

            # Progress monitoring
            success = 0
            for i, future in enumerate(as_completed(futures), 1):
                idx, title, content = future.result()
                f.write(f"\n\nChapter {idx+1}: {title}\n{content}")

                # Count the success before printing it
                success += 0 if "【" in content else 1
                progress = i / len(book_info['chapter_list']) * 100
                status = "✓" if "【" not in content else "✗"
                print(f"\r[{status}] Progress: {progress:.1f}% | Succeeded: {success}", end='')

    print(f"\n\nDone! Saved to: {filepath}")

if __name__ == "__main__":
    main()
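One further issue worth noting, separate from the scraping failure: `as_completed` yields futures in completion order, so the loop above writes chapters to the file out of book order. A minimal sketch (with a dummy downloader standing in for `enhanced_downloader`) of collecting all results first and sorting by chapter index before writing:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import time

def fake_downloader(args):
    """Stand-in for enhanced_downloader: returns (index, title, content)."""
    index, title = args
    time.sleep(random.uniform(0, 0.05))  # threads finish in arbitrary order
    return (index, title, f"body of {title}")

chapters = [(i, f"chapter-{i}") for i in range(8)]

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(fake_downloader, ch) for ch in chapters]
    results = [f.result() for f in as_completed(futures)]  # completion order

# Restore book order before writing anything to disk
results.sort(key=lambda r: r[0])
assert [r[0] for r in results] == list(range(8))
```

This costs some memory (all chapters are held before writing) but guarantees the output file reads in order regardless of which thread finishes first.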


Pablo posted on 2025-3-15 12:55
Is the problem garbled text, or that nothing downloads?
lengbingling posted on 2025-3-15 13:05
Here is the chapter-link text first:

下载网站的小说表格.txt

35.77 KB, downloads: 12, download cost: -1 CB (吾爱币)

OP | tvu2003 posted on 2025-3-15 16:28
@Pablo The above is what happens after running it: it can't scrape any novel chapters, which is why I'm asking.
55.png
MrChen001 posted on 2025-3-15 19:16
Try changing gb18030 to gbk or utf-8. It has to match the Chinese encoding the site actually uses, and very few sites use gb18030.
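MrChen001's point can be demonstrated offline, without touching the site: if the server actually sends UTF-8 bytes but the script forces gb18030, Chinese text decodes into mojibake rather than raising an error, so the mistake is silent. A minimal sketch (the sample string is illustrative, not taken from the site):

```python
# A UTF-8 page body containing Chinese text
page_bytes = "第一章 测试内容".encode("utf-8")

# Correct decode recovers the original text
assert page_bytes.decode("utf-8") == "第一章 测试内容"

# Forcing gb18030 on UTF-8 bytes silently produces garbage instead of an error
garbled = page_bytes.decode("gb18030", errors="replace")
assert garbled != "第一章 测试内容"
```

Rather than hard-coding an encoding, one can leave `response.encoding` alone (requests uses the charset from the HTTP header) or consult `response.apparent_encoding`, which guesses the encoding from the bytes themselves.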
Arcticlyc posted on 2025-3-15 19:20

response.encoding = "gb18030"  # use a more comprehensive Chinese encoding

Delete this line.
OP | tvu2003 posted on 2025-3-15 19:38
@Arcticlyc Thanks, brother, the garbled text is gone. It still can't fetch the novel content though, maybe because of the site's anti-scraping; I don't know where it's going wrong.
22.png
Arcticlyc posted on 2025-3-15 20:19
(quoting tvu2003, 2025-3-15 19:38)

It's probably not anti-scraping; the code likely has a bug.
OP | tvu2003 posted on 2025-3-15 20:21
The code doesn't report any errors; it just can't extract the novel content...
马了顶大 posted on 2025-3-16 08:14
(quoting tvu2003, 2025-3-15 20:21)

"【内容结构异常】" ("content structure error") is a message printed by your own code, so the problem is in the parsing: the content selector doesn't match the page.
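马了顶大's diagnosis means `soup.find('div', id='content')` is returning None because the chapter page does not use that id. One forgiving approach is to try several candidate selectors in order. The sample HTML and the candidate ids below are hypothetical; the real id on paozww.com must be confirmed in the browser's dev tools (right-click the chapter text, then Inspect):

```python
from bs4 import BeautifulSoup

# Hypothetical chapter-page snippet; check the real structure in dev tools
html = """
<div id="chaptercontent" class="Readarea">
  First paragraph of the chapter.<br/><br/>
  Second paragraph of the chapter.
</div>
"""
soup = BeautifulSoup(html, "html.parser")

# Try a list of candidate selectors instead of hard-coding one id
content_div = None
for sel in ["#content", "#chaptercontent", "#booktxt", "div.Readarea"]:
    content_div = soup.select_one(sel)
    if content_div:
        break

assert content_div is not None
assert "First paragraph" in content_div.get_text()
```

Printing `response.text[:500]` when no selector matches also helps: it shows whether the server returned the chapter at all or an anti-bot page.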