吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 7310|回复: 63
上一主题 下一主题
收起左侧

[Python 原创] (4.23更新下载,包括三个网站图形界面源码和exe文件下载)新笔趣阁小说爬取程序

  [复制链接]
跳转到指定楼层
楼主
pnnhnjh 发表于 2025-3-1 22:26 回帖奖励
本帖最后由 pnnhnjh 于 2025-4-23 22:42 编辑

弟子小说网小说爬取程序已经不能使用,改为新笔趣阁小说(https://www.xbqg06.com)下载,可设置100线程下载,但不建议,小心被封IP!运行后打开网站,选取你喜欢的小说,打开小说的目录页面(小说目录页),复制网址(如:https://www.xbqg06.com/373303/)后粘贴到输入提示窗口回车即可。注:不输入任何内容直接回车则开始示例小说下载!




(3.22更新下载,添加图形界面源码和可执行文件下载)
(4.23更新图形界面源码和可执行文件下载,包括三个网站,修改延迟等待、数据清洗等设置到配置文件)


后面的代码实际上是一个模板,略懂 Python 的朋友只需修改以下几行,就可以爬取别的网站的小说了!
default_url = 'https://www.xbqg06.com/373303/'  # 小说目录页第一页

book_name_xpath = '//h1/text()'  # 小说书名
chapter_links_xpath = '(//ul[@class="section-list fix"])[2]/li/a/@href'  # 小说目录页章节链接
chapter_links_start_number = 0  # 小说目录页章节开始序号
title_elements_xpath = '//h1/text()'  # 小说内容页标题
contents_xpath = '//div[@id="content"]/p/text()'  # 小说内容页内容

directory_pages_xpath = '//option'  # 小说目录页目录链接,如果没有,请设置为空
current_page_option_xpath = '//option[@selected="selected"]'  # 小说目录页当前页名称,如果没有,请设置为空


[Python] 纯文本查看 复制代码
import os
import re
import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin
from lxml import etree
from requests.adapters import HTTPAdapter
import chardet
import threading

# Pool of browser User-Agent strings; get_random_user_agent() picks one of these
# for the request headers.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
]

# Maximum number of attempts per URL in fetch_url(); the same value is also passed
# to HTTPAdapter(max_retries=...) in get_session().
MAX_RETRIES = 10
# Per-request timeout handed to session.get(timeout=...); requests interprets it in seconds.
TIMEOUT = 5


def get_random_user_agent():
    """Return one entry of USER_AGENTS, selected by the current Unix time in seconds.

    The choice is not truly random: every call made within the same second
    yields the same User-Agent string.
    """
    slot = int(time.time()) % len(USER_AGENTS)
    return USER_AGENTS[slot]


# The thread-local container has to live at module scope. Creating it inside
# get_session() would produce a brand-new, empty container on every call, so the
# cached session would never be found and a new Session would be built per request.
_thread_local = threading.local()


def get_session():
    """Return the requests.Session owned by the calling thread, creating it on first use.

    Each thread gets its own Session. The Session is mounted with an HTTPAdapter
    (pool size 100, ``max_retries=MAX_RETRIES``) for both http:// and https://,
    and is reused by every later call from the same thread.

    Returns:
        requests.Session: the calling thread's session.
    """
    session = getattr(_thread_local, "session", None)
    if session is None:
        session = requests.Session()
        adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=MAX_RETRIES)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        _thread_local.session = session
    return session


def decode_content(response):
    """Decode the body of an HTTP response to text, guessing the encoding.

    Order of attempts (all strict):
      1. the encoding reported by chardet, if its confidence is at least 0.7;
      2. utf-8, gbk, gb2312, big5, gb18030.
    If every strict attempt fails, the body is decoded leniently (undecodable
    bytes are replaced) with chardet's guess, or with utf-8 when chardet has no
    usable guess.

    Args:
        response: object exposing the raw body as ``response.content`` (bytes).

    Returns:
        str: the decoded text.
    """
    raw = response.content
    detected = chardet.detect(raw)
    # chardet may report encoding=None (e.g. for empty or undetectable input).
    guessed = detected.get('encoding')
    confidence = detected.get('confidence') or 0.0

    candidates = []
    if guessed and confidence >= 0.7:
        candidates.append(guessed)
    candidates.extend(['utf-8', 'gbk', 'gb2312', 'big5', 'gb18030'])

    for enc in candidates:
        try:
            return raw.decode(enc, errors='strict')
        except (UnicodeDecodeError, LookupError):
            # LookupError: chardet named a codec Python does not know.
            continue

    # Last resort: never raise, replace undecodable bytes instead.
    try:
        return raw.decode(guessed or 'utf-8', errors='replace')
    except LookupError:
        return raw.decode('utf-8', errors='replace')


def fetch_url(url, headers):
    """GET ``url`` with the calling thread's session, retrying on request errors.

    Up to MAX_RETRIES attempts are made, one second apart. Non-2xx responses
    count as failures because ``raise_for_status()`` is called on each response.

    Args:
        url: address to fetch.
        headers: dict of HTTP headers sent with the request.

    Returns:
        The successful response object.

    Raises:
        requests.exceptions.RequestException: the error from the final attempt.
    """
    session = get_session()
    # Count down so that 0 marks the final attempt.
    for remaining in range(MAX_RETRIES - 1, -1, -1):
        try:
            response = session.get(url, headers=headers, timeout=TIMEOUT)
            response.raise_for_status()  # treat HTTP error statuses as failures
        except requests.exceptions.RequestException as e:
            if remaining == 0:
                raise e
            time.sleep(1)  # short pause before the next attempt
        else:
            return response


def get_chaptercontent(chapter_url, index):
    """Download one chapter, following its "下一页" (next page) links.

    The page title and paragraphs are extracted with the module-level
    ``title_elements_xpath`` and ``contents_xpath`` expressions. Paragraphs from
    all pages of the chapter are concatenated in order.

    Args:
        chapter_url: URL of the first page of the chapter.
        index: position of the chapter in the book; returned unchanged so the
            caller can restore the original order.

    Returns:
        tuple: ``(index, title, text)`` on success, or ``(index, None, "")`` when
        no title or no content could be extracted. If an error occurs part-way
        through, whatever was collected before the error is returned.
    """
    headers = {
        'User-Agent': get_random_user_agent(),
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Referer': chapter_url
    }

    all_content = []
    title = ""
    visited = set()  # pages already fetched; stops a "next page" link that loops back
    while chapter_url and chapter_url not in visited:
        visited.add(chapter_url)
        try:
            response = fetch_url(chapter_url, headers)
            html = decode_content(response)
            selector = etree.HTML(html)

            # Extract title and paragraphs of this page.
            title_elements = selector.xpath(title_elements_xpath)
            contents = selector.xpath(contents_xpath)
            all_content.extend([content.strip() for content in contents if content.strip()])
            title = title_elements[0].strip() if title_elements else ""

            # Drop the page counter "(X / X)" from the title. The brackets (ASCII or
            # full-width) are optional so a bare "X / X" is removed as well.
            title = re.sub(r'\s*[（(]?\s*\d+\s*/\s*\d+\s*[）)]?', '', title).strip()

            # Follow the "next page" link if it is a real URL.
            next_page = selector.xpath('//a[contains(text(), "下一页")]/@href')
            if next_page and not next_page[0].strip().lower().startswith("javascript:"):
                chapter_url = urljoin(chapter_url, next_page[0])
            else:
                chapter_url = None

        except Exception as e:
            print(f"获取章节 {title} 时发生错误: {e}")
            break

    if not title or not all_content:
        print(f"章节 {index} 获取失败")
        return (index, None, "")

    chaptercontent = "\n  ".join(all_content)

    # Optional clean-up rules (site specific); enable the ones you need.
    # chaptercontent = re.sub(r'一秒记住\s*.*?\s*无弹窗免费阅读!', '', chaptercontent, flags=re.S)
    # chaptercontent = re.sub(r'\(https.*?html\)', '', chaptercontent, flags=re.S)
    # chaptercontent = re.sub(r'[\s ]{0,6}第.{1,10}[部分章节卷页]{1,2}.{0,30}[\s \n]{0,6}', '', chaptercontent)
    # chaptercontent = re.sub(r'[\s ]{0,6}\d{1,5}.{1,30}[\s \n]{0,6}', '', chaptercontent)
    # chaptercontent = re.sub(r'[  ]{1,}', '', chaptercontent)

    return (index, title, chaptercontent.strip())


def download_chapters(base_url, max_threads):
    """Download every chapter of a novel and write them to one text file.

    The function walks the table-of-contents pages starting at ``base_url``,
    collects the chapter links, downloads the chapters concurrently with
    ``get_chaptercontent`` and writes them, in order, to
    ``<cwd>/我的小说/<book name>.txt``.

    Page structure is described by module-level settings: ``book_name_xpath``,
    ``chapter_links_xpath``, ``chapter_links_start_number``,
    ``directory_pages_xpath`` and ``current_page_option_xpath``.

    Args:
        base_url: URL of the first table-of-contents page.
        max_threads: number of worker threads used for chapter downloads.

    Returns:
        bool: False if the book name could not be read, if processing a chapter
        raised, or if the output file could not be written; True otherwise.
        Chapters that merely came back empty are reported but do not change
        the result.
    """
    # Keep the address the caller passed in: base_url is advanced while paging,
    # and the output header must not depend on a global set only by the CLI.
    start_url = base_url
    headers = {'User-Agent': get_random_user_agent()}
    all_chapter_links = []
    book_name = None
    first_directory_page = True  # the book name is read from the first page only

    while base_url:
        try:
            response = fetch_url(base_url, headers)
            html = decode_content(response)
            selector = etree.HTML(html)

            if first_directory_page:
                book_name = selector.xpath(book_name_xpath)[0].strip()
                print(f'\n开始下载小说: 《{book_name}》\n')
                first_directory_page = False

            # Chapter links on this table-of-contents page.
            chapter_links = selector.xpath(chapter_links_xpath)[chapter_links_start_number:]
            all_chapter_links.extend(urljoin(base_url, href) for href in chapter_links)

            # Paging is only attempted when both xpath settings are non-empty.
            if directory_pages_xpath and current_page_option_xpath:
                directory_pages = [(urljoin(base_url, option.attrib['value']), option.text) for option in
                                   selector.xpath(directory_pages_xpath)]

                current_page_option = selector.xpath(current_page_option_xpath)
                if current_page_option:
                    current_page_value = urljoin(base_url, current_page_option[0].attrib['value'])
                    current_page_text = current_page_option[0].text
                    print(f'当前目录页:{current_page_text}')

                    # Move on to the page listed right after the current one, if any.
                    current_page_index = [page[0] for page in directory_pages].index(current_page_value)
                    if current_page_index + 1 < len(directory_pages):
                        base_url = directory_pages[current_page_index + 1][0]
                    else:
                        base_url = None
                else:
                    print("未找到当前选中的目录页,停止抓取。")
                    break
            else:
                break

        except Exception as e:
            print(f"获取目录页时发生错误: {e}")
            break

    if not book_name:
        print("无法获取书名,请检查URL和网页结构。")
        return False

    save_dir = os.path.join(os.getcwd(), '我的小说')
    os.makedirs(save_dir, exist_ok=True)
    # Replace characters that are not allowed in file names (notably on Windows).
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', book_name)
    output_path = os.path.join(save_dir, f'{safe_name}.txt')

    chapters = []
    failed_chapters = []

    def write_to_file():
        """Write the collected chapters in index order; return True on success."""
        chapters.sort(key=lambda x: x[0])
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(f'\n\n书名:{book_name}\n\n网址:{start_url}\n\n\n')
                for idx, title, content in chapters:
                    f.write(f"{title}\n\n{content}\n\n")

            if failed_chapters:
                failed_chapters.sort()
                print(f"\n以下章节下载失败: {failed_chapters}")

            print(f'\n《{book_name}》下载完成')
            return True
        except Exception as e:
            print(f"写入文件时发生错误: {e}")
            return False

    success = True
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        # Remember which chapter each future belongs to, so a future that raises
        # is reported under its own index rather than a stale one.
        future_to_index = {
            executor.submit(get_chaptercontent, link, idx): idx
            for idx, link in enumerate(all_chapter_links, 1)
        }

        for future in as_completed(future_to_index):
            chapter_index = future_to_index[future]
            try:
                _, title, content = future.result()
            except Exception as e:
                print(f"处理章节时出错: {e}")
                failed_chapters.append(chapter_index)
                success = False
                continue

            if title and content:
                chapters.append((chapter_index, title, content))
                print(f"完成章节: {title}")
            else:
                failed_chapters.append(chapter_index)

    if not write_to_file():
        success = False

    return success


if __name__ == "__main__":
    # ---- Site template: adjust these values to scrape a different site. ----
    default_url = 'https://www.xbqg06.com/373303/'  # first table-of-contents page of the sample novel

    book_name_xpath = '//h1/text()'  # book title on the table-of-contents page
    chapter_links_xpath = '(//ul[@class="section-list fix"])[2]/li/a/@href'  # chapter links on the table-of-contents page
    chapter_links_start_number = 0  # index of the first chapter link to keep
    title_elements_xpath = '//h1/text()'  # chapter title on a content page
    contents_xpath = '//div[@id="content"]/p/text()'  # chapter paragraphs on a content page

    directory_pages_xpath = '//option'  # options listing all table-of-contents pages; set to '' if the site has none
    current_page_option_xpath = '//option[@selected="selected"]'  # currently selected page option; set to '' if the site has none

    # An empty answer falls back to the sample novel.
    input_url = input(f"请输入小说目录页地址(默认 {default_url}): ") or default_url

    while True:
        threads_input = input("请输入并发线程数(1-100,默认20): ") or "20"
        # isdecimal() rather than isdigit(): isdigit() also accepts characters such
        # as "²", which int() cannot convert and would crash the prompt.
        if threads_input.isdecimal() and 1 <= int(threads_input) <= 100:
            max_threads = int(threads_input)
            break
        print("输入无效,请输入1-100之间的整数")

    start_time = time.time()
    success = download_chapters(base_url=input_url, max_threads=max_threads)

    elapsed = time.time() - start_time

    if success:
        print(f"总耗时: {elapsed:.2f}秒")
    else:
        print("下载过程中发生错误")
    # Keep the console window open until the user presses Enter.
    input("下载完成,小说保存在“我的小说”文件夹内,回车退出!")


编译后命令行版下载链接:
链接:https://pan.baidu.com/s/1B00FRJS8yv4SNWRO9tvDEg
提取码:52pj


主界面.jpg (22.27 KB, 下载次数: 3)

主界面.jpg

小说下载器_图形界面_三个网站源码及编译后文件.txt

75 Bytes, 下载次数: 110, 下载积分: 吾爱币 -1 CB

免费评分

参与人数 14吾爱币 +15 热心值 +14 收起 理由
小月巴 + 1 我很赞同!
sjb5201314 + 1 + 1 谢谢@Thanks!
zqj0529 + 1 谢谢@Thanks!
SeanDcitonary + 1 谢谢@Thanks!
aishangpj + 1 我很赞同!
13557455543 + 1 + 1 我很赞同!
kymql24 + 1 + 1 谢谢@Thanks!
苏紫方璇 + 7 + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!
zylz9941 + 1 + 1 谢谢@Thanks!
chydroid + 1 + 1 用心讨论,共获提升!
psqladm + 1 + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!
shengruqing + 1 热心回复!
chinawolf2000 + 1 + 1 热心回复!
为之奈何? + 1 + 1 我很赞同!

查看全部评分

本帖被以下淘专辑推荐:

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

推荐
xiaolizi021018 发表于 2026-2-25 18:25
heshaa 发表于 2026-2-24 14:40
大哥发个完整版改好的行吗

哈哈,这个就是一个简单的示例代码,我没有写完整的,因为他那个小说网站是有加密的,我就贴了一个逆向流程,已经是完整的代码了
推荐
xiaolizi021018 发表于 2026-2-4 13:53
不灭的阿拉丁 发表于 2025-4-27 20:25
感谢大佬的辛勤劳作,但是发现增加了两个网址后资源量还是不太足,可以尝试添加{https://www.kanqizw.com ...

无聊的时候逆了一下,前端返回的源代码中有很多这样的代码

[JavaScript] 纯文本查看 复制代码
<script>
                            document.writeln(qsbs.bb('PHA+4oCc5bCP55qu55CD44CB5p626ISa6Lii77yM6ams5YWw5byA6Iqx5LqM5Y2B5LiA77yM5LqM5YWr5LqM5LqU5YWt77yM5LqM5YWr5LqM5LqU5LiD77yM5LqM5YWr5LqM5Lmd5LiJ5Y2B5LiA4oCm4oCm4oCdPC9wPg=='));
                        </script>
                        <script>
                            document.writeln(qsbs.bb('PHA+5pep5LiK5YWr54K577yM56WB5ZCM5Lyf56uZ5Zyo5LmhemhlbmdmdeS6jOalvOeahOacqOWktOeql+WtkOWJje+8jOeZvuaXoOiBiui1lueahOeci+edgOalvOS4i+i3s+earueti+eahOm6u+iKsei+q+Wwj+Wls+Wtqe+8jOWGheW/g+S6lOWRs+adgumZiOOAgjwvcD4='));
                        </script>


分段解密的

下面这个是对应的解密

[JavaScript] 纯文本查看 复制代码
// Base64 codec for text: strings are converted to/from UTF-8 bytes around the
// Base64 step. `bb` is the decoder the page calls as document.writeln(qsbs.bb(...));
// `aa` is the matching encoder.
var qsbs = {
    // Base64 alphabet; index 64 ("=") is the padding marker.
    _keyStr: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",

    /**
     * Encode a string as Base64 of its UTF-8 bytes.
     * Note: "\r\n" is normalised to "\n" before encoding (see _utf8_encode).
     * @param {string} input text to encode
     * @returns {string} Base64 text, padded with "="
     */
    aa: function (input) {
        var output = "";
        var chr1, chr2, chr3, enc1, enc2, enc3, enc4;
        var i = 0;
        input = qsbs._utf8_encode(input);
        while (i < input.length) {
            chr1 = input.charCodeAt(i++);
            chr2 = input.charCodeAt(i++);
            chr3 = input.charCodeAt(i++);
            enc1 = chr1 >> 2;
            enc2 = ((chr1 & 3) << 4) | (chr2 >> 4);
            enc3 = ((chr2 & 15) << 2) | (chr3 >> 6);
            enc4 = chr3 & 63;
            // charCodeAt past the end yields NaN: mark the missing bytes as padding.
            if (isNaN(chr2)) {
                enc3 = enc4 = 64;
            } else if (isNaN(chr3)) {
                enc4 = 64;
            }
            output = output +
                qsbs._keyStr.charAt(enc1) + qsbs._keyStr.charAt(enc2) +
                qsbs._keyStr.charAt(enc3) + qsbs._keyStr.charAt(enc4);
        }
        return output;
    },

    /**
     * Decode Base64 text whose payload is UTF-8 encoded text.
     * Characters outside the Base64 alphabet are ignored.
     * @param {string} input Base64 text
     * @returns {string} decoded text
     */
    bb: function (input) {
        var output = "";
        var chr1, chr2, chr3;
        var enc1, enc2, enc3, enc4;
        var i = 0;
        input = input.replace(/[^A-Za-z0-9\+\/\=]/g, "");

        while (i < input.length) {
            enc1 = qsbs._keyStr.indexOf(input.charAt(i++));
            enc2 = qsbs._keyStr.indexOf(input.charAt(i++));
            enc3 = qsbs._keyStr.indexOf(input.charAt(i++));
            enc4 = qsbs._keyStr.indexOf(input.charAt(i++));
            chr1 = (enc1 << 2) | (enc2 >> 4);
            chr2 = ((enc2 & 15) << 4) | (enc3 >> 2);
            chr3 = ((enc3 & 3) << 6) | enc4;
            output = output + String.fromCharCode(chr1);
            // 64 is the padding marker: the corresponding byte does not exist.
            if (enc3 != 64) {
                output = output + String.fromCharCode(chr2);
            }
            if (enc4 != 64) {
                output = output + String.fromCharCode(chr3);
            }
        }
        return qsbs._utf8_decode(output);
    },

    /**
     * Convert a string to a "byte string": one character (code 0-255) per UTF-8 byte.
     * Characters outside the BMP are emitted as a single 4-byte sequence.
     * @param {string} string text to convert
     * @returns {string} byte string
     */
    _utf8_encode: function (string) {
        string = string.replace(/\r\n/g, "\n");
        var utftext = "";
        for (var n = 0; n < string.length; n++) {
            // codePointAt joins a valid surrogate pair into one code point;
            // a lone surrogate is returned as-is and takes the 3-byte branch.
            var c = string.codePointAt(n);
            if (c < 128) {
                utftext += String.fromCharCode(c);
            } else if (c < 2048) {
                utftext += String.fromCharCode((c >> 6) | 192);
                utftext += String.fromCharCode((c & 63) | 128);
            } else if (c < 65536) {
                utftext += String.fromCharCode((c >> 12) | 224);
                utftext += String.fromCharCode(((c >> 6) & 63) | 128);
                utftext += String.fromCharCode((c & 63) | 128);
            } else {
                utftext += String.fromCharCode((c >> 18) | 240);
                utftext += String.fromCharCode(((c >> 12) & 63) | 128);
                utftext += String.fromCharCode(((c >> 6) & 63) | 128);
                utftext += String.fromCharCode((c & 63) | 128);
                n++; // skip the low surrogate that belongs to this code point
            }
        }
        return utftext;
    },

    /**
     * Convert a byte string (one character per UTF-8 byte) back to text.
     * @param {string} utftext byte string
     * @returns {string} decoded text
     */
    _utf8_decode: function (utftext) {
        var string = "";
        var i = 0;
        // Declared locally: assigning undeclared names would create globals
        // (and throw in strict mode).
        var c, c2, c3, c4, cp;
        while (i < utftext.length) {
            c = utftext.charCodeAt(i);
            if (c < 128) {
                string += String.fromCharCode(c);
                i++;
            } else if ((c > 191) && (c < 224)) {
                c2 = utftext.charCodeAt(i + 1);
                string += String.fromCharCode(((c & 31) << 6) | (c2 & 63));
                i += 2;
            } else if ((c > 239) && (c < 248)) {
                // 4-byte sequence: a code point outside the BMP.
                c2 = utftext.charCodeAt(i + 1);
                c3 = utftext.charCodeAt(i + 2);
                c4 = utftext.charCodeAt(i + 3);
                cp = ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
                // Values above U+10FFFF are invalid; emit the replacement character.
                string += String.fromCodePoint(cp > 0x10FFFF ? 0xFFFD : cp);
                i += 4;
            } else {
                c2 = utftext.charCodeAt(i + 1);
                c3 = utftext.charCodeAt(i + 2);
                string += String.fromCharCode(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
                i += 3;
            }
        }
        return string;
    }
}

// Demo: decode one Base64 payload with qsbs.bb and print the result.
const decryptedString = qsbs.bb("PHA+4oCc5bCP55qu55CD44CB5p626ISa6Lii77yM6ams5YWw5byA6Iqx5LqM5Y2B5LiA77yM5LqM5YWr5LqM5LqU5YWt77yM5LqM5YWr5LqM5LqU5LiD77yM5LqM5YWr5LqM5Lmd5LiJ5Y2B5LiA4oCm4oCm4oCdPC9wPg==");
// Reverse direction: encode a plain <p>…</p> fragment with qsbs.aa.
// NOTE(review): encryptedString is computed but never printed or used below.
const encryptedString = qsbs.aa("<p>“小皮球、架脚踢,马兰开花二十一,二八二五六,二八二五七,二八二九三十一……”</p>");
console.log(decryptedString);
沙发
yhzh 发表于 2025-3-1 22:41
3#
sanrokalv 发表于 2025-3-1 23:46
看到了全新的角度........................
4#
wyesheng 发表于 2025-3-2 00:03
好像最近Python的程序蛮流行哇
5#
Ghang 发表于 2025-3-2 00:18
wyesheng 发表于 2025-3-2 00:03
好像最近Python的程序蛮流行哇

简单又好用
6#
Mwowom 发表于 2025-3-2 00:42
好用的代码使我的公鸡旋转
7#
1921688998 发表于 2025-3-2 09:21
不会使用
8#
Doublevv 发表于 2025-3-2 09:50
可惜,打不开弟子小说网站
9#
yuzilin 发表于 2025-3-2 11:05
感谢分享,可以拿来学习了
10#
批注 发表于 2025-3-2 11:27
看来只能爬这一个网站啊
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - 52pojie.cn ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2026-5-12 17:39

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表