Python爬取武侠小说

ymsn2023 · 发表于 2023-8-2 13:00

将武侠小说列表，章节，内容保存成json格式内容，转存成文件，保存到本地。支持重复增量爬取

代码如下

[Python] 纯文本查看 复制代码

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from lxml import etree
import os
import json
import time
import random

# Root directory under which everything downloaded by this script is stored
main_dir = 'tianyabooks.com'

# Shared HTTP session used for every request in this script.
# Both http:// and https:// URLs are routed through one adapter configured
# with the retry policy below (total=5, backoff_factor=0.1, retried when the
# server answers with status 500, 502, 503 or 504).
session = requests.Session()
retry = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)


# Request a page and parse it (pauses a random 1-5 seconds before each request)
def get_html_tree(url, encoding='utf-8'):
    """Download *url* through the shared session and parse it with lxml.

    Args:
        url: Address of the page to fetch.
        encoding: Text encoding used to decode the response body.

    Returns:
        The result of ``etree.HTML`` applied to the decoded response text.
    """
    # Throttle: every network request is preceded by a short random sleep.
    sep = random.randint(1, 5)
    print(str(sep) + '秒后请求网址：' + url)
    time.sleep(sep)

    # The HTTP header name is "User-Agent" (hyphen). With an underscore the
    # header is a different, unknown one and the default client UA is sent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
    }

    response = session.get(url=url, headers=headers, timeout=300)
    # Force the caller-supplied encoding before reading .text.
    response.encoding = encoding
    return etree.HTML(response.text)


# Fetch one page of the wuxia novel listing
def get_wuxia_article_list(page_index, page_url):
    """Scrape the novels listed on one listing page and cache them as JSON.

    Args:
        page_index: Page number, used only to name the cache file.
        page_url: URL of the listing page.

    Returns:
        A list of ``{'article_url', 'article_title'}`` dicts. When the page
        has no matching links an empty list is returned and no file is
        written.
    """
    tree = get_html_tree(page_url)

    # Every novel on the page is an <a class="title"> inside the list box
    anchors = tree.xpath("//div[@class='listbox']//ul[@class='e2']//li/a[@class='title']")
    if not anchors:
        return []

    articles = [
        {
            'article_url': anchor.xpath('./@href')[0],
            'article_title': ''.join(anchor.xpath('.//text()')),
        }
        for anchor in anchors
    ]
    print(articles)

    cache_path = './' + main_dir + '/武侠小说/' + 'article_list_page_' + str(page_index) + '.json'
    with open(cache_path, 'w') as fp:
        json.dump(articles, fp)

    return articles


# Fetch a novel's metadata and chapter list
def get_wuxia_article(article_url, article_title):
    """Scrape a novel's index page and save it as ``article_info.json``.

    Args:
        article_url: Full URL of the novel's index page.
        article_title: Title of the novel; used as its directory name.

    Returns:
        A dict with the keys ``article_url``, ``article_title``,
        ``article_author``, ``description`` and ``chapter_list``.
    """
    novel_dir = './' + main_dir + '/武侠小说/' + article_title
    if not os.path.exists(novel_dir):
        os.mkdir(novel_dir)

    tree = get_html_tree(article_url)
    book = tree.xpath("//div[@id='main']//div[@class='book']")[0]

    title = book.xpath('./h1/text()')[0]
    author = book.xpath('./h2/text()')[0]
    # The description paragraph is optional; fall back to an empty string
    paragraphs = book.xpath("./div[@class='description']/p/text()")
    description = paragraphs[0] if paragraphs else ''

    chapters = [
        {
            'chapter_url': link.xpath('./@href')[0],
            'chapter_name': link.xpath('./text()')[0],
        }
        for link in book.xpath('./dl/dd/a')
    ]

    info = {
        'article_url': article_url,
        'article_title': title,
        'article_author': author,
        'description': description,
        'chapter_list': chapters,
    }
    print(info)

    with open(novel_dir + '/article_info.json', 'w') as fp:
        fp.write(json.dumps(info))

    return info


# Fetch the text of one chapter
def get_wuxia_chapter(chapter_url, article_title, chapter_name):
    """Download one chapter and save it as ``<chapter_name>.json``.

    Args:
        chapter_url: Full URL of the chapter page.
        article_title: Title of the novel; names the directory the file goes in.
        chapter_name: Chapter title; used (after sanitising) as the file name.

    Returns:
        A dict with ``chapter_url``, ``chapter_name`` and ``content`` when the
        chapter was downloaded, or ``None`` when the chapter file already
        exists and the download was skipped.
    """
    # Drop every character that is not allowed in a Windows file name
    # (\ / : * ? " < > |). Previously only "?" and "*" were removed, so a
    # title containing e.g. "/" or ":" produced an invalid or wrong path.
    illegal_chars = str.maketrans('', '', '\\/:*?"<>|')
    chapter_name = chapter_name.translate(illegal_chars)

    chapter_filename = './' + main_dir + '/武侠小说/' + article_title + '/' + chapter_name + '.json'
    if os.path.exists(chapter_filename):
        print(chapter_filename + '文件已存在，不再重复下载！')
        return

    # Chapter pages are decoded as GB2312 rather than the default utf-8
    html_tree = get_html_tree(chapter_url, 'GB2312')

    content = html_tree.xpath('//table[4]//p/text()')

    json_data = {
        'chapter_url': chapter_url,
        'chapter_name': chapter_name,
        'content': content
    }
    print(json_data)

    with open(chapter_filename, 'w') as fp:
        fp.write(json.dumps(json_data))

    return json_data

# Crawl the whole wuxia category: listing pages -> novels -> chapters
def get_wuxia():
    """Crawl listing pages 1-39 and download every novel and chapter.

    The crawl is incremental: cached listing pages and cached
    ``article_info.json`` files are read from disk instead of being fetched
    again, and chapters whose file already exists are skipped by
    ``get_wuxia_chapter``.
    """
    print('开始获取武侠小说================')

    # makedirs also creates the site directory, so the function works even
    # when it is called without the __main__ bootstrap.
    os.makedirs('./' + main_dir + '/武侠小说', exist_ok=True)

    # Listing pages are named list_1.html, list_2.html, ...
    for i in range(1, 40):
        page_url = 'https://wx.tianyabooks.com/book/list_' + str(i) + '.html'
        print(page_url)

        page_index_filename = './' + main_dir + '/武侠小说/' + 'article_list_page_' + str(i) + '.json'
        if os.path.exists(page_index_filename):
            with open(page_index_filename, 'r') as f:
                article_list = json.load(f)
            print(page_index_filename + '武侠小说分页文章列表文件存在，直接读取')
        else:
            print(page_index_filename + '从网页获取武侠小说第【' + str(i) + '】页下文章列表')
            article_list = get_wuxia_article_list(i, page_url)

        # Walk every novel on this listing page
        article_list_len = len(article_list)
        for article_index, article in enumerate(article_list, start=1):
            article_url = article['article_url']
            article_title = article['article_title']
            print('【' + str(article_index) + '/' + str(article_list_len) + '】开始获取【' + article_title + '】文章数据:')

            article_filename = './' + main_dir + '/武侠小说/' + article_title + '/article_info.json'
            article_info = None
            if os.path.exists(article_filename):
                # article_info.json is written before any chapter is fetched,
                # so its existence does NOT mean the novel is complete. Reuse
                # the cached chapter list and keep going instead of skipping
                # the novel, otherwise an interrupted run never resumes.
                try:
                    with open(article_filename, 'r') as f:
                        article_info = json.load(f)
                except ValueError:
                    # Truncated/corrupt cache file: fall through and refetch.
                    article_info = None
                else:
                    print(article_filename + '文件已存在，不再重复下载！')

            if article_info is None:
                article_info = get_wuxia_article('https://wx.tianyabooks.com/' + article_url, article_title)

            chapter_list = article_info['chapter_list']
            chapter_list_len = len(chapter_list)
            for chapter_list_index, chapter in enumerate(chapter_list, start=1):
                print('【' + str(chapter_list_index) + '/' + str(chapter_list_len) + '】开始获取小说【' + article_title + '】-【' + chapter['chapter_name'] + '】的章节数据')

                # Already-downloaded chapters are detected and skipped inside
                # get_wuxia_chapter, so this costs no network request.
                get_wuxia_chapter('https://wx.tianyabooks.com/' + article_url + chapter['chapter_url'], article_title, chapter['chapter_name'])


# Script entry point
if __name__ == '__main__':
    # Make sure the storage directory for this site exists
    site_dir = './' + main_dir
    if not os.path.exists(site_dir):
        os.mkdir(site_dir)

    # Crawl the wuxia category
    get_wuxia()

xz91168 · 发表于 2023-8-3 11:18

C:\Users\Administrator\AppData\Local\Microsoft\WindowsApps\python3.11.exe C:\Users\Administrator\PycharmProjects\pythonProject\main.py
Traceback (most recent call last):
File "C:\Users\Administrator\PycharmProjects\pythonProject\main.py", line 1, in <module>
import requests
ModuleNotFoundError: No module named 'requests'

进程已结束,退出代码1

kkkkkkkkn · 发表于 2023-8-4 17:57

xz91168 发表于 2023-8-3 11:18
C:\Users\Administrator\AppData\Local\Microsoft\WindowsApps\python3.11.exe C:\Users\Administrator\Pyc ...

您没有安装requests模块（在命令行执行 pip install requests 即可安装），它是一个非常流行的用于发送 HTTP 请求的库。requests 库提供了简洁而友好的 API，使得发送 HTTP 请求变得非常容易。

OfficeDK · 发表于 2023-8-2 15:58

好，学习了！！！

rjqg2023 · 发表于 2023-8-2 15:58

放入python里直接运行就行吗？

坐久落花多 · 发表于 2023-8-2 18:00

前两天想找同学帮忙做个爬取某网站的内容的，结果他说可能有违法风险，咋回事？实在是不懂啊。。。

吖力锅 · 发表于 2023-8-2 22:39

哇哇。这么多行代码的吗

RedWolfT · 发表于 2023-8-3 11:50

学习了，感谢分享！

echoaku · 发表于 2023-8-3 14:11

不错，支持一下

jrwapj · 发表于 2023-8-3 16:47

不错，支持，学习了

lzaiz24 · 发表于 2023-8-3 17:50

刚好学了点爬虫

帐号		自动登录	找回密码
密码			注册[Register]

[Python 原创] Python爬取武侠小说

免费评分

个人中心