
[Python Repost] Sharing a novel-website crawler

zqc8 posted on 2019-11-3 15:26
This post was last edited by zqc8 on 2020-3-22 22:30

The new version of the code has been posted. Quite some time has passed, so the approach and style of the new code differ slightly from the old code, and because it was written in a hurry it is not commented. Leave a comment if you have any questions.

For some reason the forum does not display line 33 of the new code (the novel_name regex) correctly; the correct line is shown in the attached screenshot:
image.png
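
Since the author's exact pattern is only visible in that screenshot, here is a hedged fallback sketch that takes the novel name from the index page's <title> tag instead; the helper name and the "_" delimiter are assumptions about ddxsku.com's page-title format, not the author's original code.

[Python]
from bs4 import BeautifulSoup

def guess_novel_name(index_html):
    """Fallback sketch: derive the novel name from the page <title> when the original regex is unavailable."""
    soup = BeautifulSoup(index_html, "html.parser")
    title = soup.title.string if soup.title and soup.title.string else ""
    return title.split("_")[0].strip()  # the "_" delimiter is an assumption about the site's title format

# e.g. novel_name = guess_novel_name(novel_index_response.text)
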
Because of exams last week I was a week late in posting. This time I am sharing a novel scraper that, in principle, can download every completed novel on the site. Please use the code responsibly for learning and do not put pressure on the server. Thanks!
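
In the spirit of that request, a throttled request helper along the lines below could wrap every page fetch; the delay range, retry count and helper name are illustrative assumptions rather than part of the posted code (which already sleeps between requests, just with different values).

[Python]
import requests
from time import sleep
from random import uniform

def polite_get(url, headers, tries=3, delay=(1.0, 2.5)):
    """GET a page with a random pause and simple retries, to avoid hammering the server."""
    for attempt in range(tries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            sleep(uniform(*delay))  # random pause between successful requests
            return response
        except requests.RequestException:
            sleep(2 ** attempt)     # back off before retrying
    return None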

As always, if you have any comments or suggestions about this code, feel free to raise them; if anything is unclear, leave a comment and I will reply as soon as I see it.

One more heads-up: I am currently working on scraping 12306 and hope to build a ticket-grabbing tool on my own. The two core parts, remaining-ticket queries and simulated login, are basically solved. I will share it with everyone once it is finished, so stay tuned!
New version of the code:
[Python]
# -*- coding: UTF-8 -*-
# !/usr/bin/env python3
# Author:  Murphy
#  Blog :  www.moyo1.cn

import os
import re
import requests
from time import sleep
from bs4 import BeautifulSoup


class CollectNovel(object):
    def __init__(self):
        self.novel_data = {}
        self.start_url = "https://www.ddxsku.com/full.html"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}

    def collect_url(self):
        print("Collecting novel basic data.....")
        start_response = requests.get(self.start_url, headers=self.headers)
        total_page = re.search(r'<em id="pagestats">1/(.+)</em>', start_response.text).group(1)
        novel_navigation_urls = [fr"http://www.ddxsku.com/modules/article/articlelist.php?fullflag=1&page={i}" for i in range(1, int(total_page)+1)]

        for novel_navigation_url in novel_navigation_urls:
            novel_navigation_response = requests.get(novel_navigation_url, headers=self.headers)
            novel_index_urls = re.findall('<td class="L"><a href="(.+)" title=".+" target="_blank">.+</a></td>', novel_navigation_response.text)

            for novel_index_url in novel_index_urls:
                novel_index_response = requests.get(novel_index_url, headers=self.headers)
                novel_index_response.encoding = "utf-8"

                # NOTE: the forum stripped the href attribute out of the <a ...> tag in this pattern;
                # restore the full regex from the author's screenshot (image.png) before running.
                novel_name = re.search(fr'.+<a >(.+)</a>.+', novel_index_response.text).group(1)
                novel_author = re.search(r'<dd><h3>作者:(.+)</h3><br>.+</h3></dd>', novel_index_response.text).group(1)
                # Assign by key so data collected for earlier novels is not overwritten.
                self.novel_data[novel_name] = [("novel_author", novel_author)]
                print("Collecting novel:  《%s》--%s" % (novel_name, novel_author))

                index_soup = BeautifulSoup(novel_index_response.text, "html.parser")
                novel_text_urls = index_soup.find_all("td", class_="L")
                for each in novel_text_urls:
                    chapters_title = each.text
                    chapters_url = each.a["href"]
                    self.novel_data[novel_name].append((chapters_title, chapters_url))
                sleep(1)
                # break  # Debugging aid to shorten the run; remove to scrape the whole site.
            break  # Debugging aid to shorten the run; remove to scrape the whole site.

    def novel_copy(self):
        self.collect_url()
        if self.novel_data:
            for name in self.novel_data:
                count = 0
                print("Downloading:  《%s》" % name, end="\n"*2)

                work_path = r"C:/Users/Administrator/Desktop/NovelCopy/%s-%s" % (name, self.novel_data[name][0][1])
                if not os.path.exists(work_path):
                    os.makedirs(work_path)
                    os.chdir(work_path)
                else:
                    os.chdir(work_path)

                for chapter_data in self.novel_data[name][1:]:
                    count += 1
                    print("Downloading:  《%s》--%s" % (name, chapter_data[0]))
                    chapter_response = requests.get(chapter_data[1], headers=self.headers)
                    chapter_response.encoding = "utf-8"

                    chapter_soup = BeautifulSoup(chapter_response.text, "html.parser")
                    chapter_text = chapter_soup.find("dd", id="contents")
                    with open("%d-%s.txt" % (count, chapter_data[0]), "w", encoding="utf-8") as f:
                        f.write(chapter_text.text)
                    sleep(2)
                print()
                break
        else:
            print("Collect data failed")


if __name__ == "__main__":
    novel = CollectNovel()
    novel.novel_copy()
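
One note on the new code: each chapter file is named after the raw chapter title, and on Windows a title containing any of \ / : * ? " < > | will make open() fail. A small sanitizer like the sketch below could be applied before building the file name; the helper is an illustration, not part of the original code.

[Python]
import re

def safe_filename(title):
    """Replace characters that Windows does not allow in file names."""
    return re.sub(r'[\\/:*?"<>|]', "_", title).strip()

# e.g. open("%d-%s.txt" % (count, safe_filename(chapter_data[0])), "w", encoding="utf-8")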


Old version of the code:
[Python]
# 1. Third-party libraries required: requests, bs4

import os,requests,re
from time import sleep
from bs4 import BeautifulSoup
from random import uniform

# Open a URL and return the decoded HTML
def url_open(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    response = requests.get(url,headers=headers)
    response.encoding = "utf-8"  # the site is encoded in utf-8

    html =response.text

    return html


# Collect the target links
def collect_url(html,root_url):
    print("Collecting links to every completed novel on the site.....")
    novel_name_all = []
    novel_url_all = []

    soup = BeautifulSoup(html,"html.parser")
    total_pages = int((soup.find("div",class_="pagelink").em.text).split("/")[1])  # total number of pages
    #print(total_pages)

    # Open each listing page and collect the novel links
    for page in range(1,total_pages+1):
        url = root_url + 'modules/article/articlelist.php?fullflag=1&page={}'.format(page)
        #print(url)
        html = url_open(url)
        # Collect the novel links on the current page
        # NOTE: root_url is https while the links on the page are http, so this pattern can come up empty;
        # see ttyp's fixed version further down the thread.
        p_novel_url = fr'<a href="({root_url}xiaoshuo/.+.html)">'
        novel_url_temp = re.findall(p_novel_url,html)

        # Append the link to the master URL list and grab the novel's name.
        for novel_url in novel_url_temp:
            novel_url_all.append(novel_url)
            # Get the novel's name
            p_novel_name = fr'{novel_url}">(.+)</a>'
            novel_name_temp = re.findall(p_novel_name,html)[0]
            novel_name_all.append(novel_name_temp)

        break  # Shortens the run for testing; remove this line to scrape the whole site.

    data = [novel_name_all,novel_url_all]  # pack both lists so a single value can be returned

    print("Collection finished; moving on to downloading the novels.....")
    sleep(1)

    return  data


# Fetch and save the novel content
def get_and_save_data(data):
    novel_name_all = data[0]
    novel_url_all = data[1]
    i = -1  # index used to look up the matching novel name

    for novel_url in novel_url_all:
        i += 1
        novel_name = novel_name_all[i]  # get the matching novel name
        print()
        print("Downloading novel: 《%s》"%novel_name)
        print()

        html_1 = url_open(novel_url)
        soup_1 = BeautifulSoup(html_1, "html.parser")
        chapters_url = soup_1.find("p", class_="btnlinks").a["href"]

        # Get the URLs of every chapter
        html_2 = url_open(chapters_url)
        soup_2 = BeautifulSoup(html_2, "html.parser")
        chapters_url_all = soup_2.find_all("td", class_="L")

        # Open each chapter URL, grab the content and save it
        for each in chapters_url_all:
            chapters_url = each.a["href"]
            html = url_open(chapters_url)

            soup = BeautifulSoup(html,"html.parser")
            chapters_name = soup.find("dd").h1.text  # grab the chapter title
            print("Downloading 《%s》: %s"%(novel_name,chapters_name))

            # Grab the chapter text
            contents = soup.find("dd",id="contents").text
            with open("%s.txt"%novel_name,"a",encoding="utf-8") as g:
                g.write("\n"*3 + "                               "+chapters_name+str("\n")*3)
                g.write("    "+contents)

            sleep_time = uniform(0.35,0.75)
            sleep(sleep_time)

        print("小说%s已下载完毕"%novel_name)
        print("准备进入下一部小说下载")
        sleep(2)

        break  # Shortens the run for testing; remove this line to scrape the whole site.


# Main program
def main():
    # Set up the working directory
    path = r'C:\Users\Administrator\Desktop\test'
    if os.getcwd() != path:
        if not os.path.exists(path):
            os.mkdir(path)
            os.chdir(path)
        else:
            os.chdir(path)
    root_url = "https://www.ddxsku.com/"
    target_url = root_url + "full.html"
    data = collect_url(url_open(target_url),root_url)
    get_and_save_data(data)


if __name__ == "__main__":
    main()

(Attached screenshots: 1.png, 2.png, 3.png)

Ratings

Participants: 5 | 吾爱币 +7 | 热心值 +4 | Reason
huguo002  +1  +1  Agreed!
苏紫方璇  +3  +1  Thanks for posting an original work; the 吾爱破解 forum is better because of you!
ke307  +1  +1  Thanks!
大鸽鸽  +1  Isn't 12306's own booking good enough? Why grab tickets?
FleTime  +1  +1  Thanks for posting an original work; the 吾爱破解 forum is better because of you!


ttyp posted on 2020-3-19 16:27
The site is https but the links are http, so I modified it a bit:

[Python]
# 1. Third-party libraries required: requests, bs4
 
import os,requests,re
from time import sleep
from bs4 import BeautifulSoup
from random import uniform
 
# Open a URL and return the decoded HTML
def url_open(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    response = requests.get(url,headers=headers)
    response.encoding = "utf-8"  # the site is encoded in utf-8
 
    html =response.text
 
    return html
 
 
# Collect the target links
def collect_url(html,root_url):
    print("Collecting links to every completed novel on the site.....")
    novel_name_all = []
    novel_url_all = []
 
    soup = BeautifulSoup(html,"html.parser")
    total_pages = int((soup.find("div",class_="pagelink").em.text).split("/")[1])  # total number of pages
    #print(total_pages)
    print("Total pages: " + str(total_pages))

    # Open each listing page and collect the novel links
    for page in range(1,total_pages+1):
        url = root_url + 'modules/article/articlelist.php?fullflag=1&page={}'.format(page)

        print("Opening " + url)

        html = url_open(url)

        # Collect the novel links on the current page
        # (pattern relaxed so it also matches the http links even though the site itself is https)
        p_novel_url = fr'<a href="(.+?/xiaoshuo/.+\.html)">'
        print(p_novel_url)
        novel_url_temp = re.findall(p_novel_url,html)

        print(len(novel_url_temp))
 
        # Append the link to the master URL list and grab the novel's name.
        for novel_url in novel_url_temp:
            novel_url_all.append(novel_url)
            # Get the novel's name
            p_novel_name = fr'{novel_url}">(.+)</a>'
            novel_name_temp = re.findall(p_novel_name,html)[0]
            novel_name_all.append(novel_name_temp)
 
        break  # Shortens the run for testing; remove this line to scrape the whole site.

    data = [novel_name_all,novel_url_all]  # pack both lists so a single value can be returned

    print("Collection finished; moving on to downloading the novels.....")
    sleep(1)
 
    return  data
 
 
# Fetch and save the novel content
def get_and_save_data(data):
    novel_name_all = data[0]
    novel_url_all = data[1]
    i = -1  # index used to look up the matching novel name
 
    for novel_url in novel_url_all:
        i += 1
        novel_name = novel_name_all[i]  # get the matching novel name
        print()
        print("Downloading novel: 《%s》"%novel_name)
        print()
 
        html_1 = url_open(novel_url)
        soup_1 = BeautifulSoup(html_1, "html.parser")
        chapters_url = soup_1.find("p", class_="btnlinks").a["href"]
 
        # Get the URLs of every chapter
        html_2 = url_open(chapters_url)
        soup_2 = BeautifulSoup(html_2, "html.parser")
        chapters_url_all = soup_2.find_all("td", class_="L")
 
        # Open each chapter URL, grab the content and save it
        for each in chapters_url_all:
            chapters_url = each.a["href"]
            html = url_open(chapters_url)
 
            soup = BeautifulSoup(html,"html.parser")
            chapters_name = soup.find("dd").h1.text  # grab the chapter title
            chapters_name = re.sub(' +',' ',chapters_name).strip()
            print("Downloading 《%s》: %s"%(novel_name,chapters_name))
 
            # Grab the chapter text
            contents = soup.find("dd",id="contents").text
            with open("%s.txt"%novel_name,"a",encoding="utf-8") as g:
                g.write("\n"*3 +chapters_name+str("\n")*3)
                g.write("    "+contents)
 
            sleep_time = uniform(0.35,0.75)
            sleep(sleep_time)
 
        print("小说%s已下载完毕"%novel_name)
        print("准备进入下一部小说下载")
        sleep(2)
 
        break  # Shortens the run for testing; remove this line to scrape the whole site.
 
 
# Main program
def main():
    # Set up the working directory
    path = r'C:\Users\Administrator\Desktop\test'
    if os.getcwd() != path:
        if not os.path.exists(path):
            os.mkdir(path)
            os.chdir(path)
        else:
            os.chdir(path)
    root_url = "https://www.ddxsku.com/"
    target_url = root_url + "full.html"
    data = collect_url(url_open(target_url),root_url)
    get_and_save_data(data)
 
 
if __name__ == "__main__":
    main()
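
As an alternative to relaxing the regex, the scheme mismatch ttyp describes could also be handled by normalizing every collected link to https before requesting it; the helper below is only a sketch of that idea and appears in neither posted version.

[Python]
from urllib.parse import urlsplit, urlunsplit

def force_https(url):
    """Rewrite an http:// link to https:// and leave other URLs untouched."""
    parts = urlsplit(url)
    if parts.scheme == "http":
        parts = parts._replace(scheme="https")
    return urlunsplit(parts)

# e.g. novel_url_all.append(force_https(novel_url))
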
OP | zqc8 posted on 2020-3-22 22:19
This post was last edited by zqc8 on 2020-3-22 22:26

The new version of the code has been posted. It was written in a hurry, so it is not commented; leave a comment if you have any questions.

For some reason the forum does not display line 33 of the new code (the novel_name regex) correctly; the correct line is shown in the attached screenshot:
image.png

(The code posted in this reply is identical to the new version in the first post above.)
yonghermit posted on 2019-11-3 15:51
xzgxp posted on 2019-11-3 16:20
Nice, thanks for sharing.
fys2008 posted on 2019-11-3 17:00
Thanks for sharing; great work, OP.
OP | zqc8 posted on 2019-11-3 17:11
Thanks for everyone's support. As for why I am writing a 12306 ticket-grabbing tool: partly to sharpen my hands-on skills and see how a big site handles anti-scraping so I can build up experience, and partly because I do occasionally need it myself; I also have an idea I would like to put to the test.
mei251617 posted on 2019-11-3 17:20

Thanks for sharing; great work, OP.
YuLoo posted on 2019-11-3 18:47
Really clean code.
zcmrp posted on 2019-11-3 19:59
Help, why am I getting an "illegal character" error?
QQ截图20191103195905.png
万丅冧 posted on 2019-11-3 21:48
Thanks for sharing.
topvip posted on 2019-11-3 23:17
Newbie here to learn, thanks for sharing!