Python爬取看雪学院1W+资料库

null119 · 发表于 2019-7-26 01:16

本帖最后由 null119 于 2019-7-26 01:25 编辑

先上图：

QQ截图20190725211259.png

因文章太多，不能转成一个html，要不然会卡死，SO，只能按分类转PDF了

代码：

[Python] 纯文本查看 复制代码

001

002

003

004

005

006

007

008

009

010

011

012

013

014

015

016

017

018

019

020

021

022

023

024

025

026

027

028

029

030

031

032

033

034

035

036

037

038

039

040

041

042

043

044

045

046

047

048

049

050

051

052

053

054

055

056

057

058

059

060

061

062

063

064

065

066

067

068

069

070

071

072

073

074

075

076

077

078

079

080

081

082

083

084

085

086

087

088

089

090

091

092

093

094

095

096

097

098

099

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

import re,json,os,sys,time,requests
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
from urllib.parse import quote,unquote
 
def filterFName(FName):
    """Return *FName* with characters that are illegal in file names replaced.

    Each of ``/ \\ : * ? " < > |`` is replaced by an underscore; every other
    character is kept unchanged.
    """
    forbidden = '/\\:*?"<>|'
    # One translation pass over the string instead of a regex substitution.
    table = {ord(ch): '_' for ch in forbidden}
    return FName.translate(table)
 
def mkdir(path):
    """Create directory *path* (and any missing parents) unless it already exists.

    Leading/trailing whitespace is stripped from *path* first. Nothing is
    done when something already exists at that path.
    """
    target = path.strip()
    if os.path.exists(target):
        return
    os.makedirs(target)
 
def gethtml(url,encode,timeout=30):
    """Fetch *url* with a browser-like User-Agent and return the body as text.

    Args:
        url: Address to GET.
        encode: Encoding name assigned to the response before decoding it.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The response body decoded with *encode*.

    Raises:
        requests.exceptions.RequestException: on connection errors/timeouts.
    """
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'}
    r = requests.get(url,headers=headers,timeout=timeout)
    # Force the caller-supplied encoding instead of the one requests guessed.
    r.encoding = encode
    return r.text
 
def writehtml(path,str):
    """Write the text *str* to *path* as UTF-8, replacing any existing file.

    Args:
        path: Destination file path.
        str: Text to write (the parameter name shadows the builtin; it is
            kept so existing callers are unaffected).
    """
    # The original called ``f.close`` without parentheses, so the file was
    # never explicitly closed; the context manager closes (and flushes) it.
    with open(path,'w',encoding='utf-8') as f:
        f.write(str)
 
def postdata(url,pdata,timeout=30):
    """POST the form data *pdata* to *url* as an XMLHttpRequest and return the body text.

    Args:
        url: Address to POST to.
        pdata: Form fields sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: on connection errors/timeouts.
    """
    headers = {'X-Requested-With': 'XMLHttpRequest'}
    rep = requests.post(url=url, data=pdata, headers=headers, timeout=timeout)
    return rep.text
 
def forstr(mstr):
    """Split a node literal into its first three field values.

    Braces, spaces and double quotes are stripped from *mstr*; the remainder
    is split on commas, and for each of the first three fields the text
    between its first and second colon is taken.

    Example: ``'{ id: 3, name: "A B", pId: 7 }'`` -> ``('3', '7', 'AB')``.

    Returns:
        Tuple ``(field 0 value, field 2 value, field 1 value)``, which the
        callers unpack as ``(id, pId, name)``.

    Raises:
        IndexError: if there are fewer than three fields or a field has no colon.
    """
    # Drop '{', '}', ' ' and '"' in a single pass.
    compact = mstr.translate(str.maketrans('', '', '{} "'))
    fields = compact.split(',')
    node_id, node_name, parent_id = (fields[i].split(':')[1] for i in range(3))
    return node_id, parent_id, node_name
 
def getlv(lname,listtxt,num):
    """Append one category node, its sub-nodes and its articles to the global ``txt``.

    Steps, in order:
      1. POST the node's ids/name to the article-list endpoint and parse the
         JSON reply.
      2. Append an ``<h{num}>`` heading with the node name to ``txt``.
      3. Recurse (with ``num + 1``) into every entry of the global ``zlist``
         whose pId equals this node's id.
      4. Unless the reply's ``code`` is ``'-1'``, download every article in
         ``message``, take the last element with class
         ``"message break-all"``, rewrite headings/relative links and append
         it to ``txt``. Articles without such an element are skipped.

    Args:
        lname: Breadcrumb of this node, used only for progress output.
        listtxt: Raw node literal, parsed by ``forstr``.
        num: Heading level used for this node; its articles use ``num + 1``.

    Side effects:
        Updates the globals ``txt``, ``count`` and ``zcount``; writes progress
        to stdout; performs network requests.
    """
    global txt,count,zcount
    gurl = 'https://www.kanxue.com/chm-thread_last_read.htm'
    node_id,parent_id,name=forstr(listtxt)
    # NOTE(review): the second positional argument of quote() is ``safe``, not
    # the encoding. Left unchanged so the request sent to the server is the same.
    pdata = {"chmid": parent_id, "cateid": node_id, "nodename": quote(name, 'utf-8')}
    repdata = postdata(gurl, pdata)
    jsonstr = json.loads(repdata)
    txt = txt + '<h' + str(num) + '>' + name + '</h' + str(num) + '><br>'

    # Sub-categories: nodes whose parent id is this node's id.
    children = [node for node in zlist if forstr(node)[1] == node_id]
    for child in children:
        fname = child.split(',')[1].split(':')[1].replace('"', '').replace(' ', '')
        getlv(lname + '->' + fname, child, num+1)

    if jsonstr['code'] == '-1':
        return

    for article in jsonstr['message']:
        html=gethtml('https:'+article['url'],'utf-8')
        ehtml = etree.HTML(html)
        try:
            strs = ehtml.xpath('//*[@class="message break-all"]')[-1]
        except (AttributeError, IndexError):
            # No parsed document or no matching element: skip this article.
            # (The original bare ``except`` also swallowed KeyboardInterrupt,
            # making the crawl hard to stop with Ctrl-C.)
            continue
        count+=1
        zcount+=1
        sys.stdout.write('\r'+'此目录已获取：'+str(count)+'篇文章,当前：' +lname+'->'+article['name'])
        sys.stdout.flush()
        strs = etree.tostring(strs, encoding="utf-8", pretty_print=True, method="html").decode("utf-8")
        # Demote headings inside the article to <b> so they do not clash with
        # the headings this function generates.
        strs=re.sub('<h[1,2,3,4,5,6,7,8,9, ]','<b ',strs)
        strs = re.sub('</h[1,2,3,4,5,6,7,8,9]>', '</b>', strs)
        # Turn site-relative image/attachment links into absolute ones.
        strs = re.sub('<img src="upload','<img src="https://bbs.pediy.com/upload',strs)
        strs = re.sub('<img src="/view', '<img src="https://bbs.pediy.com/view', strs)
        strs = re.sub('a href="attach-', 'a href="https://bbs.pediy.com/attach-', strs)
        txt=txt+'<br><br><h'+str(num+1)+'>'+article['name']+'</h'+str(num+1)+'><br>'+strs
 
def getdata(url):
    """Crawl the index page at *url* and write one HTML file per second-level category.

    Node literals matching ``{ id ... }`` are extracted from the page. Those
    containing ``'pId: 0 '`` are treated as top-level nodes; the rest go into
    the global ``zlist``. For each top-level node, its direct children are
    moved out of ``zlist``; each child is crawled with ``getlv`` and the
    accumulated global ``txt`` is written to
    ``<savepath><top name>/<child name>.html``.

    Args:
        url: Address of the index page.

    Side effects:
        Rebinds the globals ``txt``, ``zlist`` and ``count``; reads the global
        ``savepath``; creates directories and files; prints progress.
    """
    global txt,zlist,count
    html=gethtml(url,'utf-8')
    # Raw string: same pattern text as before, without the invalid "\{" escape
    # that newer Python versions warn about in a normal string literal.
    pattern = re.compile(r'\{ id.*?\}')
    toplist=[]
    zlist=[]
    for node in pattern.findall(html):
        if 'pId: 0 'in node:
            toplist.append(node)
        else:
            zlist.append(node)
    for top in toplist:
        iid,itid,iname=forstr(top)
        # Split the remaining nodes into direct children of this top-level
        # node (page order preserved) and everything else.
        children = []
        remaining = []
        for node in zlist:
            if forstr(node)[1]==iid:
                children.append(node)
            else:
                remaining.append(node)
        zlist = remaining
        topdir = savepath + filterFName(iname)
        for child in children:
            jid, jpid, jname = forstr(child)
            mkdir(topdir)
            dirname=iname+'->'+jname
            time_start=time.time()
            count = 0
            print('开始获取【'+iname+'->'+jname+'】...')
            txt = ''
            getlv(dirname,child,2)
            time_end=time.time()
            print('')
            print(iname+'->'+jname+'：获取完成. 文章数：'+str(count)+',耗时：{:.2f} 秒.'.format(time_end - time_start))
            # os.path.join instead of a hard-coded '\\' so the path is also
            # valid on non-Windows systems.
            writehtml(os.path.join(topdir, filterFName(jname)+'.html'), txt)
            print('*' * 100)
 
if __name__ == '__main__':
    # Script entry point. The names assigned here are module-level globals
    # that getdata()/getlv() read and update.
    savepath='C:\\看雪知识库\\'
    url = 'https://www.kanxue.com/chm.htm'
    count=0     # articles fetched in the category currently being crawled
    zcount=0    # articles fetched over the whole run
    ztime=time.time()
    getdata(url)
    elapsed = time.time()-ztime
    print('全部任务完成,共获取文章： '+str(zcount)+' ,总耗时：{:.2f} 秒.'.format(elapsed))

转PDF的代码就不上了，很简单，不会的可以参考我其它的帖子代码

这些资料也上传到电子书屋

地址：https://www.52pojie.cn/forum.php?mod=viewthread&tid=997084

不得不说，资料实在是全，有这一套东西，根本不用再去到处找什么教程了

沙鱼 · 发表于 2019-7-26 01:21

沙发是我的，链接在哪里

ISR · 发表于 2019-7-26 01:35

感谢楼主

zhyzhh · 发表于 2019-7-26 04:11

真的厉害了，以后想学一下

ytzyyjj · 发表于 2019-7-26 04:58

这个好(≧▽≦)

拯救地球好累ya · 发表于 2019-7-26 05:28

牛牛牛，高手

SuperGround · 发表于 2019-7-26 07:51

楼主很厉害呀，膜拜膜拜

chen4321 · 发表于 2019-7-26 07:52

想要爬出来的资料

daymissed · 发表于 2019-7-26 08:07

我怎么就不会爬呢

lengyueyu · 发表于 2019-7-26 08:09

向高手学习

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] Python爬取看雪学院1W+资料库

免费评分

本帖被以下淘专辑推荐:

浏览过的版块