好友
阅读权限 10
听众
最后登录 1970-1-1
本帖最后由 YZH1028 于 2023-5-31 13:34 编辑
因为每一章有几小节,小节的链接地址我只能在抓取每一章时取出,请问该如何加入多线程?望指点。
import requests
from lxml import etree
import re
import urllib.parse
import time
import os
# Module-level state shared by the downloader.
success = False  # flag defined up-front; not consulted elsewhere in this chunk
# Minimal User-Agent header reused by every chapter request in next_ye().
headers = {'user-agent': 'Mozilla/5.0'}
class Urlchuli():
    """URL percent-codec helper for a chosen character set.

    Wraps a piece of text (`can`) together with the codec (`mazhi`) that the
    target site expects, and exposes encode/decode in that charset.
    """

    def __init__(self, can, mazhi='utf-8'):
        # can:   raw text (or percent-encoded text for url_jm)
        # mazhi: codec name used for both directions, e.g. 'gbk'
        self.can = can
        self.mazhi = mazhi

    def url_bm(self):
        """Return self.can percent-encoded in the self.mazhi charset."""
        raw = str(self.can).encode(self.mazhi)
        return urllib.parse.quote(raw)

    def url_jm(self):
        """Return self.can percent-decoded assuming the self.mazhi charset."""
        return urllib.parse.unquote(str(self.can), self.mazhi)
# Prompt for a title/author and percent-encode it as GBK — the charset the
# target site's search form expects (see search_book below).
name = Urlchuli(input (' 请输入书名 / 作者: \n ' ), 'gbk' )
name = name.url_bm()
def search_book(name):
    """POST the percent-encoded title/author query to the site's search page
    and return the matches as [title, url, author, updated] rows.

    name: GBK percent-encoded search string (produced by Urlchuli.url_bm).
    Returns a list of 4-element lists, one per book found.
    """
    cookies = {
        'PHPSESSID': '5u2161egtijt1b5dr6qdtch3ll',
        'jq_Obj': '1',
        '__51cke__': '',
        '__tins__18946369': '%7B%22sid%22%3A%201684375113921%2C%20%22vd%22%3A%206%2C%20%22expires%22%3A%201684377145674%7D',
        '__51laig__': '6',
    }
    headers = {
        'authority': 'www.z555.net',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cache-control': 'no-cache',
        'content-type': 'application/x-www-form-urlencoded',
        'origin': 'https://www.z555.net',
        'pragma': 'no-cache',
        'referer': 'https://www.z555.net/search.php',
        'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0',
    }
    # Bug fix: the original f-string carried literal spaces around {name}
    # ("s= ... &searchtype"), corrupting the urlencoded form body.
    data = f's={name}&searchtype=articlename'
    response = requests.post('https://www.z555.net/search.php', cookies=cookies, headers=headers, data=data)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    books_time = html.xpath('//*[@id="nr"]/td[5]/text()')
    books_name = html.xpath('//tr[@id="nr"]/td[@class="odd"]/a/text()')
    books_writer = html.xpath('//*[@id="nr"]/td[3]/text()')
    books_link = html.xpath('//tr[@id="nr"]/td[@class="odd"]/a/@href')
    length = len(books_link)
    print(length)
    # Bug fix: the original comprehension built `length` copies of one row and
    # concatenated a str with the whole href LIST ('https://...' + books_link
    # raises TypeError). Pair the parallel xpath result lists element-wise.
    list01 = [[bname, 'https://www.z555.net' + blink, bwriter, btime]
              for bname, blink, bwriter, btime
              in zip(books_name, books_link, books_writer, books_time)]
    for i in list01:
        print(i)
    print(f'------------ 当前共查询到 { length} 种小说 -------------')
    return list01
def choose_name(list01):
    """Prompt until the user names one of the books found by search_book();
    return that book's [title, url, author, updated] row.

    The original wrapped the loop in ``except ValueError`` — dead code, since
    neither input() nor list membership raises ValueError — and carried a
    no-op ``else: continue``; both removed. Behavior (re-prompt until an
    exact field match) is unchanged.
    """
    while True:
        name = input(' 请输入你要获取其中的小说: ')
        for row in list01:
            if name in row:  # exact match against any field, normally the title
                print(row)
                return row
        # no row matched: fall through and prompt again
def get_find(list01):
    """Fetch a book's chapter index page and return its chapters as
    [chapter_url, chapter_title, book_name, book_author] rows.

    list01: one search-result row — [0] book title, [1] index url, [2] author.
    """
    get_choose = requests.get(list01[1])
    get_choose.encoding = get_choose.apparent_encoding
    books_writer = list01[2]
    books_name = list01[0]
    html = etree.HTML(get_choose.text)
    catalogue_name = html.xpath('//*[@id="chapterlist"]/ul/li/a/text()')
    catalogue_link = html.xpath('//*[@id="chapterlist"]/ul/li/a/@href')
    # Bug fix: the original built len() identical copies of one row and
    # concatenated a str with the whole href LIST (TypeError); zip the two
    # parallel xpath result lists instead.
    list_data = [['https://www.z555.net' + link, title, books_name, books_writer]
                 for link, title in zip(catalogue_link, catalogue_name)]
    return list_data
def next_ye(nurl):
    """Download a whole book chapter by chapter.

    Starts from the first chapter URL in ``nurl`` and follows each page's
    "next" link until the site signals the end (href == './'), writing every
    chapter into 小说存放/<author>/《<book>》/.

    nurl: rows of [chapter_url, chapter_title, book_name, book_author];
    only row 0 is used as the starting point.
    """
    book_next_link = ''
    next_url = nurl[0][0]
    # Base path of the book: everything before '1.html' in the first URL;
    # the per-page "next" hrefs are relative to this.
    title_url = next_url.split('1.html')[0]
    book_name = nurl[0][2]
    book_author = nurl[0][3]
    # Hoisted out of the loop — the pattern is identical every iteration.
    # NOTE(review): the literal spaces around the tags look like paste damage;
    # confirm they actually match the site's HTML.
    obj_name = re.compile(r' <div id="content">(?P<content>.*?)</div> ', re.S)
    while book_next_link != './':  # './' marks the book's last page
        res = requests.get(url=next_url, headers=headers)
        res.encoding = 'gbk'  # the chapter pages are served as GBK
        html = etree.HTML(res.text)
        book_next_link = html.xpath('//*[@id="container"]/div[3]/a[5]/@href')[0]
        book_chapter_name = named(html.xpath('//div[@class="title"]/h1/text()')[0])
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists()/os.makedirs() pair.
        os.makedirs(f' 小说存放 / { book_author} / 《 { book_name} 》 /', exist_ok=True)
        with open(f' 小说存放 / { book_author} / 《 { book_name} 》 / 《 { book_chapter_name} 》 .txt', 'w',
                  encoding='utf-8') as f:
            for i in obj_name.finditer(res.text):
                chapter_content = i.group('content').replace("<br/><br/>", ' \n ')
                f.write(' 【 ' + book_chapter_name + ' 】 ' + ' \n\n ' + chapter_content + ' \n\n ')
            print(f'------------ 【 { book_chapter_name} 下载完成】 ------------------')
            # (original called f.close() here — redundant inside `with`; removed)
        next_url = title_url + book_next_link
    print('-------------------- 小说下载完毕 ---------------------')
def named(title):
    """Sanitize *title* for use as a filename: replace characters that are
    illegal or awkward in file paths (and all whitespace) with '_'.

    Bug fix: the original raw pattern had literal spaces around the character
    class (r' [...] '), so a bare '?', '/', ':' etc. was only matched when
    flanked by spaces — illegal characters slipped straight into filenames.
    """
    return re.sub(r'[?\\/:!<>|"\s]', '_', title)
def main():
    """Run the full pipeline: search, pick a book, list chapters, download."""
    results = search_book(name)
    chosen = choose_name(results)
    chapters = get_find(chosen)
    next_ye(chapters)
if __name__ == '__main__':
    # Time the whole download run and report the elapsed seconds.
    start = time.time()
    main()
    elapsed = time.time() - start
    print(' 耗时: ', elapsed)