怎样加入多线程且爬取顺序不变

YZH1028 · 发表于 2023-5-20 20:18

本帖最后由 YZH1028 于 2023-5-31 13:34 编辑

因为每一章有几小节，小节的链接地址我只能在每一章中取出，该如何加入多线程，望指点
import requests
from lxml import etree
import re
import urllib.parse
import time
import os
# Module-level flag; none of the code visible in this file reads it.
success = False
# Request headers that next_ye() sends with every chapter-page request.
headers = {'user-agent': 'Mozilla/5.0'}
class Urlchuli:
    """Percent-encode or percent-decode a value using a chosen character encoding.

    Attributes:
        can: The value to convert; it is passed through ``str()`` before use.
        mazhi: Name of the character encoding (``'utf-8'`` by default).
    """

    def __init__(self, can, mazhi='utf-8'):
        self.can = can
        self.mazhi = mazhi

    def url_bm(self):
        """Return ``can`` percent-encoded, with its characters encoded as ``mazhi``."""
        # quote() encodes the text itself when given encoding=, so no manual
        # str.encode() round trip is needed.
        return urllib.parse.quote(str(self.can), encoding=self.mazhi)

    def url_jm(self):
        """Return ``can`` percent-decoded, reading the escaped bytes as ``mazhi``."""
        return urllib.parse.unquote(str(self.can), encoding=self.mazhi)

# Prompt for the search keyword when the module is loaded and percent-encode
# it as GBK; main() passes this module-level value to search_book().
name = Urlchuli(input('请输入书名/作者：\n'), 'gbk').url_bm()

def search_book(name):
    """Search www.z555.net by article name and return the matching books.

    Args:
        name: The search keyword, already percent-encoded; it is inserted
            verbatim into the form-encoded request body.

    Returns:
        A list with one ``[book_name, book_url, writer, book_time]`` entry per
        result row. ``book_url`` is the row's link prefixed with the site
        root; ``writer`` and ``book_time`` are the texts of the row's third
        and fifth cells. The four XPath result lists are paired positionally,
        so if they differ in length the surplus items are dropped.
    """
    cookies = {
        'PHPSESSID': '5u2161egtijt1b5dr6qdtch3ll',
        'jq_Obj': '1',
        '__51cke__': '',
        '__tins__18946369': '%7B%22sid%22%3A%201684375113921%2C%20%22vd%22%3A%206%2C%20%22expires%22%3A%201684377145674%7D',
        '__51laig__': '6',
    }

    # Local headers for the search form; these shadow the module-level
    # ``headers`` inside this function only.
    headers = {
        'authority': 'www.z555.net',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cache-control': 'no-cache',
        'content-type': 'application/x-www-form-urlencoded',
        'origin': 'https://www.z555.net',
        'pragma': 'no-cache',
        'referer': 'https://www.z555.net/search.php',
        'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0',
    }

    data = f's={name}&searchtype=articlename'
    response = requests.post('https://www.z555.net/search.php', cookies=cookies, headers=headers, data=data)
    response.encoding = response.apparent_encoding

    html = etree.HTML(response.text)
    books_time = html.xpath('//*[@id="nr"]/td[5]/text()')
    books_name = html.xpath('//tr[@id="nr"]/td[@class="odd"]/a/text()')
    books_writer = html.xpath('//*[@id="nr"]/td[3]/text()')
    books_link = html.xpath('//tr[@id="nr"]/td[@class="odd"]/a/@href')

    # Build one row per result. The previous version concatenated the site
    # root with the whole link *list* (a TypeError) instead of one link each.
    list01 = [
        [book_name, 'https://www.z555.net' + book_link, book_writer, book_time]
        for book_name, book_link, book_writer, book_time
        in zip(books_name, books_link, books_writer, books_time)
    ]
    length = len(list01)
    print(length)
    for row in list01:
        print(row)

    print(f'------------当前共查询到{length}种小说-------------')

    return list01

def choose_name(list01):
    """Prompt until the typed text exactly equals one field of a search result.

    Args:
        list01: Search results as produced by search_book(); each entry is a
            list of strings.

    Returns:
        The first entry of ``list01`` that contains the typed text as one of
        its fields (list membership, i.e. an exact match of a whole field).

    Raises:
        ValueError: If ``list01`` is empty, because no input could ever match
            and the prompt would otherwise repeat forever.
    """
    if not list01:
        raise ValueError('没有可供选择的小说')

    while True:
        name = input('请输入你要获取其中的小说：')
        for i in list01:
            if name in i:
                print(i)
                return i
        # Tell the user why the prompt is shown again.
        print(f'未找到“{name}”，请重新输入！')

def get_find(list01):
    """Fetch a book's page and return its chapter catalogue.

    Args:
        list01: One search result: index 0 is the book name, index 1 the
            book URL, index 2 the author name.

    Returns:
        A list with one ``[chapter_url, chapter_title, book_name, writer]``
        entry per catalogue link, where ``chapter_url`` is the link's href
        prefixed with the site root.
    """
    books_name = list01[0]
    books_writer = list01[2]

    get_choose = requests.get(list01[1])
    get_choose.encoding = get_choose.apparent_encoding

    html = etree.HTML(get_choose.text)
    catalogue_name = html.xpath('//*[@id="chapterlist"]/ul/li/a/text()')
    catalogue_link = html.xpath('//*[@id="chapterlist"]/ul/li/a/@href')

    # Pair each link with its title. The previous version concatenated the
    # site root with the whole link *list* (a TypeError) for every row.
    list_data = [
        ['https://www.z555.net' + link, title, books_name, books_writer]
        for link, title in zip(catalogue_link, catalogue_name)
    ]
    return list_data

def next_ye(nurl):
    """Download a book page by page by following each page's "next" link.

    Starting at the URL of the first catalogue entry, every page is fetched
    and decoded as GBK, and each ``<div id="content">`` body found in it is
    written to ``小说存放/<author>/《<book>》/《<chapter>》.txt``. The loop stops
    once a page's next-link href is ``'./'``.

    Args:
        nurl: Catalogue as produced by get_find(): a list of
            ``[chapter_url, chapter_title, book_name, author]`` entries.
            Only the first entry is read; later pages are reached through
            the next-links. An empty list prints a notice and returns.
    """
    if not nurl:
        print('--------------------未获取到章节目录---------------------')
        return

    next_url = nurl[0][0]
    book_name = nurl[0][2]
    book_author = nurl[0][3]

    # Loop-invariant work: the output directory and the content pattern.
    save_dir = f'小说存放/{book_author}/《{book_name}》/'
    os.makedirs(save_dir, exist_ok=True)
    obj_name = re.compile(r'<div id="content">(?P<content>.*?)</div>', re.S)

    book_next_link = ''
    while book_next_link != './':
        res = requests.get(url=next_url, headers=headers)
        res.encoding = 'gbk'
        html = etree.HTML(res.text)
        book_next_link = html.xpath('//*[@id="container"]/div[3]/a[5]/@href')[0]
        book_chapter_name = named(html.xpath('//div[@class="title"]/h1/text()')[0])

        # The with-block closes the file; closing it by hand inside the loop
        # made a second match write to an already-closed file.
        with open(f'{save_dir}《{book_chapter_name}》.txt', 'w', encoding='utf-8') as f:
            for match in obj_name.finditer(res.text):
                chapter_content = match.group('content').replace("<br/><br/>", '\n')
                f.write('【' + book_chapter_name + '】' + '\n\n' + chapter_content + '\n\n')
                print(f'------------【{book_chapter_name}下载完成】------------------')

        # Resolve the href against the page it came from instead of assuming
        # the first page is literally named "1.html".
        next_url = urllib.parse.urljoin(next_url, book_next_link)

    print('--------------------小说下载完毕---------------------')

def named(title):
    """Return ``title`` with characters unsuitable for a file name replaced by ``_``.

    Replaced characters: ``? * \\ / : ! < > | "`` and any whitespace. ``*`` is
    included because it is not allowed in Windows file names.
    """
    return re.sub(r'[?*\\/:!<>|"\s]', '_', title)

def main():
    """Run the interactive flow: search, let the user pick a book, download it."""
    found = search_book(name)      # uses the module-level, pre-encoded keyword
    chosen = choose_name(found)
    text = get_find(chosen)
    next_ye(text)

if __name__ == '__main__':
    # Time the whole run and report the elapsed seconds.
    t1 = time.time()
    main()
    t2 = time.time()
    print('耗时：', t2 - t1)

yuxuechao · 发表于 2023-5-20 21:21

thread + 定义每个线程的步长

YZH1028 · 发表于 2023-5-20 22:58

yuxuechao 发表于 2023-5-20 21:21
thread + 定义每个线程的步长

能否说详细点？我用 thread 时，它每次只能爬取第一章，小节的链接不会更换。

llacjj · 发表于 2023-5-21 05:05

好像没必要在爬取阶段处理顺序。给每个线程传递一个序号，在处理返回结果的阶段按序号顺序处理，而不是按线程谁先爬取完成来处理，这样逻辑也好处理。想在爬取阶段就按顺序处理，本身就违反了多线程的本意。

涛之雨 · 发表于 2023-5-21 08:32

线程分配id，先均分任务池然后根据速度进行微调，最后根据id排个序

YZH1028 · 发表于 2023-5-21 11:45

好的，谢谢指点

YZH1028 · 发表于 2023-5-21 11:46

llacjj 发表于 2023-5-21 05:05
好像没必要在爬取阶段处理顺序，给每个线程安排传递一个序号，在爬取返回阶段，按顺序处理序号线程，而不是 ...

谢谢，了解了

pjy612 · 发表于 2023-5-21 18:09

如果知道请求的 id 可以当参数传过去，作为结果排序的依据。如果结果中包含可以排序的字段，也可以拿来最终排序用。
看具体对时效性的要求。
如果是立刻就要存下来什么的。那么可以根据结果或者入参id 啥的，给文件或者结果取名。。。

YZH1028 · 发表于 2023-5-21 19:46

pjy612 发表于 2023-5-21 18:09
如果知道请求的 id 可以当参数传过去，作为结果排序的依据。如果结果中包含可以排序的字段 ...

好滴，谢谢指点

kwk99 · 发表于 2023-5-25 15:48

可以使用 ThreadPoolExecutor，并使用 map 方法，无需提前使用 submit 方法。map 方法与 Python 标准库中的 map 含义相同，都是对序列中的每个元素执行同一个函数。下面的代码就是对 urls 的每个元素都执行 get_html 函数，并分配到线程池里。map 返回结果的顺序与 urls 中元素的顺序一致，而不是按任务完成的先后，因此可以保证顺序不变。

[Python] 代码示例：

import time
from concurrent.futures import ThreadPoolExecutor
  
def get_html(times):
    """Sleep for ``times`` seconds, report success, and return ``times``."""
    time.sleep(times)
    message = f"get page {times} success"
    print(message)
    return times
  
# Sample inputs for the demo: each value is the number of seconds that
# get_html sleeps. (The snippet previously used ``urls`` without defining it.)
urls = [3, 2, 4]

# The with-block shuts the pool down once every task has finished.
with ThreadPoolExecutor(max_workers=2) as executor:
    # executor.map yields results in the order of ``urls``, not in the order
    # in which the tasks happen to complete.
    for data in executor.map(get_html, urls):
        print("get {} page".format(data))

帐号		自动登录	找回密码
密码			注册[Register]

[已解决] 怎样加入多线程且爬取顺序不变