Made a few modifications to the OP's code:
1. Added a thread pool with a configurable thread count (default 8).
2. Already-downloaded images are no longer re-downloaded, so if the program stops partway through, the next run only fetches the images that are still missing.
3. Downloads every page of the site until the server returns 404.
4. Assorted minor fixes.
[Python]
# -*- coding: utf-8 -*-
# @Time: 2022/4/17 10:05
# @Author: 宇
# @File: 1111.py
# @Software: PyCharm
import requests
import os
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

thread_num = 8  # configurable number of worker threads

headers = {
    'Referer': 'https://www.mmlme.com/jp',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
}


# Collect the gallery titles and links on one listing page
def get_zu_urls(pepa):
    zu_title_urls = {}
    url = f'https://www.mmlme.com/jp/page/{pepa}'
    res = requests.get(url=url, headers=headers)
    if res.status_code == 404:  # past the last page
        return None
    tree = etree.HTML(res.text)
    titles = tree.xpath('//div[@class="posts-row ajaxpager"]/posts/div[2]/h2/a/text()')
    urls = tree.xpath('//div[@class="posts-row ajaxpager"]/posts/div[2]/h2/a/@href')
    for title, url in zip(titles, urls):
        zu_title_urls[title] = url
    return zu_title_urls


# Collect the image URLs inside one gallery
def get_urls(urls):
    res = requests.get(url=urls, headers=headers).text
    tree = etree.HTML(res)
    urls_list = tree.xpath('//ul/li/figure/a/@box-img')
    return urls_list


def save(name, url):
    if not os.path.exists('图库/' + name):
        os.mkdir('图库/' + name)
    urls_list = get_urls(url)
    a = 1
    for i in urls_list:
        jpgpath = '图库/' + name + '/' + str(a) + '.jpg'
        if os.path.exists(jpgpath):  # already downloaded, skip it
            a += 1
            continue
        res = requests.get(url=i, headers=headers).content
        with open(jpgpath, 'wb') as f:
            f.write(res)
        a += 1
    print(name + ' finished downloading!')


def main():
    if not os.path.exists('图库'):
        os.mkdir('图库')
    with ThreadPoolExecutor(max_workers=thread_num) as pool:
        for pepa in range(1, 666):
            print(f'............downloading page {pepa}............')
            zu_title_urls = get_zu_urls(pepa)
            if zu_title_urls is None:  # hit a 404, no more pages
                break
            for name, url in zu_title_urls.items():
                pool.submit(save, name, url)
    # leaving the with-block waits for all submitted tasks to finish
    print('All downloads finished')


if __name__ == '__main__':
    main()
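
One further tweak worth considering: requests.get() has no timeout by default, so a single stalled connection can hang a worker thread forever, which is exactly the mid-run interruption case the resume feature is meant to handle. Below is a minimal sketch of a fetch helper with a timeout and a few retries that the download calls could go through; the fetch name and the retry/timeout values are my own choices, not from the original post:
[Python]
import time
import requests

def fetch(url, headers, retries=3, timeout=10):
    """Hypothetical helper: GET with a timeout and simple retries."""
    for attempt in range(retries):
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
            res.raise_for_status()  # treat HTTP errors as failures too
            return res
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(1)  # brief pause before retrying

# e.g. in save():  res = fetch(i, headers).content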