# 本帖最后由 lihu5841314 于 2021-6-23 23:24 编辑 (forum post; last edited by lihu5841314 on 2021-6-23 23:24)
import requests,os,re,time
from lxml import etree
import asyncio
import aiohttp
import aiofile
# Example of an image URL routed through the theme's timthumb.php resizer.
# NOTE(review): nothing in the visible code reads this constant; it looks like
# a reference sample of the "hidden" image URL shape - confirm before removing.
yin_url = 'http://www.tulishe.com/wp-content/themes/modown/timthumb.php?src=http://www.tulishe.com/wp-content/uploads/2021/06/20075128370.jpg&w=2000&h=2000&zc=2&q=1000'

# HTTP headers attached to every page request made by resp().
# NOTE(review): the Referer presumably satisfies a hot-link check - confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36",
    "Referer": "http://www.tulishe.com/",
}
def resp(url, timeout=10):
    """Fetch *url* with the module-level ``headers`` and return the response.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would freeze the whole crawl.

    Returns:
        The ``requests.Response``, with ``encoding`` replaced by the encoding
        detected from the body so that ``.text`` decodes with it.

    Raises:
        requests.exceptions.RequestException: On connection errors or when the
            timeout expires.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Prefer the encoding guessed from the content over the one taken from
    # the HTTP headers.
    response.encoding = response.apparent_encoding
    return response
def main_img(res):
    """Extract the gallery detail-page links from a listing page.

    Args:
        res: Object exposing the page HTML as a ``text`` attribute.

    Returns:
        A list with the ``href`` value of every
        ``<a itemprop="url" rel="bookmark" ...>`` anchor, in document order.
    """
    anchor_re = re.compile(
        r'<a itemprop="url" rel="bookmark" href="(?P<urls>.*?)" title='
    )
    return [hit.group("urls") for hit in anchor_re.finditer(res.text)]
def tree(img_url):
    """Collect every image URL from one gallery detail page.

    Side effect: stores the page title (the text of the
    ``<h1 class="article-title">`` element(s), stripped and concatenated) in
    the module-level ``img_namedir1``, which ``down()`` uses as the output
    directory name.

    Args:
        img_url: URL of the gallery detail page.

    Returns:
        A list of image URLs: first the entries built from the gallery
        ``<div>``s after the fourth, then the links of the first four.
        Entries whose ``<div>`` lacks the expected ``<img>``/``<a>`` child are
        skipped instead of raising ``IndexError``.
    """
    global img_namedir1

    response = resp(img_url)
    img_tree = etree.HTML(response.text)

    titles = re.findall(
        r'<h1 class="article-title">(?P<name>.*?)</h1>', response.text
    )
    img_namedir1 = "".join(title.strip() for title in titles)

    # Evaluate the gallery XPath once and slice the result for both groups.
    gallery_divs = img_tree.xpath('//div[@id="gallery-2"]/div')
    img_urls = []

    print("正在处理隐藏照片---------------")
    # Entries after the fourth expose only an <img src=...>; drop the original
    # query parameters after the first "&" and request a 2000x2000 rendition.
    # NOTE(review): presumably these are the images the page hides from
    # visitors - confirm against the site markup.
    for hidden_div in gallery_divs[4:]:
        sources = hidden_div.xpath('./img/@src')
        if not sources:
            continue  # malformed entry: nothing to download
        img_urls.append(sources[0].split("&")[0] + '&w=2000&h=2000&zc=3&q=1000')

    # The first four entries link to their image through <a href=...>.
    for preview_div in gallery_divs[:4]:
        links = preview_div.xpath('./a/@href')
        if links:
            img_urls.append(links[0])

    return img_urls
async def down(ur):
    """Download one image and store it under ``img_tuli/<gallery title>/``.

    The target directory name comes from the module-level ``img_namedir1``
    that ``tree()`` sets, so ``tree()`` must have run for the gallery this URL
    belongs to.

    Args:
        ur: Image URL; either a direct link or a resizer URL that carries
            ``&``-separated parameters after the file name.

    Returns:
        None. Responses with a status other than 200 are reported and skipped
        so that error pages are not saved as image files.
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is itself an async context manager; no extra await.
        async with session.get(ur) as img_res:
            if img_res.status != 200:
                print(ur, "下载失败, 状态码:", img_res.status)
                return

            # Resizer URLs built by tree() end in "&w=...&h=..." parameters,
            # so the presence of "&" identifies them more reliably than the
            # URL length does.
            if "&" in ur:
                img_name = ur.split("/")[-1].split("&")[0]
                print("正在处理隐藏照片---------------名字")
                print(img_name)
            else:
                img_name = ur.split("/")[-1]
                print("---预览照片名字---")

            # Read the body before touching the disk so a failed transfer
            # does not leave an empty file behind.
            conment = await img_res.read()

    os.makedirs("img_tuli/" + img_namedir1, exist_ok=True)
    path = "img_tuli/" + img_namedir1 + '/' + img_name
    async with aiofile.async_open(path, 'wb') as f:
        await f.write(conment)
    print(path, "下载完成")
def main():
    """Crawl the listing pages and download every gallery found on them.

    Relies on the module-level event loop ``loop`` (created in the
    ``__main__`` guard) to run the download coroutines.

    The downloads of one gallery are awaited before the next gallery is
    parsed, because ``tree()`` overwrites the module-level directory name that
    ``down()`` reads.
    """
    for i in range(1, 2):  # the site has 870+ listing pages; only page 1 is crawled
        url = f"http://www.tulishe.com/page/{i}"
        response = resp(url)
        for detail_url in main_img(response):
            img_urls = tree(detail_url)
            if not img_urls:
                # asyncio.wait() raises ValueError on an empty collection.
                continue
            # Bind the tasks to ``loop`` explicitly instead of depending on
            # the implicit current-loop lookup done by asyncio.ensure_future().
            tasks = [loop.create_task(down(ur)) for ur in img_urls]
            loop.run_until_complete(asyncio.wait(tasks))
if __name__ == '__main__':
    # Script entry point: run the crawl and report the elapsed wall time.
    start = time.perf_counter()  # monotonic clock, unaffected by system clock changes
    # main() reads this module-level ``loop``. Create it explicitly:
    # asyncio.get_event_loop() without a running loop is deprecated.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        main()
    finally:
        # Release the loop's resources even if the crawl fails.
        loop.close()
    print('一共耗时', time.perf_counter() - start)
|