[Python Repost] A Python scraper for a beauty photo gallery site

1942 posted on 2021-6-22 23:30
Last edited by 1942 on 2021-6-23 11:20

A friend sent me this site a few days ago. I have only been learning Python for about a week, so I took it as practice.
Along the way I found that the VIP tier on this kind of site is purely decorative, so I went ahead and grabbed all the hidden images too.
Screenshots:
[Screenshot: QQ20210622-231859.png]
[Screenshot: 2.png]
There are 800+ pages in total, so please go easy on the site. (Image downloading is not multithreaded; lol)
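A quick note on how the "hidden" VIP images leak: the blurred placeholder's src embeds the original image URL as a query parameter, and the script below recovers it with img.split('=')[1].split('&')[0]. Here is a minimal sketch of the same extraction done with urllib.parse (my illustration, not from the original post; it assumes the real URL is the value of the first query parameter, which is what those splits imply):

[Python]
from urllib.parse import urlparse, parse_qsl

def real_image_url(thumb_src):
    # Pull the first query-parameter value out of the thumbnail src,
    # e.g. '...?src=http://.../photo.jpg&w=285' -> 'http://.../photo.jpg'
    # (the parameter name 'src' here is illustrative).
    params = parse_qsl(urlparse(thumb_src).query)
    if not params:
        raise ValueError('no query string in: ' + thumb_src)
    return params[0][1]

The full script: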


[Python]
import requests
from lxml import etree
import csv
from time import sleep
import os

if __name__ == '__main__':
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'
    }
    all_url = 'http://www.tulishe.com/all/page/%d'
    f = open('tulishe.csv', mode='w', encoding='gb18030')
    csvwriter = csv.writer(f)
    for page_num in range(1, 842):  # 841 pages in total
        url = all_url % page_num
        # print(url)
        page_text = requests.get(url=url, headers=headers).text
        tree = etree.HTML(page_text)
        post_list = tree.xpath('//div[@id="posts"]/div')
        print('------ Crawling page ' + str(page_num) + ' ------')
        for div in post_list:
            link = div.xpath('./div/a/@href')[0]         # article link
            title = div.xpath('./div/a/@title')[0]       # title
            img = div.xpath('./div/a/img/@data-src')[0]  # cover image
            img1 = img.split('=')[1]                     # strip the prefix around the cover URL
            img2 = img1.split('&')[0]                    # strip the suffix after the cover URL
            print('------ Downloading images for [' + title + '] ------')
            headers2 = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
                'Referer': link
            }
            # print(link, title)
            # request the detail page
            page2_text = requests.get(url=link, headers=headers).text
            tree2 = etree.HTML(page2_text)
            item = tree2.xpath('/html/body/div[2]/div/div[2]/div/article/header/div/span[3]/a/text()')      # category
            article_tags = tree2.xpath('/html/body/div[2]/div/div[2]/div/article/div[3]/a')                 # tag group
            tags = []
            for a in article_tags:
                tag = a.xpath('./text()')
                tags.append(tag)
            # print(item, tags)
            all_pic_url = []            # all images on the detail page
            pic_urls = []
            pic_list = tree2.xpath('//*[@id="gallery-2"]/div[@class="gallery-item gallery-fancy-item"]')    # the 4 visible preview images
            a = 0
            for div in pic_list:
                pic_url = div.xpath('./a/@href')[0]
                pic_urls.append(pic_url)
                all_pic_url.append(pic_url)                # add the preview images to all_pic_url

                # download the preview images
                a = int(a) + 1
                print('-- Downloading preview image', a, '--')
                img_data = requests.get(url=pic_url, headers=headers2).content
                s = pic_url.rfind('/')
                dir_name = pic_url[0:s]  # uses the full URL as a directory path; works on macOS/Linux only (':' is invalid in Windows paths)
                if not os.path.isdir(dir_name):
                    try:
                        original_umask = os.umask(0)
                        os.makedirs(dir_name, mode=0o777)
                    finally:
                        os.umask(original_umask)
                with open(pic_url, 'wb') as fp:
                    fp.write(img_data)
            print('--', title, '-------', len(pic_urls), 'preview images downloaded ------')

            pic_list = tree2.xpath('//*[@id="gallery-2"]/div[@class="gallery-item gallery-blur-item"]')  # hidden (VIP) images on the detail page
            pic_url3s = []
            for div in pic_list:
                pic_url = div.xpath('./img/@src')[0]
                pic_url2 = pic_url.split('=')[1]            # extract the hidden image URL (step 1)
                pic_url3 = pic_url2.split('&')[0]           # extract the hidden image URL (step 2)
                pic_url3s.append(pic_url3)
                all_pic_url.append(pic_url3)                # add the hidden images to all_pic_url
                # download the hidden images
                a = int(a) + 1
                print('-- Downloading hidden image', a, '--')
                img_data = requests.get(url=pic_url3, headers=headers2).content
                s = pic_url3.rfind('/')
                dir_name = pic_url3[0:s]
                if not os.path.isdir(dir_name):
                    try:
                        original_umask = os.umask(0)
                        os.makedirs(dir_name, mode=0o777)
                    finally:
                        os.umask(original_umask)
                with open(pic_url3, 'wb') as fp:
                    fp.write(img_data)
            print(title, '-------', len(pic_url3s), 'hidden images downloaded ------')
            csvwriter.writerow([title, link, img2, item, tags, all_pic_url])     # save [title; link; cover; category; tags; detail-page images] to the csv
            sleep(0.1)
            print(link, title, 'done!')

            # download the cover image
            img_data = requests.get(url=img2, headers=headers2).content
            s = img2.rfind('/')
            dir_name = img2[0:s]
            if not os.path.isdir(dir_name):
                try:
                    original_umask = os.umask(0)
                    os.makedirs(dir_name, mode=0o777)
                finally:
                    os.umask(original_umask)
            with open(img2, 'wb') as fp:
                fp.write(img_data)
            print('--', title, 'cover image ------ downloaded ------')

        print('Page ' + str(page_num) + ' done!')
    f.close()
    print('All done! (Output file: tulishe.csv in the current directory)')






To run this on Windows, use the version below. Thanks to @yhp869 for the fix.
[Python]
import requests
from lxml import etree
import csv
from time import sleep
import os

if __name__ == '__main__':
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'
    }
    all_url = 'http://www.tulishe.com/all/page/%d'
    f = open('tulishe.csv', mode='w', encoding='gb18030')
    csvwriter = csv.writer(f)
    for page_num in range(1, 842):  # 841 pages in total
        try:
            url = all_url % page_num
            # print(url)
            page_text = requests.get(url=url, headers=headers).text
            tree = etree.HTML(page_text)
            post_list = tree.xpath('//div[@id="posts"]/div')
            print('------ Crawling page ' + str(page_num) + ' ------')
            for div in post_list:
                try:
                    link = div.xpath('./div/a/@href')[0]  # article link
                    title = div.xpath('./div/a/@title')[0]  # title
                    img = div.xpath('./div/a/img/@data-src')[0]  # cover image
                    img1 = img.split('=')[1]  # strip the prefix around the cover URL
                    img2 = img1.split('&')[0]  # strip the suffix after the cover URL
                    print('------ Downloading images for [' + title + '] ------')
                    headers2 = {
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
                        'Referer': link
                    }
                    # print(link, title)
                    # request the detail page
                    page2_text = requests.get(url=link, headers=headers).text
                    tree2 = etree.HTML(page2_text)
                    item = tree2.xpath('/html/body/div[2]/div/div[2]/div/article/header/div/span[3]/a/text()')  # category
                    article_tags = tree2.xpath('/html/body/div[2]/div/div[2]/div/article/div[3]/a')  # tag group
                    tags = []
                    for a in article_tags:
                        tag = a.xpath('./text()')
                        tags.append(tag)
                    # print(item, tags)
                    all_pic_url = []  # all images on the detail page
                    pic_urls = []
                    pic_list = tree2.xpath('//*[@id="gallery-2"]/div[@class="gallery-item gallery-fancy-item"]')  # the 4 visible preview images
                    a = 0
                    for div in pic_list:
                        try:
                            pic_url = div.xpath('./a/@href')[0]
                            pic_urls.append(pic_url)
                            all_pic_url.append(pic_url)  # add the preview images to all_pic_url

                            # download the preview images
                            a = int(a) + 1
                            print('-- Downloading preview image', a, '--')
                            img_data = requests.get(url=pic_url, headers=headers2).content

                            # derive a relative path that drops 'http://www.tulishe', so no ':' reaches the Windows filesystem
                            dir_name = pic_url.rsplit('.')[2].rsplit('/', 1)[0]
                            if not os.path.isdir(dir_name):
                                try:
                                    original_umask = os.umask(0)
                                    os.makedirs(dir_name, mode=0o777)
                                finally:
                                    os.umask(original_umask)

                            with open(pic_url.split(".", 2)[2], 'wb') as fp:
                                fp.write(img_data)
                        except Exception as e:
                            continue
                    print('--', title, '-------', len(pic_urls), 'preview images downloaded ------')

                    pic_list2 = tree2.xpath('//*[@id="gallery-2"]/div[@class="gallery-item gallery-blur-item"]')  # hidden (VIP) images
                    pic_url3s = []
                    for div in pic_list2:
                        try:
                            pic_url = div.xpath('./img/@src')[0]
                            pic_url2 = pic_url.split('=')[1]  # extract the hidden image URL (step 1)
                            pic_url3 = pic_url2.split('&')[0]  # extract the hidden image URL (step 2)
                            pic_url3s.append(pic_url3)
                            all_pic_url.append(pic_url3)  # add the hidden images to all_pic_url
                            # download the hidden images
                            a = int(a) + 1
                            print('-- Downloading hidden image', a, '--')
                            img_data = requests.get(url=pic_url3, headers=headers2).content
                            dir_name = pic_url3.rsplit('.')[2].rsplit('/', 1)[0]
                            if not os.path.isdir(dir_name):
                                try:
                                    original_umask = os.umask(0)
                                    os.makedirs(dir_name, mode=0o777)
                                finally:
                                    os.umask(original_umask)
                            with open(pic_url3.split(".", 2)[2], 'wb') as fp:
                                fp.write(img_data)
                        except Exception as e:
                            continue
                    print(title, '-------', len(pic_url3s), 'hidden images downloaded ------')
                    csvwriter.writerow([title, link, img2, item, tags, all_pic_url])  # save [title; link; cover; category; tags; detail-page images] to the csv
                    sleep(0.1)
                    print(link, title, 'done!')

                    # download the cover image
                    img_data = requests.get(url=img2, headers=headers2).content
                    dir_name = img2.rsplit('.')[2].rsplit('/', 1)[0]

                    if not os.path.isdir(dir_name):
                        try:
                            original_umask = os.umask(0)
                            os.makedirs(dir_name, mode=0o777)
                        finally:
                            os.umask(original_umask)
                    with open(img2.split(".", 2)[2], 'wb') as fp:
                        fp.write(img_data)
                    print('--', title, 'cover image ------ downloaded ------')
                except Exception as e:
                    continue

            print('Page ' + str(page_num) + ' done!')
        except Exception as e:
            continue
    f.close()
    print('All done! (Output file: tulishe.csv in the current directory)')








OP | 1942 posted on 2021-6-23 18:04
This is way too slow. Time to add multithreading.

[Python]
import requests
from lxml import etree
import csv
from time import sleep
import os
from concurrent.futures import ThreadPoolExecutor


headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'
}
f = open('tulishe.csv', mode='w', encoding='gb18030')
csvwriter = csv.writer(f)

def download_one_page(url):
    page_num = url.rsplit('/', 1)[-1]  # derive the page number from the URL; the original read the global loop variable, which races across threads
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    post_list = tree.xpath('//div[@id="posts"]/div')
    print('------ Crawling page ' + str(page_num) + ' ------')
    for div in post_list:
        try:
            link = div.xpath('./div/a/@href')[0]  # article link
            title = div.xpath('./div/a/@title')[0]  # title
            img = div.xpath('./div/a/img/@data-src')[0]  # cover image
            img1 = img.split('=')[1]  # strip the prefix around the cover URL
            img2 = img1.split('&')[0]  # strip the suffix after the cover URL
            print('------ Downloading images for [' + title + '] ------')
            headers2 = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
                'Referer': link
            }
            # print(link, title)
            # request the detail page
            page2_text = requests.get(url=link, headers=headers).text
            tree2 = etree.HTML(page2_text)
            item = tree2.xpath('/html/body/div[2]/div/div[2]/div/article/header/div/span[3]/a/text()')  # category
            article_tags = tree2.xpath('/html/body/div[2]/div/div[2]/div/article/div[3]/a')  # tag group
            tags = []
            for a in article_tags:
                tag = a.xpath('./text()')
                tags.append(tag)
            # print(item, tags)
            all_pic_url = []  # all images on the detail page
            pic_urls = []
            pic_list = tree2.xpath(
                '//*[@id="gallery-2"]/div[@class="gallery-item gallery-fancy-item"]')  # the 4 visible preview images
            a = 0
            for div in pic_list:
                try:
                    pic_url = div.xpath('./a/@href')[0]
                    pic_urls.append(pic_url)
                    all_pic_url.append(pic_url)  # add the preview images to all_pic_url

                    # download the preview images
                    a = int(a) + 1
                    # print('-- Downloading preview image', a, '--')
                    img_data = requests.get(url=pic_url, headers=headers2).content

                    dir_name = pic_url.rsplit('.')[2].rsplit('/', 1)[0]  # Windows-safe relative path (drops 'http://www.tulishe')
                    if not os.path.isdir(dir_name):
                        try:
                            original_umask = os.umask(0)
                            os.makedirs(dir_name, mode=0o777)
                        finally:
                            os.umask(original_umask)

                    with open(pic_url.split(".", 2)[2], 'wb') as fp:
                        fp.write(img_data)
                except Exception as e:
                    continue
            # print('--', title, '-------', len(pic_urls), 'preview images downloaded ------')

            pic_list2 = tree2.xpath(
                '//*[@id="gallery-2"]/div[@class="gallery-item gallery-blur-item"]')  # hidden (VIP) images
            pic_url3s = []
            for div in pic_list2:
                try:
                    pic_url = div.xpath('./img/@src')[0]
                    pic_url2 = pic_url.split('=')[1]  # extract the hidden image URL (step 1)
                    pic_url3 = pic_url2.split('&')[0]  # extract the hidden image URL (step 2)
                    pic_url3s.append(pic_url3)
                    all_pic_url.append(pic_url3)  # add the hidden images to all_pic_url
                    # download the hidden images
                    a = int(a) + 1
                    # print('-- Downloading hidden image', a, '--')
                    img_data = requests.get(url=pic_url3, headers=headers2).content
                    dir_name = pic_url3.rsplit('.')[2].rsplit('/', 1)[0]
                    if not os.path.isdir(dir_name):
                        try:
                            original_umask = os.umask(0)
                            os.makedirs(dir_name, mode=0o777)
                        finally:
                            os.umask(original_umask)
                    with open(pic_url3.split(".", 2)[2], 'wb') as fp:
                        fp.write(img_data)
                except Exception as e:
                    continue
            # print(title, '-------', len(pic_url3s), 'hidden images downloaded ------')
            csvwriter.writerow([title, link, img2, item, tags, all_pic_url])  # save [title; link; cover; category; tags; detail-page images] to the csv
            sleep(0.01)
            print(link, title, 'done!')

            # download the cover image
            img_data = requests.get(url=img2, headers=headers2).content
            dir_name = img2.rsplit('.')[2].rsplit('/', 1)[0]

            if not os.path.isdir(dir_name):
                try:
                    original_umask = os.umask(0)
                    os.makedirs(dir_name, mode=0o777)
                finally:
                    os.umask(original_umask)
            with open(img2.split(".", 2)[2], 'wb') as fp:
                fp.write(img_data)
            print('--', title, 'cover image ------ downloaded ------')
        except Exception as e:
            continue

    print('Page ' + str(page_num) + ' done!')

if __name__ == '__main__':
    with ThreadPoolExecutor(100) as t:  # 100 worker threads
        for page_num in range(1, 842):  # 841 pages in total
            t.submit(download_one_page, f'http://www.tulishe.com/all/page/{page_num}')
    f.close()  # the pool's with-block waits for all tasks, so it is safe to close here
    print('All done! (Output file: tulishe.csv in the current directory)')



slabber posted on 2021-7-9 11:09
A question: running this on Windows, it reports that everything finished crawling, but all I get is a tulishe.csv that opens as an empty sheet. What is going on? Thanks!
superGC posted on 2021-6-23 10:11
Quoting 巧言乱德 (posted on 2021-6-23 10:01):
The directory/file name throws an error. How do I modify it to work on Windows? Any experts around?

Shift the split index forward so you skip the 'http:' part, and it works.
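My reading of that advice in code (an illustration, not superGC's exact fix): split on '/' and keep everything after the scheme, so the ':' that Windows rejects never reaches the filesystem:

[Python]
# 'http://www.tulishe.com/wp-content/uploads/a.jpg'.split('/', 2)[2]
#   -> 'www.tulishe.com/wp-content/uploads/a.jpg'
local_path = pic_url.split('/', 2)[2]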
123598 posted on 2021-6-23 02:01
Running it in PyCharm, creating the folder throws an error. How can I fix that?
[Screenshot: QQ截图20210623015939.png]
爱的小热裤 posted on 2021-6-23 01:35
Bro, could you package it up and share a copy?
雾都孤尔 posted on 2021-6-23 03:10
Downloading some pictures as a feast for the eyes.
kanaeri posted on 2021-6-23 06:20
Taking a look, taking a look, oooh
dongse posted on 2021-6-23 06:43
Notice: the author has been banned or deleted; the content was automatically hidden.
w360 posted on 2021-6-23 07:23
Anything good in here?
龍謹 posted on 2021-6-23 07:46
Thanks for sharing the scraper source code, boss!
zj1977lsz posted on 2021-6-23 08:21
Tried it and it throws errors, and I don't know how to fix them. Never mind.
grykwok110 posted on 2021-6-23 08:26
The only thing that gets me studying is the girls.