救救孩子吧，写了一下午没写出来，老哥们帮忙改一下啊

应真先生 · 发表于 2019-7-19 01:13

本帖最后由应真先生于 2019-7-19 17:50 编辑

今天看论坛里一个老哥用urllib库写了个爬虫，我想试着把它改一下，改成requests的，结果写了一天，一直报错，老哥们帮忙看看问题出在哪里。

[Python] 纯文本查看 复制代码

import urllib.request
import re
import requests
from requests import RequestException
import os
from hashlib import md5
'''
for a in range(0,10):
    fl_url = "https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/" + str(a) + ".html?route_id=15634482253883&route=1,3,&after_route=1,3"
    data = urllib.request.urlopen(fl_url).read().decode("utf-8","ignore")
    tj = "/detail/(.*?).html"
    p = re.compile(tj).findall(data)
    p = list(set(p))
    try:
        for i in range(len(p)):
            free = str(p)
            new_url = "https://818ps.com/detail/" + free + ".html"
            new_data = urllib.request.urlopen(new_url).read().decode("utf-8","ignore")
            new_tj = 'https://img.tuguaishou.com/ips_templ_preview/(.*?)"'
            q = re.compile(new_tj).findall(new_data)
            for j in range(len(q)):
                tup_url = "https://img.tuguaishou.com/ips_templ_preview/" + str(q[j])
                file = "F:/bing/a/" + str(a) +str(i) + str(j) + ".jpg"
                print("正在下载编号:" + str(a) + str(i) + str(j))
                urllib.request.urlretrieve(tup_url,filename=file)
                print("下载完成")
    except urllib.error.URLError as e:
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
'''

def get_page(offset):
    """Fetch one page of the 818ps.com search listing.

    Args:
        offset: Page number that is inserted into the listing URL.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` when the status is anything else or the request itself
        raises ``RequestException``.
    """
    base_url = 'https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/' + str(offset)
    url = base_url + '.html?route_id=15634484581189&route=3,&after_route=3'
    try:
        # The network call is what raises RequestException, so it must be
        # inside the try block for the handler below to be reachable.
        response = requests.get(url)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_url(html):
    """Extract detail-page links from the HTML of a listing page.

    Args:
        html: HTML text of a listing page. May be ``None`` or empty when
            the page could not be fetched.

    Returns:
        A list of the ``href`` values captured by the pattern (an ``<a``
        followed by ``open-detail`` and then ``href="..."``), in the order
        they appear. Empty when *html* is ``None`` or empty.
    """
    # A failed fetch hands us None; re.findall would raise TypeError on it.
    if not html:
        return []
    pattern = re.compile('<a.*?open-detail.*?href="(.*?)"', re.S)
    return pattern.findall(html)


def save_image(urls):
    """Download the images referenced on each detail page.

    For every site-relative path in *urls* the page is fetched from
    ``https://818ps.com``, ``(src, title)`` pairs are extracted from its
    HTML, and each image is written to ``<title>.png`` in the current
    directory unless that file already exists. Any error while handling a
    page is printed and the loop continues with the next page.

    Args:
        urls: Iterable of site-relative detail-page paths.
    """
    # The gap between ``alt`` and ``title`` is matched lazily: with re.S a
    # greedy ``.*`` would run to the last ``title="`` in the document and
    # collapse the whole page into a single match.
    pattern = re.compile(r'src="(.*?)"\salt.*?title="(.*?)">', re.S)
    for url in urls:
        try:
            resp = requests.get('https://818ps.com' + url)
            if resp.status_code != 200:
                continue
            # findall with two groups yields plain (src, title) tuples, not
            # match objects, so they are unpacked directly.
            for src, title in pattern.findall(resp.text):
                print(src, title)
                # requests cannot fetch a protocol-relative URL
                # ("//host/path"); give it an explicit scheme.
                if src.startswith('//'):
                    src = 'https:' + src
                # A "/" in the title would be read as a directory separator.
                file_path = '{file_name}.{file_suffix}'.format(
                    file_name=title.replace('/', '_'),
                    file_suffix='png')
                if os.path.exists(file_path):
                    print('Already Downloaded', file_path)
                    continue
                download = requests.get(src)
                with open(file_path, 'wb') as f:
                    f.write(download.content)
                print('Downloaded image path is %s' % file_path)
        except Exception as e:
            # Best effort: report the failure and move on to the next page.
            print(e)


offset = 1  # listing page number that main() crawls


def main():
    """Crawl one listing page and download the images it links to.

    Fetches the listing page numbered by the module-level ``offset``,
    extracts the detail-page links, prints them and passes them to
    ``save_image``. Does nothing further when the page cannot be fetched.
    """
    html = get_page(offset)
    if html is None:
        # get_page returns None on a failed or non-200 request.
        print('Failed to fetch listing page', offset)
        return
    urls = parse_url(html)
    print(urls)
    save_image(urls)


if __name__ == '__main__':
    main()

已找到问题出在哪里，并且把一楼老哥的代码改了一下，加入多进程（multiprocessing.Pool），图片保存在 img 文件夹里，并以网页标题作为文件名。

[Python] 纯文本查看 复制代码

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import requests
import os
from multiprocessing import Pool


def get_page(offset):
    """Collect the detail-page ids found on one search listing page.

    Args:
        offset: Page number that is inserted into the listing URL.

    Returns:
        A list of the distinct strings captured between ``/detail/`` and
        ``.html`` in the page text. Duplicates are removed through a set,
        so the order of the result is not defined.
    """
    listing_url = (
        "https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/"
        + str(offset)
        + ".html?route_id=15634482253883&route=1,3,&after_route=1,3"
    )
    page_text = requests.get(listing_url).text
    detail_ids = re.findall("/detail/(.*?).html", page_text)
    return list(set(detail_ids))


def save_page(p):
    """Download the preview images of every detail page listed in *p*.

    Each id is turned into ``https://818ps.com/detail/<id>.html``.
    ``(image path, title)`` pairs are extracted from that page and each
    image is saved as ``img/<title>.jpg`` (``/`` in the title replaced by
    ``_``) unless the file already exists. An error while handling one
    detail page is printed and the remaining pages are still processed.

    Args:
        p: Iterable of detail-page ids.
    """
    img_path = 'img'
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    pattern = re.compile(
        r'//img.tuguaishou.com/ips_templ_preview/(.*?)"\salt.*title="(.*?)"/>')
    for detail_id in p:
        new_url = "https://818ps.com/detail/" + str(detail_id) + ".html"
        try:
            new_data = requests.get(new_url).text
            for image_key, title in pattern.findall(new_data):
                tup_url = "https://img.tuguaishou.com/ips_templ_preview/" + str(image_key)
                # A "/" in the title would be read as a directory separator.
                title = re.sub('/', '_', str(title))
                file_path = img_path + os.path.sep + title + ".jpg"
                # Check before downloading so existing files cost no request.
                if os.path.exists(file_path):
                    print('已经下载', file_path)
                    continue
                print("正在下载编号:" + title)
                imagetemp = requests.get(tup_url).content
                # exist_ok: this function is run from several pool workers,
                # which may try to create the folder at the same moment.
                os.makedirs(img_path, exist_ok=True)
                with open(file_path, 'wb') as f:
                    f.write(imagetemp)
                print("下载完成")
        except (requests.exceptions.RequestException, OSError) as e:
            # requests exceptions carry no urllib-style ``code``/``reason``
            # attributes; print the exception itself so failures are visible.
            print(e)


def main(offset):
    """Process one listing page: gather its detail ids, then save their images.

    Args:
        offset: Listing page number, passed straight to ``get_page``.
    """
    save_page(get_page(offset))


group_start = 1  # first listing page to crawl
group_end = 10   # last listing page to crawl (inclusive)
if __name__ == '__main__':
    # One task per page number, distributed over a pool of worker processes.
    groups = list(range(group_start, group_end + 1))
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()

ixsec · 发表于 2019-7-19 02:48

本帖最后由 ixsec 于 2019-7-19 02:50 编辑

网站有限制，你下载图片的那个链接其实是正常的网址，并非图片的url~~

!w1024_w?auth_key=2195922752-0-0-c503d6c4bc4d4d300e9961fdb9981463

就如这个一样，这个其实是网页的 url，并非图片的 url。

[Python] 纯文本查看 复制代码

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#! author = ixsec
#! date : 2019/7/19
#! filename : 818ps.py
# MIT License
#
# Copyright (c) 2019 ixsec
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import re
import requests
from requests import RequestException
import os
from hashlib import md5

# Crawl listing pages 0-9: collect the detail-page ids on each listing page,
# then download every preview image referenced by each detail page into the
# current directory.
for a in range(0, 10):
    fl_url = "https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/" + str(a) + ".html?route_id=15634482253883&route=1,3,&after_route=1,3"
    try:
        data = requests.get(fl_url).text
    except requests.exceptions.RequestException as e:
        # Skip this listing page but keep crawling the remaining ones.
        print(e)
        continue
    p = re.findall("/detail/(.*?).html", data)
    for i in p:
        new_url = "https://818ps.com/detail/" + str(i) + ".html"
        # The try block is per detail page, so one failure does not abort
        # the rest of the listing page.
        try:
            new_data = requests.get(new_url).text
            q = re.findall('//img.tuguaishou.com/ips_templ_preview/(.*?)"', new_data)
            for j in q:
                tup_url = "https://img.tuguaishou.com/ips_templ_preview/" + str(j)
                print("正在下载编号:" + str(a) + str(i) + str(j))
                imagetemp = requests.get(tup_url).content
                # The captured URL tail may contain characters such as "/"
                # or "?" that are not usable in a file name; replace them.
                file_name = re.sub(r'[\\/:*?"<>|]', '_', str(a) + str(i) + str(j))
                with open("./" + file_name + ".jpg", 'wb') as f:
                    f.write(imagetemp)
                print("下载完成")
        except (requests.exceptions.RequestException, OSError) as e:
            # requests exceptions have no urllib-style ``code``/``reason``
            # attributes; print the exception itself so failures are visible.
            print(e)

成国大吉大利 · 发表于 2019-7-19 03:04

哥们说详细点啊，看不出来你要表达的意思

龙云666 · 发表于 2019-7-19 08:41

你这孩子没救了

haoii123 · 发表于 2019-7-19 08:47

有人解答了，我就不献丑了

mspe · 发表于 2019-7-19 08:52

我就是来摸鱼的

baduxue · 发表于 2019-7-19 09:05

一楼是正解！

waddy · 发表于 2019-7-19 09:17

论坛里果然大神无数，而且热心助人，仰望

应真先生 · 发表于 2019-7-19 10:50

ixsec 发表于 2019-7-19 02:48
网站有限制，你下载图片哪个连接其实是正常的网址并非是图片的url~~

!w1024_w?auth_key=2195922752-0 ...

老哥这个是论坛里某个老哥写的，我备注了，我想问一下下面我写的那个问题出在哪里？

zenaiwen · 发表于 2019-7-19 11:24

提示: 作者被禁止或删除内容自动屏蔽

帐号		自动登录	找回密码
密码			注册[Register]

[已解决] 救救孩子吧，写了一下午没写出来，老哥们帮忙改一下啊

个人中心

zenaiwen zenaiwen 当前离线好友阅读权限 0 听众最后登录 1970-1-1 头像被屏蔽	zenaiwen 发表于 2019-7-19 11:24 提示: 作者被禁止或删除内容自动屏蔽

	回复支持举报