scrapy框架爬取京东上手机的标题、价格、评价人数

Darren_Smith · 发表于 2019-11-13 09:13

废话不多说，直接上关键代码：
from __future__ import absolute_import
import scrapy
from scrapy.http import Request
#from jingdong.items import JingdongItem,IdItem
import re
import urllib.error
import urllib.request
import pymysql

class JdongItem(scrapy.Item):
    """Scrapy item holding the fields scraped from JD pages.

    Only ``title`` and ``pricesku`` are assigned by the spider code visible
    in this file; the remaining fields are declared but not referenced there.
    """

    all_class = scrapy.Field()
    next_class_name = scrapy.Field()
    next_class_url = scrapy.Field()
    book_name = scrapy.Field()
    book_url = scrapy.Field()
    comment = scrapy.Field()
    # Values of the ``data-sku`` attribute found on a search-result page.
    pricesku = scrapy.Field()
    # Text of the product-name ``<em>`` nodes found on a search-result page.
    title = scrapy.Field()
    publisher_name = scrapy.Field()
    publisher_url = scrapy.Field()
    publish_time = scrapy.Field()
    author = scrapy.Field()
    original_price = scrapy.Field()
class JdSpider(scrapy.Spider):
    """Crawl JD search-result pages and store title, price and comment count.

    For every search-result page the spider extracts product titles and SKU
    ids, looks up each SKU's price and comment count through two JSON
    endpoints (fetched synchronously with ``urllib``), and inserts one row
    per product into the MySQL table ``phone``.
    """

    name = 'jd'
    allowed_domains = ['jd.com']
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"}

    # Search URL without the trailing page number. ``keyword`` and ``wq``
    # carry the URL-encoded search term "电脑".
    SEARCH_URL = ("https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&enc=utf-8"
                  "&wq=%E7%94%B5%E8%84%91&pvid=d67a3bf540b04ee9a2c695f2a6bf26f6&page=")
    # Endpoints queried per SKU; the SKU id is appended to each.
    PRICE_URL = "https://p.3.cn/prices/mgets?&ext=11000000&pin=&type=1&area=1_72_4137_0&skuIds="
    COMMENT_URL = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds="
    # Last result page requested (pages 2..LAST_PAGE are queued from parse).
    LAST_PAGE = 165
    PROXY_ADDR = "61.135.217.7:80"
    # Seconds to wait for a price/comment lookup before giving up.
    FETCH_TIMEOUT = 10
    # Compiled once instead of once per product.
    PRICE_PATTERN = re.compile(r'"p":"(.*?)"')
    COMMENT_PATTERN = re.compile(r'"CommentCountStr":"(.*?)",')
    INSERT_SQL = "insert into phone(title,price,person) values(%s,%s,%s);"

    def start_requests(self):
        """Return the request for the first search-result page."""
        return [self._page_request(1)]

    def _page_request(self, page):
        """Build the Request for search-result page number ``page``."""
        return Request(
            self.SEARCH_URL + str(page),
            callback=self.parse,
            headers=self.header,
            meta={"cookiejar": 1},
        )

    def use_proxy(self, proxy_addr, url):
        """Fetch ``url`` through ``proxy_addr`` and return the decoded body.

        Args:
            proxy_addr: ``host:port`` of the proxy. A falsy value skips the
                explicit proxy handler.
            url: Address to download.

        Returns:
            The response body decoded as UTF-8 (undecodable bytes dropped),
            or ``None`` when the download fails for any reason.
        """
        request = urllib.request.Request(url)
        request.add_header("User-Agent", self.header["User-Agent"])
        handlers = []
        if proxy_addr:
            # Both lookup URLs are https, so the proxy must be registered for
            # that scheme as well or it is silently bypassed.
            handlers.append(
                urllib.request.ProxyHandler({"http": proxy_addr, "https": proxy_addr})
            )
        # A private opener avoids replacing the process-wide default opener
        # on every call.
        opener = urllib.request.build_opener(*handlers)
        try:
            with opener.open(request, timeout=self.FETCH_TIMEOUT) as reply:
                return reply.read().decode("utf-8", "ignore")
        except urllib.error.URLError as e:
            self.logger.warning(
                "Fetching %s failed: code=%s reason=%s",
                url, getattr(e, "code", None), getattr(e, "reason", None),
            )
        except Exception as e:
            # Best-effort download: report and let the caller skip this SKU.
            self.logger.warning("Fetching %s failed: %s", url, e)
        return None

    def parse(self, response):
        """Handle one search-result page.

        Yields:
            Requests for result pages 2..``LAST_PAGE`` (repeats of already
            scheduled URLs are expected to be dropped by Scrapy's duplicate
            filter), followed by one ``JdongItem`` whose ``title`` and
            ``pricesku`` fields hold the lists extracted from this page.
        """
        item = JdongItem()
        item["title"] = response.xpath("//div[@class='p-name']/a[@target='_blank']/em/text()").extract()
        item["pricesku"] = response.xpath("//li[@class='gl-item']/div/@data-sku").extract()

        for page in range(2, self.LAST_PAGE + 1):
            yield self._page_request(page)

        self._store_products(item["title"], item["pricesku"])
        # ``return item`` inside a generator is discarded; the item has to be
        # yielded to reach the item pipeline.
        yield item

    def _store_products(self, titles, skus):
        """Insert one ``phone`` row per (title, sku) pair that can be priced."""
        if not skus:
            return
        # NOTE(review): connection parameters are hard-coded; consider moving
        # them to the project settings.
        try:
            conn = pymysql.connect(host="127.0.0.1", user="root", passwd="ab123321", db="311", charset="utf8")
        except pymysql.MySQLError as e:
            self.logger.error("Cannot connect to MySQL: %s", e)
            return
        try:
            with conn.cursor() as cursor:
                # Assumes titles and SKUs are listed in the same order on the
                # page -- TODO confirm; zip stops at the shorter list.
                for title, sku in zip(titles, skus):
                    details = self._fetch_price_and_comments(sku)
                    if details is None:
                        continue
                    price, comment_count = details
                    cursor.execute(self.INSERT_SQL, (title, price, comment_count))
                    conn.commit()
        except pymysql.MySQLError as e:
            self.logger.error("Storing products failed: %s", e)
        finally:
            # Close the connection on the failure path too.
            conn.close()

    def _fetch_price_and_comments(self, sku):
        """Return ``(price, comment_count_str)`` for ``sku`` or ``None``."""
        price_data = self.use_proxy(self.PROXY_ADDR, self.PRICE_URL + sku)
        comment_data = self.use_proxy(self.PROXY_ADDR, self.COMMENT_URL + sku)
        if price_data is None or comment_data is None:
            return None
        prices = self.PRICE_PATTERN.findall(price_data)
        counts = self.COMMENT_PATTERN.findall(comment_data)
        if not prices or not counts:
            self.logger.warning("No price or comment count found for SKU %s", sku)
            return None
        try:
            price = float(prices[0])
        except ValueError:
            self.logger.warning("Unparseable price %r for SKU %s", prices[0], sku)
            return None
        return price, counts[0]
注意：以上代码仅为 spider 部分，不能直接在 IDLE 中运行。需要先创建一个 Scrapy 项目，在其 spiders 文件夹中新建 Python 文件，再在控制台用 scrapy crawl jd 运行；或者改用其他 IDE。

lydy · 发表于 2019-11-13 09:27

有人去试过么？

狂人日记 · 发表于 2019-11-13 09:34

谢谢楼主我现在就去试试效果

yang806 · 发表于 2019-11-13 09:37

支持一下爬虫党

梦幻嘟嘟 · 发表于 2019-11-13 09:50

貌似……操作是加请求头、加代理、处理 URL、丢数据库，over

zb0419 · 发表于 2019-11-13 11:04

没有爬成功

Darren_Smith · 发表于 2019-11-15 09:53

zb0419 发表于 2019-11-13 11:04
没有爬成功

看看你的报错内容

zb0419 · 发表于 2019-11-15 16:44

Darren_Smith 发表于 2019-11-15 09:53
看看你的报错内容

运行成功，没有报错了，调用的库都安装好了，但没有输出结果。这个还要研究看看你的代码是怎么写的，反正现在库安装好了就没有报错了，但没有输出结果。

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] scrapy框架爬取京东上手机的标题、价格、评价人数

免费评分

个人中心