从慢慢买网站中爬取京东和天猫的商品历史价格

Darren_Smith · 发表于 2019-11-15 10:31

这份代码是从 CSDN 付费下载的，修改之后已经可以在 IDLE 中正常运行；抓取到的数据会导出为 Excel 表格，大家可以自行改进、学习。

#!/usr/bin/python
import datetime
import io
import json
import random
import re
import socket
import time
from time import ctime
from tkinter import *
from urllib import error
from urllib.parse import *

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import requests
import xlrd
from lxml import etree
from matplotlib import pyplot
from requests.packages import urllib3

# Global matplotlib text settings: use the SimHei font family (presumably so
# the Chinese axis labels used below render) and draw negative numbers with
# an ASCII hyphen instead of the Unicode minus sign.
mpl.rcParams.update(
    {
        "font.sans-serif": ["SimHei"],
        "axes.unicode_minus": False,
    }
)

# Timestamps taken once, when the module is loaded.
now_date = time.strftime("%m-%d")
now_time = time.strftime("%Y-%m-%d-%H-%M-%S")

phone_list = [
18303517744,
13613416611,
15219466201,
15036222256,
18438888133,
18876941131,
18876622089,
18889262767,
13715150077,
13717033838,
18351078990,
13467719111,
15997693333,
13730600607,
13699051071,
13849038741,
18352936688,
13880888292,
18822441999,
15777770130,
15777772845,
13727693111,
13632577333,
15976856868,
18222167181,
13512960022,
13530102266,
18300666187,
15824817777,
18349333171,
13838555227,
15890005577,
15890008887,
13838200888,
13924853168,
18822488887,
13972911999,
13428334566,
13566102222,
13732097555,
15233333323,
13682987828,
13923918859,
18859981392,
15818692899,
15012563066,
18222522000,
13828716737,
13692298935,
13706053195,
13887441413,
18322040999,
13911336673,
13801391870,
13433196988,
13702485588,
13924578588,
13924852345,
18823143456,
13637666699,
13755630022,
13920593529,
18702888838,
15198120000,
13908057178,
18844227188,
18750468844,
13505952075,
15768179999,
18356194521,
13696754521,
13788829706,
15208275054,
18777770214,
13551275898,
18280151115,
13677777254,
18769721000,
18897777726,
15814226133,
15918128980,
15918129083,
15918129282,
15918129090,
18300077779,
15022277000,
15875766666,
18428088892,
15703382298,
15131712232,
15732922520,
13874677777,
18255555551,
18393897777,
15180222225,
13505740467,
13780390000,
18859567892,
15277775445,
13662688881,
18213777222,
13761746746,
15000505062,
14761188884,
13809070207,
13818357698,
13873179698,
18817871288,
15112998888,
15703361816,
15290911121,
15107555885,
18396217171,
13825876548,
13619870320,
13778891234,
13548291222,
18282200022,
18402898980,
18328025788,
15228886138,
17878781118,
15123888444,
15837182792,
15838125087,
18703896718,
18736011629,
18839781750,
18837170569,
15777776964,
18761755000,
18751373210,
15962711155,
15962792088,
18761755088,
13656291113,
18862779378,
15190971978,
13777888585,
15068936333,
15204025988,
13654059991,
15775677700,
13684218789,
15281898765,
13616202666,
18751126999,
13812920788,
13809055222,
13962350777,
18353240966,
18853296464,
17839929705,
18838967382,
18749418806,
15093239328,
15188349522,
18236956924,
18348405579,
15093334268,
13505647555,
15220525678,
15020050513,
15020030417,
15267701717,
15088931331,
15906878938,
13646514938,
13706636314,
18867793298,
13739742666,
15731102345,
13859652222,
18232102678,
13601261337,
15231099666,
18337728521,
15203802168,
18331758666,
18736599499,
13930109099,
15738888289,
15738888538,
15738888576,
15738888697,
15738888963,
13797904444,
15243191111,
18405311888,
18405311888,
13791080000,
13791080000,
13908376207,
13908335110,
13908374332,
18702397333,
18702379555,
15922584000,
13783666664,
18335392777,
15217430000,
15992225679,
13585510688,
15818991889,
17806722226,
13536565653,
18738651999,
18388555511,
15825022222,
15882234084,
13776268888,
15018310888,
15113133313,
13701097729,
15726835666,
15058299222,
15118444415,
18820300009,
18825700007,
13829111788,
13825766788,
13480423333,
13711888886,
13532923333,
13825737888,
13537328888,
13686678888,
13538345678,
15016967488,
15917735557,
15217104555,
15917669777,
15017888444,
15931390000,
15267180777,
15068793333,
18335156789,
13835175177,
18202468383,
13926787833,
15815100303,
15892056631,
13599305858,
13616979898,
13511100900,
13786766667,
13686868538,
13632878899,
13883038222,
18838200011,
13911672661,
13521935222,
13802289678,
13728888822,
13801507158,
15093939323,
15160299539,
18831119031,
13974259999,
15807539093,
15023669066,
13785811099,
18716433334,
18834845999,
13507170130,
13507115301,
13995588392,
13657247111,
17839999122,
17839993883,
13807196657,
13807197319,
13807198517,
13807153256,
13807190231,
13908631578,
13908863082,
18822858108,
13510308789,
13510102070,
18419521214,
13877853333,
18351203222,
18261197555,
15815285757,
15261115522,
13903173981,
15132755552,
15019677099,
18862192899,
13678863811,
13983652278,
13856977511,
13589966223,
18337623210,
13979673333,
15007927777,
18837744446,
13950654999,
13861186488,
18870000005,
15158172221,
15824107733,
13790746666,
15802648889,
13808322226,
15823513000,
18883190766,
18883298278,
18375801115,
18375702233,
15023871222,
13779033333,
18872855555,
18270003333,
18886889988,
15777777783,
18881111115,
18882888802,
14799448888,
13688819128,
13688819693,
]
# Android WeChat-WebView User-Agent strings; HistoryPriceSearch picks one at
# random for the discount-history request headers.
user_agent_m = [
    'Mozilla/5.0 (Linux; Android 8.1; PAR-AL00 Build/HUAWEIPAR-AL00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/6.2 TBS/044304 Mobile Safari/537.36 MicroMessenger/6.7.3.1360(0x26070333) NetType/WIFI Language/zh_CN Process/tools',
    'Mozilla/5.0 (Linux; Android 8.1; EML-AL00 Build/HUAWEIEML-AL00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.143 Crosswalk/24.53.595.0 XWEB/358 MMWEBSDK/23 Mobile Safari/537.36 MicroMessenger/6.7.2.1340(0x2607023A) NetType/4G Language/zh_CN',
    'Mozilla/5.0 (Linux; Android 8.0; MHA-AL00 Build/HUAWEIMHA-AL00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/6.2 TBS/044304 Mobile Safari/537.36 MicroMessenger/6.7.3.1360(0x26070333) NetType/4G Language/zh_CN Process/tools',
    'Mozilla/5.0 (Linux; Android 5.1.1; vivo X6S A Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/6.2 TBS/044207 Mobile Safari/537.36 MicroMessenger/6.7.3.1340(0x26070332) NetType/4G Language/zh_CN Process/tools',
]

# Values sent as the "c_operator" form field (one is chosen at random).
ip_origin = [
    "中国联通",
    "中国移动",
    "中国电信",
]

# Values sent as the "c_devmodel" form field (one is chosen at random).
c_devmodel_list = [
    'Mate10',
    'P8青春',
    '荣耀7i',
    '畅玩7A',
    '荣耀8XMax',
    'Mate10Pro',
    '荣耀10',
    'M3青春',
    '荣耀8青春',
]

class CrawlCompareWeb:
    """Keyword search against manmanbuy's app search API, exported to Excel.

    Notes kept from the original author: the price-comparison site's
    anti-scraping is strict and switching IPs might get around it
    (unverified).  Another history-price endpoint exists at
    http://tool.manmanbuy.com/history.aspx?DA=1&action=gethistory&url=http%3a%2f%2fitem.tmall.com%2fitem.htm%3fid%3d532034800285&bjid=&spbh=&cxid=&zkid=&w=350&token=yva7088d209cdcbbbf30e6af9cf24005ce2dx
    and would be usable once its ``token`` parameter can be produced.
    """

    # Seconds to wait for the search endpoint before abandoning a request,
    # so a stalled connection cannot hang the caller forever.
    REQUEST_TIMEOUT = 10
    # Excel limits worksheet names to 31 characters and forbids []:*?/\ .
    MAX_SHEET_NAME_LENGTH = 31
    _INVALID_SHEET_CHARS = str.maketrans({char: "_" for char in "[]:*?/\\"})

    def __init__(self, search_words, writer):
        """Store the query and prepare the request template.

        Args:
            search_words: keyword text to search for.
            writer: Excel writer object handed to ``DataFrame.to_excel``.
        """
        self.start_url = "https://apapia-search.manmanbuy.com/index_json.ashx"
        self.decode_type = "utf-8"
        self.total_page = None
        self.writer = writer
        self.words = search_words
        # URL-encoded copy of the keyword, substituted into the form body.
        self.search_words = quote(
            search_words, encoding=self.decode_type, errors="replace"
        )
        # NOTE(review): Content-Length is hard-coded here; presumably the HTTP
        # library recomputes it from the real body -- confirm.
        self.headers = {
            "Host": "apapia-search.manmanbuy.com",
            "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
            "Proxy-Connection": "close",
            "Cookie": "ASP.NET_SessionId=5nm1vf35xt2eisuhe2k0rm33; jjkcpnew111=cp98576765_1063811521_2018/9/26",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_4 like Mac OS X) AppleWebKit/605.1.15 "
            "(KHTML, like Gecko) Mobile/15F79 mmbWebBrowse",
            "Content-Length": "523",
            "Accept-Encoding": "gzip",
            "Connection": "close",
        }
        # Form body template with four positional slots, in this order:
        # key, page, sign, t.
        self.data = (
            "jsoncallback=%3F&c_devmodel=iPhone%207&f1=&c_win=w_375_h_667&c_devid=C5707B0E-7A25-4BDF-BDF4-C64F8"
            "1711CAB&c_devtype=phone&f2=&key={}&iszy=&f3=&c_dp=2&f4=&c_devtoken=&c_channel=AppStore&f5=&"
            "smallclass=&f6=&methodName=getsearchkeylist&username=&c_operator=%E4%B8%AD%E5%9B%BD%E8%81%94%E"
            "9%80%9A&price2=&c_ostype=ios&c_engver=1.2.81&c_ctrl=w_search_form_f_search_product_content&page={}"
            "&sign={}&ppid=&price1=&c_contype=wifi&t={}&orderby=&c_osver=11.4&siteid=&c_appver=3.0.2"
        )
        # Parallel result columns; index i of every list describes product i.
        self.title_list = []
        self.mall_list = []
        self.iszy_list = []
        self.price_list = []
        self.sales_list = []
        self.prourl_list = []
        self.skuid_list = []
        self.itemid_list = []
        self.crawl_time_list = []
        self.comment_list = []

    def turn_page_get_info(self):
        """Request result pages 1 to 9 and accumulate every product found.

        Stops early at the first page whose response body is empty.

        Raises:
            ValueError: a response fragment is not valid JSON.
            KeyError: a product object lacks one of the stored fields.
        """
        for page in range(1, 10):
            t_1 = int(round(time.time() * 1000))
            t_2 = t_1 + random.randint(1, 5)
            # t_1 fills the "sign" slot and t_2 the "t" slot of the template.
            payload = self.data.format(self.search_words, page, t_1, t_2)
            response = requests.post(
                self.start_url,
                data=payload,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            result_data = response.content.decode("utf-8")
            if not result_data:
                break
            self._collect_items(result_data)
            # Random pause between pages.
            time.sleep(random.uniform(2, 3))

    def _collect_items(self, result_data):
        """Cut one response body into product objects and store their fields."""
        # The body is not parsed as a single document: single quotes and list
        # brackets are dropped and the text is cut at every "}", so each piece
        # is expected to be one flat JSON object.
        flattened = result_data.replace("'", "").replace("[", "").replace("]", "")
        for fragment in flattened.split("}"):
            fragment = fragment.strip(",").strip("\n") + "}"
            if "img" not in fragment:
                continue
            item = json.loads(fragment)
            # Read every field first so a missing key cannot leave the
            # parallel lists with different lengths.
            row = (
                item["title"],
                item["mall"],
                item["iszy"],
                item["price"],
                item["sales"],
                item["prourl"],
                item["skuid"],
                item["itemid"],
                item["comment"],
            )
            title, mall, iszy, price, sales, prourl, skuid, itemid, comment = row
            self.title_list.append(title)
            self.mall_list.append(mall)
            self.iszy_list.append(iszy)
            self.price_list.append(price)
            self.sales_list.append(sales)
            self.prourl_list.append(prourl)
            self.skuid_list.append(skuid)
            self.itemid_list.append(itemid)
            self.comment_list.append(comment)
            self.crawl_time_list.append(time.strftime("%Y%m%d%H%M%S"))

    def _sheet_name(self):
        """Return ``<keyword>_全网价格数据_<MMDD>`` made safe for Excel."""
        suffix = "_" + "全网价格数据" + "_" + time.strftime("%m%d")
        keyword = self.words.translate(self._INVALID_SHEET_CHARS)
        # Shorten the keyword, never the suffix, so the date stays visible.
        return keyword[: self.MAX_SHEET_NAME_LENGTH - len(suffix)] + suffix

    def download_file(self):
        """Write the accumulated products to one worksheet of ``self.writer``."""
        dataframe = pd.DataFrame(
            {
                "商品标题": self.title_list,
                "平台": self.mall_list,
                "店铺": self.iszy_list,
                "价格": self.price_list,
                "销量": self.sales_list,
                "评论量": self.comment_list,
                "地址": self.prourl_list,
                "sku": self.skuid_list,
            },
            columns=["商品标题", "平台", "店铺", "价格", "销量", "评论量", "地址", "sku"],
        )
        # ``encoding`` is no longer accepted by to_excel in current pandas.
        dataframe.to_excel(self.writer, index=False, sheet_name=self._sheet_name())
        print("数据写入完成，进程结束")
# Looks up one product URL against two services: manmanbuy's discount-history
# endpoint (methodName "getZhekou") and the henzanapp price-history API.
# Results are accumulated on the instance and exported through the shared
# Excel writer by the download_* methods.
class HistoryPriceSearch:

def __init__(self, search_url, writer):
      """Prepare endpoints, headers and form payloads for one product lookup.

      Args:
          search_url: product page URL; sent as ``p_url`` to the discount
              endpoint and as ``url`` / ``ref`` to the price endpoint.
          writer: Excel writer object, stored as ``self.writer`` for the
              export methods.
      """

      # Discount-history endpoint and price-history endpoint.
      self.search_preferential_url = "https://apapia-history.manmanbuy.com/ChromeWidgetServices/WidgetServices.ashx"
      self.search_price_url = "https://ext.henzanapp.com/api.html"
      # Current time in milliseconds; sent as the "t" form field below.
      self.t = int(time.time() * 1000)
      # Headers for the discount-history request.
      # NOTE(review): Content-Type here reads "charset:utf-8" and the Cookie
      # reads "name:value", while the other header sets in this file use "=";
      # confirm whether the colons are intentional.
      self.preferential_headers = {
         "Host": "apapia-history.manmanbuy.com",
         "Content-Type": "application/x-www-form-urlencoded; charset:utf-8",
         "Proxy-Connection": "close",
         "Cookie": "jjkcpnew111:cp44979114_1063811528_2018/10/18",
         "User-Agent": random.choice(user_agent_m),
         "Content-Length": "548",
         "Accept-Encoding": "gzip",
         "Connection": "close",
      }
      # Headers for the price-history request.
      self.price_headers = {
         "Host": "ext.henzanapp.com",
         "Proxy-Connection": "close",
         "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36",
         "Content-Length": "4550",
         "Accept-Encoding": "gzip",
         "Connection": "close",
         "Cookie": "mmzdd=482ef902b98b228c76a0f748e7deaa79",
      }
      # Form fields for the discount-history request.  "username",
      # "c_devmodel" and "c_operator" are drawn at random from the
      # module-level lists; "ipage" is filled in per request by
      # parser_history_preferential_info; "sign" is a fixed literal.
      # NOTE(review): "c_ostype" says Android while "c_osver" is "11.4" --
      # confirm the service does not cross-check them.
      self.preferential_data = {
         "c_devid": "C5707B0E-7A25-4BDF-BDF4-C64F81711CAB",
         "username": random.choice(phone_list),
         "ipage": "",
         "c_dp": "2",
         "c_engver": "1.2.83",
         "c_devtoken": "",
         "c_devmodel": random.choice(c_devmodel_list),
         "c_contype": "wifi",
         "c_win": "w_375_h_667",
         "t": self.t,
         "c_firstchannel": "AppStore_update",
         "p_url": search_url,
         "sign": "07E0CB3EF0B16E74",
         "c_ostype": "Android",
         "jsoncallback": "%3F",
         "c_ctrl": "w_search_trend0_f_content",
         "methodName": "getZhekou",
         "c_channel": "Google Play",
         "c_devtype": "Android",
         "c_operator": random.choice(ip_origin),
         "c_appver": "3.0.5",
         "c_firstquerendate": "1540799598929",
         "ipagesize": "6",
         "c_osver": "11.4",
      }
      self.price_data = {
         "tPrice": "",
         "toolbar_state": "open",
         "path1": "qihoo-mall-goodsinfo",
         "mid": "",
         "tSale": "",
         "fromTp": "0",
         "checkinfo": "c9f8d7a8a8d7e899d7c9a9d709d9d71999d71909d7f8d9d7c999d7c8a9d709d9d7d899d7d809d7d8d9d78899d79909d7d8d9d7c909d71909d7d8d9d78819d7e909d7e8d9d7f8a9d7e999d709d9d7b909d7b9a9d7e9d9d7e819d7c909d7d8d9d78809d7b9a9d7d8d9d7d899d7f819d7e8d9d7e8980909d7b919d7e8d9d7f89809d7b819d7d8d9d7e809d7d819d7d8d9d7c899d7c999d7e8d9d7a8a8d799b8d7a8a8d7db5c1ccc7bdbfbcb9baba8a8d7b9a8d7a8a8d7888868e89898a8a8d799b8d7a8a8d7dbbb1cac8c7bdc2ca8a8d7b9a8d7a8a8d7f8a9d7a9a9d7c8d9d79819d79919d7e8d9d7a809d7a909d7f8d9d7a8a8d799b8d7a8a8d7db5c9b6c7bdbbb1cac8c7bdc2ca8a8d7b9a8d7a8a8d7f819d7c909d7d8d9d7d909d7d8a9d7e8d9d7b919d709a9d7c8d9d798a9d798a9d7d8d9d7a8a8d799b8d7a8a8d7accbcb9b7b1dacdbec1c4cdbcba8a8d7b9a8d7a8a8d7fb8c2c6888199c8888090d8888097bfb8c2c68f8d819f89809b8e8a897977b9b0b0bbb0dbb0b0bfbe9881d2c6adca91b19ababec985c8aa8a9cae9a8d7f8d819f89809b8e8a8e9a8d7c81ce9a8d79baccc0ddbfb5c1ce9a8d75c7cbb686ccbbb1c4c9b68fb5c1ce9a8d7e9a8d799b8d78ccccc0c88a8d7fb8c2c6888199c8888090d8888097bfb8c2c68f8d819f89809b8e8a897977b9b0bea2d0c9b0b0bdb2d981d2c6adba91b2a6cab68ca3a1ba8a9cae9a8d7f8d819f89809b8e8a8e9a8d7981ce9a8d79baccc0ddbfb5c1ce9a8d75c7cbb686ccbbb1c4c9b68fb5c1ce9a8d7e9a8d799b8d78ccccc0c88a8d7fb8c2c6888199c8888090d8888097bfb8c2c68f8d819f89809b8e8a8589b0bead82d9b0b0bdb8cbcba2c6adca9fa19f85c0dac0aeba8a9cae9a8d7f8d819f89809b8e8a8e9a8d7b81ce9a8d79baccc0ddbfb5c1ce9a8d75c7cbb686ccbbb1c4c9b68fb5c1ce9a8d7e9a8d799b8d78ccccc0c88a8d7fb8c2c6888199c8888090d8888097bfb8c2c68f8d819f89809b8e8a8589b0b8cb899bb0b0b1daa5cba3c6adca91bdc1adb8cfadc4ca8a9cae9a8d7f8d819f89809b8e8a8e9a8d7b81ce9a8d79baccc0ddbfb5c1ce9a8d75c7cbb686ccbbb1c4c9b68fb5c1ce9a8d7e9a8d799b8d78ccccc0c88a8d7fb8c2c6888199c8888090d8888097bfb8c2c68bb1c8c7b5cdbcc1c588897977b9b0bea7cfb9b0b0b193a2bba2c6a3aa9ac1a1a3cabc968c898a9cae9a8d7f8d819f89809b8e8a8e9a8d7981ce9a8d79baccc0ddbfb5c1ce9a8d75c7cbb686ccbbb1c4c9b68fb5c1ce9a8d7e9a8d799b8d78ccccc0c88a8d7a8a8d799b8d7a8a8d7fb5c1c1ccc4cdc5ca8a8d7b9a8d7a8a8d78809d7c819d719d9d7c8a9d7a919d7f8d9d7b909d71909d7f8d9d79809d7b819d
7d8d9d7a8a8d799b8d7a8a8d7fb9bcc8c7c0cbca8a8d7b9a8d7a8a8d719e898a8a8d799b8d7a8a8d75cdc6ccc5cbba8a8d7b9a8d7a8a8d78899d7b809d7e8d9d70919d7b9a9d7c8d9d7b9a9d78809d7d8d9d7d809d7e8a9d709d9d7a8a8d799b8d7a8a8d7dbbb1cac8c4c9bdbaca8a8d7b9a8d7a8a8d7889898a8a8d799b8d7a8a8d75cdc6cdb4c9bbca8a8d7b9a8d7a8a8d7f819d799a9d7d8d9d788a9d70909d709d9d7f819d7f819d7e8d9d7c909d70909d709d9d7b899d79899d709d9d7c9a9d7b809d719d9d71999d7e919d719d9d7a8a8d799b8d7a8a8d7db5c9b6ccc6c9b0cbbacdb5ca8a8d7b9a8d7a8a8d768f809c8f8dac988a8d7c909d70909d709d9d7b899d79899d709d9d7c9a9d7b809d719d9d71999d7e919d719d9d7e9a8d7d94a1bcaba0aa8a8d799b8d7a8a8d7db5c9b6c7bccbbdccb7cac8ca8a8d7b9a8d7a8a8d768f809c8f8dac9a8a8d799b8d7a8a8d7acdbab5cdc6c7b5cdbcc1ca8a8d7b9a8d7a8a8d7c909d70909d709d9d7b899d79899d709d9d7c9a9d7b809d719d9d71999d7e919d719d9d7e9a8d7d94a1bcaba0aa8a8d799b8d7a8a8d7cb6c9bacaba8a8d7b9a8d7a8a8d7f8d819f89809b8e8a8a8a8d799b8d7a8a8d7cb1cacdb4c4cdbbc7c9bab7c9bcca8a8d7b9a8d7a8a8d7c88888e898a8b8a898a8a8d799b8d7a8a8d7cb1ccc9bbba8a8d7b9a8d7a8a8d7e898b9f8d7b9f8d7b9f8d7e898a8a8d799b8d7a8a8d7cb1cbba8a8d7b9a8d7a8a8d7a8a8d799b8d7a8a8d7ac9bec7bdc3cbca8a8d7b9a8d78899b8d7a8a8d7db4c9bba7c6ca8a8d7b9a8d7a8a8d7886898a8a8d799b8d7a8a8d76c7c1cbcacdbeca8a8d7b9a8d7a8a8d7a8a8d799b8d7a8a8d7ccbb1cacccbc1ccba8a8d7b9a8d79899b8d7a8a8d7db4c9bbabc1ca8a8d7b9a8d7a8a8d7fb8c2c6888199c88b8c80d88b8c87bfb8c2c68bb1c8c7b5cdbcc1c588897977b9b0bea7cfb9b0b0b193a2bba2c6a3aa9ac1a1a3cabc968c898a9cae9a8d7f8d819f89809b8e8a8e9a8d7981ce9a8d79baccc0ddbfb5c1ce9a8d75c7cbb686ccbbb1c4c9b68fb5c1ce9a8d7e9a8d799b8d7bc8ccccc0ca8a8d799b8d7a8a8d7bb1c8ca8a8d7b9a8d7a8a8d798a9d70999d7e8d9d7f809c8f8dac9d809d7b899d709d9d7f819d7d899d7d8d9d79919d788a9d7d8d9d7e8a9d7f819d7e8d9d7f819d7d899d7d8d9d7e8a9d7a9a9d7c8d9d7c899d709a9d7c8d9d7e819d7a899d709d9d7c999d7e919d7f8d9d719a9d799a9d7f8d9d79899d7c919d7e8d9d7d9a9d7b999d7e8d9d788a9d7e819d7e8d9d7d809d7b899d709d9d7e909d7c899d7d8d9d7d809d7b899d709d9d7b8a9d7d899d7d8d9d70909d71909d7f8d9d71999d7e919d719d9d7099888a8c909d70909d709d9d7b899d79899d709d9d7c
9a9d7b809d719d9d71999d7e919d719d9d7a8a8d799b8d7a8a8d7db5c9b6ca8a8d7b9a8d7c8b89899b8d7a8a8d7dbbb1cac8ca8a8d7b9a8d79809c8c8c899b8d7a8a8d7cb1a8c7c0cbca8a8d7b9a8d7a8a8d7a999d7b909d7f8d9d71999d7c899d7d8d9d7a8a8d799b8d7a8a8d7db5c9b6a8c7c0cbca8a8d7a9f8",
         "prevpop": "",
         "bfrom": "normal",
         "url": search_url,
         "path2": "goodspricecmp",
         "tplmd5": "7330361958732444829",
         "hisOpn": "0",
         "isGulike": "0",
         "cv": "4.2.1.0",
         "ref": search_url,
         "v": "v5",
         "pop": "1",
      }
      self.writer = writer
      self.spname_list = []
      self.spprice_list = []
      self.dt_list = []
      self.infoid_list = []
      self.infotype_list = []
      self.sppic_list = []
      self.history_price_dict = {}
      self.search_price_start_date = None
      self.search_price_end_date = None

def parser_history_preferential_info(self, pages=None):
    """Download the product's discount history into the ``*_list`` attributes.

    Args:
        pages: ``None`` to keep requesting pages until the service stops
            returning entries; an ``int`` to request exactly pages
            ``1..pages``.  Any other value only prints a failure message.

    Each entry appends one value to ``spname_list``, ``spprice_list``,
    ``dt_list``, ``infoid_list``, ``infotype_list`` and ``sppic_list``.
    Dates are formatted ``%Y-%m-%d %H:%M:%S`` in fixed-page mode and
    ``%m-%d`` in crawl-everything mode.

    Raises:
        ValueError: a 200 response body is not valid JSON.
    """
    timestamp_digits = re.compile(r"\d+")

    def fetch(page):
        """POST the discount query for *page* and return the response."""
        self.preferential_data["ipage"] = page
        # verify=False (TLS certificate checking off) is kept from the
        # original.  NOTE(review): confirm it is still required.
        return requests.post(
            url=self.search_preferential_url,
            headers=self.preferential_headers,
            data=self.preferential_data,
            verify=False,
            timeout=5,
        )

    def decode(response):
        """Parse the response body as UTF-8 JSON."""
        return json.loads(response.content.decode("utf-8"))

    def collect(entries, date_format):
        """Append every discount entry to the result lists."""
        for entry in entries or []:
            spname = entry.get("spname")
            # Strip the <p> wrapper; a missing price becomes "".
            spprice = (entry.get("spprice") or "").replace("<p>", "").replace("</p>", "")
            dt = None
            # The "dt" text embeds a millisecond timestamp; take its first
            # run of digits.
            stamp = timestamp_digits.search(entry.get("dt") or "")
            if stamp:
                dt = time.strftime(
                    date_format, time.localtime(int(stamp.group()) / 1000)
                )
            infoid = entry.get("infoid")
            infotype = entry.get("infotype")
            sppic = entry.get("sppic")
            self.spname_list.append(spname)
            self.spprice_list.append(spprice)
            self.dt_list.append(dt)
            self.infoid_list.append(infoid)
            self.infotype_list.append(infotype)
            self.sppic_list.append(sppic)
            print(spname, spprice, dt, infoid, infotype, sppic)

    if pages is None:
        response = fetch(1)
        print(self.search_preferential_url)
        print(self.preferential_headers)
        print(self.preferential_data)
        # Only a 200 body is parsed; an error page is unlikely to be JSON.
        json_data = decode(response) if response.status_code == 200 else {}
        print(json_data)
        if json_data.get("ok") != 1:
            print("该商品无历史优惠信息或User-Agent错误或触发反爬，请重试")
            return
        page = 1
        while json_data.get("ok") == 1:
            collect(json_data.get("zklist"), "%m-%d")
            page += 1
            response = fetch(page)
            print(response.status_code)
            print(self.preferential_data["ipage"])
            if response.status_code != 200:
                break
            json_data = decode(response)
            if not json_data.get("zklist"):
                break
            print(json_data)
            # Random pause between pages.
            time.sleep(random.uniform(0.5, 1.0))
    elif isinstance(pages, int):
        for page in range(1, pages + 1):
            response = fetch(page)
            print(response.content.decode("utf-8"))
            json_data = decode(response) if response.status_code == 200 else {}
            if json_data.get("ok") == 1:
                collect(json_data.get("zklist"), "%Y-%m-%d %H:%M:%S")
            else:
                error_status_code = response.status_code
                print("凉了，被ban了~ 状态码:%s,自己看的办吧" % error_status_code)
    else:
        print("数据抓取失败，洗洗睡吧")

def parser_history_price_info(self):
    """Fetch the price history and store it on the instance.

    On success ``search_price_start_date`` / ``search_price_end_date`` are
    set from the response's ``pcinfo.bd`` / ``pcinfo.ed`` and every
    ``pcinfo.info`` entry is stored as ``history_price_dict[dt] = pr``.
    If the status is not 200 or ``pcinfo`` is missing or empty, a message
    is printed and nothing is stored.

    Raises:
        ValueError: a 200 response body is not valid JSON.
        KeyError: ``pcinfo`` lacks one of the expected fields.
    """
    # verify=False (TLS certificate checking off) is kept from the original.
    # NOTE(review): confirm it is still required.
    response = requests.post(
        url=self.search_price_url,
        data=self.price_data,
        headers=self.price_headers,
        verify=False,
        timeout=5,  # same limit as the discount-history paging request
    )
    body = response.content.decode("utf-8")
    print(body)

    # Decode once and reuse; the body is only parsed for a 200 response.
    json_data = json.loads(body) if response.status_code == 200 else None
    pcinfo = json_data.get("pcinfo") if json_data else None
    if not pcinfo:
        print("数据为空，或者被ban~~")
        return

    print(json_data)
    self.search_price_start_date = pcinfo["bd"]
    self.search_price_end_date = pcinfo["ed"]
    for point in pcinfo["info"]:
        self.history_price_dict[point["dt"]] = point["pr"]

def download_preferential_info_data(self):
    """Export the collected discount history to Excel, then chart it.

    Writes one worksheet named ``商品历史查询数据_<timestamp>`` through
    ``self.writer``.  When at least one dated entry exists, the entries are
    also plotted, saved as ``./<timestamp>.jpg`` and shown in a window
    (``pyplot.show()`` is called last, as before).
    """
    dataframe = pd.DataFrame(
        {
            "名称": self.spname_list,
            "优惠信息": self.spprice_list,
            "日期": self.dt_list,
            "infoid": self.infoid_list,
            "infotype": self.infotype_list,
            "商品主图": self.sppic_list,
        },
        columns=["名称", "优惠信息", "日期", "infoid", "infotype", "商品主图"],
    )
    to_c_sheet = "商品历史查询数据" + "_" + time.strftime("%Y%m%d%H%M%S")
    # ``encoding`` is no longer accepted by to_excel in current pandas.
    dataframe.to_excel(self.writer, index=False, sheet_name=to_c_sheet)
    print("数据写入完成，进程结束")

    # Entries without a date cannot be placed on the x axis; skip them.
    dated = [
        (day, price)
        for day, price in zip(self.dt_list, self.spprice_list)
        if day is not None
    ]
    if not dated:
        # Nothing to draw; the title below would also fail on an empty
        # spname_list.
        return
    days = [day for day, _ in dated]
    prices = [price for _, price in dated]

    # NOTE(review): the prices are the service's text with <p> tags removed,
    # not numbers -- confirm the resulting chart is meaningful.
    pyplot.plot(days, prices)
    pyplot.xlabel('日期')
    pyplot.ylabel('价格')
    pyplot.title(self.spname_list[0])
    # Fill arguments: x values, y values, fill baseline, fill colour.
    pyplot.fill_between(days, prices, 10, color='white')
    # Show the x axis in reverse order.
    pyplot.gca().invert_xaxis()
    now_time1 = time.strftime("%Y-%m-%d-%H-%M-%S")
    pyplot.savefig("./" + now_time1 + ".jpg")
    print("已经保存")
    pyplot.show()

def download_price_info_data(self):
    """Chart ``history_price_dict`` and embed the image in a new worksheet.

    The chart plots price against date and labels the lowest point with its
    date (left of the point) and its price (right of the point).  The image
    is rendered in memory and inserted at cell A1 of a worksheet named
    ``<MMDD>历史价格查询``.  When no price history was collected, only a
    message is printed.
    """
    print(self.history_price_dict)
    if not self.history_price_dict:
        # Checked before any figure is created so nothing is left open.
        print('数据为空，该商品未被收录')
        return

    mpl.rcParams["font.sans-serif"] = ["SimHei"]
    mpl.rcParams["axes.unicode_minus"] = False
    # Set before creating the figure so this figure actually gets the size.
    plt.rcParams["figure.figsize"] = (8.0, 4.0)

    fig = plt.figure()
    try:
        dates = list(self.history_price_dict)
        prices = list(self.history_price_dict.values())
        min_date = min(self.history_price_dict, key=self.history_price_dict.get)
        min_price = self.history_price_dict[min_date]

        plt.plot(dates, prices)
        plt.text(
            min_date, min_price, min_date, ha="right", va="bottom", fontsize=10
        )
        plt.text(
            min_date, min_price, min_price, ha="left", va="bottom", fontsize=10
        )
        plt.title("历史价格分布")
        plt.xlabel("日期")
        plt.ylabel("金额")

        # Render to memory: a shared "foo.png" on disk would be overwritten
        # by the next product before the workbook is written out.
        image = io.BytesIO()
        fig.savefig(image, format="png")
    finally:
        # Release the figure; otherwise one stays open per query.
        plt.close(fig)

    # NOTE(review): the title only varies by date, so a second product in the
    # same workbook reuses the name -- confirm how the writer handles that.
    fig_title = time.strftime("%m%d") + "历史价格查询"
    # add_worksheet / insert_image are XlsxWriter workbook/worksheet calls.
    sheet = self.writer.book.add_worksheet(fig_title)
    sheet.insert_image(0, 0, "foo.png", {"image_data": image})

class Application:
    """Tk front end: enter product URLs, click the button, get an Excel file."""

    def __init__(self):
        """Build the window: input box, action button, log area and footer."""
        self.window = Tk()
        self.text = Text(self.window)
        # Window title, size and position.
        self.window.title("阳光价格")
        self.window.geometry("290x430+500+280")
        self.window.minsize(290, 380)
        # Input box for the product URLs.
        self.entry = Text(self.window)
        self.entry.place(x=10, y=10, width=200, height=165)

        # "History trace" button.
        self.submit_btn5 = Button(self.window, text=u"历史溯源", command=self.submit_5)
        self.submit_btn5.place(x=220, y=150, width=60, height=25)
        # Caption above the log area.
        log_caption = Label(self.window, text=u"运行日志:")
        log_caption.place(x=10, y=180)
        # Log area.
        self.result_text = Text(self.window, background="#ccc")
        self.result_text.place(x=10, y=205, width=270, height=205)
        # Footer label.
        self.title_label = Label(self.window, text=u"2019_a7_price ")
        self.title_label.place(x=60, y=410)
        self.file_path = None
        self.writer = None

    @staticmethod
    def _split_keys(raw_text):
        """Split the input box text into a list of non-empty keys.

        Newlines become spaces, full-width commas become ASCII commas, the
        text is split on commas, and each piece is stripped of surrounding
        whitespace.
        """
        normalized = raw_text.strip().replace("\n", " ").replace("，", ",")
        return [piece.strip() for piece in normalized.split(",") if piece.strip()]

    @staticmethod
    def _timestamp():
        """Return the current time in the log's timestamp format."""
        return time.strftime("%Y-%m-%d-%H-%M-%S")

    def _replace_log(self, message):
        """Clear the log area and show *message*."""
        self.result_text.delete("1.0", END)
        self.result_text.insert(END, message)

    def submit_5(self):
        """Button handler: look up every entered URL and write the workbook.

        Creates ``./<MM-DD>-历史溯源.xlsx``, runs the discount-history and
        price-history lookups for each key, and always closes the workbook
        at the end, whether or not a lookup failed.
        """
        self.result_text.delete("1.0", END)
        self.file_path = "./" + time.strftime("%m-%d") + "-" + "历史溯源" + ".xlsx"
        # The price chart export calls XlsxWriter-specific workbook methods,
        # so the engine is pinned rather than left to pandas' default.
        self.writer = pd.ExcelWriter(self.file_path, engine="xlsxwriter")
        try:
            for key in self._split_keys(self.entry.get("1.0", END)):
                self.result_text.delete("1.0", END)
                search_history = HistoryPriceSearch(key, self.writer)
                # Fetch both histories, then export them.
                search_history.parser_history_preferential_info()
                search_history.parser_history_price_info()
                search_history.download_preferential_info_data()
                search_history.download_price_info_data()
                log_1 = (
                    "历史价格搜索中"
                    + "\n"
                    + "开始下载数据中…………"
                    + "\n"
                    + "下载数据请在跟程序处于相同位置查找，文件名为【当前时间+历史趋势】"
                )
                self.result_text.insert(END, log_1)
        except ValueError as e:
            self._replace_log(
                "log: " + self._timestamp() + "  " + "查询异常 " + str(e)
                + "\n" + "请检查键入格式:  国产红富士"
            )
        except KeyError as e:
            self._replace_log(
                "log: " + self._timestamp() + "  " + "查询异常 " + str(e)
                + "\n" + "请检查键入格式:  烟台红富士"
            )
        except error.HTTPError as e:
            self._replace_log(
                "log: " + self._timestamp() + "  " + "URL异常 " + str(e) + "更换关键词重试"
            )
        except (error.URLError, requests.exceptions.RequestException) as e:
            # The lookups go through ``requests``, whose failures are not
            # urllib errors, so both families are reported here.
            self._replace_log(
                "log: " + self._timestamp() + "  " + "请求异常 " + str(e)
                + "\n" + "查询太频繁啦~请稍后重新或换IP重试"
            )
        finally:
            # ExcelWriter.save() no longer exists in current pandas; close()
            # writes the file.
            self.writer.close()
            log = (
                "\n" + "log: " + self._timestamp() + "  "
                + "数据下载结束，请在程序所处位置查收Excel"
            )
            self.result_text.insert(END, log)

    def run(self):
        """Enter the Tk main loop (blocks until the window is closed)."""
        self.window.mainloop()

# Script entry point: build the Tk application and block in its event loop.
# NOTE(review): there is no ``if __name__ == "__main__":`` guard, so merely
# importing this module also opens the window.
app = Application()
app.run()

后续可能会陆续更新 Python 服务器与 Android 客户端交互的内容，别问为什么，因为作业需要。

mixer1q84 · 发表于 2019-11-15 11:09

看不懂，但支持下。从最早的惠惠购到后来购物党、比价网，经过几次双十一618的对比目前最好的比价应该就是慢慢买了。

Darren_Smith · 发表于 2019-11-16 09:12

川哥发表于 2019-11-15 11:09
小白一枚，不知道这串代码怎么使用？需要下载软件吗还是直接在网页F12中用？

需要先安装 Python 解释器，用自带的轻量级 IDLE 就可以运行；但不推荐没接触过编程的人使用，因为还需要安装各种第三方包，可能要调试一阵子。

还不算晚 · 发表于 2019-11-15 10:34

感谢分享 !

小小三毛 · 发表于 2019-11-15 11:01

小白一名

angelhunte · 发表于 2019-11-15 11:03

很好，学习了

打卡 · 发表于 2019-11-15 11:04

厉害了。不过历史价格，人家有优惠券什么的

yaojiahong · 发表于 2019-11-15 11:05

完全看不懂，谢谢

BrandStark · 发表于 2019-11-15 11:06

略长的代码，感谢分享！

川哥 · 发表于 2019-11-15 11:09

小白一枚，不知道这串代码怎么使用？需要下载软件吗还是直接在网页F12中用？

林楠杰 · 发表于 2019-11-15 11:12

提示: 作者被禁止或删除内容自动屏蔽

帐号		自动登录	找回密码
密码			注册[Register]

林楠杰林楠杰当前离线好友阅读权限 0 听众最后登录 1970-1-1 头像被屏蔽	林楠杰发表于 2019-11-15 11:12 提示: 作者被禁止或删除内容自动屏蔽
林楠杰林楠杰当前离线好友阅读权限 0 听众最后登录 1970-1-1 头像被屏蔽
	回复支持举报

[Python 转载] 从慢慢买网站中爬取京东和天猫的商品历史价格

免费评分

个人中心