[Python] 纯文本查看 复制代码 import requests
import xlwt
from lxml import etree
# 网页请求
def gethtml(url):
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
response=requests.get(url,headers=headers)
return response.content.decode()
#网页解析
def getxpath(a):
html=etree.HTML(a)
d=html.xpath("//*[@id='content']/div/div[1]/ol//li")
# print(d[0])
data_list = []
for i in d:
temp={}
temp['rank']=i.xpath(".//em/text()")[0]
temp['title']=i.xpath("./div/div[2]/div[1]/a/span[1]/text()")[0]
temp['score']=i.xpath(".//span[@class='rating_num']/text()")[0]
data_list.append(temp)
return data_list
# 翻页
def next_page():
url_np="https://movie.douban.com/top250?start={}&filter="
url_list=[url_np.format(i) for i in range(0,250,25)]
return url_list
#执行函数
def run_db():
page=next_page() #显示所有页,并形成列表'
for url in page:
ghtml=gethtml(url) #获取当前页源码
gpath=getxpath(ghtml) #获取当前页数据
return gpath
print(run_db())
想输出完整的250条内容,但不知为什么只输出最后一页25条内容?不知道代码哪里有问题?另外用pandas把爬取结果输出excel格式的内容代码怎样写?刚开始学爬虫,希望有大神能支持一下
|