Honestly, this person posts a huge amount on Weibo, and Weibo requires you to be logged in to view a user's posts, on top of IP restrictions and rate limits on fetching, so unless you already have idle crawler resources it's hard to pull off. I've wrapped it up as a Python function that grabs ['user', 'avatar', 'location', 'device', 'time', 'content', 'pictures', 'page'] for each post. You can ask someone to run it for you, or run it yourself with your own Weibo account: fill in the parameters and it will scrape everything into an Excel file and save it.
[Python]
from random import randint
from time import sleep
import httpx
from fake_useragent import UserAgent
from openpyxl import Workbook
ua = UserAgent()
def get_wb_contents(startPage: int, endPage: int, cookies: str, savePath: str = None,
                    proxyList: list = None, userID: str = '1173935352'):
    """
    Fetch the posts of a Weibo user.
    :param startPage: int: first page to fetch, starting at 1
    :param endPage: int: last page to fetch
    :param cookies: str: the cookies of your own logged-in Weibo account
    :param savePath: str: path of the Excel file to write; leave unset to skip saving
    :param proxyList: list: optional proxy list, each entry formatted as 'http(s)://url:port'
    :param userID: str: Weibo user ID
    :return: list of per-post dicts
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en-GB-oxendict;q=0.8,en;q=0.7,de;q=0.6',
        'cookie': cookies,
        'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': ua.random  # rotate the UA string on every call
    }
    if savePath is not None:
        wb = Workbook()
        sh = wb.active  # Workbook() already contains one sheet; create_sheet() would leave an empty extra one
        sh.append(['user', 'avatar', 'location', 'device', 'time', 'content', 'pictures', 'page'])
    def get_wb_content(content_row):
        # Extract the fields we keep from a single status entry
        date_time = content_row['created_at']
        location = content_row.get('region_name', '')  # not every post carries a region
        source = content_row.get('source', '')         # or a posting-device string
        user = content_row['user']
        avatar_hd = user['avatar_hd']
        screen_name = user['screen_name']
        content_text = content_row['text_raw']
        pic_num = content_row['pic_num']
        pics = []
        if pic_num != 0:
            pic_infos = content_row['pic_infos']
            for pic in pic_infos:
                pics.append(pic_infos[pic]['original']['url'])
        picture = ';'.join(pics)
        return {'user': screen_name, 'avatar': avatar_hd, 'location': location, 'device': source,
                'time': date_time, 'content': content_text, 'pictures': picture}
    result = []
    # first request: since_id starts empty, Weibo fills in the cursor with each response
    page_url = f'https://weibo.com/ajax/statuses/mymblog?uid={userID}&page={startPage}&feature=0&since_id='
    if proxyList is not None:
        # randint is inclusive at both ends, so the upper bound is len - 1
        proxy = {'all://': proxyList[randint(0, len(proxyList) - 1)]}
        res = httpx.get(page_url, headers=headers, proxies=proxy).json().get('data')
    else:
        res = httpx.get(page_url, headers=headers).json().get('data')
    content = res['list']
    for c in content:
        try:
            gwc = get_wb_content(c)
            result.append(gwc)
            if savePath is not None:
                sh.append([gwc['user'], gwc['avatar'], gwc['location'], gwc['device'], gwc['time'],
                           gwc['content'], gwc['pictures'], startPage])
        except KeyError:
            # some rows (ads, pinned posts) lack the expected keys; record the cursor instead
            result.append({'since_id': res['since_id'], 'page': startPage})
            if savePath is not None:
                sh.append(['since_id', res['since_id'], 'page', startPage])
    p = startPage
    # since_id is the pagination cursor: each response carries the cursor for the next
    # page, and an empty string means there is nothing further to fetch
    while res['since_id'] != '':
        p = p + 1
        if p > endPage:
            break
        page_url = f'https://weibo.com/ajax/statuses/mymblog?uid={userID}&page={p}&feature=0&since_id=' + res['since_id']
        if proxyList is not None:
            proxy = {'all://': proxyList[randint(0, len(proxyList) - 1)]}
            res = httpx.get(page_url, headers=headers, proxies=proxy).json().get('data')
        else:
            res = httpx.get(page_url, headers=headers).json().get('data')
        content = res['list']
        for c in content:
            try:
                gwc = get_wb_content(c)
                result.append(gwc)
                if savePath is not None:
                    sh.append([gwc['user'], gwc['avatar'], gwc['location'], gwc['device'], gwc['time'],
                               gwc['content'], gwc['pictures'], p])
            except KeyError:
                result.append({'since_id': res['since_id'], 'page': p})
                if savePath is not None:
                    sh.append(['since_id', res['since_id'], 'page', p])
        sleep(1000)  # very long pause between pages to stay under the rate limit; tune to taste
    if savePath is not None:
        wb.save(savePath)
        wb.close()
    return result
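To run it you'll need httpx, fake-useragent and openpyxl installed (pip install httpx fake-useragent openpyxl). A minimal call might look like the sketch below; the cookie value, save path and page range are placeholders you must replace with your own. Also note that recent httpx releases (0.28+) removed the proxies= argument used above, so pin an older httpx or adapt it to proxy= if you pass a proxy list.

[Python]
# Illustrative only: the cookie value, save path and user ID below are placeholders.
if __name__ == '__main__':
    posts = get_wb_contents(
        startPage=1,
        endPage=5,
        cookies='SUB=...; SUBP=...',   # copy the cookie header from your logged-in browser session
        savePath='weibo_posts.xlsx',   # omit to skip the Excel export
        proxyList=None,                # e.g. ['http://127.0.0.1:7890']
        userID='1173935352'
    )
    print(f'scraped {len(posts)} rows')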