好友
阅读权限10
听众
最后登录1970-1-1
|
你输入链接就可以获取到内容,返回值放最后
本来抓取了第二页的内容,但发现它返回的链接打开后没有目录了,有时间再改吧。
目前程序的功能是:获取当前页的内容,以及获取下一页的链接。
[Python] 纯文本查看 复制代码
import json
import re
import time
import requests
from bs4 import BeautifulSoup
class CrawlMessage:
    """Scrape one WeChat article page and look up the next article link.

    ``main(url)`` is the entry point. It returns a dict with the keys
    ``__biz``, ``mid``, ``sn``, ``cur_album_id`` (taken from the URL),
    ``title``, ``createTime``, ``image_urls``, ``req_id``, ``album_id``
    (scraped from the page) and ``next_url`` (taken from the JSON reply of
    the ``getappmsgext`` endpoint).
    """

    # Browser-like User-Agent sent with every request.
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
    # Seconds to wait for the server; without a timeout requests can block
    # forever on a stalled connection.
    REQUEST_TIMEOUT = 10
    # Endpoint queried for the album information; ``{0}`` is the unpadded __biz.
    NEXT_URL_TEMPLATE = (
        "https://mp.weixin.qq.com/mp/getappmsgext?f=json&mock=&uin=&key="
        "&pass_ticket=&wxtoken=777&devicetype=&clientversion=&__biz={0}"
        "&appmsg_token=&x5=0&f=json"
    )

    def __init__(self):
        self.url = ""
        self.session = requests.Session()

    @staticmethod
    def _pad_base64(value):
        """Return *value* with the trailing ``=`` padding base64 requires."""
        return value + "=" * (-len(value) % 4)

    def get_list(self, result):
        """Fetch ``self.url`` and add the scraped article fields to *result*.

        Adds ``title``, ``createTime``, ``image_urls``, ``req_id`` and
        ``album_id`` to *result* (mutated in place) and returns it.

        Raises:
            AttributeError: if the page lacks one of the expected markers
                (``req_id``, ``album_id``, ``createTime``, the title ``<h1>``
                or the ``js_content`` ``<div>``).
        """
        resp = self.session.get(
            self.url,
            headers={"user-agent": self.USER_AGENT},
            timeout=self.REQUEST_TIMEOUT,
        )
        print(resp.text)  # debug dump of the raw page, kept from the original
        soup = BeautifulSoup(resp.text, "lxml")
        # Serialise the parsed tree once; all three patterns run against it.
        page = str(soup)

        # group(1) is the captured value; group() would return the whole
        # "req_id = '...';" match, which is not a usable request id.
        req_id = re.search(r"req_id = '(.*?)';", page).group(1)
        album_id = re.search(r";album_id=(.*?)#wechat_redirect", page).group(1)
        create_time = re.search(r" createTime = '(.*?)';", page).group(1)

        title_tag = soup.find("h1", id="activity-name")
        content = soup.find("div", id="js_content")
        # Skip <img> tags without a data-src attribute instead of raising
        # KeyError on them.
        image_list = [
            img["data-src"]
            for img in content.find_all("img")
            if img.get("data-src")
        ]

        result["title"] = title_tag.text.replace("\n", "").strip()
        result["createTime"] = create_time
        result["image_urls"] = image_list
        result["req_id"] = req_id
        result["album_id"] = album_id
        return result

    def get_next_url(self, result):
        """Ask the ``getappmsgext`` endpoint for the next article link.

        Reads ``__biz``, ``mid``, ``sn``, ``title``, ``req_id``, ``album_id``
        and ``cur_album_id`` from *result*, stores the reply's
        ``appmsg_album_extinfo.next_article_link`` under ``next_url`` and
        returns *result* (mutated in place).

        Raises:
            KeyError: if *result* lacks a required key or the JSON reply
                does not contain the expected fields.
        """
        url = self.NEXT_URL_TEMPLATE.format(result["__biz"])
        print(url)
        data = {
            "r": "0.9572518177857114",
            # main() stores __biz without its '=' padding; restore it from
            # the length rather than assuming it is always "==".
            "__biz": self._pad_base64(result["__biz"]),
            "appmsg_type": "9",
            "mid": result["mid"],
            "sn": result["sn"],
            "idx": "1",
            "scene": "189",
            "title": result["title"],
            "ct": time.time(),
            "abtest_cookie": "",
            "devicetype": "",
            "version": "",
            "is_need_ticket": "0",
            "is_need_ad": "0",
            "comment_id": "0",
            "is_need_reward": "0",
            "both_ad": "0",
            "reward_uin_count": "0",
            "send_time": "",
            "msg_daily_idx": "1",
            "is_original": "0",
            "is_only_read": "1",
            "req_id": result["req_id"],
            "pass_ticket": "",
            "is_temp_url": "0",
            "item_show_type": "0",
            "tmp_version": "1",
            "more_read_type": "0",
            "appmsg_like_type": "2",
            "related_video_sn": "",
            "related_video_num": "5",
            "vid": "",
            "is_pay_subscribe": "0",
            "pay_subscribe_uin_count": "0",
            "has_red_packet_cover": "0",
            "album_id": result["album_id"],
            "album_video_num": "5",
            "cur_album_id": result["cur_album_id"],
            "is_public_related_video": "NaN",
            "encode_info_by_base64": "undefined",
            "exptype": "",
            "export_key": "",
            "export_key_extinfo": "",
            "business_type": "0",
        }
        print(data)
        resp = self.session.post(
            url,
            headers={"user-agent": self.USER_AGENT},
            data=data,
            timeout=self.REQUEST_TIMEOUT,
        )
        print("------------------------------")
        result["next_url"] = resp.json()["appmsg_album_extinfo"]["next_article_link"]
        print(result)
        return result

    def main(self, url):
        """Scrape the article at *url* and return the collected dict.

        The query parameters are looked up by name, so their order, any
        extra parameters and a trailing ``#fragment`` do not matter.
        ``cur_album_id`` is optional and defaults to ``""``.

        Raises:
            ValueError: if *url* lacks ``__biz``, ``mid`` or ``sn``.
        """
        from urllib.parse import parse_qs, urlsplit

        query = parse_qs(urlsplit(url).query)
        missing = [key for key in ("__biz", "mid", "sn") if key not in query]
        if missing:
            raise ValueError(
                "article URL is missing query parameter(s): {0}".format(
                    ", ".join(missing)
                )
            )

        self.url = url
        result = {
            # Stored without '=' padding, as before; get_next_url re-pads it.
            "__biz": query["__biz"][0].rstrip("="),
            "mid": query["mid"][0],
            "sn": query["sn"][0],
            "cur_album_id": query.get("cur_album_id", [""])[0],
        }
        result = self.get_list(result)
        self.get_next_url(result)
        return result
if __name__ == "__main__":
    # Demo run: scrape one sample article and print the collected dict.
    crawler = CrawlMessage()
    article_url = "https://mp.weixin.qq.com/s?__biz=MzU4Nzc5OTQ1Mw==&mid=2247497422&idx=1&sn=03103c2d73b1f58bb6eefe86e1b6ef10&chksm=fde4383cca93b12a2aefd3a6739cca9941c077924f643a668ffd25821491a4ae61397093b2a1&cur_album_id=2095303856095133698&scene=189"
    scraped = crawler.main(article_url)
    print(scraped)
返回内容:[Python] 纯文本查看 复制代码 {'__biz': 'MzU4Nzc5OTQ1Mw', 'mid': '2247497422', 'sn': '03103c2d73b1f58bb6eefe86e1b6ef10', 'cur_album_id': '2095303856095133698', 'title': '超级地球在宇宙中到处都是,偏偏太阳系里却没有,为什么?', 'createTime': '2023-12-15 17:01', 'image_urls': ['https://mmbiz.qpic.cn/mmbiz_jpg/ZU4DYPV27d0OMK0upUdXpWB8tCMwkM3wnL4Ujj2A0WE8c9OqqbNZsAh6tqt1Ng1rnYOBEO2BI2JibZevOTQ3iavQ/640?wx_fmt=jpeg&from=appmsg', 'https://mmbiz.qpic.cn/mmbiz_jpg/ZU4DYPV27d0OMK0upUdXpWB8tCMwkM3w9IsQeXNkvBiabdrl218zC4rWnCOEowEyxicA1vnBDPuQuOKDdBxvpicMg/640?wx_fmt=jpeg&from=appmsg', 'https://mmbiz.qpic.cn/mmbiz_jpg/ZU4DYPV27d0OMK0upUdXpWB8tCMwkM3wH6icvx7N1z8W5iabgYBza86Licd8yCCOgv2UIeej9fjXESqbRHicYzGfyw/640?wx_fmt=jpeg&from=appmsg', 'https://mmbiz.qpic.cn/mmbiz_jpg/ZU4DYPV27d0OMK0upUdXpWB8tCMwkM3w8Ajc5tibX0cUqyk50nk59Z999zibdpL4pMFRusxR8rMiaj0tXxSxYOmwA/640?wx_fmt=jpeg&from=appmsg', 'https://mmbiz.qpic.cn/mmbiz_jpg/ZU4DYPV27d0OMK0upUdXpWB8tCMwkM3wJNr2vFGaNSQbFqZibTLfyfazwcoK56UST1MR0nRe7BEsmic7liaE4pM6g/640?wx_fmt=jpeg&from=appmsg', 'https://mmbiz.qpic.cn/mmbiz_jpg/ZU4DYPV27d0OMK0upUdXpWB8tCMwkM3wlU5xT5I7jUpv2CkERoP6zAicww2SUhJryQXYEupkkSGOOLoeR9DFkUw/640?wx_fmt=jpeg&from=appmsg'], 'req_id': "req_id = '0621fjwf9xXEIdV1UsbivKnN';", 'album_id': '2095303856095133698', 'next_url': 'http://mp.weixin.qq.com/s?__biz=MzU4Nzc5OTQ1Mw==&mid=2247498092&idx=1&sn=0cbcf8519c7ba37a28462606d2e8c961&chksm=fde4379eca93be88c395eefea1d58f7483d5409a9658ba7819b030944e17c207fdac065a8270#wechat_redirect'}
其中 title 是标题,image_urls 是图片链接列表,next_url 是下一篇文章的链接;js_content 是代码里解析出的正文节点,目前并没有放进返回值。
===============
最后你看着改一下吧 |
|