吾爱破解 (52pojie.cn) - LCG - LSG | Android Cracking | Virus Analysis | www.52pojie.cn


[Python Original] [Original Source] Fetch the Baidu, Bilibili, and Weibo hot lists automatically with selenium and requests, and push them to WeChat Work

wkdxz posted on 2022-6-15 15:22
These values need to be filled in yourself, otherwise the push won't work:
[Python]
corpid = ''  # WeChat Work corpid
corpsecret = ''  # WeChat Work corpsecret
appid = ''  # WeChat Work agentid of the app


I've been learning Selenium lately, and since I check hot lists a lot anyway, I built this: every day, on a schedule, it pushes the Baidu, Bilibili, and Weibo hot lists to WeChat Work, so I don't have to hunt for them myself. I also wrote code for the Douyin hot list, but Douyin links can't be opened directly inside WeChat, so I dropped that part.
Add a scheduled task and it pushes automatically in the background:
(screenshot: Snipaste_2022-06-15_15-19-28.jpg)
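
As an alternative to an OS scheduled task, the same daily trigger can live in the script itself. A minimal stdlib sketch; `run_push_job` is a hypothetical name standing in for the three `send_top` calls:

```python
import time
from datetime import datetime, timedelta


def seconds_until(hour, minute):
    """Seconds from now until the next occurrence of hour:minute."""
    now = datetime.now()
    target = now.replace(hour=hour, minute=minute, second=0, microsecond=0)
    if target <= now:  # already past today's slot, wait for tomorrow's
        target += timedelta(days=1)
    return (target - now).total_seconds()


# while True:
#     time.sleep(seconds_until(8, 0))
#     run_push_job()  # hypothetical wrapper around the three send_top calls
```

The sleep-loop is commented out so the sketch can be imported without blocking.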

The push looks roughly like this:
(screenshot: Snipaste_2022-06-15_15-20-54.jpg)

Full code
[Python]
import json
from datetime import datetime
from os import path

import emoji
import requests as req
from fake_useragent import UserAgent
from selenium import webdriver
# from urllib.parse import quote_plus  # URL-encode Chinese search terms

corpid = ''  # WeChat Work corpid
corpsecret = ''  # WeChat Work corpsecret
appid = ''  # WeChat Work agentid of the app

tToday = datetime.now().strftime('%H:%M')
send_count = 11  # pushes send_count - 1 items


def filter_str(s):  # strip special characters from titles
    for _ in range(3):
        for d in r'!!??.。-_#¥$%&·`、、:;*/\\':
            s = s.replace(f'{d}{d}', d)  # collapse doubled punctuation
        s = s.replace(' ', '')
    return emoji.get_emoji_regexp().sub(r'', s)  # drop emoji (needs emoji < 2.0)


def get_with_se(site):  # Baidu and Bilibili are both scraped via selenium
    ua = UserAgent().random
    option = webdriver.ChromeOptions()
    # dodge webdriver detection and silence selenium's error spam
    # (one combined call: a second excludeSwitches call would overwrite the first)
    option.add_experimental_option('excludeSwitches',
                                   ['enable-automation', 'enable-logging'])
    option.add_experimental_option('useAutomationExtension', False)
    option.add_argument('user-agent=' + ua)  # random User-Agent

    # speed things up
    option.add_argument('--no-sandbox')  # avoids the DevToolsActivePort error
    option.add_argument('--disable-gpu')  # works around a Chrome bug
    option.add_argument('--hide-scrollbars')
    option.add_argument('blink-settings=imagesEnabled=false')  # don't load images
    option.add_argument('--headless')  # run without a window

    wd = webdriver.Chrome(options=option)
    wd.execute_cdp_cmd(
        'Page.addScriptToEvaluateOnNewDocument', {
            'source':
            'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
        })  # hide navigator.webdriver as well
    hot_list = []
    if site == 'bili':
        url = 'https://www.bilibili.com/v/popular/rank/all'
        wd.get(url)
        for i in range(1, send_count):
            url_cmd = f'//*[@id="app"]/div/div[2]/div[2]/ul/li[{i}]/div/div[1]/a'
            url = wd.find_element_by_xpath(url_cmd).get_attribute('href')
            title_cmd = f'//*[@id="app"]/div/div[2]/div[2]/ul/li[{i}]/div/div[2]/a'
            title = wd.find_element_by_xpath(title_cmd).text
            zuozhe_cmd = f'//*[@id="app"]/div/div[2]/div[2]/ul/li[{i}]/div/div[2]/div/a/span'
            zuozhe = wd.find_element_by_xpath(zuozhe_cmd).text
            hot_list.append([f'{zuozhe}:{title}', url])
    elif site == 'baidu':
        url = 'https://top.baidu.com/board?tab=realtime'
        wd.get(url)
        for i in range(1, send_count):
            title = wd.find_element_by_css_selector(
                f'.category-wrap_iQLoo:nth-child({i}) .c-single-text-ellipsis'
            ).text.strip()
            url = f'https://www.baidu.com/s?wd={title}'
            hot_list.append([title, url])
    wd.quit()
    return hot_list


def save(file, content):
    with open(file, 'w', encoding='gb2312') as f:
        f.write(content)


def load(file):
    with open(file, 'r', encoding='gb2312') as f:
        hot_list = f.readlines()
    return [i.strip() for i in hot_list]


def send_wx(x):
    url = f'https://qyapi.weixin.qq.com/cgi-bin/gettoken?corpid={corpid}&corpsecret={corpsecret}'
    r = req.get(url, timeout=5)
    tokens = json.loads(r.text)['access_token']
    url = "https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token=" + tokens
    data = {
        "touser": "@all",
        "msgtype": "text",
        "agentid": appid,
        "text": {
            "content": x
        },
        "safe": 0,
    }
    data = json.dumps(data)
    return req.post(url, data=data, timeout=9).text


def weibo():
    response = req.get("https://weibo.com/ajax/side/hotSearch")
    data_json = response.json()['data']['realtime']
    n = 1
    hot_list = []
    for i in data_json:
        title = i['note']
        url = 'https://s.weibo.com/weibo?q=%23' + i['word'] + '%23'
        hot_list.append([title, url])
        n += 1
        if n == send_count: break  # stop once we have send_count - 1 items
    return hot_list


def send_top(site_name, hot_list):

    if site_name == 'bili':
        site = 'B站'
    elif site_name == 'baidu':
        site = '百度'
    elif site_name == 'weibo':
        site = '微博'

    new_list = [f'【{site}】{tToday}']

    full_hots_list = []
    start_num = 1
    file = f'E:/Backup/脚本/txt/{site_name}.txt'
    if not path.exists(file): save(file, '')  # create the history file if missing
    old_hot = load(file)  # titles seen last run
    for i in hot_list:
        title, url = i
        title = filter_str(title)  # strip special characters
        full_hots_list.append(title)  # keep every title for the next comparison
        if title not in old_hot:
            new_list.append(f'<a href="{url}">{start_num}. {title}</a>')  # new item
            start_num += 1
    new_txts = '\n\n'.join(new_list)  # join the new titles (old ones excluded)
    save(file, '\n'.join(full_hots_list))  # store the list for the next run
    if len(new_list) > 1:
        send_wx(new_txts)


if __name__ == '__main__':
    send_top('bili', get_with_se('bili')[:send_count])
    send_top('baidu', get_with_se('baidu')[:send_count])
    send_top('weibo', weibo())
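
The comparison against last run's file in `send_top` boils down to a set lookup over old titles. A standalone sketch of the same idea; the `new_items` name is mine, not from the script:

```python
def new_items(current, previous):
    """Keep only (title, url) pairs whose title was not seen last run."""
    seen = set(previous)
    return [(t, u) for t, u in current if t not in seen]


old_titles = ['topic A', 'topic B']  # what load(file) returned last time
fresh = [('topic A', 'https://example.com/a'),
         ('topic C', 'https://example.com/c')]
print(new_items(fresh, old_titles))  # [('topic C', 'https://example.com/c')]
```

Only 'topic C' survives, which is exactly why an unchanged hot list produces no push (`len(new_list) > 1` fails).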

Ratings

3 participants · 吾爱币 +9 · Heart +2
苏紫方璇 +7 吾爱币, +1 Heart: Welcome! Analysis and discussion make 52pojie better.
天真Aro +1 吾爱币: Thanks for sharing quality security tools and docs!
癫疯灬博 +1 吾爱币, +1 Heart: Agreed!


OP | wkdxz posted on 2022-6-18 16:28
Last edited by wkdxz on 2022-6-18 18:01
Quoting 小初 (2022-6-18 14:53):
How about adding Douyin and Zhihu?

I did write the Douyin hot-list scraper, but its links can't be opened inside WeChat, so I left it out. For Zhihu, my Zhihu Daily script (https://www.52pojie.cn/thread-1651017-1-1.html) already has the hot list, pushed once a day.

The Douyin module, which returns a list of [keyword, search URL] pairs:

[Python]
def get_douyin():  # WeChat can't open Douyin links, so this one isn't pushed
    r = req.get('https://aweme.snssdk.com/aweme/v1/hot/search/list/',
                timeout=5)
    obj = json.loads(r.text)
    word_list = obj['data']['word_list']
    items = list(word_list)
    hot_list = []
    for i in items:
        title = i['word']
        url = f'https://www.douyin.com/search/{title}'
        hot_list.append([title, url])
    return hot_list
OP | wkdxz posted on 2022-8-24 14:06
Last edited by wkdxz on 2022-8-24 14:21
Quoting wangke333 (2022-8-24 14:00):
The code has errors, please check

Here's the newer version I use myself. It needs an Access database (a SQL database also works if you change the connection code), structured like this:
(screenshot: Snipaste_2022-08-24_14-20-23.jpg)
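
For anyone without Access: judging from the SQL in the script, `list` holds 标题/网址/来源 plus a 日期 column, and `up` holds Uid/Nick/unlike. A hedged SQLite equivalent, with the column layout inferred from the queries rather than from the screenshot:

```python
import sqlite3

# In-memory for the demo; pass a file path for a persistent database.
conn = sqlite3.connect(':memory:')
conn.execute('''CREATE TABLE list (
    id   INTEGER PRIMARY KEY,
    标题  TEXT,
    网址  TEXT,
    来源  TEXT,
    日期  DATE DEFAULT CURRENT_DATE)''')
conn.execute('''CREATE TABLE up (
    id     INTEGER PRIMARY KEY,
    Uid    TEXT,
    Nick   TEXT,
    unlike INTEGER DEFAULT 0)''')
# Parameterized insert, mirroring what save() writes.
conn.execute("INSERT INTO list (标题, 网址, 来源) VALUES (?, ?, ?)",
             ('example', 'https://example.com', 'weibo'))
print(conn.execute("SELECT 标题, 来源 FROM list").fetchall())  # [('example', 'weibo')]
```

Switching the script over would mean replacing the `pypyodbc` connection in `data()` with this `sqlite3` one and adjusting the Access-only SQL (`top 10`, `rnd(id)`, `date()`).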


[Python]
from collections import Counter
from datetime import datetime
from random import sample
from urllib.parse import quote_plus  # URL-encode Chinese search terms
import difflib
import jieba
import json
import pypyodbc
import re
import requests

corpid = ''  # WeChat Work corpid
corpsecret = ''  # WeChat Work app corpsecret
appid = ''  # WeChat Work app agentid

str_now = datetime.now().strftime('%H:%M')
str_month = datetime.now().strftime('%Y-%m')  # ('%Y-%m-%d')

send_count = 11  # pushes send_count - 1 items


def data(sql, write=False):
    hot_data = 'E:/hots.mdb'
    conn = pypyodbc.connect(
        u'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=' + hot_data)
    cursor = conn.cursor()
    cursor.execute(sql)
    if write:
        cursor.commit()  # commit writes immediately
    else:
        data = cursor.fetchall()
        cursor.close()
        return data


def similar_title(title, old_title_list, bili=False):  # similarity > 0.7 counts as duplicate
    if bili:
        return False  # Bilibili titles skip the similarity check
    return any(
        difflib.SequenceMatcher(None, title, old_title).quick_ratio() > 0.7
        for old_title in old_title_list)


def filter_str(s):  # strip special characters from titles
    for _ in range(3):
        for d in r'!!??.。-_&·`、、:;*/\\':
            s = s.replace(f'{d}{d}', d)
        for d in r'{}“”【】~●▲▼◆■★':
            s = s.replace(d, '')
    s = s.replace(' ', '')
    return s


def zhong_wen(s):  # keep Chinese characters, letters and digits as the title key
    res = re.findall('[\u4e00-\u9fa5A-Za-z0-9.-]', s)
    return ''.join(res)


def save(hot_list):
    title, url, site_name = hot_list
    sql = f"insert into list (标题,网址,来源) values('{title}','{url}','{site_name}')"
    data(sql, True)


def recently_hots(isbili=False, days=10):  # for Bilibili, match on source only, ignoring dates
    if isbili:
        sql = "select 网址 from list where 来源='bili'"
    else:
        sql = f"select 标题 from list where 日期>date()-{days}"  # titles from the last `days` days, any source
    return {i[0] for i in data(sql)}


def send_wx(x):
    url = f'https://qyapi.weixin.qq.com/cgi-bin/gettoken?corpid={corpid}&corpsecret={corpsecret}'
    r = requests.get(url, timeout=5)
    tokens = r.json()['access_token']
    url = f"https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token={tokens}"
    data = {
        "touser": "@all",
        # "touser": 'wuxiaozhi',
        "msgtype": "text",
        "agentid": appid,
        "text": {
            "content": x
        },
        "safe": 0,
    }
    data = json.dumps(data)
    return requests.post(url, data=data, timeout=9).json()


def weibo():
    rjson = requests.get("https://weibo.com/ajax/side/hotSearch")
    rjson = rjson.json()['data']['realtime']
    hot_list = set()
    for i in rjson[:send_count]:
        if 'is_ad' in i:  # skip ads
            continue
        title = i['note']
        new_title = quote_plus(title) if '%' in title else title  # a raw '%' would break the URL
        url = f'https://s.weibo.com/weibo/{new_title}'
        hot_list.add((title, url))
    return hot_list


def hot_words(title_list):
    words = jieba.cut_for_search(title_list)
    true_words = [i for i in words if len(i.strip()) >= 2]
    sl = Counter(true_words)
    return [i for i in sl if sl[i] >= 5]


def baidu():
    rjson = requests.get('https://top.baidu.com/board?tab=realtime')
    rjson.encoding = 'utf-8'
    html = rjson.text
    if zhushi_re := re.findall('<!--s-data:(.*false})-->', html, re.S):
        txt_json = zhushi_re[0]  # the hot-list JSON is embedded in an HTML comment
    else:
        raise RuntimeError('hot-list JSON not found in the Baidu page')
    datas = json.loads(txt_json)
    real_data = datas['data']['cards'][0]['content']
    hot_list = set()
    for i in real_data[:send_count]:
        title = i['word']
        if title[0] != '#' and title[-1] != '#':
            new_title = quote_plus(
                title) if '%' in title else title  # a raw '%' would break the URL
            url = f'https://www.baidu.com/s?wd={new_title}'
            hot_list.add((title, url))
    return hot_list


def all_bili_list(ups):  # collect recent uploads from each followed uploader
    hot_list = set()
    for uid in ups:
        params = (
            ('mid', uid),
            ('ps', '30'),
            ('tid', '0'),
            ('pn', '1'),
            ('keyword', ''),
            ('order', 'pubdate'),
            ('jsonp', 'jsonp'),
        )
        rjson = requests.get('https://api.bilibili.com/x/space/arc/search',
                             params=params).json()
        for i in rjson['data']['list']['vlist']:
            play = i['play']  # play count
            danmu = i['video_review']  # danmaku count
            comment = i['comment']  # comment count
            title = i['title']
            bvid = i['bvid']
            url = f'https://www.bilibili.com/video/{bvid}'
            # plays, danmaku, or comments above threshold, and not sent before
            if (play > (300 * 10000) or danmu > 3000
                    or comment > 2000) and url not in recently_hots(True):
                hot_list.add((f'{ups[uid]}:{title}', url))
    return hot_list


def bili():
    sql = '''
    select top 10 Uid,Nick
        from up
        where unlike=false
        order by rnd(id)
    '''  # pick 10 uploaders at random
    ups = dict(data(sql))
    hots = all_bili_list(ups)
    # for i in hots:
    #     print(i)
    shu = 3  # number of items to push
    return sample(list(hots), shu) if len(hots) >= shu else hots  # sample() needs a sequence, not a set


def replace_hot_title(title):  # wrap trending words in parentheses
    hot_list = ' '.join(recently_hots(False, 1))
    hwords = hot_words(hot_list)
    for i in hwords:
        if i in title:
            title = title.replace(i, f'({i})')
    return title


def send_top_news(site_name, hot_list):
    if site_name == 'baidu':
        site = '百度'
    elif site_name == 'bili':
        site = 'B站'
    elif site_name == 'weibo':
        site = '微博'

    new_list = [f'【{site}】{str_now}']
    start_num = 1
    old_title_list = recently_hots()  # titles already in the database

    for i in hot_list:
        title, url = i
        title = filter_str(title)  # strip special characters
        new_title = zhong_wen(title)  # keyword form of the title, safe to store

        if site_name == 'bili' or not similar_title(
                title, old_title_list):  # Bilibili skips the similarity check
            save([new_title, url, site_name])  # store title, URL, source
            new_list.append(
                f'<a href="{url}">{chr(10101+start_num)} {replace_hot_title(title)}</a>'
            )  # trending words get parentheses
            start_num += 1

    if len(new_list) > 1:
        new_txts = '\n\n'.join(new_list)  # join the new titles (old ones excluded)
        if not all([corpid, corpsecret, appid]):
            print('WeChat Work credentials are incomplete; printing instead of sending')
            print(new_txts)
        else:
            send_wx(new_txts)


if __name__ == '__main__':
    print('Fetching bili ...')
    send_top_news('bili', bili())

    print('Fetching baidu ...')
    send_top_news('baidu', baidu())

    print('Fetching weibo ...')
    send_top_news('weibo', weibo())
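
The `similar_title` dedup above is just difflib's `quick_ratio` with a 0.7 cutoff. A standalone sketch of the same measure:

```python
import difflib


def is_similar(a, b, threshold=0.7):
    # Same measure the script uses in similar_title().
    return difflib.SequenceMatcher(None, a, b).quick_ratio() > threshold


print(is_similar('breaking news about rockets', 'breaking news about rocket'))  # True
print(is_similar('completely', 'different'))  # False
```

`quick_ratio` only compares character frequencies (an upper bound on `ratio`), which is why near-identical titles with reordered words still count as duplicates.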
yzqhj posted on 2022-6-15 15:45
OP | wkdxz posted on 2022-6-15 15:53
Quoting yzqhj (2022-6-15 15:45):
Is this a cloud function?

No, no, I haven't used cloud functions yet
wang919 posted on 2022-6-15 16:44
6666666666
yzqhj posted on 2022-6-15 17:36
Quoting wkdxz (2022-6-15 15:53):
No, no, I haven't used cloud functions yet

So this runs locally?
OP | wkdxz posted on 2022-6-15 17:50
Quoting yzqhj (2022-6-15 17:36):
So this runs locally?

Yes. In theory it could also run on a server, you'd just need to change the file paths.
shangpeng posted on 2022-6-15 19:09
Bump~~~~~~~
sssguo posted on 2022-6-15 22:33
Thanks for sharing
hannce posted on 2022-6-17 23:43
A question for the OP: about corpsecret = '' (the WeChat Work corpsecret), do I need to create a new self-built app to generate the corpsecret, or can I use an existing one directly?
OP | wkdxz posted on 2022-6-18 07:48
Quoting hannce (2022-6-17 23:43):
A question for the OP: about corpsecret = '' (the WeChat Work corpsecret), do I need to create a new self-built app to ...

You can use an existing one; any app will do.