部署云服务器，爬取并推送二次元美女榜单

xjr741238569 · 发表于 2022-4-12 18:47

背景

突然爆发的疫情让我被迫隔离在宿舍上网课，因此有了充足的时间。
之前一直想学python却没有机会，希望通过这个项目有所收获。
写下这个帖子，也算是对这段时间的总结；潜水至今，这是第一次发帖。
对于python的使用还有很多不足之处，请多多谅解。

说明

原理是使用python爬取到榜单的图片保存到本地，然后可以设置推送到qq邮箱和企业微信机器人
代码默认是运行在linux服务器上，如需要运行在windows，可能需要修改一下路径
爬取的网站是P站热门排行-P站图片排行榜-触站 (huashi6.com)（https://www.huashi6.com/rank）

源码

[Python] 纯文本查看 复制代码

# coding=utf-8
import random
import datetime
import json
import random
from concurrent.futures import ThreadPoolExecutor
import datetime
import requests
import os
import smtplib
import time
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.image import MIMEImage
import datetime
import sys
import requests
import base64
import hashlib
# Pool of desktop browser User-Agent strings; download() picks one at random
# for every ranking request.
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
]
# Request headers sent with the ranking POST in download(). The "User-Agent"
# entry is only a default: download() replaces it before each request.
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1 Edg/100.0.4896.60",
    "accept": "application/json, text/plain, */*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    # NOTE(review): hard-coded body length; presumably copied from a captured
    # browser request -- confirm the HTTP client recomputes it for the real body.
    "content-length": "21",
    "content-type": "application/x-www-form-urlencoded",
    "dnt": "1",
    "origin": "https://m.huashi6.com",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-site",
}
# Endpoint that download() POSTs the form below to in order to get one page
# of the ranking as JSON.
index_url = "https://rt.huashi6.com/front/works/rank_page"
# Form fields of the ranking request. download() overwrites "index" with the
# requested page number; "date" is left empty by the code in this file.
data = {
    "index": "1",
    "size": "10",
    "date": ""
}

def gettime():
    """Return today's local date as a zero-padded ``YYYY-MM-DD`` string.

    The clock is sampled exactly once, so the year, month and day always
    describe the same instant even when the call happens right at midnight.

    Returns:
        str: The current local date, e.g. ``"2022-04-12"``.
    """
    return datetime.date.today().isoformat()

def date_del(str):
    """Return the calendar day before the given date.

    Args:
        str: A date written as ``Y-M-D`` (for example ``"2022-04-12"``); the
            components do not have to be zero-padded. The parameter name
            shadows the builtin and is kept only so existing callers keep
            working.

    Returns:
        The previous day formatted as zero-padded ``YYYY-MM-DD``.

    Raises:
        ValueError: If the text is not three integers separated by ``-`` or
            does not name a real calendar date.
        OverflowError: If the previous day falls before the earliest date
            supported by ``datetime.date``.
    """
    year, month, day = map(int, str.split('-'))
    # Let the standard library handle month lengths and leap years.
    previous_day = datetime.date(year, month, day) - datetime.timedelta(days=1)
    return previous_day.isoformat()

def download(index, path):
    """Download the cover image of every work on one ranking page.

    The page is requested from ``index_url`` with a randomly chosen
    User-Agent, and each work's cover image is saved into ``path`` as
    ``<title>.<extension>``.

    Args:
        index: Page number, sent as the ``index`` form field.
        path: Directory the images are written to; it must already exist.

    Raises:
        requests.RequestException: If the ranking page itself cannot be
            fetched (failures of individual images are reported and skipped).
        KeyError, ValueError: If the ranking response is not the expected JSON.
        OSError: If an image file cannot be written.
    """
    # Work on per-call copies so the shared module-level dicts stay untouched.
    request_headers = dict(headers)
    request_headers["User-Agent"] = random.choice(my_headers)
    payload = dict(data)
    payload['index'] = str(index)
    response = requests.post(index_url, headers=request_headers, data=payload, timeout=(3, 7))
    response.raise_for_status()
    works = response.json()['data']['works']['datas'] or []
    # Iterate over what the server actually returned instead of assuming 10 items.
    for work in works:
        herf = "https://img2.huashi6.com/" + str(work['coverImage']['path'])
        title = work['title']
        print(title)
        extension = herf.split('.')[-1]
        try:
            image_response = requests.get(herf, timeout=(3, 30))
            image_response.raise_for_status()
        except requests.RequestException as exc:
            # One broken image must not cost the rest of the page.
            print("下载失败：" + herf + "，原因：" + str(exc))
            continue
        # Titles are free text; path separators would point outside ``path``.
        safe_title = str(title).replace('/', '_').replace('\\', '_').replace('\0', '_')
        target = os.path.join(path, safe_title + '.' + str(extension))
        with open(target, 'wb') as file:  # create the file and write the image bytes
            file.write(image_response.content)

def wx_markdown(num,second):
    """Post a markdown summary of the run to the WeChat Work webhook.

    The notification is best-effort: network errors and malformed replies are
    printed instead of being raised, so a failed notification cannot stop the
    caller.

    Args:
        num: Number of images that were processed (formatted with ``%d``).
        second: Elapsed time in seconds, shown as-is in the message.
    """
    t = date_del(gettime())
    header = {
        "Content-Type": "application/json;charset=UTF-8"
    }
    message_body = {
        "msgtype": "markdown",
        "markdown": {
            "content": "#### %s %s \n" % (t,"的榜单已为你送达") +
                       "##### &#8226;  现在是：%s \n" % (gettime()) +
                       "##### &#8226;  有 %d 张图片请过目： \n" % (num) +
                       "##### &#8226;  此次发送耗时 %s 秒： \n" % (second) +
                       "##### &#8226; 已经全部打包好发往你的邮箱 \n"
        },
        "at": {
            "atMobiles": [],
            "isAtAll": False
        }
    }
    web_hook = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxxxxx"
    send_data = json.dumps(message_body)  # serialize the dict to a JSON string
    try:
        # Bound the wait so an unreachable webhook cannot hang the whole run.
        ChatBot = requests.post(url=web_hook, data=send_data, headers=header, timeout=(5, 30))
        opener = ChatBot.json()
    except (requests.RequestException, ValueError) as exc:
        print(u"通知消息发送失败，原因：{}".format(exc))
        return
    # A reply that is not a JSON object or has no "errmsg" is reported as a failure.
    if isinstance(opener, dict) and opener.get("errmsg") == "ok":
        print(u"通知消息发送成功！" )
    else:
        print(u"通知消息发送失败，原因：{}".format(opener))
def wx_image(image):
    """Send one image file to the WeChat Work webhook as an image message.

    The file is read a single time and both the base64 payload and the MD5
    digest are derived from those same bytes, so the two always agree.

    Args:
        image: Path of the image file to send.

    Returns:
        The ``requests`` response object of the webhook call.

    Raises:
        OSError: If the image file cannot be read.
        requests.RequestException: If the webhook cannot be reached in time.
    """
    with open(image, 'rb') as file:
        raw_bytes = file.read()
    image_data = base64.b64encode(raw_bytes).decode('utf-8')
    image_md5 = hashlib.md5(raw_bytes).hexdigest()
    url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxxxxx"  # fill in the bot's webhook address
    headers = {"Content-Type": "application/json"}
    payload = {
        "msgtype": "image",
        "image": {
            "base64": image_data,
            "md5": image_md5
        }
    }
    # Bound the wait so an unreachable webhook cannot hang the caller forever.
    result = requests.post(url, headers=headers, json=payload, timeout=(5, 30))
    return result
def wx_text(content):
    """Send a plain-text message to the WeChat Work webhook.

    The JSON content type is sent as a request header. It was previously
    passed through ``auth=``, which made the HTTP client send it as HTTP Basic
    credentials and never set the Content-Type.

    Args:
        content: Text of the message.

    Returns:
        The ``requests`` response object of the webhook call.

    Raises:
        requests.RequestException: If the webhook cannot be reached in time.
    """
    wx_url="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxxxxx"
    # WeChat Work group bot
    data = json.dumps({"msgtype": "text", "text": {"content": content}})
    return requests.post(
        wx_url,
        data=data,
        headers={"Content-Type": "application/json"},
        timeout=(5, 30),
    )
def sendmail():
    """Push yesterday's image folder to WeChat Work and e-mail it as attachments.

    Every file in ``/www/python/pic/<yesterday>/`` is announced and sent to
    the WeChat Work webhook and attached to a single e-mail. A markdown
    summary is then pushed and the e-mail is sent through QQ Mail over SSL.
    WeChat pushes are best-effort: a failed push is printed and the e-mail is
    still sent. SMTP and connection errors are printed, not raised.

    Raises:
        OSError: If the image folder is missing or a file cannot be read.
    """
    sender = 'xxxxxx'  # sender address
    password = 'xxxxxx'  # authorization code obtained from the mailbox settings
    receivers = 'xxxxxxx'  # recipient address(es) as ONE string; separate several with commas
    content = MIMEText("<html><h2>已为你爬取图片，请你过目</h2>", _subtype="html", _charset="utf-8")
    msg = MIMEMultipart('related')
    msg.attach(content)
    msg['Subject'] = str(gettime()) + '的榜单已经更新'
    msg['From'] = sender
    msg['To'] = receivers
    day = date_del(gettime())
    path = "/www/python/pic/" + str(day) + "/"
    print("正在读取"+str(day)+"文件夹，请稍等")
    num = 0
    start0 = time.time()
    for filename in os.listdir(path):
        num = num + 1
        name = os.path.join(path, filename)
        print(name)
        # Close the file as soon as its bytes are in the attachment.
        with open(name, 'rb') as image_file:
            imageApart = MIMEImage(image_file.read(), name.split('.')[-1])
        attachment_name = str(num) + "-" + filename
        # Pause between pushes -- presumably to stay under the webhook's rate
        # limit; TODO confirm the actual limit.
        time.sleep(6)
        try:
            wx_text("以下作品名字是\n" + attachment_name)
            wx_image(name)
        except requests.RequestException as exc:
            # A failed chat push must not prevent the e-mail from going out.
            print("企业微信推送失败，原因：{}".format(exc))
        imageApart.add_header('Content-Disposition', 'attachment', filename=attachment_name)
        msg.attach(imageApart)
    print(str(num) + "个文件蓄势待发")
    mail_host = "smtp.qq.com"
    runtime0 = str(time.time() - start0)
    try:
        wx_markdown(num,runtime0)
    except requests.RequestException as exc:
        print("企业微信推送失败，原因：{}".format(exc))
    start1 = time.time()
    # Log in with the authorization code and send the e-mail.
    try:
        # NOTE(review): smtplib timeouts are in seconds, so 30000 is over eight
        # hours -- confirm whether 30 was intended.
        # The context manager sends QUIT and closes the socket even on failure.
        with smtplib.SMTP_SSL(mail_host, 465, timeout=30000) as server:
            server.login(sender, password)
            server.sendmail(sender, receivers.split(','), msg.as_string())
            print('发送成功')
    except (smtplib.SMTPException, OSError) as e:
        # OSError also covers connection and timeout failures.
        print('error', e)
    runtime1 = str(time.time() - start1)
    print("此次耗时"+runtime1)

if __name__ == "__main__":
    # Script entry point: download ranking pages 1-9 into a folder named after
    # yesterday's date, then push and mail the result. If that folder already
    # holds files the run is treated as done and the script exits.
    s = date_del(gettime())
    yearpath = "/www/python/pic/" + str(s) + "/"
    if not os.path.exists(yearpath):
        os.makedirs(yearpath)
        print(s)
    file_nums = sum(len(files) for root, dirs, files in os.walk(yearpath))
    if file_nums > 0:
        print("读取到"+yearpath+"目录下有"+str(file_nums)+"个文件")
        print("程序自动终结")
        sys.exit()
    print("正在下载到"+yearpath+"文件夹，请稍后")
    for j in range(1, 10):
        try:
            download(j, yearpath)
        except Exception as exc:
            # Best-effort per page, but report the failure instead of hiding
            # it; Ctrl-C and SystemExit are no longer swallowed.
            print("第"+str(j)+"页下载失败，原因："+str(exc))
    sendmail()

zhi54 · 发表于 2022-4-12 20:23

二次元有点漂亮,喜欢真实点的美女,可以改下不.

xjr741238569 · 发表于 2022-4-12 21:12

zhi54 发表于 2022-4-12 20:23
二次元有点漂亮,喜欢真实点的美女,可以改下不.

有什么推荐的网站吗

若雪 · 发表于 2022-4-12 21:48

感谢分享

linhonghong1027 · 发表于 2022-4-12 22:25

这个必须支持一下~

lxfx1 · 发表于 2022-4-12 23:20

很不错，学习下

Asy_少洋 · 发表于 2022-4-29 21:21

果然是大佬呀，感谢分享已经用上了

Asy_少洋 · 发表于 2022-4-30 10:27

本帖最后由 Asy_少洋于 2022-4-30 10:50 编辑

可以了，搞定了，刚才报错

话痨司机啊 · 发表于 2022-4-30 10:47

感觉时间函数那块可以改改，python本身就有datetime库都封装好了，拿来用就行了

Asy_少洋 · 发表于 2022-4-30 12:29

定时任务那里没搞懂，不知道下面这个报错是什么意思，百度了半天也没弄明白：

★[2022-04-30 12:25:28] Successful
----------------------------------------------------------------------------
/www/server/cron/6f5f08a393f572a3e3f59b87aa0553eb: line 5: python3: command not found
----------------------------------------------------------------------------
★[2022-04-30 12:29:15] Successful

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] 部署云服务器，爬取并推送二次元美女榜单

免费评分