爬小姐姐视频为啥异步开不起来问题在哪里呢

lihu5841314 · 发表于 2021-7-5 21:46

本帖最后由 lihu5841314 于 2021-7-6 11:18 编辑

[Asm] 纯文本查看 复制代码

#--------------------------------------------------
import requests, time, re,os
from bs4 import BeautifulSoup
from multiprocessing.dummy import  Pool

"""
1.通过抓包工具分析视频网址   视频音频一般在Media中  找到视频url
https://huya-w10.huya.com/2119/505632739/1300/8bafda5b5bbace86a2edcb1cdb2da201.mp4
2.分析视频url的来源  通过截取视频url的后半段数据用全局搜索去搜 例如：8bafda5b5bbace86a2edcb1cdb2da201
找到视频url的来源          url_1 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124001977014823594203_1625483619844&videoId=505632739&_=1625483619852
通过多个视频分析来源url的变化url_2 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&uid=&_=1625484034830
?后面是请求携带的参数通过分析可以知道=1625483619852是时间戳可以通过python的time模块实现
&videoId  就是视频请求url的后半部分
jQuery112403179934484805591_1625484034808没搞明白 暂时不管试试
"""
# 1.从列表页响应中获得每个视频的播放页url
# 2.videoId+时间戳构建视频来源的url
# 3.从视频来源的url中提取出视频的url地址
# 4.请求视频url地址  持久化存储


# Target site: the list page that main() requests first
url = 'https://v.huya.com/g/all?set_id=31&order=hot&page=1'

# Headers sent with the requests made in this script (desktop Chrome user-agent)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}


# 构建请求函数
def get_response(url):
    """Fetch ``url`` with the shared request headers.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The ``requests.Response`` when the server answers with status 200;
        ``None`` when the request fails or any other status is returned.
    """
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        resp = requests.get(url=url, headers=headers, timeout=30)
    except requests.RequestException:
        # Only request-level failures are expected here; anything else
        # (e.g. KeyboardInterrupt) should propagate instead of being hidden.
        print("请求失败")
        return None
    if resp.status_code != 200:
        return None
    # Let requests guess the charset from the body so resp.text decodes properly.
    resp.encoding = resp.apparent_encoding
    return resp


# 解析每个视频的url
def parse_videoid(resp):
    """Extract the play-page URL and title of every video on a list page.

    Args:
        resp: Response of the list page (anything exposing ``.text``), or
            ``None`` when the request failed.

    Returns:
        A list of dicts with keys ``'video_url'`` (absolute play-page URL)
        and ``'video_name'`` (the anchor's ``title`` attribute). The list is
        empty when ``resp`` is ``None`` or the video list is not in the page.
    """
    if resp is None:
        return []
    # Naming the parser explicitly avoids the warning bs4 emits otherwise.
    soup = BeautifulSoup(resp.text, "lxml")
    video_list = soup.find('ul', "vhy-video-list w215 clearfix")
    if video_list is None:
        return []
    video_dics = []
    for li in video_list.find_all('li'):
        anchor = li.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # An item without a link cannot be turned into a play-page URL.
            continue
        video_dics.append({
            'video_url': 'https://v.huya.com' + href,
            'video_name': anchor.get('title'),
        })
    return video_dics


# 构建来源url  的时间戳是13位  对时间戳进行构造
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124005236691164789575_1625490993380&videoId=525648071&_=1625490993386
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124040335487863006936_1625490632031&videoId=508624469&_=162549063203
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&_=1625484034830
def create_url(video_dics):
    """Turn play-page entries into getMomentContent API entries.

    Args:
        video_dics: Iterable of dicts with ``'video_url'`` (play-page URL
            whose last path segment starts with the video id) and
            ``'video_name'``.

    Returns:
        A list of dicts with ``'video_url'`` set to the API URL built for
        that video id and ``'video_name'`` carried over unchanged.
    """
    def _to_api_entry(entry):
        page_url = entry['video_url']
        title = entry['video_name']
        # The id is the last path segment up to its first dot.
        video_id = page_url.rsplit('/', 1)[-1].partition('.')[0]
        # Millisecond timestamp, reused for both the callback suffix and `_`.
        url_time = int(time.time() * 1000)
        api_url = f'https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_{url_time}&videoId={video_id}&_={url_time}'
        return {'video_url': api_url, 'video_name': title}

    return [_to_api_entry(entry) for entry in video_dics]


# 从来源url响应中提取出视频的url
def get_movie_url(mov_dic):
    """Resolve the direct media URL for one video from its API response.

    Args:
        mov_dic: Dict with ``'video_url'`` (API URL) and ``'video_name'``.

    Returns:
        A dict with ``'movie_url'`` (first matched URL with its query string
        removed) and ``'video_name'``; ``None`` when the request fails or the
        response contains no match.
    """
    video_name = mov_dic['video_name']
    resp = get_response(mov_dic['video_url'])
    if resp is None:
        # get_response returns None on failure; there is nothing to parse.
        return None
    matches = re.findall(r',"url":"(?P<movie_url>.*?)"', resp.text)
    if not matches:
        return None
    return {
        'movie_url': matches[0].split("?")[0],
        'video_name': video_name,
    }


def Down_movie(dic):
    """Download one video into the ``video/`` directory.

    Args:
        dic: Dict with ``'movie_url'`` and ``'video_name'``, or ``None`` for
            an entry whose media URL could not be resolved (skipped).

    Returns:
        None. Request and file errors are reported on stdout so that one bad
        entry does not abort the remaining downloads.
    """
    if not dic:
        return
    url = dic['movie_url']
    print(url)
    name = dic['video_name']
    remote_file = url.split('/')[-1].split('?')[0]
    # Swap only the leading stem for the title; everything from the first
    # dot onwards (the extension) is kept as served.
    stem, dot, rest = remote_file.partition('.')
    # Titles may contain characters that are not valid in file names (path
    # separators in particular); fall back to the remote stem without a title.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name) if name else stem
    path1 = os.path.join('video', safe_name + dot + rest)
    print(name, '************正在下载**********')
    try:
        # stream=True avoids holding the whole file in memory.
        with requests.get(url=url, headers=headers, stream=True, timeout=30) as resp:
            resp.raise_for_status()
            with open(path1, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
    except (requests.RequestException, OSError) as err:
        print(name, '下载失败', err)
        return
    print(name, '下载完成')
    time.sleep(2)


def main():
    """Scrape the list page and download every resolvable video.

    Relies on the module-level ``url`` and on the module-level ``pool``
    that the ``__main__`` guard creates before calling this function.
    Prints ``over`` and the elapsed seconds when finished.
    """
    os.makedirs("video", exist_ok=True)
    start = time.time()
    resp = get_response(url)
    dics = []
    if resp is not None:
        video_dics = parse_videoid(resp)
        mov_dics = create_url(video_dics)
        # get_movie_url returns None for videos it cannot resolve; those
        # entries must not reach Down_movie.
        dics = [dic for dic in map(get_movie_url, mov_dics) if dic is not None]
    pool.map(Down_movie, dics)
    pool.close()
    pool.join()
    print('over', time.time() - start)


if __name__ == '__main__':
    # main() reads `pool` as a module-level global, so it must exist before the call.
    pool = Pool(6)  # Pool comes from multiprocessing.dummy; 6 workers
    main()

[Asm] 纯文本查看 复制代码

import   requests,time,re
from bs4  import  BeautifulSoup
import aiohttp
import aiofiles
import asyncio
"""
1.通过抓包工具分析视频网址   视频音频一般在Media中  找到视频url
https://huya-w10.huya.com/2119/505632739/1300/8bafda5b5bbace86a2edcb1cdb2da201.mp4
2.分析视频url的来源  通过截取视频url的后半段数据用全局搜索去搜 例如：8bafda5b5bbace86a2edcb1cdb2da201
找到视频url的来源          url_1 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124001977014823594203_1625483619844&videoId=505632739&_=1625483619852
通过多个视频分析来源url的变化url_2 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&uid=&_=1625484034830
?后面是请求携带的参数通过分析可以知道=1625483619852是时间戳可以通过python的time模块实现
&videoId  就是视频请求url的后半部分
jQuery112403179934484805591_1625484034808没搞明白 暂时不管试试
"""
#1.从列表页响应中获得每个视频的播放页url
#2.videoId+时间戳构建视频来源的url
#3.从视频来源的url中提取出视频的url地址
#4.请求视频url地址  持久化存储


# Target site: the list page that main() requests first
url = 'https://v.huya.com/g/all?set_id=31&order=hot&page=1'

# Headers sent with the requests made in this script (desktop Chrome user-agent)
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
#构建请求函数
def get_response(url):
    """Fetch ``url`` with the shared request headers.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The ``requests.Response`` when the server answers with status 200;
        ``None`` when the request fails or any other status is returned.
    """
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        resp = requests.get(url=url, headers=headers, timeout=30)
    except requests.RequestException:
        # Only request-level failures are expected here; anything else
        # (e.g. KeyboardInterrupt) should propagate instead of being hidden.
        print("请求失败")
        return None
    if resp.status_code != 200:
        return None
    # Let requests guess the charset from the body so resp.text decodes properly.
    resp.encoding = resp.apparent_encoding
    return resp

#解析每个视频的url
def parse_videoid(resp):
    """Extract the play-page URL and title of every video on a list page.

    Args:
        resp: Response of the list page (anything exposing ``.text``), or
            ``None`` when the request failed.

    Returns:
        A list of dicts with keys ``'video_url'`` (absolute play-page URL)
        and ``'video_name'`` (the anchor's ``title`` attribute). The list is
        empty when ``resp`` is ``None`` or the video list is not in the page.
    """
    if resp is None:
        return []
    # Naming the parser explicitly avoids the warning bs4 emits otherwise.
    soup = BeautifulSoup(resp.text, "lxml")
    video_list = soup.find('ul', "vhy-video-list w215 clearfix")
    if video_list is None:
        return []
    video_dics = []
    for li in video_list.find_all('li'):
        anchor = li.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # An item without a link cannot be turned into a play-page URL.
            continue
        video_dics.append({
            'video_url': 'https://v.huya.com' + href,
            'video_name': anchor.get('title'),
        })
    return video_dics

#构建来源url  的时间戳是13位  对时间戳进行构造
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124005236691164789575_1625490993380&videoId=525648071&_=1625490993386
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124040335487863006936_1625490632031&videoId=508624469&_=162549063203
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&_=1625484034830
def create_url(video_dics):
    """Build the getMomentContent API entry for each play-page entry.

    Args:
        video_dics: Iterable of dicts with ``'video_url'`` (play-page URL
            whose last path segment starts with the video id) and
            ``'video_name'``.

    Returns:
        A list of dicts with ``'video_url'`` set to the API URL built for
        that video id and ``'video_name'`` carried over unchanged.
    """
    def _to_api_entry(entry):
        page_url = entry['video_url']
        title = entry['video_name']
        # The id is the last path segment up to its first dot.
        video_id = page_url.rsplit('/', 1)[-1].partition('.')[0]
        # Millisecond timestamp, reused for both the callback suffix and `_`.
        url_time = int(time.time() * 1000)
        api_url = f'https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_{url_time}&videoId={video_id}&_={url_time}'
        return {'video_url': api_url, 'video_name': title}

    return [_to_api_entry(entry) for entry in video_dics]

#从来源url响应中提取出视频的url
def get_movie_url(mov_dic):
    """Resolve the direct media URL for one video from its API response.

    Args:
        mov_dic: Dict with ``'video_url'`` (API URL) and ``'video_name'``.

    Returns:
        A ``(movie_url, video_name)`` tuple where ``movie_url`` is the first
        matched URL; ``None`` when the request fails or the response contains
        no match, so callers must check before unpacking.
    """
    video_name = mov_dic['video_name']
    resp = get_response(mov_dic['video_url'])
    if resp is None:
        # get_response returns None on failure; there is nothing to parse.
        return None
    matches = re.findall(r',"url":"(?P<movie_url>.*?)"', resp.text)
    if not matches:
        return None
    return matches[0], video_name


async def Down_movie(url, name):
    """Download one video into the current working directory.

    Args:
        url: Direct media URL; the file extension is taken from its last
            path segment (any query string is ignored).
        name: Video title, used as the file name stem.

    Returns:
        None. Request and file errors are reported on stdout instead of
        being raised, so one bad entry does not affect the other downloads.
    """
    remote_file = url.split('/')[-1].split('?')[0]
    # Swap only the leading stem for the title; everything from the first
    # dot onwards (the extension) is kept as served.
    stem, dot, rest = remote_file.partition('.')
    # Titles may contain characters that are not valid in file names (path
    # separators in particular); fall back to the remote stem without a title.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name) if name else stem
    path1 = safe_name + dot + rest
    print(name, '************正在下载**********')
    try:
        async with aiohttp.ClientSession() as session:
            # session.get() is itself an async context manager; no extra
            # `await` is needed in front of it.
            async with session.get(url, headers=headers) as resp:
                resp.raise_for_status()
                async with aiofiles.open(path1, 'wb') as f:
                    # Write chunk by chunk instead of buffering the whole body.
                    async for chunk in resp.content.iter_chunked(64 * 1024):
                        await f.write(chunk)
    except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as err:
        print(name, '下载失败', err)
        return
    print(name, '下载完成')
    await asyncio.sleep(2)


def main():
    """Scrape the list page and download every resolvable video concurrently.

    Relies on the module-level ``url`` and on the module-level event loop
    ``loop`` that the ``__main__`` guard creates before calling this
    function. Prints ``over`` when finished.
    """
    resp = get_response(url)
    targets = []
    if resp is not None:
        video_dics = parse_videoid(resp)
        mov_dics = create_url(video_dics)
        # NOTE: get_movie_url goes through the blocking requests-based
        # get_response, so URL resolution runs one video at a time; only the
        # downloads below overlap.
        # get_movie_url returns None for unresolved videos; unpacking that
        # directly would raise TypeError, so drop those entries first.
        targets = [found for found in map(get_movie_url, mov_dics) if found is not None]

    async def _download_all():
        # The coroutines are scheduled from inside the running loop.
        # return_exceptions=True lets every download finish even if one fails.
        return await asyncio.gather(
            *(Down_movie(movie_url, video_name) for movie_url, video_name in targets),
            return_exceptions=True,
        )

    if targets:  # nothing to schedule for an empty batch
        for outcome in loop.run_until_complete(_download_all()):
            if isinstance(outcome, Exception):
                print('下载失败', outcome)
    print('over')


if __name__ == '__main__':
    # Create the loop explicitly: calling asyncio.get_event_loop() when no
    # loop exists is deprecated. main() reads `loop` as a module-level
    # global, so it must be assigned before the call.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        main()
    finally:
        loop.close()

ofo · 发表于 2021-7-5 23:12

这种不用API输出json格式的网站，一般称为LJ网站，看的兴趣都没有

股票亏损员 · 发表于 2021-7-5 23:29

没有啥东西，不感兴趣

列明 · 发表于 2021-7-5 23:43

視頻直接是用mp4存放傳輸的，
放到現在來說，
商用的話，
肯定是短視頻！
嗯，
短的我已經有很多了，
不感興趣了！

Spa495 · 发表于 2021-7-6 08:27

学习一下，没学懂

chenkeai深蓝 · 发表于 2021-7-6 09:18

可惜了看不懂

zhengxinjun · 发表于 2021-7-6 10:47

换个网址试试

key_user · 发表于 2021-7-6 10:49

这种不用API输出json格式的网站，一般称为LJ网站，看的兴趣都没有

tricky6 · 发表于 2021-7-6 11:56

看起来又是个不用API输出json格式的网站诶不用开异步吧

gentlespider · 发表于 2021-7-6 14:45

我怎么感觉调用这个函数，loop没传进去啊

帐号		自动登录	找回密码
密码			注册[Register]

[讨论] 爬小姐姐视频为啥异步开不起来问题在哪里呢

个人中心

[讨论] 爬小姐姐视频 为啥 异步开不起来 问题在哪里呢

个人中心

[讨论] 爬小姐姐视频为啥异步开不起来问题在哪里呢