[Study Notes] Grabbing links from 人人电影网 with Python

OP
鸠山一茶 posted on 2023-8-8 21:15
This post was last edited by 鸠山一茶 on 2023-8-9 00:44

I have a habit of collecting films and TV shows, and quite a while ago I came across 人人电影网 (https://www.rrdynb.com/).
Saving the links one by one takes far too long, so I wrote two Python scripts to collect them automatically.
First, let's analyze the site.
The detail pages are very simple and can be requested directly by URL, for example:
https://www.rrdynb.com/dianshiju/2023/0607/35586.html
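A quick sanity check (a minimal sketch; the 'movie-txt' class that holds the download links is the one used in the full script below) confirms that such a detail page is plain static HTML and can be fetched with a single request:
[Python]
import requests
from bs4 import BeautifulSoup

# Fetch one detail page directly and inspect the block that holds the download links
url = "https://www.rrdynb.com/dianshiju/2023/0607/35586.html"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")
movie_txt = soup.find(class_="movie-txt")  # section containing the network-drive links
if movie_txt is not None:
    for a in movie_txt.find_all("a", href=True):
        print(a["href"])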

Based on that, the first script below walks a list of detail-page URLs (read from a JSON file) and writes the network-drive links found on each page to a CSV file:
[Python]
import requests
import pandas as pd
import os
import logging
import json
import random
import time
from bs4 import BeautifulSoup

# Logger setup
logging.basicConfig(filename='extractor.log', encoding="utf-8", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

DOWNLOAD_METHODS = ["阿里网盘", "夸克网盘", "迅雷云盘", "百度网盘"]

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3 Edge/16.16299",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    # Add more User-Agent strings here as needed
]


def extract_html_content_with_class(url, class_name):
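    """Fetch url with a randomly chosen User-Agent and return the first element with class_name, or None if it is missing or the request fails."""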

    user_agent = random.choice(USER_AGENTS)
    headers = {"User-Agent": user_agent}

    try:
        response = requests.get(url, headers=headers, timeout=30)  # 30-second timeout so a dead connection cannot hang the run
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.find(class_=class_name)
    except Exception as e:
        logger.error(f"获取网址 {url} 的内容时出现错误:{e}")
        return None


def modified_extract_links(html_content):
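    """Scan every <a href> in the snippet and return the four network-drive links in DOWNLOAD_METHODS order (None where missing)."""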
    soup = BeautifulSoup(html_content, 'html.parser')
    resources = soup.find_all('a', href=True)

    links_dict = {
        "阿里网盘链接": None,
        "夸克网盘链接": None,
        "迅雷云盘链接": None,
        "百度网盘链接": None
    }

    for resource in resources:
        link = resource['href']
        if "aliyundrive.com" in link:
            links_dict["阿里网盘链接"] = link
        elif "pan.quark.cn" in link:
            links_dict["夸克网盘链接"] = link
        elif "pan.xunlei.com" in link:
            links_dict["迅雷云盘链接"] = link
        elif "pan.baidu.com" in link:
            links_dict["百度网盘链接"] = link

    return [links_dict.get(method + "链接", None) for method in DOWNLOAD_METHODS]


def write_links_to_csv(filename, movie_name, data):
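    """Append one row (page URL plus the four links) to filename, writing the header only if the file does not exist yet."""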
    data_row = [movie_name] + data
    df = pd.DataFrame([data_row], columns=['网址链接', '阿里云盘链接', '夸克网盘链接', '迅雷云盘链接', '百度网盘链接'])
    try:
        df.to_csv(filename, index=False, encoding='utf-8-sig', mode='a', header=(not os.path.exists(filename)))
    except Exception as e:
        logger.error(f"写入CSV文件时出现错误:{e}")


if __name__ == "__main__":
    input_file = input("请输入JSON文件路径:")
    csv_filename = os.path.splitext(os.path.basename(input_file))[0] + ".csv"

    start_index = 0
    if os.path.exists("breakpoint.txt"):
        with open("breakpoint.txt", "r") as bp:
            start_index = int(bp.read().strip())

    with open(input_file, 'r') as file:
        urls = json.load(file)

    for index, url in enumerate(urls[start_index:]):
        logger.info(f"开始处理网址:{url}")
        movie_txt_content = extract_html_content_with_class(url, 'movie-txt')
        if movie_txt_content:
            download_links = modified_extract_links(str(movie_txt_content))
            if download_links:
                write_links_to_csv(csv_filename, url, download_links)
            else:
                logger.warning("未找到下载链接。")
        else:
            logger.warning("未找到带有'movie-txt' class的HTML内容。")

        # Randomly pause between requests
        if random.randint(1, 10) <= 3:  # 30% chance to pause after a URL
            sleep_time = random.randint(5, 15)
            logger.info(f"随机暂停{sleep_time}秒")
            time.sleep(sleep_time)

        # Checkpoint file: record the index of the next URL to process so a restart does not repeat this one
        with open("breakpoint.txt", "w") as bp:
            bp.write(str(index + start_index + 1))

However, these detail-page URLs follow no obvious pattern, which makes fully automated crawling awkward.
Digging through the site, I found that it provides four list sections that together index every detail page.
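The list pages follow a simple URL pattern, /<category-slug>/list_<n>_<page>.html; the slugs and list numbers below are the ones used in the crawler script that follows:
[Python]
# The four list sections and their list numbers on the site
category_list = [
    ("dongman", "13"),
    ("movie", "2"),
    ("dianshiju", "6"),
    ("zongyi", "10"),
]

# As an illustration, print pages 1 and 2 of each list; page N of the movie list lives at /movie/list_2_N.html
for slug, listnum in category_list:
    for page in range(1, 3):
        print(f"https://www.rrdynb.com/{slug}/list_{listnum}_{page}.html")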

So, based on those four lists, a second script can first collect every detail-page URL and save them to a JSON file, which the first script then works through. The code is shown below:
[Python]
import requests
import random
import time
import json
import logging
import os
from bs4 import BeautifulSoup
from datetime import datetime

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3 Edge/16.16299",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    # Add more User-Agent strings here as needed
]

# Configure the logging module to write its log file in UTF-8
log_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'crawler.log')
logging.basicConfig(filename=log_file_path, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', encoding='utf-8')

def get_target_links(category, start_page=1):
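    """Walk the paginated list of one category from start_page on and return (detail_page_links, last_page_fetched)."""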
    category_list = [
        ("dongman", "13"),
        ("movie", "2"),
        ("dianshiju", "6"),
        ("zongyi", "10"),
    ]

    # Make sure the chosen category is within the valid range
    if not (1 <= category <= len(category_list)):
        logging.warning("无效的选项。")
        return [], start_page - 1

    # Look up the category slug and its list number on the site
    selected_category, selected_listnum = category_list[category - 1]

    base_url = f"https://www.rrdynb.com/{selected_category}/list_{selected_listnum}_"
    page_number = start_page
    target_links = []

    while True:
        url = f"{base_url}{page_number}.html"

        # Pick a random User-Agent for this request
        user_agent = random.choice(USER_AGENTS)
        headers = {"User-Agent": user_agent}

        logging.info(f"正在获取第{page_number}页...")
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            logging.info("成功连接,正在抓取...")
            soup = BeautifulSoup(response.text, "html.parser")
            # Collect every detail-page link on this list page, not just the first match
            for link_tag in soup.find_all("a", class_="movie-thumbnails", href=True):
                target_links.append(link_tag["href"])
            page_number += 1
            time.sleep(3)  # Pause briefly between pages
        elif response.status_code == 404:
            logging.info(f"已获取所有页面,共{page_number - 1}页。")
            break
        else:
            logging.warning(f"获取第{page_number}页失败。5秒后重试...")
            time.sleep(5)  # Retry after 5 seconds
            continue

    return target_links, page_number - 1  # also report the last list page that was fetched

def save_to_json(target_links, category):
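    """Save the collected links to a dated JSON file named after the chosen category."""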
    categories = {
        1: "动漫",
        2: "电影",
        3: "电视剧",
        4: "老电影",
    }

    if category not in categories:
        logging.warning("无效的选项。")
        return

    today = datetime.now().strftime("%Y%m%d")
    file_name = f"{categories[category]}_{today}.json"
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name)
    with open(file_path, "w", encoding="utf-8") as json_file:
        json.dump(target_links, json_file, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    print("1. 动漫")
    print("2. 电影")
    print("3. 电视剧")
    print("4. 老电影")

    selected_category = int(input("请选择一个选项:"))

    logging.info(f"用户选择了选项 {selected_category}")

    checkpoint_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'checkpoint.txt')
    try:
        with open(checkpoint_file_path, 'r') as checkpoint_file:
            # The checkpoint line looks like "<last_page>_<YYYYMMDD>"; only the leading page number is read back
            start_page = int(checkpoint_file.readline().split("_")[0])
    except (FileNotFoundError, ValueError):
        start_page = 1

    target_links, last_page = get_target_links(selected_category, start_page)
    if target_links:
        today = datetime.now().strftime("%Y%m%d")
        with open(checkpoint_file_path, 'w') as checkpoint_file:
            checkpoint_file.write(f"{last_page}_{today}")  # record the last list page that was fetched
            logging.info("断点已保存。")

        logging.info("目标链接列表:")
        for link in target_links:
            logging.info(link)

        # Prepend https://www.rrdynb.com to the relative paths and save the whole batch to one JSON file
        full_links = [f"https://www.rrdynb.com{link}" for link in target_links]
        save_to_json(full_links, selected_category)

        logging.info(f"共{len(target_links)}条链接已保存到JSON文件当中。")
    else:
        logging.warning("未找到目标链接。")

With these two scripts, the links can be collected successfully.
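Before handing the JSON file to the first script, it is easy to sanity-check what the crawler produced. A minimal sketch (the file name is only an example of what save_to_json() would generate on a given date):
[Python]
import json

# Load the file written by save_to_json() and show a few entries
with open("电影_20230809.json", "r", encoding="utf-8") as f:
    urls = json.load(f)

print(f"{len(urls)} links collected")
for url in urls[:5]:
    print(url)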

Below is a sample of what was collected.

I have already crawled the relevant files myself; to avoid putting extra load on the site's server, I have packed the results and shared them below:
https://www.123pan.com/s/oNv9-7yJ2.html (extraction code: 52pj)

2#
52soft posted on 2023-8-8 21:41
Good stuff to learn from.
3#
吖力锅 posted on 2023-8-8 22:26
4#
5cfsz posted on 2023-8-8 22:32
5#
Blackberry01 posted on 2023-8-8 22:44
Thanks for sharing.
6#
nanqian posted on 2023-8-9 00:12
Nicely done, thanks for sharing.
7#
kabin posted on 2023-8-9 08:49
Thanks for sharing, and for packing everything up, very convenient.
8#
skzhaixing posted on 2023-8-9 08:56
Learning from this, a nice approach.
9#
Ylvan posted on 2023-8-9 09:00
Bookmarked, will study it when I have time!
10#
acs posted on 2023-8-9 09:00
Learned from the code.