吾爱破解 - LCG - LSG |安卓破解|病毒分析|www.52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 4606|回复: 56
收起左侧

[Python 原创] 中小学教材下载

  [复制链接]
boychn 发表于 2023-9-27 20:52
本帖最后由 boychn 于 2023-9-27 21:04 编辑


[Python] 纯文本查看 复制代码
import concurrent.futures
import logging
import os
import queue
import sys

import requests


def init_logging():
    """Configure the root logger at INFO level with tab-separated fields.

    Each record shows the timestamp, level, logger name, thread name and
    message, in that order.
    """
    fields = (
        '%(asctime)s',
        '[%(levelname)s]',
        '[%(name)s]',
        '[%(threadName)s]',
        '%(message)s',
    )
    logging.basicConfig(level=logging.INFO, format='\t'.join(fields))


def request_data_version() -> [str]:
    """Fetch the list of catalogue data URLs from data_version.json.

    Returns:
        The entries of the comma-separated "urls" field of the JSON
        response, or an empty list when the request itself fails (so the
        caller can still iterate the result).

    Raises:
        SystemExit: with code -1 when the server answers with a status
            other than 200.
    """
    # Target URL
    data_url = "https://s-file-1.ykt.cbern.com.cn/zxx/ndrs/resources/tch_material/version/data_version.json"
    try:
        logging.info("start to request url: %s", data_url)
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(data_url, timeout=30)
        logging.info("status_code %s, body: %s", response.status_code, response.text)
        if response.status_code == 200:
            data = response.json()
            return data["urls"].split(",")
        else:
            logging.warning("could not get right response, system exit -1.")
            sys.exit(-1)
    except requests.exceptions.RequestException:
        # logging.exception already appends the active traceback; passing the
        # exception as an extra positional argument without a matching "%s"
        # makes the logging module report a formatting error instead.
        logging.exception("failed to request data_version")
        return []


def request_single_data_url(data_url: str) -> [dict]:
    """Fetch one catalogue data file and parse its entries into book dicts.

    Args:
        data_url: URL of a JSON document containing a list of book entries.

    Returns:
        A list of the non-empty dicts returned by parse_book_data. Entries
        for which parse_book_data returns an empty dict are skipped with a
        warning. An empty list is returned when the response status is not
        200 or the request fails, so the caller can always iterate the
        result.
    """
    try:
        logging.info("start to request url: %s", data_url)
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(data_url, timeout=30)
        logging.info("status_code %s", response.status_code)
        if response.status_code != 200:
            return []
        books = []
        for d in response.json():
            book_data = parse_book_data(d)
            if book_data:
                books.append(book_data)
            else:
                logging.warning("skip this book: %s", d['title'])
        return books
    except requests.exceptions.RequestException:
        # logging.exception records the traceback itself; an extra positional
        # argument without a "%s" placeholder breaks message formatting.
        logging.exception("failed to request single_data_url")
        return []


def parse_book_data(data: dict) -> dict:
    """Turn one raw catalogue entry into the dict used for downloading.

    Args:
        data: a catalogue entry providing the 'id', 'title' and 'tag_list'
            keys; each tag is a dict with a 'tag_name' key.

    Returns:
        {"id": ..., "name": ..., "dirs": ...} for a usable entry, or an
        empty dict when the entry should be skipped: it has fewer than the
        three tags needed to build the directory path, or its first tag
        after sorting does not contain '版'.
    """
    book_id = data['id']
    book_name = data['title']
    # Descending sort on the custom_sort_tag_list key; the key's first
    # component is the edition name, so a tag containing '版' sorts first.
    book_tags = sorted(data['tag_list'], key=custom_sort_tag_list, reverse=True)
    # get_dirs_from_tags indexes the first three tags; bail out instead of
    # raising IndexError on entries that carry fewer.
    if len(book_tags) < 3:
        return {}
    if '版' not in book_tags[0]['tag_name']:
        return {}
    return {"id": book_id, "name": book_name, "dirs": get_dirs_from_tags(book_tags)}


def get_dirs_from_tags(book_tags) -> str:
    """Build a relative directory path from the first three tag names.

    The names of tags 0, 1 and 2 are joined with '/', in that order.
    """
    first, second, third = (book_tags[index]['tag_name'] for index in range(3))
    return "{}/{}/{}".format(first, second, third)


def custom_sort_tag_list(tag):
    """Return the sort key for a tag dict.

    The key is the tuple (edition, school_level, grade, course_name,
    semester). At most one slot holds the tag's name, chosen by the first
    matching rule below; every other slot is an empty string.
    """
    label = tag['tag_name']
    if "版" in label:
        # Edition.
        return label, '', '', '', ''
    if "年级" in label:
        # Grade.
        return '', '', label, '', ''
    if "册" in label:
        # Semester volume.
        return '', '', '', '', label
    if any(level in label for level in ('小学', '初中', '高中')):
        # School level.
        return '', label, '', '', ''
    if '教材' in label:
        # Matches no category: all slots stay empty.
        return '', '', '', '', ''
    # Anything else is treated as the course name.
    return '', '', '', label, ''


def download_book(book):
    """Download one book's PDF to <download_base_dir>/<dirs>/<name>.pdf.

    Args:
        book: dict with the "id", "name" and "dirs" keys.

    Reads the module-level ``download_base_dir``. Network and file-system
    failures are logged rather than raised, so one bad book does not stop
    the calling worker.
    """
    file_dir = f"{download_base_dir}/{book['dirs']}"
    file_name = f"{book['name']}.pdf"
    file_full_path = os.path.join(file_dir, file_name)
    logging.info("start download book: %s/%s", file_dir, file_name)

    download_url = f"https://r1-ndr.ykt.cbern.com.cn/edu_product/esp/assets_document/{book['id']}.pkg/pdf.pdf"
    try:
        # exist_ok avoids the check-then-create race between worker threads
        # that target the same directory.
        os.makedirs(file_dir, exist_ok=True)
        # (connect, read) timeouts keep a stalled connection from blocking
        # the worker forever.
        response = requests.get(download_url, timeout=(10, 60))
        if response.status_code != 200:
            logging.warning("failed to download book, status_code: %s", response.status_code)
            return
        with open(file_full_path, "wb") as file:
            file.write(response.content)
    except (requests.exceptions.RequestException, OSError):
        logging.exception("failed to download book: %s", file_full_path)
        return
    logging.info("download book: %s successfully", file_full_path)


def multi_thread_download_books(shared_queue: queue.Queue):
    """Worker loop: download books from the queue until it is empty.

    Args:
        shared_queue: queue of book dicts to pass to download_book.

    A failure on one book is logged and the loop moves on to the next item;
    task_done() is called for every item taken, whether or not its download
    succeeded. The function returns once the queue reports empty.
    """
    while True:
        try:
            item = shared_queue.get_nowait()
        except queue.Empty:
            logging.warning("queue is empty")
            return
        try:
            download_book(item)
        except Exception:
            # Worker boundary: keep draining the queue instead of letting one
            # bad item end this thread.
            logging.exception("failed to process book: %s", item)
        finally:
            shared_queue.task_done()


def request_all_books_to_shared_queue():
    """Collect every book from every catalogue data URL into one queue.

    Returns:
        A queue.Queue holding the book dicts returned by
        request_single_data_url for each URL from request_data_version.
    """
    # Unbounded on purpose: this function fills the queue completely before
    # returning, so with a size limit put() would block forever as soon as
    # the catalogue holds more books than the limit.
    shared_queue = queue.Queue()
    # "or []" tolerates a helper returning None after a failed request.
    for url in request_data_version() or []:
        for book in request_single_data_url(url) or []:
            shared_queue.put(book)
    return shared_queue


if __name__ == '__main__':
    # Script entry point: build the full book queue, then drain it with a
    # pool of download workers.
    init_logging()

    # Folder that downloaded files are saved under (read by download_book
    # as a module-level global).
    download_base_dir = 'd:/book3'
    # Number of worker threads.
    thread_count = 8

    all_books_queue = request_all_books_to_shared_queue()
    with concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) as executor:
        futures = [executor.submit(multi_thread_download_books, all_books_queue) for _ in range(thread_count)]
        # Inspect every future: an exception raised inside a worker is stored
        # on its future and would otherwise vanish silently.
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                logging.error("download worker failed: %s", error, exc_info=error)

    logging.info("Done!!!")

免费评分

参与人数 3吾爱币 +9 热心值 +2 收起 理由
qiuweihg + 1 + 1 谢谢@Thanks!
春又来人已去 + 1 谢谢@Thanks!
苏紫方璇 + 7 + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!

查看全部评分

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

unpy 发表于 2023-10-4 18:15
本帖最后由 unpy 于 2023-10-4 18:21 编辑

def get_dirs_from_tags(book_tags) -> str:

改为:return f"{book_tags[1]['tag_name']}\\{book_tags[3]['tag_name']}\\{book_tags[0]['tag_name']}"

修改后的目录结构为:小初高\科目\版本\,同一版本下各年级的 PDF 放在一起(每个年级只有 2 本,不需要再分子目录)。

适合比如语文用统编,数学用aa版,英语用bb版的同学

免费评分

参与人数 1吾爱币 +1 热心值 +1 收起 理由
nanfangchezhan + 1 + 1 我很赞同!

查看全部评分

cn2jp 发表于 2023-9-27 23:00
用过UI版的,这个对懂程序的人来说是好东西,但对小白不算太友好。
baliao 发表于 2023-9-27 22:06
sht281 发表于 2023-9-27 22:28
不知如何用
小莫愁湖 发表于 2023-9-27 22:43
之前有人发过
jieyang0663 发表于 2023-9-27 23:01
学习试试看
wushishen 发表于 2023-9-27 23:26
不错,值得学习。
sxlh2311 发表于 2023-9-27 23:28
技术牛人,路过
Sogrey 发表于 2023-9-27 23:37
本帖最后由 Sogrey 于 2023-9-27 23:40 编辑

这个很有趣。我之前写过批量下载并写入文本信息的工具,这个则是批量下载图片和 PDF 文件。最好能简述一下用法,另外 URL 是从哪里获取的?
二小 发表于 2023-9-27 23:51
谢谢大哥
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则 警告:本版块禁止灌水或回复与主题无关内容,违者重罚!

快速回复 收藏帖子 返回列表 搜索

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2024-5-6 15:03

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表