[Python Original] Download the 经济日报 (Economic Daily) PDF

perlma, posted 2024-3-21 20:29
Last edited by perlma on 2024-3-21 20:44

1. This is my first post as a newcomer. It is based on 人民日报pdf电子版下载器v1.0.0 (a People's Daily PDF e-paper downloader) published on this forum by 嘻哈星星:
https://www.52pojie.cn/thread-1899708-1-1.html
(Source: 吾爱破解 forum)
2. The code is not packaged into an executable; you need to set up a Python environment yourself.
3. The PyMuPDF module is used to merge the PDF files.
4. Create the download directory before use, e.g. E:/downloads/epaper/经济日报 (a sketch for doing this in code follows the usage example below).
Download today's PDF:
download_jjrb()
Download the PDF for a specific date:
download_jjrb("20240311")
[Python]
# -*- coding: utf-8 -*-
import os

import random
from datetime import date
import time
from urllib.parse import urljoin
import re
import requests
import shutil

import fitz

from bs4 import BeautifulSoup


def datestr2ymdw(s: str):
    """
    Convert a date string: 20240318 -> 2024年3月18日 星期一
    Args:
        * s(str): "20240318"
    """
    year = int(s[:4])
    month = int(s[4:6])
    day = int(s[6:8])
    week = date(year, month, day).weekday()

    week_list = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]
    return f"{year}年{month}月{day}日 {week_list[week]}"


def remove_repeat(a):
    """去重: 剔除list重复元素"""
    # https://blog.csdn.net/zhuoqingjoking97298/article/details/116946704

    return list(dict.fromkeys(a))
    # from collections import OrderedDict
    # return list(OrderedDict.fromkeys(a))


def merge_pdfs(file_list, out_filename):
    """
    Merge PDF files using the PyMuPDF library.

    Reference: https://blog.csdn.net/winniezhang/article/details/132333475
    """
    pdf_merger = fitz.open()

    for filename in file_list:
        pdf = fitz.open(filename)
        pdf_merger.insert_pdf(pdf)
        pdf.close()  # release each source file after copying its pages

    pdf_merger.save(out_filename)
    pdf_merger.close()


# 经济日报 (Economic Daily) downloader
class Jjrb(object):
    def __init__(self, epaper_date: str = None):
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 SE 2.X MetaSr 1.0"
        }
        self.headers = headers

        url = "http://paper.ce.cn/pc/layout/"
        # HTML links to the e-paper layout pages
        urls = []
        if epaper_date is None:
            response = requests.get(url, headers=headers)
            response.encoding = "UTF-8"
            # print(response.text)

            soup = BeautifulSoup(response.text, features="lxml")
            # Narrow down to the layout list node
            soup = soup.find(name="ul", attrs={"id": "list"})
            allnode_of_a = soup.find_all("a")
            result = [_.get("href") for _ in allnode_of_a]

            # Derive epaper_date from the first layout link
            urlptn = r".*(\d{6})/(\d{2}).*\.html"
            m = re.match(urlptn, result[0])
            epaper_date = m.group(1) + m.group(2)

            for ul in result:
                urls.append(urljoin(url, ul))
        else:
            urls.append(urljoin(url, epaper_date[:6] + "/" + epaper_date[6:8] + "/node_01.html"))

        # Any single layout page already lists all of the day's PDF files
        url = urls[0]
        response = requests.get(url, headers=headers)
        response.encoding = "UTF-8"

        soup = BeautifulSoup(response.text, features="lxml")
        allvalue_of_node = soup.find_all("input")
        result = [_.get("value") for _ in allvalue_of_node]

        # Keep only the values that contain "attachment"
        urlptn = r".*attachment(.+)"
        urls = [re.match(urlptn, str(_)) for _ in result]  # regex match
        while None in urls:
            urls.remove(None)  # drop entries that did not match
        urls = [_.group() for _ in urls]  # group() returns the matched string

        # Absolute URLs of the per-page PDF files
        self.pdf_urls = [urljoin(url, _) for _ in urls]

        # File name of the merged PDF
        self.pdf_name = "经济日报 " + datestr2ymdw(epaper_date)
        print(f"[{self.pdf_name}] 共 {len(self.pdf_urls)} 个PDF文件需要合并")

    def save_pdf(self, download_path, pdf_href, pdf_detail_name):
        resp_download_pdf = requests.get(pdf_href, headers=self.headers)

        # Create the temp folder if it does not exist yet
        path = f"{download_path}/temp_file"
        if not os.path.exists(path):
            os.mkdir(path)

        with open(f"{download_path}/temp_file/{pdf_detail_name}", mode="wb") as f:
            f.write(resp_download_pdf.content)
        print(f"{pdf_detail_name} 下载完成")

    def download_single_pdf_file(self, download_path):
        self.pdf_files = []
        for url in self.pdf_urls:
            num = random.randint(1, 3)
            print(f"{url}, 随机暂停时间:{num}秒")
            pdf_detail_name = os.path.basename(url)
            self.save_pdf(download_path, url, pdf_detail_name)
            self.pdf_files.append(f"{download_path}/temp_file/" + pdf_detail_name)
            time.sleep(num)

    def download(self):
        save_path = "E:/downloads/epaper/经济日报"
        self.download_single_pdf_file(save_path)

        # self.pdf_files already holds the full paths of the downloaded per-page files
        file_list = self.pdf_files

        out_filename = f"{save_path}/{self.pdf_name}.pdf"
        merge_pdfs(file_list, out_filename)
        if os.path.exists(f"{save_path}/temp_file"):
            shutil.rmtree(f"{save_path}/temp_file")
        print(f"下载已完成:{save_path}")


def download_jjrb(epaper_date: str = None):
    """
    Download the 经济日报 (Economic Daily) e-paper.

    Args:
        * epaper_date(str): 20240318
    """

    epaper = Jjrb(epaper_date)
    epaper.download()

# Download today's edition
download_jjrb()
# Download a specific date
download_jjrb("20240311")


perlma (OP), posted 2024-3-22 11:35
Last edited by perlma on 2024-3-22 11:40

A variant of the same script, adapted for 吉林日报 (Jilin Daily):

[Python]
# -*- coding: utf-8 -*-
import os

import random
from datetime import date
import time
from urllib.parse import urljoin
import re
import requests
import shutil

import fitz

from bs4 import BeautifulSoup


def datestr2ymdw(s: str):
    """
    Convert a date string: 20240318 -> 2024年3月18日 星期一
    Args:
        * s(str): "20240318"
    """
    year = int(s[:4])
    month = int(s[4:6])
    day = int(s[6:8])
    week = date(year, month, day).weekday()

    week_list = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]
    return f"{year}年{month}月{day}日 {week_list[week]}"

def merge_pdfs(file_list, out_filename):
    """
    Merge PDF files using the PyMuPDF library.

    Reference: https://blog.csdn.net/winniezhang/article/details/132333475
    """
    pdf_merger = fitz.open()

    for filename in file_list:
        try:
            pdf = fitz.open(filename)
            pdf_merger.insert_pdf(pdf)
            pdf.close()  # release each source file after copying its pages
        except Exception as e:
            print(e)
            continue

    try:
        pdf_merger.save(out_filename)
    except Exception as e:
        print(e)

    pdf_merger.close()


# 吉林日报 (Jilin Daily) downloader
class Jlrb(object):
    def __init__(self, epaper_date: str = None):
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 SE 2.X MetaSr 1.0"
        }
        self.headers = headers

        # HTML links to the e-paper layout pages
        html_urls = []
        if epaper_date is None:
            url = "http://jlrbszb.dajilin.com/pc/paper/layout/index.html"
            response = requests.get(url, headers=headers)
            response.encoding = "UTF-8"

            soup = BeautifulSoup(response.text, features="lxml")
            # Narrow down to the layout list node
            soup = soup.find(name="ul", attrs={"id": "list"})
            allnode_of_a = soup.find_all("a")
            result = [_.get("href") for _ in allnode_of_a]

            # Derive epaper_date from the first layout link
            urlptn = r".*(\d{6})/(\d{2}).*\.html"
            m = re.match(urlptn, result[0])
            epaper_date = m.group(1) + m.group(2)

            for ul in result:
                html_urls.append(urljoin(url, ul))
        else:
            url = "http://jlrbszb.dajilin.com/pc/paper/layout/" + epaper_date[:6] + "/" + epaper_date[6:8] + "/node_01.html"
            response = requests.get(url, headers=headers)
            response.encoding = "UTF-8"

            soup = BeautifulSoup(response.text, features="lxml")
            # Narrow down to the layout list node
            soup = soup.find(name="ul", attrs={"id": "layoutlist"})
            allnode_of_a = soup.find_all("a")
            result = [_.get("href") for _ in allnode_of_a]

            for ul in result:
                html_urls.append(urljoin(url, ul))

        self.pdf_urls = []
        for url_temp in html_urls:
            response = requests.get(url_temp, headers=headers)
            response.encoding = "UTF-8"

            soup = BeautifulSoup(response.text, features="lxml")
            # Narrow down to the node that carries the PDF path
            soup = soup.find(name="p", attrs={"id": "pdfUrl"})
            # Absolute URL of this page's PDF file
            self.pdf_urls.append(urljoin(url_temp, soup.text))

        # File name of the merged PDF
        self.pdf_name = "吉林日报 " + datestr2ymdw(epaper_date)
        print(f"[{self.pdf_name}] 共 {len(self.pdf_urls)} 个PDF文件需要合并")

    def save_pdf(self, download_path, pdf_href, pdf_detail_name):
        resp_download_pdf = requests.get(pdf_href, headers=self.headers)

        # Create the temp folder if it does not exist yet
        path = f"{download_path}/temp_file"
        if not os.path.exists(path):
            os.mkdir(path)

        with open(f"{download_path}/temp_file/{pdf_detail_name}", mode="wb") as f:
            f.write(resp_download_pdf.content)
        print(f"{pdf_detail_name} 下载完成")

    def download_single_pdf_file(self, download_path):
        self.pdf_files = []
        for url in self.pdf_urls:
            num = random.randint(1, 3)
            print(f"{url}, 随机暂停时间:{num}秒")
            pdf_detail_name = os.path.basename(url)
            self.save_pdf(download_path, url, pdf_detail_name)
            self.pdf_files.append(f"{download_path}/temp_file/" + pdf_detail_name)
            time.sleep(num)

    def download(self):
        save_path = "E:/downloads/epaper/吉林日报"
        self.download_single_pdf_file(save_path)

        # self.pdf_files already holds the full paths of the downloaded per-page files
        file_list = self.pdf_files

        out_filename = f"{save_path}/{self.pdf_name}.pdf"
        merge_pdfs(file_list, out_filename)
        if os.path.exists(f"{save_path}/temp_file"):
            shutil.rmtree(f"{save_path}/temp_file")
        print(f"下载已完成:{save_path}")


def download_jlrb(epaper_date: str = None):
    """
    Download the 吉林日报 (Jilin Daily) e-paper.

    Args:
        * epaper_date(str): 20240318
    """

    epaper = Jlrb(epaper_date)
    epaper.download()

# Download today's edition
download_jlrb()
# Download a specific date
download_jlrb("20240311")

snakenba580, posted 2024-3-21 22:45
Works very well, thanks for sharing.
飞鸟热, posted 2024-3-22 00:19
Could the OP or one of the experts please package this into an executable?
abmabmabm, posted 2024-3-22 00:25
Thanks for sharing; handy for downloading the paper to read every day.
DaBengui1, posted 2024-3-22 03:32
Thanks for sharing.
3969, posted 2024-3-22 06:28
Newspapers are in decline; these days it is all about fast consumption. Sigh...
raindrop00, posted 2024-3-22 06:43
Thanks for sharing.
FlaSh2023, posted 2024-3-22 07:22
Many thanks...