吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 6404|回复: 100
收起左侧

[原创工具] 一个简单的针对扫描版pdf压缩的软件

  [复制链接]
duskdust 发表于 2024-3-23 19:48
本帖最后由 duskdust 于 2024-7-8 19:17 编辑

初学py,想写个小文件方便自己对扫描版文字类pdf阅读
  • 使用pyinstaller对pypdf中减小pdf的功能进行打包,参见https://pypdf.readthedocs.io/en/stable/user/file-size.html
  • 将exe放在pdf所在的文件夹下运行;提供移除重复对象、压缩图片、二值化压缩等功能,其中图像二值化压缩对扫描版pdf的效果比较好
  • 直接可用的exe链接 https://penguin-a.lanzoue.com/iuqlC1zm2ych 备用链接 https://img.223886.xyz/cpdf240524.exe
[Python] 纯文本查看 复制代码
"""
压缩pdf, 保留标签
---
2024-05-24 对书签层级保留问题修复

---
"""
"""
压缩pdf, 保留标签
"""
import os
from pypdf import PdfReader, PdfWriter
from tqdm import tqdm
from PIL import Image, ImageEnhance
from io import BytesIO
 
def get_page_number_from_indirect(reader, indirect_ref):
    """Return the zero-based index of the page that ``indirect_ref`` points to.

    Args:
        reader: Object exposing a ``pages`` sequence (a pypdf ``PdfReader``
            in this script).
        indirect_ref: Page reference to look up, e.g. the ``/Page`` entry of
            an outline item.

    Returns:
        The index of the first page whose ``indirect_reference`` equals
        ``indirect_ref``, or ``None`` when no page matches.
    """
    for i, page in enumerate(reader.pages):
        # ``indirect_reference`` is the attribute name pypdf documents for
        # page objects. ``indirect_ref`` is the legacy spelling, which newer
        # pypdf releases deprecate and may no longer provide.
        if page.indirect_reference == indirect_ref:
            return i
    return None
 
def add_bookmarks_to_writer(writer, reader, outlines, parent=None):
    """Copy an outline (bookmark) tree into ``writer`` and keep its nesting.

    ``outlines`` is expected in the nested-list layout that pypdf's
    ``reader.outline`` produces: a list found in the sequence holds the
    children of the item that comes right before it.

    Args:
        writer: Target exposing ``add_outline_item(title, page_number, parent=...)``.
        reader: Source reader, used to turn each item's ``/Page`` entry into
            a page index.
        outlines: Outline items mixed with nested child lists.
        parent: Outline item under which this level is added; ``None`` adds
            the items at the top level.
    """
    # The most recently added item on this level becomes the parent of the
    # nested list that follows it. Tracking it locally (instead of a shared
    # module-level stack keyed on ``/Count``) keeps every level independent
    # and also handles items whose children are shown expanded.
    previous = None
    for item in outlines:
        if isinstance(item, list):
            # A child list with no preceding item stays under ``parent``.
            add_bookmarks_to_writer(
                writer,
                reader,
                item,
                parent if previous is None else previous,
            )
        else:
            title = item.get('/Title')
            page_num = get_page_number_from_indirect(reader, item.get('/Page'))
            previous = writer.add_outline_item(title, page_num, parent=parent)

def blacky(im):
    """Binarise ``im`` and return it re-opened from an in-memory TIFF.

    The image is converted to mode ``'L'``, run through the contrast,
    brightness, sharpness and colour enhancers, thresholded at 128 into
    mode ``'1'``, saved as a TIFF with ``compression='group4'`` and opened
    again from that buffer.
    """
    grey = im.convert('L')

    # Same enhancers, order and factors as before, applied one after another.
    # NOTE(review): the image is already greyscale when the colour step runs,
    # so that step presumably has no visible effect - confirm before removing.
    enhancement_steps = (
        (ImageEnhance.Contrast, 3),
        (ImageEnhance.Brightness, 1.5),
        (ImageEnhance.Sharpness, 2),
        (ImageEnhance.Color, 1.5),
    )
    for make_enhancer, factor in enhancement_steps:
        grey = make_enhancer(grey).enhance(factor)

    # Lookup table: grey levels below the threshold map to 0, the rest to 1.
    threshold = 128
    lookup = [0 if level < threshold else 1 for level in range(256)]
    bilevel = grey.point(lookup, '1')

    # Encode as a Group 4 (CCITT T.6) TIFF in memory and hand back the
    # re-opened image.
    tiff_bytes = BytesIO()
    bilevel.save(tiff_bytes, format="TIFF", compression='group4', optimize=True, dpi=[300, 300])
    return Image.open(tiff_bytes)
 
# List all PDF files in the current directory
pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]
for idx, file in enumerate(pdf_files):
    print(f"{idx}: {file}")

# User selects a PDF file by the index printed above
file_index = int(input("输入要压缩的pdf文件序号: "))
pdf_file = pdf_files[file_index]

# Options for reducing file size
print("选择压缩pdf的方式")
print("1: 删除重复对象")
print("2: 删除图像")
print("3: 降低图片质量")
print("4: 使用无损压缩")
print("5: 二值化压缩为tif")
choice = int(input("输入选择 (1-5): "))

reader = PdfReader(pdf_file)
writer = PdfWriter()

# Copy every page into the writer; all later steps operate on the writer.
for page in tqdm(reader.pages, desc="读取页面"):
    writer.add_page(page)

# Apply the chosen method
if choice == 1:
    # A PDF without document information yields ``None`` here, and passing
    # that to add_metadata would fail, so only copy metadata that exists.
    if reader.metadata is not None:
        writer.add_metadata(reader.metadata)
elif choice == 2:
    writer.remove_images()
elif choice == 3:
    # Re-encode every image at a fixed quality of 80.
    for page in tqdm(writer.pages, desc="压缩图像"):
        for img in page.images:
            img.replace(img.image, quality=80)
elif choice == 4:
    # Lossless compression of the page content streams.
    for page in tqdm(writer.pages, desc="写入页面"):
        # Warning: this has to be done on the writer, not the reader!
        page.compress_content_streams()  # This is CPU intensive!
elif choice == 5:
    # Replace every image with its binarised TIFF version.
    for page in tqdm(writer.pages, desc="写入页面"):
        for img in page.images:
            img.replace(blacky(img.image))

# Copy the bookmarks, then write the output file
print("写入书签中")
outlines = reader.outline
# Kept at module level: the bookmark helper may use it as a shared parent stack.
parents = []
add_bookmarks_to_writer(writer, reader, outlines)

output_file = "reduced_" + pdf_file
with open(output_file, "wb") as f:
    writer.write(f)

print(f"处理后的文件为 {output_file}")

"""

压缩效果较为明显

压缩效果较为明显

免费评分

参与人数 9吾爱币 +15 热心值 +4 收起 理由
帅帅的王 + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!
xbw98 + 1 + 1 我很赞同!
catoo1 + 1 谢谢@Thanks!
sirsunny + 1 感谢发布原创作品,吾爱破解论坛因你更精彩!
bfkeyi + 1 我很赞同!
amirfly + 1 我很赞同!
Lussering + 1 + 1 我很赞同!
zcyp0314 + 1 + 1 谢谢@Thanks!
风之暇想 + 7 + 1 感谢发布原创作品,吾爱破解论坛因你更精彩!

查看全部评分

本帖被以下淘专辑推荐:

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

858983646 发表于 2024-5-30 23:41
本帖最后由 858983646 于 2024-6-1 10:05 编辑

用ai改了下,可以多文件同时处理,但是就是编译exe后反复输入文件号,py代码直接运行就没这个问题,Python没学过搞不定了。求教大佬
[Asm] 纯文本查看 复制代码
import os
import concurrent.futures
from pypdf import PdfReader, PdfWriter
from tqdm import tqdm
from PIL import Image, ImageEnhance
from io import BytesIO

def blacky(im):
    """Convert ``im`` to a black-and-white image stored as a Group 4 TIFF.

    Steps, in order: convert to mode ``'L'``; enhance contrast (3),
    brightness (1.5), sharpness (2) and colour (1.5); threshold at 128 into
    mode ``'1'``; save to an in-memory TIFF with ``compression='group4'``;
    return the image opened from that buffer.
    """
    grey = im.convert('L')
    contrasted = ImageEnhance.Contrast(grey).enhance(3)
    brightened = ImageEnhance.Brightness(contrasted).enhance(1.5)
    sharpened = ImageEnhance.Sharpness(brightened).enhance(2)
    enhanced = ImageEnhance.Color(sharpened).enhance(1.5)

    # 256-entry lookup table: 0 for levels below the threshold, 1 otherwise.
    threshold = 128
    lookup = [0] * threshold + [1] * (256 - threshold)
    bilevel = enhanced.point(lookup, '1')

    tiff_bytes = BytesIO()
    bilevel.save(tiff_bytes, format="TIFF", compression='group4', optimize=True, dpi=[300, 300])
    return Image.open(tiff_bytes)

def get_page_number_from_indirect(reader, indirect_reference):
    """Find the zero-based index of the page matching ``indirect_reference``.

    Scans ``reader.pages`` in order and returns the index of the first page
    whose ``indirect_reference`` attribute equals the given value, or
    ``None`` when there is no such page.
    """
    matching_indices = (
        position
        for position, candidate in enumerate(reader.pages)
        if candidate.indirect_reference == indirect_reference
    )
    return next(matching_indices, None)
  
def add_bookmarks_to_writer(writer, reader, outlines, parent=None):
    """Copy an outline (bookmark) tree into ``writer`` and keep its nesting.

    ``outlines`` is expected in the nested-list layout that pypdf's
    ``reader.outline`` produces: a list found in the sequence holds the
    children of the item that comes right before it.

    Args:
        writer: Target exposing ``add_outline_item(title, page_number, parent=...)``.
        reader: Source reader, used to turn each item's ``/Page`` entry into
            a page index.
        outlines: Outline items mixed with nested child lists.
        parent: Outline item under which this level is added; ``None`` adds
            the items at the top level.
    """
    # The item added last on this level is the parent of the nested list
    # that follows it; passing ``parent`` itself down would attach the
    # children as siblings and flatten the hierarchy.
    previous = None
    for item in outlines:
        if isinstance(item, list):
            # A child list with no preceding item stays under ``parent``.
            add_bookmarks_to_writer(
                writer,
                reader,
                item,
                parent=parent if previous is None else previous,
            )
        else:
            title = item.get('/Title')
            indirect_reference = item.get('/Page')
            page_num = get_page_number_from_indirect(reader, indirect_reference)
            previous = writer.add_outline_item(title, page_num, parent=parent)
def process_pdf(file_index, choice, pdf_file, quality=None):
    """Shrink one PDF with the chosen method and save it as ``reduced_<name>.pdf``.

    Args:
        file_index: Index of the file in the caller's listing (not used here).
        choice: Method number. 1 copies the metadata, 2 removes images,
            3 re-encodes images at ``quality``, 4 compresses the page content
            streams, 5 binarises images with ``blacky``. Any other value
            applies no extra step.
        pdf_file: Path of the PDF to read.
        quality: Image quality handed to ``img.replace``; required for
            method 3.

    Every exception raised while processing is caught and reported with
    ``print``; nothing is re-raised to the caller.
    """
    try:
        source = PdfReader(pdf_file)
        target = PdfWriter()
        for source_page in tqdm(source.pages, desc=f"Processing {pdf_file}"):
            target.add_page(source_page)

        if choice == 1:
            info = source.metadata
            if info is None:
                print("没有元数据可以添加。")
            else:
                target.add_metadata(info)
        elif choice == 2:
            target.remove_images()
        elif choice == 3:
            # The check sits after the pages were copied, so a missing
            # quality is reported through the handler below.
            if quality is None:
                raise ValueError("图片质量值未提供")
            for out_page in tqdm(target.pages, desc=f"Compressing images {pdf_file}"):
                for picture in out_page.images:
                    picture.replace(picture.image, quality=quality)
        elif choice == 4:
            for out_page in tqdm(target.pages, desc=f"Applying lossless compression {pdf_file}"):
                out_page.compress_content_streams()
        elif choice == 5:
            for out_page in tqdm(target.pages, desc=f"Binarizing images {pdf_file}"):
                for picture in out_page.images:
                    picture.replace(blacky(picture.image))

        # Bookmarks are copied only when the source has a non-empty outline.
        bookmarks = source.outline
        if bookmarks:
            add_bookmarks_to_writer(target, source, bookmarks)

        stem, _extension = os.path.splitext(pdf_file)
        output_file = f"reduced_{stem}.pdf"
        with open(output_file, "wb") as out_handle:
            target.write(out_handle)
        print(f"Processed file saved as {output_file}")
    except Exception as error:
        print(f"处理文件 {pdf_file} 时发生错误:{error}")

def main():
    """Ask for files and a method, then process the chosen PDFs in parallel.

    Returns early when the current directory holds no ``.pdf`` files or when
    one of the prompts produced no usable value.
    """
    pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]
    if not pdf_files:
        print("当前目录下没有找到PDF文件。")
        return

    # Collect all user input up front, before any work is started.
    indices = input_indices(pdf_files)
    choice = input_choice()
    quality = input_quality(choice)

    # Do nothing at all if any of the answers is missing.
    if indices is None or choice is None or (choice == 3 and quality is None):
        return

    cpu_cores = os.cpu_count() or 1

    # One task per selected file, spread over a pool of worker processes.
    with concurrent.futures.ProcessPoolExecutor(max_workers=cpu_cores) as executor:
        # Remember which file each future belongs to, so a failure can be
        # reported against the right name. The loop variable of the
        # comprehension is not visible after it, so it cannot be used below.
        future_to_file = {
            executor.submit(
                process_pdf,
                idx,
                choice,
                pdf_files[idx],
                quality if choice == 3 else None,
            ): pdf_files[idx]
            for idx in indices
        }
        for future in concurrent.futures.as_completed(future_to_file):
            try:
                future.result()
            except Exception as exc:
                print(f"处理文件 {future_to_file[future]} 时发生错误:{exc}")

# List all PDF files in the current directory.
# Guarded so it only runs in the process that executes this script: worker
# processes started with the "spawn" method import the main module again
# under a different __name__, and would otherwise repeat the listing.
if __name__ == "__main__":
    pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]
    for idx, file in enumerate(pdf_files):
        print(f"{idx}: {file}")

def input_indices(pdf_files):
    """Prompt until the user enters a valid selection of PDF indices.

    Args:
        pdf_files: Candidate file names; valid indices are
            ``0 .. len(pdf_files) - 1``.

    Returns:
        A list of distinct indices in the order they were first entered.
        Entering ``all`` selects every index.
    """
    while True:
        selection = input("输入'all'以处理所有PDF文件,或者输入用逗号分隔的文件索引:").strip().lower()
        if selection == 'all':
            return list(range(len(pdf_files)))
        try:
            # Accept the full-width comma produced by Chinese input methods
            # as well as the ASCII one.
            indices = [int(token) for token in selection.replace(',', ',').split(',')]
        except ValueError:
            print("输入无效。请输入'all'或者用逗号分隔的索引。")
            continue
        if all(0 <= idx < len(pdf_files) for idx in indices):
            # Drop repeated indices (keeping first-seen order) so the same
            # file is never processed, and its output written, twice.
            return list(dict.fromkeys(indices))
        print("所有索引必须在0到{}之间。".format(len(pdf_files)-1))

def input_choice():
    """Show the method menu and prompt until a number from 1 to 5 is entered.

    Returns:
        The chosen method number as an ``int`` in the range 1-5.
    """
    menu_lines = (
        "选择压缩PDF的方式",
        "1: 删除重复对象",
        "2: 删除图像",
        "3: 降低图片质量",
        "4: 使用无损压缩",
        "5: 二值化压缩为tif",
    )
    while True:
        # The menu is shown again before every attempt.
        for line in menu_lines:
            print(line)
        answer = input("输入选择 (1-5): ")
        try:
            picked = int(answer)
        except ValueError:
            picked = None
        if picked is not None and 1 <= picked <= 5:
            return picked
        # Non-numeric and out-of-range answers get the same message.
        print("输入无效。请输入1到5之间的数字。")

def input_quality(choice):
    """Ask for an image quality when method 3 was chosen.

    Args:
        choice: The selected method number.

    Returns:
        An ``int`` from 1 to 100 when ``choice`` is 3 (prompting until a
        valid value is entered); ``None`` for every other choice.
    """
    if choice != 3:
        return None
    while True:
        answer = input("请输入图片质量(1-100):")
        # Only all-digit answers within 1-100 are accepted.
        accepted = answer.isdigit() and 1 <= int(answer) <= 100
        if accepted:
            return int(answer)
        print("输入错误,请重新输入图片质量(1-100)。")

if __name__ == "__main__":
    # main() starts worker processes. In a frozen Windows executable (for
    # example one built with PyInstaller) each worker re-runs the program
    # from the top unless freeze_support() is called first, which shows up
    # as the prompts being asked again and again. The call has no effect
    # when the script is run normally.
    import multiprocessing

    multiprocessing.freeze_support()
    main()
ianlcc 发表于 2024-3-29 11:25
duskdust 发表于 2024-3-29 10:19
pyinstaller打包的时候会有对应的错误吗?

不知道是不是我哪里没设置好…
D:\111\0612\123\0>Pyinstaller -F -w pdf2small.py
505 INFO: PyInstaller: 6.3.0
505 INFO: Python: 3.11.0
523 INFO: Platform: Windows-10-10.0.19045-SP0
525 INFO: wrote D:\111\0612\123\0\pdf2small.spec
531 INFO: Extending PYTHONPATH with paths
['D:\\111\\0612\\123\\0']
1034 INFO: checking Analysis
1034 INFO: Building Analysis because Analysis-00.toc is non existent
1034 INFO: Initializing module dependency graph...
1037 INFO: Caching module graph hooks...
1062 INFO: Analyzing base_library.zip ...
3199 INFO: Loading module hook 'hook-heapq.py' from 'C:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\PyInstaller\\hooks'...
3315 INFO: Loading module hook 'hook-encodings.py' from 'C:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\PyInstaller\\hooks'...
5663 INFO: Loading module hook 'hook-pickle.py' from 'C:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\PyInstaller\\hooks'...
7709 INFO: Caching module dependency graph...
7869 INFO: Running Analysis Analysis-00.toc
7869 INFO: Looking for Python shared library...
7901 INFO: Using Python shared library: C:\Users\Administrator\AppData\Local\Programs\Python\Python311\python311.dll
7901 INFO: Analyzing D:\111\0612\123\0\pdf2small.py
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Scripts\pyinstaller.exe\__main__.py", line 7, in <module>
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\PyInstaller\__main__.py", line 214, in _console_script_run
    run()
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\PyInstaller\__main__.py", line 198, in run
    run_build(pyi_config, spec_file, **vars(args))
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\PyInstaller\__main__.py", line 69, in run_build
    PyInstaller.building.build_main.main(pyi_config, spec_file, **kwargs)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\PyInstaller\building\build_main.py", line 1071, in main
    build(specfile, distpath, workpath, clean_build)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\PyInstaller\building\build_main.py", line 1011, in build
    exec(code, spec_namespace)
  File "D:\111\0612\123\0\pdf2small.spec", line 4, in <module>
    a = Analysis(
        ^^^^^^^^^
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\PyInstaller\building\build_main.py", line 470, in __init__
    self.__postinit__()
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\PyInstaller\building\datastruct.py", line 184, in __postinit__
    self.assemble()
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\PyInstaller\building\build_main.py", line 608, in assemble
    priority_scripts.append(self.graph.add_script(script))
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\PyInstaller\depend\analysis.py", line 268, in add_script
    self._top_script_node = super().add_script(pathname)
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\PyInstaller\lib\modulegraph\modulegraph.py", line 1153, in add_script
    contents = importlib.util.decode_source(contents)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<frozen importlib._bootstrap_external>", line 770, in decode_source
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc0 in position 5: invalid start byte
why110609 发表于 2024-3-24 18:24
jori 发表于 2024-3-25 00:09
我这64位win7打开程序怎么一闪就没自动关闭
hwiori 发表于 2024-3-25 00:28
马克一下,感谢分享
头像被屏蔽
sxzswx 发表于 2024-3-25 05:05
提示: 作者被禁止或删除 内容自动屏蔽
lx5012012 发表于 2024-3-25 06:44
对于以前旧的文件比较有用
David1000 发表于 2024-3-25 07:49
压缩效果不错
qdllss 发表于 2024-3-25 08:29
感谢感谢
LXWY2K 发表于 2024-3-25 09:49
效果真不错!!!
laochous 发表于 2024-3-25 10:24
感谢楼主分享,对节省空间有用
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2024-12-12 04:40

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表