# Python script (the original first line was a forum code-widget header: "view as plain text / copy code").
from PyPDF2 import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from io import BytesIO
import re
import os
import pandas as pd
def clean_filename(filename):
    """Replace characters that are unsafe in a file name with underscores.

    Each backslash, slash, ``*``, ``?``, ``:``, double quote, ``<``, ``>``,
    ``|`` and each ASCII control character (code points 0-31, which covers
    newline, carriage return and tab) is replaced by one ``_``.

    :param filename: candidate file name; path separators are replaced too,
        so this must be a single name component, not a full path
    :return: the sanitized name (same length as the input)
    """
    # The control-character range matters for multi-line watermark names and
    # for spreadsheet cells that carry "\r\n" line breaks.
    return re.sub(r'[\\/*?:"<>|\x00-\x1f]', '_', filename)
def create_watermark(name, page_width=992, page_height=792, font_size=20, alpha=0.15, gap=150):
    """Build a one-page PDF tiled with a rotated, semi-transparent text stamp.

    The stamp is repeated on a square grid that starts two grid steps outside
    the page on every side, and each copy is rotated by 45 degrees.

    :param name: stamp text; ``'\\n'`` separates lines, which are drawn top to
        bottom and centred as a group on each grid point
    :param page_width: overlay page width in points
    :param page_height: overlay page height in points
    :param font_size: font size in points; line spacing is 1.2x this value
    :param alpha: fill opacity passed to the canvas fill colour
    :param gap: distance between neighbouring stamps in points (truncated to
        an integer)
    :return: a ``PdfReader`` over the in-memory PDF; the overlay is page 0
    :raises ValueError: if ``gap`` truncates to zero or a negative number
    """
    step = int(gap)  # range() needs an integer step
    if step <= 0:
        raise ValueError("gap must be a positive number of points")

    packet = BytesIO()
    can = canvas.Canvas(packet, pagesize=(page_width, page_height))

    # Try the Windows SimSun font first so Chinese text can be drawn; on any
    # failure (font file missing, not on Windows, ...) fall back to Helvetica.
    try:
        from reportlab.pdfbase.ttfonts import TTFont
        from reportlab.pdfbase import pdfmetrics
        pdfmetrics.registerFont(TTFont('SimSun', 'C:/Windows/Fonts/simsun.ttc'))
        can.setFont("SimSun", font_size)
    except Exception:
        # Deliberately not a bare except: Ctrl-C / SystemExit must propagate.
        can.setFont("Helvetica", font_size)
    can.setFillColorRGB(0.1, 0.1, 0.1, alpha=alpha)

    # Text layout is identical for every stamp, so compute it once.
    lines = name.split('\n')
    line_height = font_size * 1.2
    first_line_y = (len(lines) - 1) * line_height / 2

    x_vals = range(-step * 2, int(page_width + step * 2), step)
    y_vals = range(-step * 2, int(page_height + step * 2), step)

    for x in x_vals:
        for y in y_vals:
            # Isolate the translate/rotate of this stamp from the next one.
            can.saveState()
            can.translate(x, y)
            can.rotate(45)
            for i, line in enumerate(lines):
                # The canvas y axis points up, so later lines need smaller y
                # values to appear below the first line.
                can.drawCentredString(0, first_line_y - i * line_height, line)
            can.restoreState()

    can.save()
    packet.seek(0)
    return PdfReader(packet)
def add_watermark(input_pdf_path, output_pdf_path, watermark_name):
    """Stamp every page of a PDF with a tiled text watermark and save a copy.

    The overlay is generated to match each page's media box, so pages larger
    than the default overlay size are still covered edge to edge. Pages that
    share a size reuse the same overlay.

    :param input_pdf_path: path of the PDF to read
    :param output_pdf_path: path the watermarked PDF is written to
    :param watermark_name: watermark text (may contain ``'\\n'`` line breaks)
    """
    original = PdfReader(input_pdf_path)
    writer = PdfWriter()

    overlays = {}  # (width, height) in points -> overlay page
    for page in original.pages:
        box = page.mediabox
        # float() normalises the library's numeric objects so they can be
        # used as a dict key and passed to the canvas.
        size = (float(box.width), float(box.height))
        overlay = overlays.get(size)
        if overlay is None:
            overlay = create_watermark(
                watermark_name, page_width=size[0], page_height=size[1]
            ).pages[0]
            overlays[size] = overlay
        page.merge_page(overlay)
        writer.add_page(page)

    with open(output_pdf_path, "wb") as f:
        writer.write(f)
def read_recipients_from_excel(excel_path, sheet_name=0, name_column='姓名', org_column='部门', additional_columns=None):
    """Load the recipient list from an Excel sheet.

    Rows whose name cell is empty or whitespace-only are skipped instead of
    producing a recipient called ``"nan"``.

    :param excel_path: path of the workbook (passed to ``pandas.read_excel``),
        or an already-loaded ``DataFrame``, in which case ``sheet_name`` is
        ignored
    :param sheet_name: sheet name or index
    :param name_column: header of the column holding the recipient name
    :param org_column: header of the column holding the organisation or
        department; if the column is absent or the cell is empty the
        recipient is filed under ``'未分类'``
    :param additional_columns: a column header, or a list of headers, whose
        non-empty cell values are appended to the name, one per line;
        headers that do not exist in the sheet are ignored
    :return: list of ``{'name': ..., 'org': ...}`` dicts in sheet order
    :raises KeyError: if ``name_column`` is not present in the sheet
    """
    if isinstance(excel_path, pd.DataFrame):
        df = excel_path
    else:
        df = pd.read_excel(excel_path, sheet_name=sheet_name)

    if name_column not in df.columns:
        raise KeyError(f"Excel中未找到姓名列: {name_column}")

    # A bare string would otherwise be iterated character by character.
    if additional_columns is None:
        extra_columns = []
    elif isinstance(additional_columns, str):
        extra_columns = [additional_columns]
    else:
        extra_columns = list(additional_columns)
    # Column presence does not change per row, so resolve it once.
    extra_columns = [col for col in extra_columns if col in df.columns]
    has_org = org_column in df.columns

    recipients = []
    for _, row in df.iterrows():
        raw_name = row[name_column]
        if pd.isna(raw_name) or not str(raw_name).strip():
            continue  # blank row / missing name

        parts = [str(raw_name)]
        parts.extend(str(row[col]) for col in extra_columns if not pd.isna(row[col]))

        if has_org and not pd.isna(row[org_column]):
            org = str(row[org_column])
        else:
            org = '未分类'

        recipients.append({
            'name': '\n'.join(parts),
            'org': org
        })
    return recipients
def batch_add_watermarks(input_pdf_path, recipients, output_base_dir="watermarked_pdfs"):
    """Create one watermarked copy of a PDF per recipient, grouped by organisation.

    Each organisation gets a sub-folder of ``output_base_dir`` and each
    recipient a file named ``watermarked_<name>.pdf`` inside it. When two
    recipients would map to the same file, later ones get a ``_2``, ``_3``,
    ... suffix instead of overwriting the earlier PDF.

    :param input_pdf_path: path of the source PDF
    :param recipients: list of dicts, each with a ``'name'`` and an ``'org'`` key
    :param output_base_dir: root output directory (created if missing)
    """
    os.makedirs(output_base_dir, exist_ok=True)

    # Group recipients by organisation, keeping first-seen order.
    org_groups = {}
    for recipient in recipients:
        org_groups.setdefault(recipient['org'], []).append(recipient)

    # Paths already written in this run; normcase makes the comparison
    # case-insensitive on platforms whose file systems are.
    used_paths = set()

    for org, members in org_groups.items():
        org_dir = os.path.join(output_base_dir, clean_filename(org))
        os.makedirs(org_dir, exist_ok=True)

        for member in members:
            name = member['name']
            stem = f"watermarked_{clean_filename(name)}"
            output_pdf = os.path.join(org_dir, f"{stem}.pdf")
            suffix = 2
            while os.path.normcase(output_pdf) in used_paths:
                output_pdf = os.path.join(org_dir, f"{stem}_{suffix}.pdf")
                suffix += 1
            used_paths.add(os.path.normcase(output_pdf))

            add_watermark(input_pdf_path, output_pdf, name)
            print(f"生成: {output_pdf}")

    print(f"\n所有带水印的PDF文件已生成在 {output_base_dir} 目录中")
    print(f"共生成 {len(recipients)} 个PDF文件,分布在 {len(org_groups)} 个组织文件夹中")
if __name__ == "__main__":
    # Example run: read the recipients from recipients.xlsx, then watermark
    # source.pdf once per recipient. Any failure prints the error followed by
    # a short usage guide.
    try:
        # The workbook is expected to have a "姓名" and a "部门" column; the
        # "备注" column, when present, is appended to the watermark text.
        recipients = read_recipients_from_excel(
            'recipients.xlsx',
            name_column='姓名',
            org_column='部门',
            additional_columns=['备注'],
        )
        # Replace this with the path of your own source PDF.
        input_pdf = "source.pdf"
        if os.path.exists(input_pdf):
            batch_add_watermarks(input_pdf, recipients)
        else:
            for message in (
                f"错误:未找到文件 {input_pdf}",
                "请将原始PDF文件放在当前目录下,并命名为 source.pdf",
            ):
                print(message)
    except Exception as exc:
        usage_lines = (
            f"发生错误: {exc}",
            "\n使用说明:",
            "1. 确保存在recipients.xlsx文件,包含'姓名'列和'部门'列",
            "2. 确保存在source.pdf文件",
            "3. 安装依赖:pip install PyPDF2 reportlab pandas openpyxl",
        )
        for usage_line in usage_lines:
            print(usage_line)