import pdfplumber
import re
def merge_split_cells(table):
    """Fold continuation rows of an extracted table into the row above them.

    The first row is always kept. Every later row is kept as its own row
    unless at least one of its cells is a non-empty string consisting only
    of whitespace (``None`` and ``''`` cells are ignored by this check).
    A row that fails the check is treated as a continuation: each of its
    cells that contains visible text is merged into the same column of the
    most recently kept row -- replacing that cell when it is ``None`` or
    blank, otherwise appended to it after a single space.

    Args:
        table: Iterable of rows; each row is a sequence whose cells are
            ``str`` or ``None``.

    Returns:
        A new list of new row lists. The rows passed in are never modified.
    """
    merged_table = []
    for i, row in enumerate(table):
        if i == 0 or all(cell.strip() for cell in row if cell):
            # Copy the row so later merges never write into the caller's data.
            merged_table.append(list(row))
            continue

        # Row 0 is always appended, so a previous row exists at this point.
        target = merged_table[-1]
        for j, cell in enumerate(row):
            if not (cell and cell.strip()):
                continue
            if j >= len(target):
                # The continuation row is wider than the row it extends:
                # pad with empty cells instead of raising IndexError.
                target.extend([None] * (j + 1 - len(target)))
            existing = target[j]
            if existing is None or existing.strip() == '':
                target[j] = cell
            else:
                target[j] = existing + ' ' + cell
    return merged_table
def process_pdf(pdf_path, txt_path):
    """Write the text and tables of every page of a PDF to a UTF-8 text file.

    For each page, in order: the page's extracted text (if any) followed by
    a blank line, then every extracted table with one line per row (cells
    joined by a single space) and a blank line after each table, and
    finally a separator line marking the end of the page.

    Args:
        pdf_path: Path of the PDF to read; passed to ``pdfplumber.open``.
        txt_path: Path of the text file to create or overwrite.
    """
    # Raw-string pattern (a plain '\s' is an invalid escape sequence in a
    # normal string literal), compiled once instead of per cell.
    whitespace = re.compile(r'\s+')

    with pdfplumber.open(pdf_path) as pdf:
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    txt_file.write(text + '\n\n')

                # Process the tables found on this page.
                for pdf_table in page.extract_tables():
                    for row in merge_split_cells(pdf_table):
                        # Collapse whitespace runs inside each cell so a row
                        # always ends up on a single output line; None cells
                        # become empty strings.
                        cleaned_row = [
                            '' if cell is None
                            else whitespace.sub(' ', cell).strip()
                            for cell in row
                        ]
                        txt_file.write(' '.join(cleaned_row) + '\n')
                    txt_file.write('\n')  # blank line between tables

                # Separator line between pages.
                txt_file.write('---------- 分割线 ----------\n\n')
# Script section: convert one hard-coded PDF into a text file.
# Both locations are absolute paths on drive D:; the output file is
# overwritten if it already exists.
pdf_path = 'D:/A股年报/新建文件夹/1219702294.PDF'
txt_path = 'D:/A股年报/新建文件夹/result.txt'
process_pdf(pdf_path=pdf_path, txt_path=txt_path)