好友
阅读权限 10
听众
最后登录 1970-1-1
查找资料时,利用浏览器插件:网页表格导出为 Excel 表格1.0,下载了好多网页格式的表格,为了提高效率,借助AI帮助,编写了合并表格的Python程序,经过调试,运行OK,为了使用方便,加了批处理。
===========批处理==================
@echo off
:: 将当前目录设置为批处理文件所在目录
cd /d "%~dp0"
:: 将 TableToExcel.xls 更名为 TableToExcel (0).xls
if exist "TableToExcel.xls" (
ren "TableToExcel.xls" "TableToExcel (0).xls"
) else (
echo 错误:未找到文件 TableToExcel.xls
)
python merge_HTML-2-xlsx.py
pause
===========Python 脚本程序=merge_HTML-2-xlsx.py=================
import sys
import os
import glob
import re
import traceback
import pandas as pd
from lxml import html
# 日志输出
log_file = open("merge_log.txt", "w", encoding="utf-8")
def log_print(msg):
print(msg)
log_file.write(msg + "\n")
log_file.flush()
log_print("=" * 60)
log_print("程序启动(通用:从参考文件读取列名,用户指定标题行数 n)")
log_print("=" * 60)
# ================= 用户输入 =================
try:
n = int(input("请输入每个源文件的标题行数(需跳过的行数)n = "))
log_print(f"用户指定标题行数: {n}")
except ValueError:
log_print("输入无效,必须为整数。程序退出。")
sys.exit(1)
# ================= 参考文件路径 =================
ref_file = "TableToExcel (0).xls"
if not os.path.exists(ref_file):
log_print(f"错误:参考文件 {ref_file} 不存在,请确保文件在当前目录。")
sys.exit(1)
# ================= 从参考文件读取列名(第一行) =================
try:
with open(ref_file, 'r', encoding='utf-8') as f:
content = f.read()
tree = html.fromstring(content)
tables = tree.xpath('//table')
if not tables:
log_print("参考文件中未找到表格,无法读取列名。")
sys.exit(1)
first_table = tables[0]
first_row = first_table.xpath('.//tr')[0]
cells = first_row.xpath('.//td|.//th')
column_names = [cell.text_content().strip() for cell in cells]
m = len(column_names)
log_print(f"从参考文件 {ref_file} 读取到 {m} 列: {column_names}")
except Exception as e:
log_print(f"读取参考文件失败: {e}")
traceback.print_exc()
sys.exit(1)
# ================= 查找所有 .xls 文件 =================
current_dir = os.getcwd()
log_print(f"当前工作目录: {current_dir}")
pattern = "TableToExcel (*).xls"
files = glob.glob(pattern)
log_print(f"找到 {len(files)} 个文件")
def extract_number(filename):
match = re.search(r'\((\d+)\)', filename)
return int(match.group(1)) if match else 9999
files_sorted = sorted(files, key=extract_number)
log_print("排序后的文件列表:")
for f in files_sorted:
log_print(f" {f}")
# ================= 读取并合并数据 =================
all_dfs = []
success_count = 0
for file in files_sorted:
try:
log_print(f"\n正在读取: {file}")
with open(file, 'r', encoding='utf-8') as f:
content = f.read()
tree = html.fromstring(content)
tables = tree.xpath('//table')
if not tables:
log_print(" 未找到表格,跳过")
continue
table = tables[0]
rows = table.xpath('.//tr')
total_rows = len(rows)
log_print(f" 表格总行数: {total_rows}")
if total_rows <= n:
log_print(f" 行数不足 {n+1},跳过")
continue
# 跳过前 n 行标题,取剩余所有行
data_rows = rows[n:]
data = []
for tr in data_rows:
cells = tr.xpath('.//td')
if not cells:
cells = tr.xpath('.//th')
# 只取前 m 列(与参考文件列数一致)
row_values = [cell.text_content().strip() for cell in cells[:m]]
# 如果该行所有单元格都为空,则跳过
if any(row_values):
data.append(row_values)
log_print(f" 提取的数据行数: {len(data)}")
# 转换为 DataFrame,使用从参考文件读取的列名
df = pd.DataFrame(data, columns=column_names)
df = df.astype(str).replace('nan', '').replace('None', '')
all_dfs.append(df)
success_count += 1
except Exception as e:
log_print(f" 读取失败: {e}")
log_print(traceback.format_exc())
if success_count == 0:
log_print("没有成功读取任何数据,退出")
input("按回车键退出...")
sys.exit(1)
log_print(f"\n成功读取 {success_count} 个文件")
# 合并数据
merged = pd.concat(all_dfs, ignore_index=True)
total_rows = len(merged)
log_print(f"合并后总数据行数: {total_rows}")
# 输出为 .xlsx 文件(只有一行标题)
output_file = "TableToExcel (all).xlsx"
log_print(f"正在写入文件: {output_file}")
try:
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
# 写入数据(从第2行开始,因为第1行是标题)
merged.to_excel(writer, sheet_name="TableToExcel (all)", index=False, header=False, startrow=1)
workbook = writer.book
worksheet = writer.sheets["TableToExcel (all)"]
# 写入标题行(第1行)
for col_idx, header in enumerate(column_names, start=1):
worksheet.cell(row=1, column=col_idx, value=header)
abs_path = os.path.abspath(output_file)
log_print(f"\n✅ 合并成功!")
log_print(f" 输出文件: {abs_path}")
log_print(f" 总数据行数: {total_rows}")
except Exception as e:
log_print(f"写入文件失败: {e}")
log_print(traceback.format_exc())
input("按回车键退出...")
sys.exit(1)
log_print("\n程序执行完毕")
log_file.close()
input("按回车键退出...")
免费评分
查看全部评分
发帖前要善用【论坛搜索 】 功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。