python-将英文文章切割为单词，并添加发音和释义

justfly99 · 发表于 2023-7-10 17:23

本帖最后由 justfly99 于 2023-7-12 10:37 编辑

20230712，进行了小修改，主要是以下2个：
1、支持选择多个文件进行转换
2、将单个字符和2个字符的单词给过滤了，如a,at等。

成品地址(源码在压缩包里)：https://wwch.lanzoul.com/iPQBt124yqmf
PS：这次解压后，执行的是 article2words v1.01.exe。

经历多次Chatgpt后的成品，虽然图形界面有点丑，不过也懒得弄了。
不足：有些带时态或复数的单词，无法添加发音和释义的。

图片.png

成品见这里：https://wwch.lanzoul.com/in1Zg11uejab
解压后执行main.exe即可。

源码见这里(如果有人优化了，希望也能给我一份优化后的代码)：

[Python] 纯文本查看 复制代码

001

002

003

004

005

006

007

008

009

010

011

012

013

014

015

016

017

018

019

020

021

022

023

024

025

026

027

028

029

030

031

032

033

034

035

036

037

038

039

040

041

042

043

044

045

046

047

048

049

050

051

052

053

054

055

056

057

058

059

060

061

062

063

064

065

066

067

068

069

070

071

072

073

074

075

076

077

078

079

080

081

082

083

084

085

086

087

088

089

090

091

092

093

094

095

096

097

098

099

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox
from functools import partial
from threading import Thread
import pandas as pd
import re
import requests
from lxml import etree
from openpyxl import load_workbook
from concurrent.futures import ThreadPoolExecutor
from openpyxl.styles import Font, NamedStyle
 
 
def get_word_info(word, timeout=10):
    """Look up *word* on the Youdao web dictionary page and scrape its entry.

    Args:
        word: The word to look up.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(British_pronunciation, American_pronunciation, paraphrase)``
        tuple of strings, or ``None`` when the request fails or the page does
        not contain the expected pronunciation elements.
    """
    # Local import keeps this function self-contained; the word is escaped so
    # that characters such as '/', '?' or '#' cannot alter the request path.
    from urllib.parse import quote

    escaped_word = quote(word, safe='')
    url = f'https://www.youdao.com/w/eng/{escaped_word}'

    try:
        # Without a timeout a stalled connection would block this worker
        # (and therefore the whole export) indefinitely.
        data = requests.get(url, timeout=timeout).text
        html = etree.HTML(data)
        # Indexing with [0] raises IndexError when the page has no such
        # element; that case is reported as "not found" below.
        British_pronunciation = html.xpath('//*[@id="phrsListTab"]/h2/div/span[1]/span/text()')[0]
        American_pronunciation = html.xpath('//*[@id="phrsListTab"]/h2/div/span[2]/span/text()')[0]

        # If several <ul> nodes match, the text of the last one is kept.
        paraphrase = ""
        for ul_element in html.xpath('//*[@id="phrsListTab"]/div/ul'):
            paraphrase = ''.join(ul_element.xpath('.//text()'))
        return British_pronunciation, American_pronunciation, paraphrase
    except Exception as e:
        # Best-effort lookup: this runs inside pool workers and callers treat
        # None as "no data", so every failure is logged and swallowed here.
        print(e, word)
        return None
 
 
def _read_text(file_path):
    """Return the contents of *file_path* decoded as text.

    UTF-8 (with or without a BOM) is tried first. If the bytes are not valid
    UTF-8 the file is re-read with the platform default encoding, replacing
    undecodable bytes so a single stray byte cannot abort the whole run.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return file.read()
    except UnicodeDecodeError:
        with open(file_path, 'r', errors='replace') as file:
            return file.read()


def _extract_words(content):
    """Return the sorted, de-duplicated, lower-cased words found in *content*.

    Tokens are runs of word characters and apostrophes, so surrounding
    punctuation never sticks to a word. Tokens that still contain an
    apostrophe after trimming quotes (contractions), CJK characters or digits
    are dropped.
    """
    tokens = {token.lower().strip("'’") for token in re.findall(r"[\w'’]+", content)}
    return sorted(
        word for word in tokens
        if word
        and "'" not in word
        and '’' not in word
        and not re.search(r'[\u4e00-\u9fff]', word)
        and not re.search(r'\d', word)
    )


def _output_path_for(file_path):
    """Return *file_path* with its extension replaced by ``.xlsx``."""
    import os

    # Only the final extension is swapped; a plain str.replace would also
    # rewrite '.txt' inside directory names and would return the input path
    # unchanged (overwriting the source file) for any other extension.
    return os.path.splitext(file_path)[0] + '.xlsx'


def _candidate_stems(word):
    """Return possible base forms of an inflected *word*, most likely first."""
    if word.endswith('ing'):
        stems = [word[:-3], word[:-3] + 'e']               # walking / making
    elif word.endswith('ied'):
        stems = [word[:-1], word[:-3] + 'y']               # tied / studied
    elif word.endswith('ed'):
        stems = [word[:-1], word[:-2]]                     # liked / walked
    elif word.endswith('ies'):
        stems = [word[:-1], word[:-3] + 'y']               # movies / studies
    elif word.endswith('es'):
        stems = [word[:-1], word[:-2]]                     # likes / boxes
    elif word.endswith('s'):
        stems = [word[:-1]]                                # words
    else:
        stems = []

    unique_stems = []
    for stem in stems:
        if stem and stem != word and stem not in unique_stems:
            unique_stems.append(stem)
    return unique_stems


def process_text_file(file_path):
    """Split the text file into words and export them, annotated, to Excel.

    The words are written to an ``.xlsx`` file next to *file_path* (same base
    name). For each word the British pronunciation, American pronunciation
    and paraphrase returned by ``get_word_info`` are added in columns 2-4.
    A message box is shown once the workbook has been saved.

    Args:
        file_path: Path of the text file to process.
    """
    filtered_words = _extract_words(_read_text(file_path))

    # Export the word list to Excel
    df = pd.DataFrame(filtered_words, columns=['Words'])
    output_file = _output_path_for(file_path)
    df.to_excel(output_file, index=False)

    # Re-open the workbook to add the extra columns
    workbook = load_workbook(output_file)
    worksheet = workbook.active
    worksheet.cell(row=1, column=2, value="British_pronunciation")
    worksheet.cell(row=1, column=3, value="American_pronunciation")
    worksheet.cell(row=1, column=4, value="paraphrase")

    # Create a bold style and apply it to the new header cells
    bold_style = NamedStyle(name="bold_style")
    bold_style.font = Font(bold=True)
    for header_column in (2, 3, 4):
        worksheet.cell(row=1, column=header_column).style = bold_style

    # Look the words up concurrently with a thread pool
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(get_word_info, word) for word in filtered_words]

        # Data rows start at row 2 and follow the order of filtered_words,
        # so the word is taken from the list rather than read back from the
        # sheet (an empty cell would come back as None).
        for row_index, (word, future) in enumerate(zip(filtered_words, futures), start=2):
            word_info = future.result()

            if not word_info:
                # No entry for the word as written: retry with likely base
                # forms (plural / past tense / -ing removed).
                for stem in _candidate_stems(word):
                    word_info = get_word_info(stem)
                    if word_info:
                        break

            if word_info:
                British_pronunciation, American_pronunciation, paraphrase = word_info
                worksheet.cell(row=row_index, column=2).value = British_pronunciation
                worksheet.cell(row=row_index, column=3).value = American_pronunciation
                worksheet.cell(row=row_index, column=4).value = paraphrase

    # Save the modified Excel file
    workbook.save(output_file)
    messagebox.showinfo('Success', 'Process completed successfully.')
 
 
def browse_file(file_entry):
    """Ask the user for a ``.txt`` file and put its path into *file_entry*.

    The entry is left untouched when the dialog is cancelled.
    """
    chosen_path = filedialog.askopenfilename(filetypes=[('Text Files', '*.txt')])
    if not chosen_path:
        return

    # Replace whatever the entry currently shows with the chosen path.
    file_entry.delete(0, tk.END)
    file_entry.insert(tk.END, chosen_path)
 
 
def execute_function(file_entry):
    """Start processing the file named in *file_entry* on a background thread.

    Shows an error box and returns if the entry is empty. Otherwise the
    Execute button is disabled while ``process_text_file`` runs and enabled
    again when it finishes, whether it succeeded or failed; a failure is
    reported in an error box instead of being lost with the worker thread.
    """
    file_path = file_entry.get()
    if not file_path:
        messagebox.showerror('Error', 'Please select a file.')
        return

    def schedule_on_ui(callback):
        # Hand the widget update over to the Tk event loop. If the window
        # has already been closed there is nothing left to update.
        try:
            execute_button.after(0, callback)
        except (tk.TclError, RuntimeError):
            pass

    def run_processing():
        try:
            process_text_file(file_path)
        except Exception as exc:
            # Build the text now: `exc` is unbound once this block exits.
            message = f'Processing failed: {exc}'
            schedule_on_ui(lambda: messagebox.showerror('Error', message))
        finally:
            # Without this the button stays disabled and the application
            # cannot process another file until it is restarted.
            schedule_on_ui(lambda: execute_button.config(state=tk.NORMAL))

    execute_button.config(state=tk.DISABLED)
    thread = Thread(target=run_processing)
    thread.start()
 
 
# Build the top-level application window.
window = tk.Tk()
window.title('英文文章切割为单词 V1.0')
window.configure(bg='sky blue')

# File selection widgets, packed top to bottom: caption, path entry, Browse.
file_label = tk.Label(master=window, text='Select a text file:', bg='sky blue')
file_label.pack()

file_entry = tk.Entry(master=window, width=50)
file_entry.pack()

browse_button = tk.Button(
    master=window,
    text='Browse',
    command=lambda: browse_file(file_entry),
)
browse_button.pack()

# Execute button: starts processing of the file shown in the entry.
# execute_function refers to this widget through the global name.
execute_button = tk.Button(
    master=window,
    text='Execute',
    command=lambda: execute_function(file_entry),
)
execute_button.pack()

# Hand control to the Tk event loop; returns when the window is closed.
window.mainloop()

justfly99 · 发表于 2023-7-12 15:17

szair 发表于 2023-7-12 15:07
问一下，以后接口会不会到某天就失效了呢？希望持续更新，谢谢

实话实说，依靠别人不如自己学会修改。代码里调用的是有道的接口，涉及的代码就几行，用 etree 和 xpath 定位到页面上的具体位置。这两块的知识点其实不多，网上找点文章学习一下，很快就可以上手。

url = f'https://www.youdao.com/w/eng/{word}'

try:
      paraphrase = ""
      data = requests.get(url).text
      html = etree.HTML(data)
      British_pronunciation = html.xpath('//*[@id="phrsListTab"]/h2/div/span[1]/span/text()')[0]
      American_pronunciation = html.xpath('//*[@id="phrsListTab"]/h2/div/span[2]/span/text()')[0]
      li_elements = html.xpath('//*[@id="phrsListTab"]/div/ul')
      for li in li_elements:
         paraphrase = ''.join(li.xpath('.//text()'))
      return British_pronunciation, American_pronunciation, paraphrase

justfly99 · 发表于 2023-7-10 20:25

毋庸讳言发表于 2023-7-10 19:47
这个怎么使用呢？

下载后，双击 main.exe，然后点击 Browse，选择带有英文文章的 txt 文件，再点击 Execute，之后在 txt 文件所在的目录下找到与 txt 文件同名的 excel 文件，打开即可。

gaogao0000 · 发表于 2023-7-10 17:32

楼主厉害,加油

ckchen123 · 发表于 2023-7-10 17:33

楼主厉害,加油

zhuvsyue · 发表于 2023-7-10 17:36

666哇

jxcllj · 发表于 2023-7-10 17:38

谢谢楼主分享。

markhoo911 · 发表于 2023-7-10 17:42

我靠，这个不错啊，挺实用的

Lcx丶 · 发表于 2023-7-10 17:51

楼主厉害,加油

word11 · 发表于 2023-7-10 17:53

感谢大哥的分享，内容很精彩

m1250674036 · 发表于 2023-7-10 18:25

不错不错，感谢分享收藏备用

blindcat · 发表于 2023-7-10 18:29

跟楼主学习学习

帐号		自动登录	找回密码
密码			注册[Register]

[原创工具] python-将英文文章切割为单词，并添加发音和释义

免费评分

免费评分