吾爱破解 - 52pojie.cn

[Original Tool] 3.20 update, open source: a tool that uses an AI model to build an input-method dictionary from your own articles

858983646 posted on 2025-3-18 21:32; last edited by 858983646 on 2025-3-20 10:19

Rewritten around spaCy's zh_core_web_lg Chinese word-segmentation model, this tool converts the documents you have written over the years into a personalized input-method dictionary, speeding up your typing at work.
The output is plain txt and can be imported directly into Sogou input method.
I recommend saving the output with pinyin and counts included; you can then use the 深蓝词库转换 tool from this forum to convert it into dictionaries for other input methods, word frequencies included. Choose the custom format when importing: https://www.52pojie.cn/thread-1972180-1-1.html

How to use: open the program, enter the number of dictionary entries you want, load your own text files (TXT), wait for the output, then save it as a txt dictionary. Convert it with 深蓝词库转换 from this forum (see the screenshots for the conversion settings) and import the result into your input method.
Download: https://pan.baidu.com/s/1Vq_7Ej5ljL9csCh0XEJ5gg?pwd=hcty  extraction code: hcty
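For reference, when pinyin output is enabled, each saved dictionary line takes the form `pinyin word count`. Here is a minimal sketch of that formatting; it uses a hypothetical two-character pinyin table (`STUB_PINYIN`) in place of pypinyin's `lazy_pinyin` so it runs standalone:

```python
# Stub pinyin table standing in for pypinyin's lazy_pinyin.
# Assumption for this sketch: one syllable per character; the real
# tool looks syllables up with pypinyin instead.
STUB_PINYIN = {"你": "ni", "好": "hao"}

def format_entry(word, count):
    """Format one dictionary line as '<pinyin,pinyin,...> <word> <count>'."""
    py = ",".join(STUB_PINYIN.get(ch, ch) for ch in word)
    return f"{py} {word} {count}"

print(format_entry("你好", 12))  # → ni,hao 你好 12
```

深蓝词库转换 can then ingest lines in this shape through its custom-format import, as shown in the screenshots.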


[Python] source code:
import tkinter as tk
from tkinter import filedialog, messagebox
from collections import Counter
import re
import spacy
import threading
import os
import random
from pypinyin import pinyin, lazy_pinyin, Style
 
# Try to enable GPU acceleration
try:
    spacy.prefer_gpu()
    print("GPU 加速已启用")
except Exception as e:
    print(f"无法启用 GPU 加速: {str(e)}")

# Directory the program lives in
current_dir = os.path.dirname(os.path.abspath(__file__))

# Model folder (the model files are assumed to sit in a "models" subfolder)
model_folder = os.path.join(current_dir, "models", "zh_core_web_lg")

# Load the spaCy model
nlp = spacy.load(model_folder)
 
 
def split_text(text):
    """
    Split the text into segments on punctuation, dropping "、" and newlines.
    Segments longer than 2000 characters are further split on whitespace
    or "的"; if still too long, they are split at random positions.
    """
    # Chinese sentence-final punctuation (excluding "、" and newlines)
    punctuation = r"。!?;"  # the comma (,) is deliberately excluded

    # Split on the punctuation marks
    sentences = re.split(f"([{punctuation}])", text)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    # If a segment exceeds 2000 characters, split it further
    final_segments = []
    for segment in sentences:
        if len(segment) > 2000:
            # First try splitting on whitespace
            sub_segments = re.split(r'\s+', segment)
            if len(sub_segments) > 1:
                final_segments.extend(sub_segments)
            else:
                # Then try splitting on "的"
                sub_segments = re.split(r'的', segment)
                if len(sub_segments) > 1:
                    final_segments.extend(sub_segments)
                else:
                    # Fall back to splitting at random positions
                    # (start at 1 so no empty prefix is produced)
                    while len(segment) > 2000:
                        split_index = random.randint(1, len(segment) - 1)
                        final_segments.append(segment[:split_index])
                        segment = segment[split_index:]
                    final_segments.append(segment)
        else:
            final_segments.append(segment)

    # Report once splitting is done
    print(f"文本拆分完成!共拆分为 {len(final_segments)} 段。")
    return final_segments
 
 
def generate_all_combinations(tokens):
    """
    Generate every possible contiguous token combination
    (from single tokens up to the whole sentence).
    """
    combinations = []
    n = len(tokens)
    for start in range(n):
        for end in range(start + 1, n + 1):
            combinations.append("".join(tokens[start:end]))
    return combinations


def extract_phrases(doc):
    # Collect all tokens
    words = [token.text for token in doc]
    # Generate every contiguous combination
    return generate_all_combinations(words)
 
 
def count_words(text, top_n, progress_text, filter_symbols_entry):
    try:
        # Split the text on punctuation
        segments = split_text(text)
        all_words = []
        all_phrases = []

        # Tokenize each segment separately
        total_segments = len(segments)
        processed_segments = 0  # number of segments processed so far

        for idx, segment in enumerate(segments, start=1):
            doc = nlp(segment)
            # Collect words
            words = [token.text for token in doc if token.is_alpha]
            all_words.extend(words)
            # Collect candidate phrases
            phrases = extract_phrases(doc)
            # Strip punctuation from the start and end of each phrase
            cleaned_phrases = [re.sub(r'^[^\w\u4e00-\u9fa5]+|[^\w\u4e00-\u9fa5]+$', '', phrase) for phrase in phrases]
            cleaned_phrases = [phrase for phrase in cleaned_phrases if phrase]  # drop empty strings

            # Read the user-supplied filter symbols
            filter_symbols = filter_symbols_entry.get()

            # Regex matching phrases that contain none of the filter symbols or whitespace
            pattern = r'^[^' + re.escape(filter_symbols) + r'\s]*$'

            # Drop phrases containing any filter symbol
            cleaned_phrases = [phrase for phrase in cleaned_phrases if re.match(pattern, phrase)]

            all_phrases.extend(cleaned_phrases)
            processed_segments += 1  # one more segment done

            # Report progress as we go
            progress_text.insert(tk.END, f"正在处理段落 {idx}/{total_segments}...\n")
            progress_text.see(tk.END)

        # Filter special symbols from words (keep Chinese characters, letters, digits);
        # phrases were already cleaned above
        all_words = [word for word in all_words if re.search(r'[\u4e00-\u9fa5\w]', word)]

        # Count frequencies
        word_counts = Counter(all_words)
        phrase_counts = Counter(all_phrases)

        # Drop phrases that are identical to a counted word
        phrase_counts = Counter({phrase: count for phrase, count in phrase_counts.items() if phrase not in word_counts})

        # Report the final tally
        progress_text.insert(tk.END, f"模型处理了 {processed_segments} 段文本,总段落数为 {total_segments}。\n")
        progress_text.see(tk.END)

        return word_counts.most_common(top_n), phrase_counts.most_common(top_n)
    except Exception as e:
        messagebox.showerror("分词错误", f"分词过程出现异常: {str(e)}")
        return [], []
 
 
class RedirectText:
    def __init__(self, text_widget):
        self.text_space = text_widget
 
    def write(self, text):
        self.text_space.insert(tk.END, text)
        self.text_space.see(tk.END)
 
    def flush(self):
        pass
 
 
def load_files(filter_symbols_entry, include_count_var, include_pinyin_var):
    filenames = filedialog.askopenfilenames(
        filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
    )
    if not filenames:
        return

    # Read every selected file, trying UTF-8 first, then GBK
    all_text = ""
    for filename in filenames:
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                all_text += file.read() + "\n"
        except UnicodeDecodeError:
            try:
                with open(filename, 'r', encoding='gbk') as file:
                    all_text += file.read() + "\n"
            except Exception as e:
                messagebox.showerror("读取失败", f"无法读取文件 {filename}:\n{str(e)}")
                return

    # Clear the progress text box
    progress_text.delete(1.0, tk.END)

    # Run the processing in a worker thread so the UI does not freeze
    def process_text():
        try:
            top_n = int(top_n_entry.get())
            word_results, phrase_results = count_words(all_text, top_n, progress_text, filter_symbols_entry)
            word_text.delete(1.0, tk.END)
            phrase_text.delete(1.0, tk.END)
            for word, count in word_results:
                word_text.insert(tk.END, f'{word}: {count}\n')
            for phrase, count in phrase_results:
                phrase_text.insert(tk.END, f'{phrase}: {count}\n')
        except ValueError:
            messagebox.showerror("输入错误", "请输入有效的数字")

    # Start the worker thread
    threading.Thread(target=process_text).start()
 
 
def save_word_results(include_count_var, include_pinyin_var):
    filename = filedialog.asksaveasfilename(
        defaultextension=".txt",
        filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
    )
    if not filename:
        return

    # Parse the word pane and format each entry for saving
    word_result_text = word_text.get(1.0, tk.END).strip()
    lines = word_result_text.split('\n')
    words = []
    for line in lines:
        if line:
            parts = line.split(':')
            word = parts[0].strip()
            count = parts[1].strip() if len(parts) > 1 else '1'
            if include_pinyin_var.get():
                # Convert the word to pinyin, one syllable per character
                pinyin_list = lazy_pinyin(word, style=Style.NORMAL, errors='default')
                pinyin_str = ','.join(pinyin_list)
                words.append(f"{pinyin_str} {word} {count}")
            else:
                if include_count_var.get():
                    words.append(f"{word}: {count}")
                else:
                    words.append(word)

    try:
        with open(filename, 'w', encoding='utf-8') as file:
            file.write('\n'.join(words))
        messagebox.showinfo("保存成功", "单词已成功保存")
    except Exception as e:
        messagebox.showerror("保存失败", f"文件保存失败:\n{str(e)}")
 
 
def save_phrase_results(include_count_var, include_pinyin_var):
    filename = filedialog.asksaveasfilename(
        defaultextension=".txt",
        filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
    )
    if not filename:
        return

    # Parse the phrase pane and format each entry for saving
    phrase_result_text = phrase_text.get(1.0, tk.END).strip()
    lines = phrase_result_text.split('\n')
    phrases = []
    for line in lines:
        if line:
            parts = line.split(':')
            phrase = parts[0].strip()
            count = parts[1].strip() if len(parts) > 1 else '1'
            if include_pinyin_var.get():
                # Convert the phrase to pinyin, one syllable per character
                pinyin_list = lazy_pinyin(phrase, style=Style.NORMAL, errors='default')
                pinyin_str = ','.join(pinyin_list)
                phrases.append(f"{pinyin_str} {phrase} {count}")
            else:
                if include_count_var.get():
                    phrases.append(f"{phrase}: {count}")
                else:
                    phrases.append(phrase)

    try:
        with open(filename, 'w', encoding='utf-8') as file:
            file.write('\n'.join(phrases))
        messagebox.showinfo("保存成功", "短语已成功保存")
    except Exception as e:
        messagebox.showerror("保存失败", f"文件保存失败:\n{str(e)}")
 
 
def save_combined_results(include_count_var, include_pinyin_var):
    filename = filedialog.asksaveasfilename(
        defaultextension=".txt",
        filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
    )
    if not filename:
        return

    # Grab the word and phrase panes
    word_result_text = word_text.get(1.0, tk.END).strip()
    phrase_result_text = phrase_text.get(1.0, tk.END).strip()

    # Parse words and phrases back into counts
    word_lines = word_result_text.split('\n')
    phrase_lines = phrase_result_text.split('\n')

    word_counts = {}
    for line in word_lines:
        if line:
            parts = line.split(':')
            word = parts[0].strip()
            count = int(parts[1].strip()) if len(parts) > 1 else 1
            word_counts[word] = count

    phrase_counts = {}
    for line in phrase_lines:
        if line:
            parts = line.split(':')
            phrase = parts[0].strip()
            count = int(parts[1].strip()) if len(parts) > 1 else 1
            phrase_counts[phrase] = count

    # Merge and sort by count, descending
    combined = {}
    combined.update(word_counts)
    combined.update(phrase_counts)
    sorted_combined = sorted(combined.items(), key=lambda x: x[1], reverse=True)

    # Format the output lines
    lines = []
    for item, count in sorted_combined:
        if include_pinyin_var.get():
            pinyin_list = lazy_pinyin(item, style=Style.NORMAL, errors='default')
            pinyin_str = ','.join(pinyin_list)
            lines.append(f"{pinyin_str} {item} {count}")
        else:
            if include_count_var.get():
                lines.append(f"{item}: {count}")
            else:
                lines.append(item)

    # Save to file
    try:
        with open(filename, 'w', encoding='utf-8') as file:
            file.write('\n'.join(lines))
        messagebox.showinfo("保存成功", "单词和短语已成功合并并保存")
    except Exception as e:
        messagebox.showerror("保存失败", f"文件保存失败:\n{str(e)}")
 
 
# Build the GUI
root = tk.Tk()
root.title("个性化输入法词库生成器1.0-来自吾爱破解论坛")

# Text box showing processing progress
progress_frame = tk.LabelFrame(root, text="处理进度")
progress_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
progress_text = tk.Text(progress_frame, height=3, width=60, wrap=tk.WORD)
progress_scrollbar = tk.Scrollbar(progress_frame, command=progress_text.yview)
progress_text.config(yscrollcommand=progress_scrollbar.set)
progress_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
progress_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

# Redirect print output into the progress text box
redirect = RedirectText(progress_text)
print = redirect.write

# Word display area
word_frame = tk.LabelFrame(root, text="单词统计")
word_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
word_text = tk.Text(word_frame, height=20, width=40, wrap=tk.NONE)
word_scrollbar = tk.Scrollbar(word_frame, command=word_text.yview)
word_text.config(yscrollcommand=word_scrollbar.set)
word_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
word_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

# Phrase display area
phrase_frame = tk.LabelFrame(root, text="短语统计")
phrase_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
phrase_text = tk.Text(phrase_frame, height=20, width=40, wrap=tk.NONE)
phrase_scrollbar = tk.Scrollbar(phrase_frame, command=phrase_text.yview)
phrase_text.config(yscrollcommand=phrase_scrollbar.set)
phrase_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
phrase_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

# Control panel
control_frame = tk.Frame(root)
control_frame.pack(pady=10)

tk.Label(control_frame, text="显示数量:").pack(side=tk.LEFT)
top_n_entry = tk.Entry(control_frame, width=8)
top_n_entry.insert(0, "100")
top_n_entry.pack(side=tk.LEFT, padx=5)

# Entry for symbols to filter out of phrases
tk.Label(control_frame, text="短语过滤掉的符号:").pack(side=tk.LEFT, padx=5)
filter_symbols_entry = tk.Entry(control_frame, width=20)
filter_symbols_entry.insert(0, "、,;。?!:“”《》()()")  # default filter set
filter_symbols_entry.pack(side=tk.LEFT, padx=5)

# Checkboxes for output options
include_count_var = tk.BooleanVar()
include_count_var.set(False)
tk.Checkbutton(control_frame, text="保存时附带统计数量", variable=include_count_var).pack(side=tk.LEFT, padx=5)

include_pinyin_var = tk.BooleanVar()
include_pinyin_var.set(False)
tk.Checkbutton(control_frame, text="附带拼音", variable=include_pinyin_var).pack(side=tk.LEFT, padx=5)

tk.Button(control_frame, text="加载文件", command=lambda: load_files(filter_symbols_entry, include_count_var, include_pinyin_var)).pack(side=tk.LEFT, padx=5)
tk.Button(control_frame, text="保存单词", command=lambda: save_word_results(include_count_var, include_pinyin_var)).pack(side=tk.LEFT, padx=5)
tk.Button(control_frame, text="保存短语", command=lambda: save_phrase_results(include_count_var, include_pinyin_var)).pack(side=tk.LEFT, padx=5)
tk.Button(control_frame, text="合并去重保存", command=lambda: save_combined_results(include_count_var, include_pinyin_var)).pack(side=tk.LEFT, padx=5)

root.mainloop()
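The phrase-extraction core of the code (`generate_all_combinations`) simply joins every contiguous run of tokens into a candidate phrase. A self-contained sketch of the same idea:

```python
def all_contiguous_joins(tokens):
    """Join every contiguous run of tokens into one candidate phrase,
    from single tokens up to the whole segment."""
    return ["".join(tokens[i:j])
            for i in range(len(tokens))
            for j in range(i + 1, len(tokens) + 1)]

# 3 tokens yield 3*(3+1)/2 = 6 candidate phrases
print(all_contiguous_joins(["输入", "法", "词库"]))
# → ['输入', '输入法', '输入法词库', '法', '法词库', '词库']
```

Since n tokens produce n(n+1)/2 candidates, this grows quadratically, which is one reason the tool splits long segments before tokenizing.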


858983646 (OP) replied on 2025-3-19 15:00:
In a few days I'll post an update that uses a large language model for segmentation; the results are much better.

lxq8432 replied on 2025-3-18 21:41

xzx1022 replied on 2025-3-18 22:17 (last edited by xzx1022 on 2025-3-19 18:44):
Thanks for sharing, I'll give it a try.

lllone replied on 2025-3-18 22:28:
Thanks for sharing.

WQ52pojie replied on 2025-3-18 22:38:
Thanks for sharing.

manglang replied on 2025-3-18 23:40:
Is there any point in storing single-character words as dictionary entries?

Caipiao3645 replied on 2025-3-18 23:44:
Does this work on mobile?

MRXZ1994 replied on 2025-3-19 00:04:
Is there another mirror, e.g. Lanzou Cloud or 123pan?

开心长寿果 replied on 2025-3-19 00:18:
Thanks for the generous share!

52PJ070 replied on 2025-3-19 04:37:
This fills a genuinely useful need: you can use what you've typed in the past as the training data. Nice.