import
tkinter as tk
from
tkinter
import
filedialog, messagebox
from
collections
import
Counter
import
re
import
spacy
import
threading
import
os
import
random
from
pypinyin
import
pinyin, lazy_pinyin, Style
# Try to run spaCy on the GPU.  ``spacy.prefer_gpu()`` signals "no GPU" through
# its boolean return value rather than by raising, so the result is checked
# explicitly instead of announcing success unconditionally.
try:
    gpu_enabled = spacy.prefer_gpu()
except Exception as e:
    gpu_enabled = False
    print(f"无法启用 GPU 加速: {str(e)}")
else:
    if gpu_enabled:
        print("GPU 加速已启用")
    else:
        print("未检测到可用 GPU,将使用 CPU 运行")

# The model is loaded from the ``models/zh_core_web_lg`` folder located next
# to this script.
current_dir = os.path.dirname(os.path.abspath(__file__))
model_folder = os.path.join(current_dir, "models", "zh_core_web_lg")
nlp = spacy.load(model_folder)
def _split_long_segment(segment, max_length):
    """Break one over-long segment into pieces of at most *max_length* chars."""
    pieces = [segment]
    # Prefer natural boundaries: first whitespace, then after each "的".
    # The second pattern keeps "的" attached to the preceding text so that no
    # character of the input is lost.
    for pattern in (r"\S+", r"[^的]*的|[^的]+"):
        refined = []
        for piece in pieces:
            if len(piece) > max_length:
                refined.extend(re.findall(pattern, piece))
            else:
                refined.append(piece)
        pieces = refined

    # Last resort: deterministic fixed-size slicing of whatever is still too
    # long.  Pieces already within the limit come through as a single slice.
    chunks = []
    for piece in pieces:
        chunks.extend(
            piece[start:start + max_length]
            for start in range(0, len(piece), max_length)
        )
    return chunks


def split_text(text, max_length=2000):
    """Split *text* into segments no longer than *max_length* characters.

    The text is first cut at sentence-ending punctuation (both full-width and
    ASCII forms of ``! ? ;`` plus ``。``); each punctuation mark becomes its
    own segment.  Segments are stripped and empty ones are dropped.  A segment
    that is still longer than *max_length* is split on whitespace, then after
    each "的", and finally sliced into fixed-size chunks.

    Args:
        text: The text to split.
        max_length: Maximum length of a returned segment; must be >= 1.

    Returns:
        A list of non-empty strings in their original order.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    stripped = (part.strip() for part in re.split("([。!?;!?;])", text))
    sentences = [part for part in stripped if part]

    final_segments = []
    for segment in sentences:
        if len(segment) > max_length:
            final_segments.extend(_split_long_segment(segment, max_length))
        else:
            final_segments.append(segment)

    print(f"文本拆分完成!共拆分为 {len(final_segments)} 段。")
    return final_segments
def generate_all_combinations(tokens):
    """Return every contiguous run of *tokens*, each joined into one string.

    Runs are ordered by start index and, for the same start, by increasing
    length.  An empty input yields an empty list.
    """
    size = len(tokens)
    return [
        "".join(tokens[first:last])
        for first in range(size)
        for last in range(first + 1, size + 1)
    ]
def extract_phrases(doc):
    """Return all contiguous token runs of *doc* as joined strings.

    Each element of *doc* must expose a ``text`` attribute; the texts are
    handed to ``generate_all_combinations``.
    """
    token_texts = [tok.text for tok in doc]
    return generate_all_combinations(token_texts)
def count_words(text, top_n, progress_text, filter_symbols_entry):
    """Count the most frequent words and phrases in *text*.

    The text is split with ``split_text`` and each segment is run through the
    module-level ``nlp`` pipeline.  Alphabetic tokens are counted as words.
    Every contiguous token run (from ``extract_phrases``) is a phrase
    candidate: non-word characters are trimmed from both ends, and candidates
    that are empty or contain whitespace or any filter symbol are discarded.
    Phrases that also occur as a word are removed from the phrase ranking.

    Args:
        text: The text to analyse.
        top_n: How many entries to return in each ranking.
        progress_text: Text widget that receives progress messages.
        filter_symbols_entry: Entry widget whose content lists the characters
            a phrase must not contain.

    Returns:
        A pair ``(words, phrases)`` of ``(text, count)`` lists as produced by
        ``Counter.most_common``.  If anything raises, an error dialog is shown
        and ``([], [])`` is returned.
    """
    try:
        segments = split_text(text)
        total_segments = len(segments)
        processed_segments = 0

        # Everything below is identical for every segment, so it is prepared
        # once instead of being rebuilt inside the loop.
        edge_trim = re.compile(r'^[^\w\u4e00-\u9fa5]+|[^\w\u4e00-\u9fa5]+$')
        word_char = re.compile(r'[\u4e00-\u9fa5\w]')
        filter_symbols = filter_symbols_entry.get()
        allowed_phrase = re.compile(
            r'^[^' + re.escape(filter_symbols) + r'\s]*$'
        )

        # Counting incrementally avoids holding every phrase occurrence of
        # the whole text in one list; first-seen order (and therefore the
        # tie order of most_common) is the same as counting a full list.
        word_counts = Counter()
        phrase_counts = Counter()

        for idx, segment in enumerate(segments, start=1):
            doc = nlp(segment)
            word_counts.update(
                token.text
                for token in doc
                if token.is_alpha and word_char.search(token.text)
            )
            for phrase in extract_phrases(doc):
                phrase = edge_trim.sub('', phrase)
                if phrase and allowed_phrase.match(phrase):
                    phrase_counts[phrase] += 1

            processed_segments += 1
            progress_text.insert(tk.END, f"正在处理段落 {idx}/{total_segments}...\n")
            progress_text.see(tk.END)

        # A phrase that is also a single word is reported only as a word.
        phrase_counts = Counter({
            phrase: count
            for phrase, count in phrase_counts.items()
            if phrase not in word_counts
        })

        progress_text.insert(tk.END, f"模型处理了 {processed_segments} 段文本,总段落数为 {total_segments}。\n")
        progress_text.see(tk.END)
        return word_counts.most_common(top_n), phrase_counts.most_common(top_n)
    except Exception as e:
        messagebox.showerror("分词错误", f"分词过程出现异常: {str(e)}")
        return [], []
class RedirectText:
    """Stream-like adapter that appends written text to a Tk text widget."""

    def __init__(self, text_widget):
        # Widget that receives everything passed to write().
        self.text_space = text_widget

    def write(self, text):
        """Insert *text* at the end of the widget and scroll to the end."""
        target = self.text_space
        target.insert(tk.END, text)
        target.see(tk.END)

    def flush(self):
        """Do nothing; provided so the object offers a ``flush`` method."""
        return None
def load_files(filter_symbols_entry, include_count_var, include_pinyin_var):
    """Ask for text files, read them and start the analysis in a thread.

    Each file is read as UTF-8 with a fallback to GBK.  The result count is
    taken from the module-level ``top_n_entry``.  When processing finishes,
    the module-level ``word_text`` and ``phrase_text`` widgets are refilled
    with one ``"<item>: <count>"`` line per result.

    Args:
        filter_symbols_entry: Entry widget passed on to ``count_words``.
        include_count_var: Unused here; kept for the existing call signature.
        include_pinyin_var: Unused here; kept for the existing call signature.

    Returns:
        None.  Returns early if no file is chosen, if the count entry is not
        an integer, or if any file cannot be read (an error dialog is shown
        in the last two cases).
    """
    filenames = filedialog.askopenfilenames(
        filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
    )
    if not filenames:
        return

    # Validate the count before doing any work, on the calling thread,
    # instead of reading the widget from the worker thread.
    try:
        top_n = int(top_n_entry.get())
    except ValueError:
        messagebox.showerror("输入错误", "请输入有效的数字")
        return

    contents = []
    for filename in filenames:
        try:
            try:
                with open(filename, 'r', encoding='utf-8') as file:
                    contents.append(file.read())
            except UnicodeDecodeError:
                # Not valid UTF-8: retry with GBK.
                with open(filename, 'r', encoding='gbk') as file:
                    contents.append(file.read())
        except (OSError, UnicodeError) as e:
            # Covers missing/unreadable files on either attempt as well as a
            # failed GBK decode.
            messagebox.showerror("读取失败", f"无法读取文件 {filename}:\n{str(e)}")
            return

    # Every file's content is followed by a newline, as before.
    all_text = "".join(content + "\n" for content in contents)

    progress_text.delete(1.0, tk.END)

    def process_text():
        # NOTE(review): this runs in a worker thread and touches Tk widgets
        # directly, as the original did — confirm this is safe on the target
        # Tk build.
        word_results, phrase_results = count_words(
            all_text, top_n, progress_text, filter_symbols_entry
        )
        word_text.delete(1.0, tk.END)
        phrase_text.delete(1.0, tk.END)
        word_text.insert(
            tk.END,
            "".join(f'{word}: {count}\n' for word, count in word_results),
        )
        phrase_text.insert(
            tk.END,
            "".join(f'{phrase}: {count}\n' for phrase, count in phrase_results),
        )

    threading.Thread(target=process_text).start()
def save_word_results(include_count_var, include_pinyin_var):
    """Save the contents of the word list widget to a user-chosen text file.

    Each non-empty line of the module-level ``word_text`` widget is parsed as
    ``"<word>: <count>"``.  The count is taken from after the LAST colon and
    only when it is a decimal number, so a word that itself contains ``:`` is
    not truncated; otherwise the whole line is the word and the count is
    ``'1'``.

    Output line format:
        * pinyin enabled:  ``"<py1,py2,...> <word> <count>"``
        * count enabled:   ``"<word>: <count>"``
        * neither:         ``"<word>"``

    Args:
        include_count_var: Variable whose ``get()`` tells whether counts are
            written.
        include_pinyin_var: Variable whose ``get()`` tells whether pinyin is
            written (takes precedence over the count-only format).

    Returns:
        None.  A dialog reports success or the write error.
    """
    filename = filedialog.asksaveasfilename(
        defaultextension=".txt",
        filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
    )
    if not filename:
        return

    # The options cannot change while this function runs; read them once.
    with_pinyin = include_pinyin_var.get()
    with_count = include_count_var.get()

    words = []
    for line in word_text.get(1.0, tk.END).strip().split('\n'):
        if not line:
            continue
        head, sep, tail = line.rpartition(':')
        tail = tail.strip()
        if sep and tail.isdecimal():
            word, count = head.strip(), tail
        else:
            word, count = line.strip(), '1'

        if with_pinyin:
            pinyin_list = lazy_pinyin(word, style=Style.NORMAL, errors='default')
            pinyin_str = ','.join(pinyin_list)
            words.append(f"{pinyin_str} {word} {count}")
        elif with_count:
            words.append(f"{word}: {count}")
        else:
            words.append(word)

    try:
        with open(filename, 'w', encoding='utf-8') as file:
            file.write('\n'.join(words))
        messagebox.showinfo("保存成功", "单词已成功保存")
    except Exception as e:
        messagebox.showerror("保存失败", f"文件保存失败:\n{str(e)}")
def save_phrase_results(include_count_var, include_pinyin_var):
    """Save the contents of the phrase list widget to a user-chosen text file.

    Each non-empty line of the module-level ``phrase_text`` widget is parsed
    as ``"<phrase>: <count>"``.  The count is taken from after the LAST colon
    and only when it is a decimal number, so a phrase that itself contains
    ``:`` is not truncated; otherwise the whole line is the phrase and the
    count is ``'1'``.

    Output line format:
        * pinyin enabled:  ``"<py1,py2,...> <phrase> <count>"``
        * count enabled:   ``"<phrase>: <count>"``
        * neither:         ``"<phrase>"``

    Args:
        include_count_var: Variable whose ``get()`` tells whether counts are
            written.
        include_pinyin_var: Variable whose ``get()`` tells whether pinyin is
            written (takes precedence over the count-only format).

    Returns:
        None.  A dialog reports success or the write error.
    """
    filename = filedialog.asksaveasfilename(
        defaultextension=".txt",
        filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
    )
    if not filename:
        return

    # The options cannot change while this function runs; read them once.
    with_pinyin = include_pinyin_var.get()
    with_count = include_count_var.get()

    phrases = []
    for line in phrase_text.get(1.0, tk.END).strip().split('\n'):
        if not line:
            continue
        head, sep, tail = line.rpartition(':')
        tail = tail.strip()
        if sep and tail.isdecimal():
            phrase, count = head.strip(), tail
        else:
            phrase, count = line.strip(), '1'

        if with_pinyin:
            pinyin_list = lazy_pinyin(phrase, style=Style.NORMAL, errors='default')
            pinyin_str = ','.join(pinyin_list)
            phrases.append(f"{pinyin_str} {phrase} {count}")
        elif with_count:
            phrases.append(f"{phrase}: {count}")
        else:
            phrases.append(phrase)

    try:
        with open(filename, 'w', encoding='utf-8') as file:
            file.write('\n'.join(phrases))
        messagebox.showinfo("保存成功", "短语已成功保存")
    except Exception as e:
        messagebox.showerror("保存失败", f"文件保存失败:\n{str(e)}")
def save_combined_results(include_count_var, include_pinyin_var):
    """Merge the word and phrase lists and save them to a user-chosen file.

    Lines are read from the module-level ``word_text`` and ``phrase_text``
    widgets and parsed as ``"<item>: <count>"``.  The count is taken from
    after the LAST colon and only when it is a decimal number; any other line
    is kept whole with a count of 1, so a malformed line no longer raises
    inside the button callback.  An item present in both lists is written
    once, with the phrase list's count.  Entries are sorted by count,
    highest first; equal counts keep their first-seen order.

    Output line format:
        * pinyin enabled:  ``"<py1,py2,...> <item> <count>"``
        * count enabled:   ``"<item>: <count>"``
        * neither:         ``"<item>"``

    Args:
        include_count_var: Variable whose ``get()`` tells whether counts are
            written.
        include_pinyin_var: Variable whose ``get()`` tells whether pinyin is
            written (takes precedence over the count-only format).

    Returns:
        None.  A dialog reports success or the write error.
    """
    filename = filedialog.asksaveasfilename(
        defaultextension=".txt",
        filetypes=[("Text files", "*.txt"), ("All files", "*.*")]
    )
    if not filename:
        return

    word_lines = word_text.get(1.0, tk.END).strip().split('\n')
    phrase_lines = phrase_text.get(1.0, tk.END).strip().split('\n')

    # Words first, then phrases: a later duplicate overwrites the count but
    # keeps the position of the first occurrence.
    combined = {}
    for line in word_lines + phrase_lines:
        if not line:
            continue
        head, sep, tail = line.rpartition(':')
        tail = tail.strip()
        if sep and tail.isdecimal():
            combined[head.strip()] = int(tail)
        else:
            combined[line.strip()] = 1

    sorted_combined = sorted(combined.items(), key=lambda x: x[1], reverse=True)

    # The options cannot change while this function runs; read them once.
    with_pinyin = include_pinyin_var.get()
    with_count = include_count_var.get()

    lines = []
    for item, count in sorted_combined:
        if with_pinyin:
            pinyin_list = lazy_pinyin(item, style=Style.NORMAL, errors='default')
            pinyin_str = ','.join(pinyin_list)
            lines.append(f"{pinyin_str} {item} {count}")
        elif with_count:
            lines.append(f"{item}: {count}")
        else:
            lines.append(item)

    try:
        with open(filename, 'w', encoding='utf-8') as file:
            file.write('\n'.join(lines))
        messagebox.showinfo("保存成功", "单词和短语已成功合并并保存")
    except Exception as e:
        messagebox.showerror("保存失败", f"文件保存失败:\n{str(e)}")
# ---------------------------------------------------------------------------
# Main window construction.  The widgets created here are module-level names
# because the functions above read them as globals.
# ---------------------------------------------------------------------------
root = tk.Tk()
root.title("个性化输入法词库生成器1.0-来自吾爱破解论坛")

# Progress log panel.
progress_frame = tk.LabelFrame(root, text="处理进度")
progress_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
progress_text = tk.Text(progress_frame, height=3, width=60, wrap=tk.WORD)
progress_scrollbar = tk.Scrollbar(progress_frame, command=progress_text.yview)
progress_text.config(yscrollcommand=progress_scrollbar.set)
progress_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
progress_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

# From here on, module-level ``print`` calls write into the progress panel.
# The replacement takes a single string and does not append a newline.
redirect = RedirectText(progress_text)
print = redirect.write

# Word ranking panel.
word_frame = tk.LabelFrame(root, text="单词统计")
word_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
word_text = tk.Text(word_frame, height=20, width=40, wrap=tk.NONE)
word_scrollbar = tk.Scrollbar(word_frame, command=word_text.yview)
word_text.config(yscrollcommand=word_scrollbar.set)
word_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
word_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

# Phrase ranking panel.
phrase_frame = tk.LabelFrame(root, text="短语统计")
phrase_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
phrase_text = tk.Text(phrase_frame, height=20, width=40, wrap=tk.NONE)
phrase_scrollbar = tk.Scrollbar(phrase_frame, command=phrase_text.yview)
phrase_text.config(yscrollcommand=phrase_scrollbar.set)
phrase_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
phrase_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

# Controls row.
control_frame = tk.Frame(root)
control_frame.pack(pady=10)

tk.Label(control_frame, text="显示数量:").pack(side=tk.LEFT)
top_n_entry = tk.Entry(control_frame, width=8)
top_n_entry.insert(0, "100")
top_n_entry.pack(side=tk.LEFT, padx=5)

tk.Label(control_frame, text="短语过滤掉的符号:").pack(side=tk.LEFT, padx=5)
filter_symbols_entry = tk.Entry(control_frame, width=20)
# Single-quoted so the trailing double-quote character is part of the default
# filter set; the previous double-quoted literal was left unterminated.
filter_symbols_entry.insert(0, '、,;。?!:“”《》()()"')
filter_symbols_entry.pack(side=tk.LEFT, padx=5)

include_count_var = tk.BooleanVar()
include_count_var.set(False)
tk.Checkbutton(
    control_frame, text="保存时附带统计数量", variable=include_count_var
).pack(side=tk.LEFT, padx=5)

include_pinyin_var = tk.BooleanVar()
include_pinyin_var.set(False)
tk.Checkbutton(
    control_frame, text="附带拼音", variable=include_pinyin_var
).pack(side=tk.LEFT, padx=5)

tk.Button(
    control_frame,
    text="加载文件",
    command=lambda: load_files(filter_symbols_entry, include_count_var, include_pinyin_var),
).pack(side=tk.LEFT, padx=5)
tk.Button(
    control_frame,
    text="保存单词",
    command=lambda: save_word_results(include_count_var, include_pinyin_var),
).pack(side=tk.LEFT, padx=5)
tk.Button(
    control_frame,
    text="保存短语",
    command=lambda: save_phrase_results(include_count_var, include_pinyin_var),
).pack(side=tk.LEFT, padx=5)
tk.Button(
    control_frame,
    text="合并去重保存",
    command=lambda: save_combined_results(include_count_var, include_pinyin_var),
).pack(side=tk.LEFT, padx=5)

root.mainloop()