import
fitz
import
os
# Highlight colours: three-component lists with values between 0 and 1
# (presumably RGB). They are handed to the annotation as its stroke colour.
Green = [0.6, 1, 0.6]
Yellow = [1, 0.9, 0.6]
Aqua = [0.5, 1, 1]
Red = [1, 0.8, 0.8]

# Directory scanned for input documents, and directory for highlighted copies.
data_dir_in = "raw/"
data_dir_out = "out/"

# Search terms paired with the colour used to highlight their occurrences.
# Each entry is a two-element list: [term, colour].
keywords = [
    ["asthma", Green],
    ["IgE", Aqua],
    ["omalizumab", Red],
    ["biologic", Yellow],
]
def getRawFileList(path):
    """List the entries of *path* that should be processed.

    Entries whose name ends in "~" (typically editor backup files) are
    skipped, as are empty names. The previous condition joined these two
    tests with ``or``, which is true for every name, so nothing was ever
    filtered out.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A ``(files, names)`` tuple of two parallel lists: ``names`` holds
        the bare entry names and ``files`` the same names joined onto
        *path*. Both are sorted by name, because ``os.listdir`` returns
        entries in arbitrary order.

    Raises:
        OSError: Propagated from ``os.listdir`` when *path* cannot be read.
    """
    names = sorted(
        entry
        for entry in os.listdir(path)
        if entry and not entry.endswith("~")
    )
    files = [os.path.join(path, entry) for entry in names]
    return files, names
def highlight_pdf(in_path, name):
    """Highlight every keyword occurrence in one document and save a copy.

    Each page is searched for every term in the module-level ``keywords``
    list; each hit receives a highlight annotation whose stroke colour is
    the colour paired with that term. The result is written to
    ``data_dir_out`` under *name*.

    Args:
        in_path: Path of the document to open.
        name: File name of the saved copy inside ``data_dir_out``.
    """
    # Make sure the output directory exists before trying to save into it.
    os.makedirs(data_dir_out, exist_ok=True)

    doc = fitz.open(in_path)
    try:
        # NOTE(review): searchFor / addHighlightAnnot / setColors are the
        # legacy camelCase PyMuPDF names; newer releases spell them
        # search_for / add_highlight_annot / set_colors. Confirm against
        # the installed PyMuPDF version before upgrading.
        for page in doc:
            for term, color in keywords:
                for rect in page.searchFor(term):
                    annot = page.addHighlightAnnot(rect)
                    annot.setColors(stroke=color)
                    annot.update()
        doc.save(
            os.path.join(data_dir_out, name),
            garbage=4,
            deflate=True,
            clean=True,
        )
    finally:
        # Release the document handle even if searching or saving fails.
        doc.close()
def main():
    """Highlight every document found in ``data_dir_in``, printing progress.

    The input list comes from ``getRawFileList``; each entry is passed to
    ``highlight_pdf`` together with its bare file name. A message is
    printed before and after each document, numbered from 1.
    """
    files, names = getRawFileList(data_dir_in)
    for number, (in_path, name) in enumerate(zip(files, names), start=1):
        # Message: "Highlighting PDF number N"
        print("正在Highlight第{0}篇pdf".format(number))
        highlight_pdf(in_path, name)
        # Message: "Highlighting of PDF number N finished"
        print("第{0}篇Highlight完成".format(number))
# Run the batch only when executed as a script, not when imported.
if __name__ == '__main__':
    main()