import
os
from
docx
import
Document
from
docx.shared
import
Inches
from
docx.shared
import
Pt
from
docx.shared
import
Cm
def extract_and_save_images_from_docx(doc_path, image_folder='images'):
    """Dump every embedded image of a .docx file into ``image_folder``.

    Each image is written as ``<rId>.png``, where ``rId`` is the id of the
    document relationship that points at the image. The image bytes are
    written unchanged, so the ``.png`` extension does not necessarily match
    the actual image format.

    Args:
        doc_path: Path of the .docx file to read.
        image_folder: Directory that receives the image files; it is created
            when missing. Defaults to ``'images'``.
    """
    doc = Document(doc_path)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(image_folder, exist_ok=True)

    def find_images(document):
        """Return ``(image_part, rId)`` pairs for embedded image relationships."""
        return [
            (rel.target_part, rel.rId)
            for rel in document.part.rels.values()
            # Linked (external) images have no target part inside the
            # package; python-docx raises when ``target_part`` is read on
            # such a relationship, so they are left out.
            if 'image' in rel.reltype and not rel.is_external
        ]

    for image_part, embed_id in find_images(doc):
        image_path = os.path.join(image_folder, f'{embed_id}.png')
        with open(image_path, 'wb') as image_file:
            image_file.write(image_part.blob)
    print("Images extracted and saved successfully.")
def _safe_docx_name(index, title):
    """Build ``<index><title>.docx``, replacing characters illegal in file names."""
    illegal = '\\/:*?"<>|\r\n\t'
    cleaned = title.translate(str.maketrans(illegal, '_' * len(illegal)))
    return f'{index}{cleaned}.docx'


def _save_section(section_doc, folder, index, title):
    """Save one per-heading document into ``folder`` under its numbered title."""
    section_doc.save(os.path.join(folder, _safe_docx_name(index, title)))


def extract_images_from_docx(doc_path, file_folder='零散', image_folder='images'):
    """Split a .docx file into one document per heading paragraph.

    Every paragraph whose style name starts with ``'Heading'`` opens a new
    output document; the paragraphs that follow are copied into it as plain
    text with the ``'Normal'`` style. For each embedded picture found in a
    copied paragraph's runs, the file ``<image_folder>/<rId>.png`` is appended
    to the copied paragraph at a width of 5 inches. Paragraphs located before
    the first heading are not copied anywhere.

    Output files are named ``<index><heading text>.docx`` with ``index``
    counting from 1; characters that are not allowed in file names are
    replaced by ``_``.

    Args:
        doc_path: Path of the .docx file to split.
        file_folder: Directory that receives the per-heading documents; it is
            created when missing. Defaults to ``'零散'``.
        image_folder: Directory holding the previously dumped ``<rId>.png``
            image files. Defaults to ``'images'``.
    """
    doc = Document(doc_path)
    os.makedirs(file_folder, exist_ok=True)

    current_title = None
    current_index = 1
    new_doc = None  # document for the section being built; None until a heading is seen

    for paragraph in doc.paragraphs:
        style_name = paragraph.style.name
        if style_name.startswith('Heading'):
            # A new heading closes the section collected so far.
            if new_doc is not None:
                _save_section(new_doc, file_folder, current_index, current_title)
                current_index += 1
            current_title = paragraph.text
            new_doc = Document()
            new_doc.add_paragraph(paragraph.text, style=style_name)
            continue

        if new_doc is None:
            # No heading seen yet, so there is no section to copy into.
            continue

        new_paragraph = new_doc.add_paragraph(paragraph.text, style='Normal')
        for run in paragraph.runs:
            # Namespace-aware lookup of every embedded picture in the run;
            # slicing the serialized XML by hand only saw the first drawing
            # and produced a garbage id when no r:embed attribute existed.
            for image_id in run._r.xpath('.//w:drawing//a:blip/@r:embed'):
                print("found an image1")
                image_path = os.path.join(image_folder, image_id + '.png')
                print("image_path=" + image_path)
                if not os.path.exists(image_path):
                    # One missing dump should not abort the whole split.
                    print("image file not found, skipped: " + image_path)
                    continue
                new_paragraph.add_run().add_picture(image_path, width=Inches(5))
        # Kept from the original: re-assign the text when the copy ended up
        # without runs (presumably only for empty paragraphs - confirm).
        if not new_paragraph.runs:
            new_paragraph.text = paragraph.text

    # Flush the section that was still open when the document ended.
    if new_doc is not None:
        _save_section(new_doc, file_folder, current_index, current_title)
# Script entry: dump the embedded images first, then split the same document
# into per-heading files, which read the image files dumped by the first step.
_SOURCE_DOCX = 'pp2.docx'
extract_and_save_images_from_docx(_SOURCE_DOCX)
extract_images_from_docx(_SOURCE_DOCX)