I need help modifying this code below. The purpose of the code is to export the paragraphs in this PDF that contain an asterisk along with the associated photos directly below the paragraphs. The issue I am running into, is that it is exporting ALL the images on the page with the paragraphs. I only need it to extract the images directly below the paragraphs with the asterisk.
import fitz # PyMuPDF
from docx import Document
from docx.shared import Inches
import io
from PIL import Image
Load the PDF document
pdf_document = fitz.open(“Sample Home.pdf”)
Create a Word document
word_document = Document()
Iterate through each page of the PDF
for page_num in range(pdf_document.page_count):
page = pdf_document.load_page(page_num)
blocks = page.get_text(“blocks”)
for block in blocks:
block_text = block[4]
# Check if the paragraph includes an asterisk
if '*' in block_text:
# Add the paragraph to the Word document
word_document.add_paragraph(block_text)
# Extract images associated with this paragraph
image_list = page.get_images(full=True)
for image_index, img in enumerate(image_list):
xref = img[0]
base_image = pdf_document.extract_image(xref)
image_bytes = base_image["image"]
# Load image using PIL
image = Image.open(io.BytesIO(image_bytes))
image_filename = f"image_{page_num}_{image_index}.png"
image.save(image_filename)
# Add image to the Word document
word_document.add_picture(image_filename, width=Inches(5))
Save the Word document
word_document.save(“Extracted_Paragraphs_and_Images.docx”)