Script for detecting handwritten Text with Google Vision API - ChatGPT generated

Hi @all,
as mentioned in the title, I tried to write a script that detects handwritten text in a PDF file.
Because I am completely new to coding but wanted to see my idea realized, I used ChatGPT to get the basic script done.

The script does not work reliably at the moment. I was able to make some adjustments so that the script sometimes works properly, but sometimes it does not.
When it doesn’t, an error appears saying that the newly generated .png file cannot be found.
But it was indeed created and is present in the folder.

Maybe deletion of this created .png-file has to be called at some other point?

Furthermore, I would like to remove entries from the listbox via double click.
This works only visually: the entry disappears from the list, but the dropped file is still processed.
Does anyone have some detailed tips?

This is my code so far:

import os
import csv
from pdf2image import convert_from_path
from google.cloud import vision
from google.cloud.vision_v1 import types
import tkinter as tk
from tkinterdnd2 import DND_FILES, TkinterDnD

#Google Vision API - authentication
# Store the location of the credentials JSON file in the environment before the client is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/***/Desktop/HTR/GoogleCloudVision.json"
client = vision.ImageAnnotatorClient()

# Paths of the dropped files that are waiting to be processed.
processing_queue = []

def process_single_file(pdf_path):
    """Run text detection on the first page of a PDF and rename the file on a keyword match.

    The first page is saved as a grayscale PNG next to the PDF and sent to
    ``client.document_text_detection``. If a line of keywords.txt occurs in
    the recognised text, daten.csv is searched for the first row whose
    "Keyword" value is one of the matches. For that row a PDF named
    "<Nachname>, <Vorname>.pdf" is written to the folder of the input file,
    the path is removed from ``processing_queue`` and the original PDF is
    deleted. Without a match only the temporary PNG is deleted.

    Args:
        pdf_path: Path of the PDF file to process.
    """
    # Note: str.replace swaps every ".pdf" occurrence in the path, not only the extension.
    png_path = pdf_path.replace(".pdf", ".png")
    pdf_images = convert_from_path(pdf_path)
    png_filename = os.path.basename(png_path)  # file name of the temporary PNG
    png_folder = os.path.dirname(pdf_path)  # folder that contains the PDF
    png_path = os.path.join(png_folder, png_filename)  # full path of the temporary PNG
    # Only the first page is converted to grayscale and written to disk.
    pdf_images[0].convert("L").save(png_path)

    with open(png_path, "rb") as image_file:
        content = image_file.read()

    image = types.Image(content=content)
    response = client.document_text_detection(image=image)
    text = response.full_text_annotation.text

    # One keyword per line.
    with open("C:/Users/***/Desktop/HTR/keywords.txt", "r") as keywords_file:
        keywords = keywords_file.read().splitlines()

    # Plain substring test against the recognised text.
    matched_keywords = [keyword for keyword in keywords if keyword in text]

    if matched_keywords:
        with open("C:/Users/***/Desktop/HTR/daten.csv", "r") as csv_file:
            csv_reader = csv.DictReader(csv_file)
            for row in csv_reader:
                if row["Keyword"] in matched_keywords:
                    new_filename = f"{row['Nachname']}, {row['Vorname']}.png"
                    # NOTE: this is only the file name, without the folder.
                    new_filename_png = new_filename
                    
                    os.rename(png_path, os.path.join(png_folder, new_filename))
                    new_pdf_path = os.path.join(png_folder, new_filename.replace(".png", ".pdf"))
                    # NOTE(review): only pdf_images[0] is saved here - confirm that
                    # multi-page PDFs are not expected.
                    pdf_images[0].save(new_pdf_path, "PDF", resolution=100.0, save_all=True)
                    processing_queue.remove(pdf_path)
                    
                    os.remove(pdf_path)  # remove the original PDF file
                    # Delete the temporary PNG. new_filename_png carries no folder, so the
                    # path is resolved relative to the current working directory.
                    os.remove(new_filename_png)
                    
                    # Only the first matching CSV row is used.
                    break
            else:
                # No CSV row belongs to a matched keyword: discard the temporary PNG.
                os.remove(png_path)
                return
    else:
        # No keyword found in the recognised text: discard the temporary PNG.
        os.remove(png_path)
        return
    
def start_processing():
    """Process every queued file and delete its entry from the listbox afterwards."""
    # Iterate over a copy, because process_single_file removes entries from processing_queue.
    queue_copy = processing_queue.copy()
    for file_path in queue_copy:
        process_single_file(file_path)
        # Find the listbox row whose text equals the path and delete that row.
        listbox.delete(listbox.get(0, tk.END).index(file_path))

def on_drop(event):
    """Add the dropped files to the processing queue and to the listbox.

    Args:
        event: Drop event; ``event.data`` holds the dropped paths as one string.
    """
    # The string is split on whitespace.
    # NOTE(review): a path that contains spaces would be split into several entries - confirm.
    file_paths = event.data.split()
    for file_path in file_paths:
        processing_queue.append(file_path)
        listbox.insert(tk.END, file_path)

# Main application window.
root = TkinterDnD.Tk()
root.title("HTR with GUI")
root.geometry("510x220")

# Caption shown above the list of dropped files.
label = tk.Label(root, text="Drag&Drop PDF-Dateien:")
label.pack()

# One listbox row per dropped file path.
listbox = tk.Listbox(root, width=70,height=10,font='ansifixed')
listbox.pack()

# Button that runs start_processing when pressed.
start_button = tk.Button(root, text="Verarbeitung starten", command=start_processing)
start_button.pack(padx='5', pady='10')

def on_select(event):
    """Delete the currently selected entry from the listbox.

    Args:
        event: Tk event object (not used).
    """
    selected_index = listbox.curselection()[0]
    # selected_file is read but not used afterwards; processing_queue is not modified here.
    selected_file = listbox.get(selected_index)
    listbox.delete(selected_index)

# A double click on the listbox calls on_select.
listbox.bind('<Double-Button-1>', on_select)

# Register the whole window as a drop target for files and forward drops to on_drop.
root.drop_target_register(DND_FILES)
root.dnd_bind('<<Drop>>', lambda event: on_drop(event))
# Start the Tk event loop.
root.mainloop()

I think it’s necessary to convert the .pdf to .png, because the Google Vision API only works with images.
So for me it is also necessary to convert the file back to .pdf and to delete both the original .pdf file and the .png file that was created only for text recognition.

Best regards
Fisatec

PS: This is the Error I am talking about

Look at what on_select does.

It gets the index of the selected item, puts the text of the item (the filename) into a local variable called selected_file, and then deletes the item from the listbox.

At no point does it delete the filename from processing_queue.

As for the second issue, the line says os.remove(new_filename_png).

new_filename_png is the name of the file itself, not a full path of the file.

The traceback says that it couldn’t find Göbel, Robin.png, not that it couldn’t find C:/Users/RGoebel/Desktop/Test/Göbel, Robin.png.

As you didn’t give it the full path, it looked for the file in the current folder, wherever that is.

1 Like

Thank you very much for your fast reply @MRAB
That absolutely makes sense, even to me :slight_smile:

I will give it a try!

Best regards
Fisatec

Edit: Everything is working as intended now. Thanks again @MRAB
This is my fully working code:

import csv
import io
import os
import tkinter as tk

from google.cloud import vision
from google.cloud.vision_v1 import types
from pdf2image import convert_from_path
from tkinterdnd2 import DND_FILES, TkinterDnD

# Google Vision API - authentication.
# The location of the credentials JSON file is placed in the environment
# before the client object is created.
os.environ.update(
    {"GOOGLE_APPLICATION_CREDENTIALS": "C:/Users/RGoebel/Desktop/HTR/GoogleCloudVision.json"}
)
client = vision.ImageAnnotatorClient()

# Paths of the dropped files that still have to be processed.
processing_queue = list()

def process_single_file(
    pdf_path,
    keywords_path="C:/Users/RGoebel/Desktop/HTR/keywords.txt",
    csv_path="C:/Users/RGoebel/Desktop/HTR/daten.csv",
):
    """Recognise the text on the first page of a PDF and rename the PDF on a keyword match.

    The first page is rendered, converted to grayscale and sent to
    ``client.document_text_detection``. If a keyword from *keywords_path*
    occurs in the recognised text, *csv_path* is searched for the first row
    whose "Keyword" column is one of the matches. The PDF is then renamed to
    "<Nachname>, <Vorname>.pdf" inside its own folder and its path is removed
    from ``processing_queue``. Without a match nothing on disk is changed.

    Args:
        pdf_path: Path of the PDF file to process.
        keywords_path: Text file with one keyword per line.
        csv_path: CSV file with the columns "Keyword", "Nachname", "Vorname".

    Raises:
        KeyError: If a CSV row lacks one of the expected columns.
        OSError: If a file cannot be read or the PDF cannot be renamed.
    """
    # Only the first page is sent to the API, so only that page is rendered.
    pages = convert_from_path(pdf_path, first_page=1, last_page=1)
    if not pages:
        return

    # Encode the page as PNG in memory. Writing a temporary "<name>.png" next
    # to the PDF could overwrite (and later delete) an unrelated file and left
    # the PNG behind whenever a later step failed.
    png_buffer = io.BytesIO()
    pages[0].convert("L").save(png_buffer, format="PNG")

    image = vision.Image(content=png_buffer.getvalue())
    response = client.document_text_detection(image=image)
    text = response.full_text_annotation.text

    with open(keywords_path, "r") as keywords_file:
        # Blank lines are skipped: an empty string is contained in every text
        # and would therefore always count as a match.
        keywords = [line for line in keywords_file.read().splitlines() if line.strip()]

    matched_keywords = {keyword for keyword in keywords if keyword in text}
    if not matched_keywords:
        return

    with open(csv_path, "r", newline="") as csv_file:
        for row in csv.DictReader(csv_file):
            if row["Keyword"] in matched_keywords:
                break  # the first matching row decides the new name
        else:
            return

    new_pdf_path = os.path.join(
        os.path.dirname(pdf_path), f"{row['Nachname']}, {row['Vorname']}.pdf"
    )

    # Rename the original file instead of re-saving the rendered first page:
    # this keeps every page and the original quality. The comparison guards
    # against a file that already carries its target name.
    source = os.path.normcase(os.path.abspath(pdf_path))
    target = os.path.normcase(os.path.abspath(new_pdf_path))
    if source != target:
        os.replace(pdf_path, new_pdf_path)

    # The path may already have been removed from the queue by the caller.
    if pdf_path in processing_queue:
        processing_queue.remove(pdf_path)
    
def start_processing():
    """Process every queued file once and clear it from the queue and the listbox.

    A processed file is removed from ``processing_queue`` and from the listbox
    whether or not a keyword matched, so both always show the same pending
    files. If processing a file raises, that file and all later ones stay
    queued and listed.
    """
    # dict.fromkeys() yields an order-preserving snapshot without duplicates,
    # so the queue can be modified while looping and no file is processed twice.
    for file_path in dict.fromkeys(processing_queue):
        process_single_file(file_path)

        # process_single_file only dequeues files that matched a keyword.
        while file_path in processing_queue:
            processing_queue.remove(file_path)

        # Delete from the bottom up so the remaining indices stay valid.
        entries = listbox.get(0, tk.END)
        for index, entry in reversed(list(enumerate(entries))):
            if entry == file_path:
                listbox.delete(index)

def on_drop(event):
    """Queue every file dropped onto the window and show it in the listbox.

    Args:
        event: Drop event; ``event.data`` holds the dropped paths as a
            Tcl-formatted list.
    """
    # The drop data is a Tcl list in which a path containing spaces is wrapped
    # in braces; splitting on whitespace would cut such a path into pieces.
    for file_path in listbox.tk.splitlist(event.data):
        # A file that is already queued is not added a second time.
        if file_path in processing_queue:
            continue
        processing_queue.append(file_path)
        listbox.insert(tk.END, file_path)

# Main application window.
root = TkinterDnD.Tk()
root.wm_title("HTR with GUI")
root.wm_geometry("510x220")

# Caption above the list of dropped files.
label = tk.Label(master=root, text="Drag&Drop PDF-Dateien:")
label.pack()

# One row per dropped file path.
listbox = tk.Listbox(
    master=root,
    font='ansifixed',
    height=10,
    width=70,
)
listbox.pack()

# Button that runs start_processing when pressed.
start_button = tk.Button(
    master=root,
    command=start_processing,
    text="Verarbeitung starten",
)
start_button.pack(pady='10', padx='5')

def on_select(event):
    """Remove the double-clicked entry from the listbox and from the processing queue.

    Args:
        event: Tk event object (not used).
    """
    selection = listbox.curselection()
    # A double click on an empty listbox selects nothing.
    if not selection:
        return

    selected_index = selection[0]
    selected_file = listbox.get(selected_index)
    listbox.delete(selected_index)

    # The path may already be missing from the queue.
    if selected_file in processing_queue:
        processing_queue.remove(selected_file)

# Accept files dropped anywhere on the window and hand the event to on_drop.
root.drop_target_register(DND_FILES)
root.dnd_bind('<<Drop>>', on_drop)

# A double click on a listbox entry calls on_select.
listbox.bind('<Double-Button-1>', on_select)

# Start the Tk event loop.
root.mainloop()