Kernel restarts at start of QThread

Newbie here and I thought this would be simple to crack. After 2 days trying I proved myself wrong.
I have a quite simple program where I want to load an excel file that contains location data. I want to use fuzzywuzzy to see if there are similar lines (‘Main str 1’ is likely the same as ‘Main Street 1’). As I wanted to show progress going through the file (there are n * (n - 1) / 2 combinations to be looked at), I tried to run the comparison in a separate thread, emitting progress to a pyqt5 window.
I tried smaller size files (and without the separate thread it all works fine), but as soon as i enter the thread the kernel restarts. In debug mode I mostly walk smoothly through the full code and I tried adding timers. Asked ChatGPT but all seems fruitless.

Not sure what is next?

import sys
import pandas as pd
from PyQt5.QtWidgets import (
    QApplication, QVBoxLayout, QHBoxLayout, QGridLayout, QWidget,
    QPushButton, QFileDialog, QLabel, QComboBox,  
    QGroupBox, QRadioButton, QProgressBar
)
from PyQt5.QtGui import QIcon

from fuzzywuzzy import fuzz

from PyQt5.QtCore import QObject, QThread, pyqtSignal


from PyQt5 import QtCore, QtGui, QtWidgets


class LocationFilesMergerApp(QWidget):
    def __init__(self):
        super().__init__()
        self.single_file_df = None
        self.initUI()

    def initUI(self):
        self.main_layout = QVBoxLayout()

        self.groupbox = QGroupBox("File selection")
        self.layout = QVBoxLayout()
        self.groupbox.setLayout(self.layout)
        self.main_layout.addWidget(self.groupbox)

        self.setLayout(self.main_layout)

        self.show()

        self.init_cleanup_UI()

    
    def init_cleanup_UI(self):
        layout = self.layout
    
        # Buttons to load files
        self.loadSingleFileBtn = QPushButton("Select Excel File", self)
        self.loadSingleFileBtn.clicked.connect(self.load_single_file)
        layout.addWidget(self.loadSingleFileBtn)
    
        # Labels to display file names
        self.single_fileLabel = QLabel("No file selected", self)
        layout.addWidget(self.single_fileLabel)
    
   
        # Clean-up Button
        self.cleanupBtn = QPushButton("Clean-up File", self)
        self.cleanupBtn.setVisible(False)
        self.cleanupBtn.clicked.connect(self.cleanup_file)
        layout.addWidget(self.cleanupBtn)



    def load_single_file(self):
        file_path, _ = QFileDialog.getOpenFileName(self, "Open Excel File", "", "Excel Files (*.xlsx *.xls)")
        if file_path:
            self.single_file_df = pd.read_excel(file_path)
            self.single_fileLabel.setText(file_path)
            self.cleanupBtn.setVisible(True)


    def cleanup_file(self):
        self.close()
        main_cleanup = MainCleanUp(self.single_file_df)
    
    def closeEvent(self, event):
        # Clean up resources or disconnect signals
        event.accept()
                

class MainCleanUp():
    def __init__(self, df):
        super().__init__()

        self.initCleanup(df)
        

    def initCleanup(self, df):
        
        self.df = df

        # # Define the columns to compare
        self.columns_to_compare = ['Location_ID','Address', 'City', 'Country']
        # Define the weights for each column
        self.column_weights = {
            'Location_ID': 0.5,
            'Address': 0.8,
            'City': 0.8,
            'Country': 1.0,
        }
        
        self.fuzzy_comparison = FuzzyCompare(self.df, 
                                              self.columns_to_compare, 
                                              self.column_weights, 
                                              90)

         # Connect the finished signal to a custom handler
        self.fuzzy_comparison.worker.finished.connect(self.on_cleanup_finished)

    def on_cleanup_finished(self, similar_pairs):
        print("Cleanup finished. Similar pairs found:", similar_pairs)
        # You can add any additional logic you want to execute when the cleanup is finished


class FuzzyCompareWorker(QObject):
    progress = pyqtSignal(int)
    finished = pyqtSignal(list)

    def __init__(self, df, columns_to_compare, column_weights, threshold):
        super().__init__()
        self.df = df
        self.columns_to_compare = columns_to_compare
        self.column_weights = column_weights
        self.threshold = threshold

    def run(self):
        similar_pairs = self.find_similar_pairs(self.df, self.columns_to_compare, column_weights=self.column_weights, threshold=self.threshold)
        self.finished.emit(similar_pairs)

    def calculate_similarity(self, row1, row2, columns, column_weights=None):
        if column_weights is None:
            column_weights = {col: 1 for col in columns}

        score = 0
        total_weight = 0

        for col in columns:
            if pd.isna(row1[col]) or pd.isna(row2[col]):
                continue

            weight = column_weights.get(col, 1)
            similarity = fuzz.ratio(str(row1[col]), str(row2[col]))
            score += similarity * weight
            total_weight += weight

        if total_weight == 0:
            return 0

        return score / total_weight

    def find_similar_pairs(self, df, columns, column_weights=None, threshold=80):
        num_rows = df.shape[0]
        similar_pairs = []
        comparisons_counter = 0
        total_comparisons = (num_rows * (num_rows - 1)) // 2

        steps = max(1, total_comparisons // 20)

        for i in range(num_rows):
            for j in range(i + 1, num_rows):
                score = self.calculate_similarity(df.iloc[i], df.iloc[j], columns, column_weights)
                if score > threshold:
                    similar_pairs.append((i, j, score))

                comparisons_counter += 1

                if comparisons_counter % steps == 0:
                    progress = int(comparisons_counter / total_comparisons * 100 + 1)
                    self.progress.emit(progress)

        self.progress.emit(100)  # Ensure the progress bar completes
        return similar_pairs

class FuzzyCompare:

    def __init__(self, df, columns_to_compare, column_weights, threshold):
        super().__init__()
        self.df = df
        self.columns_to_compare = columns_to_compare
        self.column_weights = column_weights
        self.threshold = threshold
        
        # Initialize the progress window
        self.progress_window = ProgressWindow()
        self.progress_window.show()

        # Set up the worker and the thread
        self.thread = QThread()
        self.worker = FuzzyCompareWorker(df, columns_to_compare, column_weights, threshold)
        self.worker.moveToThread(self.thread)

        # Connect signals and slots
        self.worker.progress.connect(self.progress_window.update_progress)
        self.worker.finished.connect(self.on_finished)
        self.thread.started.connect(self.worker.run)
        self.worker.finished.connect(self.thread.quit)
        self.worker.finished.connect(self.worker.deleteLater)
        self.thread.finished.connect(self.thread.deleteLater)

        # Start the thread
        self.thread.start()

    def on_finished(self, similar_pairs):
        self.progress_window.close()
        print("Comparison finished. Similar pairs found:", similar_pairs)


class ProgressWindow(QWidget):
    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        self.setWindowTitle('Progress')
        self.setGeometry(150, 150, 300, 100)

        layout = QVBoxLayout()
        self.progress_bar = QProgressBar(self)
        self.progress_bar.setMinimum(0)
        self.progress_bar.setMaximum(100)
        layout.addWidget(self.progress_bar)
        
        self.setLayout(layout)

    def update_progress(self, value):
        self.progressBar.setValue(value)

    def closeEvent(self, event):
        # Override to handle the close event if necessary
        event.accept()


if __name__ == "__main__":
    app = QApplication(sys.argv)

    ex = LocationFilesMergerApp()

    sys.exit(app.exec_())

When you say the kernel restarts, do you mean that you are running this code in Jupyter? Or are you running this from the commands line and the interpreter crashes?

I have Anaconda installed, using Spyder as my editor. From there, the kernel restarts when the thread tries to start. I have also ran python file.py from my command prompt which had the same result.
Have deleted the fuzzy.ratio line, deleted all the emit-commands, etc; so am kind of lost

Going through the code more it seems you are sending a close to the main thread before the worker thread is done. I recommend refactoring to clearly separate the logic from the threading and don’t overload the signals. Perhaps start by deleting all the finished connections and start adding them back one by one.

Appreciate the effort, thank you. Not sure where I send the close of the main though…. And “refactoring”?.. Had deleted all the code in the thread before, still was no success, so looking for that close…