Newbie here and I thought this would be simple to crack. After 2 days trying I proved myself wrong.
I have a quite simple program where I want to load an excel file that contains location data. I want to use fuzzywuzzy to see if there are similar lines (‘Main str 1’ is likely the same as ‘Main Street 1’). As I wanted to show progress going through the file (there are n * (n - 1) / 2 combinations to be looked at), I tried to run the comparison in a separate thread, emitting progress to a pyqt5 window.
I tried smaller size files (and without the separate thread it all works fine), but as soon as i enter the thread the kernel restarts. In debug mode I mostly walk smoothly through the full code and I tried adding timers. Asked ChatGPT but all seems fruitless.
Not sure what is next?
import sys
import pandas as pd
from PyQt5.QtWidgets import (
QApplication, QVBoxLayout, QHBoxLayout, QGridLayout, QWidget,
QPushButton, QFileDialog, QLabel, QComboBox,
QGroupBox, QRadioButton, QProgressBar
)
from PyQt5.QtGui import QIcon
from fuzzywuzzy import fuzz
from PyQt5.QtCore import QObject, QThread, pyqtSignal
from PyQt5 import QtCore, QtGui, QtWidgets
class LocationFilesMergerApp(QWidget):
def __init__(self):
super().__init__()
self.single_file_df = None
self.initUI()
def initUI(self):
self.main_layout = QVBoxLayout()
self.groupbox = QGroupBox("File selection")
self.layout = QVBoxLayout()
self.groupbox.setLayout(self.layout)
self.main_layout.addWidget(self.groupbox)
self.setLayout(self.main_layout)
self.show()
self.init_cleanup_UI()
def init_cleanup_UI(self):
layout = self.layout
# Buttons to load files
self.loadSingleFileBtn = QPushButton("Select Excel File", self)
self.loadSingleFileBtn.clicked.connect(self.load_single_file)
layout.addWidget(self.loadSingleFileBtn)
# Labels to display file names
self.single_fileLabel = QLabel("No file selected", self)
layout.addWidget(self.single_fileLabel)
# Clean-up Button
self.cleanupBtn = QPushButton("Clean-up File", self)
self.cleanupBtn.setVisible(False)
self.cleanupBtn.clicked.connect(self.cleanup_file)
layout.addWidget(self.cleanupBtn)
def load_single_file(self):
file_path, _ = QFileDialog.getOpenFileName(self, "Open Excel File", "", "Excel Files (*.xlsx *.xls)")
if file_path:
self.single_file_df = pd.read_excel(file_path)
self.single_fileLabel.setText(file_path)
self.cleanupBtn.setVisible(True)
def cleanup_file(self):
self.close()
main_cleanup = MainCleanUp(self.single_file_df)
def closeEvent(self, event):
# Clean up resources or disconnect signals
event.accept()
class MainCleanUp():
def __init__(self, df):
super().__init__()
self.initCleanup(df)
def initCleanup(self, df):
self.df = df
# # Define the columns to compare
self.columns_to_compare = ['Location_ID','Address', 'City', 'Country']
# Define the weights for each column
self.column_weights = {
'Location_ID': 0.5,
'Address': 0.8,
'City': 0.8,
'Country': 1.0,
}
self.fuzzy_comparison = FuzzyCompare(self.df,
self.columns_to_compare,
self.column_weights,
90)
# Connect the finished signal to a custom handler
self.fuzzy_comparison.worker.finished.connect(self.on_cleanup_finished)
def on_cleanup_finished(self, similar_pairs):
print("Cleanup finished. Similar pairs found:", similar_pairs)
# You can add any additional logic you want to execute when the cleanup is finished
class FuzzyCompareWorker(QObject):
progress = pyqtSignal(int)
finished = pyqtSignal(list)
def __init__(self, df, columns_to_compare, column_weights, threshold):
super().__init__()
self.df = df
self.columns_to_compare = columns_to_compare
self.column_weights = column_weights
self.threshold = threshold
def run(self):
similar_pairs = self.find_similar_pairs(self.df, self.columns_to_compare, column_weights=self.column_weights, threshold=self.threshold)
self.finished.emit(similar_pairs)
def calculate_similarity(self, row1, row2, columns, column_weights=None):
if column_weights is None:
column_weights = {col: 1 for col in columns}
score = 0
total_weight = 0
for col in columns:
if pd.isna(row1[col]) or pd.isna(row2[col]):
continue
weight = column_weights.get(col, 1)
similarity = fuzz.ratio(str(row1[col]), str(row2[col]))
score += similarity * weight
total_weight += weight
if total_weight == 0:
return 0
return score / total_weight
def find_similar_pairs(self, df, columns, column_weights=None, threshold=80):
num_rows = df.shape[0]
similar_pairs = []
comparisons_counter = 0
total_comparisons = (num_rows * (num_rows - 1)) // 2
steps = max(1, total_comparisons // 20)
for i in range(num_rows):
for j in range(i + 1, num_rows):
score = self.calculate_similarity(df.iloc[i], df.iloc[j], columns, column_weights)
if score > threshold:
similar_pairs.append((i, j, score))
comparisons_counter += 1
if comparisons_counter % steps == 0:
progress = int(comparisons_counter / total_comparisons * 100 + 1)
self.progress.emit(progress)
self.progress.emit(100) # Ensure the progress bar completes
return similar_pairs
class FuzzyCompare:
def __init__(self, df, columns_to_compare, column_weights, threshold):
super().__init__()
self.df = df
self.columns_to_compare = columns_to_compare
self.column_weights = column_weights
self.threshold = threshold
# Initialize the progress window
self.progress_window = ProgressWindow()
self.progress_window.show()
# Set up the worker and the thread
self.thread = QThread()
self.worker = FuzzyCompareWorker(df, columns_to_compare, column_weights, threshold)
self.worker.moveToThread(self.thread)
# Connect signals and slots
self.worker.progress.connect(self.progress_window.update_progress)
self.worker.finished.connect(self.on_finished)
self.thread.started.connect(self.worker.run)
self.worker.finished.connect(self.thread.quit)
self.worker.finished.connect(self.worker.deleteLater)
self.thread.finished.connect(self.thread.deleteLater)
# Start the thread
self.thread.start()
def on_finished(self, similar_pairs):
self.progress_window.close()
print("Comparison finished. Similar pairs found:", similar_pairs)
class ProgressWindow(QWidget):
def __init__(self):
super().__init__()
self.initUI()
def initUI(self):
self.setWindowTitle('Progress')
self.setGeometry(150, 150, 300, 100)
layout = QVBoxLayout()
self.progress_bar = QProgressBar(self)
self.progress_bar.setMinimum(0)
self.progress_bar.setMaximum(100)
layout.addWidget(self.progress_bar)
self.setLayout(layout)
def update_progress(self, value):
self.progressBar.setValue(value)
def closeEvent(self, event):
# Override to handle the close event if necessary
event.accept()
if __name__ == "__main__":
app = QApplication(sys.argv)
ex = LocationFilesMergerApp()
sys.exit(app.exec_())