I would like to ask a question regarding a logic that I have been developing for the past couple of weeks, unfortunately was not able to produce the desired result.
The goal of my project is to extract data from two separate columns from different spreadsheets, use the fuzzy wuzzy string matching algorithm to estimate the percentage match between the row values and then output the result in Excel.
However, my case is a bit trickier - every row from the first spreadsheet column I have to compare with each row from the second spreadsheet column. By estimating the similarity ratio, I have to then filter it by using sort and find the first 3 matches which register the higher percentage (limit = 3), then output the result. But since I will be working with enormous amounts of data, was wondering if there was a way I could first estimate the string similarity ratio via the fuzzy wuzzy module. I am using pandas and numpy to extract and reformat the column data into data frames and then reformat it to numpy arrays for faster performance.
My question is - how can I perform the actions described above and after done, write into an Excel file the matching pairs in neighboring columns (1st match number - 1st match %, 2nd match number - 2nd match % and so on)?
Here is my code. Would highly appreciate any hints or pieces of advice
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import numpy as np
import openpyxl
list_POData = np.array(purchaseOrderItemDescription)
list_MaterialsData = np.array(materialDescription)
matCounter = 0
counter = 0
while matCounter <= len(list_POData) or matCounter > len(list_POData):
if matCounter >= len(list_POData):
break
else:
def first_func(query, match, limit=3):
percent_Match = process.extract(query, match, limit=limit, scorer=fuzz.token_sort_ratio)
return percent_Match
# Counter to increment the values inside appended list/+1 after each iteration.
def iterator_Func():
global first_Match, second_Match, third_Match, counter
for materialElement in range(len(list_MaterialsData)):
new_list = [materialElement]
new_array = np.hstack(purchaseOrderItemDescription[0:])
array_appendedValues = np.append(new_array, new_list)
n = 30000
array_appendedValues = np.append(array_appendedValues[:n], array_appendedValues[:n + 1])
first_Match = np.array(
first_func(array_appendedValues.astype(str)[counter], array_appendedValues, limit=3)[0])
counter += 1
second_Match = np.array(
first_func(array_appendedValues.astype(str)[counter], array_appendedValues, limit=3)[1])
counter += 1
third_Match = np.array(
first_func(array_appendedValues.astype(str)[counter], array_appendedValues, limit=3)[2])
return [first_Match[0], first_Match[1]], [second_Match[0], second_Match[1]], [third_Match[0],
third_Match[1]]
listResult = iterator_Func()
matCounter += 1
# Function to write in Excel spreadsheet
def write_Results():
global materialDescription, listResult, materialNumber
resultFile = openpyxl.load_workbook('Spreadhsheet1.xlsx', read_only=False)
firstSheet = resultFile.worksheets[0]
# Hardcoded value of starting column to insert values
col = 20
firstSheet.cell(row=1, column=col).value = "Material Number"
firstSheet.cell(row=1, column=col + 1).value = "Material Number Match"
firstSheet.cell(row=1, column=col + 2).value = "Top 3 Matches, %"
for index, materialItem in enumerate(list_MaterialsData):
firstSheet.cell(row=2 + index * 3, column=col).value = materialNumber.tolist()[index]
for innerIndex, innerItem in enumerate(listResult):
firstSheet.cell(row=2 + index * 3 + innerIndex, column=col + 1).value = listResult[0][0]
firstSheet.cell(row=2 + index * 3 + innerIndex, column=col + 2).value = listResult[0][1]
firstSheet.cell(row=2 + index * 3 + 1, column=col + 1).value = listResult[1][0]
firstSheet.cell(row=2 + index * 3 + 1, column=col + 2).value = listResult[1][1]
firstSheet.cell(row=2 + index * 3 + 2, column=col + 2).value = listResult[2][0]
firstSheet.cell(row=2 + index * 3 + 2, column=col + 2).value = listResult[2][1]
resultFile.save('Spreadsheet1.xlsx')
write_Results()