I wrote the below code for a pipeline to process data in a dataframe. On execution I get this error:
ValueError: not enough values to unpack (expected 3, got 2)
I suspect that the error is caused by the FunctionTransformer, but I cannot figure out what the issue is. Can anyone help me find the error?
The code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
def funct(X):
X.Age.replace({'<35':0,'<35':1},inplace=True)
X.Accessibility.replace({'No':0,'Yes':1},inplace=True)
X.MentalHealth.replace({'No':0,'Yes':1},inplace=True)
X.MainBranch.replace({'NotDev':0,'Dev':1},inplace=True)
X.YearsCode = np.sqrt(X.YearsCode)
X.YearsCodePro = np.sqrt(X.YearsCodePro)
X.PreviousSalary = np.sqrt(X.PreviousSalary)
X.ComputerSkills = np.sqrt(X.ComputerSkills)
X.Country = pd.util.hash_pandas_object(X.Country)
X.HaveWorkedWith = pd.util.hash_pandas_object(X.HaveWorkedWith)
data = pd.read_csv('stackoverflow_full.csv')
data.info()
X = data.drop('Employed',axis=True)
y = data['Employed'].copy()
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train, X_eval, y_train, y_eval = train_test_split(X_train,y_train,test_size=0.01,random_state=42)
#Columns to process
all_cols = list(data.columns)
all_cols.remove('Unnamed: 0')
drop_cols = ['Unnamed: 0']
onehot_cols = ['Gender']
ordinal_cols = ['EdLevel']
impute_cols = ['HaveWorkedWith']
#Instantiate Transformers to process columns
func_transformer = FunctionTransformer(func= funct)
onehot_transformer = Pipeline(steps=[('onehot encode',OneHotEncoder(handle_unknown='ignore'))],verbose=True)
ordinal_transformer = Pipeline(steps=[('ordinal encode',OrdinalEncoder())],verbose=True)
impute_transformer = Pipeline(steps=[('imputing',SimpleImputer(strategy='most frequent'))],verbose=True)
scaling_transformer = Pipeline(steps=[('scaling',StandardScaler())],verbose=True)
preprocessing = ColumnTransformer(transformers=[('drop cols','drop',drop_cols),('funcT',func_transformer),('onehot',onehot_transformer,onehot_cols),('ordinal',ordinal_transformer,ordinal_cols),('impute',impute_transformer,impute_cols),('scale',scaling_transformer,all_cols)],verbose=True)
model = Pipeline(steps=[('preprocessing',preprocessing),('clustering',KMeans(n_clusters=2))],verbose=True)
model.fit_transform(X_train,y_train)