I have already spent a couple of hours trying to make it work.
Here is the code:
#Feature Engineering through Aggregation
# Define custom function
range_ = lambda x: x.max() - x.min()
range_.__name__ = 'range_'
# Group and aggregate
ind_agg = ind_variables.drop(axis = 1, labels = 'Target').groupby('idhogar').agg(['min', 'max', 'sum', 'count', 'std', range_])
ind_agg.head()
it promoted me this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[108], line 8
5 range_.__name__ = 'range_'
7 # Group and aggregate
----> 8 ind_agg = ind_variables.drop(axis = 1, labels = 'Target').groupby('idhogar').agg(['min', 'max', 'sum', 'count', 'std', range_])
9 ind_agg.head()
File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/generic.py:895, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
892 func = maybe_mangle_lambdas(func)
894 op = GroupByApply(self, func, args, kwargs)
--> 895 result = op.agg()
896 if not is_dict_like(func) and result is not None:
897 return result
File ~/anaconda3/lib/python3.11/site-packages/pandas/core/apply.py:175, in Apply.agg(self)
172 return self.agg_dict_like()
173 elif is_list_like(arg):
174 # we require a list, but not a 'str'
--> 175 return self.agg_list_like()
177 if callable(arg):
178 f = com.get_cython_func(arg)
File ~/anaconda3/lib/python3.11/site-packages/pandas/core/apply.py:401, in Apply.agg_list_like(self)
394 try:
395 # Capture and suppress any warnings emitted by us in the call
396 # to agg below, but pass through any warnings that were
397 # generated otherwise.
398 # This is necessary because of https://bugs.python.org/issue29672
399 # See GH #43741 for more details
400 with warnings.catch_warnings(record=True) as record:
--> 401 new_res = colg.aggregate(arg)
402 if len(record) > 0:
403 match = re.compile(depr_nuisance_columns_msg.format(".*"))
File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/generic.py:281, in SeriesGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
277 elif isinstance(func, abc.Iterable):
278 # Catch instances of lists / tuples
279 # but not the class list / tuple itself.
280 func = maybe_mangle_lambdas(func)
--> 281 ret = self._aggregate_multiple_funcs(func)
282 if relabeling:
283 # columns is not narrowed by mypy from relabeling flag
284 assert columns is not None # for mypy
File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/generic.py:336, in SeriesGroupBy._aggregate_multiple_funcs(self, arg)
333 for idx, (name, func) in enumerate(arg):
335 key = base.OutputKey(label=name, position=idx)
--> 336 results[key] = self.aggregate(func)
338 if any(isinstance(x, DataFrame) for x in results.values()):
339 from pandas import concat
File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/generic.py:275, in SeriesGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
272 kwargs = {}
274 if isinstance(func, str):
--> 275 return getattr(self, func)(*args, **kwargs)
277 elif isinstance(func, abc.Iterable):
278 # Catch instances of lists / tuples
279 # but not the class list / tuple itself.
280 func = maybe_mangle_lambdas(func)
File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:2282, in GroupBy.std(self, ddof, engine, engine_kwargs, numeric_only)
2273 if (
2274 numeric_only_bool
2275 and self.obj.ndim == 1
2276 and not is_numeric_dtype(self.obj.dtype)
2277 ):
2278 raise TypeError(
2279 f"{type(self).__name__}.std called with "
2280 f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
2281 )
-> 2282 result = self._get_cythonized_result(
2283 libgroupby.group_var,
2284 cython_dtype=np.dtype(np.float64),
2285 numeric_only=numeric_only_bool,
2286 needs_counts=True,
2287 post_processing=lambda vals, inference: np.sqrt(vals),
2288 ddof=ddof,
2289 )
2290 self._maybe_warn_numeric_only_depr("std", result, numeric_only)
2291 return result
File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:3866, in GroupBy._get_cythonized_result(self, base_func, cython_dtype, numeric_only, needs_counts, needs_nullable, needs_mask, pre_processing, post_processing, **kwargs)
3863 if numeric_only_bool:
3864 mgr = mgr.get_numeric_data()
-> 3866 res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
3868 if not is_ser and len(res_mgr.items) != orig_mgr_len:
3869 howstr = how.replace("group_", "")
File ~/anaconda3/lib/python3.11/site-packages/pandas/core/internals/base.py:199, in SingleDataManager.grouped_reduce(self, func, ignore_failures)
193 """
194 ignore_failures : bool, default False
195 Not used; for compatibility with ArrayManager/BlockManager.
196 """
198 arr = self.array
--> 199 res = func(arr)
200 index = default_index(len(res))
202 mgr = type(self).from_array(res, index)
File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:3826, in GroupBy._get_cythonized_result.<locals>.blk_func(values)
3823 if pre_processing:
3824 vals, inferences = pre_processing(vals)
-> 3826 vals = vals.astype(cython_dtype, copy=False)
3827 if vals.ndim == 1:
3828 vals = vals.reshape((-1, 1))
ValueError: could not convert string to float: 'ID_279628684'
I want to keep the ID in my dataset, I have already group them by “idhogar”. The idhogar is in this format: 000a08204, how could I fix this problem?
Please help. THANK YOU!