Dataframe.aggregate()

I have already spent a couple of hours trying to make it work.
Here is the code:

#Feature Engineering through Aggregation

# Define custom function
range_ = lambda x: x.max() - x.min()  # spread of x: largest value minus smallest
range_.__name__ = 'range_'  # presumably so agg() labels this column 'range_' rather than '<lambda>' -- confirm

# Group and aggregate
# NOTE(review): each aggregation in the list is applied to every non-key column
# remaining after 'Target' is dropped, including any string-valued column; the
# traceback below shows 'std' failing when it tries to cast such a column to float.
ind_agg = ind_variables.drop(axis = 1, labels = 'Target').groupby('idhogar').agg(['min', 'max', 'sum', 'count', 'std', range_])
ind_agg.head()

It gave me this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[108], line 8
      5 range_.__name__ = 'range_'
      7 # Group and aggregate
----> 8 ind_agg = ind_variables.drop(axis = 1, labels = 'Target').groupby('idhogar').agg(['min', 'max', 'sum', 'count', 'std', range_])
      9 ind_agg.head()

File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/generic.py:895, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    892 func = maybe_mangle_lambdas(func)
    894 op = GroupByApply(self, func, args, kwargs)
--> 895 result = op.agg()
    896 if not is_dict_like(func) and result is not None:
    897     return result

File ~/anaconda3/lib/python3.11/site-packages/pandas/core/apply.py:175, in Apply.agg(self)
    172     return self.agg_dict_like()
    173 elif is_list_like(arg):
    174     # we require a list, but not a 'str'
--> 175     return self.agg_list_like()
    177 if callable(arg):
    178     f = com.get_cython_func(arg)

File ~/anaconda3/lib/python3.11/site-packages/pandas/core/apply.py:401, in Apply.agg_list_like(self)
    394 try:
    395     # Capture and suppress any warnings emitted by us in the call
    396     # to agg below, but pass through any warnings that were
    397     # generated otherwise.
    398     # This is necessary because of https://bugs.python.org/issue29672
    399     # See GH #43741 for more details
    400     with warnings.catch_warnings(record=True) as record:
--> 401         new_res = colg.aggregate(arg)
    402     if len(record) > 0:
    403         match = re.compile(depr_nuisance_columns_msg.format(".*"))

File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/generic.py:281, in SeriesGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    277 elif isinstance(func, abc.Iterable):
    278     # Catch instances of lists / tuples
    279     # but not the class list / tuple itself.
    280     func = maybe_mangle_lambdas(func)
--> 281     ret = self._aggregate_multiple_funcs(func)
    282     if relabeling:
    283         # columns is not narrowed by mypy from relabeling flag
    284         assert columns is not None  # for mypy

File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/generic.py:336, in SeriesGroupBy._aggregate_multiple_funcs(self, arg)
    333 for idx, (name, func) in enumerate(arg):
    335     key = base.OutputKey(label=name, position=idx)
--> 336     results[key] = self.aggregate(func)
    338 if any(isinstance(x, DataFrame) for x in results.values()):
    339     from pandas import concat

File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/generic.py:275, in SeriesGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    272     kwargs = {}
    274 if isinstance(func, str):
--> 275     return getattr(self, func)(*args, **kwargs)
    277 elif isinstance(func, abc.Iterable):
    278     # Catch instances of lists / tuples
    279     # but not the class list / tuple itself.
    280     func = maybe_mangle_lambdas(func)

File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:2282, in GroupBy.std(self, ddof, engine, engine_kwargs, numeric_only)
   2273 if (
   2274     numeric_only_bool
   2275     and self.obj.ndim == 1
   2276     and not is_numeric_dtype(self.obj.dtype)
   2277 ):
   2278     raise TypeError(
   2279         f"{type(self).__name__}.std called with "
   2280         f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
   2281     )
-> 2282 result = self._get_cythonized_result(
   2283     libgroupby.group_var,
   2284     cython_dtype=np.dtype(np.float64),
   2285     numeric_only=numeric_only_bool,
   2286     needs_counts=True,
   2287     post_processing=lambda vals, inference: np.sqrt(vals),
   2288     ddof=ddof,
   2289 )
   2290 self._maybe_warn_numeric_only_depr("std", result, numeric_only)
   2291 return result

File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:3866, in GroupBy._get_cythonized_result(self, base_func, cython_dtype, numeric_only, needs_counts, needs_nullable, needs_mask, pre_processing, post_processing, **kwargs)
   3863 if numeric_only_bool:
   3864     mgr = mgr.get_numeric_data()
-> 3866 res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
   3868 if not is_ser and len(res_mgr.items) != orig_mgr_len:
   3869     howstr = how.replace("group_", "")

File ~/anaconda3/lib/python3.11/site-packages/pandas/core/internals/base.py:199, in SingleDataManager.grouped_reduce(self, func, ignore_failures)
    193 """
    194 ignore_failures : bool, default False
    195     Not used; for compatibility with ArrayManager/BlockManager.
    196 """
    198 arr = self.array
--> 199 res = func(arr)
    200 index = default_index(len(res))
    202 mgr = type(self).from_array(res, index)

File ~/anaconda3/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:3826, in GroupBy._get_cythonized_result.<locals>.blk_func(values)
   3823 if pre_processing:
   3824     vals, inferences = pre_processing(vals)
-> 3826 vals = vals.astype(cython_dtype, copy=False)
   3827 if vals.ndim == 1:
   3828     vals = vals.reshape((-1, 1))

ValueError: could not convert string to float: 'ID_279628684'

I want to keep the ID in my dataset, and I have already grouped the rows by “idhogar”. The idhogar values are in this format: 000a08204. How can I fix this problem?

Please help. THANK YOU!

For a properly named function, we recommend using def instead of lambda (PEP8).

def range_(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` must expose ``max()`` and ``min()`` methods whose results can be
    subtracted from one another.
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

I don’t understand the question. If idhogar is a hex string, you can convert it to an integer with int('000a08204', 16) (10519044). What does this have to do with the ValueError and an unconvertible string like the one given ('ID_279628684')?

The error message:

ValueError: could not convert string to float: 'ID_279628684'

I think the only variable containing a string in the dataset is the Id.

I wonder if I need to remove the Id, but I want to keep that in the ind_variables dataset because I will use it later.

I am unsure why the program threw me that error message, and I wonder if my coding is wrong.

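Because you asked to aggregate groups of rows by computing, for every column, statistics such as the standard deviation and the difference between the maximum and minimum value in the group; and there is a column that contains strings (the traceback shows the failure inside 'std', while converting 'ID_279628684' to float). How should the result be calculated for that column?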

Thanks for replying. I ended up not using it, so the problem is solved. I sometimes don't understand error messages, especially when they print out a wall of text, which is quite daunting. But I am getting used to them now, and I will try different things or search online to see whether I can solve the problem or do it another way.

Please disregard the post.