diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a7ebb3555007e..ec5027840dfd5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -507,7 +507,7 @@ Renamed the following offset aliases (:issue:`57986`): Other Removals ^^^^^^^^^^^^^^ -- :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`) +- :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when a group has all NA values, or when used with ``skipna=False`` and any NA value is encountered (:issue:`10694`, :issue:`57745`) - :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`) - :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`) - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 1ec4dc1ffb482..9f8ff86cbcb7e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -2048,9 +2048,8 @@ def group_idxmin_idxmax( group_min_or_max = np.empty_like(out, dtype=values.dtype) seen = np.zeros_like(out, dtype=np.uint8) - # When using transform, we need a valid value for take in the case - # a category is not observed; these values will be dropped - out[:] = 0 + # Sentinel for no valid values. 
+ out[:] = -1 with nogil(numeric_object_t is not object): for i in range(N): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b2531e2abf7f1..4b2a9e070f0c9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1404,7 +1404,15 @@ def idxmin(self, skipna: bool = True) -> Series: Raises ------ ValueError - If the Series is empty or skipna=False and any value is NA. + When there are no valid values for a group. This can happen if: + + * There is an unobserved group and ``observed=False``. + * All values for a group are NA. + * Some values for a group are NA and ``skipna=False``. + + .. versionchanged:: 3.0.0 + Previously if all values for a group are NA or some values for a group are + NA and ``skipna=False``, this method would return NA. Now it raises instead. See Also -------- @@ -1457,7 +1465,15 @@ def idxmax(self, skipna: bool = True) -> Series: Raises ------ ValueError - If the Series is empty or skipna=False and any value is NA. + When there are no valid values for a group. This can happen if: + + * There is an unobserved group and ``observed=False``. + * All values for a group are NA. + * Some values for a group are NA and ``skipna=False``. + + .. versionchanged:: 3.0.0 + Previously if all values for a group are NA or some values for a group are + NA and ``skipna=False``, this method would return NA. Now it raises instead. See Also -------- @@ -2597,7 +2613,15 @@ def idxmax( Raises ------ ValueError - * If a column is empty or skipna=False and any value is NA. + When there are no valid values for a group. This can happen if: + + * There is an unobserved group and ``observed=False``. + * All values for a group are NA. + * Some values for a group are NA and ``skipna=False``. + + .. versionchanged:: 3.0.0 + Previously if all values for a group are NA or some values for a group are + NA and ``skipna=False``, this method would return NA. Now it raises instead. 
See Also -------- @@ -2663,7 +2687,15 @@ def idxmin( Raises ------ ValueError - * If a column is empty or skipna=False and any value is NA. + When there are no valid values for a group. This can happen if: + + * There is an unobserved group and ``observed=False``. + * All values for a group are NA. + * Some values for a group are NA and ``skipna=False``. + + .. versionchanged:: 3.0.0 + Previously if all values for a group are NA or some values for a group are + NA and ``skipna=False``, this method would return NA. Now it raises instead. See Also -------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 74497ca723edb..0e9c014b48a9c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1784,7 +1784,8 @@ def array_func(values: ArrayLike) -> ArrayLike: new_mgr = data.grouped_reduce(array_func) res = self._wrap_agged_manager(new_mgr) if how in ["idxmin", "idxmax"]: - res = self._wrap_idxmax_idxmin(res) + # mypy expects how to be Literal["idxmin", "idxmax"]. + res = self._wrap_idxmax_idxmin(res, how=how, skipna=kwargs["skipna"]) # type: ignore[arg-type] out = self._wrap_aggregated_output(res) return out @@ -5715,10 +5716,17 @@ def _idxmax_idxmin( ) return result - def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: + def _wrap_idxmax_idxmin( + self, res: NDFrameT, how: Literal["idxmax", "idxmin"], skipna: bool + ) -> NDFrameT: index = self.obj.index if res.size == 0: result = res.astype(index.dtype) + elif skipna and res.lt(0).any(axis=None): + raise ValueError( + f"{type(self).__name__}.{how} with skipna=True encountered all NA " + f"values in a group." 
+ ) else: if isinstance(index, MultiIndex): index = index.to_flat_index() diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f8e92b7e2650a..1e740280cd75b 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -286,18 +286,22 @@ def __init__( self._indexer: npt.NDArray[np.intp] | None = None def _get_grouper( - self, obj: NDFrameT, validate: bool = True + self, obj: NDFrameT, validate: bool = True, observed: bool = True ) -> tuple[ops.BaseGrouper, NDFrameT]: """ Parameters ---------- obj : Series or DataFrame + Object being grouped. validate : bool, default True - if True, validate the grouper + If True, validate the grouper. + observed : bool, default True + Whether only observed groups should be in the result. Only + has an impact when grouping on categorical data. Returns ------- - a tuple of grouper, obj (possibly sorted) + A tuple of grouper, obj (possibly sorted) """ obj, _, _ = self._set_grouper(obj) grouper, _, obj = get_grouper( @@ -307,6 +311,7 @@ def _get_grouper( sort=self.sort, validate=validate, dropna=self.dropna, + observed=observed, ) return grouper, obj @@ -787,7 +792,7 @@ def get_grouper( # a passed-in Grouper, directly convert if isinstance(key, Grouper): - grouper, obj = key._get_grouper(obj, validate=False) + grouper, obj = key._get_grouper(obj, validate=False, observed=observed) if key.key is None: return grouper, frozenset(), obj else: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 08e3beef99e60..2778e6e7eef76 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2305,8 +2305,22 @@ def _get_resampler(self, obj: NDFrame) -> Resampler: ) def _get_grouper( - self, obj: NDFrameT, validate: bool = True + self, obj: NDFrameT, validate: bool = True, observed: bool = True ) -> tuple[BinGrouper, NDFrameT]: + """ + Parameters + ---------- + obj : Series or DataFrame + Object being grouped. + validate : bool, default True + Unused. 
Only for compatibility with ``Grouper._get_grouper``. + observed : bool, default True + Unused. Only for compatibility with ``Grouper._get_grouper``. + + Returns + ------- + A tuple of grouper, obj (possibly sorted) + """ # create the resampler and return our binner r = self._get_resampler(obj) return r._grouper, cast(NDFrameT, r.obj) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 2b18fd175bb10..e9527ed3a9c0e 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -272,7 +272,7 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): max_value = np.finfo(float_numpy_dtype).max df = DataFrame( { - "a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"), + "a": Series(np.repeat(range(1, 5), repeats=2), dtype="intp"), "b": Series( [ np.nan, @@ -283,8 +283,6 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): np.nan, max_value, np.nan, - np.nan, - np.nan, ], dtype=float_numpy_dtype, ), @@ -299,7 +297,7 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): return result = getattr(gb, how)(skipna=skipna) expected = DataFrame( - {"b": [1, 3, 4, 6, np.nan]}, index=pd.Index(range(1, 6), name="a", dtype="intp") + {"b": [1, 3, 4, 6]}, index=pd.Index(range(1, 5), name="a", dtype="intp") ) tm.assert_frame_equal(result, expected) @@ -1003,8 +1001,6 @@ def test_string_dtype_all_na( else: expected_dtype = "int64" expected_value = 1 if reduction_func == "size" else 0 - elif reduction_func in ["idxmin", "idxmax"]: - expected_dtype, expected_value = "float64", np.nan elif not skipna or min_count > 0: expected_value = pd.NA elif reduction_func == "sum": @@ -1032,8 +1028,11 @@ def test_string_dtype_all_na( with pytest.raises(TypeError, match=msg): method(*args, **kwargs) return - elif reduction_func in ["idxmin", "idxmax"] and not skipna: - msg = f"{reduction_func} with skipna=False encountered an NA value." 
+ elif reduction_func in ["idxmin", "idxmax"]: + if skipna: + msg = f"{reduction_func} with skipna=True encountered all NA values" + else: + msg = f"{reduction_func} with skipna=False encountered an NA value." with pytest.raises(ValueError, match=msg): method(*args, **kwargs) return