BUG(string dtype): groupby/resampler.min/max returns float on all NA strings #60985

Merged: 8 commits, Apr 19, 2025
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.3.0.rst
```diff
@@ -119,6 +119,7 @@ Conversion
 
 Strings
 ^^^^^^^
+- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` on string input of all NA values would return float dtype; now returns string (:issue:`60810`)
 - Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values resulted in ``0`` and is now the empty string ``""`` (:issue:`60229`)
 - Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
 - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
```
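A minimal reproduction of the behavior the first whatsnew entry above describes (illustrative snippet, not part of the PR; on pandas releases without the fix the result dtype falls back to float64, with it the string dtype is preserved):

```python
import pandas as pd

# An all-NA string column grouped by a key column: before the fix,
# groupby min/max returned float64 here; with it the string dtype survives.
df = pd.DataFrame(
    {"a": ["x", "x"], "b": pd.array([pd.NA, pd.NA], dtype="string")}
)
result = df.groupby("a")["b"].min()
print(result.dtype)          # string dtype on a patched pandas
print(result.isna().all())   # the values themselves remain missing either way
```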
8 changes: 7 additions & 1 deletion pandas/core/groupby/groupby.py
```diff
@@ -81,6 +81,7 @@ class providing the base-class of operations.
     is_numeric_dtype,
     is_object_dtype,
     is_scalar,
+    is_string_dtype,
     needs_i8_conversion,
     pandas_dtype,
 )
@@ -1723,8 +1724,13 @@ def _agg_py_fallback(
             # preserve the kind of exception that raised
             raise type(err)(msg) from err
 
-        if ser.dtype == object:
+        dtype = ser.dtype
+        if dtype == object:
             res_values = res_values.astype(object, copy=False)
+        elif is_string_dtype(dtype):
+            # mypy doesn't infer dtype is an ExtensionDtype
+            string_array_cls = dtype.construct_array_type()  # type: ignore[union-attr]
+            res_values = string_array_cls._from_sequence(res_values, dtype=dtype)
 
         # If we are DataFrameGroupBy and went through a SeriesGroupByPath
         # then we need to reshape
```
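The fix works by rebuilding the extension array after the Python-level fallback produces object-dtype results. A standalone sketch of that pattern, using the public `pandas.api.types.is_string_dtype` instead of the internal import (the `rebuild_result` helper name is hypothetical, not pandas API):

```python
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype


def rebuild_result(res_values, dtype):
    # Mirrors the patched branch in _agg_py_fallback: object dtype stays
    # object, while a string dtype is reconstructed via its array class so
    # the groupby result does not degrade to float64.
    if dtype == object:
        return res_values.astype(object, copy=False)
    elif is_string_dtype(dtype) and isinstance(dtype, pd.api.extensions.ExtensionDtype):
        string_array_cls = dtype.construct_array_type()
        return string_array_cls._from_sequence(res_values, dtype=dtype)
    return res_values


vals = np.array(["a", pd.NA], dtype=object)  # fallback output is object-dtype
rebuilt = rebuild_result(vals, pd.StringDtype())
print(type(rebuilt).__name__)  # a string ExtensionArray, not an object ndarray
```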
97 changes: 88 additions & 9 deletions pandas/tests/groupby/test_reductions.py
```diff
@@ -20,6 +20,7 @@
     isna,
 )
 import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
 from pandas.util import _test_decorators as td
```
```diff
@@ -956,17 +957,95 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
 
 
 @pytest.mark.parametrize("min_count", [0, 1])
-def test_string_dtype_empty_sum(string_dtype_no_object, skipna, min_count):
-    # https://github.com/pandas-dev/pandas/issues/60229
+@pytest.mark.parametrize("test_series", [True, False])
+def test_string_dtype_all_na(
+    string_dtype_no_object, reduction_func, skipna, min_count, test_series
+):
+    # https://github.com/pandas-dev/pandas/issues/60985
+    if reduction_func == "corrwith":
+        # corrwith is deprecated.
+        return
+
+    dtype = string_dtype_no_object
+
+    if reduction_func in [
+        "any",
+        "all",
+        "idxmin",
+        "idxmax",
+        "mean",
+        "median",
+        "std",
+        "var",
+    ]:
+        kwargs = {"skipna": skipna}
+    elif reduction_func in ["kurt"]:
+        kwargs = {"min_count": min_count}
+    elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]:
+        kwargs = {}
+    else:
+        kwargs = {"skipna": skipna, "min_count": min_count}
+
+    expected_dtype, expected_value = dtype, pd.NA
+    if reduction_func in ["all", "any"]:
+        expected_dtype = "bool"
+        # TODO: For skipna=False, bool(pd.NA) raises; should groupby?
```
Member:

It looks like there are a few TODOs / inconsistencies in our interface here. I think rather than branching and trying to document all of them, it would help to simplify this test and just skip/xfail the cases where things are not consistent. It may even be helpful to split this up into multiple tests that are more focused

Member Author:

> I think rather than branching and trying to document all of them, it would help to simplify this test and just skip/xfail the cases where things are not consistent.

If they added significant complexity I would agree. However the complexity added seems minimal to me, and testing the current behavior tells us when it changes. So even if it's not the final behavior we'd desire, testing it seems better than skipping or xfailing.

> It may even be helpful to split this up into multiple tests that are more focused

If there was a good way of doing this while ensuring we are going through all the reduction funcs, I'd definitely be on board. However to my knowledge there is not.

Member:

It's challenging in the current state because it's near impossible to tell what this test is trying to say about the expected behavior of things. Perhaps I am misreading - how would you summarize this unit test?

Member Author:

> how would you summarize this unit test?

Testing groupby on input of string dtype where all values are NA. But I fear I might be misunderstanding the question.

Member Author:

@WillAyd - friendly ping.

WillAyd (Member), Apr 14, 2025:

I still think this needs to be split into multiple tests and/or simplified so that it remains a unit test(s)

Member Author:

- I would be happy to do so if there is a way to ensure it runs across all reducers, but do not see a way to make that happen.
- A large number of bugs I've worked on over the years existed because so much of our test suite only checks one or a couple of ops, rather than all of them.
- This is a common pattern in our tests and by no means the worst offender, e.g.:
```python
def test_empty_groupby(columns, keys, values, method, op, dropna, using_infer_string):
    # GH8093 & GH26411
    override_dtype = None

    if isinstance(values, BooleanArray) and op in ["sum", "prod"]:
        # We expect to get Int64 back for these
        override_dtype = "Int64"

    if isinstance(values[0], bool) and op in ("prod", "sum"):
        # sum/product of bools is an integer
        override_dtype = "int64"

    df = DataFrame({"A": values, "B": values, "C": values}, columns=list("ABC"))

    if hasattr(values, "dtype"):
        # check that we did the construction right
        assert (df.dtypes == values.dtype).all()

    df = df.iloc[:0]

    gb = df.groupby(keys, group_keys=False, dropna=dropna, observed=False)[columns]

    def get_result(**kwargs):
        if method == "attr":
            return getattr(gb, op)(**kwargs)
        else:
            return getattr(gb, method)(op, **kwargs)

    def get_categorical_invalid_expected():
        # Categorical is special without 'observed=True', we get an NaN entry
        # corresponding to the unobserved group. If we passed observed=True
        # to groupby, expected would just be 'df.set_index(keys)[columns]'
        # as below
        lev = Categorical([0], dtype=values.dtype)
        if len(keys) != 1:
            idx = MultiIndex.from_product([lev, lev], names=keys)
        else:
            # all columns are dropped, but we end up with one row
            # Categorical is special without 'observed=True'
            idx = Index(lev, name=keys[0])

        if using_infer_string:
            columns = Index([], dtype="str")
        else:
            columns = []
        expected = DataFrame([], columns=columns, index=idx)
        return expected

    is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
    is_dt64 = df.dtypes.iloc[0].kind == "M"
    is_cat = isinstance(values, Categorical)
    is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype)

    if (
        isinstance(values, Categorical)
        and not values.ordered
        and op in ["min", "max", "idxmin", "idxmax"]
    ):
        if op in ["min", "max"]:
            msg = f"Cannot perform {op} with non-ordered Categorical"
            klass = TypeError
        else:
            msg = f"Can't get {op} of an empty group due to unobserved categories"
            klass = ValueError
        with pytest.raises(klass, match=msg):
            get_result()

        if op in ["min", "max", "idxmin", "idxmax"] and isinstance(columns, list):
            # i.e. DataframeGroupBy, not SeriesGroupBy
            result = get_result(numeric_only=True)
            expected = get_categorical_invalid_expected()
            tm.assert_equal(result, expected)
        return

    if op in ["prod", "sum", "skew", "kurt"]:
        # ops that require more than just ordered-ness
        if is_dt64 or is_cat or is_per or (is_str and op != "sum"):
            # GH#41291
            # datetime64 -> prod and sum are invalid
            if is_dt64:
                msg = "datetime64 type does not support"
            elif is_per:
                msg = "Period type does not support"
            elif is_str:
                msg = f"dtype 'str' does not support operation '{op}'"
            else:
                msg = "category type does not support"
            if op in ["skew", "kurt"]:
                msg = "|".join([msg, f"does not support operation '{op}'"])
            with pytest.raises(TypeError, match=msg):
                get_result()

            if not isinstance(columns, list):
                # i.e. SeriesGroupBy
                return
            elif op in ["skew", "kurt"]:
                # TODO: test the numeric_only=True case
                return
            else:
                # i.e. op in ["prod", "sum"]:
                # i.e. DataFrameGroupBy
                # ops that require more than just ordered-ness
                # GH#41291
                result = get_result(numeric_only=True)

                # with numeric_only=True, these are dropped, and we get
                # an empty DataFrame back
                expected = df.set_index(keys)[[]]
                if is_cat:
                    expected = get_categorical_invalid_expected()
                tm.assert_equal(result, expected)
                return

    result = get_result()
    expected = df.set_index(keys)[columns]
    if op in ["idxmax", "idxmin"]:
        expected = expected.astype(df.index.dtype)
    if override_dtype is not None:
        expected = expected.astype(override_dtype)
    if len(keys) == 1:
        expected.index.name = keys[0]
    tm.assert_equal(result, expected)
```

Member Author:

@WillAyd - friendly ping

Member:

I still have the same feedback as before on making this look more like a unit test. If you don't want to do that it's no problem, but probably worth pinging someone else to review then.

Member:

I think ideally we should have had some EA tests like BaseReduceTests but for groupby, which would allow us to have a more "standard" way to cover this; but although this test appears onerous I'm fine having it like this for now so as not to block this PR.
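The pattern the author is defending can be sketched in a few lines: push one all-NA string scenario through a set of reducers in a single loop so no op is silently skipped (the op list here is illustrative, not pandas' full reduction set, and the dtype assertions for min/max depend on the pandas version, so only version-stable results are checked):

```python
import pandas as pd

# One all-NA string group, many reducers: record what each op returns.
df = pd.DataFrame({"a": ["x"], "b": pd.array([pd.NA], dtype="string")})
gb = df.groupby("a")["b"]

results = {}
for op in ["min", "max", "count", "size", "nunique"]:
    results[op] = getattr(gb, op)()

# count/size/nunique have version-independent values on an all-NA group.
print({op: res.iloc[0] for op, res in results.items()})
```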

```diff
+        expected_value = not skipna if reduction_func == "any" else True
+    elif reduction_func in ["count", "nunique", "size"]:
+        # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA?
+        if (
+            test_series
+            and reduction_func == "size"
+            and dtype.storage == "pyarrow"
+            and dtype.na_value is pd.NA
+        ):
+            expected_dtype = "Int64"
+        else:
+            expected_dtype = "int64"
+        expected_value = 1 if reduction_func == "size" else 0
+    elif reduction_func in ["idxmin", "idxmax"]:
+        expected_dtype, expected_value = "float64", np.nan
+    elif not skipna or min_count > 0:
+        expected_value = pd.NA
+    elif reduction_func == "sum":
+        # https://github.com/pandas-dev/pandas/pull/60936
+        expected_value = ""
+
     df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
-    gb = df.groupby("a")
-    result = gb.sum(skipna=skipna, min_count=min_count)
-    value = "" if skipna and min_count == 0 else pd.NA
-    expected = DataFrame(
-        {"b": value}, index=pd.Index(["x"], name="a", dtype=dtype), dtype=dtype
-    )
-    tm.assert_frame_equal(result, expected)
+    obj = df["b"] if test_series else df
+    args = get_groupby_method_args(reduction_func, obj)
+    gb = obj.groupby(df["a"])
+    method = getattr(gb, reduction_func)
+
+    if reduction_func in [
+        "mean",
+        "median",
+        "kurt",
+        "prod",
+        "quantile",
+        "sem",
+        "skew",
+        "std",
+        "var",
+    ]:
+        msg = f"dtype '{dtype}' does not support operation '{reduction_func}'"
+        with pytest.raises(TypeError, match=msg):
+            method(*args, **kwargs)
+        return
+    elif reduction_func in ["idxmin", "idxmax"] and not skipna:
+        msg = f"{reduction_func} with skipna=False encountered an NA value."
+        with pytest.raises(ValueError, match=msg):
+            method(*args, **kwargs)
+        return
+
+    result = method(*args, **kwargs)
+    index = pd.Index(["x"], name="a", dtype=dtype)
+    if test_series or reduction_func == "size":
+        name = None if not test_series and reduction_func == "size" else "b"
+        expected = Series(expected_value, index=index, dtype=expected_dtype, name=name)
+    else:
+        expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype)
+    tm.assert_equal(result, expected)
 
 
 def test_max_nan_bug():
```
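The TODO in the new test, whether groupby `any`/`all` with `skipna=False` should raise, traces back to `pd.NA` having no defined truth value, while masked boolean reductions are well-defined; a quick standalone illustration:

```python
import pandas as pd

# pd.NA deliberately has no truth value, so bool() raises.
try:
    bool(pd.NA)
except TypeError as err:
    print(err)  # "boolean value of NA is ambiguous"

# Masked boolean reductions, by contrast, are defined: with the default
# skipna=True, NA values are dropped, so an all-NA array reduces like an
# empty one.
arr = pd.array([pd.NA], dtype="boolean")
print(arr.any())  # False
print(arr.all())  # True (vacuously, like all([]))
```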
3 changes: 0 additions & 3 deletions pandas/tests/resample/test_resampler_grouper.py
```diff
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import is_platform_windows
 
 import pandas as pd
```

```diff
@@ -462,7 +460,6 @@ def test_empty(keys):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize("consolidate", [True, False])
 def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
     # https://github.com/pandas-dev/pandas/issues/39329
```