feat(dataframe): update group_by to accept None and normalize to empty list

kosiew · kosiew · commit ae04178a26d0 · 2026-06-06T13:03:27.000+08:00
- Updated the `group_by` parameter of `DataFrame.aggregate` to accept `None`, which is normalized to an empty list.
- Updated the `aggregate` docstring to document `None` and to use it in the no-grouping example.
- Added regression test in `test_dataframe.py` to verify that `group_by=None` produces the same result as an empty list.
- Updated documentation to mention that `group_by=None` is now supported.
diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst
@@ -41,8 +41,8 @@ to form a single summary value. For performing an aggregation, DataFusion provid
         f.approx_median(col_speed).alias("Median Speed"),
         f.approx_percentile_cont(col_speed, 0.9).alias("90% Speed")])
 
-When the :code:`group_by` list is empty the aggregation is done over the whole :class:`.DataFrame`.
-For grouping the :code:`group_by` list must contain at least one column.
+When :code:`group_by` is :code:`None` or an empty list, the aggregation is done over the whole
+:class:`.DataFrame`. For grouping the :code:`group_by` list must contain at least one column.
 
 .. ipython:: python
 
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -798,7 +798,7 @@ def with_column_renamed(self, old_name: str, new_name: str) -> DataFrame:
 
     def aggregate(
         self,
-        group_by: Sequence[Expr | str] | Expr | str,
+        group_by: Sequence[Expr | str] | Expr | str | None,
         aggs: Sequence[Expr] | Expr,
     ) -> DataFrame:
         """Aggregates the rows of the current DataFrame.
@@ -816,23 +816,24 @@ def aggregate(
 
         Args:
             group_by: Sequence of expressions or column names to group
-                by. A :py:class:`~datafusion.expr.GroupingSet`
-                expression may be included to produce multiple grouping
-                levels (rollup, cube, or explicit grouping sets).
+                by, or ``None`` for aggregation over the whole DataFrame.
+                A :py:class:`~datafusion.expr.GroupingSet` expression may
+                be included to produce multiple grouping levels (rollup,
+                cube, or explicit grouping sets).
             aggs: Sequence of expressions to aggregate.
 
         Returns:
             DataFrame after aggregation.
 
         Examples:
-            Aggregate without grouping — an empty ``group_by`` produces a
-            single row:
+            Aggregate without grouping — ``None`` or an empty ``group_by``
+            produces a single row:
 
             >>> ctx = dfn.SessionContext()
             >>> df = ctx.from_pydict(
             ...     {"team": ["x", "x", "y"], "score": [1, 2, 5]}
             ... )
-            >>> df.aggregate([], [F.sum(col("score")).alias("total")]).to_pydict()
+            >>> df.aggregate(None, [F.sum(col("score")).alias("total")]).to_pydict()
             {'total': [8]}
 
             Group by a column and produce one row per group:
@@ -842,11 +843,15 @@ def aggregate(
             ... ).sort("team").to_pydict()
             {'team': ['x', 'y'], 'total': [3, 5]}
         """
-        group_by_list = (
-            list(group_by)
-            if isinstance(group_by, Sequence) and not isinstance(group_by, Expr | str)
-            else [group_by]
-        )
+        if group_by is None:
+            group_by_list = []
+        else:
+            group_by_list = (
+                list(group_by)
+                if isinstance(group_by, Sequence)
+                and not isinstance(group_by, Expr | str)
+                else [group_by]
+            )
         aggs_list = (
             list(aggs)
             if isinstance(aggs, Sequence) and not isinstance(aggs, Expr)
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -475,6 +475,12 @@ def test_aggregate_tuple_group_by(df):
     assert result_tuple == result_list
 
 
+def test_aggregate_none_group_by_equivalent_to_empty_list(df):
+    result_none = df.aggregate(None, [f.count()]).to_pydict()
+    result_empty = df.aggregate([], [f.count()]).to_pydict()
+    assert result_none == result_empty
+
+
 def test_aggregate_tuple_aggs(df):
     result_list = df.aggregate("a", [f.count()]).sort("a").to_pydict()
     result_tuple = df.aggregate("a", (f.count(),)).sort("a").to_pydict()