Skip to content

Commit faadf8d

Browse files
Azmat Siddique
authored and committed
[SPARK-55242][PYSPARK] Handle np.ndarray elements in object-dtype columns when converting from pandas
When a pandas DataFrame contains list-valued columns (e.g. a column created via `[[e] for e in ...]`), pandas 3 stores each list element internally as a `np.ndarray` object rather than a plain Python list. The existing `DataTypeOps.prepare()` method calls: col.replace({np.nan: None}) on the pandas Series before passing it to Spark's `createDataFrame`. When the Series has dtype "object" and its elements are `np.ndarray` objects, pandas 3 raises: ValueError: The truth value of an array is ambiguous. Use a.any() or a.all() because numpy arrays cannot be compared with `==` in the way that `replace` needs. Fix: detect object-dtype columns whose non-null first element is a `np.ndarray` and convert each such element to a plain Python list via `.tolist()` before performing the NaN-to-None substitution. This also ensures PyArrow correctly infers the column type as `ArrayType` for the resulting Spark schema. ### Does this PR introduce _any_ user-facing change? No - this is a regression fix. Previously `ps.from_pandas(pdf)` with a list-valued column raised an error; after the fix it succeeds and the data round-trips correctly. ### How was this patch tested? Added `test_from_pandas_with_np_array_elements` in `pyspark/pandas/tests/data_type_ops/test_complex_ops.py`, which reproduces the exact scenario reported in SPARK-55242. Closes #SPARK-55242
1 parent 0ba9a2a commit faadf8d

2 files changed

Lines changed: 32 additions & 0 deletions

File tree

python/pyspark/pandas/data_type_ops/base.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,17 @@ def restore(self, col: pd.Series) -> pd.Series:
548548

549549
def prepare(self, col: pd.Series) -> pd.Series:
    """Prepare a pandas column for conversion to Spark via ``from_pandas``.

    Converts ``np.ndarray`` elements of an object-dtype Series to plain
    Python lists, then substitutes ``None`` for ``np.nan``, so the result
    can be handed to Spark's ``createDataFrame``.

    Parameters
    ----------
    col : pd.Series
        The pandas Series being converted.

    Returns
    -------
    pd.Series
        Series with ndarray elements replaced by lists and NaN replaced
        by ``None``.
    """
    # In pandas 3, list-valued columns store elements as np.ndarray objects.
    # col.replace({np.nan: None}) then raises "ValueError: The truth value
    # of an array is ambiguous" because replace compares elements with `==`,
    # which is elementwise (non-boolean) for ndarrays.
    # Convert ndarray elements to Python lists first so that:
    # 1. replace({np.nan: None}) can safely run on the scalar/null values, and
    # 2. PyArrow correctly infers ArrayType for the Spark schema.
    if col.dtype == np.dtype("object") and len(col) > 0:
        notnull = col[col.notnull()]
        # Scan all non-null values (short-circuiting on the first hit)
        # rather than inspecting only the first element: a mixed column
        # whose first non-null value is a scalar but which contains
        # ndarrays later would otherwise still crash in replace().
        if any(isinstance(v, np.ndarray) for v in notnull):
            col = col.map(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)
    return col.replace({np.nan: None})
552563

553564
def isnull(self, index_ops: IndexOpsLike) -> IndexOpsLike:

python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import decimal
1919
import datetime
2020

21+
import numpy as np
2122
import pandas as pd
2223

2324
from pyspark import pandas as ps
@@ -247,6 +248,26 @@ def test_from_to_pandas(self):
247248
self.assert_eq(pser, psser._to_pandas(), check_exact=False)
248249
self.assert_eq(ps.from_pandas(pser), psser)
249250

251+
def test_from_pandas_with_np_array_elements(self):
    """SPARK-55242: ``from_pandas`` must handle object columns whose
    elements are ``np.ndarray`` (pandas 3 stores list-valued columns,
    e.g. ``[[e] for e in ...]``, this way).

    Previously ``DataTypeOps.prepare()`` raised
    "ValueError: The truth value of an array is ambiguous" when it
    called ``col.replace({np.nan: None})`` on such a column.
    """
    pdf = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9],
            "b": [[e] for e in [4, 5, 6, 3, 2, 1, 0, 0, 0]],
        },
        index=np.random.rand(9),
    )
    # from_pandas must not raise; the resulting DataFrame must match the original.
    psdf = ps.from_pandas(pdf)
    self.assert_eq(pdf["a"].sort_values(), psdf["a"].sort_values())
    # Verify "b" round-trips by VALUE, not just by length: each element is a
    # 1-element list, so compare the sorted multiset of inner values.
    # (pandas-on-Spark sort_values does not support the `key` parameter,
    # so sort plain Python lists instead.)
    b_expected = sorted(v[0] for v in pdf["b"])
    b_actual = sorted(v[0] for v in psdf["b"]._to_pandas())
    self.assertEqual(b_actual, b_expected)
270+
250271
def test_isnull(self):
251272
pdf, psdf = self.array_pdf, self.array_psdf
252273
for col in self.array_df_cols:

0 commit comments

Comments
 (0)