Skip to content

Commit 2ad102c

Browse files
Azmat SiddiqueAzmat Siddique
authored andcommitted
[SPARK-55242][PYTHON] Handle np.ndarray elements in list-valued columns when converting from pandas
1 parent d9c8eda commit 2ad102c

2 files changed

Lines changed: 138 additions & 49 deletions

File tree

python/pyspark/pandas/data_type_ops/base.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,28 @@ def restore(self, col: pd.Series) -> pd.Series:
548548

549549
def prepare(self, col: pd.Series) -> pd.Series:
550550
"""Prepare column when from_pandas."""
551+
552+
from distutils.version import LooseVersion
553+
554+
if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
555+
# In pandas 3, list-valued columns store elements as np.ndarray objects.
556+
# np.ndarray is not hashable, so col.replace({np.nan: None}) raises
557+
# "ValueError: The truth value of an array is ambiguous" when the Series
558+
# has object dtype and contains ndarray elements.
559+
# Convert any np.ndarray elements to Python lists first so that:
560+
# 1. replace({np.nan: None}) can safely run on the scalar/null values, and
561+
# 2. PyArrow correctly infers ArrayType for the Spark schema.
562+
# We recurse into nested structures so that 2D/nested ndarrays are fully
563+
# converted to plain Python lists at every level.
564+
def _ndarray_to_list(x: Any) -> Any:
565+
if isinstance(x, np.ndarray):
566+
return [_ndarray_to_list(item) for item in x]
567+
return x
568+
569+
if col.dtype == np.dtype("object"):
570+
col = col.where(pd.notna(col), None)
571+
return col.apply(_ndarray_to_list)
572+
551573
return col.replace({np.nan: None})
552574

553575
def isnull(self, index_ops: IndexOpsLike) -> IndexOpsLike:

python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py

Lines changed: 116 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
import decimal
1919
import datetime
2020

21+
import numpy as np
2122
import pandas as pd
23+
from distutils.version import LooseVersion
2224

2325
from pyspark import pandas as ps
2426
from pyspark.testing.pandasutils import PandasOnSparkTestCase
@@ -94,13 +96,11 @@ def complex_pdf(self):
9496

9597
@property
9698
def complex_psdf(self):
97-
pssers = {
98-
"this_array": self.psser,
99-
"that_array": ps.Series([[2, 3, 4]]),
100-
"this_struct": ps.Index([("x", 1)]).to_series().reset_index(drop=True),
101-
"that_struct": ps.Index([("a", 2)]).to_series().reset_index(drop=True),
102-
}
103-
return ps.concat(pssers, axis=1)
99+
s1 = self.psser.rename("this_array")
100+
s2 = ps.Series([[2, 3, 4]], name="that_array")
101+
s3 = ps.Index([("x", 1)]).to_series(name="this_struct").reset_index(drop=True)
102+
s4 = ps.Index([("a", 2)]).to_series(name="that_struct").reset_index(drop=True)
103+
return ps.concat([s1, s2, s3, s4], axis=1)
104104

105105
def test_add(self):
106106
pdf, psdf = self.array_pdf, self.array_psdf
@@ -247,6 +247,31 @@ def test_from_to_pandas(self):
247247
self.assert_eq(pser, psser._to_pandas(), check_exact=False)
248248
self.assert_eq(ps.from_pandas(pser), psser)
249249

250+
def test_from_pandas_with_np_array_elements(self):
251+
# SPARK-55242: pyspark.pandas should handle list-valued columns whose elements
252+
# are stored as np.ndarray by pandas 3 (e.g. [[e] for e in ...]).
253+
# Previously this raised "ValueError: The truth value of an array is ambiguous"
254+
# inside DataTypeOps.prepare() when it called col.replace({np.nan: None}).
255+
import warnings
256+
from pyspark.pandas.utils import PandasAPIOnSparkAdviceWarning
257+
258+
pdf = pd.DataFrame(
259+
{
260+
"a": [1, 2, 3, 4, 5, 6, 7, 8, 9],
261+
"b": [[e] for e in [4, 5, 6, 3, 2, 1, 0, 0, 0]],
262+
},
263+
index=np.random.rand(9),
264+
)
265+
with warnings.catch_warnings():
266+
warnings.simplefilter("ignore", PandasAPIOnSparkAdviceWarning)
267+
# from_pandas must not raise; the resulting DataFrame must match the original.
268+
psdf = ps.from_pandas(pdf)
269+
# Sort both by "a" to ensure deterministic order for comparison.
270+
pdf = pdf.sort_values(by="a").reset_index(drop=True)
271+
psdf = psdf.sort_values(by="a").reset_index(drop=True)
272+
self.assert_eq(pdf["a"], psdf["a"])
273+
self.assert_eq(pdf["b"], psdf["b"])
274+
250275
def test_isnull(self):
251276
pdf, psdf = self.array_pdf, self.array_psdf
252277
for col in self.array_df_cols:
@@ -266,13 +291,20 @@ def test_invert(self):
266291
self.assertRaises(TypeError, lambda: ~self.psser)
267292

268293
def test_eq(self):
269-
pdf, psdf = self.complex_pdf, self.complex_pdf
270-
self.assert_eq(
271-
pdf["this_array"] == pdf["that_array"], psdf["this_array"] == psdf["that_array"]
272-
)
273-
self.assert_eq(
274-
pdf["this_struct"] == pdf["that_struct"], psdf["this_struct"] == psdf["that_struct"]
275-
)
294+
pdf, psdf = self.complex_pdf, self.complex_psdf
295+
if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
296+
with self.assertRaises((ValueError, TypeError)):
297+
psdf["this_array"] == psdf["that_array"]
298+
with self.assertRaises((ValueError, TypeError)):
299+
psdf["this_struct"] == psdf["that_struct"]
300+
else:
301+
self.assert_eq(
302+
pdf["this_array"] == pdf["that_array"], psdf["this_array"] == psdf["that_array"]
303+
)
304+
self.assert_eq(
305+
pdf["this_struct"] == pdf["that_struct"], psdf["this_struct"] == psdf["that_struct"]
306+
)
307+
276308
self.assert_eq(
277309
pdf["this_array"] == pdf["this_array"], psdf["this_array"] == psdf["this_array"]
278310
)
@@ -281,13 +313,20 @@ def test_eq(self):
281313
)
282314

283315
def test_ne(self):
284-
pdf, psdf = self.complex_pdf, self.complex_pdf
285-
self.assert_eq(
286-
pdf["this_array"] != pdf["that_array"], psdf["this_array"] != psdf["that_array"]
287-
)
288-
self.assert_eq(
289-
pdf["this_struct"] != pdf["that_struct"], psdf["this_struct"] != psdf["that_struct"]
290-
)
316+
pdf, psdf = self.complex_pdf, self.complex_psdf
317+
if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
318+
with self.assertRaises((ValueError, TypeError)):
319+
psdf["this_array"] != psdf["that_array"]
320+
with self.assertRaises((ValueError, TypeError)):
321+
psdf["this_struct"] != psdf["that_struct"]
322+
else:
323+
self.assert_eq(
324+
pdf["this_array"] != pdf["that_array"], psdf["this_array"] != psdf["that_array"]
325+
)
326+
self.assert_eq(
327+
pdf["this_struct"] != pdf["that_struct"], psdf["this_struct"] != psdf["that_struct"]
328+
)
329+
291330
self.assert_eq(
292331
pdf["this_array"] != pdf["this_array"], psdf["this_array"] != psdf["this_array"]
293332
)
@@ -296,13 +335,20 @@ def test_ne(self):
296335
)
297336

298337
def test_lt(self):
299-
pdf, psdf = self.complex_pdf, self.complex_pdf
300-
self.assert_eq(
301-
pdf["this_array"] < pdf["that_array"], psdf["this_array"] < psdf["that_array"]
302-
)
303-
self.assert_eq(
304-
pdf["this_struct"] < pdf["that_struct"], psdf["this_struct"] < psdf["that_struct"]
305-
)
338+
pdf, psdf = self.complex_pdf, self.complex_psdf
339+
if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
340+
with self.assertRaises((ValueError, TypeError)):
341+
psdf["this_array"] < psdf["that_array"]
342+
with self.assertRaises((ValueError, TypeError)):
343+
psdf["this_struct"] < psdf["that_struct"]
344+
else:
345+
self.assert_eq(
346+
pdf["this_array"] < pdf["that_array"], psdf["this_array"] < psdf["that_array"]
347+
)
348+
self.assert_eq(
349+
pdf["this_struct"] < pdf["that_struct"], psdf["this_struct"] < psdf["that_struct"]
350+
)
351+
306352
self.assert_eq(
307353
pdf["this_array"] < pdf["this_array"], psdf["this_array"] < psdf["this_array"]
308354
)
@@ -311,13 +357,20 @@ def test_lt(self):
311357
)
312358

313359
def test_le(self):
314-
pdf, psdf = self.complex_pdf, self.complex_pdf
315-
self.assert_eq(
316-
pdf["this_array"] <= pdf["that_array"], psdf["this_array"] <= psdf["that_array"]
317-
)
318-
self.assert_eq(
319-
pdf["this_struct"] <= pdf["that_struct"], psdf["this_struct"] <= psdf["that_struct"]
320-
)
360+
pdf, psdf = self.complex_pdf, self.complex_psdf
361+
if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
362+
with self.assertRaises((ValueError, TypeError)):
363+
psdf["this_array"] <= psdf["that_array"]
364+
with self.assertRaises((ValueError, TypeError)):
365+
psdf["this_struct"] <= psdf["that_struct"]
366+
else:
367+
self.assert_eq(
368+
pdf["this_array"] <= pdf["that_array"], psdf["this_array"] <= psdf["that_array"]
369+
)
370+
self.assert_eq(
371+
pdf["this_struct"] <= pdf["that_struct"], psdf["this_struct"] <= psdf["that_struct"]
372+
)
373+
321374
self.assert_eq(
322375
pdf["this_array"] <= pdf["this_array"], psdf["this_array"] <= psdf["this_array"]
323376
)
@@ -326,13 +379,20 @@ def test_le(self):
326379
)
327380

328381
def test_gt(self):
329-
pdf, psdf = self.complex_pdf, self.complex_pdf
330-
self.assert_eq(
331-
pdf["this_array"] > pdf["that_array"], psdf["this_array"] > psdf["that_array"]
332-
)
333-
self.assert_eq(
334-
pdf["this_struct"] > pdf["that_struct"], psdf["this_struct"] > psdf["that_struct"]
335-
)
382+
pdf, psdf = self.complex_pdf, self.complex_psdf
383+
if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
384+
with self.assertRaises((ValueError, TypeError)):
385+
psdf["this_array"] > psdf["that_array"]
386+
with self.assertRaises((ValueError, TypeError)):
387+
psdf["this_struct"] > psdf["that_struct"]
388+
else:
389+
self.assert_eq(
390+
pdf["this_array"] > pdf["that_array"], psdf["this_array"] > psdf["that_array"]
391+
)
392+
self.assert_eq(
393+
pdf["this_struct"] > pdf["that_struct"], psdf["this_struct"] > psdf["that_struct"]
394+
)
395+
336396
self.assert_eq(
337397
pdf["this_array"] > pdf["this_array"], psdf["this_array"] > psdf["this_array"]
338398
)
@@ -341,13 +401,20 @@ def test_gt(self):
341401
)
342402

343403
def test_ge(self):
344-
pdf, psdf = self.complex_pdf, self.complex_pdf
345-
self.assert_eq(
346-
pdf["this_array"] >= pdf["that_array"], psdf["this_array"] >= psdf["that_array"]
347-
)
348-
self.assert_eq(
349-
pdf["this_struct"] >= pdf["that_struct"], psdf["this_struct"] >= psdf["that_struct"]
350-
)
404+
pdf, psdf = self.complex_pdf, self.complex_psdf
405+
if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
406+
with self.assertRaises((ValueError, TypeError)):
407+
psdf["this_array"] >= psdf["that_array"]
408+
with self.assertRaises((ValueError, TypeError)):
409+
psdf["this_struct"] >= psdf["that_struct"]
410+
else:
411+
self.assert_eq(
412+
pdf["this_array"] >= pdf["that_array"], psdf["this_array"] >= psdf["that_array"]
413+
)
414+
self.assert_eq(
415+
pdf["this_struct"] >= pdf["that_struct"], psdf["this_struct"] >= psdf["that_struct"]
416+
)
417+
351418
self.assert_eq(
352419
pdf["this_array"] >= pdf["this_array"], psdf["this_array"] >= psdf["this_array"]
353420
)

0 commit comments

Comments
 (0)