From eb0ebb8b7067fe48cf755bd7cb255a336cdd697b Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 26 Feb 2026 09:07:37 -0800 Subject: [PATCH 1/4] [SPARK-55325][PYTHON] Introduce ArrowArrayToPandasConversion.convert_pyarrow --- python/pyspark/sql/conversion.py | 86 +++++++++++++++++++ python/pyspark/sql/tests/test_conversion.py | 94 +++++++++++++++++++++ 2 files changed, 180 insertions(+) diff --git a/python/pyspark/sql/conversion.py b/python/pyspark/sql/conversion.py index 34e047ae52d5f..992453bce2dfd 100644 --- a/python/pyspark/sql/conversion.py +++ b/python/pyspark/sql/conversion.py @@ -1750,3 +1750,89 @@ def convert_numpy( assert False, f"Need converter for {spark_type} but failed to find one." return series.rename(ser_name) + + @classmethod + def convert_pyarrow( + cls, + arr: Union["pa.Array", "pa.ChunkedArray"], + spark_type: DataType, + *, + ser_name: Optional[str] = None, + ) -> "pd.Series": + """ + Convert a PyArrow Array or ChunkedArray to a pandas Series backed by ArrowDtype. + + This is similar to :meth:`convert_numpy`, but instead of producing + numpy-backed pandas Series, it produces ArrowDtype-backed Series via + ``arr.to_pandas(types_mapper=pd.ArrowDtype)``. + + Parameters + ---------- + arr : pa.Array or pa.ChunkedArray + The Arrow column to convert. + spark_type : DataType + The target Spark type for the column to be converted to. + ser_name : str, optional + The name of returned pd.Series. If not set, will try to get it from arr._name. + + Returns + ------- + pd.Series + Converted pandas Series backed by ArrowDtype. + """ + import pyarrow as pa + import pandas as pd + + assert isinstance(arr, (pa.Array, pa.ChunkedArray)) + + if ser_name is None: + ser_name = arr._name + + arr = ArrowArrayConversion.preprocess_time(arr) + + series: pd.Series + + if isinstance( + spark_type, + ( + NullType, + BinaryType, + BooleanType, + FloatType, + DoubleType, + ByteType, + ShortType, + IntegerType, + LongType, + DecimalType, + StringType, + DateType, + TimeType, + TimestampType, + TimestampNTZType, + DayTimeIntervalType, + YearMonthIntervalType, + ), + ): + series = arr.to_pandas(types_mapper=pd.ArrowDtype) + # elif isinstance(spark_type, UserDefinedType): + # TODO: Support UserDefinedType + # elif isinstance(spark_type, VariantType): + # TODO: Support VariantType + # elif isinstance(spark_type, GeographyType): + # TODO: Support GeographyType + # elif isinstance(spark_type, GeometryType): + # TODO: Support GeometryType + # elif isinstance( + # spark_type, + # ( + # ArrayType, + # MapType, + # StructType, + # ), + # ): + # TODO: Support complex types + else: # pragma: no cover + assert False, f"Need converter for {spark_type} but failed to find one." + + return series.rename(ser_name) diff --git a/python/pyspark/sql/tests/test_conversion.py b/python/pyspark/sql/tests/test_conversion.py index c3ac461ca1d44..c993fe8bfa2e0 100644 --- a/python/pyspark/sql/tests/test_conversion.py +++ b/python/pyspark/sql/tests/test_conversion.py @@ -30,8 +30,13 @@ from pyspark.sql.types import ( ArrayType, BinaryType, + BooleanType, + ByteType, + DateType, + DayTimeIntervalType, DecimalType, DoubleType, + FloatType, GeographyType, GeometryType, IntegerType, @@ -39,13 +44,17 @@ MapType, NullType, Row, + ShortType, StringType, StructField, StructType, + TimeType, + TimestampNTZType, TimestampType, UserDefinedType, VariantType, VariantVal, + YearMonthIntervalType, ) from pyspark.testing.objects import ExamplePoint, ExamplePointUDT, PythonOnlyPoint, PythonOnlyUDT from pyspark.testing.utils import ( @@ -656,6 +665,91 @@ def test_variant_convert_numpy(self): ) self.assertEqual(len(result), 0) + def test_convert_pyarrow(self): + import pyarrow as pa + import pandas as pd + + from decimal import Decimal + + # Cases where input data equals expected output + cases = [ + ([None, None], pa.null(), NullType()), + ([b"\x01", None], pa.binary(), BinaryType()), + ([True, None, False], pa.bool_(), BooleanType()), + ([1.0, None], pa.float32(), FloatType()), + ([1.0, None], pa.float64(), DoubleType()), + ([1, None, 3], pa.int8(), ByteType()), + ([1, None, 3], pa.int16(), ShortType()), + ([1, None, 3], pa.int32(), IntegerType()), + ([1, None, 3], pa.int64(), LongType()), + ([Decimal("1.23"), None], pa.decimal128(10, 2), DecimalType(10, 2)), + (["a", None, "c"], pa.string(), StringType()), + ([1, None], pa.int32(), YearMonthIntervalType()), + ] + for data, arrow_type, spark_type in cases: + arr = pa.array(data, type=arrow_type) + result = ArrowArrayToPandasConversion.convert_pyarrow(arr, spark_type) + self.assertIsInstance(result.dtype, pd.ArrowDtype, f"Failed for {spark_type}") + for i, val in enumerate(data): + msg = f"Failed for {spark_type} at index {i}: expected {val}, got {result.iloc[i]}" + if val is None: + self.assertTrue(pd.isna(result.iloc[i]), msg) + else: + self.assertEqual(result.iloc[i], val, msg) + + def test_convert_pyarrow_temporal(self): + import pyarrow as pa + import pandas as pd + + cases = [ + ([1, None], pa.date32(), DateType(), [datetime.date(1970, 1, 2), None]), + ([1000000, None], pa.time64("us"), TimeType(), [datetime.time(0, 0, 1), None]), + ( + [1000000, None], + pa.timestamp("us", tz="UTC"), + TimestampType(), + [datetime.datetime(1970, 1, 1, 0, 0, 1), None], + ), + ( + [1000000, None], + pa.timestamp("us"), + TimestampNTZType(), + [datetime.datetime(1970, 1, 1, 0, 0, 1), None], + ), + ( + [1000000, None], + pa.duration("us"), + DayTimeIntervalType(), + [datetime.timedelta(seconds=1), None], + ), + ] + for data, arrow_type, spark_type, expected in cases: + arr = pa.array(data, type=arrow_type) + result = ArrowArrayToPandasConversion.convert_pyarrow(arr, spark_type) + self.assertIsInstance(result.dtype, pd.ArrowDtype, f"Failed for {spark_type}") + for i, exp in enumerate(expected): + msg = f"Failed for {spark_type} at index {i}: expected {exp}, got {result.iloc[i]}" + if exp is None: + self.assertTrue(pd.isna(result.iloc[i]), msg) + else: + self.assertEqual(result.iloc[i], exp, msg) + + def test_convert_pyarrow_ser_name(self): + import pyarrow as pa + import pandas as pd + + # explicit ser_name + arr = pa.array([1, 2, 3], type=pa.int64()) + result = ArrowArrayToPandasConversion.convert_pyarrow(arr, LongType(), ser_name="col") + self.assertEqual(result.name, "col") + self.assertIsInstance(result.dtype, pd.ArrowDtype) + + # default name from arrow array (set via RecordBatch column extraction) + batch = pa.record_batch({"my_col": [1, 2, 3]}) + arr = batch.column("my_col") + result = ArrowArrayToPandasConversion.convert_pyarrow(arr, LongType()) + self.assertEqual(result.name, "my_col") + if __name__ == "__main__": from pyspark.testing import main From ce5d30d030c219423eeb11ac74f3897efefd5277 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 9 Apr 2026 23:47:46 -0700 Subject: [PATCH 2/4] add type selection --- python/pyspark/sql/conversion.py | 18 +++++++++++++++++ python/pyspark/sql/tests/test_conversion.py | 22 +++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/python/pyspark/sql/conversion.py b/python/pyspark/sql/conversion.py index 68648050f2beb..83c7361ac6df9 100644 --- a/python/pyspark/sql/conversion.py +++ b/python/pyspark/sql/conversion.py @@ -188,6 +188,7 @@ def to_pandas( ndarray_as_list: bool = False, prefer_int_ext_dtype: bool = False, df_for_struct: bool = False, + arrow_cast_types: Optional[tuple] = None, ) -> List[Union["pd.Series", "pd.DataFrame"]]: """ Convert a RecordBatch or Table to a list of pandas Series. @@ -208,6 +209,10 @@ def to_pandas( Whether to convert integers to Pandas ExtensionDType. df_for_struct : bool If True, convert struct columns to DataFrame instead of Series. + arrow_cast_types : tuple of DataType classes, optional + If provided, columns whose Spark type matches one of these classes will be + converted via convert_pyarrow (ArrowDtype-backed). Unsupported types fall + through to convert_numpy/convert_legacy. Default is None (disabled). Returns ------- @@ -232,6 +237,7 @@ def to_pandas( ndarray_as_list=ndarray_as_list, prefer_int_ext_dtype=prefer_int_ext_dtype, df_for_struct=df_for_struct, + arrow_cast_types=arrow_cast_types, ) for i in range(batch.num_columns) ] @@ -1471,6 +1477,7 @@ def convert( ndarray_as_list: bool = False, prefer_int_ext_dtype: bool = False, df_for_struct: bool = False, + arrow_cast_types: Optional[tuple] = None, ) -> Union["pd.Series", "pd.DataFrame"]: """ Convert a PyArrow Array or ChunkedArray to a pandas Series or DataFrame. @@ -1495,6 +1502,10 @@ def convert( df_for_struct : bool, optional If True, convert struct columns to a DataFrame with columns corresponding to struct fields instead of a Series. Default is False. + arrow_cast_types : tuple of DataType classes, optional + If provided, columns whose Spark type matches one of these classes will be + converted via convert_pyarrow (ArrowDtype-backed). Unsupported types fall + through to convert_numpy/convert_legacy. Default is None (disabled). Returns ------- @@ -1502,6 +1513,13 @@ def convert( Converted pandas Series. If df_for_struct is True and the type is StructType, returns a DataFrame with columns corresponding to struct fields. """ + if arrow_cast_types is not None and isinstance(spark_type, arrow_cast_types): + return cls.convert_pyarrow( + arr, + spark_type, + ser_name=ser_name, + ) + if cls._prefer_convert_numpy(spark_type, df_for_struct): return cls.convert_numpy( arr, diff --git a/python/pyspark/sql/tests/test_conversion.py b/python/pyspark/sql/tests/test_conversion.py index d8973cba11459..7895c41dea073 100644 --- a/python/pyspark/sql/tests/test_conversion.py +++ b/python/pyspark/sql/tests/test_conversion.py @@ -802,6 +802,28 @@ def test_convert_pyarrow_ser_name(self): arr = batch.column("my_col") result = ArrowArrayToPandasConversion.convert_pyarrow(arr, LongType()) self.assertEqual(result.name, "my_col") + + def test_convert_arrow_cast_types(self): + """Test that arrow_cast_types routes matching types to convert_pyarrow.""" + import pyarrow as pa + import pandas as pd + + arr = pa.array([1, 2, 3], type=pa.int64()) + + # With arrow_cast_types including LongType: should get ArrowDtype + result = ArrowArrayToPandasConversion.convert(arr, LongType(), arrow_cast_types=(LongType,)) + self.assertIsInstance(result.dtype, pd.ArrowDtype) + + # With arrow_cast_types not including LongType: should get numpy dtype + result = ArrowArrayToPandasConversion.convert( + arr, LongType(), arrow_cast_types=(StringType,) + ) + self.assertNotIsInstance(result.dtype, pd.ArrowDtype) + + # With arrow_cast_types=None (default): should get numpy dtype + result = ArrowArrayToPandasConversion.convert(arr, LongType()) + self.assertNotIsInstance(result.dtype, pd.ArrowDtype) + def test_geography_convert_numpy(self): import pyarrow as pa From 10ff8b99ebee6667fa157a5d7ca38d38b47718ed Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 10 Apr 2026 10:42:04 -0700 Subject: [PATCH 3/4] add flag to turn on arrow --- python/pyspark/sql/conversion.py | 35 ++++++++++++--- python/pyspark/sql/pandas/conversion.py | 43 +++++++++++++------ python/pyspark/sql/tests/test_conversion.py | 40 ++++++++++++++--- .../apache/spark/sql/internal/SQLConf.scala | 14 ++++++ 4 files changed, 106 insertions(+), 26 deletions(-) diff --git a/python/pyspark/sql/conversion.py b/python/pyspark/sql/conversion.py index 83c7361ac6df9..66af57533f9c9 100644 --- a/python/pyspark/sql/conversion.py +++ b/python/pyspark/sql/conversion.py @@ -188,7 +188,7 @@ def to_pandas( ndarray_as_list: bool = False, prefer_int_ext_dtype: bool = False, df_for_struct: bool = False, - arrow_cast_types: Optional[tuple] = None, + arrow_dtype_types: Optional[tuple] = None, ) -> List[Union["pd.Series", "pd.DataFrame"]]: """ Convert a RecordBatch or Table to a list of pandas Series. @@ -209,7 +209,7 @@ def to_pandas( Whether to convert integers to Pandas ExtensionDType. df_for_struct : bool If True, convert struct columns to DataFrame instead of Series. - arrow_cast_types : tuple of DataType classes, optional + arrow_dtype_types : tuple of DataType classes, optional If provided, columns whose Spark type matches one of these classes will be converted via convert_pyarrow (ArrowDtype-backed). Unsupported types fall through to convert_numpy/convert_legacy. Default is None (disabled). @@ -237,7 +237,7 @@ def to_pandas( ndarray_as_list=ndarray_as_list, prefer_int_ext_dtype=prefer_int_ext_dtype, df_for_struct=df_for_struct, - arrow_cast_types=arrow_cast_types, + arrow_dtype_types=arrow_dtype_types, ) for i in range(batch.num_columns) ] @@ -1465,6 +1465,29 @@ class ArrowArrayToPandasConversion: where Arrow data needs to be converted to pandas for Python UDF processing. """ + # Types supported by convert_pyarrow (ArrowDtype-backed pandas Series). + # This tuple controls which types are routed to the pyarrow path when + # arrow_cast is enabled. Expand as more types are supported. + ARROW_DTYPE_TYPES = ( + NullType, + BinaryType, + BooleanType, + FloatType, + DoubleType, + ByteType, + ShortType, + IntegerType, + LongType, + DecimalType, + StringType, + DateType, + TimeType, + TimestampType, + TimestampNTZType, + DayTimeIntervalType, + YearMonthIntervalType, + ) + @classmethod def convert( cls, @@ -1477,7 +1500,7 @@ def convert( ndarray_as_list: bool = False, prefer_int_ext_dtype: bool = False, df_for_struct: bool = False, - arrow_cast_types: Optional[tuple] = None, + arrow_dtype_types: Optional[tuple] = None, ) -> Union["pd.Series", "pd.DataFrame"]: """ Convert a PyArrow Array or ChunkedArray to a pandas Series or DataFrame. @@ -1502,7 +1525,7 @@ def convert( df_for_struct : bool, optional If True, convert struct columns to a DataFrame with columns corresponding to struct fields instead of a Series. Default is False. - arrow_cast_types : tuple of DataType classes, optional + arrow_dtype_types : tuple of DataType classes, optional If provided, columns whose Spark type matches one of these classes will be converted via convert_pyarrow (ArrowDtype-backed). Unsupported types fall through to convert_numpy/convert_legacy. Default is None (disabled). @@ -1513,7 +1536,7 @@ def convert( Converted pandas Series. If df_for_struct is True and the type is StructType, returns a DataFrame with columns corresponding to struct fields. """ - if arrow_cast_types is not None and isinstance(spark_type, arrow_cast_types): + if arrow_dtype_types is not None and isinstance(spark_type, arrow_dtype_types): return cls.convert_pyarrow( arr, spark_type, diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index 3f5d68d10452e..622b48f9fd16e 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -39,6 +39,7 @@ ArrayType, MapType, TimestampType, + StructField, StructType, _has_type, DataType, @@ -184,6 +185,7 @@ def _convert_arrow_table_to_pandas( struct_handling_mode: Optional[str] = None, date_as_object: bool = False, self_destruct: bool = False, + arrow_dtype: bool = False, ) -> "PandasDataFrameLike": """ Helper function to convert Arrow table columns to a pandas DataFrame. @@ -207,6 +209,9 @@ def _convert_arrow_table_to_pandas( Whether to convert date values to Python datetime.date objects (default: False) self_destruct : bool Whether to enable memory-efficient self-destruct mode for large tables (default: False) + arrow_dtype : bool + Whether to produce ArrowDtype-backed pandas Series for supported types + (default: False) Returns ------- @@ -254,23 +259,32 @@ def _convert_arrow_table_to_pandas( error_on_duplicated_field_names = True struct_handling_mode = "dict" - # Convert arrow columns to pandas Series - column_data = (arrow_col.to_pandas(**pandas_options) for arrow_col in arrow_table.columns) + if arrow_dtype: + from pyspark.sql.conversion import ArrowArrayToPandasConversion + + arrow_dtype_types = ArrowArrayToPandasConversion.ARROW_DTYPE_TYPES + + def _convert_column(arrow_col: "pa.ChunkedArray", field: "StructField") -> "pd.Series": + if arrow_dtype and isinstance(field.dataType, arrow_dtype_types): + return ArrowArrayToPandasConversion.convert_pyarrow( + arrow_col, field.dataType, ser_name=field.name + ) + series = arrow_col.to_pandas(**pandas_options) + return _create_converter_to_pandas( + field.dataType, + field.nullable, + timezone=timezone, + struct_in_pandas=struct_handling_mode, + error_on_duplicated_field_names=error_on_duplicated_field_names, + )(series) - # Apply Spark-specific type converters to each column pdf = pd.concat( objs=cast( Sequence[pd.Series], - ( - _create_converter_to_pandas( - field.dataType, - field.nullable, - timezone=timezone, - struct_in_pandas=struct_handling_mode, - error_on_duplicated_field_names=error_on_duplicated_field_names, - )(series) - for series, field in zip(column_data, schema.fields) - ), + [ + _convert_column(arrow_table.column(i), schema.fields[i]) + for i in range(len(schema.fields)) + ], ), axis="columns", ) @@ -306,6 +320,7 @@ def _to_pandas(self, **kwargs: Any) -> "PandasDataFrameLike": arrowPySparkFallbackEnabled, arrowPySparkSelfDestructEnabled, pandasStructHandlingMode, + arrowPySparkArrowDtypeEnabled, ) = self.sparkSession._jconf.getConfs( [ "spark.sql.session.timeZone", @@ -314,6 +329,7 @@ def _to_pandas(self, **kwargs: Any) -> "PandasDataFrameLike": "spark.sql.execution.arrow.pyspark.fallback.enabled", "spark.sql.execution.arrow.pyspark.selfDestruct.enabled", "spark.sql.execution.pandas.structHandlingMode", + "spark.sql.execution.arrow.pyspark.arrowDtype.enabled", ] ) @@ -386,6 +402,7 @@ def _to_pandas(self, **kwargs: Any) -> "PandasDataFrameLike": struct_handling_mode=pandasStructHandlingMode, date_as_object=True, self_destruct=arrowPySparkSelfDestructEnabled == "true", + arrow_dtype=arrowPySparkArrowDtypeEnabled == "true", ) return pdf diff --git a/python/pyspark/sql/tests/test_conversion.py b/python/pyspark/sql/tests/test_conversion.py index 7895c41dea073..0f494e83d89c4 100644 --- a/python/pyspark/sql/tests/test_conversion.py +++ b/python/pyspark/sql/tests/test_conversion.py @@ -803,27 +803,53 @@ def test_convert_pyarrow_ser_name(self): result = ArrowArrayToPandasConversion.convert_pyarrow(arr, LongType()) self.assertEqual(result.name, "my_col") - def test_convert_arrow_cast_types(self): - """Test that arrow_cast_types routes matching types to convert_pyarrow.""" + def test_convert_arrow_dtype_types(self): + """Test that arrow_dtype_types routes matching types to convert_pyarrow.""" import pyarrow as pa import pandas as pd arr = pa.array([1, 2, 3], type=pa.int64()) - # With arrow_cast_types including LongType: should get ArrowDtype - result = ArrowArrayToPandasConversion.convert(arr, LongType(), arrow_cast_types=(LongType,)) + # With arrow_dtype_types including LongType: should get ArrowDtype + result = ArrowArrayToPandasConversion.convert( + arr, LongType(), arrow_dtype_types=(LongType,) + ) self.assertIsInstance(result.dtype, pd.ArrowDtype) - # With arrow_cast_types not including LongType: should get numpy dtype + # With arrow_dtype_types not including LongType: should get numpy dtype result = ArrowArrayToPandasConversion.convert( - arr, LongType(), arrow_cast_types=(StringType,) + arr, LongType(), arrow_dtype_types=(StringType,) ) self.assertNotIsInstance(result.dtype, pd.ArrowDtype) - # With arrow_cast_types=None (default): should get numpy dtype + # With arrow_dtype_types=None (default): should get numpy dtype result = ArrowArrayToPandasConversion.convert(arr, LongType()) self.assertNotIsInstance(result.dtype, pd.ArrowDtype) + def test_convert_arrow_table_to_pandas_arrow_dtype(self): + """Test _convert_arrow_table_to_pandas with arrow_dtype flag.""" + import pyarrow as pa + import pandas as pd + + from pyspark.sql.pandas.conversion import _convert_arrow_table_to_pandas + + table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + schema = StructType([StructField("a", LongType()), StructField("b", StringType())]) + + # arrow_dtype=False: numpy-backed + pdf_numpy = _convert_arrow_table_to_pandas(table, schema, timezone="UTC", arrow_dtype=False) + self.assertNotIsInstance(pdf_numpy["a"].dtype, pd.ArrowDtype) + self.assertNotIsInstance(pdf_numpy["b"].dtype, pd.ArrowDtype) + + # arrow_dtype=True: ArrowDtype-backed for supported types + pdf_arrow = _convert_arrow_table_to_pandas(table, schema, timezone="UTC", arrow_dtype=True) + self.assertIsInstance(pdf_arrow["a"].dtype, pd.ArrowDtype) + self.assertIsInstance(pdf_arrow["b"].dtype, pd.ArrowDtype) + + # Values should be equal + self.assertEqual(pdf_numpy["a"].tolist(), pdf_arrow["a"].tolist()) + self.assertEqual(pdf_numpy["b"].tolist(), pdf_arrow["b"].tolist()) + def test_geography_convert_numpy(self): import pyarrow as pa diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 3297e4ef99e46..26fa93ab9d070 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -4183,6 +4183,18 @@ object SQLConf { .booleanConf .createWithDefault(false) + val ARROW_PYSPARK_ARROW_DTYPE_ENABLED = + buildConf("spark.sql.execution.arrow.pyspark.arrowDtype.enabled") + .doc("(Experimental) When true, use ArrowDtype-backed pandas Series in " + + "pyspark.sql.DataFrame.toPandas for supported data types. This keeps data in Arrow " + + "format without converting to numpy, which handles nulls natively via pd.NA and " + + "avoids type coercion issues. " + + "This optimization applies to: pyspark.sql.DataFrame.toPandas " + + "when 'spark.sql.execution.arrow.pyspark.enabled' is set.") + .version("4.2.0") + .booleanConf + .createWithDefault(false) + val PYSPARK_BINARY_AS_BYTES = buildConf("spark.sql.execution.pyspark.binaryAsBytes") .doc("When true, BinaryType is consistently mapped to bytes in PySpark. " + @@ -7962,6 +7974,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def arrowPySparkSelfDestructEnabled: Boolean = getConf(ARROW_PYSPARK_SELF_DESTRUCT_ENABLED) + def arrowPySparkArrowDtypeEnabled: Boolean = getConf(ARROW_PYSPARK_ARROW_DTYPE_ENABLED) + def pysparkBinaryAsBytes: Boolean = getConf(PYSPARK_BINARY_AS_BYTES) def pysparkToJSONReturnDataFrame: Boolean = getConf(PYSPARK_TOJSON_RETURN_DATAFRAME) From e2a09ae6d216bb4e4a4c67ad0661385dda0e3990 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 10 Apr 2026 18:06:08 -0700 Subject: [PATCH 4/4] add binding policy --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 5843a4979db11..8c47af4ee88d7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -4192,6 +4192,7 @@ object SQLConf { "This optimization applies to: pyspark.sql.DataFrame.toPandas " + "when 'spark.sql.execution.arrow.pyspark.enabled' is set.") .version("4.2.0") + .withBindingPolicy(ConfigBindingPolicy.SESSION) .booleanConf .createWithDefault(false)