apache · lyne7-sc · Jun 9, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/native-engine/auron-planner/proto/auron.proto b/native-engine/auron-planner/proto/auron.proto
@@ -809,6 +809,8 @@ message Field {
   bool nullable = 3;
   // for complex data types like structs, unions
   repeated Field children = 4;
+  // Iceberg/Parquet field id. Zero means unset.
+  int32 field_id = 5;
 }
 
 message FixedSizeBinary {

diff --git a/native-engine/auron-planner/src/lib.rs b/native-engine/auron-planner/src/lib.rs
@@ -13,10 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::sync::Arc;
+use std::{collections::HashMap, sync::Arc};
 
 use arrow::datatypes::{DataType, Field, Fields, IntervalUnit, Schema, TimeUnit};
-use datafusion::{common::JoinSide, logical_expr::Operator, scalar::ScalarValue};
+use datafusion::{
+    common::JoinSide, logical_expr::Operator, parquet::arrow::PARQUET_FIELD_ID_META_KEY,
+    scalar::ScalarValue,
+};
 use datafusion_ext_plans::{agg::AggFunction, joins::join_utils::JoinType};
 
 use crate::error::PlanSerDeError;
@@ -406,17 +409,29 @@ impl TryInto<DataType> for &Box<protobuf::List> {
 impl TryInto<Field> for &protobuf::Field {
     type Error = PlanSerDeError;
     fn try_into(self) -> Result<Field, Self::Error> {
-        let pb_datatype = self.arrow_type.as_ref().ok_or_else(|| {
-            proto_error(
-                "Protobuf deserialization error: Field message missing required field 'arrow_type'",
-            )
-        })?;
+        build_arrow_field(self)
+    }
+}
+
+/// Converts a protobuf `Field` into an Arrow [`Field`].
+///
+/// Fails when `arrow_type` is absent. A non-zero `field_id` is stored in the field metadata
+/// under `PARQUET_FIELD_ID_META_KEY`; zero returns the field without that entry.
+fn build_arrow_field(field: &protobuf::Field) -> Result<Field, PlanSerDeError> {
+    let pb_datatype = field.arrow_type.as_ref().ok_or_else(|| {
+        proto_error(
+            "Protobuf deserialization error: Field message missing required field 'arrow_type'",
+        )
+    })?;
+    let data_type: DataType = pb_datatype.as_ref().try_into()?;
+    let arrow_field = Field::new(field.name.as_str(), data_type, field.nullable);
 
-        Ok(Field::new(
-            self.name.as_str(),
-            pb_datatype.as_ref().try_into()?,
-            self.nullable,
-        ))
+    match field.field_id {
+        0 => Ok(arrow_field),
+        field_id => Ok(arrow_field.with_metadata(HashMap::from([(
+            PARQUET_FIELD_ID_META_KEY.to_string(),
+            field_id.to_string(),
+        )]))),
     }
 }
 
@@ -427,17 +442,7 @@ impl TryInto<Schema> for &protobuf::Schema {
         let fields = self
             .columns
             .iter()
-            .map(|c| {
-                let pb_arrow_type_res = c
-                    .arrow_type
-                    .as_ref()
-                    .ok_or_else(|| proto_error("Protobuf deserialization error: Field message was missing required field 'arrow_type'"));
-                let pb_arrow_type: &protobuf::ArrowType = match pb_arrow_type_res {
-                    Ok(res) => res,
-                    Err(e) => return Err(e),
-                };
-                Ok(Field::new(&c.name, pb_arrow_type.try_into()?, c.nullable))
-            })
+            .map(build_arrow_field)
             .collect::<Result<Vec<_>, _>>()?;
         Ok(Schema::new(fields))
     }

diff --git a/native-engine/datafusion-ext-plans/src/scan/mod.rs b/native-engine/datafusion-ext-plans/src/scan/mod.rs
@@ -25,6 +25,7 @@ use datafusion::{
     datasource::schema_adapter::{
         SchemaAdapter, SchemaAdapterFactory, SchemaMapper, SchemaMapping,
     },
+    parquet::arrow::PARQUET_FIELD_ID_META_KEY,
 };
 use datafusion_ext_commons::df_execution_err;
 
@@ -57,11 +58,10 @@ impl SchemaAdapter for AuronSchemaAdapter {
     fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
         let field = self.table_schema.field(index);
 
-        // use case insensitive matching
         file_schema
             .fields()
             .iter()
-            .position(|f| f.name().eq_ignore_ascii_case(field.name()))
+            .position(|file_field| fields_match(field, file_field))
     }
 
     fn map_schema(&self, file_schema: &Schema) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
@@ -73,7 +73,7 @@ impl SchemaAdapter for AuronSchemaAdapter {
                 .table_schema
                 .fields()
                 .iter()
-                .position(|f| f.name().eq_ignore_ascii_case(file_field.name()))
+                .position(|table_field| fields_match(table_field, file_field))
             {
                 field_mappings[table_idx] = Some(projection.len());
                 projection.push(file_idx);
@@ -89,6 +89,16 @@ impl SchemaAdapter for AuronSchemaAdapter {
     }
 }
 
+/// Match by Parquet field id when both fields carry one, else by case-insensitive name.
+fn fields_match(table_field: &Field, file_field: &Field) -> bool {
+    let table_id = table_field.metadata().get(PARQUET_FIELD_ID_META_KEY);
+    let file_id = file_field.metadata().get(PARQUET_FIELD_ID_META_KEY);
+    match (table_id, file_id) {
+        (Some(table_id), Some(file_id)) => file_id == table_id,
+        _ => table_field.name().eq_ignore_ascii_case(file_field.name()),
+    }
+}
+
 pub fn create_auron_schema_mapper(
     table_schema: &SchemaRef,
     field_mappings: &[Option<usize>],

diff --git a/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeConverters.scala b/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeConverters.scala
@@ -216,18 +216,22 @@ object NativeConverters extends Logging {
     arrowTypeBuilder.build()
   }
 
-  def convertField(sparkField: StructField): pb.Field = {
-    pb.Field
+  def convertField(sparkField: StructField, fieldId: Option[Int] = None): pb.Field = {
+    val builder = pb.Field
       .newBuilder()
       .setName(sparkField.name)
       .setNullable(sparkField.nullable)
       .setArrowType(convertDataType(sparkField.dataType))
-      .build()
+    fieldId.foreach(id => builder.setFieldId(id))
+    builder.build()
   }
 
-  def convertSchema(sparkSchema: StructType): pb.Schema = {
+  def convertSchema(
+      sparkSchema: StructType,
+      fieldIdsByName: Map[String, Int] = Map.empty): pb.Schema = {
     val schemaBuilder = pb.Schema.newBuilder()
-    sparkSchema.foreach(sparkField => schemaBuilder.addColumns(convertField(sparkField)))
+    for (sparkField <- sparkSchema)
+      schemaBuilder.addColumns(convertField(sparkField, fieldIdsByName.get(sparkField.name)))
     schemaBuilder.build()
   }
 

diff --git a/...tension/src/main/scala/org/apache/spark/sql/execution/auron/plan/NativeGenerateBase.scala b/...tension/src/main/scala/org/apache/spark/sql/execution/auron/plan/NativeGenerateBase.scala
@@ -118,7 +118,7 @@ abstract class NativeGenerateBase(
   }
 
   private def nativeGeneratorOutput =
-    Util.getSchema(generatorOutput).map(NativeConverters.convertField)
+    Util.getSchema(generatorOutput).map(field => NativeConverters.convertField(field))
 
   private def nativeRequiredChildOutput =
     Util.getSchema(requiredChildOutput).map(_.name)

diff --git a/...auron-iceberg/src/main/scala/org/apache/iceberg/spark/source/AuronIcebergSourceUtil.scala b/...auron-iceberg/src/main/scala/org/apache/iceberg/spark/source/AuronIcebergSourceUtil.scala
@@ -16,13 +16,72 @@
  */
 package org.apache.iceberg.spark.source
 
+import scala.collection.JavaConverters._
+
+import org.apache.iceberg.types.TypeUtil
+
 object AuronIcebergSourceUtil {
 
+  /** Flags whether a rename or drop was seen on a top-level field and/or a nested field. */
+  final case class RenameOrDrop(topLevel: Boolean, nested: Boolean)
+
   def getClassOfSparkBatchQueryScan(): Class[SparkBatchQueryScan] = {
     classOf[SparkBatchQueryScan]
   }
 
   def getClassOfSparkInputPartition(): Class[SparkInputPartition] = {
     classOf[SparkInputPartition]
   }
+
+  /** Maps each column name of the scan's expected schema to that column's field id. */
+  def expectedFieldIds(scan: AnyRef): Map[String, Int] = {
+    val columns = asBatchQueryScan(scan).expectedSchema().columns().asScala
+    columns.map(column => column.name() -> column.fieldId()).toMap
+  }
+
+  /**
+   * Compares every non-current schema of the scanned table with the current schema, field id
+   * by field id, and reports fields whose name differs or that no longer exist.
+   */
+  def detectRenameOrDrop(scan: AnyRef): RenameOrDrop = {
+    val table = asBatchQueryScan(scan).table()
+    val currentFields = collectFieldIdToName(table.schema())
+    val historicalSchemas =
+      table.schemas().asScala.filterNot(_._1 == table.schema().schemaId()).values
+
+    historicalSchemas.foldLeft(RenameOrDrop(topLevel = false, nested = false)) {
+      (result, schema) =>
+        collectFieldIdToName(schema).foldLeft(result) {
+          case (acc, (fieldId, historical)) =>
+            // Some(isTopLevel) for a renamed or dropped field, None for an unchanged one.
+            val change = currentFields.get(fieldId) match {
+              case Some(current) if current.name != historical.name =>
+                Some(historical.topLevel || current.topLevel)
+              case None => Some(historical.topLevel)
+              case _ => None
+            }
+            change match {
+              case Some(true) => acc.copy(topLevel = true)
+              case Some(false) => acc.copy(nested = true)
+              case None => acc
+            }
+        }
+    }
+  }
+
+  /** A field's name together with whether it is one of its schema's direct columns. */
+  final private case class FieldIdentity(name: String, topLevel: Boolean)
+
+  /** Builds a lookup from field id to identity over `TypeUtil.indexById` of the schema. */
+  private def collectFieldIdToName(schema: org.apache.iceberg.Schema): Map[Int, FieldIdentity] = {
+    val topLevelFieldIds = schema.columns().asScala.map(_.fieldId()).toSet
+    val fieldsById = TypeUtil.indexById(schema.asStruct()).asScala
+    fieldsById.map { case (boxedId, field) =>
+      val fieldId = boxedId.toInt
+      fieldId -> FieldIdentity(field.name(), topLevelFieldIds.contains(fieldId))
+    }.toMap
+  }
+
+  private def asBatchQueryScan(scan: AnyRef): SparkBatchQueryScan =
+    scan.asInstanceOf[SparkBatchQueryScan]
 }
diff --git a/.../auron-iceberg/src/main/scala/org/apache/spark/sql/auron/iceberg/IcebergScanSupport.scala b/.../auron-iceberg/src/main/scala/org/apache/spark/sql/auron/iceberg/IcebergScanSupport.scala
@@ -25,6 +25,7 @@ import org.apache.iceberg.spark.source.AuronIcebergSourceUtil
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.auron.NativeConverters
 import org.apache.spark.sql.catalyst.expressions.{And => SparkAnd, AttributeReference, EqualTo, Expression => SparkExpression, GreaterThan, GreaterThanOrEqual, In, IsNaN, IsNotNull, IsNull, LessThan, LessThanOrEqual, Literal, Not => SparkNot, Or => SparkOr}
+import org.apache.spark.sql.catalyst.trees.TreeNodeTag
 import org.apache.spark.sql.connector.read.InputPartition
 import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
 import org.apache.spark.sql.internal.SQLConf
@@ -41,11 +42,24 @@ final case class IcebergScanPlan(
     readSchema: StructType,
     fileSchema: StructType,
     partitionSchema: StructType,
-    pruningPredicates: Seq[pb.PhysicalExprNode])
+    pruningPredicates: Seq[pb.PhysicalExprNode],
+    fieldIdsByName: Map[String, Int])
 
 object IcebergScanSupport extends Logging {
+  private val scanPlanTag: TreeNodeTag[Option[IcebergScanPlan]] = TreeNodeTag(
+    "auron.iceberg.scan.plan")
 
   def plan(exec: BatchScanExec): Option[IcebergScanPlan] = {
+    // Reuse the outcome already stored on this node, if any; otherwise compute it once and
+    // store it (a None outcome is stored as well).
+    exec.getTagValue(scanPlanTag).getOrElse {
+      val computed = planUncached(exec)
+      exec.setTagValue(scanPlanTag, computed)
+      computed
+    }
+  }
+
+  private def planUncached(exec: BatchScanExec): Option[IcebergScanPlan] = {
     val scan = exec.scan
     val scanClassName = scan.getClass.getName
     // Only handle Iceberg scans; other sources must stay on Spark's path.
@@ -75,6 +89,31 @@ object IcebergScanSupport extends Logging {
       partitionSchema.fields.forall(field => NativeConverters.isTypeSupported(field.dataType)),
       "Has unsupported schema type.")
 
+    val fieldIdsByName =
+      try {
+        AuronIcebergSourceUtil.expectedFieldIds(scan.asInstanceOf[AnyRef])
+      } catch {
+        case NonFatal(t) =>
+          logWarning(s"Failed to inspect Iceberg field ids for $scanClassName.", t)
+          return None
+      }
+
+    val renameOrDrop =
+      try {
+        AuronIcebergSourceUtil.detectRenameOrDrop(scan.asInstanceOf[AnyRef])
+      } catch {
+        case NonFatal(t) =>
+          logWarning(s"Failed to inspect Iceberg schema history for $scanClassName.", t)
+          return None
+      }
+    assert(!renameOrDrop.nested, "Nested Iceberg rename or drop is not supported.")
+
+    val missingFieldIds =
+      fileSchema.fields.filterNot(field => fieldIdsByName.contains(field.name)).map(_.name)
+    assert(
+      missingFieldIds.isEmpty,
+      s"Missing Iceberg field ids for columns: ${missingFieldIds.mkString(", ")}")
+
     val partitions = inputPartitions(exec)
     // Empty scan (e.g. empty table) should still build a plan to return no rows.
     if (partitions.isEmpty) {
@@ -86,7 +125,8 @@ object IcebergScanSupport extends Logging {
           readSchema,
           fileSchema,
           partitionSchema,
-          Seq.empty))
+          Seq.empty,
+          fieldIdsByName))
     }
 
     val icebergPartitions = partitions.flatMap(icebergPartition)
@@ -110,6 +150,9 @@ object IcebergScanSupport extends Logging {
     assert(
       !(format != FileFormat.PARQUET && format != FileFormat.ORC),
       "Only support parquet or orc.")
+    assert(
+      !(format == FileFormat.ORC && renameOrDrop.topLevel),
+      "Iceberg ORC rename or drop is not supported.")
 
     val pruningPredicates = collectPruningPredicates(scan.asInstanceOf[AnyRef], readSchema)
     Some(
@@ -119,7 +162,8 @@ object IcebergScanSupport extends Logging {
         readSchema,
         fileSchema,
         partitionSchema,
-        pruningPredicates))
+        pruningPredicates,
+        fieldIdsByName))
   }
 
   private def collectUnsupportedMetadataColumns(schema: StructType): Seq[String] =

diff --git a/...src/main/scala/org/apache/spark/sql/execution/auron/plan/NativeIcebergTableScanExec.scala b/...src/main/scala/org/apache/spark/sql/execution/auron/plan/NativeIcebergTableScanExec.scala
@@ -74,7 +74,8 @@ case class NativeIcebergTableScanExec(basedScan: BatchScanExec, plan: IcebergSca
   private lazy val fileSizes: Map[String, Long] = buildFileSizes()
   private lazy val fileSpecIds: Map[String, Int] = buildFileSpecIds()
 
-  private lazy val nativeFileSchema: pb.Schema = NativeConverters.convertSchema(fileSchema)
+  private lazy val nativeFileSchema: pb.Schema =
+    NativeConverters.convertSchema(fileSchema, plan.fieldIdsByName)
   private lazy val nativePartitionSchema: pb.Schema =
     NativeConverters.convertSchema(partitionSchema)