From f57935918d38046817c2c6917174af644e6a1255 Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Fri, 29 May 2026 02:23:16 +0000 Subject: [PATCH 1/9] SCD2 preprocess microbatch --- .../autocdc/AutoCdcReservedNames.scala | 11 + .../autocdc/Scd1BatchProcessor.scala | 13 +- .../autocdc/Scd2BatchProcessor.scala | 187 +++++++ .../spark/sql/pipelines/graph/Flow.scala | 4 +- .../sql/pipelines/graph/FlowExecution.scala | 5 +- .../AutoCdcCatalogExecutionTestBase.scala | 4 +- .../pipelines/autocdc/AutoCdcFlowSuite.scala | 18 +- .../Scd1BatchProcessorMergeSuite.scala | 10 +- .../autocdc/Scd1BatchProcessorSuite.scala | 26 +- .../Scd1ForeachBatchHandlerSuite.scala | 12 +- .../autocdc/Scd2BatchProcessorSuite.scala | 478 ++++++++++++++++++ .../AutoCdcGraphExecutionTestMixin.scala | 3 +- ...CdcScd1AuxiliaryTableDurabilitySuite.scala | 6 +- ...utoCdcScd1TargetTableDurabilitySuite.scala | 6 +- 14 files changed, 730 insertions(+), 53 deletions(-) create mode 100644 sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala create mode 100644 sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcReservedNames.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcReservedNames.scala index 2b0f8e293e76b..8284441e9e2b1 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcReservedNames.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcReservedNames.scala @@ -29,4 +29,15 @@ private[pipelines] object AutoCdcReservedNames { /** Common reserved-name prefix shared by AutoCDC internal columns and internal tables. */ val prefix: String = "__spark_autocdc_" + + /** + * Reserved name of the operational metadata column that AutoCDC projects onto every AutoCDC + * microbatch, auxiliary table, and target table.
+ * + * Shared across all SCD strategies and across the flow resolution, batch-processor, and + * streaming-write layers. + * + * Note, however, that the schema of the CDC metadata column can and does differ by SCD type. + */ + val cdcMetadataColName: String = s"${prefix}metadata" } diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala index 0656a7eb91b01..35006dc4ee21f 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala @@ -143,7 +143,7 @@ case class Scd1BatchProcessor( F.when(rowDeleteSequence.isNull, changeArgs.sequencing).otherwise(F.lit(null)) validatedMicrobatch.withColumn( - Scd1BatchProcessor.cdcMetadataColName, + AutoCdcReservedNames.cdcMetadataColName, Scd1BatchProcessor.constructCdcMetadataCol( deleteSequence = rowDeleteSequence, upsertSequence = rowUpsertSequence, @@ -175,7 +175,7 @@ case class Scd1BatchProcessor( schema = microbatchWithCdcMetadataDf.schema, columnSelection = Some( ColumnSelection.ExcludeColumns( - Seq(UnqualifiedColumnName(Scd1BatchProcessor.cdcMetadataColName)) + Seq(UnqualifiedColumnName(AutoCdcReservedNames.cdcMetadataColName)) ) ), caseSensitive = caseSensitiveColumnComparison @@ -197,7 +197,7 @@ case class Scd1BatchProcessor( // select. Identifiers could have special characters such as '.'.
F.col(QuotingUtils.quoteIdentifier(colName)) }) :+ F.col( - Scd1BatchProcessor.cdcMetadataColName + AutoCdcReservedNames.cdcMetadataColName ) microbatchWithCdcMetadataDf.select( @@ -223,7 +223,7 @@ case class Scd1BatchProcessor( val aliasedMicrobatchDf = microbatchDf.alias("microbatch") val aliasedAuxiliaryTableDf = auxiliaryTableDf.alias("auxiliaryTable") - val cdcMetadata = Scd1BatchProcessor.cdcMetadataColName + val cdcMetadata = AutoCdcReservedNames.cdcMetadataColName val microbatchCdcMetadata = F.col(s"microbatch.$cdcMetadata") val effectiveSeq = F.greatest( @@ -267,7 +267,7 @@ case class Scd1BatchProcessor( auxiliaryTableIdentifier: TableIdentifier ): Unit = { val auxIdentQuoted = auxiliaryTableIdentifier.quotedString - val meta = Scd1BatchProcessor.cdcMetadataColName + val meta = AutoCdcReservedNames.cdcMetadataColName // Project the reconciled microbatch down to just keys + `_cdc_metadata`; data columns are // irrelevant for the auxiliary table and should not be persisted. @@ -330,7 +330,7 @@ case class Scd1BatchProcessor( reconciledMicrobatchDf: DataFrame, targetTableIdentifier: TableIdentifier ): Unit = { - val meta = Scd1BatchProcessor.cdcMetadataColName + val meta = AutoCdcReservedNames.cdcMetadataColName val destinationTableStr = targetTableIdentifier.quotedString // (Re-)alias the reconciled microbatch DF for easy reference for the remainder of the merge. @@ -415,7 +415,6 @@ object Scd1BatchProcessor { * enforced at [[org.apache.spark.sql.pipelines.graph.AutoCdcMergeFlow]] construction. 
*/ private[autocdc] val winningRowColName: String = s"${AutoCdcReservedNames.prefix}winning_row" - private[pipelines] val cdcMetadataColName: String = s"${AutoCdcReservedNames.prefix}metadata" private[pipelines] val cdcDeleteSequenceFieldName: String = "deleteSequence" private[pipelines] val cdcUpsertSequenceFieldName: String = "upsertSequence" diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala new file mode 100644 index 0000000000000..31dd198b65d67 --- /dev/null +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.autocdc + +import org.apache.spark.sql.{functions => F} +import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.util.QuotingUtils +import org.apache.spark.sql.classic.DataFrame +import org.apache.spark.sql.types.{DataType, StructType} + +/** + * Per-microbatch processor for SCD Type 2 AutoCDC flows, complying with the specified + * [[changeArgs]] configuration.
+ * + * @param changeArgs The CDC flow configuration. + * @param resolvedSequencingType The post-analysis [[DataType]] of the sequencing column, derived + * from the flow's resolved DataFrame at flow setup time. + */ +case class Scd2BatchProcessor( + changeArgs: ChangeArgs, + resolvedSequencingType: DataType) { + + /** + * Reconcile a CDC microbatch into the canonical form the auxiliary- and target-table merges + * consume. + * + * Step ordering is load-bearing: the row-extension steps reference user data columns that + * target-column selection is allowed to drop, so selection runs last. Unlike SCD1, no per-key + * deduplication step is needed - SCD2 preserves every event as part of the row's history. + * + * Requires the microbatch to have been validated upstream so that the sequencing column is + * non-null and orderable. + */ + private[autocdc] def preprocessMicrobatch(validatedBatchDf: DataFrame): DataFrame = { + validatedBatchDf + .transform(extendMicrobatchRowsWithStartAt) + .transform(extendMicrobatchRowsWithEndAt) + .transform(extendMicrobatchRowsWithCdcMetadata) + .transform(projectTargetColumnsOntoMicrobatch) + } + + /** + * Stamp each microbatch row with its currently known start-at (i.e. active-from) using its + * sequencing. + */ + private def extendMicrobatchRowsWithStartAt(microbatchDf: DataFrame): DataFrame = { + microbatchDf.withColumn( + colName = Scd2BatchProcessor.startAtColName, + col = changeArgs.sequencing + ) + } + + /** + * Stamp each microbatch delete event row with its end time sequence, as they are instantaneous + * events. + * + * Non-deletes leave a null end: it is not yet known if the row represents an active upsert or a + * closed upsert. This will become clear in later reconciliation against the aux/target tables.
+ */ + private def extendMicrobatchRowsWithEndAt(microbatchDf: DataFrame): DataFrame = { + microbatchDf.withColumn( + colName = Scd2BatchProcessor.endAtColName, + col = ( + changeArgs.deleteCondition match { + case Some(deleteCondition) => + F.when(deleteCondition, changeArgs.sequencing).otherwise(null) + case None => + F.lit(null) + } + ).cast(resolvedSequencingType) + ) + } + + /** + * Project the operational CDC metadata column carrying the literal event sequence. Downstream + * merges rely on it to preserve original event lineage regardless of how rows start/end-at are + * coalesced. + */ + private def extendMicrobatchRowsWithCdcMetadata(microbatchDf: DataFrame): DataFrame = { + microbatchDf.withColumn( + colName = AutoCdcReservedNames.cdcMetadataColName, + col = Scd2BatchProcessor.constructCdcMetadataStruct( + recordStartAt = changeArgs.sequencing, + sequencingType = resolvedSequencingType + ) + ) + } + + /** + * Apply the user's target column selection while preserving the SCD2 framework columns; the + * latter are required by downstream merges and persisted to both the auxiliary and target + * tables, so users cannot deselect them. + * + * Requires the framework columns to already be present on the input. + */ + private def projectTargetColumnsOntoMicrobatch( + microbatch: DataFrame + ): DataFrame = { + val dataSchema = StructType( + microbatch.schema.fields.filterNot(f => + Scd2BatchProcessor.reservedFrameworkColNames.contains(f.name) + ) + ) + val userSelectedDataSchema = + ColumnSelection.applyToSchema( + schemaName = "microbatch", + schema = dataSchema, + columnSelection = changeArgs.columnSelection, + caseSensitive = + microbatch.sparkSession.sessionState.conf.caseSensitiveAnalysis + ) + val finalColumnsToSelect: Seq[Column] = + userSelectedDataSchema.fieldNames.toSeq.map(colName => { + // Spark drops backticks in the schema, quote all identifiers for safety before executing + // select. Identifiers could have special characters such as '.'.
+ F.col(QuotingUtils.quoteIdentifier(colName)) + }) ++ Seq( + F.col(Scd2BatchProcessor.startAtColName), + F.col(Scd2BatchProcessor.endAtColName), + F.col(AutoCdcReservedNames.cdcMetadataColName) + ) + microbatch.select(finalColumnsToSelect: _*) + } + +} + +object Scd2BatchProcessor { + // Metadata field that represents the exact time (sequence) of the CDC event that produced this + // row. Null only for synthetic decomposition tails. + private[autocdc] val recordStartAtFieldName: String = "__RECORD_START_AT" + + // In the target table: + // The user-visible column representing when this row is considered active from, i.e. + // this upsert run head's [[recordStartAtFieldName]]. + // In the aux table: + // If this row represents a tombstone, then the same value as [[recordStartAtFieldName]]. + // Else this row represents a coalesced no-op row that is part of an upsert run. + // Inherit the [[recordStartAtFieldName]] of the head of this upsert's run. + // + // The invariant in both tables is: startAtColName <= recordStartAtFieldName. If an event was + // generated at time X, it is active by time X, or earlier if it is not a run head. + private[autocdc] val startAtColName: String = "__START_AT" + + // In the target table: + // The user-visible column representing when this row became inactive. Null IFF the row + // is active: neither superseded by a state-changing upsert nor affected by a delete. + // In the aux table: + // If this row is a tombstone, then by convention the sequence of the delete event that + // produced it. Delete events are considered instantaneous in time. + // Else this row is a coalesced no-op row that is part of an upsert run, and by + // convention the value will always be null.
+ private[autocdc] val endAtColName: String = "__END_AT" + + private val reservedFrameworkColNames: Set[String] = Set( + startAtColName, + endAtColName, + AutoCdcReservedNames.cdcMetadataColName + ) + + /** + * Construct the CDC metadata struct column for SCD2: a single-field struct whose only field, + * [[recordStartAtFieldName]], is `recordStartAt` cast to `sequencingType`. + */ + def constructCdcMetadataStruct( + recordStartAt: Column, + sequencingType: DataType + ): Column = { + F.struct( + recordStartAt.cast(sequencingType).as(recordStartAtFieldName) + ) + } +} diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala index f88b0cd3a1cbe..dd4d1556afbf8 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/Flow.scala @@ -288,7 +288,7 @@ class AutoCdcMergeFlow( // CDC operational metadata column at the end.
StructType( userSelectedSchema.fields :+ StructField( - Scd1BatchProcessor.cdcMetadataColName, + AutoCdcReservedNames.cdcMetadataColName, Scd1BatchProcessor.cdcMetadataColSchema(sequencingType), nullable = false ) @@ -334,7 +334,7 @@ class AutoCdcMergeFlow( deleteSequence = F.lit(null), upsertSequence = F.lit(null), sequencingType = sequencingType - ).as(Scd1BatchProcessor.cdcMetadataColName) + ).as(AutoCdcReservedNames.cdcMetadataColName) df.select(userSelectedCols :+ emptyCdcMetadataCol: _*) case ScdType.Type2 => diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala index 0d1c33be21727..3841495f01c8f 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala @@ -688,11 +688,12 @@ class Scd1MergeStreamingWrite( /** CDC metadata field resolved out of the flow's augmented schema. */ private lazy val cdcMetadataField: StructField = { val resolver = updateContext.spark.sessionState.conf.resolver + val cdcMetadataColName = AutoCdcReservedNames.cdcMetadataColName flow.schema.fields - .find(field => resolver(field.name, Scd1BatchProcessor.cdcMetadataColName)) + .find(field => resolver(field.name, cdcMetadataColName)) .getOrElse( throw SparkException.internalError( - s"CDC metadata column '${Scd1BatchProcessor.cdcMetadataColName}' was not found in the " + + s"CDC metadata column '$cdcMetadataColName' was not found in the " + s"AutoCDC flow's target table schema." 
) ) diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcCatalogExecutionTestBase.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcCatalogExecutionTestBase.scala index 0dc0a90276600..8688df071113b 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcCatalogExecutionTestBase.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcCatalogExecutionTestBase.scala @@ -82,7 +82,7 @@ trait AutoCdcCatalogExecutionTestBase { } /** - * Schema of the [[Scd1BatchProcessor.cdcMetadataColName]] struct column for a given + * Schema of the [[AutoCdcReservedNames.cdcMetadataColName]] struct column for a given * sequencing column type. Defaults to [[LongType]] because all current SCD1 tests use * `Long` sequencing. */ @@ -92,7 +92,7 @@ trait AutoCdcCatalogExecutionTestBase { .add(Scd1BatchProcessor.cdcUpsertSequenceFieldName, sequencingType) /** - * Build a [[Row]] matching the [[Scd1BatchProcessor.cdcMetadataColName]] struct's two fields, + * Build a [[Row]] matching the [[AutoCdcReservedNames.cdcMetadataColName]] struct's two fields, * in the order produced by [[Scd1BatchProcessor.constructCdcMetadataCol]]: */ protected def cdcMetadataRow[T](deleteSeq: Option[T], upsertSeq: Option[T]): Row = diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcFlowSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcFlowSuite.scala index cf7c9533bee98..32374f8ecb048 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcFlowSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/AutoCdcFlowSuite.scala @@ -186,7 +186,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { /** Convenience to extract the [[StructType]] of the projected `_cdc_metadata` column. 
*/ private def cdcMetadataStruct(schema: StructType): StructType = - schema(Scd1BatchProcessor.cdcMetadataColName).dataType.asInstanceOf[StructType] + schema(AutoCdcReservedNames.cdcMetadataColName).dataType.asInstanceOf[StructType] test( "AutoCdcMergeFlow.schema appends _cdc_metadata to the source schema when no " + @@ -200,7 +200,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { .add("seq", LongType) .add( StructField( - Scd1BatchProcessor.cdcMetadataColName, + AutoCdcReservedNames.cdcMetadataColName, Scd1BatchProcessor.cdcMetadataColSchema(LongType), nullable = false ) @@ -223,7 +223,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { .add("seq", LongType) .add( StructField( - Scd1BatchProcessor.cdcMetadataColName, + AutoCdcReservedNames.cdcMetadataColName, Scd1BatchProcessor.cdcMetadataColSchema(LongType), nullable = false ) @@ -244,7 +244,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { .add("seq", LongType) .add( StructField( - Scd1BatchProcessor.cdcMetadataColName, + AutoCdcReservedNames.cdcMetadataColName, Scd1BatchProcessor.cdcMetadataColSchema(LongType), nullable = false ) @@ -270,7 +270,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { test("AutoCdcMergeFlow.schema's _cdc_metadata field is non-null with nullable inner fields") { val resolvedFlow = newAutoCdcMergeFlow(threeColumnSourceDf()) - val metaField = resolvedFlow.schema(Scd1BatchProcessor.cdcMetadataColName) + val metaField = resolvedFlow.schema(AutoCdcReservedNames.cdcMetadataColName) assert(!metaField.nullable, "_cdc_metadata column itself must be non-null") val metaStruct = metaField.dataType.asInstanceOf[StructType] @@ -330,7 +330,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { // The user-selected portion drops `name`; the trailing column is the SCD1 metadata. 
assert( loadedDf.schema.fieldNames.toSeq == - Seq("id", "seq", Scd1BatchProcessor.cdcMetadataColName) + Seq("id", "seq", AutoCdcReservedNames.cdcMetadataColName) ) } @@ -345,7 +345,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { assert(loadedDf.schema == resolvedFlow.schema) assert( loadedDf.schema.fieldNames.toSeq == - Seq("id", "seq", Scd1BatchProcessor.cdcMetadataColName) + Seq("id", "seq", AutoCdcReservedNames.cdcMetadataColName) ) } @@ -442,7 +442,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { // Locks in the previous engine-level guard at flow-construction time. Any future // regression where a user-supplied CDC stream carries the reserved metadata column name // should fail eagerly here. - val sourceDf = sourceDfWithExtraColumns(Scd1BatchProcessor.cdcMetadataColName -> StringType) + val sourceDf = sourceDfWithExtraColumns(AutoCdcReservedNames.cdcMetadataColName -> StringType) checkError( exception = intercept[AnalysisException] { @@ -452,7 +452,7 @@ class AutoCdcFlowSuite extends QueryTest with SharedSparkSession { sqlState = "42710", parameters = Map( "caseSensitivity" -> CaseSensitivityLabels.CaseInsensitive, - "columnName" -> Scd1BatchProcessor.cdcMetadataColName, + "columnName" -> AutoCdcReservedNames.cdcMetadataColName, "schemaName" -> "changeDataFeed", "reservedColumnNamePrefix" -> AutoCdcReservedNames.prefix ) diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessorMergeSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessorMergeSuite.scala index 475d25f5aa2cf..1aa2cbcd5417b 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessorMergeSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessorMergeSuite.scala @@ -47,13 +47,13 @@ class Scd1BatchProcessorMergeSuite */ private val minimalSchema: StructType = new StructType() 
.add("id", IntegerType) - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType()) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType()) /** Minimal target-table shape: one key, one data column, and CDC metadata. */ private val targetSchema: StructType = new StructType() .add("id", IntegerType) .add("value", StringType) - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType()) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType()) /** * A processor with a single key column `id`. `sequencing` is irrelevant for @@ -85,7 +85,7 @@ class Scd1BatchProcessorMergeSuite val withKeys = keyColumns.foldLeft(new StructType()) { case (s, (name, dt)) => s.add(name, dt) } - withKeys.add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType()) + withKeys.add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType()) } /** @@ -116,7 +116,7 @@ class Scd1BatchProcessorMergeSuite .add("id", IntegerType) .add("value", StringType) .add( - Scd1BatchProcessor.cdcMetadataColName, + AutoCdcReservedNames.cdcMetadataColName, new StructType() .add(Scd1BatchProcessor.cdcDeleteSequenceFieldName, LongType) .add(Scd1BatchProcessor.cdcUpsertSequenceFieldName, LongType) @@ -446,7 +446,7 @@ class Scd1BatchProcessorMergeSuite // The schema always stores the backtick consumed column name, so unticked the raw name here. 
.add(rawKeyName, IntegerType) .add("value", StringType) - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType()) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType()) createTable( defaultTargetIdent, diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessorSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessorSuite.scala index 9432150c40167..d2c78442c4762 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessorSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessorSuite.scala @@ -33,7 +33,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { .add("name", StringType) .add("age", IntegerType) .add( - Scd1BatchProcessor.cdcMetadataColName, + AutoCdcReservedNames.cdcMetadataColName, new StructType() .add(Scd1BatchProcessor.cdcDeleteSequenceFieldName, LongType) .add(Scd1BatchProcessor.cdcUpsertSequenceFieldName, LongType) @@ -596,7 +596,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { // Original columns are preserved in their original order, with CDC metadata appended at // the very end. 
assert(result.schema.fieldNames.toSeq == - schema.fieldNames.toSeq :+ Scd1BatchProcessor.cdcMetadataColName) + schema.fieldNames.toSeq :+ AutoCdcReservedNames.cdcMetadataColName) } test("extendMicrobatchRowsWithCdcMetadata casts delete / upsert sequence fields to " + @@ -624,7 +624,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { val resultDf = processor.extendMicrobatchRowsWithCdcMetadata(batch) val cdcMetadataDataType = - resultDf.schema(Scd1BatchProcessor.cdcMetadataColName).dataType.asInstanceOf[StructType] + resultDf.schema(AutoCdcReservedNames.cdcMetadataColName).dataType.asInstanceOf[StructType] assert(columnNamesAndDataTypes(cdcMetadataDataType) == Seq( Scd1BatchProcessor.cdcDeleteSequenceFieldName -> LongType, Scd1BatchProcessor.cdcUpsertSequenceFieldName -> LongType)) @@ -723,7 +723,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.projectTargetColumnsOntoMicrobatch(batch) assert(result.schema.fieldNames.toSeq == - Seq("id", "age", Scd1BatchProcessor.cdcMetadataColName)) + Seq("id", "age", AutoCdcReservedNames.cdcMetadataColName)) checkAnswer( df = result, expectedAnswer = Row(1, 30, Row(null, 10L)) @@ -753,7 +753,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { assert( result.schema.fieldNames.toSeq == - Seq("id", "name", Scd1BatchProcessor.cdcMetadataColName) + Seq("id", "name", AutoCdcReservedNames.cdcMetadataColName) ) checkAnswer( df = result, @@ -785,7 +785,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { // in which the user listed columns in IncludeColumns. The CDC metadata column is appended // last as always. 
assert(result.schema.fieldNames.toSeq == - Seq("id", "age", Scd1BatchProcessor.cdcMetadataColName)) + Seq("id", "age", AutoCdcReservedNames.cdcMetadataColName)) checkAnswer( df = result, @@ -800,7 +800,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { // Even if a column is created with backticks via DDL, those backticks are consumed by Spark // before resolving the schema; they won't show up in the schema field. .add("user.id", StringType) - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType) val batch = microbatchOf(schema)( Row(1, "u-100", Row(null, 10L)) @@ -826,7 +826,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.projectTargetColumnsOntoMicrobatch(batch) assert(result.schema.fieldNames.toSeq == - Seq("id", "user.id", Scd1BatchProcessor.cdcMetadataColName)) + Seq("id", "user.id", AutoCdcReservedNames.cdcMetadataColName)) checkAnswer( df = result, expectedAnswer = Row(1, "u-100", Row(null, 10L)) @@ -860,7 +860,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { // Output column names follow the microbatch schema's casing, not the casing in the user's // columnSelection. The CDC metadata column is appended last as always. assert(result.schema.fieldNames.toSeq == - Seq("id", "age", Scd1BatchProcessor.cdcMetadataColName)) + Seq("id", "age", AutoCdcReservedNames.cdcMetadataColName)) checkAnswer( df = result, expectedAnswer = Row(1, 30, Row(null, 10L)) @@ -880,7 +880,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { // Data column. .add("value", StringType) // CDC metadata column. - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType) /** * Schema for the auxiliary input to [[Scd1BatchProcessor.applyTombstonesToMicrobatch]] tests. 
@@ -893,7 +893,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { // Key column. .add("id", IntegerType) // CDC metadata column. - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType) test("applyTombstonesToMicrobatch drops late-arriving deletes and upserts when a matching " + "tombstone exists for the same key") { @@ -1015,7 +1015,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { val schema = new StructType() .add("region", StringType) .add("customer_id", IntegerType) - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType) val microbatch = microbatchOf(schema)( Row("US", 1, cdcMetadataRow(deleteSeq = None, upsertSeq = Some(5))), @@ -1051,7 +1051,7 @@ class Scd1BatchProcessorSuite extends QueryTest with SharedSparkSession { test("applyTombstonesToMicrobatch supports backticked key names containing a literal dot") { val schema = new StructType() .add("user.id", IntegerType) - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType) val microbatch = microbatchOf(schema)( Row(1, cdcMetadataRow(deleteSeq = None, upsertSeq = Some(5))) diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1ForeachBatchHandlerSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1ForeachBatchHandlerSuite.scala index 76790847ede5c..bb8043e720c65 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1ForeachBatchHandlerSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd1ForeachBatchHandlerSuite.scala @@ -41,12 +41,12 @@ class Scd1ForeachBatchHandlerSuite private val auxiliarySchema = new StructType() .add("id", IntegerType) - 
.add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType()) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType()) private val targetSchema = new StructType() .add("id", IntegerType) .add("value", StringType) - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType()) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType()) private val processor = Scd1BatchProcessor( changeArgs = ChangeArgs( @@ -155,11 +155,11 @@ class Scd1ForeachBatchHandlerSuite val compositeAuxSchema = new StructType() .add("country", StringType) .add("city", StringType) - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType()) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType()) val compositeTargetSchema = new StructType() .add("country", StringType) .add("city", StringType) - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType()) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType()) val compositeProcessor = Scd1BatchProcessor( changeArgs = ChangeArgs( @@ -492,12 +492,12 @@ class Scd1ForeachBatchHandlerSuite val compositeAuxSchema = new StructType() .add("country", StringType) .add("city", StringType) - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType()) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType()) val compositeTargetSchema = new StructType() .add("country", StringType) .add("city", StringType) .add("population", LongType) - .add(Scd1BatchProcessor.cdcMetadataColName, cdcMetadataColSchemaType()) + .add(AutoCdcReservedNames.cdcMetadataColName, cdcMetadataColSchemaType()) val compositeProcessor = Scd1BatchProcessor( changeArgs = ChangeArgs( diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala new file mode 100644 index 0000000000000..80dd4975b7304 
--- /dev/null +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala @@ -0,0 +1,478 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.pipelines.autocdc + +import org.apache.spark.sql.{functions => F, QueryTest, Row} +import org.apache.spark.sql.classic.DataFrame +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ + +class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { + + /** Build a microbatch [[DataFrame]] from explicit rows and an explicit schema. 
*/ + private def microbatchOf(schema: StructType)(rows: Row*): DataFrame = + spark.createDataFrame(spark.sparkContext.parallelize(rows), schema) + + // =============== preprocessMicrobatch tests =============== + + test("preprocessMicrobatch appends framework columns __START_AT, __END_AT, " + + "_cdc_metadata at the end of the schema in that order") { + val schema = new StructType() + .add("id", IntegerType) + .add("seq", LongType) + .add("value", StringType) + + val batch = microbatchOf(schema)(Row(1, 10L, "a")) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2 + ), + resolvedSequencingType = LongType + ) + + val result = processor.preprocessMicrobatch(batch) + + assert(result.schema.fieldNames.toSeq == Seq( + "id", "seq", "value", + Scd2BatchProcessor.startAtColName, + Scd2BatchProcessor.endAtColName, + AutoCdcReservedNames.cdcMetadataColName + )) + } + + test("preprocessMicrobatch returns an empty DataFrame with the full preprocessed schema") { + val schema = new StructType() + .add("id", IntegerType) + .add("seq", LongType) + .add("value", StringType) + + val batch = microbatchOf(schema)() + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2 + ), + resolvedSequencingType = LongType + ) + + val result = processor.preprocessMicrobatch(batch) + + assert(result.collect().isEmpty) + assert(result.schema.fieldNames.toSeq == Seq( + "id", "seq", "value", + Scd2BatchProcessor.startAtColName, + Scd2BatchProcessor.endAtColName, + AutoCdcReservedNames.cdcMetadataColName + )) + } + + test("preprocessMicrobatch stamps __START_AT, __END_AT, and __RECORD_START_AT correctly " + + "across delete and upsert events for the same key") { + val schema = new StructType() + .add("id", IntegerType) + .add("seq", LongType) + .add("value", 
StringType) + .add("is_delete", BooleanType) + + // All three events target the same key. SCD2 must preserve every event in the output - + // unlike SCD1, no per-key deduplication is performed; this also implicitly pins the + // no-dedup contract of preprocessMicrobatch. + val batch = microbatchOf(schema)( + Row(1, 10L, "first-upsert", false), + Row(1, 20L, "second-upsert", false), + Row(1, 30L, null, true) + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + deleteCondition = Some(F.col("is_delete")) + ), + resolvedSequencingType = LongType + ) + + // Per-row contract for the framework columns: + // - __START_AT = sequencing for every row (the active-from time) + // - __END_AT = sequencing for delete rows; null for upserts (mutual exclusion) + // - __RECORD_START_AT = sequencing for every row, regardless of delete vs upsert + // (lineage preserved into the merge step) + checkAnswer( + df = processor.preprocessMicrobatch(batch), + expectedAnswer = Seq( + Row(1, 10L, "first-upsert", false, 10L, null, Row(10L)), + Row(1, 20L, "second-upsert", false, 20L, null, Row(20L)), + Row(1, 30L, null, true, 30L, 30L, Row(30L)) + ) + ) + } + + test("preprocessMicrobatch leaves __END_AT null on every row when deleteCondition is None") { + val schema = new StructType() + .add("id", IntegerType) + .add("seq", LongType) + .add("value", StringType) + + val batch = microbatchOf(schema)( + Row(1, 10L, "a"), + Row(2, 20L, "b") + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + deleteCondition = None + ), + resolvedSequencingType = LongType + ) + + checkAnswer( + df = processor.preprocessMicrobatch(batch).select( + F.col(Scd2BatchProcessor.endAtColName) + ), + expectedAnswer = Seq(Row(null), Row(null)) + ) + } + + 
test("preprocessMicrobatch treats null deleteCondition results as upsert " + + "(__END_AT stays null)") { + val schema = new StructType() + .add("id", IntegerType) + .add("seq", LongType) + .add("is_delete", BooleanType) + + val batch = microbatchOf(schema)( + // is_delete is null - the delete condition evaluates to null, which Spark treats as the + // otherwise branch, so the row is classified as an upsert. + Row(1, 10L, null) + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + deleteCondition = Some(F.col("is_delete")) + ), + resolvedSequencingType = LongType + ) + + checkAnswer( + df = processor.preprocessMicrobatch(batch).select( + F.col(Scd2BatchProcessor.endAtColName) + ), + expectedAnswer = Row(null) + ) + } + + test("preprocessMicrobatch evaluates an arbitrary sequencing expression per-row") { + val schema = new StructType() + .add("id", IntegerType) + .add("seq", LongType) + .add("alt_seq", LongType) + .add("value", StringType) + + // Sequencing is a function call referencing multiple columns, not a bare identifier. Locks + // in that the framework columns evaluate the full expression per-row rather than treating + // `sequencing` as a single column reference. + val batch = microbatchOf(schema)( + // greatest(10, 30) = 30 + Row(1, 10L, 30L, "row1"), + // greatest(40, 20) = 40 + Row(2, 40L, 20L, "row2") + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.greatest(F.col("seq"), F.col("alt_seq")), + storedAsScdType = ScdType.Type2 + ), + resolvedSequencingType = LongType + ) + + val result = processor.preprocessMicrobatch(batch) + + checkAnswer( + df = result.select( + F.col(Scd2BatchProcessor.startAtColName), + F.col(s"${AutoCdcReservedNames.cdcMetadataColName}." 
+ + s"${Scd2BatchProcessor.recordStartAtFieldName}") + ), + expectedAnswer = Seq( + Row(30L, 30L), + Row(40L, 40L) + ) + ) + } + + /** Schema reused by columnSelection tests: id (key), name, age, seq (sequencing). */ + private val multiUserColSchema: StructType = new StructType() + .add("id", IntegerType) + .add("name", StringType) + .add("age", IntegerType) + .add("seq", LongType) + + test("preprocessMicrobatch keeps every user column when columnSelection is None") { + val batch = microbatchOf(multiUserColSchema)( + Row(1, "alice", 30, 10L) + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + columnSelection = None + ), + resolvedSequencingType = LongType + ) + + val result = processor.preprocessMicrobatch(batch) + + assert(result.schema.fieldNames.toSeq == Seq( + "id", "name", "age", "seq", + Scd2BatchProcessor.startAtColName, + Scd2BatchProcessor.endAtColName, + AutoCdcReservedNames.cdcMetadataColName + )) + } + + test("preprocessMicrobatch retains framework columns even when IncludeColumns omits them") { + val batch = microbatchOf(multiUserColSchema)( + Row(1, "alice", 30, 10L) + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + columnSelection = Some(ColumnSelection.IncludeColumns( + Seq(UnqualifiedColumnName("id"), UnqualifiedColumnName("age")) + )) + ), + resolvedSequencingType = LongType + ) + + val result = processor.preprocessMicrobatch(batch) + + assert(result.schema.fieldNames.toSeq == Seq( + "id", "age", + Scd2BatchProcessor.startAtColName, + Scd2BatchProcessor.endAtColName, + AutoCdcReservedNames.cdcMetadataColName + )) + checkAnswer( + df = result, + expectedAnswer = Row(1, 30, 10L, null, Row(10L)) + ) + } + + test("preprocessMicrobatch drops user columns listed in ExcludeColumns; " + + "framework 
columns survive") { + val batch = microbatchOf(multiUserColSchema)( + Row(1, "alice", 30, 10L) + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + columnSelection = Some(ColumnSelection.ExcludeColumns( + Seq(UnqualifiedColumnName("name")) + )) + ), + resolvedSequencingType = LongType + ) + + val result = processor.preprocessMicrobatch(batch) + + assert(result.schema.fieldNames.toSeq == Seq( + "id", "age", "seq", + Scd2BatchProcessor.startAtColName, + Scd2BatchProcessor.endAtColName, + AutoCdcReservedNames.cdcMetadataColName + )) + checkAnswer( + df = result, + expectedAnswer = Row(1, 30, 10L, 10L, null, Row(10L)) + ) + } + + test("preprocessMicrobatch preserves the microbatch schema's user-column order, " + + "ignoring the order of IncludeColumns") { + val batch = microbatchOf(multiUserColSchema)( + Row(1, "alice", 30, 10L) + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + // User specifies (age, id) - intentionally different from the schema order (id, age). + columnSelection = Some(ColumnSelection.IncludeColumns( + Seq(UnqualifiedColumnName("age"), UnqualifiedColumnName("id")) + )) + ), + resolvedSequencingType = LongType + ) + + val result = processor.preprocessMicrobatch(batch) + + // Output column order follows the microbatch schema (id before age), not the user's listing + // order in IncludeColumns. Framework columns are always appended last. 
+ assert(result.schema.fieldNames.toSeq == Seq( + "id", "age", + Scd2BatchProcessor.startAtColName, + Scd2BatchProcessor.endAtColName, + AutoCdcReservedNames.cdcMetadataColName + )) + } + + test("preprocessMicrobatch resolves columnSelection case-insensitively " + + "when SQLConf.CASE_SENSITIVE=false") { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + val batch = microbatchOf(multiUserColSchema)( + Row(1, "alice", 30, 10L) + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + // User columns intentionally use a different case than the schema (id, age). + columnSelection = Some(ColumnSelection.IncludeColumns( + Seq(UnqualifiedColumnName("ID"), UnqualifiedColumnName("AGE")) + )) + ), + resolvedSequencingType = LongType + ) + + val result = processor.preprocessMicrobatch(batch) + + // Output column names follow the microbatch schema's casing, not the user's casing. + assert(result.schema.fieldNames.toSeq == Seq( + "id", "age", + Scd2BatchProcessor.startAtColName, + Scd2BatchProcessor.endAtColName, + AutoCdcReservedNames.cdcMetadataColName + )) + } + } + + test("preprocessMicrobatch handles backticked column names containing a literal dot") { + val schema = new StructType() + .add("id", IntegerType) + // Even if a column is created with backticks via DDL, those backticks are consumed by Spark + // before resolving the schema; they won't show up in the schema field. 
+ .add("user.id", StringType) + .add("seq", LongType) + + val batch = microbatchOf(schema)( + Row(1, "u-100", 10L) + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + columnSelection = Some(ColumnSelection.IncludeColumns( + Seq( + UnqualifiedColumnName("id"), + UnqualifiedColumnName("`user.id`") + ) + )) + ), + resolvedSequencingType = LongType + ) + + val result = processor.preprocessMicrobatch(batch) + + assert(result.schema.fieldNames.toSeq == Seq( + "id", "user.id", + Scd2BatchProcessor.startAtColName, + Scd2BatchProcessor.endAtColName, + AutoCdcReservedNames.cdcMetadataColName + )) + checkAnswer( + df = result, + expectedAnswer = Row(1, "u-100", 10L, null, Row(10L)) + ) + } + + test("preprocessMicrobatch correctly populates framework columns even when ExcludeColumns " + + "drops the columns referenced by sequencing and deleteCondition") { + val schema = new StructType() + .add("id", IntegerType) + .add("value", StringType) + // Both seq and is_delete are referenced by the flow's sequencing / deleteCondition + // expressions, but the user wants them excluded from the target table. + .add("seq", LongType) + .add("is_delete", BooleanType) + + val batch = microbatchOf(schema)( + Row(1, "alice", 10L, false), + Row(1, null, 20L, true) + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + deleteCondition = Some(F.col("is_delete")), + columnSelection = Some(ColumnSelection.ExcludeColumns( + Seq(UnqualifiedColumnName("seq"), UnqualifiedColumnName("is_delete")) + )) + ), + resolvedSequencingType = LongType + ) + + // The orchestrator runs row-extension steps before column selection, so the framework + // columns reference seq / is_delete fully even though the final projection drops them. 
+ val result = processor.preprocessMicrobatch(batch) + + assert(result.schema.fieldNames.toSeq == Seq( + "id", "value", + Scd2BatchProcessor.startAtColName, + Scd2BatchProcessor.endAtColName, + AutoCdcReservedNames.cdcMetadataColName + )) + checkAnswer( + df = result, + expectedAnswer = Seq( + Row(1, "alice", 10L, null, Row(10L)), + Row(1, null, 20L, 20L, Row(20L)) + ) + ) + } +} diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala index 5ebdb4b4c86d2..8538ef92a588b 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcGraphExecutionTestMixin.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.connector.catalog.SharedTablesInMemoryRowLevelOperationTableCatalog import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.pipelines.autocdc.{ + AutoCdcReservedNames, ChangeArgs, ColumnSelection, Scd1BatchProcessor, @@ -145,7 +146,7 @@ trait AutoCdcGraphExecutionTestMixin extends BeforeAndAfterEach { * Assumes sequence type is BIGINT (Long). 
*/ protected val cdcMetadataDdl: String = { - val col = Scd1BatchProcessor.cdcMetadataColName + val col = AutoCdcReservedNames.cdcMetadataColName val del = Scd1BatchProcessor.cdcDeleteSequenceFieldName val ups = Scd1BatchProcessor.cdcUpsertSequenceFieldName s"$col STRUCT<$del:BIGINT,$ups:BIGINT> NOT NULL" diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala index 5a9f6cb6710be..78dbb70027b45 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1AuxiliaryTableDurabilitySuite.scala @@ -21,8 +21,8 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.execution.streaming.runtime.MemoryStream import org.apache.spark.sql.functions import org.apache.spark.sql.pipelines.autocdc.{ + AutoCdcReservedNames, ColumnSelection, - Scd1BatchProcessor, UnqualifiedColumnName } import org.apache.spark.sql.pipelines.utils.{ExecutionTest, TestGraphRegistrationContext} @@ -157,7 +157,7 @@ class AutoCdcScd1AuxiliaryTableDurabilitySuite // The auxiliary table only contains keys and the metadata column, hence "name" should not be // included. 
- assert(auxSchema.fieldNames.toSeq == Seq("id", Scd1BatchProcessor.cdcMetadataColName)) + assert(auxSchema.fieldNames.toSeq == Seq("id", AutoCdcReservedNames.cdcMetadataColName)) assert(getAuxTableKeyColumnNames(target = "target") == Seq("id")) } @@ -195,7 +195,7 @@ class AutoCdcScd1AuxiliaryTableDurabilitySuite val auxSchema = spark.table(auxTableNameFor("target")).schema assert(auxSchema.fieldNames.toSeq == - Seq("region", "id", Scd1BatchProcessor.cdcMetadataColName)) + Seq("region", "id", AutoCdcReservedNames.cdcMetadataColName)) assert(getAuxTableKeyColumnNames(target = "target") == Seq("region", "id")) } diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1TargetTableDurabilitySuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1TargetTableDurabilitySuite.scala index 46f8ee47db02f..a5f3a13a012a6 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1TargetTableDurabilitySuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/AutoCdcScd1TargetTableDurabilitySuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.pipelines.graph import org.apache.spark.sql.Row import org.apache.spark.sql.execution.streaming.runtime.MemoryStream import org.apache.spark.sql.functions -import org.apache.spark.sql.pipelines.autocdc.Scd1BatchProcessor +import org.apache.spark.sql.pipelines.autocdc.AutoCdcReservedNames import org.apache.spark.sql.pipelines.utils.{ExecutionTest, TestGraphRegistrationContext} import org.apache.spark.sql.test.SharedSparkSession @@ -147,8 +147,8 @@ class AutoCdcScd1TargetTableDurabilitySuite val schema = spark.table(s"$catalog.$namespace.target").schema assert( - schema.fieldNames.contains(Scd1BatchProcessor.cdcMetadataColName), - s"Target must have ${Scd1BatchProcessor.cdcMetadataColName} after first AutoCDC run; " + + schema.fieldNames.contains(AutoCdcReservedNames.cdcMetadataColName), + s"Target must have 
${AutoCdcReservedNames.cdcMetadataColName} after first AutoCDC run; " + s"got ${schema.fieldNames.toSeq}" ) checkAnswer( From 3a34e552648459d5841f3b2ba9eccc699bab05f4 Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Fri, 29 May 2026 16:34:47 +0000 Subject: [PATCH 2/9] add scaladocs --- .../autocdc/Scd2BatchProcessor.scala | 124 +++++++++++++++--- 1 file changed, 104 insertions(+), 20 deletions(-) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala index 31dd198b65d67..1c1d9652ecbab 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala @@ -139,33 +139,117 @@ case class Scd2BatchProcessor( } +/** + * Concept: run of upsert events. + * + * A run is a maximal sequence of consecutive upsert events (in sorted order by sequencing) + * for the same key whose tracked-history-column values are all identical. The transition + * from a previous run's tail to a new run's head represents a real state change; every + * subsequent event in the run is a no-op continuation that logically coalesces with the head. + * + * Runs matter because SCD2 only emits a new visible historical row when a + * tracked-history column actually changes. By convention we choose that only the tail of a + * run produces a visible row in the target table; the rest become hidden rows in the aux + * table. Selecting the tail means the latest no-op upsert is reflected in the target table. + * + * Example, with trackHistoryCols = [name], events for some key: + * (S=5, name=Alice) -> starts run head at S=5. Row lives in aux table. + * (S=10, name=Alice) -> no-op, adds to run at S=5. Row lives in aux table. + * (S=15, name=Alice) -> no-op and tail of run at S=5. Row lives in target table with + * START_AT=5. 
+ * (S=20, name=Charlie) -> new run head/tail (run size=1) at S=20. Row lives in target + * table. + * + * Now if a new late-arriving event (S=12, name=Bob) arrives for the same key, we have: + * (S=5, name=Alice) -> starts run head at S=5. Row lives in aux table. + * (S=10, name=Alice) -> no-op but now tail of run at S=5. Row now lives in target + * table with START_AT=5. + * (S=12, name=Bob) -> new run head/tail (run size=1) at S=12. Row lives in target + * table. + * (S=15, name=Alice) -> previously-visible tail converts to a new run head at S=15. Row + * remains in target table, but now with START_AT=15. + * (S=20, name=Charlie) -> new run head at S=20. Row lives in target table. + * + * Note that if we did not track the no-op events in the aux table for the run at S=5 before the + * event (S=12, name=Bob) arrived, then we would not have correctly reconciled that the event + * (S=10, name=Alice) is now the visible tail of the Alice run before Bob. + * + * ------------- + * Concept: target table. + * + * The user-consumable output table of the CDC transformation. Every row in the target table + * represents the visible tail of a run (maybe size 1), carrying the run head's START_AT and the + * latest row values for that run. The target table in its entirety represents the SCD2 + * representation of the CDC flow's source table. + * + * ------------- + * Concept: aux table. + * + * The side state table used to track out-of-order events from the CDC source. Two classes + * of events are represented as rows in this table: + * 1. Early-arriving deletes, with no matching upsert; this is considered a tombstone, + * and may match with a late-arriving upsert in a future microbatch. + * 2. No-op upserts (i.e. non-tail members of runs); hidden no-op rows that may reconcile as + * state-changing run heads in a future microbatch. + * + * The aux table is considered an internal table that users should neither tamper with nor + * consider a public contract.
+ * + * ------------- + * Concept: same-sequence tie-break between an upsert and a delete. + * + * When an upsert event and a delete event share the same `__RECORD_START_AT`, the delete wins: + * the visible upsert is dropped (as a zero-width interval) and only the tombstone is written + * to the aux table. The reverse pair (delete arriving first, then an upsert at the same + * sequence) is symmetric: the tombstone closes the upsert at the same instant, again leaving + * a zero-width visible interval that is dropped, and only the tombstone survives. + * + * This tie-break is an internal contract only - we do not publicly guarantee deterministic + * resolution when two events for the same key share a sequence value. Users who care about + * ordering should ensure their sequencing column is unique per (key, event). + */ object Scd2BatchProcessor { - // Metadata field that represents the exact time (sequence) of the CDC event that produced this - // row. Null only for synthetic decomposition tails. + /** + * Metadata field that represents the exact time (sequence) of the CDC event that produced + * this row. Null only for synthetic decomposition tails. + */ private[autocdc] val recordStartAtFieldName: String = "__RECORD_START_AT" - // In the target table: - // The user-visible column representing when this row is considered active from, i.e. - // this upsert run head's [[recordStartAtFieldName]]. - // In the aux table: - // If this row represents a tombstone, then the same value as [[recordStartAtFieldName]]. - // Else this row represents a coalesced no-op row that is part of an upsert run. - // Inherit the [[recordStartAtFieldName]] of the head of this upsert's run. - // - // The invariant in both tables is: startAtColName <= recordStartAtFieldName. If an event was - // generated at time X, it is active by time X, or earlier if it is not a run head. + /** + * What this column represents depends on which AutoCDC artifact table it is read from. 
+ * + * In the target table: + * The user-visible column representing when this row is considered active from, i.e. + * this upsert run's head [[recordStartAtFieldName]]. + * In the aux table: + * If this row represents a tombstone, then the same value as [[recordStartAtFieldName]]. + * Else this row represents a coalesced no-op row that is part of an upsert run. + * Inherit the [[recordStartAtFieldName]] of the head of this upsert's run. + * + * The invariant in both tables is: startAtColName <= recordStartAtFieldName. If an event was + * generated at time X, it is active by time X, or earlier if it is not a run head. + */ private[autocdc] val startAtColName: String = "__START_AT" - // In the target table: - // The user-visible column representing when this row became inactive. Null IFF the row - // is active: neither superseded by a state-changing upsert nor affected by a delete. - // In the aux table: - // If this row is a tombstone, then by convention the sequence of the delete event that - // produced it. Delete events are considered instantaneous in time. - // Else this row is a coalesced no-op row that is part of an upsert run, and by - // convention the value will always be null. + /** + * What this column represents depends on which AutoCDC artifact table it is read from. + * + * In the target table: + * The user-visible column representing when this row became inactive. Null IFF the row + * is active: neither superseded by a state-changing upsert nor affected by a delete. + * In the aux table: + * If this row is a tombstone, then by convention the sequence of the delete event that + * produced it. Delete events are considered instantaneous in time. + * Else this row is a coalesced no-op row that is part of an upsert run, and by + * convention the value will always be null. + */ private[autocdc] val endAtColName: String = "__END_AT" + /** + * Column names reserved by AutoCDC, that will be projected onto the microbatch and target + * tables. 
If the user's source dataframe contains any of these columns, SCD2 reconciliation + * will fail. + */ private val reservedFrameworkColNames: Set[String] = Set( startAtColName, endAtColName, From b77e557bc0fbab325471fe68da7da603e1a8490b Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Fri, 29 May 2026 21:10:07 +0000 Subject: [PATCH 3/9] fixing indenting --- .../autocdc/Scd2BatchProcessor.scala | 112 +++++++++--------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala index 1c1d9652ecbab..4566cfad0894e 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala @@ -61,7 +61,7 @@ case class Scd2BatchProcessor( private def extendMicrobatchRowsWithStartAt(microbatchDf: DataFrame): DataFrame = { microbatchDf.withColumn( colName = Scd2BatchProcessor.startAtColName, - col = changeArgs.sequencing + col = changeArgs.sequencing.cast(resolvedSequencingType) ) } @@ -209,63 +209,63 @@ case class Scd2BatchProcessor( * ordering should ensure their sequencing column is unique per (key, event). */ object Scd2BatchProcessor { - /** - * Metadata field that represents the exact time (sequence) of the CDC event that produced - * this row. Null only for synthetic decomposition tails. - */ - private[autocdc] val recordStartAtFieldName: String = "__RECORD_START_AT" + /** + * Metadata field that represents the exact time (sequence) of the CDC event that produced + * this row. Null only for synthetic decomposition tails. + */ + private[autocdc] val recordStartAtFieldName: String = "__RECORD_START_AT" - /** - * What this column represents depends on which AutoCDC artifact table it is read from. 
- * - * In the target table: - * The user-visible column representing when this row is considered active from, i.e. - * this upsert run's head [[recordStartAtFieldName]]. - * In the aux table: - * If this row represents a tombstone, then the same value as [[recordStartAtFieldName]]. - * Else this row represents a coalesced no-op row that is part of an upsert run. - * Inherit the [[recordStartAtFieldName]] of the head of this upsert's run. - * - * The invariant in both tables is: startAtColName <= recordStartAtFieldName. If an event was - * generated at time X, it is active by time X, or earlier if it is not a run head. - */ - private[autocdc] val startAtColName: String = "__START_AT" + /** + * What this column represents depends on which AutoCDC artifact table it is read from. + * + * In the target table: + * The user-visible column representing when this row is considered active from, i.e. + * this upsert run's head [[recordStartAtFieldName]]. + * In the aux table: + * If this row represents a tombstone, then the same value as [[recordStartAtFieldName]]. + * Else this row represents a coalesced no-op row that is part of an upsert run. + * Inherit the [[recordStartAtFieldName]] of the head of this upsert's run. + * + * The invariant in both tables is: startAtColName <= recordStartAtFieldName. If an event was + * generated at time X, it is active by time X, or earlier if it is not a run head. + */ + private[autocdc] val startAtColName: String = "__START_AT" - /** - * What this column represents depends on which AutoCDC artifact table it is read from. - * - * In the target table: - * The user-visible column representing when this row became inactive. Null IFF the row - * is active: neither superseded by a state-changing upsert nor affected by a delete. - * In the aux table: - * If this row is a tombstone, then by convention the sequence of the delete event that - * produced it. Delete events are considered instantaneous in time. 
- * Else this row is a coalesced no-op row that is part of an upsert run, and by - * convention the value will always be null. - */ - private[autocdc] val endAtColName: String = "__END_AT" + /** + * What this column represents depends on which AutoCDC artifact table it is read from. + * + * In the target table: + * The user-visible column representing when this row became inactive. Null IFF the row + * is active: neither superseded by a state-changing upsert nor affected by a delete. + * In the aux table: + * If this row is a tombstone, then by convention the sequence of the delete event that + * produced it. Delete events are considered instantaneous in time. + * Else this row is a coalesced no-op row that is part of an upsert run, and by + * convention the value will always be null. + */ + private[autocdc] val endAtColName: String = "__END_AT" - /** - * Column names reserved by AutoCDC, that will be projected onto the microbatch and target - * tables. If the user's source dataframe contains any of these columns, SCD2 reconciliation - * will fail. - */ - private val reservedFrameworkColNames: Set[String] = Set( - startAtColName, - endAtColName, - AutoCdcReservedNames.cdcMetadataColName - ) + /** + * Column names reserved by AutoCDC, that will be projected onto the microbatch and target + * tables. If the user's source dataframe contains any of these columns, SCD2 reconciliation + * will fail. + */ + private val reservedFrameworkColNames: Set[String] = Set( + startAtColName, + endAtColName, + AutoCdcReservedNames.cdcMetadataColName + ) - /** - * Construct the CDC metadata struct column for SCD1, following the exact schema and field - * ordering defined by [[cdcMetadataColSchema]]. 
- */ - def constructCdcMetadataStruct( - recordStartAt: Column, - sequencingType: DataType - ): Column = { - F.struct( - recordStartAt.cast(sequencingType).as(recordStartAtFieldName) - ) - } + /** + * Construct the CDC metadata struct column for SCD1, following the exact schema and field + * ordering defined by [[cdcMetadataColSchema]]. + */ + def constructCdcMetadataStruct( + recordStartAt: Column, + sequencingType: DataType + ): Column = { + F.struct( + recordStartAt.cast(sequencingType).as(recordStartAtFieldName) + ) + } } From 437bfa41dc495e53921c45031c7794ef00257c9b Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Mon, 1 Jun 2026 17:55:22 +0000 Subject: [PATCH 4/9] document decomposition tail concept --- .../pipelines/autocdc/Scd2BatchProcessor.scala | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala index 4566cfad0894e..cdc5c0d0a3caf 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala @@ -196,6 +196,22 @@ case class Scd2BatchProcessor( * public contract. * * ------------- + * Concept: decomposition tail. + * + * A transient and synthetic row produced by the batch processor during reconciliation (not + * from the CDC source) when a previously-closed historical row [START_AT=X, END_AT=Y] is + * bisected by a late-arriving event. The bisected row is split into a head + * [START_AT=X, END_AT=null] - inheriting the original row's data and `__RECORD_START_AT` - + * and a tail [START_AT=null, END_AT=Y, `__RECORD_START_AT`=null] that carries the original + * row's right boundary. The tail typically becomes the closing END_AT of a bisecting upsert, + * giving it a valid right boundary in the target-table history. 
+ * + * Decomposition tails are uniquely identified by `__RECORD_START_AT` = null - the only row + * category with that property - and are never persisted in their tail form: each is either + * absorbed by the next event in the affected window (dropped as redundant) or promoted to a + * tombstone in the aux table if it survives reconciliation unmatched. + * + * ------------- * Concept: same-sequence tie-break between an upsert and a delete. * * When an upsert event and a delete event share the same `__RECORD_START_AT`, the delete wins: From 75adaa5b0766821bc63cddc4aec7cd3764baf760 Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Mon, 1 Jun 2026 18:28:38 +0000 Subject: [PATCH 5/9] document event deduplication responsibilities and add test --- .../autocdc/Scd2BatchProcessor.scala | 21 ++++++++--- .../autocdc/Scd2BatchProcessorSuite.scala | 37 +++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala index cdc5c0d0a3caf..319ad59004881 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala @@ -41,13 +41,24 @@ case class Scd2BatchProcessor( * * Step ordering is load-bearing: the row-extension steps reference user data columns that * target-column selection is allowed to drop, so selection runs last. Unlike SCD1, no per-key - * deduplication step is needed - SCD2 preserves every event as part of the row's history. + * deduplication step is performed here - SCD2 preserves every event as part of the row's + * history, including byte-identical full-event duplicates. * - * Requires the microbatch to have been validated upstream so that the sequencing column is - * non-null and orderable.
+ * Duplicate event elimination (e.g., collapsing two identical events at the same sequence), + * whether across microbatches or within the same microbatch, is the responsibility of + * downstream reconciliation - not preprocessing. + * + * Produces a dataframe that retains every input row 1:1 - no rows added, dropped, reordered, + * or merged - with the following schema, in column order: + * 1. The user columns of `microbatchDf` that survive [[ChangeArgs.columnSelection]], in the + * order they appeared in the input. + * 2. `__START_AT` column, populated with the sequence value of the row. + * 3. `__END_AT` column, populated with the sequence value of the row IFF it's a delete event, + * null otherwise. + * 4. `__spark_autocdc_metadata` column. */ - private[autocdc] def preprocessMicrobatch(validatedBatchDf: DataFrame): DataFrame = { - validatedBatchDf + private[autocdc] def preprocessMicrobatch(microbatchDf: DataFrame): DataFrame = { + microbatchDf .transform(extendMicrobatchRowsWithStartAt) .transform(extendMicrobatchRowsWithEndAt) .transform(extendMicrobatchRowsWithCdcMetadata) diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala index 80dd4975b7304..f02986320ab6c 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala @@ -129,6 +129,43 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { ) } + test("preprocessMicrobatch preserves byte-identical full-event duplicates") { + val schema = new StructType() + .add("id", IntegerType) + .add("seq", LongType) + .add("value", StringType) + .add("is_delete", BooleanType) + + // Two byte-identical events for the same key: same key, same sequencing, same data, same + // delete flag. 
SCD2 preprocessing intentionally preserves every event verbatim, including + // full-event duplicates. Cross-event redundancy elimination (collapsing duplicates before + // they could reconcile to a zero-width visible row) is the responsibility of downstream + // reconciliation, not preprocessing. + val batch = microbatchOf(schema)( + Row(1, 10L, "alice", false), + Row(1, 10L, "alice", false) + ) + + val processor = Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = Seq(UnqualifiedColumnName("id")), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + deleteCondition = Some(F.col("is_delete")) + ), + resolvedSequencingType = LongType + ) + + // Both rows must survive verbatim. + checkAnswer( + df = processor.preprocessMicrobatch(batch), + expectedAnswer = Seq( + Row(1, 10L, "alice", false, 10L, null, Row(10L)), + Row(1, 10L, "alice", false, 10L, null, Row(10L)) + ) + ) + } + test("preprocessMicrobatch leaves __END_AT null on every row when deleteCondition is None") { val schema = new StructType() .add("id", IntegerType) From 35b9d51dcac7902828adfd9341df7aeb456b6a7c Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 2 Jun 2026 00:09:56 +0000 Subject: [PATCH 6/9] core implementation --- .../autocdc/Scd2BatchProcessor.scala | 272 ++++++++++++++++-- 1 file changed, 254 insertions(+), 18 deletions(-) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala index 319ad59004881..b769d66affc98 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala @@ -17,11 +17,13 @@ package org.apache.spark.sql.pipelines.autocdc +import org.apache.spark.SparkException import org.apache.spark.sql.{functions => F} import org.apache.spark.sql.Column import 
org.apache.spark.sql.catalyst.util.QuotingUtils import org.apache.spark.sql.classic.DataFrame -import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.sql.types.{DataType, StructField, StructType} +import org.apache.spark.util.ArrayImplicits._ /** * Per-microbatch processor for SCD Type 2 AutoCDC flows, complying to the specified @@ -35,6 +37,9 @@ case class Scd2BatchProcessor( changeArgs: ChangeArgs, resolvedSequencingType: DataType) { + /** Backtick-quoted key column names. */ + private lazy val keysQuoted: Seq[String] = changeArgs.keys.map(_.quoted) + /** * Reconcile a CDC microbatch into the canonical form the auxiliary- and target-table merges * consume. @@ -48,14 +53,17 @@ case class Scd2BatchProcessor( * whether across microbatches or within the same microbatch, is the responsibility of * downstream reconciliation - not preprocessing. * - * Produces a dataframe that retains every input row 1:1 - no rows added, dropped, reordered, - * or merged - with the following schema, in column order: - * 1. The user columns of `microbatchDf` that survive [[ChangeArgs.columnSelection]], in the - * order they appeared in the input. - * 2. `__START_AT` column, populated with the sequence value of the row. - * 3. `__END_AT` column, populated with the sequence value of the row IFF it's a delete event, - * null otherwise. - * 4. `__spark_autocdc_metadata` column. + * @param microbatchDf + * the incoming CDC microbatch. + * @return + * a dataframe that retains every input row 1:1 - no rows added, dropped, reordered, or + * merged - with the following schema, in column order: + * 1. The user columns of `microbatchDf` that survive [[ChangeArgs.columnSelection]], in + * the order they appeared in the input. + * 2. `__START_AT` column, populated with the sequence value of the row. + * 3. `__END_AT` column, populated with the sequence value of the row IFF it's a delete + * event, null otherwise. + * 4. 
`__spark_autocdc_metadata` column, conforming to [[targetCdcMetadataColSchema]]. */ private[autocdc] def preprocessMicrobatch(microbatchDf: DataFrame): DataFrame = { microbatchDf @@ -105,7 +113,7 @@ case class Scd2BatchProcessor( private def extendMicrobatchRowsWithCdcMetadata(microbatchDf: DataFrame): DataFrame = { microbatchDf.withColumn( colName = AutoCdcReservedNames.cdcMetadataColName, - col = Scd2BatchProcessor.constructCdcMetadataStruct( + col = Scd2BatchProcessor.constructTargetCdcMetadataCol( recordStartAt = changeArgs.sequencing, sequencingType = resolvedSequencingType ) @@ -148,6 +156,179 @@ case class Scd2BatchProcessor( microbatch.select(finalColumnsToSelect: _*) } + /** + * For each key in the preprocessed microbatch, compute the earliest `__RECORD_START_AT` + * across the key's events. + * + * @param preprocessedBatchDf + * a validated and preprocessed microbatch as produced by [[preprocessMicrobatch]] - in + * particular, non-null key columns and a non-null `__RECORD_START_AT` on every row. + * @return + * a dataframe containing one row per distinct key. Schema, in column order: + * 1. The key columns ([[ChangeArgs.keys]]), in their declared order. + * 2. `__spark_autocdc_min_sequencing` column, carrying the min `__RECORD_START_AT` + * across all records within the microbatch for that key. + */ + private def computeMinimumSequencePerKey(preprocessedBatchDf: DataFrame): DataFrame = { + val recordStartAt = + Scd2BatchProcessor.recordStartAtOf(F.col(AutoCdcReservedNames.cdcMetadataColName)) + preprocessedBatchDf + .groupBy(keysQuoted.map(F.col): _*) + .agg(F.min(recordStartAt).alias(Scd2BatchProcessor.minSequencingColName)) + } + + /** + * Find the auxiliary-table rows whose state matters for reconciling the microbatch. + * + * @param rawAuxiliaryTableDf + * the auxiliary table in its native schema, whose CDC metadata column carries an extra + * `__DELETED_BY_BATCH_ID` field on top of the target/microbatch schema. 
+ * @param minimumSequencePerKeyInMicrobatchDf + * one row per distinct key as produced by [[computeMinimumSequencePerKey]], representing + * the minimum sequence for that key in the microbatch. + * @param batchId + * the underlying Spark streaming query's batchId, which serves as the idempotency key. + * @return + * a dataframe containing the affected aux rows, with the CDC metadata column narrowed + * to the target/microbatch schema (aux-only subfields stripped) so the result is + * union-compatible with preprocessed microbatch rows and target-table rows downstream. + * Schema, in column order: + * 1. The user columns of the aux table. + * 2. `__START_AT` column. + * 3. `__END_AT` column. + * 4. `__spark_autocdc_metadata` column, conforming to [[targetCdcMetadataColSchema]]. + */ + private def findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf: DataFrame, + minimumSequencePerKeyInMicrobatchDf: DataFrame, + batchId: Long + ): DataFrame = { + val rawAuxTableRecordStartAtField = Scd2BatchProcessor.recordStartAtOf( + F.col(AutoCdcReservedNames.cdcMetadataColName) + ) + val auxTableDeletedByBatchIdField = Scd2BatchProcessor.deletedByBatchIdOf( + F.col(AutoCdcReservedNames.cdcMetadataColName) + ) + + val reducedAuxiliaryTableDf = rawAuxiliaryTableDf + .filter( + // Ignore any auxiliary table rows logically deleted by any microbatch other than this one + // itself. Recall this execution could be a retry attempt on the same microbatch, and + // batchId is our idempotency key. + auxTableDeletedByBatchIdField.isNull || + auxTableDeletedByBatchIdField === F.lit(batchId) + ) + // The aux table's CDC metadata column is a superset of the target's: it carries the + // additional `__DELETED_BY_BATCH_ID` field. 
Since we eventually union aux rows with + // target and microbatch rows (which use the target's narrower CDC metadata schema), strip + // the aux-only subfields here so all three sources share an identical CDC metadata column + // schema, and replace the existing CDC metadata column with it. + .withColumn( + AutoCdcReservedNames.cdcMetadataColName, + Scd2BatchProcessor.constructTargetCdcMetadataCol( + recordStartAt = rawAuxTableRecordStartAtField, + sequencingType = resolvedSequencingType + ) + ) + + val auxTableRecordStartAtField = + Scd2BatchProcessor.recordStartAtOf(F.col(AutoCdcReservedNames.cdcMetadataColName)) + + val minSequencePerKeyInMicrobatchCol = F.col(Scd2BatchProcessor.minSequencingColName) + + // Per key, identify the sequence value associated with the anchor row in the aux table. + // + // The anchor row is the aux row with the largest `recordStartAt` strictly less than the + // min sequence in the incoming microbatch for that key. It is the immediate "left + // context" the reconciler needs to correctly attribute incoming events to a run: without + // the anchor, a no-op upsert of an existing run would look like a brand-new run head. + // + // Keys with no aux row strictly before the min sequence have no anchor; their affected set + // reduces to "all aux rows at or after the min sequence." + // + // The shape of this DataFrame is: [key1, key2, ... keyN, anchorRowSequenceForCompositeKey] + val anchorOrderingPerKey: DataFrame = reducedAuxiliaryTableDf + // The number of rows in [[minimumSequencePerKeyInMicrobatchDf]] is bounded by the + // number of unique keys in the microbatch, which should typically be small. The + // auxiliary table should generally also be small, containing only no-op upsert runs + // and tombstones per key. Therefore this join should be cheap, and broadcast joinable. 
+ .join(minimumSequencePerKeyInMicrobatchDf, keysQuoted) + .filter(auxTableRecordStartAtField < minSequencePerKeyInMicrobatchCol) + .groupBy(keysQuoted.map(F.col): _*) + .agg( + F.max(auxTableRecordStartAtField).as(Scd2BatchProcessor.anchorOrderingColName) + ) + val anchorOrderingCol = F.col(Scd2BatchProcessor.anchorOrderingColName) + + // Now that we have the minimum sequence in the microbatch and the sequence of the anchor row, + // we have enough information to compute the full set of auxiliary rows that affect or are + // affected by the microbatch. + val auxRowIsAfterMinSequenceInMicrobatch = + auxTableRecordStartAtField >= minSequencePerKeyInMicrobatchCol + + val auxRowIsAnchorRow = auxTableRecordStartAtField === anchorOrderingCol + + val auxRowAffectsMicrobatch = auxRowIsAfterMinSequenceInMicrobatch || auxRowIsAnchorRow + + val affectedRowsFromAuxiliaryTable = reducedAuxiliaryTableDf + // Per row, join/project the minimum microbatch sequence and anchor sequencing for + // that row's key set. + .join(minimumSequencePerKeyInMicrobatchDf, keysQuoted) + .join( + anchorOrderingPerKey, + keysQuoted, + joinType = "left" + ) + // Using the joined information, determine if the row is affected by the microbatch. + .filter(auxRowAffectsMicrobatch) + .drop(minSequencePerKeyInMicrobatchCol, anchorOrderingCol) + + affectedRowsFromAuxiliaryTable + } + + /** + * Find the target-table rows whose state matters for reconciling the microbatch. + * + * @param targetTableDf + * the target table in its native schema. + * @param minimumSequencePerKeyInMicrobatchDf + * one row per distinct key as produced by [[computeMinimumSequencePerKey]], representing + * the minimum sequence for that key in the microbatch. + * @return + * a dataframe containing the affected target rows. Schema, in column order: + * 1. The user columns of the target table. + * 2. `__START_AT` column. + * 3. `__END_AT` column. + * 4. `__spark_autocdc_metadata` column, conforming to [[targetCdcMetadataColSchema]]. 
+ */ + private def findAffectedRowsFromTargetTable( + targetTableDf: DataFrame, + minimumSequencePerKeyInMicrobatchDf: DataFrame + ): DataFrame = { + val targetEndAtCol = F.col(Scd2BatchProcessor.endAtColName) + val minSequencePerKeyInMicrobatchCol = F.col(Scd2BatchProcessor.minSequencingColName) + + // Per key, identify all the rows in the target table that may be affected by the + // incoming microbatch. + // + // Unlike the auxiliary table, the target table holds visible rows only: no hidden open + // no-op upsert rows, no tombstones. Visible rows for a given key form a non-overlapping + // interval partition over the sequencing axis, and at most one row has a null __END_AT + // (the currently active row per key). + // + // Hence we can simply grab all rows that were active at some point after the min sequencing + // per key, which can be determined entirely by the row's `__END_AT`. + val isCurrentlyActiveRow = targetEndAtCol.isNull + val rowEndsAfterMinimumSequence = targetEndAtCol >= minSequencePerKeyInMicrobatchCol + val rowMayBeAffected = isCurrentlyActiveRow || rowEndsAfterMinimumSequence + + val affectedRowsFromTargetTable = targetTableDf + .join(minimumSequencePerKeyInMicrobatchDf, keysQuoted) + .filter(rowMayBeAffected) + .drop(minSequencePerKeyInMicrobatchCol) + + affectedRowsFromTargetTable + } } /** @@ -237,10 +418,23 @@ case class Scd2BatchProcessor( */ object Scd2BatchProcessor { /** - * Metadata field that represents the exact time (sequence) of the CDC event that produced - * this row. Null only for synthetic decomposition tails. + * CDC metadata column field that represents the exact time (sequence) of the CDC event that + * produced this row. Null only for synthetic decomposition tails. */ private[autocdc] val recordStartAtFieldName: String = "__RECORD_START_AT" + + /** + * CDC metadata column field that represents the microbatch id a particular row was considered + * logically deleted by. 
Any future microbatches should consider that row as deleted. + * + * Logically deleted rows exist as a concept in the auxiliary table to provide idempotency, should a + * microbatch fail between a MERGE executed against the auxiliary table and the MERGE executed + * against the target table. + * + * This field only exists in the CDC metadata column for the auxiliary table, not in the CDC + * metadata column for the target table. + */ + private val deletedByBatchIdFieldName: String = "__DELETED_BY_BATCH_ID" /** * What this column represents depends on which AutoCDC artifact table it is read from. @@ -283,16 +477,58 @@ object Scd2BatchProcessor { AutoCdcReservedNames.cdcMetadataColName ) + /** + * Name of temporary column projected onto microbatch to compute the min sequencing value within + * the microbatch. + */ + private val minSequencingColName: String = s"${AutoCdcReservedNames.prefix}min_sequencing" + /** + * Name of temporary column projected + */ + private val anchorOrderingColName: String = s"${AutoCdcReservedNames.prefix}anchor_ordering" + + /** Project the `__RECORD_START_AT` field out of a CDC metadata column. */ + private def recordStartAtOf(cdcMetadataCol: Column): Column = + cdcMetadataCol.getField(recordStartAtFieldName) + + /** Project the `__DELETED_BY_BATCH_ID` field out of any `_cdc_metadata` column. */ + private def deletedByBatchIdOf(cdcMetadataCol: Column): Column = + cdcMetadataCol.getField(deletedByBatchIdFieldName) + /** - * Construct the CDC metadata struct column for SCD1, following the exact schema and field - * ordering defined by [[cdcMetadataColSchema]]. + * Schema of the CDC metadata struct column for SCD2 target table rows. + * + * Note that the aux table's CDC metadata column is a strict superset of this schema (it carries + * an additional `__DELETED_BY_BATCH_ID` field).
*/ - def constructCdcMetadataStruct( + private[pipelines] def targetCdcMetadataColSchema(sequencingType: DataType): StructType = + StructType( + Seq( + // The sequence value of the originating CDC event for this row. Nullable because + // decomposition tails, which are transient and synthetically constructed during + // reconciliation, have a null record start at. + StructField(recordStartAtFieldName, sequencingType, nullable = true) + ) + ) + + /** + * Construct the CDC metadata struct column for SCD2 target/microbatch rows, following the + * exact schema and field ordering defined by [[targetCdcMetadataColSchema]]. + */ + private def constructTargetCdcMetadataCol( recordStartAt: Column, sequencingType: DataType ): Column = { - F.struct( - recordStartAt.cast(sequencingType).as(recordStartAtFieldName) - ) + val cdcMetadataFieldsInOrder = targetCdcMetadataColSchema(sequencingType).fields.map { field => + val value = field.name match { + case `recordStartAtFieldName` => recordStartAt + case other => + throw SparkException.internalError( + s"Unable to construct SCD2 CDC metadata column due to unknown `${other}` field." 
+ ) + } + value.cast(field.dataType).as(field.name) + } + F.struct(cdcMetadataFieldsInOrder.toImmutableArraySeq: _*) } } From 95d72807a3230670874fd6f919afdb3c3fa09b31 Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 2 Jun 2026 01:12:18 +0000 Subject: [PATCH 7/9] tests --- .../autocdc/Scd2BatchProcessor.scala | 114 ++- .../autocdc/Scd2BatchProcessorSuite.scala | 739 +++++++++++++++++- 2 files changed, 816 insertions(+), 37 deletions(-) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala index b769d66affc98..4d4e32eb24d46 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.{functions => F} import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.util.QuotingUtils import org.apache.spark.sql.classic.DataFrame -import org.apache.spark.sql.types.{DataType, StructField, StructType} +import org.apache.spark.sql.types.{DataType, LongType, StructField, StructType} import org.apache.spark.util.ArrayImplicits._ /** @@ -37,9 +37,18 @@ case class Scd2BatchProcessor( changeArgs: ChangeArgs, resolvedSequencingType: DataType) { - /** Backtick-quoted key column names. */ + /** + * Backtick-quoted key column names. Use when the name flows through an expression parser + * (e.g., [[F.col]]), which interprets dotted names as struct-field accesses. + */ private lazy val keysQuoted: Seq[String] = changeArgs.keys.map(_.quoted) + /** + * Raw key column names. Use when the name is matched literally against a schema field + * (e.g., DataFrame `.join(other, usingColumns)`), where backticks are NOT stripped. 
+ */ + private lazy val keysRaw: Seq[String] = changeArgs.keys.map(_.name) + /** * Reconcile a CDC microbatch into the canonical form the auxiliary- and target-table merges * consume. @@ -169,7 +178,7 @@ case class Scd2BatchProcessor( * 2. `__spark_autocdc_min_sequencing` column, carrying the min `__RECORD_START_AT` * across all records within the microbatch for that key. */ - private def computeMinimumSequencePerKey(preprocessedBatchDf: DataFrame): DataFrame = { + private[autocdc] def computeMinimumSequencePerKey(preprocessedBatchDf: DataFrame): DataFrame = { val recordStartAt = Scd2BatchProcessor.recordStartAtOf(F.col(AutoCdcReservedNames.cdcMetadataColName)) preprocessedBatchDf @@ -189,21 +198,16 @@ case class Scd2BatchProcessor( * @param batchId * the underlying Spark streaming query's batchId, which serves as the idempotency key. * @return - * a dataframe containing the affected aux rows, with the CDC metadata column narrowed + * a dataframe containing all the affected aux rows, but with the CDC metadata column narrowed * to the target/microbatch schema (aux-only subfields stripped) so the result is * union-compatible with preprocessed microbatch rows and target-table rows downstream. - * Schema, in column order: - * 1. The user columns of the aux table. - * 2. `__START_AT` column. - * 3. `__END_AT` column. - * 4. `__spark_autocdc_metadata` column, conforming to [[targetCdcMetadataColSchema]]. 
*/ - private def findAffectedRowsFromAuxiliaryTable( + private[autocdc] def findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf: DataFrame, minimumSequencePerKeyInMicrobatchDf: DataFrame, batchId: Long ): DataFrame = { - val rawAuxTableRecordStartAtField = Scd2BatchProcessor.recordStartAtOf( + val auxTableRecordStartAtField = Scd2BatchProcessor.recordStartAtOf( F.col(AutoCdcReservedNames.cdcMetadataColName) ) val auxTableDeletedByBatchIdField = Scd2BatchProcessor.deletedByBatchIdOf( @@ -226,13 +230,10 @@ case class Scd2BatchProcessor( .withColumn( AutoCdcReservedNames.cdcMetadataColName, Scd2BatchProcessor.constructTargetCdcMetadataCol( - recordStartAt = rawAuxTableRecordStartAtField, + recordStartAt = auxTableRecordStartAtField, sequencingType = resolvedSequencingType ) ) - - val auxTableRecordStartAtField = - Scd2BatchProcessor.recordStartAtOf(F.col(AutoCdcReservedNames.cdcMetadataColName)) val minSequencePerKeyInMicrobatchCol = F.col(Scd2BatchProcessor.minSequencingColName) @@ -252,7 +253,7 @@ case class Scd2BatchProcessor( // number of unique keys in the microbatch, which should typically be small. The // auxiliary table should generally also be small, containing only no-op upsert runs // and tombstones per key. Therefore this join should be cheap, and broadcast joinable. - .join(minimumSequencePerKeyInMicrobatchDf, keysQuoted) + .join(minimumSequencePerKeyInMicrobatchDf, keysRaw) .filter(auxTableRecordStartAtField < minSequencePerKeyInMicrobatchCol) .groupBy(keysQuoted.map(F.col): _*) .agg( @@ -273,10 +274,10 @@ case class Scd2BatchProcessor( val affectedRowsFromAuxiliaryTable = reducedAuxiliaryTableDf // Per row, join/project the minimum microbatch sequence and anchor sequencing for // that row's key set. 
- .join(minimumSequencePerKeyInMicrobatchDf, keysQuoted) + .join(minimumSequencePerKeyInMicrobatchDf, keysRaw) .join( anchorOrderingPerKey, - keysQuoted, + keysRaw, joinType = "left" ) // Using the joined information, determine if the row is affected by the microbatch. @@ -295,13 +296,9 @@ case class Scd2BatchProcessor( * one row per distinct key as produced by [[computeMinimumSequencePerKey]], representing * the minimum sequence for that key in the microbatch. * @return - * a dataframe containing the affected target rows. Schema, in column order: - * 1. The user columns of the target table. - * 2. `__START_AT` column. - * 3. `__END_AT` column. - * 4. `__spark_autocdc_metadata` column, conforming to [[targetCdcMetadataColSchema]]. + * a dataframe containing the affected target rows, exactly as-is from the target table. */ - private def findAffectedRowsFromTargetTable( + private[autocdc] def findAffectedRowsFromTargetTable( targetTableDf: DataFrame, minimumSequencePerKeyInMicrobatchDf: DataFrame ): DataFrame = { @@ -323,7 +320,7 @@ case class Scd2BatchProcessor( val rowMayBeAffected = isCurrentlyActiveRow || rowEndsAfterMinimumSequence val affectedRowsFromTargetTable = targetTableDf - .join(minimumSequencePerKeyInMicrobatchDf, keysQuoted) + .join(minimumSequencePerKeyInMicrobatchDf, keysRaw) .filter(rowMayBeAffected) .drop(minSequencePerKeyInMicrobatchCol) @@ -422,7 +419,7 @@ object Scd2BatchProcessor { * produced this row. Null only for synthetic decomposition tails. */ private[autocdc] val recordStartAtFieldName: String = "__RECORD_START_AT" - + /** * CDC metadata column field that represents the microbatch id a particular row was considered * logically deleted by. Any future microbatches should consider that row as deleted. @@ -477,27 +474,32 @@ object Scd2BatchProcessor { AutoCdcReservedNames.cdcMetadataColName ) - /** - * Name of temporary column projected onto microbatch to compute the min sequencing value within - * the microbatch. 
+ /** + * Name of temporary column projected onto microbatch to compute the min sequencing value per + * key within the microbatch. */ - private val minSequencingColName: String = s"${AutoCdcReservedNames.prefix}min_sequencing" + private[autocdc] val minSequencingColName: String = + s"${AutoCdcReservedNames.prefix}min_sequencing" + /** - * Name of temporary column projected + * Name of temporary column used to identify the sequence associated with the anchor + * row found in the auxiliary table for the incoming microbatch. Since sequences must be unique + * amongst all rows for a key (or risk undefined behavior), this sequence value uniquely + * identifies an exact row in the aux. */ private val anchorOrderingColName: String = s"${AutoCdcReservedNames.prefix}anchor_ordering" - /** Project the `__RECORD_START_AT` field out of a CDC metadata column. */ + /** Project the `__RECORD_START_AT` field out of an SCD2 CDC metadata column. */ private def recordStartAtOf(cdcMetadataCol: Column): Column = cdcMetadataCol.getField(recordStartAtFieldName) - /** Project the `__DELETED_BY_BATCH_ID` field out of any `_cdc_metadata` column. */ + /** Project the `__DELETED_BY_BATCH_ID` field out of an SCD2 CDC metadata column. */ private def deletedByBatchIdOf(cdcMetadataCol: Column): Column = cdcMetadataCol.getField(deletedByBatchIdFieldName) /** - * Schema of the CDC metadata struct column for SCD2 target table rows. - * + * Schema of the CDC metadata struct column for SCD2 target table rows. + * * Note that the aux table's CDC metadata column is a strict superset of this schema (it carries * an additional `__DELETED_BY_BATCH_ID` field). */ @@ -524,7 +526,47 @@ object Scd2BatchProcessor { case `recordStartAtFieldName` => recordStartAt case other => throw SparkException.internalError( - s"Unable to construct SCD2 CDC metadata column due to unknown `${other}` field." + s"Unable to construct SCD2 target CDC metadata column due to unknown " + + s"`${other}` field."
+ ) + } + value.cast(field.dataType).as(field.name) + } + F.struct(cdcMetadataFieldsInOrder.toImmutableArraySeq: _*) + } + + /** + * Schema of the CDC metadata struct column for SCD2 aux-table rows. Strict superset of + * [[targetCdcMetadataColSchema]]: extends it with the aux-only `__DELETED_BY_BATCH_ID` + * field used for SCD2 idempotency. + */ + private[pipelines] def auxCdcMetadataColSchema(sequencingType: DataType): StructType = + StructType( + targetCdcMetadataColSchema(sequencingType).fields.toImmutableArraySeq ++ + Seq( + // The microbatch id by which this aux row was logically deleted, or null if the + // row is still live. + StructField(deletedByBatchIdFieldName, LongType, nullable = true) + ) + ) + + /** + * Construct the CDC metadata struct column for SCD2 aux-table rows, following the exact + * schema and field ordering defined by [[auxCdcMetadataColSchema]]. + */ + private[autocdc] def constructAuxCdcMetadataCol( + recordStartAt: Column, + deletedByBatchId: Column, + sequencingType: DataType + ): Column = { + val cdcMetadataFieldsInOrder = auxCdcMetadataColSchema(sequencingType).fields.map { field => + val value = field.name match { + case `recordStartAtFieldName` => recordStartAt + case `deletedByBatchIdFieldName` => deletedByBatchId + case other => + throw SparkException.internalError( + s"Unable to construct SCD2 aux CDC metadata column due to unknown " + + s"`${other}` field." 
) } value.cast(field.dataType).as(field.name) diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala index f02986320ab6c..5df2c0df27127 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.pipelines.autocdc -import org.apache.spark.sql.{functions => F, QueryTest, Row} +import org.apache.spark.sql.{functions => F, Column, QueryTest, Row} import org.apache.spark.sql.classic.DataFrame import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -29,6 +29,111 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { private def microbatchOf(schema: StructType)(rows: Row*): DataFrame = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema) + /** + * Build an aux-table [[DataFrame]] from explicit user rows + framework column values. + * + * Each input [[Row]] carries the user columns followed by: + * - the row's `__START_AT` value + * - the row's `__END_AT` value (null for non-tombstone rows) + * - the row's `__RECORD_START_AT` value (projected into the CDC metadata struct) + * - the row's `__DELETED_BY_BATCH_ID` value (null when the row is still live) + * + * Built via [[Scd2BatchProcessor.constructAuxCdcMetadataCol]] so the test data tracks the + * production schema 1:1. 
+ */ + private def auxTableOf( + userSchema: StructType, + sequencingType: DataType = LongType + )(rows: Row*): DataFrame = { + val rawRecordStartAtColName = "__raw_record_start_at" + val rawDeletedByBatchIdColName = "__raw_deleted_by_batch_id" + val rawSchema = userSchema + .add(Scd2BatchProcessor.startAtColName, sequencingType, nullable = true) + .add(Scd2BatchProcessor.endAtColName, sequencingType, nullable = true) + .add(rawRecordStartAtColName, sequencingType, nullable = true) + .add(rawDeletedByBatchIdColName, LongType, nullable = true) + spark + .createDataFrame(spark.sparkContext.parallelize(rows), rawSchema) + .withColumn( + AutoCdcReservedNames.cdcMetadataColName, + Scd2BatchProcessor.constructAuxCdcMetadataCol( + recordStartAt = F.col(rawRecordStartAtColName), + deletedByBatchId = F.col(rawDeletedByBatchIdColName), + sequencingType = sequencingType + ) + ) + .drop(rawRecordStartAtColName, rawDeletedByBatchIdColName) + } + + /** + * Build a target-table [[DataFrame]] from explicit user rows + framework column values. + * + * Each input [[Row]] carries the user columns followed by: + * - the row's `__START_AT` value + * - the row's `__END_AT` value (null IFF the row is currently active) + * - the row's `_cdc_metadata` struct as a [[Row]] (e.g., `Row(recordStartAt)`) + */ + private def targetTableOf( + userSchema: StructType, + sequencingType: DataType = LongType + )(rows: Row*): DataFrame = { + val schema = userSchema + .add(Scd2BatchProcessor.startAtColName, sequencingType, nullable = true) + .add(Scd2BatchProcessor.endAtColName, sequencingType, nullable = true) + .add( + AutoCdcReservedNames.cdcMetadataColName, + Scd2BatchProcessor.targetCdcMetadataColSchema(sequencingType), + nullable = false + ) + spark.createDataFrame(spark.sparkContext.parallelize(rows), schema) + } + + /** + * Build a minimum-sequence-per-key [[DataFrame]] used by the `findAffected*` functions. 
+ * + * Each input [[Row]] carries the key columns followed by the per-key minimum sequence. + */ + private def minSeqOf( + keySchema: StructType, + sequencingType: DataType = LongType + )(rows: Row*): DataFrame = { + val schema = keySchema.add( + Scd2BatchProcessor.minSequencingColName, + sequencingType, + nullable = false + ) + spark.createDataFrame(spark.sparkContext.parallelize(rows), schema) + } + + /** + * Build a [[Scd2BatchProcessor]] suitable for `findAffected*` and + * `computeMinimumSequencePerKey` tests. The `sequencing` is fixed to `F.col("seq")`, + * so the input microbatch must include a `seq` column. `deleteCondition` is optional + * and only needed by tests that exercise both event kinds. + */ + private def processorWithKeys( + keys: Seq[String], + deleteCondition: Option[Column] = None + ): Scd2BatchProcessor = + Scd2BatchProcessor( + changeArgs = ChangeArgs( + keys = keys.map(UnqualifiedColumnName(_)), + sequencing = F.col("seq"), + storedAsScdType = ScdType.Type2, + deleteCondition = deleteCondition + ), + resolvedSequencingType = LongType + ) + + /** User schema for single-key `findAffected*` tests: `id`, `value`. */ + private val singleKeyUserSchema: StructType = new StructType() + .add("id", IntegerType) + .add("value", StringType) + + /** Key-only schema for single-key `findAffected*` tests' minSeq dataframes. 
*/ + private val singleKeyKeySchema: StructType = new StructType() + .add("id", IntegerType) + // =============== preprocessMicrobatch tests =============== test("preprocessMicrobatch appends framework columns __START_AT, __END_AT, " + @@ -512,4 +617,636 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { ) ) } + + // =============== computeMinimumSequencePerKey tests =============== + + test("computeMinimumSequencePerKey returns one row per distinct key with the per-key " + + "minimum __RECORD_START_AT, aggregating across both upsert and delete events") { + val schema = new StructType() + .add("id", IntegerType) + .add("seq", LongType) + .add("is_delete", BooleanType) + + val processor = processorWithKeys( + keys = Seq("id"), + deleteCondition = Some(F.col("is_delete")) + ) + + // Two keys, each with multiple events including at least one delete and at least one + // out-of-order sequence. Delete events must feed into the per-key min exactly like + // upserts: `preprocessMicrobatch` stamps `__RECORD_START_AT = sequencing` on every + // row regardless of kind, so the min computation cannot legitimately ignore deletes. + // (If it did, the early-delete-bisects-late-upsert reconciliation case would silently + // lose its anchor pull-in via the find* paths.) 
+ val raw = microbatchOf(schema)( + Row(1, 30L, false), // out-of-order: appears before lower sequences in the input + Row(1, 10L, true), // delete - smallest sequence for key=1 + Row(1, 20L, false), + Row(2, 50L, false), + Row(2, 40L, true) // delete - smallest sequence for key=2 + ) + + val preprocessed = processor.preprocessMicrobatch(raw) + val result = processor.computeMinimumSequencePerKey(preprocessed) + + assert(result.schema.fieldNames.toSeq == Seq( + "id", Scd2BatchProcessor.minSequencingColName + )) + checkAnswer( + df = result, + expectedAnswer = Seq( + Row(1, 10L), + Row(2, 40L) + ) + ) + } + + test("computeMinimumSequencePerKey computes the minimum per composite-key tuple, " + + "not per single key column") { + val schema = new StructType() + .add("region", StringType) + .add("customer_id", IntegerType) + .add("seq", LongType) + + val processor = processorWithKeys(keys = Seq("region", "customer_id")) + + // Three composite-key tuples that share their first or second key column. If the + // function mistakenly grouped by `region` alone, (US, 1) and (US, 2) would collapse + // and we'd see only two output rows; if it grouped by `customer_id` alone, + // (US, 1) and (EU, 1) would collapse. 
+ val raw = microbatchOf(schema)( + Row("US", 1, 100L), + Row("US", 1, 50L), // smaller sequence for (US, 1) + Row("US", 2, 200L), + Row("EU", 1, 30L) + ) + + val preprocessed = processor.preprocessMicrobatch(raw) + val result = processor.computeMinimumSequencePerKey(preprocessed) + + assert(result.schema.fieldNames.toSeq == Seq( + "region", "customer_id", Scd2BatchProcessor.minSequencingColName + )) + checkAnswer( + df = result, + expectedAnswer = Seq( + Row("US", 1, 50L), + Row("US", 2, 200L), + Row("EU", 1, 30L) + ) + ) + } + + test("computeMinimumSequencePerKey returns an empty result for an empty preprocessed " + + "microbatch") { + val schema = new StructType() + .add("id", IntegerType) + .add("seq", LongType) + + val processor = processorWithKeys(keys = Seq("id")) + + val raw = microbatchOf(schema)() + val preprocessed = processor.preprocessMicrobatch(raw) + val result = processor.computeMinimumSequencePerKey(preprocessed) + + assert(result.collect().isEmpty) + } + + test("computeMinimumSequencePerKey resolves key columns containing a literal dot") { + // Symmetric to the dotted-name test for findAffectedRowsFromAuxiliaryTable: the + // `groupBy(keysQuoted.map(F.col): _*)` site relies on `keysQuoted` correctly + // backtick-quoting "a.b" so that F.col parses it as a literal column name (rather + // than struct-field access). Pins the F.col axis of the keysQuoted vs keysRaw split. 
+ val schema = new StructType() + .add("a.b", IntegerType) + .add("seq", LongType) + + val processor = processorWithKeys(keys = Seq("`a.b`")) + + val raw = microbatchOf(schema)( + Row(1, 30L), + Row(1, 10L) + ) + val preprocessed = processor.preprocessMicrobatch(raw) + val result = processor.computeMinimumSequencePerKey(preprocessed) + + assert(result.schema.fieldNames.toSeq == Seq( + "a.b", Scd2BatchProcessor.minSequencingColName + )) + checkAnswer( + df = result, + expectedAnswer = Seq(Row(1, 10L)) + ) + } + + // =============== findAffectedRowsFromAuxiliaryTable tests =============== + + test("findAffectedRowsFromAuxiliaryTable selects per-key anchor (max recordStartAt " + + "strictly less than minSeq), preserves rows at or after minSeq, and drops older " + + "anchor candidates") { + val processor = processorWithKeys(Seq("id")) + + // Two keys to demonstrate per-key anchor isolation. + // + // Input row shape per `auxTableOf`: + // (id, value, __START_AT, __END_AT, __RECORD_START_AT, __DELETED_BY_BATCH_ID) + // + // Key 1: aux rows at recordStartAt 3, 5, 10. minSeq = 10. + // - 3 -> older than the anchor; dropped. + // - 5 -> anchor (max < 10); included. + // - 10 -> at minSeq; included via the >= branch (NOT as anchor; selection is strict <). + // Key 2: only one aux row at 7, minSeq = 7. + // - 7 -> at minSeq; included via >= branch. No anchor (no rows < 7 for this key). 
+ val aux = auxTableOf(singleKeyUserSchema)( + Row(1, "v1.3", 3L, null, 3L, null), + Row(1, "v1.5", 5L, null, 5L, null), + Row(1, "v1.10", 10L, null, 10L, null), + Row(2, "v2.7", 7L, null, 7L, null) + ) + val minSeq = minSeqOf(singleKeyKeySchema)( + Row(1, 10L), + Row(2, 7L) + ) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = 100L + ) + + checkAnswer( + df = result, + expectedAnswer = Seq( + Row(1, "v1.5", 5L, null, Row(5L)), // anchor for key=1 + Row(1, "v1.10", 10L, null, Row(10L)), // >= minSeq for key=1 + Row(2, "v2.7", 7L, null, Row(7L)) // >= minSeq for key=2 (no anchor) + ) + ) + } + + test("findAffectedRowsFromAuxiliaryTable selects the anchor regardless of row kind " + + "(tombstone vs no-op upsert continuation) and preserves both kinds in the affected set") { + val processor = processorWithKeys(Seq("id")) + + // Aux carries a mix of row kinds for one key. The find function does NOT distinguish + // between them - it filters purely on `recordStartAt` - so a tombstone, a no-op upsert + // run head, and a continuation are all eligible anchor candidates and all eligible for + // the >= minSeq inclusion branch. + val aux = auxTableOf(singleKeyUserSchema)( + // Tombstone at recordStartAt = 3 (deleted at sequence 3): startAt = endAt = 3. + // Older than the anchor; dropped. + Row(1, null, 3L, 3L, 3L, null), + // No-op upsert continuation at recordStartAt = 7: startAt inherits its run head's + // recordStartAt, endAt is null. Anchor for minSeq=10 (max < 10). + Row(1, "alice", 5L, null, 7L, null), + // Tombstone at recordStartAt = 12: at-or-after minSeq, included via >= branch. + Row(1, null, 12L, 12L, 12L, null), + // No-op upsert continuation at recordStartAt = 15: included via >= branch. 
+ Row(1, "bob", 13L, null, 15L, null) + ) + val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = 100L + ) + + checkAnswer( + df = result, + expectedAnswer = Seq( + Row(1, "alice", 5L, null, Row(7L)), + Row(1, null, 12L, 12L, Row(12L)), + Row(1, "bob", 13L, null, Row(15L)) + ) + ) + } + + test("findAffectedRowsFromAuxiliaryTable pulls in both no-op upsert continuations " + + "when the microbatch's minSeq strictly bisects them (so reconciliation can promote " + + "them to visible target rows if the bisecting event makes them no-longer no-ops)") { + val processor = processorWithKeys(Seq("id")) + + // Three no-op upsert continuations of a single run for key=1 (run head was at + // sequence 2 - that head lives in the target table, not the aux). Each continuation + // has startAt=2 (the head's recordStartAt) and endAt=null, and they all carry the + // same user data ("alice"), which is what made them no-ops when observed. + // + // The incoming microbatch's minSeq for key=1 is 10, which strictly bisects the + // continuations at recordStartAt=8 and recordStartAt=12. Both must surface in the + // affected set so a later reconciliation stage can decide whether to promote them + // to visible target rows - e.g., if the bisecting event carries different user data, + // the previously-no-op continuations become honest history with non-trivial + // [startAt, endAt] boundaries. 
+ val aux = auxTableOf(singleKeyUserSchema)( + Row(1, "alice", 2L, null, 3L, null), // older continuation; dropped (< anchor) + Row(1, "alice", 2L, null, 8L, null), // anchor: max recordStartAt strictly < 10 + Row(1, "alice", 2L, null, 12L, null) // included via >= branch (>= minSeq=10) + ) + val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = 100L + ) + + checkAnswer( + df = result, + expectedAnswer = Seq( + Row(1, "alice", 2L, null, Row(8L)), // anchor - left context for the run + Row(1, "alice", 2L, null, Row(12L)) // bisected by microbatch event at seq 10 + ) + ) + } + + test("findAffectedRowsFromAuxiliaryTable pulls in tombstones on both sides of minSeq, " + + "including a tombstone-as-anchor and multiple tombstones at-or-after minSeq") { + val processor = processorWithKeys(Seq("id")) + + // Four tombstones for key=1 at recordStartAt = 3, 7, 12, 18 (delete events). + // Tombstones obey: startAt = endAt = recordStartAt and carry no user data. + // + // The microbatch's minSeq for key=1 is 10. Expected: + // - Tombstone at 3: older than the anchor; dropped. + // - Tombstone at 7: anchor (max recordStartAt strictly < 10); included. + // - Tombstone at 12: at-or-after minSeq; included via the >= branch. + // - Tombstone at 18: at-or-after minSeq; included via the >= branch. + // + // Why both sides matter for downstream reconciliation: the left-side tombstone tells + // reconciliation the prior run was already closed by a delete (so an incoming upsert + // starts a fresh run, not a continuation), while the right-side tombstones bound the + // visible interval of any new upsert run against subsequent deletes. 
+ val aux = auxTableOf(singleKeyUserSchema)( + Row(1, null, 3L, 3L, 3L, null), + Row(1, null, 7L, 7L, 7L, null), + Row(1, null, 12L, 12L, 12L, null), + Row(1, null, 18L, 18L, 18L, null) + ) + val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = 100L + ) + + checkAnswer( + df = result, + expectedAnswer = Seq( + Row(1, null, 7L, 7L, Row(7L)), // anchor - last delete before minSeq + Row(1, null, 12L, 12L, Row(12L)), // delete at seq 12 + Row(1, null, 18L, 18L, Row(18L)) // delete at seq 18 + ) + ) + } + + test("findAffectedRowsFromAuxiliaryTable filters logically-deleted aux rows by " + + "__DELETED_BY_BATCH_ID: keeps null and current-batch, drops different-batch") { + val processor = processorWithKeys(Seq("id")) + + val currentBatchId = 100L + val differentBatchId = 99L + + // All three rows would be eligible by recordStartAt alone (>= minSeq=10), but the + // idempotency filter drops the one logically deleted by a different batch. 
+ val aux = auxTableOf(singleKeyUserSchema)( + Row(1, "live", 10L, null, 10L, null), // not deleted -> kept + Row(1, "retried", 11L, null, 11L, currentBatchId), // deleted by current -> kept + Row(1, "ignored", 12L, null, 12L, differentBatchId) // deleted by another -> dropped + ) + val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = currentBatchId + ) + + checkAnswer( + df = result, + expectedAnswer = Seq( + Row(1, "live", 10L, null, Row(10L)), + Row(1, "retried", 11L, null, Row(11L)) + ) + ) + } + + test("findAffectedRowsFromAuxiliaryTable still selects the anchor when that anchor row " + + "was logically deleted by the current batch (idempotent retry)") { + val processor = processorWithKeys(Seq("id")) + + // Models an idempotent retry: a previous attempt of this same batch already logically + // deleted the anchor row in the aux table, but mid-retry we still need to treat it as + // the left context. The idempotency filter retains rows deleted by `currentBatchId`, + // and anchor selection ignores `__DELETED_BY_BATCH_ID` entirely. + // + // Strict-sequence-uniqueness is not a public guarantee of AutoCDC, but this test pins + // the *current* behavior so the contract change is intentional rather than accidental. 
+ val currentBatchId = 100L + val aux = auxTableOf(singleKeyUserSchema)( + Row(1, "anchor", 5L, null, 5L, currentBatchId) + ) + val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = currentBatchId + ) + + checkAnswer( + df = result, + expectedAnswer = Seq(Row(1, "anchor", 5L, null, Row(5L))) + ) + } + + test("findAffectedRowsFromAuxiliaryTable narrows the CDC metadata column from the aux " + + "schema to the target schema (drops __DELETED_BY_BATCH_ID)") { + val processor = processorWithKeys(Seq("id")) + + // Pre-condition: aux's `_cdc_metadata` carries __RECORD_START_AT and __DELETED_BY_BATCH_ID. + // The find function must strip the aux-only field so the result is union-compatible + // with target-table rows and preprocessed-microbatch rows downstream. + val aux = auxTableOf(singleKeyUserSchema)(Row(1, "v", 5L, null, 5L, null)) + val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = 100L + ) + + val cdcMetadataField = result.schema(AutoCdcReservedNames.cdcMetadataColName) + assert(cdcMetadataField.dataType == Scd2BatchProcessor.targetCdcMetadataColSchema(LongType)) + } + + test("findAffectedRowsFromAuxiliaryTable resolves key columns containing a literal dot") { + // Backticks tell the SQL parser that "a.b" is a single identifier literally named + // "a.b" (rather than struct-field access). The schema field, the join key (via + // `keysRaw`), and the F.col reference (via `keysQuoted`) must all consistently + // resolve to the literal "a.b" column for the find-* path to work end-to-end. 
+ // + // This is the regression guard for the `keysRaw` vs `keysQuoted` split: passing + // backtick-quoted names to `.join(_, usingColumns)` matches schema fields literally + // and would fail to find a column named `a.b`, while passing raw names to F.col + // would parse the dot as struct-field access and also fail. + val processor = processorWithKeys(Seq("`a.b`")) + val userSchema = new StructType() + .add("a.b", IntegerType) + .add("value", StringType) + val keySchema = new StructType().add("a.b", IntegerType) + + val aux = auxTableOf(userSchema)(Row(1, "v", 5L, null, 5L, null)) + val minSeq = minSeqOf(keySchema)(Row(1, 10L)) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = 100L + ) + + // The lone aux row is the anchor (recordStartAt=5 < minSeq=10, no other candidates). + checkAnswer( + df = result, + expectedAnswer = Seq(Row(1, "v", 5L, null, Row(5L))) + ) + } + + test("findAffectedRowsFromAuxiliaryTable respects all key columns when computing " + + "per-key anchors with a composite key") { + val userSchema = new StructType() + .add("region", StringType) + .add("customer_id", IntegerType) + .add("name", StringType) + val keySchema = new StructType() + .add("region", StringType) + .add("customer_id", IntegerType) + + val processor = processorWithKeys(Seq("region", "customer_id")) + + // Three composite keys: (US, 1), (EU, 1), (US, 2). Each is independent. + // (US, 1): anchor at 3; row at 10 included via >=. + // (EU, 1): anchor at 4; no rows at or after 12 -> only the anchor. + // (US, 2): no aux rows -> contributes nothing. 
+ val aux = auxTableOf(userSchema)( + Row("US", 1, "us1.3", 3L, null, 3L, null), + Row("US", 1, "us1.10", 10L, null, 10L, null), + Row("EU", 1, "eu1.4", 4L, null, 4L, null) + ) + val minSeq = minSeqOf(keySchema)( + Row("US", 1, 10L), + Row("EU", 1, 12L), + Row("US", 2, 100L) + ) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = 100L + ) + + checkAnswer( + df = result, + expectedAnswer = Seq( + Row("US", 1, "us1.3", 3L, null, Row(3L)), + Row("US", 1, "us1.10", 10L, null, Row(10L)), + Row("EU", 1, "eu1.4", 4L, null, Row(4L)) + ) + ) + } + + test("findAffectedRowsFromAuxiliaryTable returns an empty result when the aux table is empty") { + val processor = processorWithKeys(Seq("id")) + + val aux = auxTableOf(singleKeyUserSchema)() + val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = 100L + ) + + assert(result.collect().isEmpty) + } + + test("findAffectedRowsFromAuxiliaryTable returns no rows for a microbatch key that has " + + "no rows in the aux table") { + val processor = processorWithKeys(Seq("id")) + + // Aux only has rows for key=1. Microbatch only sees key=2. + val aux = auxTableOf(singleKeyUserSchema)(Row(1, "v", 5L, null, 5L, null)) + val minSeq = minSeqOf(singleKeyKeySchema)(Row(2, 10L)) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = 100L + ) + + assert(result.collect().isEmpty) + } + + test("findAffectedRowsFromAuxiliaryTable excludes aux rows for keys not in the microbatch") { + val processor = processorWithKeys(Seq("id")) + + // Aux has rows for keys 1 and 2. Microbatch only mentions key=1, so key=2's aux rows + // must be dropped (the inner join with minSeq strips them). 
+ val aux = auxTableOf(singleKeyUserSchema)( + Row(1, "v1", 5L, null, 5L, null), + Row(2, "v2", 7L, null, 7L, null) + ) + val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) + + val result = processor.findAffectedRowsFromAuxiliaryTable( + rawAuxiliaryTableDf = aux, + minimumSequencePerKeyInMicrobatchDf = minSeq, + batchId = 100L + ) + + checkAnswer( + df = result, + expectedAnswer = Seq(Row(1, "v1", 5L, null, Row(5L))) + ) + } + + // =============== findAffectedRowsFromTargetTable tests =============== + + test("findAffectedRowsFromTargetTable includes the currently active row and any closed " + + "row whose __END_AT is at or after minSeq; older closed rows are excluded") { + val processor = processorWithKeys(Seq("id")) + + // Single key with four target rows. Schema for input: (id, value, __START_AT, __END_AT, + // _cdc_metadata{__RECORD_START_AT}). + // - closed at endAt=5 -> < minSeq=10 -> excluded + // - closed at endAt=10 -> = minSeq=10 -> included (>=) + // - closed at endAt=15 -> > minSeq=10 -> included + // - active (endAt=null) -> always included + val target = targetTableOf(singleKeyUserSchema)( + Row(1, "old", 1L, 5L, Row(1L)), + Row(1, "edge", 5L, 10L, Row(5L)), + Row(1, "recent", 10L, 15L, Row(10L)), + Row(1, "active", 15L, null, Row(15L)) + ) + val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) + + val result = processor.findAffectedRowsFromTargetTable( + targetTableDf = target, + minimumSequencePerKeyInMicrobatchDf = minSeq + ) + + checkAnswer( + df = result, + expectedAnswer = Seq( + Row(1, "edge", 5L, 10L, Row(5L)), + Row(1, "recent", 10L, 15L, Row(10L)), + Row(1, "active", 15L, null, Row(15L)) + ) + ) + } + + test("findAffectedRowsFromTargetTable computes inclusion independently per key") { + val processor = processorWithKeys(Seq("id")) + + // Two keys with overlapping endAt ranges but different per-key minSeqs. Each key is + // reconciled independently against its own minSeq. 
+ val target = targetTableOf(singleKeyUserSchema)( + // Key 1: minSeq=10. "active" (null) and "recent" (15) are at/after 10. + Row(1, "k1.old", 1L, 5L, Row(1L)), + Row(1, "k1.recent", 5L, 15L, Row(5L)), + Row(1, "k1.active", 15L, null, Row(15L)), + // Key 2: minSeq=20. Only "active" (null) is at/after 20. + Row(2, "k2.old", 1L, 10L, Row(1L)), + Row(2, "k2.recent", 10L, 18L, Row(10L)), + Row(2, "k2.active", 18L, null, Row(18L)) + ) + val minSeq = minSeqOf(singleKeyKeySchema)( + Row(1, 10L), + Row(2, 20L) + ) + + val result = processor.findAffectedRowsFromTargetTable( + targetTableDf = target, + minimumSequencePerKeyInMicrobatchDf = minSeq + ) + + checkAnswer( + df = result, + expectedAnswer = Seq( + Row(1, "k1.recent", 5L, 15L, Row(5L)), + Row(1, "k1.active", 15L, null, Row(15L)), + Row(2, "k2.active", 18L, null, Row(18L)) + ) + ) + } + + test("findAffectedRowsFromTargetTable respects all key columns with a composite key") { + val userSchema = new StructType() + .add("region", StringType) + .add("customer_id", IntegerType) + .add("name", StringType) + val keySchema = new StructType() + .add("region", StringType) + .add("customer_id", IntegerType) + + val processor = processorWithKeys(Seq("region", "customer_id")) + + // (US, 1) and (EU, 1) are distinct composite keys. (US, 1)'s active row is included + // for minSeq=10; (EU, 1)'s active row is included for minSeq=12; (EU, 1)'s old closed + // row at endAt=5 is excluded (5 < 12). (US, 2) has no target rows. 
+ val target = targetTableOf(userSchema)( + Row("US", 1, "us1", 1L, null, Row(1L)), + Row("EU", 1, "eu1.old", 1L, 5L, Row(1L)), + Row("EU", 1, "eu1", 5L, null, Row(5L)) + ) + val minSeq = minSeqOf(keySchema)( + Row("US", 1, 10L), + Row("EU", 1, 12L), + Row("US", 2, 100L) + ) + + val result = processor.findAffectedRowsFromTargetTable( + targetTableDf = target, + minimumSequencePerKeyInMicrobatchDf = minSeq + ) + + checkAnswer( + df = result, + expectedAnswer = Seq( + Row("US", 1, "us1", 1L, null, Row(1L)), + Row("EU", 1, "eu1", 5L, null, Row(5L)) + ) + ) + } + + test("findAffectedRowsFromTargetTable returns an empty result when the target table is empty") { + val processor = processorWithKeys(Seq("id")) + + val target = targetTableOf(singleKeyUserSchema)() + val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) + + val result = processor.findAffectedRowsFromTargetTable( + targetTableDf = target, + minimumSequencePerKeyInMicrobatchDf = minSeq + ) + + assert(result.collect().isEmpty) + } + + test("findAffectedRowsFromTargetTable returns no rows for a microbatch key that has " + + "no rows in the target table") { + val processor = processorWithKeys(Seq("id")) + + // Target only has rows for key=1. Microbatch only sees key=2. 
+ val target = targetTableOf(singleKeyUserSchema)(Row(1, "v", 1L, null, Row(1L))) + val minSeq = minSeqOf(singleKeyKeySchema)(Row(2, 10L)) + + val result = processor.findAffectedRowsFromTargetTable( + targetTableDf = target, + minimumSequencePerKeyInMicrobatchDf = minSeq + ) + + assert(result.collect().isEmpty) + } } From 841bcd9afb4f4bc097cd5daae0c4c83786c66ade Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 2 Jun 2026 16:28:41 +0000 Subject: [PATCH 8/9] some test cleanup --- .../autocdc/Scd2BatchProcessorSuite.scala | 105 +++++++----------- 1 file changed, 42 insertions(+), 63 deletions(-) diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala index 5df2c0df27127..0fd2df3e923bf 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala @@ -35,34 +35,22 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { * Each input [[Row]] carries the user columns followed by: * - the row's `__START_AT` value * - the row's `__END_AT` value (null for non-tombstone rows) - * - the row's `__RECORD_START_AT` value (projected into the CDC metadata struct) - * - the row's `__DELETED_BY_BATCH_ID` value (null when the row is still live) - * - * Built via [[Scd2BatchProcessor.constructAuxCdcMetadataCol]] so the test data tracks the - * production schema 1:1. 
+ * - the row's `_cdc_metadata` struct as a [[Row]] + * (e.g., `Row(recordStartAt, deletedByBatchId)`) */ private def auxTableOf( userSchema: StructType, sequencingType: DataType = LongType )(rows: Row*): DataFrame = { - val rawRecordStartAtColName = "__raw_record_start_at" - val rawDeletedByBatchIdColName = "__raw_deleted_by_batch_id" - val rawSchema = userSchema + val schema = userSchema .add(Scd2BatchProcessor.startAtColName, sequencingType, nullable = true) .add(Scd2BatchProcessor.endAtColName, sequencingType, nullable = true) - .add(rawRecordStartAtColName, sequencingType, nullable = true) - .add(rawDeletedByBatchIdColName, LongType, nullable = true) - spark - .createDataFrame(spark.sparkContext.parallelize(rows), rawSchema) - .withColumn( + .add( AutoCdcReservedNames.cdcMetadataColName, - Scd2BatchProcessor.constructAuxCdcMetadataCol( - recordStartAt = F.col(rawRecordStartAtColName), - deletedByBatchId = F.col(rawDeletedByBatchIdColName), - sequencingType = sequencingType - ) + Scd2BatchProcessor.auxCdcMetadataColSchema(sequencingType), + nullable = false ) - .drop(rawRecordStartAtColName, rawDeletedByBatchIdColName) + spark.createDataFrame(spark.sparkContext.parallelize(rows), schema) } /** @@ -125,15 +113,14 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { resolvedSequencingType = LongType ) - /** User schema for single-key `findAffected*` tests: `id`, `value`. */ - private val singleKeyUserSchema: StructType = new StructType() - .add("id", IntegerType) - .add("value", StringType) - /** Key-only schema for single-key `findAffected*` tests' minSeq dataframes. */ private val singleKeyKeySchema: StructType = new StructType() .add("id", IntegerType) + /** User schema for single-key `findAffected*` tests: the key column plus a `value` column. 
*/ + private val singleKeyUserSchema: StructType = singleKeyKeySchema + .add("value", StringType) + // =============== preprocessMicrobatch tests =============== test("preprocessMicrobatch appends framework columns __START_AT, __END_AT, " + @@ -749,7 +736,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // Two keys to demonstrate per-key anchor isolation. // // Input row shape per `auxTableOf`: - // (id, value, __START_AT, __END_AT, __RECORD_START_AT, __DELETED_BY_BATCH_ID) + // (id, value, __START_AT, __END_AT, Row(recordStartAt, deletedByBatchId)) // // Key 1: aux rows at recordStartAt 3, 5, 10. minSeq = 10. // - 3 -> older than the anchor; dropped. @@ -758,10 +745,10 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // Key 2: only one aux row at 7, minSeq = 7. // - 7 -> at minSeq; included via >= branch. No anchor (no rows < 7 for this key). val aux = auxTableOf(singleKeyUserSchema)( - Row(1, "v1.3", 3L, null, 3L, null), - Row(1, "v1.5", 5L, null, 5L, null), - Row(1, "v1.10", 10L, null, 10L, null), - Row(2, "v2.7", 7L, null, 7L, null) + Row(1, "v1.3", 3L, null, Row(3L, null)), + Row(1, "v1.5", 5L, null, Row(5L, null)), + Row(1, "v1.10", 10L, null, Row(10L, null)), + Row(2, "v2.7", 7L, null, Row(7L, null)) ) val minSeq = minSeqOf(singleKeyKeySchema)( Row(1, 10L), @@ -795,14 +782,14 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val aux = auxTableOf(singleKeyUserSchema)( // Tombstone at recordStartAt = 3 (deleted at sequence 3): startAt = endAt = 3. // Older than the anchor; dropped. - Row(1, null, 3L, 3L, 3L, null), + Row(1, null, 3L, 3L, Row(3L, null)), // No-op upsert continuation at recordStartAt = 7: startAt inherits its run head's // recordStartAt, endAt is null. Anchor for minSeq=10 (max < 10). - Row(1, "alice", 5L, null, 7L, null), + Row(1, "alice", 5L, null, Row(7L, null)), // Tombstone at recordStartAt = 12: at-or-after minSeq, included via >= branch. 
- Row(1, null, 12L, 12L, 12L, null), + Row(1, null, 12L, 12L, Row(12L, null)), // No-op upsert continuation at recordStartAt = 15: included via >= branch. - Row(1, "bob", 13L, null, 15L, null) + Row(1, "bob", 13L, null, Row(15L, null)) ) val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) @@ -839,9 +826,9 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // the previously-no-op continuations become honest history with non-trivial // [startAt, endAt] boundaries. val aux = auxTableOf(singleKeyUserSchema)( - Row(1, "alice", 2L, null, 3L, null), // older continuation; dropped (< anchor) - Row(1, "alice", 2L, null, 8L, null), // anchor: max recordStartAt strictly < 10 - Row(1, "alice", 2L, null, 12L, null) // included via >= branch (>= minSeq=10) + Row(1, "alice", 2L, null, Row(3L, null)), // older continuation; dropped (< anchor) + Row(1, "alice", 2L, null, Row(8L, null)), // anchor: max recordStartAt strictly < 10 + Row(1, "alice", 2L, null, Row(12L, null)) // included via >= branch (>= minSeq=10) ) val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) @@ -878,10 +865,10 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // starts a fresh run, not a continuation), while the right-side tombstones bound the // visible interval of any new upsert run against subsequent deletes. 
val aux = auxTableOf(singleKeyUserSchema)( - Row(1, null, 3L, 3L, 3L, null), - Row(1, null, 7L, 7L, 7L, null), - Row(1, null, 12L, 12L, 12L, null), - Row(1, null, 18L, 18L, 18L, null) + Row(1, null, 3L, 3L, Row(3L, null)), + Row(1, null, 7L, 7L, Row(7L, null)), + Row(1, null, 12L, 12L, Row(12L, null)), + Row(1, null, 18L, 18L, Row(18L, null)) ) val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) @@ -911,9 +898,9 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // All three rows would be eligible by recordStartAt alone (>= minSeq=10), but the // idempotency filter drops the one logically deleted by a different batch. val aux = auxTableOf(singleKeyUserSchema)( - Row(1, "live", 10L, null, 10L, null), // not deleted -> kept - Row(1, "retried", 11L, null, 11L, currentBatchId), // deleted by current -> kept - Row(1, "ignored", 12L, null, 12L, differentBatchId) // deleted by another -> dropped + Row(1, "live", 10L, null, Row(10L, null)), // not deleted -> kept + Row(1, "retried", 11L, null, Row(11L, currentBatchId)), // deleted by current -> kept + Row(1, "ignored", 12L, null, Row(12L, differentBatchId)) // deleted by another -> dropped ) val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) @@ -945,7 +932,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // the *current* behavior so the contract change is intentional rather than accidental. val currentBatchId = 100L val aux = auxTableOf(singleKeyUserSchema)( - Row(1, "anchor", 5L, null, 5L, currentBatchId) + Row(1, "anchor", 5L, null, Row(5L, currentBatchId)) ) val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) @@ -968,7 +955,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // Pre-condition: aux's `_cdc_metadata` carries __RECORD_START_AT and __DELETED_BY_BATCH_ID. // The find function must strip the aux-only field so the result is union-compatible // with target-table rows and preprocessed-microbatch rows downstream. 
- val aux = auxTableOf(singleKeyUserSchema)(Row(1, "v", 5L, null, 5L, null)) + val aux = auxTableOf(singleKeyUserSchema)(Row(1, "v", 5L, null, Row(5L, null))) val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) val result = processor.findAffectedRowsFromAuxiliaryTable( @@ -992,12 +979,10 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // and would fail to find a column named `a.b`, while passing raw names to F.col // would parse the dot as struct-field access and also fail. val processor = processorWithKeys(Seq("`a.b`")) - val userSchema = new StructType() - .add("a.b", IntegerType) - .add("value", StringType) val keySchema = new StructType().add("a.b", IntegerType) + val userSchema = keySchema.add("value", StringType) - val aux = auxTableOf(userSchema)(Row(1, "v", 5L, null, 5L, null)) + val aux = auxTableOf(userSchema)(Row(1, "v", 5L, null, Row(5L, null))) val minSeq = minSeqOf(keySchema)(Row(1, 10L)) val result = processor.findAffectedRowsFromAuxiliaryTable( @@ -1015,13 +1000,10 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { test("findAffectedRowsFromAuxiliaryTable respects all key columns when computing " + "per-key anchors with a composite key") { - val userSchema = new StructType() - .add("region", StringType) - .add("customer_id", IntegerType) - .add("name", StringType) val keySchema = new StructType() .add("region", StringType) .add("customer_id", IntegerType) + val userSchema = keySchema.add("name", StringType) val processor = processorWithKeys(Seq("region", "customer_id")) @@ -1030,9 +1012,9 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // (EU, 1): anchor at 4; no rows at or after 12 -> only the anchor. // (US, 2): no aux rows -> contributes nothing. 
val aux = auxTableOf(userSchema)( - Row("US", 1, "us1.3", 3L, null, 3L, null), - Row("US", 1, "us1.10", 10L, null, 10L, null), - Row("EU", 1, "eu1.4", 4L, null, 4L, null) + Row("US", 1, "us1.3", 3L, null, Row(3L, null)), + Row("US", 1, "us1.10", 10L, null, Row(10L, null)), + Row("EU", 1, "eu1.4", 4L, null, Row(4L, null)) ) val minSeq = minSeqOf(keySchema)( Row("US", 1, 10L), @@ -1076,7 +1058,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val processor = processorWithKeys(Seq("id")) // Aux only has rows for key=1. Microbatch only sees key=2. - val aux = auxTableOf(singleKeyUserSchema)(Row(1, "v", 5L, null, 5L, null)) + val aux = auxTableOf(singleKeyUserSchema)(Row(1, "v", 5L, null, Row(5L, null))) val minSeq = minSeqOf(singleKeyKeySchema)(Row(2, 10L)) val result = processor.findAffectedRowsFromAuxiliaryTable( @@ -1094,8 +1076,8 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // Aux has rows for keys 1 and 2. Microbatch only mentions key=1, so key=2's aux rows // must be dropped (the inner join with minSeq strips them). 
val aux = auxTableOf(singleKeyUserSchema)( - Row(1, "v1", 5L, null, 5L, null), - Row(2, "v2", 7L, null, 7L, null) + Row(1, "v1", 5L, null, Row(5L, null)), + Row(2, "v2", 7L, null, Row(7L, null)) ) val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) @@ -1182,13 +1164,10 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { } test("findAffectedRowsFromTargetTable respects all key columns with a composite key") { - val userSchema = new StructType() - .add("region", StringType) - .add("customer_id", IntegerType) - .add("name", StringType) val keySchema = new StructType() .add("region", StringType) .add("customer_id", IntegerType) + val userSchema = keySchema.add("name", StringType) val processor = processorWithKeys(Seq("region", "customer_id")) From 53a3c5daceaeff4358327b6c47e92ec2c9ae9440 Mon Sep 17 00:00:00 2001 From: Anish Mahto Date: Tue, 2 Jun 2026 17:49:51 +0000 Subject: [PATCH 9/9] cleanup tests --- .../autocdc/Scd2BatchProcessor.scala | 100 ++++----- .../autocdc/Scd2BatchProcessorSuite.scala | 196 ++++++------------ 2 files changed, 119 insertions(+), 177 deletions(-) diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala index 4d4e32eb24d46..7f4c3e0174945 100644 --- a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala +++ b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessor.scala @@ -69,10 +69,10 @@ case class Scd2BatchProcessor( * merged - with the following schema, in column order: * 1. The user columns of `microbatchDf` that survive [[ChangeArgs.columnSelection]], in * the order they appeared in the input. - * 2. `__START_AT` column, populated with the sequence value of the row. - * 3. `__END_AT` column, populated with the sequence value of the row IFF it's a delete + * 2. 
[[startAtColName]], populated with the sequence value of the row. + * 3. [[endAtColName]], populated with the sequence value of the row IFF it's a delete * event, null otherwise. - * 4. `__spark_autocdc_metadata` column, conforming to [[targetCdcMetadataColSchema]]. + * 4. [[cdcMetadataColName]], conforming to [[targetCdcMetadataColSchema]]. */ private[autocdc] def preprocessMicrobatch(microbatchDf: DataFrame): DataFrame = { microbatchDf @@ -166,16 +166,16 @@ case class Scd2BatchProcessor( } /** - * For each key in the preprocessed microbatch, compute the earliest `__RECORD_START_AT` + * For each key in the preprocessed microbatch, compute the earliest [[recordStartAtFieldName]] * across the key's events. * * @param preprocessedBatchDf * a validated and preprocessed microbatch as produced by [[preprocessMicrobatch]] - in - * particular, non-null key columns and a non-null `__RECORD_START_AT` on every row. + * particular, non-null key columns and a non-null [[recordStartAtFieldName]] on every row. * @return * a dataframe containing one row per distinct key. Schema, in column order: * 1. The key columns ([[ChangeArgs.keys]]), in their declared order. - * 2. `__spark_autocdc_min_sequencing` column, carrying the min `__RECORD_START_AT` + * 2. [[minSequenceColName]], carrying the min [[recordStartAtFieldName]] * across all records within the microbatch for that key. 
*/ private[autocdc] def computeMinimumSequencePerKey(preprocessedBatchDf: DataFrame): DataFrame = { @@ -183,7 +183,7 @@ case class Scd2BatchProcessor( Scd2BatchProcessor.recordStartAtOf(F.col(AutoCdcReservedNames.cdcMetadataColName)) preprocessedBatchDf .groupBy(keysQuoted.map(F.col): _*) - .agg(F.min(recordStartAt).alias(Scd2BatchProcessor.minSequencingColName)) + .agg(F.min(recordStartAt).alias(Scd2BatchProcessor.minSequenceColName)) } /** @@ -191,8 +191,8 @@ case class Scd2BatchProcessor( * * @param rawAuxiliaryTableDf * the auxiliary table in its native schema, whose CDC metadata column carries an extra - * `__DELETED_BY_BATCH_ID` field on top of the target/microbatch schema. - * @param minimumSequencePerKeyInMicrobatchDf + * [[deletedByBatchIdFieldName]] on top of the target/microbatch schema. + * @param perKeyMinimumSequenceInMicrobatch * one row per distinct key as produced by [[computeMinimumSequencePerKey]], representing * the minimum sequence for that key in the microbatch. * @param batchId @@ -204,7 +204,7 @@ case class Scd2BatchProcessor( */ private[autocdc] def findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf: DataFrame, - minimumSequencePerKeyInMicrobatchDf: DataFrame, + perKeyMinimumSequenceInMicrobatch: DataFrame, batchId: Long ): DataFrame = { val auxTableRecordStartAtField = Scd2BatchProcessor.recordStartAtOf( @@ -223,7 +223,7 @@ case class Scd2BatchProcessor( auxTableDeletedByBatchIdField === F.lit(batchId) ) // The aux table's CDC metadata column is a superset of the target's: it carries the - // additional `__DELETED_BY_BATCH_ID` field. Since we eventually union aux rows with + // additional [[deletedByBatchIdFieldName]]. Since we eventually union aux rows with // target and microbatch rows (which use the target's narrower CDC metadata schema), strip // the aux-only subfields here so all three sources share an identical CDC metadata column // schema, and replace the existing CDC metadata column with it. 
@@ -235,54 +235,62 @@ case class Scd2BatchProcessor( ) ) - val minSequencePerKeyInMicrobatchCol = F.col(Scd2BatchProcessor.minSequencingColName) + val perKeyMinimumSequenceInMicrobatchCol = F.col(Scd2BatchProcessor.minSequenceColName) // Per key, identify the sequence value associated with the anchor row in the aux table. // - // The anchor row is the aux row with the largest `recordStartAt` strictly less than the - // min sequence in the incoming microbatch for that key. It is the immediate "left - // context" the reconciler needs to correctly attribute incoming events to a run: without - // the anchor, a no-op upsert of an existing run would look like a brand-new run head. + // The anchor row is the aux row with the largest [[recordStartAtFieldName]] strictly less + // than the min sequence in the incoming microbatch for that key. The reconciler needs this + // "left context" in two cases: + // (1) Incoming no-op upsert: without the anchor, it would look like a new run head, when in + // reality it's a part of an existing no-op run/head. + // (2) Incoming state-changing upsert that bisects two aux no-ops: the anchor surfaces + // the before-half so both halves can be promoted to target. (The after-half is + // picked up by the >= minSeq branch.) + // + // Because no-op upserts are stored only in the aux table, the anchor concept only exists when + // pulling in rows from the aux table, and is not relevant for the target table. // // Keys with no aux row strictly before the min sequence have no anchor; their affected set // reduces to "all aux rows at or after the min sequence." // - // The shape of this DataFrame is: [key1, key2, ... keyN, anchorRowSequenceForCompositeKey] - val anchorOrderingPerKey: DataFrame = reducedAuxiliaryTableDf - // The number of rows in [[minimumSequencePerKeyInMicrobatchDf]] is bounded by the + // The shape of this DataFrame is: [key1, key2, ... 
keyN, anchorSequence] + val perKeyAnchorSequence: DataFrame = reducedAuxiliaryTableDf + // The number of rows in [[perKeyMinimumSequenceInMicrobatch]] is bounded by the // number of unique keys in the microbatch, which should typically be small. The // auxiliary table should generally also be small, containing only no-op upsert runs // and tombstones per key. Therefore this join should be cheap, and broadcast joinable. - .join(minimumSequencePerKeyInMicrobatchDf, keysRaw) - .filter(auxTableRecordStartAtField < minSequencePerKeyInMicrobatchCol) + .join(perKeyMinimumSequenceInMicrobatch, keysRaw) + .filter(auxTableRecordStartAtField < perKeyMinimumSequenceInMicrobatchCol) .groupBy(keysQuoted.map(F.col): _*) .agg( - F.max(auxTableRecordStartAtField).as(Scd2BatchProcessor.anchorOrderingColName) + F.max(auxTableRecordStartAtField).as(Scd2BatchProcessor.anchorSequenceColName) ) - val anchorOrderingCol = F.col(Scd2BatchProcessor.anchorOrderingColName) + val anchorSequenceCol = F.col(Scd2BatchProcessor.anchorSequenceColName) // Now that we have the minimum sequence in the microbatch and the sequence of the anchor row, // we have enough information to compute the full set of auxiliary rows that affect or are // affected by the microbatch. val auxRowIsAfterMinSequenceInMicrobatch = - auxTableRecordStartAtField >= minSequencePerKeyInMicrobatchCol + auxTableRecordStartAtField >= perKeyMinimumSequenceInMicrobatchCol - val auxRowIsAnchorRow = auxTableRecordStartAtField === anchorOrderingCol + val auxRowIsAnchorRow = auxTableRecordStartAtField === anchorSequenceCol val auxRowAffectsMicrobatch = auxRowIsAfterMinSequenceInMicrobatch || auxRowIsAnchorRow val affectedRowsFromAuxiliaryTable = reducedAuxiliaryTableDf - // Per row, join/project the minimum microbatch sequence and anchor sequencing for - // that row's key set. - .join(minimumSequencePerKeyInMicrobatchDf, keysRaw) + // Per row, join/project the minimum microbatch sequence and anchor sequence for that row's + // key set. 
This join is relatively cheap, because the size of the dataframes being joined is + // bound by the number of unique keys in the microbatch. + .join(perKeyMinimumSequenceInMicrobatch, keysRaw) .join( - anchorOrderingPerKey, + perKeyAnchorSequence, keysRaw, joinType = "left" ) // Using the joined information, determine if the row is affected by the microbatch. .filter(auxRowAffectsMicrobatch) - .drop(minSequencePerKeyInMicrobatchCol, anchorOrderingCol) + .drop(perKeyMinimumSequenceInMicrobatchCol, anchorSequenceCol) affectedRowsFromAuxiliaryTable } @@ -292,7 +300,7 @@ case class Scd2BatchProcessor( * * @param targetTableDf * the target table in its native schema. - * @param minimumSequencePerKeyInMicrobatchDf + * @param perKeyMinimumSequenceInMicrobatch * one row per distinct key as produced by [[computeMinimumSequencePerKey]], representing * the minimum sequence for that key in the microbatch. * @return @@ -300,29 +308,29 @@ case class Scd2BatchProcessor( */ private[autocdc] def findAffectedRowsFromTargetTable( targetTableDf: DataFrame, - minimumSequencePerKeyInMicrobatchDf: DataFrame + perKeyMinimumSequenceInMicrobatch: DataFrame ): DataFrame = { val targetEndAtCol = F.col(Scd2BatchProcessor.endAtColName) - val minSequencePerKeyInMicrobatchCol = F.col(Scd2BatchProcessor.minSequencingColName) + val perKeyMinimumSequenceInMicrobatchCol = F.col(Scd2BatchProcessor.minSequenceColName) // Per key, identify all the rows in the target table that may be affected by the // incoming microbatch. // // Unlike the auxiliary table, the target table holds visible rows only: no hidden open // no-op upsert rows, no tombstones. Visible rows for a given key form a non-overlapping - // interval partition over the sequencing axis, and at most one row has a null __END_AT + // interval partition over the sequencing axis, and at most one row has a null [[endAtColName]] // (the currently active row per key). 
// // Hence we can simply grab all rows that were active at some point after the min sequencing - // per key, which can be determined entirely by the row's `__END_AT`. + // per key, which can be determined entirely by the row's [[endAtColName]]. val isCurrentlyActiveRow = targetEndAtCol.isNull - val rowEndsAfterMinimumSequence = targetEndAtCol >= minSequencePerKeyInMicrobatchCol + val rowEndsAfterMinimumSequence = targetEndAtCol >= perKeyMinimumSequenceInMicrobatchCol val rowMayBeAffected = isCurrentlyActiveRow || rowEndsAfterMinimumSequence val affectedRowsFromTargetTable = targetTableDf - .join(minimumSequencePerKeyInMicrobatchDf, keysRaw) + .join(perKeyMinimumSequenceInMicrobatch, keysRaw) .filter(rowMayBeAffected) - .drop(minSequencePerKeyInMicrobatchCol) + .drop(perKeyMinimumSequenceInMicrobatchCol) affectedRowsFromTargetTable } @@ -478,8 +486,7 @@ object Scd2BatchProcessor { * Name of temporary column projected onto microbatch to compute the min sequencing value per * key within the microbatch. */ - private[autocdc] val minSequencingColName: String = - s"${AutoCdcReservedNames.prefix}min_sequencing" + private[autocdc] val minSequenceColName: String = s"${AutoCdcReservedNames.prefix}min_sequence" /** * Name of temporary column projected used to identify the sequence associated with the anchor @@ -487,21 +494,18 @@ object Scd2BatchProcessor { * amongst all rows for a key (or risk undefined behavior), this sequence value uniquely * identifies an exact row in the aux. */ - private val anchorOrderingColName: String = s"${AutoCdcReservedNames.prefix}anchor_ordering" + private val anchorSequenceColName: String = s"${AutoCdcReservedNames.prefix}anchor_sequence" - /** Project the `__RECORD_START_AT` field out of an SCD2 CDC metadata column. */ + /** Project the [[recordStartAtFieldName]] out of an SCD2 CDC metadata column. 
*/ private def recordStartAtOf(cdcMetadataCol: Column): Column = cdcMetadataCol.getField(recordStartAtFieldName) - /** Project the `__DELETED_BY_BATCH_ID` field out of an SCD2 CDC metadata column. */ + /** Project the [[deletedByBatchIdFieldName]] out of an SCD2 CDC metadata column. */ private def deletedByBatchIdOf(cdcMetadataCol: Column): Column = cdcMetadataCol.getField(deletedByBatchIdFieldName) /** * Schema of the CDC metadata struct column for SCD2 target table rows. - * - * Note that the aux table's CDC metadata column is a strict superset of this schema (it carries - * an additional `__DELETED_BY_BATCH_ID` field). */ private[pipelines] def targetCdcMetadataColSchema(sequencingType: DataType): StructType = StructType( @@ -537,8 +541,8 @@ object Scd2BatchProcessor { /** * Schema of the CDC metadata struct column for SCD2 aux-table rows. Strict superset of - * [[targetCdcMetadataColSchema]]: extends it with the aux-only `__DELETED_BY_BATCH_ID` - * field used for SCD2 idempotency. + * [[targetCdcMetadataColSchema]]: extends it with the aux-only [[deletedByBatchIdFieldName]] + * used for SCD2 idempotency. 
*/ private[pipelines] def auxCdcMetadataColSchema(sequencingType: DataType): StructType = StructType( diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala index 0fd2df3e923bf..14e77fbd190ab 100644 --- a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala +++ b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/autocdc/Scd2BatchProcessorSuite.scala @@ -86,7 +86,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { sequencingType: DataType = LongType )(rows: Row*): DataFrame = { val schema = keySchema.add( - Scd2BatchProcessor.minSequencingColName, + Scd2BatchProcessor.minSequenceColName, sequencingType, nullable = false ) @@ -607,8 +607,8 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // =============== computeMinimumSequencePerKey tests =============== - test("computeMinimumSequencePerKey returns one row per distinct key with the per-key " + - "minimum __RECORD_START_AT, aggregating across both upsert and delete events") { + test("computeMinimumSequencePerKey returns one row per distinct key and aggregates across " + + "both upsert and delete events") { val schema = new StructType() .add("id", IntegerType) .add("seq", LongType) @@ -637,7 +637,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.computeMinimumSequencePerKey(preprocessed) assert(result.schema.fieldNames.toSeq == Seq( - "id", Scd2BatchProcessor.minSequencingColName + "id", Scd2BatchProcessor.minSequenceColName )) checkAnswer( df = result, @@ -648,8 +648,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { ) } - test("computeMinimumSequencePerKey computes the minimum per composite-key tuple, " + - "not per single key column") { + test("computeMinimumSequencePerKey 
is compatible with composite keys") { val schema = new StructType() .add("region", StringType) .add("customer_id", IntegerType) @@ -672,7 +671,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.computeMinimumSequencePerKey(preprocessed) assert(result.schema.fieldNames.toSeq == Seq( - "region", "customer_id", Scd2BatchProcessor.minSequencingColName + "region", "customer_id", Scd2BatchProcessor.minSequenceColName )) checkAnswer( df = result, @@ -684,8 +683,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { ) } - test("computeMinimumSequencePerKey returns an empty result for an empty preprocessed " + - "microbatch") { + test("computeMinimumSequencePerKey returns an empty result for an empty microbatch") { val schema = new StructType() .add("id", IntegerType) .add("seq", LongType) @@ -718,7 +716,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.computeMinimumSequencePerKey(preprocessed) assert(result.schema.fieldNames.toSeq == Seq( - "a.b", Scd2BatchProcessor.minSequencingColName + "a.b", Scd2BatchProcessor.minSequenceColName )) checkAnswer( df = result, @@ -728,9 +726,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // =============== findAffectedRowsFromAuxiliaryTable tests =============== - test("findAffectedRowsFromAuxiliaryTable selects per-key anchor (max recordStartAt " + - "strictly less than minSeq), preserves rows at or after minSeq, and drops older " + - "anchor candidates") { + test("findAffectedRowsFromAuxiliaryTable returns the anchor row per key") { val processor = processorWithKeys(Seq("id")) // Two keys to demonstrate per-key anchor isolation. 
@@ -757,7 +753,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, + perKeyMinimumSequenceInMicrobatch = minSeq, batchId = 100L ) @@ -771,8 +767,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { ) } - test("findAffectedRowsFromAuxiliaryTable selects the anchor regardless of row kind " + - "(tombstone vs no-op upsert continuation) and preserves both kinds in the affected set") { + test("findAffectedRowsFromAuxiliaryTable pulls in both tombstone and no-op upsert rows") { val processor = processorWithKeys(Seq("id")) // Aux carries a mix of row kinds for one key. The find function does NOT distinguish @@ -795,7 +790,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, + perKeyMinimumSequenceInMicrobatch = minSeq, batchId = 100L ) @@ -809,147 +804,102 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { ) } - test("findAffectedRowsFromAuxiliaryTable pulls in both no-op upsert continuations " + - "when the microbatch's minSeq strictly bisects them (so reconciliation can promote " + - "them to visible target rows if the bisecting event makes them no-longer no-ops)") { + test("findAffectedRowsFromAuxiliaryTable pulls in both consecutive no-op upsert events " + + "being interleaved by incoming microbatch row") { val processor = processorWithKeys(Seq("id")) - // Three no-op upsert continuations of a single run for key=1 (run head was at - // sequence 2 - that head lives in the target table, not the aux). Each continuation - // has startAt=2 (the head's recordStartAt) and endAt=null, and they all carry the - // same user data ("alice"), which is what made them no-ops when observed. 
- // - // The incoming microbatch's minSeq for key=1 is 10, which strictly bisects the - // continuations at recordStartAt=8 and recordStartAt=12. Both must surface in the - // affected set so a later reconciliation stage can decide whether to promote them - // to visible target rows - e.g., if the bisecting event carries different user data, - // the previously-no-op continuations become honest history with non-trivial - // [startAt, endAt] boundaries. val aux = auxTableOf(singleKeyUserSchema)( - Row(1, "alice", 2L, null, Row(3L, null)), // older continuation; dropped (< anchor) - Row(1, "alice", 2L, null, Row(8L, null)), // anchor: max recordStartAt strictly < 10 - Row(1, "alice", 2L, null, Row(12L, null)) // included via >= branch (>= minSeq=10) + Row(1, "alice", 2L, null, Row(8L, null)), + Row(1, "alice", 2L, null, Row(12L, null)) ) val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) val result = processor.findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, + perKeyMinimumSequenceInMicrobatch = minSeq, batchId = 100L ) checkAnswer( df = result, expectedAnswer = Seq( - Row(1, "alice", 2L, null, Row(8L)), // anchor - left context for the run - Row(1, "alice", 2L, null, Row(12L)) // bisected by microbatch event at seq 10 + // Row with record start at of 8 gets pulled in as an anchor, + Row(1, "alice", 2L, null, Row(8L)), + // Row with record start at of 12 gets pulled in as a regular affected row. + Row(1, "alice", 2L, null, Row(12L)) ) ) } - test("findAffectedRowsFromAuxiliaryTable pulls in tombstones on both sides of minSeq, " + - "including a tombstone-as-anchor and multiple tombstones at-or-after minSeq") { + test("findAffectedRowsFromAuxiliaryTable selects tombstones as anchor if applicable") { val processor = processorWithKeys(Seq("id")) - // Four tombstones for key=1 at recordStartAt = 3, 7, 12, 18 (delete events). - // Tombstones obey: startAt = endAt = recordStartAt and carry no user data. 
- // - // The microbatch's minSeq for key=1 is 10. Expected: - // - Tombstone at 3: older than the anchor; dropped. - // - Tombstone at 7: anchor (max recordStartAt strictly < 10); included. - // - Tombstone at 12: at-or-after minSeq; included via the >= branch. - // - Tombstone at 18: at-or-after minSeq; included via the >= branch. - // - // Why both sides matter for downstream reconciliation: the left-side tombstone tells - // reconciliation the prior run was already closed by a delete (so an incoming upsert - // starts a fresh run, not a continuation), while the right-side tombstones bound the - // visible interval of any new upsert run against subsequent deletes. + // Tombstone-as-anchor is incidental: the find function selects the anchor purely on + // `max recordStartAt < minSeq`, so a tombstone qualifies just like any other row kind. + // Downstream reconciliation does not actually rely on the anchor when it is a + // tombstone (a delete already closed the prior run, so any subsequent incoming event + // is necessarily a fresh run head regardless of whether the anchor is surfaced). We + // still pull it in as a harmless side effect of the range filter, and this behavior is + // documented via test. val aux = auxTableOf(singleKeyUserSchema)( - Row(1, null, 3L, 3L, Row(3L, null)), Row(1, null, 7L, 7L, Row(7L, null)), - Row(1, null, 12L, 12L, Row(12L, null)), - Row(1, null, 18L, 18L, Row(18L, null)) + Row(1, null, 12L, 12L, Row(12L, null)) ) val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) val result = processor.findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, + perKeyMinimumSequenceInMicrobatch = minSeq, batchId = 100L ) checkAnswer( df = result, expectedAnswer = Seq( - Row(1, null, 7L, 7L, Row(7L)), // anchor - last delete before minSeq - Row(1, null, 12L, 12L, Row(12L)), // delete at seq 12 - Row(1, null, 18L, 18L, Row(18L)) // delete at seq 18 + // Pulled in as anchor. 
+ Row(1, null, 7L, 7L, Row(7L)), + // Pulled in as regular affected row. + Row(1, null, 12L, 12L, Row(12L)) ) ) } - test("findAffectedRowsFromAuxiliaryTable filters logically-deleted aux rows by " + - "__DELETED_BY_BATCH_ID: keeps null and current-batch, drops different-batch") { + test("findAffectedRowsFromAuxiliaryTable filters logically-deleted aux rows") { val processor = processorWithKeys(Seq("id")) val currentBatchId = 100L val differentBatchId = 99L - // All three rows would be eligible by recordStartAt alone (>= minSeq=10), but the - // idempotency filter drops the one logically deleted by a different batch. + // The idempotency filter retains rows deleted by `currentBatchId` (so a mid-flight + // retry sees its own prior writes) and drops rows deleted by any other batch. This + // applies uniformly to both the anchor and non-anchor affected rows. val aux = auxTableOf(singleKeyUserSchema)( - Row(1, "live", 10L, null, Row(10L, null)), // not deleted -> kept - Row(1, "retried", 11L, null, Row(11L, currentBatchId)), // deleted by current -> kept - Row(1, "ignored", 12L, null, Row(12L, differentBatchId)) // deleted by another -> dropped + // Anchor candidate (recordStartAt < minSeq): + Row(1, "anchor", 5L, null, Row(5L, currentBatchId)), // deleted by current -> kept + // At-or-after minSeq: + Row(1, "live", 10L, null, Row(10L, null)), // not deleted -> kept + Row(1, "retried", 11L, null, Row(11L, currentBatchId)), // deleted by current -> kept + Row(1, "ignored", 12L, null, Row(12L, differentBatchId)) // deleted by another -> dropped ) val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) val result = processor.findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, + perKeyMinimumSequenceInMicrobatch = minSeq, batchId = currentBatchId ) checkAnswer( df = result, expectedAnswer = Seq( + Row(1, "anchor", 5L, null, Row(5L)), Row(1, "live", 10L, null, Row(10L)), Row(1, "retried", 11L, null, Row(11L)) ) ) } - 
test("findAffectedRowsFromAuxiliaryTable still selects the anchor when that anchor row " + - "was logically deleted by the current batch (idempotent retry)") { - val processor = processorWithKeys(Seq("id")) - - // Models an idempotent retry: a previous attempt of this same batch already logically - // deleted the anchor row in the aux table, but mid-retry we still need to treat it as - // the left context. The idempotency filter retains rows deleted by `currentBatchId`, - // and anchor selection ignores `__DELETED_BY_BATCH_ID` entirely. - // - // Strict-sequence-uniqueness is not a public guarantee of AutoCDC, but this test pins - // the *current* behavior so the contract change is intentional rather than accidental. - val currentBatchId = 100L - val aux = auxTableOf(singleKeyUserSchema)( - Row(1, "anchor", 5L, null, Row(5L, currentBatchId)) - ) - val minSeq = minSeqOf(singleKeyKeySchema)(Row(1, 10L)) - - val result = processor.findAffectedRowsFromAuxiliaryTable( - rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, - batchId = currentBatchId - ) - - checkAnswer( - df = result, - expectedAnswer = Seq(Row(1, "anchor", 5L, null, Row(5L))) - ) - } - - test("findAffectedRowsFromAuxiliaryTable narrows the CDC metadata column from the aux " + - "schema to the target schema (drops __DELETED_BY_BATCH_ID)") { + test("findAffectedRowsFromAuxiliaryTable narrows CDC metadata column to match target's") { val processor = processorWithKeys(Seq("id")) // Pre-condition: aux's `_cdc_metadata` carries __RECORD_START_AT and __DELETED_BY_BATCH_ID. 
@@ -960,7 +910,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, + perKeyMinimumSequenceInMicrobatch = minSeq, batchId = 100L ) @@ -969,15 +919,6 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { } test("findAffectedRowsFromAuxiliaryTable resolves key columns containing a literal dot") { - // Backticks tell the SQL parser that "a.b" is a single identifier literally named - // "a.b" (rather than struct-field access). The schema field, the join key (via - // `keysRaw`), and the F.col reference (via `keysQuoted`) must all consistently - // resolve to the literal "a.b" column for the find-* path to work end-to-end. - // - // This is the regression guard for the `keysRaw` vs `keysQuoted` split: passing - // backtick-quoted names to `.join(_, usingColumns)` matches schema fields literally - // and would fail to find a column named `a.b`, while passing raw names to F.col - // would parse the dot as struct-field access and also fail. 
val processor = processorWithKeys(Seq("`a.b`")) val keySchema = new StructType().add("a.b", IntegerType) val userSchema = keySchema.add("value", StringType) @@ -987,7 +928,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, + perKeyMinimumSequenceInMicrobatch = minSeq, batchId = 100L ) @@ -998,8 +939,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { ) } - test("findAffectedRowsFromAuxiliaryTable respects all key columns when computing " + - "per-key anchors with a composite key") { + test("findAffectedRowsFromAuxiliaryTable respects composite keys") { val keySchema = new StructType() .add("region", StringType) .add("customer_id", IntegerType) @@ -1024,7 +964,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, + perKeyMinimumSequenceInMicrobatch = minSeq, batchId = 100L ) @@ -1046,7 +986,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, + perKeyMinimumSequenceInMicrobatch = minSeq, batchId = 100L ) @@ -1063,7 +1003,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, + perKeyMinimumSequenceInMicrobatch = minSeq, batchId = 100L ) @@ -1083,7 +1023,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromAuxiliaryTable( rawAuxiliaryTableDf = aux, - minimumSequencePerKeyInMicrobatchDf = minSeq, + perKeyMinimumSequenceInMicrobatch = minSeq, 
batchId = 100L ) @@ -1095,16 +1035,14 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { // =============== findAffectedRowsFromTargetTable tests =============== - test("findAffectedRowsFromTargetTable includes the currently active row and any closed " + - "row whose __END_AT is at or after minSeq; older closed rows are excluded") { + test("findAffectedRowsFromTargetTable includes both closed and active affected rows") { val processor = processorWithKeys(Seq("id")) - // Single key with four target rows. Schema for input: (id, value, __START_AT, __END_AT, - // _cdc_metadata{__RECORD_START_AT}). - // - closed at endAt=5 -> < minSeq=10 -> excluded - // - closed at endAt=10 -> = minSeq=10 -> included (>=) - // - closed at endAt=15 -> > minSeq=10 -> included - // - active (endAt=null) -> always included + // Single key with four target rows: + // - row closed at endAt=5 -> < minSeq=10 -> excluded + // - row closed at endAt=10 -> = minSeq=10 -> included (>=) + // - row closed at endAt=15 -> > minSeq=10 -> included + // - row active (endAt=null) -> always included val target = targetTableOf(singleKeyUserSchema)( Row(1, "old", 1L, 5L, Row(1L)), Row(1, "edge", 5L, 10L, Row(5L)), @@ -1115,7 +1053,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromTargetTable( targetTableDf = target, - minimumSequencePerKeyInMicrobatchDf = minSeq + perKeyMinimumSequenceInMicrobatch = minSeq ) checkAnswer( @@ -1150,7 +1088,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromTargetTable( targetTableDf = target, - minimumSequencePerKeyInMicrobatchDf = minSeq + perKeyMinimumSequenceInMicrobatch = minSeq ) checkAnswer( @@ -1163,7 +1101,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { ) } - test("findAffectedRowsFromTargetTable respects all key columns with a composite key") { + 
test("findAffectedRowsFromTargetTable respects composite keys") { val keySchema = new StructType() .add("region", StringType) .add("customer_id", IntegerType) @@ -1187,7 +1125,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromTargetTable( targetTableDf = target, - minimumSequencePerKeyInMicrobatchDf = minSeq + perKeyMinimumSequenceInMicrobatch = minSeq ) checkAnswer( @@ -1207,7 +1145,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromTargetTable( targetTableDf = target, - minimumSequencePerKeyInMicrobatchDf = minSeq + perKeyMinimumSequenceInMicrobatch = minSeq ) assert(result.collect().isEmpty) @@ -1223,7 +1161,7 @@ class Scd2BatchProcessorSuite extends QueryTest with SharedSparkSession { val result = processor.findAffectedRowsFromTargetTable( targetTableDf = target, - minimumSequencePerKeyInMicrobatchDf = minSeq + perKeyMinimumSequenceInMicrobatch = minSeq ) assert(result.collect().isEmpty)