From de9d01ed937dce44723d152b0405a8ffb0f7edec Mon Sep 17 00:00:00 2001
From: fishfishfishfishaa <792850842@qq.com>
Date: Sun, 28 Jun 2026 09:00:48 +0800
Subject: [PATCH] [feature] Support sink.committer-coordinator-operator.enabled
---
.../flink_connector_configuration.html | 6 +
paimon-e2e-tests/pom.xml | 1 +
.../org/apache/paimon/tests/E2eTestBase.java | 38 +-
.../tests/PaimonWriterCoordinatorE2eTest.java | 716 ++++++++++++++++
.../resources-filtered/docker-compose.yaml | 4 +-
.../src/test/resources/flink-pwc.env | 19 +
.../paimon/flink/FlinkConnectorOptions.java | 7 +
.../paimon/flink/sink/AppendTableSink.java | 17 +
.../paimon/flink/sink/CommitHandler.java | 53 ++
.../sink/CommitterCoordinatedFactory.java | 90 ++
.../flink/sink/CoordinatedCommitHandler.java | 123 +++
.../apache/paimon/flink/sink/FlinkSink.java | 15 +-
.../flink/sink/PrepareCommitOperator.java | 10 +-
.../paimon/flink/sink/RowAppendTableSink.java | 25 +
.../StatelessRowDataStoreWriteOperator.java | 6 +-
.../paimon/flink/sink/TableWriteOperator.java | 46 +-
.../sink/coordinator/CommitCompleteEvent.java | 37 +
.../flink/sink/coordinator/CommitResult.java | 53 ++
.../coordinator/CommitterCoordinator.java | 176 ++++
.../CoordinatedCommittableState.java | 203 +++++
.../CoordinatedFileInfoSender.java | 181 ++++
.../coordinator/FileInfoReceivedResponse.java | 43 +
.../sink/coordinator/FileInfoRequest.java | 155 ++++
.../coordinator/PaimonWriterCoordinator.java | 425 ++++++++++
.../sink/coordinator/PendingCheckpoint.java | 106 +++
.../sink/coordinator/PendingSubtask.java | 269 ++++++
.../sink/coordinator/SubtaskFileInfo.java | 43 +
.../CoordinatedCommittableStateTest.java | 145 ++++
.../CoordinatedFileInfoSenderTest.java | 189 +++++
.../PaimonWriterCoordinatorITCase.java | 270 ++++++
.../PaimonWriterCoordinatorTest.java | 780 ++++++++++++++++++
.../flink/utils/RuntimeContextUtils.java | 4 +
.../flink/utils/RuntimeContextUtils.java | 4 +
33 files changed, 4244 insertions(+), 15 deletions(-)
create mode 100644 paimon-e2e-tests/src/test/java/org/apache/paimon/tests/PaimonWriterCoordinatorE2eTest.java
create mode 100644 paimon-e2e-tests/src/test/resources/flink-pwc.env
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitHandler.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitterCoordinatedFactory.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CoordinatedCommitHandler.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitCompleteEvent.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitResult.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitterCoordinator.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableState.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSender.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoReceivedResponse.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoRequest.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinator.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingCheckpoint.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingSubtask.java
create mode 100644 paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/SubtaskFileInfo.java
create mode 100644 paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableStateTest.java
create mode 100644 paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSenderTest.java
create mode 100644 paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorITCase.java
create mode 100644 paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorTest.java
diff --git a/docs/generated/flink_connector_configuration.html b/docs/generated/flink_connector_configuration.html
index 0f2786b038dc..0c40f290d163 100644
--- a/docs/generated/flink_connector_configuration.html
+++ b/docs/generated/flink_connector_configuration.html
@@ -248,6 +248,12 @@
Boolean |
Indicates whether to further sort data belonged to each sink task after range partitioning. |
+
+ sink.committer-coordinator-operator.enabled |
+ false |
+ Boolean |
+ Allow coordinator replace committer operator, only support for append table now. |
+
sink.committer-cpu |
1.0 |
diff --git a/paimon-e2e-tests/pom.xml b/paimon-e2e-tests/pom.xml
index 88f50539d6d9..291e36315a2f 100644
--- a/paimon-e2e-tests/pom.xml
+++ b/paimon-e2e-tests/pom.xml
@@ -269,6 +269,7 @@ under the License.
+
diff --git a/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/E2eTestBase.java b/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/E2eTestBase.java
index 048356933d0b..b4ed247631cf 100644
--- a/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/E2eTestBase.java
+++ b/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/E2eTestBase.java
@@ -58,6 +58,7 @@ public abstract class E2eTestBase {
private final boolean withKafka;
private final boolean withHive;
private final boolean withSpark;
+ private final int taskManagerReplicas;
protected E2eTestBase() {
this(false, false);
@@ -68,9 +69,15 @@ protected E2eTestBase(boolean withKafka, boolean withHive) {
}
protected E2eTestBase(boolean withKafka, boolean withHive, boolean withSpark) {
+ this(withKafka, withHive, withSpark, 1);
+ }
+
+ protected E2eTestBase(
+ boolean withKafka, boolean withHive, boolean withSpark, int taskManagerReplicas) {
this.withKafka = withKafka;
this.withHive = withHive;
this.withSpark = withSpark;
+ this.taskManagerReplicas = taskManagerReplicas;
}
protected static final String TEST_DATA_DIR = "/test-data";
@@ -104,10 +111,13 @@ public void before() throws Exception {
.getResource("docker-compose.yaml")
.toURI()))
.withEnv("NETWORK_ID", ((Network.NetworkImpl) network).getName())
+ .withEnv("FLINK_ENV_FILE", flinkEnvFile())
.withLogConsumer("jobmanager-1", new LogConsumer(LOG))
- .withLogConsumer("taskmanager-1", new LogConsumer(LOG))
.withStartupTimeout(Duration.ofMinutes(3))
.withLocalCompose(true);
+ for (int i = 1; i <= taskManagerReplicas; i++) {
+ environment.withLogConsumer("taskmanager-" + i, new LogConsumer(LOG));
+ }
if (withKafka) {
services.add("kafka");
environment.withLogConsumer("kafka-1", new Slf4jLogConsumer(LOG));
@@ -140,11 +150,17 @@ public void before() throws Exception {
".*Master: I have been elected leader! New state: ALIVE.*", 1));
}
environment.withServices(services.toArray(new String[0])).withLocalCompose(true);
+ if (taskManagerReplicas > 1) {
+ environment.withScaledService("taskmanager", taskManagerReplicas);
+ environment.withExposedService("jobmanager-1", 8081);
+ }
environment.waitingFor("jobmanager-1", buildWaitStrategy(".*Registering TaskManager.*", 1));
- environment.waitingFor(
- "taskmanager-1",
- buildWaitStrategy(".*Successful registration at resource manager.*", 1));
+ for (int i = 1; i <= taskManagerReplicas; i++) {
+ environment.waitingFor(
+ "taskmanager-" + i,
+ buildWaitStrategy(".*Successful registration at resource manager.*", 1));
+ }
environment.start();
jobManager = environment.getContainerByServiceName("jobmanager-1").get();
@@ -156,6 +172,20 @@ public void before() throws Exception {
flinkVersion = flinkVersionMatcher.find() ? flinkVersionMatcher.group(1) : null;
}
+ protected String flinkEnvFile() {
+ return "flink.env";
+ }
+
+ protected String flinkRestUrl() {
+ if (taskManagerReplicas <= 1) {
+ throw new IllegalStateException("Flink REST is not exposed for this test.");
+ }
+ return String.format(
+ "http://%s:%d",
+ environment.getServiceHost("jobmanager-1", 8081),
+ environment.getServicePort("jobmanager-1", 8081));
+ }
+
private WaitStrategy buildWaitStrategy(String regex, int times) {
// Increase timeout from 60s (default value) to 180s
return Wait.forLogMessage(regex, times).withStartupTimeout(Duration.ofSeconds(180));
diff --git a/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/PaimonWriterCoordinatorE2eTest.java b/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/PaimonWriterCoordinatorE2eTest.java
new file mode 100644
index 000000000000..0333155da4a1
--- /dev/null
+++ b/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/PaimonWriterCoordinatorE2eTest.java
@@ -0,0 +1,716 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.tests;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.Timeout;
+import org.testcontainers.containers.Container;
+import org.testcontainers.containers.ContainerState;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.UUID;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** End-to-end tests for committing an append table through Paimon writer coordinator. */
+@Timeout(300)
+public class PaimonWriterCoordinatorE2eTest extends E2eTestBase {
+
+ private static final long WAIT_TIMEOUT_MS = 120_000L;
+ private static final Pattern VERTEX_PATTERN =
+ Pattern.compile(
+ "\\\"id\\\"\\s*:\\s*\\\"([^\\\"]+)\\\"[^{}]*"
+ + "\\\"name\\\"\\s*:\\s*\\\"[^\\\"]*"
+ + "Writer\\(write-only\\)\\s*:\\s*pip30_sink[^\\\"]*\\\"");
+ private static final Pattern INTEGER_PATTERN = Pattern.compile("(\\d+)");
+
+ public PaimonWriterCoordinatorE2eTest() {
+ super(false, false, false, 2);
+ }
+
+ @Override
+ protected String flinkEnvFile() {
+ return "flink-pwc.env";
+ }
+
+ @Test
+ public void testCheckpointCommitWithWriterCoordinator() throws Exception {
+ TestContext context = createContext();
+ writeRecords(context.inputDirectory, 0, 20);
+
+ String jobId = submit(context);
+ waitForJobStatus(jobId, "RUNNING");
+ waitForWriterSubtasks(jobId);
+ waitForRecords();
+ triggerAndWaitForCompletedCheckpoint(jobId);
+
+ assertThat(rest("GET", "/jobs/" + jobId + "/plan", null))
+ .doesNotContain("Committer", "Compact Coordinator", "Compact Worker");
+ waitUntil(
+ () -> jobManager.getLogs().contains("Paimon writer coordinator starting"),
+ "PWC did not start.");
+
+ cancel(jobId);
+ assertTable(context, 0, 20);
+ }
+
+ @Test
+ public void testPartialCheckpointAbortIsRecoveredByNextCheckpoint() throws Exception {
+ TestContext context = createContext();
+ writeRecords(context.inputDirectory, 0, 20);
+
+ // Start the job and finish one checkpoint so the coordinator has committed once.
+ String jobId = submit(context);
+ waitForJobStatus(jobId, "RUNNING");
+ waitForWriterSubtasks(jobId);
+ waitForRecords();
+ triggerAndWaitForCompletedCheckpoint(jobId);
+
+ // Record writer attempts and TMs to later verify the failed checkpoint restarts no task.
+ String writerVertexId = findWriterVertexId(jobId);
+ Map before = waitForWriterSubtasks(jobId);
+ assertThat(before).hasSize(2);
+ assertThat(before.get(0).host).isNotEqualTo(before.get(1).host);
+
+ // Write a second batch, then pause one writer's TM to leave the checkpoint partial.
+ writeRecords(context.inputDirectory, 20, 20);
+ waitForRecords();
+ ContainerState pausedTaskManager = findTaskManager(before.get(1));
+ int failedBefore = checkpointCount(jobId, "failed");
+ boolean paused = false;
+ try {
+ pausedTaskManager
+ .getDockerClient()
+ .pauseContainerCmd(pausedTaskManager.getContainerId())
+ .exec();
+ paused = true;
+
+ triggerCheckpoint(jobId);
+ waitForPartialCheckpoint(jobId);
+ waitForCheckpointCount(jobId, "failed", failedBefore + 1);
+ } finally {
+ // Resume the paused TM so later checkpoints can collect every writer's committables.
+ if (paused) {
+ pausedTaskManager
+ .getDockerClient()
+ .unpauseContainerCmd(pausedTaskManager.getContainerId())
+ .exec();
+ }
+ }
+
+ // Trigger a successful checkpoint; it must recover the aborted partial commit info.
+ waitForJobStatus(jobId, "RUNNING");
+ triggerAndWaitForDataCommitted(jobId, context);
+ Map after = getSubtaskAttempts(jobId, writerVertexId);
+ assertThat(after.get(0).attempt).isEqualTo(before.get(0).attempt);
+ assertThat(after.get(1).attempt).isEqualTo(before.get(1).attempt);
+
+ // Cancel the job to free slots, then check both batches with a batch query.
+ cancel(jobId);
+ assertTable(context, 0, 40);
+ }
+
+ @Test
+ public void testTaskManagerFailureRestoresOnlyAffectedRegion() throws Exception {
+ TestContext context = createContext();
+ writeRecords(context.inputDirectory, 0, 40);
+
+ String jobId = submit(context);
+ waitForJobStatus(jobId, "RUNNING");
+ waitForWriterSubtasks(jobId);
+ waitForRecords();
+ triggerAndWaitForDataCommitted(jobId, context);
+
+ String writerVertexId = findWriterVertexId(jobId);
+ assertSourceAndWriterAreChained(jobId);
+ Map before = waitForWriterSubtasks(jobId);
+ assertThat(before).hasSize(2);
+ assertThat(before.get(0).host).isNotEqualTo(before.get(1).host);
+
+ ContainerState failedTaskManager = findTaskManager(before.get(0));
+ failedTaskManager
+ .getDockerClient()
+ .restartContainerCmd(failedTaskManager.getContainerId())
+ .withTimeout(10)
+ .exec();
+
+ waitUntil(
+ () -> {
+ Map attempts =
+ getSubtaskAttempts(jobId, writerVertexId);
+ return attempts.size() == 2
+ && attempts.values().stream()
+ .allMatch(attempt -> "RUNNING".equals(attempt.status))
+ && attempts.get(0).attempt > before.get(0).attempt
+ && "RUNNING".equals(jobStatus(jobId));
+ },
+ "The affected writer region did not recover.");
+
+ Map after = getSubtaskAttempts(jobId, writerVertexId);
+ assertThat(after.get(0).attempt).isGreaterThan(before.get(0).attempt);
+ assertThat(after.get(1).attempt).isEqualTo(before.get(1).attempt);
+
+ triggerAndWaitForDataCommitted(jobId, context);
+ cancel(jobId);
+ assertTable(context, 0, 40);
+ }
+
+ @Test
+ public void testSavepointRestoreReplaysPendingFileInfo() throws Exception {
+ TestContext context = createContext();
+ writeRecords(context.inputDirectory, 0, 20);
+
+ String firstJobId = submit(context);
+ waitForJobStatus(firstJobId, "RUNNING");
+ waitForWriterSubtasks(firstJobId);
+ waitForRecords();
+ String savepoint = cancelWithSavepoint(firstJobId);
+
+ String restoredJobId = submit(context, savepoint);
+ waitForJobStatus(restoredJobId, "RUNNING");
+ waitForWriterSubtasks(restoredJobId);
+ writeRecords(context.inputDirectory, 20, 20);
+ waitForRecords();
+ triggerAndWaitForDataCommitted(restoredJobId, context);
+
+ waitUntil(
+ () ->
+ countOccurrences(jobManager.getLogs(), "Paimon writer coordinator starting")
+ >= 2,
+ "PWC was not recreated after savepoint restore.");
+
+ cancel(restoredJobId);
+ assertTable(context, 0, 40);
+ }
+
+ private TestContext createContext() {
+ String id = UUID.randomUUID().toString().replace("-", "");
+ String inputDirectory = "pip30-input-" + id;
+ String inputPath = TEST_DATA_DIR + "/" + inputDirectory;
+ String warehouse = TEST_DATA_DIR + "/pip30-" + id;
+
+ String catalogDdl =
+ String.format(
+ "CREATE CATALOG pip30_catalog WITH (\n"
+ + " 'type' = 'paimon',\n"
+ + " 'warehouse' = '%s'\n"
+ + ");",
+ warehouse);
+ String sourceDdl =
+ String.format(
+ "CREATE TEMPORARY TABLE pip30_source (\n"
+ + " sequence_id BIGINT,\n"
+ + " payload STRING\n"
+ + ") WITH (\n"
+ + " 'connector' = 'filesystem',\n"
+ + " 'path' = '%s',\n"
+ + " 'format' = 'csv',\n"
+ + " 'source.monitor-interval' = '1 s'\n"
+ + ");",
+ inputPath);
+ String tableDdl =
+ "CREATE TABLE IF NOT EXISTS pip30_sink (\n"
+ + " sequence_id BIGINT,\n"
+ + " payload STRING\n"
+ + ") WITH (\n"
+ + " 'bucket' = '-1',\n"
+ + " 'write-only' = 'true',\n"
+ + " 'sink.committer-coordinator-operator.enabled' = 'true'\n"
+ + ");";
+ return new TestContext(
+ inputDirectory,
+ catalogDdl,
+ sourceDdl,
+ tableDdl,
+ warehouse + "/default.db/pip30_sink");
+ }
+
+ private String submit(TestContext context) throws Exception {
+ return submit(context, null);
+ }
+
+ private String submit(TestContext context, String savepoint) throws Exception {
+ String restore =
+ savepoint == null
+ ? ""
+ : String.format("SET 'execution.savepoint.path' = '%s';\n", savepoint);
+ return runStreamingSql(
+ "INSERT INTO pip30_sink SELECT * FROM pip30_source;",
+ "SET 'parallelism.default' = '2';\n"
+ + "SET 'execution.checkpointing.interval' = '1 d';\n"
+ + "SET 'execution.checkpointing.timeout' = '10 s';\n"
+ + "SET 'execution.checkpointing.tolerable-failed-checkpoints' = '1';\n"
+ + "SET 'restart-strategy' = 'fixed-delay';\n"
+ + "SET 'restart-strategy.fixed-delay.attempts' = '10';\n"
+ + "SET 'restart-strategy.fixed-delay.delay' = '1 s';\n"
+ + restore,
+ context.catalogDdl,
+ "USE CATALOG pip30_catalog;",
+ context.tableDdl,
+ context.sourceDdl);
+ }
+
+ private void writeRecords(String inputDirectory, int start, int count) throws Exception {
+ StringBuilder records = new StringBuilder();
+ for (int i = start; i < start + count; i++) {
+ records.append(i).append(",value-").append(i).append('\n');
+ }
+ writeSharedFile(inputDirectory + "/" + UUID.randomUUID() + ".csv", records.toString());
+ }
+
+ private void waitForRecords() throws InterruptedException {
+ Thread.sleep(2_000L);
+ }
+
+ private void assertTable(TestContext context, int start, int end) throws Exception {
+ String resultDirectory = "pip30-result-" + UUID.randomUUID();
+ String resultPath = TEST_DATA_DIR + "/" + resultDirectory;
+ runBatchSql(
+ "INSERT INTO pip30_result SELECT sequence_id, payload FROM pip30_sink;",
+ context.catalogDdl,
+ "USE CATALOG pip30_catalog;",
+ context.tableDdl,
+ String.format(
+ "CREATE TEMPORARY TABLE pip30_result (\n"
+ + " sequence_id BIGINT,\n"
+ + " payload STRING\n"
+ + ") WITH (\n"
+ + " 'connector' = 'filesystem',\n"
+ + " 'path' = '%s',\n"
+ + " 'format' = 'csv'\n"
+ + ");",
+ resultPath));
+
+ Map expected = new HashMap<>();
+ for (int i = start; i < end; i++) {
+ expected.compute(i + ",value-" + i, (k, v) -> (v == null ? 0 : v) + 1);
+ }
+ assertThat(readRows(resultPath)).isEqualTo(expected);
+ }
+
+ private Map readRows(String path) throws Exception {
+ Container.ExecResult result =
+ jobManager.execInContainer(
+ "bash",
+ "-c",
+ "if [ -d "
+ + path
+ + " ]; then find "
+ + path
+ + " -type f ! -name '.*' -exec cat {} +; fi");
+ assertCommandSucceeded("read result files", result);
+
+ Map rows = new HashMap<>();
+ for (String row : result.getStdout().split("\\R")) {
+ if (!row.trim().isEmpty()) {
+ rows.compute(row.trim(), (k, v) -> (v == null ? 0 : v) + 1);
+ }
+ }
+ return rows;
+ }
+
+ private void triggerAndWaitForDataCommitted(String jobId, TestContext context)
+ throws Exception {
+ waitUntil(
+ () -> {
+ triggerAndWaitForCompletedCheckpoint(jobId);
+ return latestSnapshotRecordCount(context) >= 40;
+ },
+ "Paimon data was not committed.");
+ }
+
+ private long latestSnapshotRecordCount(TestContext context) throws Exception {
+ Container.ExecResult result =
+ jobManager.execInContainer(
+ "bash",
+ "-c",
+ "latest=$(ls "
+ + context.tableDirectory
+ + "/snapshot/snapshot-* 2>/dev/null | sort -V | tail -1); "
+ + "[ -n \"$latest\" ] && sed -n 's/.*\"totalRecordCount\"[ ]*:[ ]*\\([0-9][0-9]*\\).*/\\1/p' \"$latest\"");
+ if (result.getExitCode() != 0) {
+ return 0L;
+ }
+ String recordCount = result.getStdout().trim();
+ if (recordCount.isEmpty()) {
+ return 0L;
+ }
+ return Long.parseLong(recordCount);
+ }
+
+ private String findWriterVertexId(String jobId) throws Exception {
+ String details = rest("GET", "/jobs/" + jobId, null);
+ Matcher matcher = VERTEX_PATTERN.matcher(details);
+ if (!matcher.find()) {
+ throw new AssertionError("Cannot find writer vertex in job details: " + details);
+ }
+ return matcher.group(1);
+ }
+
+ private void assertSourceAndWriterAreChained(String jobId) throws Exception {
+ String plan = rest("GET", "/jobs/" + jobId + "/plan", null);
+ assertThat(plan)
+ .withFailMessage(
+ "Source and writer must be chained in one parallel vertex to verify"
+ + " subtask-level region failover.%nPlan:%n%s",
+ plan)
+ .contains(
+ "\"parallelism\":2",
+ "TableSourceScan(table=[[pip30_catalog, default, pip30_source]]",
+ "Writer(write-only) : pip30_sink");
+ }
+
+ private Map waitForWriterSubtasks(String jobId) throws Exception {
+ String writerVertexId = findWriterVertexId(jobId);
+ waitUntil(
+ () -> {
+ Map attempts =
+ getSubtaskAttempts(jobId, writerVertexId);
+ return attempts.size() == 2
+ && attempts.values().stream()
+ .allMatch(attempt -> "RUNNING".equals(attempt.status));
+ },
+ "Writer subtasks did not become available.");
+ return getSubtaskAttempts(jobId, writerVertexId);
+ }
+
+ private Map getSubtaskAttempts(String jobId, String vertexId)
+ throws Exception {
+ String details = rest("GET", "/jobs/" + jobId + "/vertices/" + vertexId, null);
+ Map attempts = new HashMap<>();
+ String[] subtasks = details.split("\\\"subtask\\\"\\s*:");
+ for (int i = 1; i < subtasks.length; i++) {
+ String subtask = subtasks[i];
+ Integer index = firstInteger(subtask);
+ Integer attempt = integerField(subtask, "attempt");
+ String status = stringField(subtask, "status");
+ String host = stringField(subtask, "host");
+ if (host == null) {
+ // support for Flink 2.2 REST API
+ host = stringField(subtask, "endpoint");
+ }
+ if (index != null && attempt != null && status != null && host != null) {
+ attempts.put(index, new SubtaskAttempt(attempt, status, host));
+ }
+ }
+ return attempts;
+ }
+
+ private ContainerState findTaskManager(SubtaskAttempt attempt) {
+ String normalizedHost = attempt.host.replace('_', '-');
+ for (int i = 1; i <= 2; i++) {
+ ContainerState taskManager =
+ environment.getContainerByServiceName("taskmanager-" + i).get();
+ boolean hostnameMatches =
+ normalizedHost.endsWith("-taskmanager-" + i)
+ || normalizedHost.contains("-taskmanager-" + i + ".")
+ || normalizedHost.contains("-taskmanager-" + i + ":");
+ boolean ipMatches =
+ taskManager.getContainerInfo().getNetworkSettings().getNetworks().values()
+ .stream()
+ .anyMatch(
+ network ->
+ attempt.host.startsWith(network.getIpAddress() + ":")
+ || attempt.host.equals(network.getIpAddress()));
+ if (hostnameMatches || ipMatches) {
+ return taskManager;
+ }
+ }
+ throw new AssertionError("Cannot map writer host to TaskManager: " + attempt.host);
+ }
+
+ private void triggerAndWaitForCompletedCheckpoint(String jobId) throws Exception {
+ int completedBefore = checkpointCount(jobId, "completed");
+ triggerCheckpoint(jobId);
+ waitForCheckpointCount(jobId, "completed", completedBefore + 1);
+ }
+
+ private void triggerCheckpoint(String jobId) throws Exception {
+ rest("POST", "/jobs/" + jobId + "/checkpoints", "{}");
+ }
+
+ private int checkpointCount(String jobId, String field) throws Exception {
+ String checkpoints = rest("GET", "/jobs/" + jobId + "/checkpoints", null);
+ Matcher counts = Pattern.compile("\\\"counts\\\"\\s*:\\s*\\{([^}]*)}").matcher(checkpoints);
+ if (!counts.find()) {
+ throw new AssertionError("Checkpoint counts are missing: " + checkpoints);
+ }
+ Integer value = integerField(counts.group(1), field);
+ if (value == null) {
+ throw new AssertionError("Checkpoint count is missing for " + field);
+ }
+ return value;
+ }
+
+ private void waitForCheckpointCount(String jobId, String field, int expected) throws Exception {
+ waitUntil(
+ () -> checkpointCount(jobId, field) >= expected,
+ "Checkpoint " + field + " count did not reach " + expected + '.');
+ }
+
+ private void waitForPartialCheckpoint(String jobId) throws Exception {
+ waitUntil(
+ () -> {
+ String checkpoints = rest("GET", "/jobs/" + jobId + "/checkpoints", null);
+ Matcher inProgress =
+ Pattern.compile(
+ "\\\"status\\\"\\s*:\\s*\\\"IN_PROGRESS\\\"([\\s\\S]{0,600})")
+ .matcher(checkpoints);
+ while (inProgress.find()) {
+ Integer acknowledged =
+ integerField(inProgress.group(1), "num_acknowledged_subtasks");
+ if (acknowledged != null && acknowledged > 0) {
+ return true;
+ }
+ }
+ return false;
+ },
+ "Checkpoint did not receive a partial writer snapshot.");
+ }
+
+ private void waitForJobStatus(String jobId, String expected) throws Exception {
+     final String[] lastStatus = new String[1];
+     try {
+         waitUntil(
+                 () -> expected.equals(lastStatus[0] = jobStatus(jobId)),
+                 "Job " + jobId + " did not reach status " + expected);
+     } catch (AssertionError e) {
+         // Gather diagnostics after the wait fails so the last observed status is reported.
+         String exceptions = rest("GET", "/jobs/" + jobId + "/exceptions", null);
+         throw new AssertionError(
+                 String.format(
+                         "%s, last status was %s. Exceptions: %s",
+                         e.getMessage(), lastStatus[0], exceptions),
+                 e);
+     }
+ }
+
+ private String jobStatus(String jobId) throws Exception {
+ return stringField(rest("GET", "/jobs/" + jobId, null), "state");
+ }
+
+ private void cancel(String jobId) throws Exception {
+ Container.ExecResult result =
+ jobManager.execInContainerWithUser("flink", "bin/flink", "cancel", jobId);
+ assertCommandSucceeded("cancel job", result);
+ waitForJobStatus(jobId, "CANCELED");
+ }
+
+ private String cancelWithSavepoint(String jobId) throws Exception {
+ String directory = TEST_DATA_DIR + "/savepoints-" + UUID.randomUUID();
+ Container.ExecResult mkdir =
+ jobManager.execInContainerWithUser("flink", "mkdir", "-p", directory);
+ assertCommandSucceeded("create savepoint directory", mkdir);
+
+ Container.ExecResult result =
+ jobManager.execInContainerWithUser(
+ "flink", "bin/flink", "cancel", "-s", directory, jobId);
+ assertCommandSucceeded("cancel job with savepoint", result);
+
+ String output = result.getStdout() + '\n' + result.getStderr();
+ Matcher path =
+ Pattern.compile("Path:\\s*(\\S+)|Savepoint stored in\\s+(\\S+)\\.").matcher(output);
+
+ if (!path.find()) {
+ throw new AssertionError(
+ "Cannot find savepoint path.\nstdout:\n"
+ + result.getStdout()
+ + "\nstderr:\n"
+ + result.getStderr());
+ }
+
+ String savepointPath = path.group(1) != null ? path.group(1) : path.group(2);
+ waitForJobStatus(jobId, "CANCELED");
+ return savepointPath;
+ }
+
+ private void assertCommandSucceeded(String command, Container.ExecResult result) {
+ assertThat(result.getExitCode())
+ .withFailMessage(
+ "%s failed with exit code %s.\nstdout:\n%s\nstderr:\n%s",
+ command, result.getExitCode(), result.getStdout(), result.getStderr())
+ .isZero();
+ }
+
+ /**
+  * Calls the Flink REST API and returns the response body; non-2xx statuses throw IOException.
+  */
+ private String rest(String method, String path, String body) throws Exception {
+     HttpURLConnection connection =
+             (HttpURLConnection) new URL(flinkRestUrl() + path).openConnection();
+     try {
+         connection.setRequestMethod(method);
+         connection.setConnectTimeout(10_000);
+         connection.setReadTimeout(30_000);
+         if (body != null) {
+             connection.setDoOutput(true);
+             connection.setRequestProperty("Content-Type", "application/json");
+             try (OutputStream output = connection.getOutputStream()) {
+                 output.write(body.getBytes(StandardCharsets.UTF_8));
+             }
+         }
+
+         int responseCode = connection.getResponseCode();
+         boolean succeeded = responseCode >= 200 && responseCode < 300;
+         String response =
+                 read(succeeded ? connection.getInputStream() : connection.getErrorStream());
+         if (!succeeded) {
+             throw new IOException(
+                     String.format(
+                             "Flink REST %s %s failed: %s %s",
+                             method, path, responseCode, response));
+         }
+         return response;
+     } finally {
+         // Release the connection even when the request or the response read fails.
+         connection.disconnect();
+     }
+ }
+
+ private static String read(InputStream input) throws IOException {
+ if (input == null) {
+ return "";
+ }
+ StringBuilder result = new StringBuilder();
+ try (BufferedReader reader =
+ new BufferedReader(new InputStreamReader(input, StandardCharsets.UTF_8))) {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ result.append(line);
+ }
+ }
+ return result.toString();
+ }
+
+ private static Integer firstInteger(String value) {
+ Matcher matcher = INTEGER_PATTERN.matcher(value);
+ return matcher.find() ? Integer.parseInt(matcher.group(1)) : null;
+ }
+
+ private static Integer integerField(String value, String field) {
+ Matcher matcher =
+ Pattern.compile("\\\"" + Pattern.quote(field) + "\\\"\\s*:\\s*(\\d+)")
+ .matcher(value);
+ return matcher.find() ? Integer.parseInt(matcher.group(1)) : null;
+ }
+
+ private static String stringField(String value, String field) {
+ Matcher matcher =
+ Pattern.compile("\\\"" + Pattern.quote(field) + "\\\"\\s*:\\s*\\\"([^\\\"]+)\\\"")
+ .matcher(value);
+ return matcher.find() ? matcher.group(1) : null;
+ }
+
+ private static int countOccurrences(String value, String expected) {
+ int count = 0;
+ int index = 0;
+ while ((index = value.indexOf(expected, index)) >= 0) {
+ count++;
+ index += expected.length();
+ }
+ return count;
+ }
+
+ private static void waitUntil(CheckedBooleanSupplier condition, String failureMessage)
+ throws Exception {
+ long deadline = System.currentTimeMillis() + WAIT_TIMEOUT_MS;
+ Throwable lastFailure = null;
+ while (System.currentTimeMillis() < deadline) {
+ try {
+ if (condition.getAsBoolean()) {
+ return;
+ }
+ } catch (Throwable t) {
+ lastFailure = t;
+ }
+ Thread.sleep(200L);
+ }
+ AssertionError error = new AssertionError(failureMessage);
+ if (lastFailure != null) {
+ error.initCause(lastFailure);
+ }
+ throw error;
+ }
+
+ @FunctionalInterface
+ private interface CheckedBooleanSupplier {
+
+ boolean getAsBoolean() throws Exception;
+ }
+
+ private static class TestContext {
+
+ private final String inputDirectory;
+ private final String catalogDdl;
+ private final String sourceDdl;
+ private final String tableDdl;
+ private final String tableDirectory;
+
+ private TestContext(
+ String inputDirectory,
+ String catalogDdl,
+ String sourceDdl,
+ String tableDdl,
+ String tableDirectory) {
+ this.inputDirectory = inputDirectory;
+ this.catalogDdl = catalogDdl;
+ this.sourceDdl = sourceDdl;
+ this.tableDdl = tableDdl;
+ this.tableDirectory = tableDirectory;
+ }
+ }
+
+ private static class SubtaskAttempt {
+
+ private final int attempt;
+ private final String status;
+ private final String host;
+
+ private SubtaskAttempt(int attempt, String status, String host) {
+ this.attempt = attempt;
+ this.status = status;
+ this.host = host;
+ }
+
+ @Override
+ public String toString() {
+ return "SubtaskAttempt{"
+ + "attempt="
+ + attempt
+ + ", status='"
+ + status
+ + '\''
+ + ", host='"
+ + host
+ + '\''
+ + '}';
+ }
+ }
+}
diff --git a/paimon-e2e-tests/src/test/resources-filtered/docker-compose.yaml b/paimon-e2e-tests/src/test/resources-filtered/docker-compose.yaml
index c9d579fb5657..04f7857b0a38 100644
--- a/paimon-e2e-tests/src/test/resources-filtered/docker-compose.yaml
+++ b/paimon-e2e-tests/src/test/resources-filtered/docker-compose.yaml
@@ -41,7 +41,7 @@ services:
/docker-entrypoint.sh jobmanager
"
env_file:
- - ./flink.env
+ - ./$FLINK_ENV_FILE
networks:
testnetwork:
aliases:
@@ -66,7 +66,7 @@ services:
/docker-entrypoint.sh taskmanager
"
env_file:
- - ./flink.env
+ - ./$FLINK_ENV_FILE
networks:
testnetwork:
aliases:
diff --git a/paimon-e2e-tests/src/test/resources/flink-pwc.env b/paimon-e2e-tests/src/test/resources/flink-pwc.env
new file mode 100644
index 000000000000..a0d1683d9659
--- /dev/null
+++ b/paimon-e2e-tests/src/test/resources/flink-pwc.env
@@ -0,0 +1,19 @@
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+FLINK_PROPERTIES="jobmanager.rpc.address: jobmanager\ntaskmanager.numberOfTaskSlots: 1\nparallelism.default: 3\ncluster.evenly-spread-out-slots: true\nsql-client.execution.result-mode: TABLEAU\nenv.java.opts.taskmanager: -verbose:gc -Xloggc:/opt/flink/log/gc.log\nexecution.checkpointing.checkpoints-after-tasks-finish.enabled: true"
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkConnectorOptions.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkConnectorOptions.java
index 678471ea337c..8577e09e46f5 100644
--- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkConnectorOptions.java
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkConnectorOptions.java
@@ -365,6 +365,13 @@ public class FlinkConnectorOptions {
.withDescription(
"Allow sink committer and writer operator to be chained together");
+ // In this change the option is read by AppendTableSink#doCommit and
+ // RowAppendTableSink#createWriteCoordinatorFactory, and both ignore it while
+ // PRECOMMIT_COMPACT is enabled.
+ public static final ConfigOption SINK_COMMITTER_COORDINATOR_OPERATOR_ENABLED =
+ key("sink.committer-coordinator-operator.enabled")
+ .booleanType()
+ .defaultValue(false)
+ .withDescription(
+ "Allow coordinator replace committer operator, only support for append table now.");
+
public static final ConfigOption PARTITION_MARK_DONE_MODE =
key("partition.mark-done-action.mode")
.enumType(PartitionMarkDoneActionMode.class)
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/AppendTableSink.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/AppendTableSink.java
index 6f33a5e45f02..a700757a1974 100644
--- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/AppendTableSink.java
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/AppendTableSink.java
@@ -32,12 +32,16 @@
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.configuration.ExecutionOptions;
import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
+import org.apache.flink.streaming.api.functions.sink.v2.DiscardingSink;
import javax.annotation.Nullable;
import java.util.Map;
+import static org.apache.paimon.flink.FlinkConnectorOptions.PRECOMMIT_COMPACT;
+import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_COMMITTER_COORDINATOR_OPERATOR_ENABLED;
import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_MANAGED_WRITER_BUFFER_MEMORY;
import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_USE_MANAGED_MEMORY;
import static org.apache.paimon.flink.utils.ManagedMemoryUtils.declareManagedMemory;
@@ -132,4 +136,17 @@ public DataStream doWrite(
return written;
}
+
+ /**
+ * Skips the regular committer when the coordinator option is enabled and precommit compaction
+ * is disabled: the written stream is then terminated by a discarding sink named "end" that
+ * runs at the parallelism of {@code written}. In every other case the parent implementation
+ * is used.
+ *
+ * <p>NOTE(review): this decision is made here for every subclass, while the only coordinator
+ * factory visible in this change is installed by {@code RowAppendTableSink}. Confirm that no
+ * other subclass can reach the discarding branch with committables still on the stream.
+ */
+ @Override
+ public DataStreamSink> doCommit(DataStream written, String commitUser) {
+ Options options = Options.fromMap(table.options());
+ if (options.get(SINK_COMMITTER_COORDINATOR_OPERATOR_ENABLED)
+ && !options.get(PRECOMMIT_COMPACT)) {
+ return written.sinkTo(new DiscardingSink<>())
+ .name("end")
+ .setParallelism(written.getParallelism());
+ } else {
+ return super.doCommit(written, commitUser);
+ }
+ }
}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitHandler.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitHandler.java
new file mode 100644
index 000000000000..200e3a9ce150
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitHandler.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink;
+
+import org.apache.paimon.flink.sink.coordinator.CommitCompleteEvent;
+
+import org.apache.flink.runtime.operators.coordination.OperatorEvent;
+import org.apache.flink.runtime.state.StateInitializationContext;
+import org.apache.flink.runtime.state.StateSnapshotContext;
+
+/**
+ * Handles writer-side interactions with commit coordinator.
+ *
+ * <p>This base implementation is a no-op: it has no fields, never takes over a committable and
+ * only acknowledges {@link CommitCompleteEvent}s without acting on them. Subclasses override the
+ * individual callbacks.
+ */
+public class CommitHandler {
+
+ /** Shared no-op instance; {@code TableWriteOperator} uses it as its default handler. */
+ public static final CommitHandler EMPTY = new CommitHandler();
+
+ /**
+ * Called by the write operator while it initializes its state. Does nothing here.
+ *
+ * @param context state initialization context of the operator
+ * @param subtaskId index of the calling subtask
+ * @param attemptNumber attempt number of the calling subtask
+ * @param commitUser commit user resolved by the operator
+ * @throws Exception declared for overriding implementations; never thrown here
+ */
+ public void initialize(
+ StateInitializationContext context, int subtaskId, int attemptNumber, String commitUser)
+ throws Exception {}
+
+ /** Receives the timestamp of a watermark processed by the writer. Does nothing here. */
+ public void processWatermark(long watermark) {}
+
+ /**
+ * Called by the write operator at the end of its {@code snapshotState}. Does nothing here.
+ *
+ * @throws Exception declared for overriding implementations; never thrown here
+ */
+ public void snapshot(StateSnapshotContext context) throws Exception {}
+
+ /**
+ * Called after the committables prepared for {@code checkpointId} have been offered to {@link
+ * #collect}. Does nothing here.
+ */
+ public void handleCommittables(long checkpointId) {}
+
+ /** Whether the writer must keep its commit user stable across recovery; {@code false} here. */
+ public boolean requiresStableCommitUser() {
+ return false;
+ }
+
+ /**
+ * Offers a committable to the handler.
+ *
+ * @return {@code true} if the handler took the committable; {@code false} (always, in this
+ *     base class) tells the writer to emit it on its regular output
+ */
+ public boolean collect(Committable committable) {
+ return false;
+ }
+
+ /**
+ * Offers a coordinator event to the handler.
+ *
+ * @return {@code true} if the event type is accepted; this base class accepts {@link
+ *     CommitCompleteEvent} and ignores its content
+ */
+ public boolean handleOperatorEvent(OperatorEvent event) {
+ return event instanceof CommitCompleteEvent;
+ }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitterCoordinatedFactory.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitterCoordinatedFactory.java
new file mode 100644
index 000000000000..923037648025
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitterCoordinatedFactory.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink;
+
+import org.apache.paimon.flink.sink.coordinator.PaimonWriterCoordinator;
+
+import org.apache.flink.runtime.jobgraph.OperatorID;
+import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway;
+import org.apache.flink.runtime.operators.coordination.OperatorCoordinator;
+import org.apache.flink.streaming.api.operators.CoordinatedOperatorFactory;
+import org.apache.flink.streaming.api.operators.StreamOperator;
+import org.apache.flink.streaming.api.operators.StreamOperatorParameters;
+
+/**
+ * Factory that installs a JM-side committer coordinator for writer operators.
+ *
+ * <p>It wraps an existing {@link TableWriteOperator.Factory}: the operator itself is still
+ * created by the wrapped factory, and this class additionally attaches a {@link
+ * CoordinatedCommitHandler} to it and supplies the coordinator provider.
+ */
+public class CommitterCoordinatedFactory
+ extends PrepareCommitOperator.Factory
+ implements CoordinatedOperatorFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ private final boolean streamingCheckpointEnabled;
+ private final TableWriteOperator.Factory writeFactory;
+ private final Committer.Factory committerFactory;
+ private final String initialCommitUser;
+ private final Long endInputWatermark;
+
+ /**
+ * Stores the arguments; the options of {@code writeFactory} are passed to the super factory.
+ *
+ * @param streamingCheckpointEnabled forwarded unchanged to the coordinator provider
+ * @param writeFactory factory that creates the actual write operator
+ * @param committerFactory forwarded unchanged to the coordinator provider
+ * @param initialCommitUser forwarded unchanged to the coordinator provider
+ * @param endInputWatermark forwarded unchanged to the coordinator provider
+ */
+ public CommitterCoordinatedFactory(
+ boolean streamingCheckpointEnabled,
+ TableWriteOperator.Factory writeFactory,
+ Committer.Factory committerFactory,
+ String initialCommitUser,
+ Long endInputWatermark) {
+ super(writeFactory.options);
+ this.streamingCheckpointEnabled = streamingCheckpointEnabled;
+ this.writeFactory = writeFactory;
+ this.committerFactory = committerFactory;
+ this.initialCommitUser = initialCommitUser;
+ this.endInputWatermark = endInputWatermark;
+ }
+
+ /**
+ * Creates the write operator through the wrapped factory, gives it a {@link
+ * CoordinatedCommitHandler} bound to the task's operator-coordinator event gateway, and
+ * registers the operator as the event handler for its own operator id.
+ */
+ @Override
+ @SuppressWarnings("unchecked")
+ public > T createStreamOperator(
+ StreamOperatorParameters parameters) {
+ OperatorID operatorId = parameters.getStreamConfig().getOperatorID();
+ TaskOperatorEventGateway gateway =
+ parameters
+ .getContainingTask()
+ .getEnvironment()
+ .getOperatorCoordinatorEventGateway();
+ TableWriteOperator operator = writeFactory.createStreamOperator(parameters);
+ // The handler must be in place before the operator initializes its state.
+ operator.setCommitHandler(new CoordinatedCommitHandler(gateway, operatorId));
+ parameters.getOperatorEventDispatcher().registerEventHandler(operatorId, operator);
+ return (T) operator;
+ }
+
+ /** Delegates to the wrapped write factory. */
+ @Override
+ @SuppressWarnings("rawtypes")
+ public Class extends StreamOperator> getStreamOperatorClass(ClassLoader classLoader) {
+ return writeFactory.getStreamOperatorClass(classLoader);
+ }
+
+ /**
+ * Builds the {@link PaimonWriterCoordinator} provider from the given operator name and id
+ * together with the settings captured by the constructor.
+ */
+ @Override
+ public OperatorCoordinator.Provider getCoordinatorProvider(
+ String operatorName, OperatorID operatorID) {
+ return new PaimonWriterCoordinator.WriterCoordinatorProvider(
+ streamingCheckpointEnabled,
+ operatorName,
+ operatorID,
+ initialCommitUser,
+ committerFactory,
+ endInputWatermark);
+ }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CoordinatedCommitHandler.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CoordinatedCommitHandler.java
new file mode 100644
index 000000000000..258625cebf86
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CoordinatedCommitHandler.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink;
+
+import org.apache.paimon.flink.sink.coordinator.CommitCompleteEvent;
+import org.apache.paimon.flink.sink.coordinator.CoordinatedCommittableState;
+import org.apache.paimon.flink.sink.coordinator.CoordinatedFileInfoSender;
+
+import org.apache.flink.runtime.jobgraph.OperatorID;
+import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway;
+import org.apache.flink.runtime.operators.coordination.OperatorEvent;
+import org.apache.flink.runtime.state.StateInitializationContext;
+import org.apache.flink.runtime.state.StateSnapshotContext;
+
+import java.util.List;
+
+import static org.apache.paimon.flink.sink.coordinator.CommitterCoordinator.END_INPUT_CHECKPOINT_ID;
+
+/**
+ * PIP-30 writer-side handler for the Paimon writer coordinator.
+ *
+ * <p>Committables offered through {@link #collect} are kept in a {@link
+ * CoordinatedCommittableState} instead of being emitted downstream, and are handed to the
+ * coordinator through a {@link CoordinatedFileInfoSender}.
+ */
+public class CoordinatedCommitHandler extends CommitHandler {
+
+ private final CoordinatedFileInfoSender sender;
+
+ // Created in initialize(); null until then, which is why every use below is null-guarded.
+ private transient CoordinatedCommittableState state;
+
+ /** Creates the sender bound to the given event gateway and operator id. */
+ public CoordinatedCommitHandler(TaskOperatorEventGateway gateway, OperatorID operatorId) {
+ this.sender = new CoordinatedFileInfoSender(gateway, operatorId);
+ }
+
+ /**
+ * Configures the sender with subtask id and attempt number and creates and initializes the
+ * committable state. When the context is restored, all committables held by the state are sent
+ * to the coordinator as recovered file info for the restored checkpoint id and are then marked
+ * as acknowledged.
+ *
+ * @throws IllegalStateException if the context is restored but has no restored checkpoint id
+ */
+ @Override
+ public void initialize(
+ StateInitializationContext context, int subtaskId, int attemptNumber, String commitUser)
+ throws Exception {
+ sender.setSubtaskId(subtaskId);
+ sender.setAttemptNumber(attemptNumber);
+ state = new CoordinatedCommittableState();
+ state.initialize(context);
+ if (context.isRestored()) {
+ long restoredCheckpointId =
+ context.getRestoredCheckpointId()
+ .orElseThrow(
+ () ->
+ new IllegalStateException(
+ "Restored checkpoint id is missing."));
+ List committables = state.committables();
+ sender.sendRecoveredFileInfoToCoordinator(
+ restoredCheckpointId, commitUser, committables);
+ state.markAcknowledged(committables);
+ }
+ }
+
+ /** Forwards the watermark timestamp to the sender. */
+ @Override
+ public void processWatermark(long watermark) {
+ sender.processWatermark(watermark);
+ }
+
+ /**
+ * Snapshots the committable state for the checkpoint and, unless the sender reports end of
+ * input, sends the not yet acknowledged committables to the coordinator.
+ */
+ @Override
+ public void snapshot(StateSnapshotContext context) throws Exception {
+ if (state != null) {
+ state.snapshot(context.getCheckpointId());
+ }
+ if (!sender.isEndInput()) {
+ sendUnacknowledgedCommittables(context.getCheckpointId());
+ }
+ }
+
+ /**
+ * Sends the not yet acknowledged committables only when {@code checkpointId} is the end-input
+ * marker; for every other checkpoint id this is a no-op because sending happens in {@link
+ * #snapshot}.
+ */
+ @Override
+ public void handleCommittables(long checkpointId) {
+ if (checkpointId == END_INPUT_CHECKPOINT_ID) {
+ sendUnacknowledgedCommittables(checkpointId);
+ }
+ }
+
+ /** Always {@code true}: this handler needs a commit user that survives recovery. */
+ @Override
+ public boolean requiresStableCommitUser() {
+ return true;
+ }
+
+ /**
+ * Adds the committable to the state and reports it as consumed, so the writer does not emit it
+ * downstream.
+ */
+ @Override
+ public boolean collect(Committable committable) {
+ if (state != null) {
+ state.add(committable);
+ }
+ // NOTE(review): true is also returned while state is null, in which case the committable
+ // is neither stored nor emitted. Confirm collect() cannot run before initialize().
+ return true;
+ }
+
+ /**
+ * Handles {@link CommitCompleteEvent} by marking the state as committed up to the event's
+ * checkpoint id.
+ *
+ * @return {@code true} for a {@link CommitCompleteEvent}, {@code false} for any other event
+ */
+ @Override
+ public boolean handleOperatorEvent(OperatorEvent event) {
+ if (event instanceof CommitCompleteEvent) {
+ if (state != null) {
+ state.markCommittedUpTo(((CommitCompleteEvent) event).checkpointId());
+ }
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Sends the state's unacknowledged committables to the coordinator for {@code checkpointId}
+ * and then marks exactly those committables as acknowledged. Does nothing before {@link
+ * #initialize} has created the state.
+ */
+ private void sendUnacknowledgedCommittables(long checkpointId) {
+ if (state == null) {
+ return;
+ }
+
+ List committables = state.unacknowledgedCommittables();
+ sender.sendToCoordinator(checkpointId, committables);
+ state.markAcknowledged(committables);
+ }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSink.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSink.java
index 959132ad58e0..edd5fff5e9c3 100644
--- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSink.java
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSink.java
@@ -126,28 +126,30 @@ public DataStream doWrite(
DataStream input, String commitUser, @Nullable Integer parallelism) {
StreamExecutionEnvironment env = input.getExecutionEnvironment();
boolean isStreaming = isStreaming(input);
+ Options options = Options.fromMap(table.options());
+ boolean streamingCheckpointEnabled =
+ isStreaming && env.getCheckpointConfig().isCheckpointingEnabled();
boolean writeOnly = table.coreOptions().writeOnly();
SingleOutputStreamOperator written =
input.transform(
(writeOnly ? WRITER_WRITE_ONLY_NAME : WRITER_NAME) + " : " + table.name(),
new CommittableTypeInfo(),
- createWriteOperatorFactory(
+ createWriteCoordinatorFactory(
StoreSinkWrite.createWriteProvider(
table,
env.getCheckpointConfig(),
isStreaming,
ignorePreviousFiles,
hasSinkMaterializer(input)),
- commitUser));
+ commitUser,
+ streamingCheckpointEnabled));
if (parallelism == null) {
forwardParallelism(written, input);
} else {
written.setParallelism(parallelism);
}
- Options options = Options.fromMap(table.options());
-
String uidSuffix = options.get(SINK_OPERATOR_UID_SUFFIX);
if (options.get(SINK_OPERATOR_UID_SUFFIX) != null) {
written = written.uid(generateCustomUid(WRITER_NAME, table.name(), uidSuffix));
@@ -307,6 +309,11 @@ public static void assertBatchAdaptiveParallelism(
protected abstract OneInputStreamOperatorFactory createWriteOperatorFactory(
StoreSinkWrite.Provider writeProvider, String commitUser);
+ /**
+ * Creates the operator factory that {@code doWrite} uses for the writer. This default
+ * implementation ignores the flag and delegates to {@code createWriteOperatorFactory};
+ * subclasses can override it to return a coordinator-aware factory.
+ *
+ * @param writeProvider provider forwarded to the write operator factory
+ * @param commitUser commit user forwarded to the write operator factory
+ * @param isStreaming despite its name, {@code doWrite} passes {@code true} only when the input
+ *     is streaming and checkpointing is enabled
+ */
+ protected OneInputStreamOperatorFactory createWriteCoordinatorFactory(
+ StoreSinkWrite.Provider writeProvider, String commitUser, boolean isStreaming) {
+ return createWriteOperatorFactory(writeProvider, commitUser);
+ }
+
protected abstract Committer.Factory createCommitterFactory();
protected abstract CommittableStateManager createCommittableStateManager();
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/PrepareCommitOperator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/PrepareCommitOperator.java
index 35f5ff15b9ae..874b0850a9d2 100644
--- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/PrepareCommitOperator.java
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/PrepareCommitOperator.java
@@ -112,8 +112,14 @@ public void close() throws Exception {
}
private void emitCommittables(boolean waitCompaction, long checkpointId) throws IOException {
- prepareCommit(waitCompaction, checkpointId)
- .forEach(committable -> output.collect(new StreamRecord<>(committable)));
+ prepareCommit(waitCompaction, checkpointId).forEach(this::collect);
+ handleCommittables(checkpointId);
+ }
+
+ /**
+ * Hook called by {@code emitCommittables} once all committables prepared for {@code
+ * checkpointId} have been passed to {@code collect}. Does nothing by default.
+ */
+ protected void handleCommittables(long checkpointId) {}
+
+ /**
+ * Emits one committable on the operator output, wrapped in a {@code StreamRecord}. Subclasses
+ * may override this to route committables elsewhere.
+ */
+ protected void collect(OUT committable) {
+ output.collect(new StreamRecord<>(committable));
}
protected abstract List prepareCommit(boolean waitCompaction, long checkpointId)
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/RowAppendTableSink.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/RowAppendTableSink.java
index 6e3272f9e0e9..793cdeaabfad 100644
--- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/RowAppendTableSink.java
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/RowAppendTableSink.java
@@ -20,12 +20,17 @@
import org.apache.paimon.data.InternalRow;
import org.apache.paimon.manifest.ManifestCommittable;
+import org.apache.paimon.options.Options;
import org.apache.paimon.table.FileStoreTable;
import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory;
import java.util.Map;
+import static org.apache.paimon.flink.FlinkConnectorOptions.END_INPUT_WATERMARK;
+import static org.apache.paimon.flink.FlinkConnectorOptions.PRECOMMIT_COMPACT;
+import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_COMMITTER_COORDINATOR_OPERATOR_ENABLED;
+
/** An {@link AppendTableSink} which handles {@link InternalRow}. */
public class RowAppendTableSink extends AppendTableSink {
@@ -46,4 +51,24 @@ protected OneInputStreamOperatorFactory createWriteOpe
protected CommittableStateManager createCommittableStateManager() {
return createRestoreOnlyCommittableStateManager(table);
}
+
+ /**
+ * Returns a {@link CommitterCoordinatedFactory} wrapping the no-state row write operator
+ * factory when the coordinator option is enabled and precommit compaction is disabled;
+ * otherwise falls back to {@code createWriteOperatorFactory}.
+ *
+ * @param isStreaming passed to the coordinated factory as its "streaming checkpoint enabled"
+ *     flag
+ */
+ @Override
+ @SuppressWarnings("unchecked")
+ protected OneInputStreamOperatorFactory createWriteCoordinatorFactory(
+ StoreSinkWrite.Provider writeProvider, String commitUser, boolean isStreaming) {
+ Options options = table.coreOptions().toConfiguration();
+ boolean coordinatorEnabled =
+ options.get(SINK_COMMITTER_COORDINATOR_OPERATOR_ENABLED)
+ && !options.get(PRECOMMIT_COMPACT);
+ return coordinatorEnabled
+ ? new CommitterCoordinatedFactory(
+ isStreaming,
+ // NOTE(review): assumes createNoStateRowWriteOperatorFactory returns a
+ // TableWriteOperator.Factory; confirm against its declaration.
+ (TableWriteOperator.Factory)
+ createNoStateRowWriteOperatorFactory(
+ table, writeProvider, commitUser),
+ createCommitterFactory(),
+ commitUser,
+ options.get(END_INPUT_WATERMARK))
+ : createWriteOperatorFactory(writeProvider, commitUser);
+ }
}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/StatelessRowDataStoreWriteOperator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/StatelessRowDataStoreWriteOperator.java
index eac883b56d9b..69318a07f75b 100644
--- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/StatelessRowDataStoreWriteOperator.java
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/StatelessRowDataStoreWriteOperator.java
@@ -47,7 +47,11 @@ protected StoreSinkWriteState createState(
}
@Override
- protected String getCommitUser(StateInitializationContext context) {
+ protected String getCommitUser(StateInitializationContext context) throws Exception {
+ if (commitHandler.requiresStableCommitUser()) {
+ // PWC requires the commit user to remain stable across recovery.
+ return super.getCommitUser(context);
+ }
// No conflicts will occur in append only unaware bucket writer, so
// commitUser does not matter.
return commitUser == null ? initialCommitUser : commitUser;
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/TableWriteOperator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/TableWriteOperator.java
index f93bdfb560dd..c672604eb5e9 100644
--- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/TableWriteOperator.java
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/TableWriteOperator.java
@@ -31,12 +31,15 @@
import org.apache.flink.runtime.jobgraph.OperatorID;
import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway;
import org.apache.flink.runtime.operators.coordination.OperatorCoordinator;
+import org.apache.flink.runtime.operators.coordination.OperatorEvent;
+import org.apache.flink.runtime.operators.coordination.OperatorEventHandler;
import org.apache.flink.runtime.state.StateInitializationContext;
import org.apache.flink.runtime.state.StateSnapshotContext;
import org.apache.flink.streaming.api.operators.CoordinatedOperatorFactory;
import org.apache.flink.streaming.api.operators.StreamOperator;
import org.apache.flink.streaming.api.operators.StreamOperatorFactory;
import org.apache.flink.streaming.api.operators.StreamOperatorParameters;
+import org.apache.flink.streaming.api.watermark.Watermark;
import javax.annotation.Nullable;
@@ -44,11 +47,13 @@
import java.util.List;
/** An abstract class for table write operator. */
-public abstract class TableWriteOperator extends PrepareCommitOperator {
+public abstract class TableWriteOperator extends PrepareCommitOperator
+ implements OperatorEventHandler {
private static final long serialVersionUID = 1L;
protected FileStoreTable table;
+ protected CommitHandler commitHandler = CommitHandler.EMPTY;
protected final StoreSinkWrite.Provider storeSinkWriteProvider;
protected final String initialCommitUser;
@@ -77,6 +82,12 @@ public void initializeState(StateInitializationContext context) throws Exception
int numTasks = RuntimeContextUtils.getNumberOfParallelSubtasks(getRuntimeContext());
int subtaskId = RuntimeContextUtils.getIndexOfThisSubtask(getRuntimeContext());
+ String currentCommitUser = getCommitUser(context);
+ commitHandler.initialize(
+ context,
+ subtaskId,
+ RuntimeContextUtils.getAttemptNumber(getRuntimeContext()),
+ currentCommitUser);
StateValueFilter stateFilter =
(tableName, partition, bucket) ->
subtaskId == ChannelComputer.select(partition, bucket, numTasks);
@@ -85,7 +96,7 @@ public void initializeState(StateInitializationContext context) throws Exception
write =
storeSinkWriteProvider.provide(
table,
- getCommitUser(context),
+ currentCommitUser,
state,
getContainingTask().getEnvironment().getIOManager(),
memoryPoolFactory,
@@ -100,6 +111,16 @@ public void setWriteRestore(@Nullable WriteRestore writeRestore) {
this.writeRestore = writeRestore;
}
+ /**
+ * Replaces the commit handler of this operator. {@code initializeState} calls {@code
+ * initialize} on whichever handler is set at that time, so a custom handler has to be
+ * installed before state initialization.
+ */
+ public void setCommitHandler(CommitHandler commitHandler) {
+ this.commitHandler = commitHandler;
+ }
+
+ /** Processes the watermark as usual and then passes its timestamp to the commit handler. */
+ @Override
+ public void processWatermark(Watermark mark) throws Exception {
+ super.processWatermark(mark);
+ commitHandler.processWatermark(mark.getTimestamp());
+ }
+
protected StoreSinkWriteState createState(
int subtaskId,
StateInitializationContext context,
@@ -127,6 +148,7 @@ public void snapshotState(StateSnapshotContext context) throws Exception {
write.snapshotState();
state.snapshotState();
+ commitHandler.snapshot(context);
}
@Override
@@ -143,6 +165,26 @@ protected List prepareCommit(boolean waitCompaction, long checkpoin
return write.prepareCommit(waitCompaction, checkpointId);
}
+ /** Delegates the end-of-batch notification for {@code checkpointId} to the commit handler. */
+ @Override
+ protected void handleCommittables(long checkpointId) {
+ commitHandler.handleCommittables(checkpointId);
+ }
+
+ /**
+ * Offers the committable to the commit handler first; only when the handler declines it
+ * (returns {@code false}) is the committable emitted through the inherited path.
+ */
+ @Override
+ protected void collect(Committable committable) {
+ if (!commitHandler.collect(committable)) {
+ super.collect(committable);
+ }
+ }
+
+ /**
+ * Passes a coordinator event to the commit handler.
+ *
+ * @throws IllegalArgumentException if the commit handler does not accept the event
+ */
+ @Override
+ public void handleOperatorEvent(OperatorEvent event) {
+ if (commitHandler.handleOperatorEvent(event)) {
+ return;
+ }
+ throw new IllegalArgumentException("Unsupported operator event: " + event.getClass());
+ }
+
@VisibleForTesting
public StoreSinkWrite getWrite() {
return write;
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitCompleteEvent.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitCompleteEvent.java
new file mode 100644
index 000000000000..e26d04e012f8
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitCompleteEvent.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.flink.runtime.operators.coordination.OperatorEvent;
+
+/**
+ * Coordinator event telling a writer that committables up to a checkpoint can be cleaned.
+ *
+ * <p>The event is an immutable carrier of a single checkpoint id.
+ */
+public class CommitCompleteEvent implements OperatorEvent {
+
+ private static final long serialVersionUID = 1L;
+
+ private final long checkpointId;
+
+ /** Creates an event for the given checkpoint id. */
+ public CommitCompleteEvent(long checkpointId) {
+ this.checkpointId = checkpointId;
+ }
+
+ /** Returns the checkpoint id this event was created with. */
+ public long checkpointId() {
+ return checkpointId;
+ }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitResult.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitResult.java
new file mode 100644
index 000000000000..145e81ffcdc5
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitResult.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+/**
+ * Result of a PWC commit attempt.
+ *
+ * <p>Immutable value with four components: whether something was committed, a committed count,
+ * the checkpoint id, and a flag for restored commits.
+ */
+class CommitResult {
+
+ /** Result for "nothing committed": not committed, count 0, checkpoint id -1, not restored. */
+ static final CommitResult NONE = new CommitResult(false, 0, -1, false);
+
+ private final boolean committed;
+ private final int committedCount;
+ private final long checkpointId;
+ // NOTE(review): presumably marks a commit of committables recovered from state; confirm
+ // against the code that constructs CommitResult.
+ private final boolean restoredCommit;
+
+ /** Stores the given values as-is. */
+ CommitResult(boolean committed, int committedCount, long checkpointId, boolean restoredCommit) {
+ this.committed = committed;
+ this.committedCount = committedCount;
+ this.checkpointId = checkpointId;
+ this.restoredCommit = restoredCommit;
+ }
+
+ /** Returns the {@code committed} flag passed to the constructor. */
+ boolean committed() {
+ return committed;
+ }
+
+ /** Returns the committed count passed to the constructor. */
+ int committedCount() {
+ return committedCount;
+ }
+
+ /** Returns the checkpoint id passed to the constructor ({@code -1} for {@link #NONE}). */
+ long checkpointId() {
+ return checkpointId;
+ }
+
+ /** Returns the {@code restoredCommit} flag passed to the constructor. */
+ boolean restoredCommit() {
+ return restoredCommit;
+ }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitterCoordinator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitterCoordinator.java
new file mode 100644
index 000000000000..f73244456898
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitterCoordinator.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.paimon.flink.sink.Committer;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableMap;
+import java.util.TreeMap;
+
+import static org.apache.paimon.utils.Preconditions.checkNotNull;
+
+/**
+ * JM-side global committer used by PaimonWriterCoordinator.
+ *
+ * <p>Committables passed to {@link #save} are grouped by checkpoint, combined into one global
+ * committable per checkpoint and buffered in checkpoint order. Buffered entries are committed and
+ * dropped by {@link #notifyCheckpointComplete}, by {@link #filterAndCommitUpToCheckpoint}, or at
+ * end of input when streaming checkpoints are disabled.
+ *
+ * <p>{@link #init} has to be called before any method that uses the {@link Committer}, because
+ * the committer is only created there.
+ *
+ * @param <CommitT> committable type received from the writers
+ * @param <GlobalCommitT> combined per-checkpoint committable type handed to the {@link Committer}
+ */
+public class CommitterCoordinator<CommitT, GlobalCommitT> {
+
+    /** Checkpoint id marking end of input; {@link #save} with this id triggers end-input logic. */
+    public static final long END_INPUT_CHECKPOINT_ID = Long.MAX_VALUE;
+
+    private final boolean streamingCheckpointEnabled;
+    private final Committer.Factory<CommitT, GlobalCommitT> committerFactory;
+    /** Watermark applied once input has ended; {@code null} keeps the tracked watermark. */
+    private final Long endInputWatermark;
+    /** Combined committables keyed by checkpoint id, in ascending checkpoint order. */
+    private final NavigableMap<Long, GlobalCommitT> committablesPerCheckpoint;
+
+    private Committer<CommitT, GlobalCommitT> committer;
+    private long globalWatermark;
+    private boolean endInput;
+
+    /**
+     * Creates a coordinator without a committer; call {@link #init} before using it.
+     *
+     * @param streamingCheckpointEnabled whether commits are driven by completed checkpoints
+     * @param committerFactory factory for the underlying committer, must not be null
+     * @param endInputWatermark watermark to apply at end of input, may be null
+     */
+    public CommitterCoordinator(
+            boolean streamingCheckpointEnabled,
+            Committer.Factory<CommitT, GlobalCommitT> committerFactory,
+            Long endInputWatermark) {
+        this.streamingCheckpointEnabled = streamingCheckpointEnabled;
+        this.committerFactory = checkNotNull(committerFactory);
+        this.endInputWatermark = endInputWatermark;
+        this.committablesPerCheckpoint = new TreeMap<>();
+        this.globalWatermark = Long.MIN_VALUE;
+    }
+
+    /**
+     * Resets the watermark and end-input flag and creates the committer if none exists yet.
+     *
+     * <p>An already created committer is reused, so calling this method again does not replace
+     * it.
+     *
+     * @param parallelism parallelism passed to the committer context
+     * @param commitUser commit user passed to the committer context
+     * @throws Exception if the factory fails to create the committer
+     */
+    public void init(int parallelism, String commitUser) throws Exception {
+        this.globalWatermark = Long.MIN_VALUE;
+        this.endInput = false;
+        if (committer == null) {
+            committer =
+                    committerFactory.create(
+                            Committer.createContext(
+                                    commitUser,
+                                    null,
+                                    streamingCheckpointEnabled,
+                                    false,
+                                    null,
+                                    parallelism,
+                                    0));
+        }
+    }
+
+    /**
+     * Buffers the given committables and advances the tracked watermark.
+     *
+     * <p>If {@code checkpointId} is {@link #END_INPUT_CHECKPOINT_ID}, end-of-input handling runs
+     * afterwards, which commits immediately when streaming checkpoints are disabled.
+     *
+     * @param committables committables to buffer
+     * @param checkpointId checkpoint the call belongs to
+     * @param watermark watermark reported with the committables
+     * @throws Exception if combining or committing fails
+     */
+    public void save(List<CommitT> committables, long checkpointId, long watermark)
+            throws Exception {
+        processWatermark(watermark);
+        // NOTE(review): end-input committables are combined with the watermark tracked so far;
+        // endInputWatermark is only applied afterwards in endInput(). Confirm this order is
+        // intended.
+        pollInputs(committables);
+        if (checkpointId == END_INPUT_CHECKPOINT_ID) {
+            endInput();
+        }
+    }
+
+    /** Groups the inputs by checkpoint and stores one combined committable per checkpoint. */
+    private void pollInputs(Collection<CommitT> inputs) throws Exception {
+        Map<Long, List<CommitT>> grouped = committer.groupByCheckpoint(inputs);
+        for (Map.Entry<Long, List<CommitT>> entry : grouped.entrySet()) {
+            Long checkpoint = entry.getKey();
+            List<CommitT> committables = entry.getValue();
+            if (checkpoint != null
+                    && checkpoint == END_INPUT_CHECKPOINT_ID
+                    && committablesPerCheckpoint.containsKey(checkpoint)) {
+                // End-input committables may arrive in several calls: merge into the entry.
+                GlobalCommitT merged =
+                        committer.combine(
+                                checkpoint,
+                                globalWatermark,
+                                committablesPerCheckpoint.get(checkpoint),
+                                committables);
+                committablesPerCheckpoint.put(checkpoint, merged);
+            } else if (committablesPerCheckpoint.containsKey(checkpoint)) {
+                // A regular checkpoint that is already buffered is not overwritten.
+                continue;
+            } else {
+                committablesPerCheckpoint.put(
+                        checkpoint, committer.combine(checkpoint, globalWatermark, committables));
+            }
+        }
+    }
+
+    /** Raises the tracked watermark; {@code Long.MAX_VALUE} is ignored. */
+    private void processWatermark(long watermark) {
+        if (watermark != Long.MAX_VALUE) {
+            globalWatermark = Math.max(globalWatermark, watermark);
+        }
+    }
+
+    /** Marks input as ended and commits everything right away if checkpoints are disabled. */
+    private void endInput() throws Exception {
+        endInput = true;
+        if (endInputWatermark != null) {
+            globalWatermark = endInputWatermark;
+        }
+        if (!streamingCheckpointEnabled) {
+            commitUpToCheckpoint(END_INPUT_CHECKPOINT_ID);
+        }
+    }
+
+    /** Returns true only if input has ended and streaming checkpoints are enabled. */
+    public boolean isEndInput() {
+        return endInput && streamingCheckpointEnabled;
+    }
+
+    /**
+     * Commits buffered committables up to the completed checkpoint, or all of them once input
+     * has ended.
+     *
+     * @param checkpointId id of the completed checkpoint
+     * @throws Exception if the commit fails
+     */
+    public void notifyCheckpointComplete(long checkpointId) throws Exception {
+        commitUpToCheckpoint(endInput ? END_INPUT_CHECKPOINT_ID : checkpointId);
+    }
+
+    /**
+     * Commits buffered committables up to and including {@code checkpointId} through {@link
+     * Committer#filterAndCommit} and removes them from the buffer.
+     *
+     * @param checkpointId highest checkpoint id to commit
+     * @return the value returned by {@link Committer#filterAndCommit}
+     * @throws Exception if the commit fails
+     */
+    public int filterAndCommitUpToCheckpoint(long checkpointId) throws Exception {
+        NavigableMap<Long, GlobalCommitT> headMap =
+                committablesPerCheckpoint.headMap(checkpointId, true);
+        int committed =
+                committer.filterAndCommit(committablesToCommit(headMap, checkpointId), true, false);
+        headMap.clear();
+        return committed;
+    }
+
+    /** Commits buffered committables up to and including the checkpoint and drops them. */
+    private void commitUpToCheckpoint(long checkpointId) throws Exception {
+        NavigableMap<Long, GlobalCommitT> headMap =
+                committablesPerCheckpoint.headMap(checkpointId, true);
+        List<GlobalCommitT> committables = committablesToCommit(headMap, checkpointId);
+        if (checkpointId == END_INPUT_CHECKPOINT_ID) {
+            committer.filterAndCommit(committables, false, true);
+        } else {
+            committer.commit(committables);
+        }
+        headMap.clear();
+    }
+
+    /**
+     * Copies the committables of {@code headMap}; if there are none and the committer forces
+     * snapshot creation, returns a single committable combined from an empty list instead.
+     */
+    private List<GlobalCommitT> committablesToCommit(
+            NavigableMap<Long, GlobalCommitT> headMap, long checkpointId) throws Exception {
+        List<GlobalCommitT> committables = new ArrayList<>(headMap.values());
+        if (committables.isEmpty() && committer.forceCreatingSnapshot()) {
+            committables =
+                    Collections.singletonList(
+                            committer.combine(
+                                    checkpointId, globalWatermark, Collections.emptyList()));
+        }
+        return committables;
+    }
+
+    /**
+     * Intentionally does nothing.
+     *
+     * @param checkpointId id of the aborted checkpoint
+     */
+    public void notifyCheckpointAborted(long checkpointId) {
+        // Checkpoint abort is not committable abort. Keep pending committables for a later
+        // completed checkpoint.
+    }
+
+    /**
+     * Drops all buffered committables and closes the committer if one was created.
+     *
+     * @throws Exception if closing the committer fails
+     */
+    public void close() throws Exception {
+        committablesPerCheckpoint.clear();
+        if (committer != null) {
+            committer.close();
+        }
+    }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableState.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableState.java
new file mode 100644
index 000000000000..421a9315a709
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableState.java
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.paimon.flink.sink.Committable;
+import org.apache.paimon.flink.sink.CommittableSerializer;
+import org.apache.paimon.table.sink.CommitMessageSerializer;
+
+import org.apache.flink.api.common.state.ListState;
+import org.apache.flink.api.common.state.ListStateDescriptor;
+import org.apache.flink.api.common.typeutils.base.array.BytePrimitiveArraySerializer;
+import org.apache.flink.core.memory.DataInputDeserializer;
+import org.apache.flink.core.memory.DataOutputViewStreamWrapper;
+import org.apache.flink.runtime.state.StateInitializationContext;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableMap;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+/** Writer-side state for committables which have not been committed by PWC. */
+public class CoordinatedCommittableState {
+
+ private static final String STATE_NAME = "pwc_pending_committables";
+
+ private ListState state;
+ private CheckpointCommittablesSerializer serializer;
+ private final NavigableMap> pendingCommittables = new TreeMap<>();
+ private final Set acknowledgedCheckpoints = new TreeSet<>();
+
+ public void initialize(StateInitializationContext context) throws Exception {
+ CommittableSerializer committableSerializer =
+ new CommittableSerializer(new CommitMessageSerializer());
+ serializer = new CheckpointCommittablesSerializer(committableSerializer);
+ state =
+ context.getOperatorStateStore()
+ .getListState(
+ new ListStateDescriptor<>(
+ STATE_NAME, BytePrimitiveArraySerializer.INSTANCE));
+ for (byte[] bytes : state.get()) {
+ CheckpointCommittables checkpoint = serializer.deserialize(bytes);
+ pendingCommittables
+ .computeIfAbsent(checkpoint.checkpointId(), ignored -> new ArrayList<>())
+ .addAll(checkpoint.committables());
+ }
+ }
+
+ public void add(Committable committable) {
+ pendingCommittables
+ .computeIfAbsent(committable.checkpointId(), ignored -> new ArrayList<>())
+ .add(committable);
+ }
+
+ public void snapshot(long checkpointId) throws Exception {
+ pendingCommittables.computeIfAbsent(checkpointId, ignored -> new ArrayList<>());
+ List checkpoints = new ArrayList<>();
+ for (Map.Entry> entry : pendingCommittables.entrySet()) {
+ checkpoints.add(
+ serializer.serialize(
+ new CheckpointCommittables(
+ entry.getKey(), new ArrayList<>(entry.getValue()))));
+ }
+ state.update(checkpoints);
+ }
+
+ public Map> pendingCommittables() {
+ Map> result = new TreeMap<>();
+ for (Map.Entry> entry : pendingCommittables.entrySet()) {
+ result.put(entry.getKey(), new ArrayList<>(entry.getValue()));
+ }
+ return result;
+ }
+
+ public List committables() {
+ List result = new ArrayList<>();
+ for (List committables : pendingCommittables.values()) {
+ result.addAll(committables);
+ }
+ return result;
+ }
+
+ public List unacknowledgedCommittables() {
+ List result = new ArrayList<>();
+ for (Map.Entry> entry : pendingCommittables.entrySet()) {
+ if (!acknowledgedCheckpoints.contains(entry.getKey())) {
+ result.addAll(entry.getValue());
+ }
+ }
+ return result;
+ }
+
+ public void markAcknowledged(List committables) {
+ for (Committable committable : committables) {
+ acknowledgedCheckpoints.add(committable.checkpointId());
+ }
+ }
+
+ public void markCommittedUpTo(long checkpointId) {
+ pendingCommittables.headMap(checkpointId, true).clear();
+ acknowledgedCheckpoints.removeIf(id -> id <= checkpointId);
+ }
+
+ public void clear() throws Exception {
+ pendingCommittables.clear();
+ acknowledgedCheckpoints.clear();
+ if (state != null) {
+ state.clear();
+ }
+ }
+
+ private static class CheckpointCommittables {
+
+ private final long checkpointId;
+ private final List committables;
+
+ private CheckpointCommittables(long checkpointId, List committables) {
+ this.checkpointId = checkpointId;
+ this.committables = committables;
+ }
+
+ private long checkpointId() {
+ return checkpointId;
+ }
+
+ private List committables() {
+ return committables;
+ }
+ }
+
+ /** Serializer for checkpoint committables. */
+ private static class CheckpointCommittablesSerializer {
+
+ private final CommittableSerializer committableSerializer;
+
+ private CheckpointCommittablesSerializer(CommittableSerializer committableSerializer) {
+ this.committableSerializer = committableSerializer;
+ }
+
+ private byte[] serialize(CheckpointCommittables checkpoint) throws IOException {
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ DataOutputViewStreamWrapper view = new DataOutputViewStreamWrapper(out);
+ view.writeLong(checkpoint.checkpointId());
+ view.writeInt(checkpoint.committables().size());
+ for (Committable committable : checkpoint.committables()) {
+ byte[] bytes = committableSerializer.serialize(committable);
+ view.writeInt(bytes.length);
+ view.write(bytes);
+ }
+ return out.toByteArray();
+ }
+
+ private CheckpointCommittables deserialize(byte[] serialized) throws IOException {
+ DataInputDeserializer view = new DataInputDeserializer(serialized);
+ long checkpointId = view.readLong();
+ int count = view.readInt();
+ if (count < 0) {
+ throw new IOException("Negative committable count: " + count);
+ }
+
+ List committables = new ArrayList<>(count);
+ for (int i = 0; i < count; i++) {
+ int length = view.readInt();
+ if (length < 0) {
+ throw new IOException("Negative committable length: " + length);
+ }
+ byte[] bytes = new byte[length];
+ view.readFully(bytes);
+ Committable committable =
+ committableSerializer.deserialize(
+ committableSerializer.getVersion(), bytes);
+ if (committable.checkpointId() != checkpointId) {
+ throw new IOException(
+ String.format(
+ "Committable checkpoint %s does not match state checkpoint %s.",
+ committable.checkpointId(), checkpointId));
+ }
+ committables.add(committable);
+ }
+ return new CheckpointCommittables(checkpointId, committables);
+ }
+ }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSender.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSender.java
new file mode 100644
index 000000000000..e5c3c0e70ef9
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSender.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.paimon.flink.sink.Committable;
+import org.apache.paimon.flink.sink.CommittableSerializer;
+import org.apache.paimon.table.sink.CommitMessageSerializer;
+
+import org.apache.flink.runtime.jobgraph.OperatorID;
+import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway;
+import org.apache.flink.runtime.operators.coordination.CoordinationRequest;
+import org.apache.flink.util.Preconditions;
+import org.apache.flink.util.SerializedValue;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+
+/**
+ * Sends writer committables to the Paimon writer coordinator.
+ *
+ * <p>Committables are packed into one byte array (count, then length-prefixed entries), wrapped
+ * in a {@link FileInfoRequest} and sent through the {@link TaskOperatorEventGateway}. Each send
+ * blocks until the coordinator's response has arrived and has been checked.
+ */
+public class CoordinatedFileInfoSender {
+
+    private static final int LENGTH_FIELD_SIZE = 4;
+    private static final int COUNT_FIELD_SIZE = 4;
+
+    private final TaskOperatorEventGateway gateway;
+    private final OperatorID operatorId;
+    private final CommittableSerializer serializer;
+
+    private int subtaskId;
+    private int attemptNumber;
+    private long watermark;
+    private boolean endInput;
+
+    /**
+     * Creates a sender; subtask id and attempt number start at -1 until set.
+     *
+     * @param gateway gateway used to reach the coordinator
+     * @param operatorId id of the operator whose coordinator receives the requests
+     */
+    public CoordinatedFileInfoSender(TaskOperatorEventGateway gateway, OperatorID operatorId) {
+        this.gateway = gateway;
+        this.operatorId = operatorId;
+        this.serializer = new CommittableSerializer(new CommitMessageSerializer());
+        this.subtaskId = -1;
+        this.attemptNumber = -1;
+        this.watermark = Long.MIN_VALUE;
+    }
+
+    /** Sets the subtask id put into subsequent requests. */
+    public void setSubtaskId(int subtaskId) {
+        this.subtaskId = subtaskId;
+    }
+
+    /** Sets the attempt number put into subsequent requests. */
+    public void setAttemptNumber(int attemptNumber) {
+        this.attemptNumber = attemptNumber;
+    }
+
+    /** Raises the watermark put into subsequent requests; {@code Long.MAX_VALUE} is ignored. */
+    public void processWatermark(long watermark) {
+        if (watermark != Long.MAX_VALUE) {
+            this.watermark = Math.max(this.watermark, watermark);
+        }
+    }
+
+    /** Returns whether a request with the end-input checkpoint id has been issued. */
+    public boolean isEndInput() {
+        return endInput;
+    }
+
+    /**
+     * Sends the committables of a checkpoint and waits for the coordinator's response.
+     *
+     * @param checkpointId checkpoint the committables belong to; {@link
+     *     CommitterCoordinator#END_INPUT_CHECKPOINT_ID} marks this sender as end-input
+     * @param committables committables to send, none of them null
+     * @throws RuntimeException if serialization or the request fails or is interrupted
+     * @throws IllegalStateException if the response does not match the request
+     */
+    public void sendToCoordinator(long checkpointId, List<Committable> committables) {
+        if (checkpointId == CommitterCoordinator.END_INPUT_CHECKPOINT_ID) {
+            endInput = true;
+        }
+        byte[] data = serializeCommittables(committables);
+        FileInfoRequest request =
+                FileInfoRequest.fileInfo(
+                        checkpointId,
+                        subtaskId,
+                        attemptNumber,
+                        watermark,
+                        data,
+                        committables.size());
+        sendRequest(request);
+    }
+
+    /**
+     * Sends committables as a recovered request carrying the commit user, and waits for the
+     * coordinator's response.
+     *
+     * @param checkpointId checkpoint the committables belong to
+     * @param commitUser commit user attached to the request
+     * @param committables committables to send, none of them null
+     * @throws RuntimeException if serialization or the request fails or is interrupted
+     * @throws IllegalStateException if the response does not match the request
+     */
+    public void sendRecoveredFileInfoToCoordinator(
+            long checkpointId, String commitUser, List<Committable> committables) {
+        byte[] data = serializeCommittables(committables);
+        sendRequest(
+                FileInfoRequest.recoveredFileInfo(
+                        checkpointId,
+                        subtaskId,
+                        attemptNumber,
+                        watermark,
+                        data,
+                        committables.size(),
+                        commitUser));
+    }
+
+    /** Sends the request, blocks for the response and checks its checkpoint and subtask id. */
+    private void sendRequest(FileInfoRequest request) {
+        try {
+            SerializedValue<CoordinationRequest> serializedRequest =
+                    new SerializedValue<>(request);
+            FileInfoReceivedResponse response =
+                    CoordinationResponseUtils.unwrap(
+                            gateway.sendRequestToCoordinator(operatorId, serializedRequest).get());
+            Preconditions.checkState(
+                    response.checkpointId() == request.checkpointId()
+                            && response.subtaskId() == request.subtaskId(),
+                    "Unexpected file info ACK response for checkpoint %s subtask %s: checkpoint %s subtask %s.",
+                    request.checkpointId(),
+                    request.subtaskId(),
+                    response.checkpointId(),
+                    response.subtaskId());
+        } catch (IOException | ExecutionException e) {
+            throw new RuntimeException("Failed to send file info to coordinator.", e);
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag before surfacing the failure.
+            Thread.currentThread().interrupt();
+            throw new RuntimeException("Interrupted while sending file info to coordinator.", e);
+        }
+    }
+
+    /** Packs the committables as: count (int), then per entry length (int) and bytes. */
+    private byte[] serializeCommittables(List<Committable> committables) {
+        try {
+            int totalBytes = COUNT_FIELD_SIZE;
+            List<byte[]> serializedCommittables = new ArrayList<>(committables.size());
+            for (Committable committable : committables) {
+                Preconditions.checkNotNull(committable, "Committable cannot be null");
+                byte[] serialized = serializer.serialize(committable);
+                serializedCommittables.add(serialized);
+                totalBytes += LENGTH_FIELD_SIZE + serialized.length;
+            }
+
+            byte[] result = new byte[totalBytes];
+            ByteBuffer resultBuffer = ByteBuffer.wrap(result);
+            resultBuffer.putInt(committables.size());
+            for (byte[] serialized : serializedCommittables) {
+                resultBuffer.putInt(serialized.length);
+                resultBuffer.put(serialized);
+            }
+            return result;
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to serialize committable.", e);
+        }
+    }
+
+    /**
+     * Unpacks committables from the byte layout written by this sender.
+     *
+     * @param data packed committables; {@code null} is treated as empty data
+     * @return the deserialized committables in their stored order
+     * @throws IOException if the count field, a length field or the payload is missing or
+     *     inconsistent with the remaining data
+     */
+    public static List<Committable> deserializeCommittables(byte[] data) throws IOException {
+        ByteBuffer buffer = ByteBuffer.wrap(data == null ? new byte[0] : data);
+        if (buffer.remaining() < COUNT_FIELD_SIZE) {
+            throw new IOException("Invalid committable data: missing count field.");
+        }
+
+        int count = buffer.getInt();
+        // Every entry needs at least its length field, so a negative count or one exceeding
+        // this bound cannot be valid. Checking first reports corrupted data as IOException
+        // instead of failing in the ArrayList constructor or pre-allocating an oversized list.
+        if (count < 0 || count > buffer.remaining() / LENGTH_FIELD_SIZE) {
+            throw new IOException("Invalid committable data: corrupted count field.");
+        }
+        List<Committable> result = new ArrayList<>(count);
+        CommittableSerializer serializer = new CommittableSerializer(new CommitMessageSerializer());
+        int version = serializer.getVersion();
+        for (int i = 0; i < count; i++) {
+            if (buffer.remaining() < LENGTH_FIELD_SIZE) {
+                throw new IOException("Invalid committable data: missing length field.");
+            }
+            int length = buffer.getInt();
+            if (length < 0 || length > buffer.remaining()) {
+                throw new IOException("Invalid committable data: corrupted length field.");
+            }
+            byte[] bytes = new byte[length];
+            buffer.get(bytes);
+            result.add(serializer.deserialize(version, bytes));
+        }
+
+        return result;
+    }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoReceivedResponse.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoReceivedResponse.java
new file mode 100644
index 000000000000..3cbaa45374c0
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoReceivedResponse.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.flink.runtime.operators.coordination.CoordinationResponse;
+
+/**
+ * ACK response after PWC handles a writer file info request.
+ *
+ * <p>Carries the checkpoint id and subtask id it is issued for.
+ */
+public class FileInfoReceivedResponse implements CoordinationResponse {
+
+    private static final long serialVersionUID = 1L;
+
+    private final int subtaskId;
+    private final long checkpointId;
+
+    /**
+     * Creates a response.
+     *
+     * @param checkpointId checkpoint id the response is issued for
+     * @param subtaskId subtask id the response is issued for
+     */
+    public FileInfoReceivedResponse(long checkpointId, int subtaskId) {
+        this.subtaskId = subtaskId;
+        this.checkpointId = checkpointId;
+    }
+
+    /** Returns the checkpoint id given at construction. */
+    public long checkpointId() {
+        return this.checkpointId;
+    }
+
+    /** Returns the subtask id given at construction. */
+    public int subtaskId() {
+        return this.subtaskId;
+    }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoRequest.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoRequest.java
new file mode 100644
index 000000000000..f5f50e9c2ff4
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoRequest.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.flink.runtime.operators.coordination.CoordinationRequest;
+
+import javax.annotation.Nullable;
+
+import java.util.Arrays;
+
+/**
+ * Request sent from writer subtasks to the Paimon writer coordinator.
+ *
+ * <p>Instances are created through {@link #fileInfo} or {@link #recoveredFileInfo}; the latter
+ * marks the request as recovered and attaches a commit user. A hash of the payload is computed
+ * once at construction and used by {@link #samePayload}.
+ */
+public class FileInfoRequest implements CoordinationRequest {
+
+    private static final long serialVersionUID = 1L;
+
+    private final long checkpointId;
+    private final int subtaskId;
+    private final int attemptNumber;
+    private final long watermark;
+    private final byte[] serializedData;
+    private final int committableCount;
+    private final boolean recovered;
+    private final @Nullable String commitUser;
+    private final int payloadHash;
+
+    /**
+     * Creates a regular (non-recovered) request without a commit user.
+     *
+     * @param checkpointId checkpoint the payload belongs to
+     * @param subtaskId id of the sending subtask
+     * @param attemptNumber attempt number of the sending subtask
+     * @param watermark watermark reported by the sender
+     * @param serializedData serialized payload; {@code null} is stored as an empty array
+     * @param committableCount number of committables reported for the payload
+     * @return the new request
+     */
+    public static FileInfoRequest fileInfo(
+            long checkpointId,
+            int subtaskId,
+            int attemptNumber,
+            long watermark,
+            byte[] serializedData,
+            int committableCount) {
+        final boolean isRecovered = false;
+        final String noCommitUser = null;
+        return new FileInfoRequest(
+                checkpointId,
+                subtaskId,
+                attemptNumber,
+                watermark,
+                serializedData,
+                committableCount,
+                isRecovered,
+                noCommitUser);
+    }
+
+    /**
+     * Creates a request flagged as recovered that also carries the commit user.
+     *
+     * @param checkpointId checkpoint the payload belongs to
+     * @param subtaskId id of the sending subtask
+     * @param attemptNumber attempt number of the sending subtask
+     * @param watermark watermark reported by the sender
+     * @param serializedData serialized payload; {@code null} is stored as an empty array
+     * @param committableCount number of committables reported for the payload
+     * @param commitUser commit user attached to the request
+     * @return the new request
+     */
+    public static FileInfoRequest recoveredFileInfo(
+            long checkpointId,
+            int subtaskId,
+            int attemptNumber,
+            long watermark,
+            byte[] serializedData,
+            int committableCount,
+            String commitUser) {
+        final boolean isRecovered = true;
+        return new FileInfoRequest(
+                checkpointId,
+                subtaskId,
+                attemptNumber,
+                watermark,
+                serializedData,
+                committableCount,
+                isRecovered,
+                commitUser);
+    }
+
+    private FileInfoRequest(
+            long checkpointId,
+            int subtaskId,
+            int attemptNumber,
+            long watermark,
+            byte[] serializedData,
+            int committableCount,
+            boolean recovered,
+            @Nullable String commitUser) {
+        this.checkpointId = checkpointId;
+        this.subtaskId = subtaskId;
+        this.attemptNumber = attemptNumber;
+        this.watermark = watermark;
+        // A missing payload is normalized to an empty array so later reads never see null.
+        if (serializedData == null) {
+            this.serializedData = new byte[0];
+        } else {
+            this.serializedData = serializedData;
+        }
+        this.committableCount = committableCount;
+        this.recovered = recovered;
+        this.commitUser = commitUser;
+        this.payloadHash = Arrays.hashCode(this.serializedData);
+    }
+
+    /** Returns the checkpoint id of this request. */
+    public long checkpointId() {
+        return this.checkpointId;
+    }
+
+    /** Returns the id of the sending subtask. */
+    public int subtaskId() {
+        return this.subtaskId;
+    }
+
+    /** Returns the attempt number of the sending subtask. */
+    public int attemptNumber() {
+        return this.attemptNumber;
+    }
+
+    /** Returns the watermark reported by the sender. */
+    public long watermark() {
+        return this.watermark;
+    }
+
+    /** Returns the stored payload array itself (not a copy); never null. */
+    public byte[] serializedData() {
+        return this.serializedData;
+    }
+
+    /** Returns the committable count reported for the payload. */
+    public int committableCount() {
+        return this.committableCount;
+    }
+
+    /** Returns whether this request was created by {@link #recoveredFileInfo}. */
+    public boolean recovered() {
+        return this.recovered;
+    }
+
+    /** Returns the commit user, or null if none was attached. */
+    public @Nullable String commitUser() {
+        return this.commitUser;
+    }
+
+    /**
+     * Checks whether another request carries the same payload.
+     *
+     * @param other request to compare with, may be null
+     * @return true if {@code other} is non-null and payload hash, committable count and payload
+     *     bytes are all equal
+     */
+    public boolean samePayload(FileInfoRequest other) {
+        if (other == null) {
+            return false;
+        }
+        // Cheap comparisons first; the byte-wise comparison runs only if both match.
+        if (payloadHash != other.payloadHash || committableCount != other.committableCount) {
+            return false;
+        }
+        return Arrays.equals(serializedData, other.serializedData);
+    }
+
+    @Override
+    public String toString() {
+        if (!recovered) {
+            return String.format(
+                    "FileInfoRequest{checkpoint=%d, subtask=%d, attempt=%d, count=%d, dataSize=%d bytes}",
+                    checkpointId,
+                    subtaskId,
+                    attemptNumber,
+                    committableCount,
+                    serializedData.length);
+        }
+        return String.format(
+                "FileInfoRequest{checkpoint=%d, recovered=true, subtask=%d, attempt=%d, "
+                        + "count=%d, dataSize=%d bytes, commitUser=%s}",
+                checkpointId,
+                subtaskId,
+                attemptNumber,
+                committableCount,
+                serializedData.length,
+                commitUser);
+    }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinator.java
new file mode 100644
index 000000000000..c9deac75e088
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinator.java
@@ -0,0 +1,425 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.paimon.flink.sink.Committable;
+import org.apache.paimon.flink.sink.Committer;
+import org.apache.paimon.flink.sink.TableWriteOperator;
+import org.apache.paimon.utils.ExceptionUtils;
+
+import org.apache.flink.runtime.jobgraph.OperatorID;
+import org.apache.flink.runtime.operators.coordination.CoordinationRequest;
+import org.apache.flink.runtime.operators.coordination.CoordinationRequestHandler;
+import org.apache.flink.runtime.operators.coordination.CoordinationResponse;
+import org.apache.flink.runtime.operators.coordination.OperatorCoordinator;
+import org.apache.flink.runtime.operators.coordination.OperatorEvent;
+import org.apache.flink.util.ThrowableCatchingRunnable;
+import org.apache.flink.util.function.ThrowingRunnable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.Nullable;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ThreadFactory;
+
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * {@link OperatorCoordinator} for {@link TableWriteOperator}. It receives writer file information
+ * and performs global commits in JobManager.
+ *
+ * <p>File information arrives as {@link FileInfoRequest} coordination requests; operator events
+ * are rejected. Subtask bookkeeping, request handling and checkpoint notifications are executed on
+ * a single-threaded executor created in {@link #start()}. An exception raised on that executor is
+ * logged and reported through {@link OperatorCoordinator.Context#failJob(Throwable)}.
+ */
+public class PaimonWriterCoordinator implements OperatorCoordinator, CoordinationRequestHandler {
+
+    private static final Logger LOG = LoggerFactory.getLogger(PaimonWriterCoordinator.class);
+
+    private final PendingSubtask pendingSubtask;
+    private final CoordinatorExecutorThreadFactory coordinatorThreadFactory;
+
+    /**
+     * Completed by the event loop once a checkpoint completion has been handled while {@code
+     * coordinator.isEndInput()} is true.
+     */
+    private final CompletableFuture<Void> finalCheckpointCompleted = new CompletableFuture<>();
+
+    private final OperatorCoordinator.Context context;
+    private final CommitterCoordinator coordinator;
+    private final String initialCommitUser;
+
+    /** Restored by {@link #resetToCheckpoint}, otherwise defaulted in {@link #start()}. */
+    private @Nullable String commitUser;
+
+    private ScheduledExecutorService coordinatorExecutor;
+    private boolean started;
+
+    /**
+     * Cleared by the first operator event, coordination request, checkpoint, checkpoint-complete
+     * or reset callback. State is only restored in {@link #resetToCheckpoint} while this is true.
+     */
+    private boolean freshInstance = true;
+
+    /**
+     * @param streamingCheckpointEnabled forwarded to the {@link CommitterCoordinator}
+     * @param initialCommitUser commit user used when none was restored from a checkpoint
+     * @param committerFactory forwarded to the {@link CommitterCoordinator}
+     * @param context coordinator context used for operator id, parallelism and job failure
+     * @param coordinatorThreadFactory factory of the single event-loop thread
+     * @param endInputWatermark forwarded to the {@link CommitterCoordinator}
+     */
+    public PaimonWriterCoordinator(
+            boolean streamingCheckpointEnabled,
+            String initialCommitUser,
+            Committer.Factory committerFactory,
+            OperatorCoordinator.Context context,
+            CoordinatorExecutorThreadFactory coordinatorThreadFactory,
+            Long endInputWatermark) {
+        this.context = context;
+        this.coordinatorThreadFactory = coordinatorThreadFactory;
+        this.initialCommitUser = initialCommitUser;
+        this.coordinator =
+                new CommitterCoordinator<>(
+                        streamingCheckpointEnabled, committerFactory, endInputWatermark);
+        this.pendingSubtask = new PendingSubtask(this.coordinator);
+    }
+
+    /** Creates the event-loop executor and initializes the committer and subtask tracking. */
+    @Override
+    public void start() throws Exception {
+        OperatorID operatorId = context.getOperatorId();
+        LOG.info("Paimon writer coordinator starting, operatorId={}", operatorId);
+        if (commitUser == null) {
+            commitUser = initialCommitUser;
+        }
+        started = true;
+        coordinatorExecutor = Executors.newScheduledThreadPool(1, coordinatorThreadFactory);
+        int parallelism = context.currentParallelism();
+        coordinator.init(parallelism, commitUser);
+        pendingSubtask.init(parallelism);
+    }
+
+    @Override
+    public void executionAttemptReady(int subtask, int attemptNumber, SubtaskGateway gateway) {
+        runInEventLoop(
+                () -> pendingSubtask.registerSubtask(subtask, attemptNumber, gateway),
+                "registering subtask %d attempt %d",
+                subtask,
+                attemptNumber);
+    }
+
+    @Override
+    public void executionAttemptFailed(int subtask, int attemptNumber, Throwable throwable) {
+        runInEventLoop(
+                () -> pendingSubtask.unregisterSubtask(subtask, attemptNumber, throwable),
+                "unregistering subtask %d attempt %d",
+                subtask,
+                attemptNumber);
+    }
+
+    /** Operator events are not part of the protocol; this always throws. */
+    @Override
+    public void handleEventFromOperator(int subtask, int attemptNumber, OperatorEvent event) {
+        freshInstance = false;
+        throw new UnsupportedOperationException(
+                "PWC only accepts file info through coordination requests.");
+    }
+
+    /**
+     * Dispatches {@link FileInfoRequest}s to the event loop. Any other request (including {@code
+     * null}) yields a future that is failed with an {@link IllegalArgumentException}.
+     */
+    @Override
+    public CompletableFuture<CoordinationResponse> handleCoordinationRequest(
+            CoordinationRequest request) {
+        freshInstance = false;
+        if (request instanceof FileInfoRequest) {
+            return handleFileInfoRequest((FileInfoRequest) request);
+        }
+        CompletableFuture<CoordinationResponse> result = new CompletableFuture<>();
+        // A null request must fail the future like any other unsupported request, not throw NPE.
+        result.completeExceptionally(
+                new IllegalArgumentException(
+                        "Unsupported request type: "
+                                + (request == null ? null : request.getClass())));
+        return result;
+    }
+
+    /** Snapshots the coordinator state, which consists of the commit user only. */
+    @Override
+    public void checkpointCoordinator(long checkpointId, CompletableFuture<byte[]> result) {
+        freshInstance = false;
+        LOG.info("PWC snapshot commitUser={}, checkpointId={}", commitUser, checkpointId);
+        checkState(commitUser != null, "PWC has not been started.");
+        result.complete(serializeCoordinatorState(commitUser));
+    }
+
+    /**
+     * Hands the completed checkpoint to the event loop. When the input has ended, the calling
+     * thread additionally waits until the event loop has marked the final checkpoint as completed.
+     *
+     * @throws RuntimeException if the wait is interrupted (the interrupt flag is restored) or if
+     *     the event-loop action for this checkpoint failed
+     */
+    @Override
+    public void notifyCheckpointComplete(long checkpointId) {
+        freshInstance = false;
+        // Signals a failure of THIS notification only. Without it, a failing commit would leave
+        // finalCheckpointCompleted incomplete forever and the wait below would never return.
+        // A per-call future is used so that an old failure cannot poison later notifications.
+        CompletableFuture<Void> attemptFailed = new CompletableFuture<>();
+        runInEventLoop(
+                () -> {
+                    try {
+                        handleCommitResult(pendingSubtask.notifyCheckpointComplete(checkpointId));
+                        if (coordinator.isEndInput()) {
+                            finalCheckpointCompleted.complete(null);
+                        }
+                    } catch (Throwable t) {
+                        attemptFailed.completeExceptionally(t);
+                        // Rethrow so that runInEventLoop still logs the error and fails the job.
+                        throw t;
+                    }
+                },
+                "notifying checkpoint %d complete",
+                checkpointId);
+        // NOTE(review): isEndInput() is read here on the caller thread while the event loop may
+        // be updating the committer; confirm that CommitterCoordinator makes this read safe.
+        if (coordinator.isEndInput()) {
+            try {
+                CompletableFuture.anyOf(finalCheckpointCompleted, attemptFailed).get();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                throw new RuntimeException(e);
+            } catch (ExecutionException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    @Override
+    public void notifyCheckpointAborted(long checkpointId) {
+        runInEventLoop(
+                () -> pendingSubtask.notifyCheckpointAborted(checkpointId),
+                "notifying checkpoint %d aborted",
+                checkpointId);
+    }
+
+    /**
+     * Restores the commit user and the restored checkpoint id. This only happens for an instance
+     * that is still fresh and not started, and only for a non-negative checkpoint id; every other
+     * call just clears the fresh flag.
+     */
+    @Override
+    public void resetToCheckpoint(long checkpointId, byte[] bytes) throws Exception {
+        LOG.info("PWC resetToCheckpoint: checkpointId={}, fresh={}", checkpointId, freshInstance);
+        if (freshInstance && checkpointId >= 0) {
+            checkState(!started, "PWC can only be restored before it is started.");
+            commitUser = deserializeCoordinatorState(bytes);
+            pendingSubtask.restoreCheckpoint(checkpointId);
+        }
+        freshInstance = false;
+    }
+
+    @Override
+    public void subtaskReset(int subtask, long checkpointId) {}
+
+    /** Releases tracking state and the committer, and always stops the event-loop executor. */
+    @Override
+    public void close() throws Exception {
+        try {
+            pendingSubtask.close();
+            coordinator.close();
+        } finally {
+            // Must run even if closing the committer fails, otherwise the thread is leaked.
+            if (coordinatorExecutor != null) {
+                coordinatorExecutor.shutdownNow();
+            }
+        }
+    }
+
+    /**
+     * Reacts to the outcome of a receive or checkpoint-complete step: nothing for an uncommitted
+     * result, a job failure when restored committables were actually committed, and a {@link
+     * CommitCompleteEvent} broadcast otherwise.
+     */
+    private void handleCommitResult(CommitResult result) {
+        if (!result.committed()) {
+            return;
+        }
+        if (result.restoredCommit() && result.committedCount() > 0) {
+            context.failJob(
+                    new RecommitRequiredException(result.checkpointId(), result.committedCount()));
+            return;
+        }
+        sendCommitCompleteEvent(result.checkpointId());
+    }
+
+    /** Sends a {@link CommitCompleteEvent} to every active gateway; a failed send fails the job. */
+    private void sendCommitCompleteEvent(long checkpointId) {
+        CommitCompleteEvent event = new CommitCompleteEvent(checkpointId);
+        for (SubtaskGateway gateway : pendingSubtask.activeGateways()) {
+            gateway.sendEvent(event)
+                    .whenComplete(
+                            (ignored, error) -> {
+                                if (error != null) {
+                                    context.failJob(error);
+                                }
+                            });
+        }
+    }
+
+    /**
+     * Processes one file info request on the event loop and returns a future for its response.
+     * Every failure completes the future exceptionally; failures other than an unknown subtask
+     * attempt are also rethrown so that the event loop fails the job.
+     */
+    private CompletableFuture<CoordinationResponse> handleFileInfoRequest(
+            FileInfoRequest request) {
+        ensureStarted();
+        CompletableFuture<CoordinationResponse> result = new CompletableFuture<>();
+        runInEventLoop(
+                () -> {
+                    try {
+                        if (!pendingSubtask.isValid(request.subtaskId(), request.attemptNumber())) {
+                            result.completeExceptionally(
+                                    new IllegalStateException(
+                                            String.format(
+                                                    "Received file info request from invalid subtask %d attempt %d.",
+                                                    request.subtaskId(), request.attemptNumber())));
+                            return;
+                        }
+                        if (request.recovered()) {
+                            validateCommitUser(request.commitUser());
+                        }
+                        handleCommitResult(pendingSubtask.receive(request.subtaskId(), request));
+                        result.complete(
+                                CoordinationResponseUtils.wrap(
+                                        new FileInfoReceivedResponse(
+                                                request.checkpointId(), request.subtaskId())));
+                    } catch (Throwable t) {
+                        result.completeExceptionally(t);
+                        throw t;
+                    }
+                },
+                "handling file info request %s",
+                request);
+        return result;
+    }
+
+    /** Verifies that a recovered writer reports the commit user this coordinator uses. */
+    private void validateCommitUser(@Nullable String recoveredCommitUser) {
+        checkState(commitUser != null, "PWC has not been started.");
+        checkState(recoveredCommitUser != null, "Recovered writer commit user is null.");
+        checkState(
+                commitUser.equals(recoveredCommitUser),
+                "Writer commit user %s does not match PWC commit user %s.",
+                recoveredCommitUser,
+                commitUser);
+    }
+
+    /** Encodes the commit user as a 4-byte length followed by its UTF-8 bytes. */
+    private static byte[] serializeCoordinatorState(String commitUser) {
+        byte[] commitUserBytes = commitUser.getBytes(StandardCharsets.UTF_8);
+        return ByteBuffer.allocate(Integer.BYTES + commitUserBytes.length)
+                .putInt(commitUserBytes.length)
+                .put(commitUserBytes)
+                .array();
+    }
+
+    /**
+     * Inverse of {@link #serializeCoordinatorState(String)}.
+     *
+     * @throws IllegalArgumentException if {@code bytes} is null, shorter than the length prefix,
+     *     or the prefix does not match the number of remaining bytes
+     */
+    private static String deserializeCoordinatorState(byte[] bytes) {
+        // Checkpoint data may be absent; report that as corrupted state rather than an NPE.
+        if (bytes == null || bytes.length < Integer.BYTES) {
+            throw new IllegalArgumentException("Corrupted PWC coordinator state.");
+        }
+
+        ByteBuffer buffer = ByteBuffer.wrap(bytes);
+        int commitUserLength = buffer.getInt();
+        if (commitUserLength < 0 || commitUserLength != buffer.remaining()) {
+            throw new IllegalArgumentException("Corrupted commit user in PWC coordinator state.");
+        }
+        byte[] commitUserBytes = new byte[commitUserLength];
+        buffer.get(commitUserBytes);
+        return new String(commitUserBytes, StandardCharsets.UTF_8);
+    }
+
+    /**
+     * Submits {@code action} to the event loop. A non-fatal throwable raised by the action is
+     * logged with the formatted action name and fails the job.
+     *
+     * @param actionName {@link String#format} pattern describing the action
+     * @param parameters arguments for {@code actionName}
+     */
+    private void runInEventLoop(
+            final ThrowingRunnable<Throwable> action,
+            final String actionName,
+            final Object... parameters) {
+        ensureStarted();
+        coordinatorExecutor.execute(
+                new ThrowableCatchingRunnable(
+                        throwable ->
+                                coordinatorThreadFactory.uncaughtException(
+                                        Thread.currentThread(), throwable),
+                        () -> {
+                            try {
+                                action.run();
+                            } catch (Throwable t) {
+                                ExceptionUtils.rethrowIfFatalErrorOrOOM(t);
+                                LOG.error(
+                                        "Uncaught exception in PWC while {}.",
+                                        String.format(actionName, parameters),
+                                        t);
+                                context.failJob(t);
+                            }
+                        }));
+    }
+
+    /**
+     * Runs {@code runnable} on the coordinator executor as-is, i.e. without the logging and
+     * job-failing wrapper applied to internal event-loop actions.
+     */
+    public void runInCoordinatorThread(Runnable runnable) {
+        ensureStarted();
+        coordinatorExecutor.execute(runnable);
+    }
+
+    private void ensureStarted() {
+        if (!started) {
+            throw new IllegalStateException("The coordinator has not started yet.");
+        }
+    }
+
+    /** Provider for {@link PaimonWriterCoordinator}. */
+    public static class WriterCoordinatorProvider implements OperatorCoordinator.Provider {
+
+        private static final long serialVersionUID = 1L;
+
+        private final boolean streamingCheckpointEnabled;
+        private final String operatorName;
+        private final OperatorID operatorId;
+        private final String initialCommitUser;
+        private final Committer.Factory committerFactory;
+        private final Long endInputWatermark;
+
+        public WriterCoordinatorProvider(
+                boolean streamingCheckpointEnabled,
+                String operatorName,
+                OperatorID operatorId,
+                String initialCommitUser,
+                Committer.Factory committerFactory,
+                Long endInputWatermark) {
+            this.streamingCheckpointEnabled = streamingCheckpointEnabled;
+            this.operatorName = operatorName;
+            this.operatorId = operatorId;
+            this.initialCommitUser = initialCommitUser;
+            this.committerFactory = committerFactory;
+            this.endInputWatermark = endInputWatermark;
+        }
+
+        @Override
+        public OperatorID getOperatorId() {
+            return operatorId;
+        }
+
+        /** Creates the coordinator together with its dedicated, operator-named thread factory. */
+        @Override
+        public OperatorCoordinator create(OperatorCoordinator.Context context) {
+            CoordinatorExecutorThreadFactory threadFactory =
+                    new CoordinatorExecutorThreadFactory(
+                            "PaimonWriterCoordinator-" + operatorName, context);
+            return new PaimonWriterCoordinator(
+                    streamingCheckpointEnabled,
+                    initialCommitUser,
+                    committerFactory,
+                    context,
+                    threadFactory,
+                    endInputWatermark);
+        }
+    }
+
+    /**
+     * Thread factory for the single coordinator event loop. It hands out exactly one thread, which
+     * uses the given class loader as context class loader, and forwards uncaught exceptions to the
+     * configured error handler.
+     */
+    public static class CoordinatorExecutorThreadFactory
+            implements ThreadFactory, Thread.UncaughtExceptionHandler {
+
+        private final String coordinatorThreadName;
+        private final ClassLoader classLoader;
+        private final Thread.UncaughtExceptionHandler errorHandler;
+
+        @Nullable private Thread thread;
+
+        /** Uses the user-code class loader of {@code context} and fails the job on errors. */
+        public CoordinatorExecutorThreadFactory(
+                String coordinatorThreadName, OperatorCoordinator.Context context) {
+            this(
+                    coordinatorThreadName,
+                    context.getUserCodeClassloader(),
+                    (thread, error) -> context.failJob(error));
+        }
+
+        CoordinatorExecutorThreadFactory(
+                String coordinatorThreadName,
+                ClassLoader classLoader,
+                Thread.UncaughtExceptionHandler errorHandler) {
+            this.coordinatorThreadName = coordinatorThreadName;
+            this.classLoader = classLoader;
+            this.errorHandler = errorHandler;
+        }
+
+        /**
+         * @throws IllegalStateException if a thread has already been created by this factory
+         */
+        @Override
+        public synchronized Thread newThread(Runnable runnable) {
+            checkState(thread == null, "CoordinatorExecutorThreadFactory can create one thread.");
+            thread = new Thread(runnable, coordinatorThreadName);
+            thread.setContextClassLoader(classLoader);
+            thread.setUncaughtExceptionHandler(this);
+            return thread;
+        }
+
+        @Override
+        public synchronized void uncaughtException(Thread thread, Throwable error) {
+            errorHandler.uncaughtException(thread, error);
+        }
+    }
+
+    /** Used to fail the job after restored committables have been committed. */
+    private static class RecommitRequiredException extends RuntimeException {
+
+        private static final long serialVersionUID = 1L;
+
+        private RecommitRequiredException(long checkpointId, int committedCount) {
+            super(
+                    String.format(
+                            "PWC committed %d restored committable(s) up to checkpoint %d. "
+                                    + "Triggering global recovery so writers continue from the "
+                                    + "latest Paimon snapshot.",
+                            committedCount, checkpointId));
+        }
+    }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingCheckpoint.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingCheckpoint.java
new file mode 100644
index 000000000000..40a2a6def447
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingCheckpoint.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.paimon.flink.sink.Committable;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeSet;
+
+/**
+ * Pending data and state for one checkpoint in PWC.
+ *
+ * <p>Holds at most one {@link SubtaskFileInfo} per subtask plus a flag telling whether the
+ * checkpoint has been staged.
+ */
+class PendingCheckpoint {
+
+    private final long checkpointId;
+    /** File information keyed by subtask index. */
+    private final Map<Integer, SubtaskFileInfo> fileInfos;
+    private boolean staged;
+
+    PendingCheckpoint(long checkpointId) {
+        this.checkpointId = checkpointId;
+        this.fileInfos = new HashMap<>();
+    }
+
+    long checkpointId() {
+        return checkpointId;
+    }
+
+    /**
+     * Records the committables a subtask reported for this checkpoint.
+     *
+     * @return {@code true} if the data was stored, {@code false} if the subtask had already
+     *     reported a request with the same payload (the new data is then ignored)
+     * @throws IllegalStateException if the subtask had already reported a different payload
+     */
+    boolean receive(int subtask, FileInfoRequest request, List<Committable> committables) {
+        SubtaskFileInfo previous = fileInfos.get(subtask);
+        if (previous != null) {
+            if (previous.request().samePayload(request)) {
+                return false;
+            }
+            throw new IllegalStateException(
+                    String.format(
+                            "Different FileInfoRequest received for checkpoint %d subtask %d.",
+                            checkpointId, subtask));
+        }
+
+        fileInfos.put(subtask, new SubtaskFileInfo(request, committables));
+        return true;
+    }
+
+    void removeSubtask(int subtask) {
+        fileInfos.remove(subtask);
+    }
+
+    boolean isEmpty() {
+        return fileInfos.isEmpty();
+    }
+
+    boolean staged() {
+        return staged;
+    }
+
+    void markStaged() {
+        staged = true;
+    }
+
+    /** Returns a copy of the recorded file infos, in no particular order. */
+    List<SubtaskFileInfo> fileInfos() {
+        return new ArrayList<>(fileInfos.values());
+    }
+
+    /** Returns the committables of all subtasks, concatenated in ascending subtask order. */
+    List<Committable> allCommittables() {
+        List<Committable> result = new ArrayList<>();
+        // Sorting the keys keeps the result independent of the HashMap iteration order.
+        for (Integer subtask : new TreeSet<>(fileInfos.keySet())) {
+            result.addAll(fileInfos.get(subtask).committables());
+        }
+        return result;
+    }
+
+    /**
+     * Returns, in {@link #allCommittables()} order, the committables whose own checkpoint id is
+     * strictly greater than the given one.
+     *
+     * @param checkpointId exclusive lower bound; note that it shadows this object's checkpoint id
+     */
+    List<Committable> committablesAfter(long checkpointId) {
+        List<Committable> result = new ArrayList<>();
+        for (Committable committable : allCommittables()) {
+            if (committable.checkpointId() > checkpointId) {
+                result.add(committable);
+            }
+        }
+        return result;
+    }
+
+    /** Largest watermark among the recorded requests, {@link Long#MIN_VALUE} if there are none. */
+    long maxWatermark() {
+        long watermark = Long.MIN_VALUE;
+        for (SubtaskFileInfo fileInfo : fileInfos.values()) {
+            watermark = Math.max(watermark, fileInfo.request().watermark());
+        }
+        return watermark;
+    }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingSubtask.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingSubtask.java
new file mode 100644
index 000000000000..8e87728363e5
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingSubtask.java
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.paimon.flink.sink.Committable;
+
+import org.apache.flink.runtime.operators.coordination.OperatorCoordinator;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableMap;
+import java.util.Set;
+import java.util.TreeMap;
+
+/**
+ * Tracks writer subtasks and pending checkpoint file information for PWC.
+ *
+ * <p>A received {@link FileInfoRequest} is recorded twice: as an "envelope" under the checkpoint
+ * id of the request itself, and, split by the checkpoint id of each contained committable, in a
+ * {@link PendingCheckpoint}. Once an envelope has been received from every subtask, all pending
+ * checkpoints up to that id are staged in the {@link CommitterCoordinator}.
+ */
+public class PendingSubtask {
+
+    /** Gateways of registered execution attempts, keyed by subtask index, then attempt number. */
+    private final Map<Integer, Map<Integer, OperatorCoordinator.SubtaskGateway>> registeredSubtasks;
+    /** Pending committables keyed by the checkpoint id they carry, in ascending order. */
+    private final NavigableMap<Long, PendingCheckpoint> checkpoints;
+    /** Received requests keyed by the checkpoint id of the request, then by subtask index. */
+    private final Map<Long, Map<Integer, FileInfoRequest>> pendingEnvelopes;
+    private final Set<Long> abortedCheckpoints;
+    private final CommitterCoordinator coordinator;
+
+    private int parallelism;
+    private long maxCommittedCheckpointId;
+    private long restoredCheckpointId;
+
+    public PendingSubtask(CommitterCoordinator coordinator) {
+        this.coordinator = coordinator;
+        this.registeredSubtasks = new HashMap<>();
+        this.checkpoints = new TreeMap<>();
+        this.pendingEnvelopes = new HashMap<>();
+        this.abortedCheckpoints = new HashSet<>();
+        this.maxCommittedCheckpointId = Long.MIN_VALUE;
+        this.restoredCheckpointId = Long.MIN_VALUE;
+    }
+
+    /** Sets the number of subtasks whose envelopes are required before staging. */
+    public void init(int parallelism) {
+        this.parallelism = parallelism;
+    }
+
+    /**
+     * Registers the gateway of an execution attempt. If the subtask already has gateways for
+     * other attempt numbers only, those are dropped together with the subtask's unstaged data.
+     */
+    public void registerSubtask(
+            int subtask, int attemptNumber, OperatorCoordinator.SubtaskGateway gateway) {
+        Map<Integer, OperatorCoordinator.SubtaskGateway> attempts =
+                registeredSubtasks.computeIfAbsent(subtask, ignored -> new HashMap<>());
+        if (!attempts.isEmpty() && !attempts.containsKey(attemptNumber)) {
+            attempts.clear();
+            removePendingSubtask(subtask);
+        }
+        attempts.put(attemptNumber, gateway);
+    }
+
+    /**
+     * Forgets the gateway of the given attempt and drops the subtask's unstaged data.
+     *
+     * @param throwable failure cause; currently not used
+     */
+    public void unregisterSubtask(int subtask, int attemptNumber, Throwable throwable) {
+        Map<Integer, OperatorCoordinator.SubtaskGateway> attempts =
+                registeredSubtasks.get(subtask);
+        if (attempts != null) {
+            attempts.remove(attemptNumber);
+        }
+        removePendingSubtask(subtask);
+    }
+
+    /** Tells whether the given attempt of the given subtask is currently registered. */
+    public boolean isValid(int subtask, int attemptNumber) {
+        Map<Integer, OperatorCoordinator.SubtaskGateway> attempts =
+                registeredSubtasks.get(subtask);
+        return attempts != null && attempts.containsKey(attemptNumber);
+    }
+
+    /** Returns a new collection with the gateways of all registered attempts. */
+    public Collection<OperatorCoordinator.SubtaskGateway> activeGateways() {
+        Collection<OperatorCoordinator.SubtaskGateway> gateways = new ArrayList<>();
+        for (Map<Integer, OperatorCoordinator.SubtaskGateway> attempts :
+                registeredSubtasks.values()) {
+            gateways.addAll(attempts.values());
+        }
+        return gateways;
+    }
+
+    /**
+     * Records a file info request and stages or commits when possible.
+     *
+     * <ul>
+     *   <li>A request for an already committed checkpoint is answered with a committed result of
+     *       zero committables.
+     *   <li>Otherwise the request is recorded; while envelopes are missing the result is {@link
+     *       CommitResult#NONE}.
+     *   <li>When the envelope is complete, checkpoints up to it are staged. Only for the restored
+     *       checkpoint they are also committed right away.
+     * </ul>
+     *
+     * @throws IllegalStateException if the subtask repeats an envelope, reports a different
+     *     payload for a checkpoint, or the restored checkpoint contains non-recovered file info
+     */
+    public CommitResult receive(int subtask, FileInfoRequest request) throws Exception {
+        long envelopeCheckpointId = request.checkpointId();
+        if (envelopeCheckpointId <= maxCommittedCheckpointId) {
+            return new CommitResult(true, 0, maxCommittedCheckpointId, false);
+        }
+
+        recordEnvelope(envelopeCheckpointId, subtask, request);
+        recordFileInfos(subtask, request);
+        if (!envelopeAllReceived(envelopeCheckpointId)) {
+            return CommitResult.NONE;
+        }
+
+        stageCheckpointsUpTo(envelopeCheckpointId);
+
+        if (envelopeCheckpointId != restoredCheckpointId) {
+            return CommitResult.NONE;
+        }
+
+        if (!envelopeAllRecovered(envelopeCheckpointId)) {
+            throw new IllegalStateException(
+                    String.format(
+                            "Restored checkpoint %d contains non-recovered file info.",
+                            restoredCheckpointId));
+        }
+
+        int committedCount = coordinator.filterAndCommitUpToCheckpoint(restoredCheckpointId);
+        maxCommittedCheckpointId = Math.max(maxCommittedCheckpointId, restoredCheckpointId);
+        cleanupCommittedCheckpoints(restoredCheckpointId);
+        return new CommitResult(true, committedCount, restoredCheckpointId, true);
+    }
+
+    /**
+     * Commits a completed checkpoint through the {@link CommitterCoordinator}.
+     *
+     * @return a committed result; for an already committed checkpoint it carries the highest
+     *     committed checkpoint id instead of {@code checkpointId}
+     * @throws IllegalStateException if the envelope of the checkpoint is incomplete or a pending
+     *     checkpoint up to it has not been staged
+     */
+    public CommitResult notifyCheckpointComplete(long checkpointId) throws Exception {
+        if (checkpointId <= maxCommittedCheckpointId) {
+            return new CommitResult(true, 0, maxCommittedCheckpointId, false);
+        }
+        if (!stagedEnvelope(checkpointId)) {
+            throw new IllegalStateException(
+                    String.format(
+                            "Checkpoint %d completed before PWC staged file info from all subtasks.",
+                            checkpointId));
+        }
+
+        coordinator.notifyCheckpointComplete(checkpointId);
+        maxCommittedCheckpointId = Math.max(maxCommittedCheckpointId, checkpointId);
+        cleanupCommittedCheckpoints(checkpointId);
+        return new CommitResult(true, 0, checkpointId, false);
+    }
+
+    /** Remembers the aborted checkpoint id and forwards the abort to the committer. */
+    public void notifyCheckpointAborted(long checkpointId) {
+        abortedCheckpoints.add(checkpointId);
+        coordinator.notifyCheckpointAborted(checkpointId);
+    }
+
+    /** Sets the checkpoint id whose completed envelope triggers an immediate commit. */
+    public void restoreCheckpoint(long checkpointId) {
+        restoredCheckpointId = checkpointId;
+    }
+
+    /** Stores the envelope of a subtask; a second envelope for the same checkpoint is an error. */
+    private void recordEnvelope(long checkpointId, int subtask, FileInfoRequest request) {
+        Map<Integer, FileInfoRequest> envelope =
+                pendingEnvelopes.computeIfAbsent(checkpointId, ignored -> new HashMap<>());
+        if (envelope.containsKey(subtask)) {
+            throw new IllegalStateException(
+                    String.format(
+                            "Repeated file info envelope received for checkpoint %d subtask %d.",
+                            checkpointId, subtask));
+        }
+        envelope.put(subtask, request);
+    }
+
+    /**
+     * Deserializes the request payload and files each committable under its own checkpoint id,
+     * skipping checkpoints that have already been committed.
+     */
+    private void recordFileInfos(int subtask, FileInfoRequest request) throws Exception {
+        Map<Long, List<Committable>> committablesByCheckpoint = new TreeMap<>();
+        for (Committable committable :
+                CoordinatedFileInfoSender.deserializeCommittables(request.serializedData())) {
+            committablesByCheckpoint
+                    .computeIfAbsent(committable.checkpointId(), ignored -> new ArrayList<>())
+                    .add(committable);
+        }
+        for (Map.Entry<Long, List<Committable>> entry : committablesByCheckpoint.entrySet()) {
+            if (entry.getKey() > maxCommittedCheckpointId) {
+                checkpoint(entry.getKey()).receive(subtask, request, entry.getValue());
+            }
+        }
+    }
+
+    private PendingCheckpoint checkpoint(long checkpointId) {
+        return checkpoints.computeIfAbsent(
+                checkpointId, ignored -> new PendingCheckpoint(checkpointId));
+    }
+
+    /** Tells whether every subtask index in {@code [0, parallelism)} has sent its envelope. */
+    private boolean envelopeAllReceived(long checkpointId) {
+        Map<Integer, FileInfoRequest> receivedSubtasks = pendingEnvelopes.get(checkpointId);
+        if (receivedSubtasks == null) {
+            return false;
+        }
+        // Direct key checks avoid building a set of expected subtask indexes on every call.
+        for (int subtask = 0; subtask < parallelism; subtask++) {
+            if (!receivedSubtasks.containsKey(subtask)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Tells whether the envelope is complete and consists of recovered requests only, and every
+     * staged checkpoint up to {@code checkpointId} holds recovered file info only.
+     */
+    private boolean envelopeAllRecovered(long checkpointId) {
+        if (!envelopeAllReceived(checkpointId)) {
+            return false;
+        }
+        for (FileInfoRequest request : pendingEnvelopes.get(checkpointId).values()) {
+            if (!request.recovered()) {
+                return false;
+            }
+        }
+        for (PendingCheckpoint checkpoint : checkpoints.headMap(checkpointId, true).values()) {
+            if (!checkpoint.staged()) {
+                continue;
+            }
+            for (SubtaskFileInfo fileInfo : checkpoint.fileInfos()) {
+                if (!fileInfo.request().recovered()) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    /** Saves every not yet staged checkpoint up to {@code checkpointId}, in ascending order. */
+    private void stageCheckpointsUpTo(long checkpointId) throws Exception {
+        for (PendingCheckpoint checkpoint : checkpoints.headMap(checkpointId, true).values()) {
+            if (!checkpoint.staged()) {
+                saveCheckpoint(checkpoint);
+                checkpoint.markStaged();
+            }
+        }
+    }
+
+    private boolean stagedEnvelope(long checkpointId) {
+        return envelopeAllReceived(checkpointId)
+                && checkpoints.headMap(checkpointId, true).values().stream()
+                        .allMatch(PendingCheckpoint::staged);
+    }
+
+    private void saveCheckpoint(PendingCheckpoint checkpoint) throws Exception {
+        coordinator.save(
+                checkpoint.committablesAfter(maxCommittedCheckpointId),
+                checkpoint.checkpointId(),
+                checkpoint.maxWatermark());
+    }
+
+    /** Drops all bookkeeping for checkpoint ids up to and including {@code checkpointId}. */
+    private void cleanupCommittedCheckpoints(long checkpointId) {
+        checkpoints.headMap(checkpointId, true).clear();
+        pendingEnvelopes.keySet().removeIf(id -> id <= checkpointId);
+        abortedCheckpoints.removeIf(id -> id <= checkpointId);
+    }
+
+    /**
+     * Removes what the subtask contributed to unstaged checkpoints and to all envelopes, and
+     * discards checkpoints and envelopes that became empty. Staged checkpoints are left alone.
+     */
+    private void removePendingSubtask(int subtask) {
+        for (PendingCheckpoint checkpoint : checkpoints.values()) {
+            if (!checkpoint.staged()) {
+                checkpoint.removeSubtask(subtask);
+            }
+        }
+        checkpoints
+                .entrySet()
+                .removeIf(entry -> !entry.getValue().staged() && entry.getValue().isEmpty());
+        for (Map<Integer, FileInfoRequest> subtasks : pendingEnvelopes.values()) {
+            subtasks.remove(subtask);
+        }
+        pendingEnvelopes.entrySet().removeIf(entry -> entry.getValue().isEmpty());
+    }
+
+    /** Clears all tracked subtasks, checkpoints, envelopes and aborted checkpoint ids. */
+    public void close() {
+        registeredSubtasks.clear();
+        checkpoints.clear();
+        pendingEnvelopes.clear();
+        abortedCheckpoints.clear();
+    }
+}
diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/SubtaskFileInfo.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/SubtaskFileInfo.java
new file mode 100644
index 000000000000..9b95144c5360
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/SubtaskFileInfo.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.paimon.flink.sink.Committable;
+
+import java.util.List;
+
+/**
+ * File information reported by one writer subtask for one checkpoint: the request it arrived in
+ * and the committables taken from it. Both references are stored as given, without copying.
+ */
+class SubtaskFileInfo {
+
+    private final FileInfoRequest request;
+    private final List<Committable> committables;
+
+    SubtaskFileInfo(FileInfoRequest request, List<Committable> committables) {
+        this.request = request;
+        this.committables = committables;
+    }
+
+    FileInfoRequest request() {
+        return request;
+    }
+
+    List<Committable> committables() {
+        return committables;
+    }
+}
diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableStateTest.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableStateTest.java
new file mode 100644
index 000000000000..087587b7ff5a
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableStateTest.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.paimon.data.BinaryRow;
+import org.apache.paimon.flink.sink.Committable;
+import org.apache.paimon.io.CompactIncrement;
+import org.apache.paimon.io.DataIncrement;
+import org.apache.paimon.table.sink.CommitMessageImpl;
+
+import org.apache.flink.api.common.state.ListState;
+import org.apache.flink.api.common.state.ListStateDescriptor;
+import org.apache.flink.api.common.state.OperatorStateStore;
+import org.apache.flink.runtime.state.StateInitializationContext;
+import org.junit.jupiter.api.Test;
+import org.mockito.ArgumentCaptor;
+import org.mockito.Mockito;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for {@link CoordinatedCommittableState}. */
+class CoordinatedCommittableStateTest {
+
+    /** A snapshot without committables must still be written and restored as an empty entry. */
+    @Test
+    @SuppressWarnings({"unchecked", "rawtypes"})
+    void testSnapshotKeepsEmptyCheckpointForRestore() throws Exception {
+        StateInitializationContext context = context(Collections.emptyList(), false);
+        ListState<byte[]> flinkState = committableState(context);
+
+        CoordinatedCommittableState state = new CoordinatedCommittableState();
+        state.initialize(context);
+        state.snapshot(1L);
+
+        assertThat(state.pendingCommittables()).containsOnlyKeys(1L);
+        assertThat(state.pendingCommittables().get(1L)).isEmpty();
+
+        ArgumentCaptor<List<byte[]>> serializedState = ArgumentCaptor.forClass(List.class);
+        Mockito.verify(flinkState).update(serializedState.capture());
+
+        // Feed the captured bytes into a second instance to emulate a restore.
+        CoordinatedCommittableState restored = new CoordinatedCommittableState();
+        restored.initialize(context(serializedState.getValue(), true));
+        assertThat(restored.pendingCommittables()).containsOnlyKeys(1L);
+        assertThat(restored.pendingCommittables().get(1L)).isEmpty();
+    }
+
+    /** Checkpoints snapshotted out of order are exposed in ascending checkpoint order. */
+    @Test
+    void testPendingCheckpointsAreReturnedInAscendingOrder() throws Exception {
+        StateInitializationContext context = context(Collections.emptyList(), false);
+
+        CoordinatedCommittableState state = new CoordinatedCommittableState();
+        state.initialize(context);
+        state.snapshot(3L);
+        state.snapshot(1L);
+        state.snapshot(2L);
+
+        assertThat(new ArrayList<>(state.pendingCommittables().keySet()))
+                .containsExactly(1L, 2L, 3L);
+        assertThat(state.pendingCommittables().get(1L)).isEmpty();
+        assertThat(state.pendingCommittables().get(2L)).isEmpty();
+        assertThat(state.pendingCommittables().get(3L)).isEmpty();
+    }
+
+    /** Acknowledged committables are no longer reported; committed checkpoints are dropped. */
+    @Test
+    void testOnlyUnacknowledgedCommittablesAreReported() throws Exception {
+        StateInitializationContext context = context(Collections.emptyList(), false);
+
+        CoordinatedCommittableState state = new CoordinatedCommittableState();
+        state.initialize(context);
+        Committable ck1 = committable(1L);
+        Committable ck2 = committable(2L);
+        state.add(ck1);
+        state.add(ck2);
+
+        assertThat(state.unacknowledgedCommittables()).containsExactly(ck1, ck2);
+
+        state.markAcknowledged(Collections.singletonList(ck1));
+        assertThat(state.unacknowledgedCommittables()).containsExactly(ck2);
+
+        state.markCommittedUpTo(1L);
+        assertThat(state.pendingCommittables()).containsOnlyKeys(2L);
+        assertThat(state.unacknowledgedCommittables()).containsExactly(ck2);
+    }
+
+    /**
+     * Builds a mocked initialization context whose operator state store serves a single mocked
+     * list state, registered under the name {@code pwc_pending_committables}.
+     *
+     * @param committables content returned by the list state
+     * @param restored value reported by {@code isRestored()}
+     */
+    @SuppressWarnings({"unchecked", "rawtypes"})
+    private StateInitializationContext context(Iterable<byte[]> committables, boolean restored)
+            throws Exception {
+        StateInitializationContext context = Mockito.mock(StateInitializationContext.class);
+        OperatorStateStore operatorStateStore = Mockito.mock(OperatorStateStore.class);
+        ListState<byte[]> committableState = Mockito.mock(ListState.class);
+        Mockito.when(context.getOperatorStateStore()).thenReturn(operatorStateStore);
+        Mockito.when(context.isRestored()).thenReturn(restored);
+        Mockito.when(committableState.get()).thenReturn(committables);
+        Mockito.when(operatorStateStore.getListState(Mockito.any(ListStateDescriptor.class)))
+                .thenAnswer(
+                        invocation -> {
+                            ListStateDescriptor<?> descriptor = invocation.getArgument(0);
+                            if ("pwc_pending_committables".equals(descriptor.getName())) {
+                                return committableState;
+                            }
+                            // Any other state name means the class under test changed its layout.
+                            throw new IllegalArgumentException(
+                                    "Unexpected state " + descriptor.getName());
+                        });
+        return context;
+    }
+
+    /** Looks up the mocked list state that {@link #context} registered. */
+    @SuppressWarnings({"unchecked", "rawtypes"})
+    private ListState<byte[]> committableState(StateInitializationContext context)
+            throws Exception {
+        OperatorStateStore operatorStateStore = context.getOperatorStateStore();
+        ListStateDescriptor<byte[]> descriptor =
+                new ListStateDescriptor<>("pwc_pending_committables", byte[].class);
+        return operatorStateStore.getListState(descriptor);
+    }
+
+    /** Creates a committable for the checkpoint that wraps an empty commit message. */
+    private Committable committable(long checkpointId) {
+        return new Committable(
+                checkpointId,
+                new CommitMessageImpl(
+                        BinaryRow.EMPTY_ROW,
+                        0,
+                        null,
+                        DataIncrement.emptyIncrement(),
+                        CompactIncrement.emptyIncrement()));
+    }
+}
diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSenderTest.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSenderTest.java
new file mode 100644
index 000000000000..41c20a82532a
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSenderTest.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.paimon.data.BinaryRow;
+import org.apache.paimon.flink.sink.Committable;
+import org.apache.paimon.io.CompactIncrement;
+import org.apache.paimon.io.DataIncrement;
+import org.apache.paimon.table.sink.CommitMessageImpl;
+
+import org.apache.flink.runtime.jobgraph.OperatorID;
+import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway;
+import org.apache.flink.runtime.operators.coordination.CoordinationRequest;
+import org.apache.flink.runtime.operators.coordination.CoordinationResponse;
+import org.apache.flink.util.SerializedValue;
+import org.junit.jupiter.api.Test;
+import org.mockito.ArgumentCaptor;
+import org.mockito.Mockito;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+/** Tests for {@link CoordinatedFileInfoSender}. */
+class CoordinatedFileInfoSenderTest {
+
+ @Test
+ void testSendWaitsForAckBeforeReturning() throws Exception {
+ TaskOperatorEventGateway gateway = Mockito.mock(TaskOperatorEventGateway.class);
+ OperatorID operatorId = new OperatorID();
+ CompletableFuture<CoordinationResponse> ack = new CompletableFuture<>();
+ Mockito.when(gateway.sendRequestToCoordinator(Mockito.eq(operatorId), Mockito.any()))
+ .thenReturn(ack);
+ // Sender under test; the stubbed gateway above answers with a future that is still open.
+ CoordinatedFileInfoSender sender = new CoordinatedFileInfoSender(gateway, operatorId);
+ sender.setSubtaskId(3);
+ sender.setAttemptNumber(4);
+ // Run the send on a separate thread so this thread can observe whether it blocks.
+ ExecutorService executor = Executors.newSingleThreadExecutor();
+ try {
+ Future<?> send =
+ executor.submit(
+ () ->
+ sender.sendToCoordinator(
+ 1L, Collections.singletonList(committable(1L))));
+ // Give the background send a moment to start; without an ack it must not be done.
+ Thread.sleep(100L);
+ assertThat(send.isDone()).isFalse();
+ // Completing the ack lets the send finish; get() surfaces any failure or timeout.
+ ack.complete(ackResponse(1L, 3));
+ send.get(5, TimeUnit.SECONDS);
+ } finally {
+ executor.shutdownNow();
+ }
+ }
+
+ @Test
+ void testSendEmptyFileInfo() throws Exception {
+ TaskOperatorEventGateway gateway = Mockito.mock(TaskOperatorEventGateway.class);
+ OperatorID operatorId = new OperatorID();
+ Mockito.when(gateway.sendRequestToCoordinator(Mockito.eq(operatorId), Mockito.any()))
+ .thenReturn(CompletableFuture.completedFuture(ackResponse(1L, 3)));
+ // Send an empty committable list for checkpoint 1 as subtask 3, attempt 4.
+ CoordinatedFileInfoSender sender = new CoordinatedFileInfoSender(gateway, operatorId);
+ sender.setSubtaskId(3);
+ sender.setAttemptNumber(4);
+ sender.sendToCoordinator(1L, Collections.emptyList());
+ // Capture the one request that reached the gateway and decode it.
+ ArgumentCaptor captor = ArgumentCaptor.forClass(SerializedValue.class);
+ Mockito.verify(gateway).sendRequestToCoordinator(Mockito.eq(operatorId), captor.capture());
+ FileInfoRequest request = deserializeRequest(captor.getValue());
+ // The request echoes the sender's identity and carries an empty committable payload.
+ assertThat(request.checkpointId()).isEqualTo(1L);
+ assertThat(request.subtaskId()).isEqualTo(3);
+ assertThat(request.attemptNumber()).isEqualTo(4);
+ assertThat(request.committableCount()).isEqualTo(0);
+ assertThat(CoordinatedFileInfoSender.deserializeCommittables(request.serializedData()))
+ .isEmpty();
+ }
+
+ @Test
+ void testFailedSendCanBeRetriedWithSameCommittables() throws Exception {
+ TaskOperatorEventGateway gateway = Mockito.mock(TaskOperatorEventGateway.class);
+ OperatorID operatorId = new OperatorID();
+ CompletableFuture failed = new CompletableFuture<>();
+ failed.completeExceptionally(new RuntimeException("send failed"));
+ Mockito.when(gateway.sendRequestToCoordinator(Mockito.eq(operatorId), Mockito.any()))
+ .thenReturn(failed)
+ .thenReturn(CompletableFuture.completedFuture(ackResponse(1L, 3)));
+ // Sender under test plus one committable list that is reused for both attempts.
+ CoordinatedFileInfoSender sender = new CoordinatedFileInfoSender(gateway, operatorId);
+ sender.setSubtaskId(3);
+ sender.setAttemptNumber(4);
+ List committables = Collections.singletonList(committable(1L));
+ // First attempt: the stubbed gateway failure must surface to the caller.
+ assertThatThrownBy(() -> sender.sendToCoordinator(1L, committables))
+ .isInstanceOf(RuntimeException.class)
+ .hasRootCauseMessage("send failed");
+ // Second attempt with the very same list gets the acknowledged future.
+ sender.sendToCoordinator(1L, committables);
+ // Both calls reached the gateway; inspect the second captured request, i.e. the retry.
+ ArgumentCaptor captor = ArgumentCaptor.forClass(SerializedValue.class);
+ Mockito.verify(gateway, Mockito.times(2))
+ .sendRequestToCoordinator(Mockito.eq(operatorId), captor.capture());
+ FileInfoRequest retryRequest = deserializeRequest(captor.getAllValues().get(1));
+ // The retry carries the same checkpoint, subtask, attempt and committable count.
+ assertThat(retryRequest.checkpointId()).isEqualTo(1L);
+ assertThat(retryRequest.subtaskId()).isEqualTo(3);
+ assertThat(retryRequest.attemptNumber()).isEqualTo(4);
+ assertThat(retryRequest.committableCount()).isEqualTo(1);
+ // Its payload still decodes to the single committable for checkpoint 1.
+ List retryCommittables =
+ CoordinatedFileInfoSender.deserializeCommittables(retryRequest.serializedData());
+ assertThat(retryCommittables).hasSize(1);
+ assertThat(retryCommittables.get(0).checkpointId()).isEqualTo(1L);
+ }
+
+ @Test
+ void testRecoveredFileInfoUsesSingleRequest() throws Exception {
+ TaskOperatorEventGateway gateway = Mockito.mock(TaskOperatorEventGateway.class);
+ OperatorID operatorId = new OperatorID();
+ Mockito.when(gateway.sendRequestToCoordinator(Mockito.eq(operatorId), Mockito.any()))
+ .thenReturn(CompletableFuture.completedFuture(ackResponse(1L, 3)));
+ // Send one recovered committable for checkpoint 1 together with the commit user.
+ CoordinatedFileInfoSender sender = new CoordinatedFileInfoSender(gateway, operatorId);
+ sender.setSubtaskId(3);
+ sender.setAttemptNumber(4);
+ sender.sendRecoveredFileInfoToCoordinator(
+ 1L, "commit-user", Collections.singletonList(committable(1L)));
+ // Exactly one gateway call is expected, flagged as recovered and carrying all metadata.
+ ArgumentCaptor captor = ArgumentCaptor.forClass(SerializedValue.class);
+ Mockito.verify(gateway).sendRequestToCoordinator(Mockito.eq(operatorId), captor.capture());
+ FileInfoRequest request = deserializeRequest(captor.getValue());
+ assertThat(request.recovered()).isTrue();
+ assertThat(request.checkpointId()).isEqualTo(1L);
+ assertThat(request.subtaskId()).isEqualTo(3);
+ assertThat(request.attemptNumber()).isEqualTo(4);
+ assertThat(request.commitUser()).isEqualTo("commit-user");
+ assertThat(request.committableCount()).isEqualTo(1);
+ assertThat(CoordinatedFileInfoSender.deserializeCommittables(request.serializedData()))
+ .hasSize(1);
+ }
+
+ private Committable committable(long checkpointId) {
+ CommitMessageImpl message =
+ new CommitMessageImpl(
+ BinaryRow.EMPTY_ROW,
+ 0,
+ null,
+ DataIncrement.emptyIncrement(),
+ CompactIncrement.emptyIncrement());
+ return new Committable(checkpointId, message);
+ }
+
+ private CoordinationResponse ackResponse(long checkpointId, int subtaskId) {
+ FileInfoReceivedResponse ack = new FileInfoReceivedResponse(checkpointId, subtaskId);
+ return CoordinationResponseUtils.wrap(ack);
+ }
+
+ @SuppressWarnings("unchecked")
+ private FileInfoRequest deserializeRequest(SerializedValue serializedValue) throws Exception {
+ // Deserialize with the current thread's context class loader.
+ ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
+ return (FileInfoRequest) ((SerializedValue) serializedValue).deserializeValue(classLoader);
+ }
+}
diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorITCase.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorITCase.java
new file mode 100644
index 000000000000..9a41c9df01f1
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorITCase.java
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.paimon.Snapshot;
+import org.apache.paimon.flink.CatalogITCaseBase;
+import org.apache.paimon.flink.sink.FlinkSinkBuilder;
+import org.apache.paimon.flink.source.AbstractNonCoordinatedSource;
+import org.apache.paimon.flink.source.AbstractNonCoordinatedSourceReader;
+import org.apache.paimon.flink.source.SimpleSourceSplit;
+
+import org.apache.flink.api.common.JobID;
+import org.apache.flink.api.common.JobStatus;
+import org.apache.flink.api.common.eventtime.WatermarkStrategy;
+import org.apache.flink.api.connector.source.Boundedness;
+import org.apache.flink.api.connector.source.ReaderOutput;
+import org.apache.flink.api.connector.source.SourceReader;
+import org.apache.flink.api.connector.source.SourceReaderContext;
+import org.apache.flink.api.dag.Transformation;
+import org.apache.flink.core.execution.JobClient;
+import org.apache.flink.core.io.InputStatus;
+import org.apache.flink.runtime.execution.ExecutionState;
+import org.apache.flink.runtime.minicluster.MiniCluster;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.data.GenericRowData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.StringData;
+import org.apache.flink.table.runtime.typeutils.InternalTypeInfo;
+import org.apache.flink.table.types.logical.IntType;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.table.types.logical.VarCharType;
+import org.apache.flink.types.Row;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.Timeout;
+
+import java.lang.reflect.Field;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Integration tests for {@link PaimonWriterCoordinator}. */
+@SuppressWarnings("BusyWait")
+public class PaimonWriterCoordinatorITCase extends CatalogITCaseBase {
+
+ private static final String MINI_CLUSTER_FIELD = "miniCluster";
+ private static final RowType ROW_TYPE =
+ new RowType(
+ Arrays.asList(
+ new RowType.RowField("k", new IntType()),
+ new RowType.RowField("v", new VarCharType())));
+
+ @Override
+ protected List ddl() {
+ return Arrays.asList(
+ "CREATE TABLE unaware_table (k INT, v STRING) WITH ("
+ + "'bucket'='-1'," // no primary key, bucket -1
+ + "'sink.committer-coordinator-operator.enabled'='true')",
+ "CREATE TABLE fixed_table (k INT, v STRING) WITH ("
+ + "'bucket'='1'," // a single bucket, keyed by k
+ + "'bucket-key'='k',"
+ + "'sink.committer-coordinator-operator.enabled'='true')",
+ "CREATE TABLE dynamic_table (k INT PRIMARY KEY NOT ENFORCED, v STRING) WITH ("
+ + "'bucket'='-1'," // primary key with bucket -1
+ + "'sink.committer-coordinator-operator.enabled'='true')");
+ }
+
+ @Test
+ @Timeout(120) // JUnit's default unit for @Timeout is seconds
+ public void testStreamingCheckpointWriteUnawareTableWithWriterCoordinator() throws Exception {
+ testStreamingCheckpointWriteWithWriterCoordinator("unaware_table");
+ }
+
+ @Test
+ public void testFixedTableIgnoresWriterCoordinatorOption() throws Exception {
+ assertUsesGlobalCommitter(buildPaimonSink("fixed_table"), "fixed_table"); // no job run
+ }
+
+ @Test
+ public void testDynamicTableIgnoresWriterCoordinatorOption() throws Exception {
+ assertUsesGlobalCommitter(buildPaimonSink("dynamic_table"), "dynamic_table"); // no job run
+ }
+
+ private void testStreamingCheckpointWriteWithWriterCoordinator(String tableName)
+ throws Exception {
+ StreamExecutionEnvironment env = buildPaimonSink(tableName);
+ assertThat(transformationNames(env)).doesNotContain("Global Committer : " + tableName);
+ // Run the job until a snapshot reports at least four records, then cancel it.
+ JobClient jobClient = env.executeAsync();
+ triggerCheckpointAndWaitForWrites(jobClient, tableName, 4);
+ jobClient.cancel().get();
+ // The table must contain exactly the four rows listed below, in any order.
+ sqlAssertWithRetry(
+ "SELECT * FROM " + tableName,
+ rows ->
+ rows.containsExactlyInAnyOrder(
+ Row.of(1, "one"),
+ Row.of(2, "two"),
+ Row.of(3, "three"),
+ Row.of(4, "four")));
+ }
+
+ private StreamExecutionEnvironment buildPaimonSink(String tableName) throws Exception {
+ StreamExecutionEnvironment env =
+ streamExecutionEnvironmentBuilder()
+ .streamingMode()
+ .parallelism(2)
+ .checkpointIntervalMs(100)
+ .build();
+ // Attach the Paimon sink for the table to the test source, which runs at parallelism 1.
+ new FlinkSinkBuilder(paimonTable(tableName))
+ .forRowData(
+ env.fromSource(
+ new EmitOnceAndWaitSource(),
+ WatermarkStrategy.noWatermarks(),
+ "EmitOnceAndWaitSource",
+ InternalTypeInfo.of(ROW_TYPE))
+ .setParallelism(1))
+ .build();
+ return env;
+ }
+
+ private void assertUsesGlobalCommitter(StreamExecutionEnvironment env, String tableName) {
+ assertThat(transformationNames(env)).contains("Global Committer : " + tableName);
+ }
+
+ private List<String> transformationNames(StreamExecutionEnvironment env) {
+ List<String> names = new ArrayList<>();
+ List<Transformation<?>> pending = new ArrayList<>(env.getTransformations());
+ Set<Integer> visited = new HashSet<>();
+ while (!pending.isEmpty()) { // walk upstream through inputs, newest entry first
+ Transformation<?> transformation = pending.remove(pending.size() - 1);
+ if (visited.add(transformation.getId())) { // skip nodes already seen
+ names.add(transformation.getName());
+ pending.addAll(transformation.getInputs());
+ }
+ }
+ return names;
+ }
+
+ @SuppressWarnings("unchecked")
+ private <T> T reflectGetMiniCluster(Object instance)
+ throws NoSuchFieldException, IllegalAccessException {
+ Field field = instance.getClass().getDeclaredField(MINI_CLUSTER_FIELD);
+ field.setAccessible(true); // permits reading the field even if it is not public
+ return (T) field.get(instance); // unchecked: the caller's target type picks T
+ }
+
+ private void triggerCheckpointAndWaitForWrites(
+ JobClient jobClient, String tableName, long totalRecords) throws Exception {
+ MiniCluster miniCluster = reflectGetMiniCluster(jobClient);
+ JobID jobID = jobClient.getJobID();
+ waitForJobRunning(jobClient, miniCluster, jobID);
+ // Keep triggering checkpoints until a snapshot reports enough records or 60s elapse.
+ long lastSnapshotId = -1L;
+ long deadline = System.currentTimeMillis() + 60_000L;
+ while (System.currentTimeMillis() < deadline) {
+ miniCluster.triggerCheckpoint(jobID).get();
+ Snapshot snapshot = waitForNewSnapshot(tableName, lastSnapshotId, deadline);
+ lastSnapshotId = snapshot.id();
+ if (snapshot.totalRecordCount() >= totalRecords) {
+ return;
+ }
+ }
+ throw new AssertionError("Timed out waiting for records committed by PWC.");
+ }
+
+ private void waitForJobRunning(JobClient jobClient, MiniCluster miniCluster, JobID jobID)
+ throws Exception {
+ JobStatus jobStatus = jobClient.getJobStatus().get();
+ while (jobStatus == JobStatus.INITIALIZING || jobStatus == JobStatus.CREATED) {
+ Thread.sleep(500L);
+ jobStatus = jobClient.getJobStatus().get();
+ }
+ // Any status other than RUNNING at this point is treated as an error.
+ if (jobStatus != JobStatus.RUNNING) {
+ throw new IllegalStateException("Job status is not RUNNING");
+ }
+ // Poll the execution graph every 500 ms until every vertex reports RUNNING.
+ AtomicBoolean allTaskRunning = new AtomicBoolean(false);
+ while (!allTaskRunning.get()) {
+ allTaskRunning.set(true);
+ Thread.sleep(500L);
+ miniCluster
+ .getExecutionGraph(jobID)
+ .thenAccept(
+ graph ->
+ graph.getAllExecutionVertices()
+ .forEach(
+ vertex -> {
+ if (vertex.getExecutionState()
+ != ExecutionState.RUNNING) {
+ allTaskRunning.set(false);
+ }
+ }))
+ .get();
+ }
+ }
+
+ private Snapshot waitForNewSnapshot(String tableName, long initialSnapshotId, long deadline)
+ throws InterruptedException {
+ while (true) {
+ Snapshot latest = findLatestSnapshot(tableName);
+ if (latest != null && latest.id() != initialSnapshotId) {
+ return latest;
+ }
+ if (System.currentTimeMillis() >= deadline) {
+ throw new AssertionError("Timed out waiting for a new Paimon snapshot.");
+ }
+ Thread.sleep(500L); // poll interval before looking for a newer snapshot again
+ }
+ }
+
+ private static class EmitOnceAndWaitSource extends AbstractNonCoordinatedSource {
+ // Test source: each reader emits four fixed rows once, then stays idle without finishing.
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public Boundedness getBoundedness() {
+ return Boundedness.CONTINUOUS_UNBOUNDED;
+ }
+
+ @Override
+ public SourceReader createReader(
+ SourceReaderContext sourceReaderContext) {
+ return new Reader();
+ }
+
+ private static class Reader extends AbstractNonCoordinatedSourceReader {
+ // True once the four rows have been emitted by this reader.
+ private boolean emitted;
+
+ @Override
+ public InputStatus pollNext(ReaderOutput output) {
+ if (!emitted) {
+ output.collect(row(1, "one"));
+ output.collect(row(2, "two"));
+ output.collect(row(3, "three"));
+ output.collect(row(4, "four"));
+ emitted = true;
+ }
+ return InputStatus.NOTHING_AVAILABLE; // never signals end of input
+ }
+ }
+
+ private static RowData row(int k, String v) {
+ return GenericRowData.of(k, StringData.fromString(v));
+ }
+ }
+}
diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorTest.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorTest.java
new file mode 100644
index 000000000000..126e2650b2dc
--- /dev/null
+++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorTest.java
@@ -0,0 +1,780 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.flink.sink.coordinator;
+
+import org.apache.paimon.CoreOptions;
+import org.apache.paimon.data.GenericRow;
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.flink.sink.Committable;
+import org.apache.paimon.flink.sink.CommittableSerializer;
+import org.apache.paimon.flink.sink.Committer;
+import org.apache.paimon.flink.sink.StoreCommitter;
+import org.apache.paimon.fs.Path;
+import org.apache.paimon.fs.local.LocalFileIO;
+import org.apache.paimon.manifest.ManifestCommittable;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.reader.RecordReader;
+import org.apache.paimon.reader.RecordReaderIterator;
+import org.apache.paimon.schema.Schema;
+import org.apache.paimon.schema.SchemaManager;
+import org.apache.paimon.table.FileStoreTable;
+import org.apache.paimon.table.FileStoreTableFactory;
+import org.apache.paimon.table.sink.CommitMessage;
+import org.apache.paimon.table.sink.CommitMessageSerializer;
+import org.apache.paimon.table.sink.StreamTableWrite;
+import org.apache.paimon.table.source.TableRead;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
+import org.apache.paimon.utils.CloseableIterator;
+
+import org.apache.flink.runtime.jobgraph.OperatorID;
+import org.apache.flink.runtime.operators.coordination.CoordinationResponse;
+import org.apache.flink.runtime.operators.coordination.OperatorCoordinator;
+import org.apache.flink.runtime.operators.coordination.OperatorEvent;
+import org.apache.flink.util.ExceptionUtils;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.mockito.Mockito;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.mockito.Mockito.when;
+
+/** Tests for {@link PaimonWriterCoordinator}. */
+public class PaimonWriterCoordinatorTest {
+
+ private static final long CK0 = 7L;
+ private static final long CK1 = 8L;
+ private static final long CK2 = 9L;
+ private static final RowType ROW_TYPE =
+ RowType.of(
+ new DataType[] {DataTypes.INT(), DataTypes.BIGINT()}, new String[] {"a", "b"});
+
+ @TempDir public java.nio.file.Path tempDir;
+
+ private Path tablePath;
+ private OperatorID operatorId;
+ private String commitUser;
+
+ @BeforeEach
+ public void before() {
+ tablePath = new Path(tempDir.toString()); // Paimon path over the JUnit temp directory
+ operatorId = new OperatorID();
+ commitUser = UUID.randomUUID().toString(); // a fresh random commit user per test
+ }
+
+ // ------------------------------------------------------------------------
+ // basic function tests
+ // ------------------------------------------------------------------------
+
+ @Test
+ public void testCommitUserRestoredFromCoordinatorState() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ String restoredCommitUser = commitUser;
+ // Capture coordinator state for CK1 from a first coordinator, then shut it down.
+ PaimonWriterCoordinator previous = createCoordinator(table, 1);
+ previous.start();
+ byte[] coordinatorState = checkpointState(previous, CK1);
+ previous.close();
+ // Change the commitUser field, then reset a second coordinator to the captured state.
+ commitUser = UUID.randomUUID().toString();
+ PaimonWriterCoordinator restored = createCoordinator(table, 1);
+ restored.resetToCheckpoint(CK1, coordinatorState);
+ restored.start();
+ register(restored, 0);
+ // Report empty recovered file info for CK1 under the original commit user.
+ sendRequest(
+ restored,
+ recoveredFileInfoRequest(CK1, 0, restoredCommitUser, Collections.emptyList()));
+ waitForCoordinator(restored);
+ // The snapshot written for CK2 must carry the original commit user, not the new one.
+ sendRequest(restored, fileInfoRequest(table, CK2, 0, GenericRow.of(1, 10L)));
+ restored.notifyCheckpointComplete(CK2);
+ waitForCoordinator(restored);
+ assertThat(table.snapshotManager().latestSnapshot().commitUser())
+ .isEqualTo(restoredCommitUser);
+ restored.close();
+ }
+
+ @Test
+ public void testFileInfoRequestAcksAfterReceive() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ PaimonWriterCoordinator coordinator = createCoordinator(table, 1);
+ coordinator.start();
+ register(coordinator, 0);
+ // The request future must complete with an ack echoing the checkpoint and subtask.
+ FileInfoRequest request = fileInfoRequest(table, CK1, 0, 0, GenericRow.of(1, 10L));
+ CoordinationResponse rawResponse = coordinator.handleCoordinationRequest(request).get();
+ FileInfoReceivedResponse response = CoordinationResponseUtils.unwrap(rawResponse);
+ assertThat(response.checkpointId()).isEqualTo(CK1);
+ assertThat(response.subtaskId()).isEqualTo(0);
+ // After CK1 is reported complete, the row from the request must be readable.
+ coordinator.notifyCheckpointComplete(CK1);
+ waitForCoordinator(coordinator);
+ assertResults(table, "1, 10");
+
+ coordinator.close();
+ }
+
+ @Test
+ public void testFileInfoRequestFromStaleAttemptIsIgnored() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ PaimonWriterCoordinator coordinator = createCoordinator(table, 1);
+ coordinator.start();
+ register(coordinator, 0, 1);
+ // The request below is rejected as "subtask 0 attempt 0"; the future fails exceptionally.
+ FileInfoRequest request = fileInfoRequest(table, CK1, 0, 0, GenericRow.of(1, 10L));
+ CompletableFuture response =
+ coordinator.handleCoordinationRequest(request);
+ assertThatThrownBy(response::get)
+ .isInstanceOf(ExecutionException.class)
+ .hasCauseInstanceOf(IllegalStateException.class)
+ .hasMessageContaining("invalid subtask 0 attempt 0");
+ // Even after CK1 is reported complete, no snapshot may exist for the rejected request.
+ coordinator.notifyCheckpointComplete(CK1);
+ waitForCoordinator(coordinator);
+ assertThat(table.snapshotManager().latestSnapshotId()).isNull();
+
+ coordinator.close();
+ }
+
+ @Test
+ public void testCheckpointCompleteRequiresStagedFileInfo() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ OperatorCoordinator.Context context = createContext(2);
+ PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+ coordinator.start();
+ register(coordinator, 0);
+ register(coordinator, 1);
+ // Only subtask 0 of the two registered subtasks reports before CK1 is completed.
+ sendRequest(coordinator, fileInfoRequest(table, CK1, 0, GenericRow.of(1, 10L)));
+ coordinator.notifyCheckpointComplete(CK1);
+ waitForCoordinator(coordinator);
+ // The job must be failed with an IllegalStateException and nothing may be committed.
+ Mockito.verify(context).failJob(Mockito.any(IllegalStateException.class));
+ assertThat(table.snapshotManager().latestSnapshotId()).isNull();
+ coordinator.close();
+ }
+
+ // ------------------------------------------------------------------------
+ // restore and recovered file-info tests
+ // ------------------------------------------------------------------------
+
+ @Test
+ public void testRecoveredFileInfoWithoutCoordinatorRestoreWaitsForCheckpointComplete()
+ throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ PaimonWriterCoordinator coordinator = createCoordinator(table, 1);
+ coordinator.start();
+ register(coordinator, 0);
+ // Recovered file info arrives although resetToCheckpoint was never called here.
+ sendRequest(
+ coordinator,
+ recoveredFileInfoRequest(
+ CK1, 0, commitUser, committables(table, CK1, GenericRow.of(1, 10L))));
+ waitForCoordinator(coordinator);
+ assertThat(table.snapshotManager().latestSnapshotId()).isNull();
+ // No snapshot existed above; the row must be readable once CK1 is reported complete.
+ coordinator.notifyCheckpointComplete(CK1);
+ waitForCoordinator(coordinator);
+ assertResults(table, "1, 10");
+
+ coordinator.close();
+ }
+
+ @Test
+ public void testRecoveredFileInfoCommitsPendingCommittables() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ byte[] coordinatorState = createCoordinatorState(table, CK2);
+ OperatorCoordinator.Context context = createContext(2);
+ PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+ coordinator.resetToCheckpoint(CK2, coordinatorState);
+ coordinator.start();
+ register(coordinator, 0);
+ register(coordinator, 1);
+ // Subtask 0 reports recovered CK1 committables for restored CK2; nothing is committed yet.
+ sendRequest(
+ coordinator,
+ recoveredFileInfoRequest(
+ CK2, 0, commitUser, committables(table, CK1, GenericRow.of(1, 10L))));
+ waitForCoordinator(coordinator);
+ assertThat(table.snapshotManager().latestSnapshotId()).isNull();
+ // Subtask 1 reports as well, so both registered subtasks have now answered.
+ sendRequest(
+ coordinator,
+ recoveredFileInfoRequest(
+ CK2, 1, commitUser, committables(table, CK1, GenericRow.of(2, 20L))));
+ waitForCoordinator(coordinator);
+ // Both rows must be readable, and failJob must have been invoked exactly once.
+ Mockito.verify(context).failJob(Mockito.any(Throwable.class));
+ assertResults(table, "1, 10", "2, 20");
+
+ coordinator.close();
+ }
+
+ /**
+ * Restored file info is recommitted and triggers the expected recovery failure; a newer
+ * checkpoint reported afterwards must complete without another {@code failJob} call.
+ */
+ @Test
+ public void testCheckpointAfterRestoredCommitDoesNotFailJob() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ byte[] coordinatorState = createCoordinatorState(table, CK1);
+ OperatorCoordinator.Context context = createContext(1);
+ PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+ coordinator.resetToCheckpoint(CK1, coordinatorState);
+ coordinator.start();
+ register(coordinator, 0);
+ // The only subtask reports recovered CK1 data: the row is committed and failJob is called.
+ sendRequest(
+ coordinator,
+ recoveredFileInfoRequest(
+ CK1, 0, commitUser, committables(table, CK1, GenericRow.of(1, 10L))));
+ waitForCoordinator(coordinator);
+ Mockito.verify(context).failJob(Mockito.any(Throwable.class));
+ assertResults(table, "1, 10");
+ // A regular CK2 report and completion follow on the same coordinator.
+ sendRequest(coordinator, fileInfoRequest(table, CK2, 0, GenericRow.of(2, 20L)));
+ coordinator.notifyCheckpointComplete(CK2);
+ waitForCoordinator(coordinator);
+ // failJob is still at one invocation in total, and both rows are readable.
+ Mockito.verify(context, Mockito.times(1)).failJob(Mockito.any(Throwable.class));
+ assertResults(table, "1, 10", "2, 20");
+
+ coordinator.close();
+ }
+
+ /**
+ * At a restored checkpoint, one subtask reports data and another reports an empty recovered
+ * file-info payload.
+ */
+ @Test
+ public void testEmptyRestoredFileInfoCompletesRestoredCheckpoint() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ byte[] coordinatorState = createCoordinatorState(table, CK1);
+ OperatorCoordinator.Context context = createContext(2);
+ PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+ coordinator.resetToCheckpoint(CK1, coordinatorState);
+ coordinator.start();
+ register(coordinator, 0);
+ register(coordinator, 1);
+ // Subtask 0 reports one recovered row; with subtask 1 missing, nothing is committed yet.
+ sendRequest(
+ coordinator,
+ recoveredFileInfoRequest(
+ CK1, 0, commitUser, committables(table, CK1, GenericRow.of(1, 10L))));
+ waitForCoordinator(coordinator);
+ assertThat(table.snapshotManager().latestSnapshotId()).isNull();
+ // Subtask 1 reports an empty recovered payload.
+ sendRequest(
+ coordinator, recoveredFileInfoRequest(CK1, 1, commitUser, Collections.emptyList()));
+ waitForCoordinator(coordinator);
+ // The empty report counts: the row is readable and failJob was invoked exactly once.
+ Mockito.verify(context).failJob(Mockito.any(Throwable.class));
+ assertResults(table, "1, 10");
+
+ coordinator.close();
+ }
+
+ /**
+ * During recovery to CK2, one recovered request cumulatively carries CK1 committables and the
+ * other recovered request is empty.
+ */
+ @Test
+ public void testEmptyFileInfoCompletesEarlierCheckpointBeforeRestoredCommit() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ byte[] coordinatorState = createCoordinatorState(table, CK2);
+ OperatorCoordinator.Context context = createContext(2);
+ PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+ coordinator.resetToCheckpoint(CK2, coordinatorState);
+ coordinator.start();
+ register(coordinator, 0);
+ register(coordinator, 1);
+ // Subtask 0 sends CK1 committables inside a request addressed to restored CK2.
+ List ck1Subtask0 = committables(table, CK1, GenericRow.of(1, 10L));
+ sendRequest(coordinator, recoveredFileInfoRequest(CK2, 0, commitUser, ck1Subtask0));
+ waitForCoordinator(coordinator);
+ assertThat(table.snapshotManager().latestSnapshotId()).isNull();
+ // Subtask 1 answers for CK2 with an empty recovered payload.
+ sendRequest(
+ coordinator, recoveredFileInfoRequest(CK2, 1, commitUser, Collections.emptyList()));
+ waitForCoordinator(coordinator);
+ // The CK1 row must be readable, and failJob must have been invoked exactly once.
+ Mockito.verify(context).failJob(Mockito.any(Throwable.class));
+ assertResults(table, "1, 10");
+
+ coordinator.close();
+ }
+
+ // ------------------------------------------------------------------------
+ // abort and late-arrival tests
+ // ------------------------------------------------------------------------
+
+ /**
+ * CK0 commits, CK1 is aborted without a task failure, CK2 commits. The aborted checkpoint
+ * creates no table data, while completing the next checkpoint commits both the rows reported
+ * for CK1 and the newly produced rows exactly once.
+ */
+ @Test
+ public void testCheckpointAbortDoesNotDropCommittables() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ PaimonWriterCoordinator coordinator = createCoordinator(table, 1);
+ coordinator.start();
+ // CK0: report one row and complete the checkpoint normally.
+ sendCheckpoint(coordinator, table, CK0, 0, GenericRow.of(0, 0L));
+ coordinator.notifyCheckpointComplete(CK0);
+ waitForCoordinator(coordinator);
+ assertResults(table, "0, 0");
+ // CK1: report a row, then abort; the table contents must stay unchanged.
+ List ck1Committables = committables(table, CK1, GenericRow.of(1, 10L));
+ sendRequest(coordinator, fileInfoRequest(CK1, 0, ck1Committables));
+ coordinator.notifyCheckpointAborted(CK1);
+ waitForCoordinator(coordinator);
+ assertResults(table, "0, 0");
+ // CK2: only the new row is sent here, yet the CK1 row must show up after completion too.
+ sendRequest(
+ coordinator,
+ fileInfoRequest(CK2, 0, committables(table, CK2, GenericRow.of(2, 20L))));
+ coordinator.notifyCheckpointComplete(CK2);
+ waitForCoordinator(coordinator);
+ assertResults(table, "0, 0", "1, 10", "2, 20");
+
+ coordinator.close();
+ }
+
+ /** Only one of two subtasks reports a checkpoint before it is aborted. */
+ @Test
+ public void testPartialCheckpointAbortDoesNotFailJob() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ OperatorCoordinator.Context context = createContext(2);
+ PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+ coordinator.start();
+ // Two subtasks are registered, but only subtask 0 reports for CK1.
+ register(coordinator, 0);
+ register(coordinator, 1);
+ List subtask0Ck1 = committables(table, CK1, GenericRow.of(1, 10L));
+ sendRequest(coordinator, fileInfoRequest(CK1, 0, subtask0Ck1));
+ waitForCoordinator(coordinator);
+ // Aborting the half-reported CK1 must neither fail the job nor create a snapshot.
+ coordinator.notifyCheckpointAborted(CK1);
+ waitForCoordinator(coordinator);
+ Mockito.verify(context, Mockito.never()).failJob(Mockito.any(Throwable.class));
+ assertThat(table.snapshotManager().latestSnapshotId()).isNull();
+ // Both subtasks report for CK2, which is then completed.
+ sendRequest(
+ coordinator,
+ fileInfoRequest(CK2, 0, committables(table, CK2, GenericRow.of(2, 20L))));
+ sendRequest(
+ coordinator,
+ fileInfoRequest(CK2, 1, committables(table, CK2, GenericRow.of(3, 30L))));
+ coordinator.notifyCheckpointComplete(CK2);
+ waitForCoordinator(coordinator);
+ // Still no failJob call; the CK1 row is readable together with both CK2 rows.
+ Mockito.verify(context, Mockito.never()).failJob(Mockito.any(Throwable.class));
+ assertResults(table, "1, 10", "2, 20", "3, 30");
+
+ coordinator.close();
+ }
+
+ /**
+ * A file-info request arrives after the corresponding checkpoint has already been aborted. The
+ * late request is retained as reliable pending file info and committed by the next complete
+ * checkpoint envelope.
+ */
+ @Test
+ public void testFileInfoAfterCheckpointAbortIsCommittedByLaterCheckpoint() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ OperatorCoordinator.Context context = createContext(2);
+ PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+ coordinator.start();
+
+ register(coordinator, 0);
+ register(coordinator, 1);
+ // CK1 is aborted before any subtask has reported file info for it.
+ coordinator.notifyCheckpointAborted(CK1);
+ waitForCoordinator(coordinator);
+
+ // The late CK1 request from subtask 0 arrives only after the abort.
+ sendRequest(coordinator, fileInfoRequest(table, CK1, 0, GenericRow.of(1, 10L)));
+ waitForCoordinator(coordinator);
+
+ // The late request must neither fail the job nor produce a snapshot on its own.
+ Mockito.verify(context, Mockito.never()).failJob(Mockito.any(Throwable.class));
+ assertThat(table.snapshotManager().latestSnapshotId()).isNull();
+
+ // Both subtasks report CK2; completing it is expected to commit the late CK1 row as well.
+ sendRequest(coordinator, fileInfoRequest(table, CK2, 0, GenericRow.of(2, 20L)));
+ sendRequest(coordinator, fileInfoRequest(table, CK2, 1, GenericRow.of(3, 30L)));
+ coordinator.notifyCheckpointComplete(CK2);
+ waitForCoordinator(coordinator);
+
+ Mockito.verify(context, Mockito.never()).failJob(Mockito.any(Throwable.class));
+ assertResults(table, "1, 10", "2, 20", "3, 30");
+
+ coordinator.close();
+ }
+
+ /**
+ * Scenario: PWC collects a checkpoint's file info, the checkpoint is aborted, and the same
+ * attempt reports the same envelope again. Under ack-based reporting this is a protocol error:
+ * if the first ACK was lost, the writer snapshot cannot complete and the attempt must failover.
+ */
+ @Test
+ public void testDuplicateFileInfoAfterCollectedAbortIsRejected() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ OperatorCoordinator.Context context = createContext(1);
+ PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+ coordinator.start();
+ register(coordinator, 0);
+
+ // First delivery of the CK1 envelope, followed by the abort of CK1.
+ FileInfoRequest request = fileInfoRequest(table, CK1, 0, GenericRow.of(1, 10L));
+ sendRequest(coordinator, request);
+ waitForCoordinator(coordinator);
+ coordinator.notifyCheckpointAborted(CK1);
+ waitForCoordinator(coordinator);
+
+ // Sending the very same request object again must fail: sendRequest blocks on the response
+ // future, so the failure is expected as an ExecutionException caused by an
+ // IllegalStateException.
+ assertThatThrownBy(() -> sendRequest(coordinator, request))
+ .isInstanceOf(ExecutionException.class)
+ .hasCauseInstanceOf(IllegalStateException.class)
+ .hasMessageContaining("Repeated file info envelope");
+
+ coordinator.close();
+ }
+
+ /**
+  * An earlier checkpoint has file info from only one subtask, while a later checkpoint receives
+  * file info from all subtasks. Completing the later checkpoint, without any notification for
+  * the earlier one, is expected to commit the rows of both checkpoints without failing the job.
+  */
+ @Test
+ public void testLaterCheckpointCompleteCanCommitEarlierPartialFileInfo() throws Exception {
+     FileStoreTable table = createFileStoreTable();
+     OperatorCoordinator.Context context = createContext(2);
+     PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+     coordinator.start();
+
+     register(coordinator, 0);
+     register(coordinator, 1);
+     // CK1 is reported by subtask 0 only and is never completed or aborted in this test.
+     List<Committable> ck1Subtask0 = committables(table, CK1, GenericRow.of(1, 10L));
+     sendRequest(coordinator, fileInfoRequest(CK1, 0, ck1Subtask0));
+     waitForCoordinator(coordinator);
+
+     // CK2 is reported by both subtasks and then completes.
+     sendRequest(
+             coordinator,
+             fileInfoRequest(CK2, 0, committables(table, CK2, GenericRow.of(2, 20L))));
+     sendRequest(
+             coordinator,
+             fileInfoRequest(CK2, 1, committables(table, CK2, GenericRow.of(3, 30L))));
+     coordinator.notifyCheckpointComplete(CK2);
+     waitForCoordinator(coordinator);
+
+     Mockito.verify(context, Mockito.never()).failJob(Mockito.any(Throwable.class));
+     assertResults(table, "1, 10", "2, 20", "3, 30");
+
+     coordinator.close();
+ }
+
+ // ------------------------------------------------------------------------
+ // attempt and stale-message tests
+ // ------------------------------------------------------------------------
+
+ /**
+  * Attempt 0 of subtask 0 reports CK1 file info and then fails. The replacement attempt 1
+  * reports a CK2 envelope carrying its own CK1 and CK2 committables. After CK2 completes, only
+  * the rows reported by attempt 1 and by subtask 1 are expected in the table; the row reported
+  * by the failed attempt (1, 10) must not appear.
+  */
+ @Test
+ public void testSubtaskFailoverReplacesUnstagedPendingFileInfo() throws Exception {
+     FileStoreTable table = createFileStoreTable();
+     PaimonWriterCoordinator coordinator = createCoordinator(table, 2);
+     coordinator.start();
+
+     register(coordinator, 0, 0);
+     register(coordinator, 1, 0);
+     // Attempt 0 of subtask 0 reports CK1 before failing.
+     sendRequest(coordinator, fileInfoRequest(table, CK1, 0, 0, GenericRow.of(1, 10L)));
+     waitForCoordinator(coordinator);
+
+     coordinator.executionAttemptFailed(0, 0, new RuntimeException("failover"));
+     waitForCoordinator(coordinator);
+     register(coordinator, 0, 1);
+
+     // Attempt 1 sends one CK2 envelope that contains committables for both CK1 and CK2.
+     List<Committable> recoveredSubtask0 = new ArrayList<>();
+     recoveredSubtask0.addAll(committables(table, CK1, GenericRow.of(9, 90L)));
+     recoveredSubtask0.addAll(committables(table, CK2, GenericRow.of(2, 20L)));
+     sendRequest(coordinator, fileInfoRequest(CK2, 0, 1, recoveredSubtask0));
+     sendRequest(coordinator, fileInfoRequest(table, CK2, 1, 0, GenericRow.of(3, 30L)));
+     coordinator.notifyCheckpointComplete(CK2);
+     waitForCoordinator(coordinator);
+
+     assertResults(table, "2, 20", "3, 30", "9, 90");
+     coordinator.close();
+ }
+
+ /**
+ * A writer resends file info for a checkpoint that PWC has already committed. The table remains
+ * committed once, and the registered gateway receives a second commit-complete event in
+ * response to the stale resend, in addition to the original commit notification.
+ */
+ @Test
+ public void testStaleFileInfoResendSendsCommitCompleteEvent() throws Exception {
+ FileStoreTable table = createFileStoreTable();
+ PaimonWriterCoordinator coordinator = createCoordinator(table, 1);
+ coordinator.start();
+ // Keep the mocked gateway so the events sent to the subtask can be verified below.
+ OperatorCoordinator.SubtaskGateway gateway = registerAndReturnGateway(coordinator, 0);
+
+ // First delivery: CK1 is reported, completed and committed.
+ FileInfoRequest request = fileInfoRequest(table, CK1, 0, GenericRow.of(1, 10L));
+ sendRequest(coordinator, request);
+ coordinator.notifyCheckpointComplete(CK1);
+ waitForCoordinator(coordinator);
+ assertResults(table, "1, 10");
+
+ // Stale resend of the very same request after the commit.
+ sendRequest(coordinator, request);
+ waitForCoordinator(coordinator);
+
+ // One event for the original commit plus one for the stale resend.
+ Mockito.verify(gateway, Mockito.times(2)).sendEvent(Mockito.any(CommitCompleteEvent.class));
+
+ coordinator.close();
+ }
+
+ /**
+  * Registers attempt 0 of {@code subtask}, writes {@code rows} for {@code checkpointId}, sends
+  * the resulting file-info request and waits for the coordinator thread.
+  */
+ private void sendCheckpoint(
+         PaimonWriterCoordinator coordinator,
+         FileStoreTable table,
+         long checkpointId,
+         int subtask,
+         GenericRow... rows)
+         throws Exception {
+     register(coordinator, subtask);
+     FileInfoRequest fileInfo = fileInfoRequest(table, checkpointId, subtask, rows);
+     sendRequest(coordinator, fileInfo);
+     waitForCoordinator(coordinator);
+ }
+
+ /**
+  * Hands {@code request} to the coordinator, blocks until the response future completes and
+  * returns the unwrapped response.
+  */
+ private FileInfoReceivedResponse sendRequest(
+         PaimonWriterCoordinator coordinator, FileInfoRequest request) throws Exception {
+     return CoordinationResponseUtils.unwrap(
+             coordinator.handleCoordinationRequest(request).get());
+ }
+
+ /**
+  * Asks the coordinator to snapshot its state for {@code checkpointId} and blocks until the
+  * serialized bytes are available.
+  */
+ private byte[] checkpointState(PaimonWriterCoordinator coordinator, long checkpointId)
+         throws Exception {
+     // The future must be parameterized; a raw future's get() would only yield Object.
+     CompletableFuture<byte[]> result = new CompletableFuture<>();
+     coordinator.checkpointCoordinator(checkpointId, result);
+     return result.get();
+ }
+
+ /**
+  * Starts a fresh coordinator with parallelism 1, snapshots its state for {@code checkpointId}
+  * and closes it again. The coordinator is closed even when taking the snapshot fails.
+  */
+ private byte[] createCoordinatorState(FileStoreTable table, long checkpointId)
+         throws Exception {
+     PaimonWriterCoordinator coordinator = createCoordinator(table, 1);
+     coordinator.start();
+     try {
+         return checkpointState(coordinator, checkpointId);
+     } finally {
+         coordinator.close();
+     }
+ }
+
+ /** Registers attempt 0 of {@code subtask} with a mocked gateway. */
+ private void register(PaimonWriterCoordinator coordinator, int subtask) {
+     register(coordinator, subtask, 0);
+ }
+
+ /** Registers the given attempt of {@code subtask} with a mocked gateway. */
+ private void register(PaimonWriterCoordinator coordinator, int subtask, int attemptNumber) {
+     registerAndReturnGateway(coordinator, subtask, attemptNumber);
+ }
+
+ /** Registers attempt 0 of {@code subtask} and returns the mocked gateway for verification. */
+ private OperatorCoordinator.SubtaskGateway registerAndReturnGateway(
+         PaimonWriterCoordinator coordinator, int subtask) {
+     return registerAndReturnGateway(coordinator, subtask, 0);
+ }
+
+ /**
+  * Creates a mocked gateway whose {@code sendEvent} returns an already completed future, marks
+  * the given attempt of {@code subtask} as ready on the coordinator and returns the gateway.
+  */
+ private OperatorCoordinator.SubtaskGateway registerAndReturnGateway(
+         PaimonWriterCoordinator coordinator, int subtask, int attemptNumber) {
+     OperatorCoordinator.SubtaskGateway gateway =
+             Mockito.mock(OperatorCoordinator.SubtaskGateway.class);
+     when(gateway.sendEvent(Mockito.any(OperatorEvent.class)))
+             .thenReturn(CompletableFuture.completedFuture(null));
+     coordinator.executionAttemptReady(subtask, attemptNumber, gateway);
+     return gateway;
+ }
+
+ private FileInfoRequest fileInfoRequest(
+ FileStoreTable table, long checkpointId, int subtask, GenericRow... rows)
+ throws Exception {
+ return fileInfoRequest(table, checkpointId, subtask, 0, rows);
+ }
+
+ /**
+  * Writes {@code rows} for {@code checkpointId} and wraps the resulting committables in a
+  * file-info request for the given subtask attempt.
+  */
+ private FileInfoRequest fileInfoRequest(
+         FileStoreTable table,
+         long checkpointId,
+         int subtask,
+         int attemptNumber,
+         GenericRow... rows)
+         throws Exception {
+     List<Committable> written = committables(table, checkpointId, rows);
+     return fileInfoRequest(checkpointId, subtask, attemptNumber, written);
+ }
+
+ /** Builds a file-info request for attempt 0 of {@code subtask} from existing committables. */
+ private FileInfoRequest fileInfoRequest(
+         long checkpointId, int subtask, List<Committable> committables) throws Exception {
+     return fileInfoRequest(checkpointId, subtask, 0, committables);
+ }
+
+ /**
+  * Builds a file-info request for the given subtask attempt. The committables are passed in
+  * serialized form together with their count; {@code Long.MIN_VALUE} is supplied for the fourth
+  * factory argument (its meaning is defined by {@code FileInfoRequest.fileInfo}).
+  */
+ private FileInfoRequest fileInfoRequest(
+         long checkpointId, int subtask, int attemptNumber, List<Committable> committables)
+         throws Exception {
+     return FileInfoRequest.fileInfo(
+             checkpointId,
+             subtask,
+             attemptNumber,
+             Long.MIN_VALUE,
+             serialize(committables),
+             committables.size());
+ }
+
+ /**
+  * Builds a recovered file-info request for attempt 0 of {@code subtask}, carrying the
+  * serialized committables, their count and the commit user they were recovered with.
+  */
+ private FileInfoRequest recoveredFileInfoRequest(
+         long checkpointId,
+         int subtask,
+         String recoveredCommitUser,
+         List<Committable> committables)
+         throws Exception {
+     return FileInfoRequest.recoveredFileInfo(
+             checkpointId,
+             subtask,
+             0,
+             Long.MIN_VALUE,
+             serialize(committables),
+             committables.size(),
+             recoveredCommitUser);
+ }
+
+ /**
+  * Writes {@code rows} through a stream writer created for the test's commit user and returns
+  * one {@link Committable} per commit message prepared for {@code checkpointId}.
+  *
+  * <p>The writer is closed even when writing or preparing the commit fails.
+  */
+ private List<Committable> committables(
+         FileStoreTable table, long checkpointId, GenericRow... rows) throws Exception {
+     try (StreamTableWrite write =
+             table.newStreamWriteBuilder().withCommitUser(commitUser).newWrite()) {
+         for (GenericRow row : rows) {
+             write.write(row);
+         }
+         List<Committable> committables = new ArrayList<>();
+         for (CommitMessage message : write.prepareCommit(false, checkpointId)) {
+             committables.add(new Committable(checkpointId, message));
+         }
+         return committables;
+     }
+ }
+
+ /**
+  * Serializes the committables into one byte array: a 4-byte element count followed, for each
+  * committable, by a 4-byte length and the serialized bytes. Integers are written with the
+  * buffer's default byte order.
+  */
+ private byte[] serialize(List<Committable> committables) throws Exception {
+     CommittableSerializer serializer = new CommittableSerializer(new CommitMessageSerializer());
+     // Start at 4 bytes for the leading element count.
+     int total = 4;
+     List<byte[]> bytes = new ArrayList<>();
+     for (Committable committable : committables) {
+         byte[] serialized = serializer.serialize(committable);
+         bytes.add(serialized);
+         // Each entry needs a 4-byte length prefix plus its payload.
+         total += 4 + serialized.length;
+     }
+     ByteBuffer buffer = ByteBuffer.allocate(total);
+     buffer.putInt(committables.size());
+     for (byte[] serialized : bytes) {
+         buffer.putInt(serialized.length);
+         buffer.put(serialized);
+     }
+     return buffer.array();
+ }
+
+ /** Creates a coordinator backed by a mocked context that reports {@code parallelism}. */
+ private PaimonWriterCoordinator createCoordinator(FileStoreTable table, int parallelism) {
+     OperatorCoordinator.Context mockedContext = createContext(parallelism);
+     return createCoordinator(table, mockedContext);
+ }
+
+ /**
+  * Creates a mocked coordinator context that reports the test's operator id, the given
+  * parallelism and the current thread's context class loader.
+  */
+ private OperatorCoordinator.Context createContext(int parallelism) {
+     OperatorCoordinator.Context mocked = Mockito.mock(OperatorCoordinator.Context.class);
+     Mockito.when(mocked.getOperatorId()).thenReturn(operatorId);
+     Mockito.when(mocked.currentParallelism()).thenReturn(parallelism);
+     ClassLoader userCodeClassLoader = Thread.currentThread().getContextClassLoader();
+     Mockito.when(mocked.getUserCodeClassloader()).thenReturn(userCodeClassLoader);
+     return mocked;
+ }
+
+ /**
+  * Creates a coordinator for {@code table} on the given context. Committers are built as
+  * {@link StoreCommitter}s on the table's commit API; empty commits are ignored exactly when
+  * the commit context reports that streaming checkpointing is disabled.
+  */
+ private PaimonWriterCoordinator createCoordinator(
+         FileStoreTable table, OperatorCoordinator.Context context) {
+     Committer.Factory committerFactory =
+             ctx -> {
+                 return new StoreCommitter(
+                         table,
+                         table.newCommit(ctx.commitUser())
+                                 .ignoreEmptyCommit(!ctx.streamingCheckpointEnabled()),
+                         ctx);
+             };
+     PaimonWriterCoordinator.CoordinatorExecutorThreadFactory threadFactory =
+             new PaimonWriterCoordinator.CoordinatorExecutorThreadFactory("PWC", context);
+     return new PaimonWriterCoordinator(
+             true, commitUser, committerFactory, context, threadFactory, null);
+ }
+
+ /**
+  * Blocks until a marker task submitted through {@code runInCoordinatorThread} has run.
+  * Presumably the coordinator executes submitted tasks in order, so that previously submitted
+  * work has finished once this method returns (to be confirmed against the coordinator).
+  *
+  * <p>If the waiting thread is interrupted, its interrupt flag is restored before the
+  * {@link AssertionError} is thrown.
+  */
+ private void waitForCoordinator(PaimonWriterCoordinator coordinator) {
+     CompletableFuture<Void> future = new CompletableFuture<>();
+     coordinator.runInCoordinatorThread(() -> future.complete(null));
+     try {
+         future.get();
+     } catch (InterruptedException e) {
+         // Restore the flag so that callers further up can still observe the interruption.
+         Thread.currentThread().interrupt();
+         throw new AssertionError("Interrupted while waiting for coordinator.", e);
+     } catch (ExecutionException e) {
+         ExceptionUtils.rethrow(ExceptionUtils.stripExecutionException(e));
+     }
+ }
+
+ /**
+  * Creates the table schema under {@code tablePath} on the local file system, with a single
+  * bucket keyed by column {@code a} and neither partition nor primary keys, and opens the table.
+  */
+ private FileStoreTable createFileStoreTable() throws Exception {
+     Options options = new Options();
+     options.set(CoreOptions.PATH, tablePath.toString());
+     options.setString("bucket", "1");
+     options.setString("bucket-key", "a");
+
+     Schema schema =
+             new Schema(
+                     ROW_TYPE.getFields(),
+                     Collections.emptyList(),
+                     Collections.emptyList(),
+                     options.toMap(),
+                     "");
+     new SchemaManager(LocalFileIO.create(), tablePath).createTable(schema);
+
+     return FileStoreTableFactory.create(LocalFileIO.create(), options);
+ }
+
+ /**
+  * Reads every split of the table's current plan, renders each row as {@code "<int>, <long>"}
+  * from its first two fields, sorts the rendered rows as strings and asserts that they equal
+  * {@code expected} in the given order.
+  *
+  * <p>Each split's iterator is closed even when reading fails; read failures are rethrown
+  * wrapped in a {@link RuntimeException}.
+  */
+ private void assertResults(FileStoreTable table, String... expected) {
+     TableRead read = table.newReadBuilder().newRead();
+     List<String> actual = new ArrayList<>();
+     table.newReadBuilder()
+             .newScan()
+             .plan()
+             .splits()
+             .forEach(
+                     split -> {
+                         try (CloseableIterator<InternalRow> iterator =
+                                 new RecordReaderIterator<>(read.createReader(split))) {
+                             while (iterator.hasNext()) {
+                                 InternalRow row = iterator.next();
+                                 actual.add(row.getInt(0) + ", " + row.getLong(1));
+                             }
+                         } catch (Exception e) {
+                             throw new RuntimeException(e);
+                         }
+                     });
+     Collections.sort(actual);
+     assertThat(actual).isEqualTo(Arrays.asList(expected));
+ }
+}
diff --git a/paimon-flink/paimon-flink1-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java b/paimon-flink/paimon-flink1-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
index 1087dcb65cc2..9968459b47b9 100644
--- a/paimon-flink/paimon-flink1-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
+++ b/paimon-flink/paimon-flink1-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
@@ -34,6 +34,10 @@ public static int getIndexOfThisSubtask(RuntimeContext context) {
return context.getIndexOfThisSubtask();
}
+ /** Returns the attempt number reported by the given runtime context. */
+ public static int getAttemptNumber(RuntimeContext context) {
+     final int attemptNumber = context.getAttemptNumber();
+     return attemptNumber;
+ }
+
public static @Nullable Integer getNumberOfParallelSubtasks(FunctionContext context) {
return null;
}
diff --git a/paimon-flink/paimon-flink2-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java b/paimon-flink/paimon-flink2-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
index ff5fa868128c..c2a5ae6cc35b 100644
--- a/paimon-flink/paimon-flink2-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
+++ b/paimon-flink/paimon-flink2-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
@@ -34,6 +34,10 @@ public static int getIndexOfThisSubtask(RuntimeContext context) {
return context.getTaskInfo().getIndexOfThisSubtask();
}
+ /** Returns the attempt number reported by the task info of the given runtime context. */
+ public static int getAttemptNumber(RuntimeContext context) {
+     final int attemptNumber = context.getTaskInfo().getAttemptNumber();
+     return attemptNumber;
+ }
+
public static @Nullable Integer getNumberOfParallelSubtasks(FunctionContext context) {
return context.getTaskInfo().getNumberOfParallelSubtasks();
}