diff --git a/docs/generated/flink_connector_configuration.html b/docs/generated/flink_connector_configuration.html index 0f2786b038dc..0c40f290d163 100644 --- a/docs/generated/flink_connector_configuration.html +++ b/docs/generated/flink_connector_configuration.html @@ -248,6 +248,12 @@ Boolean Indicates whether to further sort data belonged to each sink task after range partitioning. + +
sink.committer-coordinator-operator.enabled
+ false + Boolean + Allow coordinator replace committer operator, only support for append table now. +
sink.committer-cpu
1.0 diff --git a/paimon-e2e-tests/pom.xml b/paimon-e2e-tests/pom.xml index 88f50539d6d9..291e36315a2f 100644 --- a/paimon-e2e-tests/pom.xml +++ b/paimon-e2e-tests/pom.xml @@ -269,6 +269,7 @@ under the License. + diff --git a/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/E2eTestBase.java b/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/E2eTestBase.java index 048356933d0b..b4ed247631cf 100644 --- a/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/E2eTestBase.java +++ b/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/E2eTestBase.java @@ -58,6 +58,7 @@ public abstract class E2eTestBase { private final boolean withKafka; private final boolean withHive; private final boolean withSpark; + private final int taskManagerReplicas; protected E2eTestBase() { this(false, false); @@ -68,9 +69,15 @@ protected E2eTestBase(boolean withKafka, boolean withHive) { } protected E2eTestBase(boolean withKafka, boolean withHive, boolean withSpark) { + this(withKafka, withHive, withSpark, 1); + } + + protected E2eTestBase( + boolean withKafka, boolean withHive, boolean withSpark, int taskManagerReplicas) { this.withKafka = withKafka; this.withHive = withHive; this.withSpark = withSpark; + this.taskManagerReplicas = taskManagerReplicas; } protected static final String TEST_DATA_DIR = "/test-data"; @@ -104,10 +111,13 @@ public void before() throws Exception { .getResource("docker-compose.yaml") .toURI())) .withEnv("NETWORK_ID", ((Network.NetworkImpl) network).getName()) + .withEnv("FLINK_ENV_FILE", flinkEnvFile()) .withLogConsumer("jobmanager-1", new LogConsumer(LOG)) - .withLogConsumer("taskmanager-1", new LogConsumer(LOG)) .withStartupTimeout(Duration.ofMinutes(3)) .withLocalCompose(true); + for (int i = 1; i <= taskManagerReplicas; i++) { + environment.withLogConsumer("taskmanager-" + i, new LogConsumer(LOG)); + } if (withKafka) { services.add("kafka"); environment.withLogConsumer("kafka-1", new Slf4jLogConsumer(LOG)); @@ -140,11 +150,17 @@ public void before() 
throws Exception { ".*Master: I have been elected leader! New state: ALIVE.*", 1)); } environment.withServices(services.toArray(new String[0])).withLocalCompose(true); + if (taskManagerReplicas > 1) { + environment.withScaledService("taskmanager", taskManagerReplicas); + environment.withExposedService("jobmanager-1", 8081); + } environment.waitingFor("jobmanager-1", buildWaitStrategy(".*Registering TaskManager.*", 1)); - environment.waitingFor( - "taskmanager-1", - buildWaitStrategy(".*Successful registration at resource manager.*", 1)); + for (int i = 1; i <= taskManagerReplicas; i++) { + environment.waitingFor( + "taskmanager-" + i, + buildWaitStrategy(".*Successful registration at resource manager.*", 1)); + } environment.start(); jobManager = environment.getContainerByServiceName("jobmanager-1").get(); @@ -156,6 +172,20 @@ public void before() throws Exception { flinkVersion = flinkVersionMatcher.find() ? flinkVersionMatcher.group(1) : null; } + protected String flinkEnvFile() { + return "flink.env"; + } + + protected String flinkRestUrl() { + if (taskManagerReplicas <= 1) { + throw new IllegalStateException("Flink REST is not exposed for this test."); + } + return String.format( + "http://%s:%d", + environment.getServiceHost("jobmanager-1", 8081), + environment.getServicePort("jobmanager-1", 8081)); + } + private WaitStrategy buildWaitStrategy(String regex, int times) { // Increase timeout from 60s (default value) to 180s return Wait.forLogMessage(regex, times).withStartupTimeout(Duration.ofSeconds(180)); diff --git a/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/PaimonWriterCoordinatorE2eTest.java b/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/PaimonWriterCoordinatorE2eTest.java new file mode 100644 index 000000000000..0333155da4a1 --- /dev/null +++ b/paimon-e2e-tests/src/test/java/org/apache/paimon/tests/PaimonWriterCoordinatorE2eTest.java @@ -0,0 +1,716 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.tests; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.testcontainers.containers.Container; +import org.testcontainers.containers.ContainerState; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.assertj.core.api.Assertions.assertThat; + +/** End-to-end tests for committing an append table through Paimon writer coordinator. 
*/ +@Timeout(300) +public class PaimonWriterCoordinatorE2eTest extends E2eTestBase { + + private static final long WAIT_TIMEOUT_MS = 120_000L; + private static final Pattern VERTEX_PATTERN = + Pattern.compile( + "\\\"id\\\"\\s*:\\s*\\\"([^\\\"]+)\\\"[^{}]*" + + "\\\"name\\\"\\s*:\\s*\\\"[^\\\"]*" + + "Writer\\(write-only\\)\\s*:\\s*pip30_sink[^\\\"]*\\\""); + private static final Pattern INTEGER_PATTERN = Pattern.compile("(\\d+)"); + + public PaimonWriterCoordinatorE2eTest() { + super(false, false, false, 2); + } + + @Override + protected String flinkEnvFile() { + return "flink-pwc.env"; + } + + @Test + public void testCheckpointCommitWithWriterCoordinator() throws Exception { + TestContext context = createContext(); + writeRecords(context.inputDirectory, 0, 20); + + String jobId = submit(context); + waitForJobStatus(jobId, "RUNNING"); + waitForWriterSubtasks(jobId); + waitForRecords(); + triggerAndWaitForCompletedCheckpoint(jobId); + + assertThat(rest("GET", "/jobs/" + jobId + "/plan", null)) + .doesNotContain("Committer", "Compact Coordinator", "Compact Worker"); + waitUntil( + () -> jobManager.getLogs().contains("Paimon writer coordinator starting"), + "PWC did not start."); + + cancel(jobId); + assertTable(context, 0, 20); + } + + @Test + public void testPartialCheckpointAbortIsRecoveredByNextCheckpoint() throws Exception { + TestContext context = createContext(); + writeRecords(context.inputDirectory, 0, 20); + + // 先启动流作业并完成一次正常 checkpoint,确保 coordinator 已经完成一次提交。 + String jobId = submit(context); + waitForJobStatus(jobId, "RUNNING"); + waitForWriterSubtasks(jobId); + waitForRecords(); + triggerAndWaitForCompletedCheckpoint(jobId); + + // 记录 writer 两个 subtask 的 attempt 和所在 TM,后面用来确认失败 checkpoint 不会重启任务。 + String writerVertexId = findWriterVertexId(jobId); + Map before = waitForWriterSubtasks(jobId); + assertThat(before).hasSize(2); + assertThat(before.get(0).host).isNotEqualTo(before.get(1).host); + + // 写入第二批数据,然后暂停其中一个 writer 所在 TM,让 checkpoint 进入部分完成状态。 + 
writeRecords(context.inputDirectory, 20, 20); + waitForRecords(); + ContainerState pausedTaskManager = findTaskManager(before.get(1)); + int failedBefore = checkpointCount(jobId, "failed"); + boolean paused = false; + try { + pausedTaskManager + .getDockerClient() + .pauseContainerCmd(pausedTaskManager.getContainerId()) + .exec(); + paused = true; + + triggerCheckpoint(jobId); + waitForPartialCheckpoint(jobId); + waitForCheckpointCount(jobId, "failed", failedBefore + 1); + } finally { + // 恢复被暂停的 TM,使后续 checkpoint 可以重新收齐所有 writer 的 committable。 + if (paused) { + pausedTaskManager + .getDockerClient() + .unpauseContainerCmd(pausedTaskManager.getContainerId()) + .exec(); + } + } + + // 再触发一次成功 checkpoint,验证前一次 abort 的部分提交信息能被下一次 checkpoint 恢复。 + waitForJobStatus(jobId, "RUNNING"); + triggerAndWaitForDataCommitted(jobId, context); + Map after = getSubtaskAttempts(jobId, writerVertexId); + assertThat(after.get(0).attempt).isEqualTo(before.get(0).attempt); + assertThat(after.get(1).attempt).isEqualTo(before.get(1).attempt); + + // 取消流作业释放 slot 后,用 batch query 校验两批数据最终一致。 + cancel(jobId); + assertTable(context, 0, 40); + } + + @Test + public void testTaskManagerFailureRestoresOnlyAffectedRegion() throws Exception { + TestContext context = createContext(); + writeRecords(context.inputDirectory, 0, 40); + + String jobId = submit(context); + waitForJobStatus(jobId, "RUNNING"); + waitForWriterSubtasks(jobId); + waitForRecords(); + triggerAndWaitForDataCommitted(jobId, context); + + String writerVertexId = findWriterVertexId(jobId); + assertSourceAndWriterAreChained(jobId); + Map before = waitForWriterSubtasks(jobId); + assertThat(before).hasSize(2); + assertThat(before.get(0).host).isNotEqualTo(before.get(1).host); + + ContainerState failedTaskManager = findTaskManager(before.get(0)); + failedTaskManager + .getDockerClient() + .restartContainerCmd(failedTaskManager.getContainerId()) + .withTimeout(10) + .exec(); + + waitUntil( + () -> { + Map attempts = + 
getSubtaskAttempts(jobId, writerVertexId); + return attempts.size() == 2 + && attempts.values().stream() + .allMatch(attempt -> "RUNNING".equals(attempt.status)) + && attempts.get(0).attempt > before.get(0).attempt + && "RUNNING".equals(jobStatus(jobId)); + }, + "The affected writer region did not recover."); + + Map after = getSubtaskAttempts(jobId, writerVertexId); + assertThat(after.get(0).attempt).isGreaterThan(before.get(0).attempt); + assertThat(after.get(1).attempt).isEqualTo(before.get(1).attempt); + + triggerAndWaitForDataCommitted(jobId, context); + cancel(jobId); + assertTable(context, 0, 40); + } + + @Test + public void testSavepointRestoreReplaysPendingFileInfo() throws Exception { + TestContext context = createContext(); + writeRecords(context.inputDirectory, 0, 20); + + String firstJobId = submit(context); + waitForJobStatus(firstJobId, "RUNNING"); + waitForWriterSubtasks(firstJobId); + waitForRecords(); + String savepoint = cancelWithSavepoint(firstJobId); + + String restoredJobId = submit(context, savepoint); + waitForJobStatus(restoredJobId, "RUNNING"); + waitForWriterSubtasks(restoredJobId); + writeRecords(context.inputDirectory, 20, 20); + waitForRecords(); + triggerAndWaitForDataCommitted(restoredJobId, context); + + waitUntil( + () -> + countOccurrences(jobManager.getLogs(), "Paimon writer coordinator starting") + >= 2, + "PWC was not recreated after savepoint restore."); + + cancel(restoredJobId); + assertTable(context, 0, 40); + } + + private TestContext createContext() { + String id = UUID.randomUUID().toString().replace("-", ""); + String inputDirectory = "pip30-input-" + id; + String inputPath = TEST_DATA_DIR + "/" + inputDirectory; + String warehouse = TEST_DATA_DIR + "/pip30-" + id; + + String catalogDdl = + String.format( + "CREATE CATALOG pip30_catalog WITH (\n" + + " 'type' = 'paimon',\n" + + " 'warehouse' = '%s'\n" + + ");", + warehouse); + String sourceDdl = + String.format( + "CREATE TEMPORARY TABLE pip30_source (\n" + + " 
sequence_id BIGINT,\n" + + " payload STRING\n" + + ") WITH (\n" + + " 'connector' = 'filesystem',\n" + + " 'path' = '%s',\n" + + " 'format' = 'csv',\n" + + " 'source.monitor-interval' = '1 s'\n" + + ");", + inputPath); + String tableDdl = + "CREATE TABLE IF NOT EXISTS pip30_sink (\n" + + " sequence_id BIGINT,\n" + + " payload STRING\n" + + ") WITH (\n" + + " 'bucket' = '-1',\n" + + " 'write-only' = 'true',\n" + + " 'sink.committer-coordinator-operator.enabled' = 'true'\n" + + ");"; + return new TestContext( + inputDirectory, + catalogDdl, + sourceDdl, + tableDdl, + warehouse + "/default.db/pip30_sink"); + } + + private String submit(TestContext context) throws Exception { + return submit(context, null); + } + + private String submit(TestContext context, String savepoint) throws Exception { + String restore = + savepoint == null + ? "" + : String.format("SET 'execution.savepoint.path' = '%s';\n", savepoint); + return runStreamingSql( + "INSERT INTO pip30_sink SELECT * FROM pip30_source;", + "SET 'parallelism.default' = '2';\n" + + "SET 'execution.checkpointing.interval' = '1 d';\n" + + "SET 'execution.checkpointing.timeout' = '10 s';\n" + + "SET 'execution.checkpointing.tolerable-failed-checkpoints' = '1';\n" + + "SET 'restart-strategy' = 'fixed-delay';\n" + + "SET 'restart-strategy.fixed-delay.attempts' = '10';\n" + + "SET 'restart-strategy.fixed-delay.delay' = '1 s';\n" + + restore, + context.catalogDdl, + "USE CATALOG pip30_catalog;", + context.tableDdl, + context.sourceDdl); + } + + private void writeRecords(String inputDirectory, int start, int count) throws Exception { + StringBuilder records = new StringBuilder(); + for (int i = start; i < start + count; i++) { + records.append(i).append(",value-").append(i).append('\n'); + } + writeSharedFile(inputDirectory + "/" + UUID.randomUUID() + ".csv", records.toString()); + } + + private void waitForRecords() throws InterruptedException { + Thread.sleep(2_000L); + } + + private void assertTable(TestContext context, 
int start, int end) throws Exception { + String resultDirectory = "pip30-result-" + UUID.randomUUID(); + String resultPath = TEST_DATA_DIR + "/" + resultDirectory; + runBatchSql( + "INSERT INTO pip30_result SELECT sequence_id, payload FROM pip30_sink;", + context.catalogDdl, + "USE CATALOG pip30_catalog;", + context.tableDdl, + String.format( + "CREATE TEMPORARY TABLE pip30_result (\n" + + " sequence_id BIGINT,\n" + + " payload STRING\n" + + ") WITH (\n" + + " 'connector' = 'filesystem',\n" + + " 'path' = '%s',\n" + + " 'format' = 'csv'\n" + + ");", + resultPath)); + + Map expected = new HashMap<>(); + for (int i = start; i < end; i++) { + expected.compute(i + ",value-" + i, (k, v) -> (v == null ? 0 : v) + 1); + } + assertThat(readRows(resultPath)).isEqualTo(expected); + } + + private Map readRows(String path) throws Exception { + Container.ExecResult result = + jobManager.execInContainer( + "bash", + "-c", + "if [ -d " + + path + + " ]; then find " + + path + + " -type f ! -name '.*' -exec cat {} +; fi"); + assertCommandSucceeded("read result files", result); + + Map rows = new HashMap<>(); + for (String row : result.getStdout().split("\\R")) { + if (!row.trim().isEmpty()) { + rows.compute(row.trim(), (k, v) -> (v == null ? 
0 : v) + 1); + } + } + return rows; + } + + private void triggerAndWaitForDataCommitted(String jobId, TestContext context) + throws Exception { + waitUntil( + () -> { + triggerAndWaitForCompletedCheckpoint(jobId); + return latestSnapshotRecordCount(context) >= 40; + }, + "Paimon data was not committed."); + } + + private long latestSnapshotRecordCount(TestContext context) throws Exception { + Container.ExecResult result = + jobManager.execInContainer( + "bash", + "-c", + "latest=$(ls " + + context.tableDirectory + + "/snapshot/snapshot-* 2>/dev/null | sort -V | tail -1); " + + "[ -n \"$latest\" ] && sed -n 's/.*\"totalRecordCount\"[ ]*:[ ]*\\([0-9][0-9]*\\).*/\\1/p' \"$latest\""); + if (result.getExitCode() != 0) { + return 0L; + } + String recordCount = result.getStdout().trim(); + if (recordCount.isEmpty()) { + return 0L; + } + return Long.parseLong(recordCount); + } + + private String findWriterVertexId(String jobId) throws Exception { + String details = rest("GET", "/jobs/" + jobId, null); + Matcher matcher = VERTEX_PATTERN.matcher(details); + if (!matcher.find()) { + throw new AssertionError("Cannot find writer vertex in job details: " + details); + } + return matcher.group(1); + } + + private void assertSourceAndWriterAreChained(String jobId) throws Exception { + String plan = rest("GET", "/jobs/" + jobId + "/plan", null); + assertThat(plan) + .withFailMessage( + "Source and writer must be chained in one parallel vertex to verify" + + " subtask-level region failover.%nPlan:%n%s", + plan) + .contains( + "\"parallelism\":2", + "TableSourceScan(table=[[pip30_catalog, default, pip30_source]]", + "Writer(write-only) : pip30_sink"); + } + + private Map waitForWriterSubtasks(String jobId) throws Exception { + String writerVertexId = findWriterVertexId(jobId); + waitUntil( + () -> { + Map attempts = + getSubtaskAttempts(jobId, writerVertexId); + return attempts.size() == 2 + && attempts.values().stream() + .allMatch(attempt -> "RUNNING".equals(attempt.status)); + }, 
+ "Writer subtasks did not become available."); + return getSubtaskAttempts(jobId, writerVertexId); + } + + private Map getSubtaskAttempts(String jobId, String vertexId) + throws Exception { + String details = rest("GET", "/jobs/" + jobId + "/vertices/" + vertexId, null); + Map attempts = new HashMap<>(); + String[] subtasks = details.split("\\\"subtask\\\"\\s*:"); + for (int i = 1; i < subtasks.length; i++) { + String subtask = subtasks[i]; + Integer index = firstInteger(subtask); + Integer attempt = integerField(subtask, "attempt"); + String status = stringField(subtask, "status"); + String host = stringField(subtask, "host"); + if (host == null) { + // support for Flink 2.2 REST API + host = stringField(subtask, "endpoint"); + } + if (index != null && attempt != null && status != null && host != null) { + attempts.put(index, new SubtaskAttempt(attempt, status, host)); + } + } + return attempts; + } + + private ContainerState findTaskManager(SubtaskAttempt attempt) { + String normalizedHost = attempt.host.replace('_', '-'); + for (int i = 1; i <= 2; i++) { + ContainerState taskManager = + environment.getContainerByServiceName("taskmanager-" + i).get(); + boolean hostnameMatches = + normalizedHost.endsWith("-taskmanager-" + i) + || normalizedHost.contains("-taskmanager-" + i + ".") + || normalizedHost.contains("-taskmanager-" + i + ":"); + boolean ipMatches = + taskManager.getContainerInfo().getNetworkSettings().getNetworks().values() + .stream() + .anyMatch( + network -> + attempt.host.startsWith(network.getIpAddress() + ":") + || attempt.host.equals(network.getIpAddress())); + if (hostnameMatches || ipMatches) { + return taskManager; + } + } + throw new AssertionError("Cannot map writer host to TaskManager: " + attempt.host); + } + + private void triggerAndWaitForCompletedCheckpoint(String jobId) throws Exception { + int completedBefore = checkpointCount(jobId, "completed"); + triggerCheckpoint(jobId); + waitForCheckpointCount(jobId, "completed", completedBefore 
+ 1); + } + + private void triggerCheckpoint(String jobId) throws Exception { + rest("POST", "/jobs/" + jobId + "/checkpoints", "{}"); + } + + private int checkpointCount(String jobId, String field) throws Exception { + String checkpoints = rest("GET", "/jobs/" + jobId + "/checkpoints", null); + Matcher counts = Pattern.compile("\\\"counts\\\"\\s*:\\s*\\{([^}]*)}").matcher(checkpoints); + if (!counts.find()) { + throw new AssertionError("Checkpoint counts are missing: " + checkpoints); + } + Integer value = integerField(counts.group(1), field); + if (value == null) { + throw new AssertionError("Checkpoint count is missing for " + field); + } + return value; + } + + private void waitForCheckpointCount(String jobId, String field, int expected) throws Exception { + waitUntil( + () -> checkpointCount(jobId, field) >= expected, + "Checkpoint " + field + " count did not reach " + expected + '.'); + } + + private void waitForPartialCheckpoint(String jobId) throws Exception { + waitUntil( + () -> { + String checkpoints = rest("GET", "/jobs/" + jobId + "/checkpoints", null); + Matcher inProgress = + Pattern.compile( + "\\\"status\\\"\\s*:\\s*\\\"IN_PROGRESS\\\"([\\s\\S]{0,600})") + .matcher(checkpoints); + while (inProgress.find()) { + Integer acknowledged = + integerField(inProgress.group(1), "num_acknowledged_subtasks"); + if (acknowledged != null && acknowledged > 0) { + return true; + } + } + return false; + }, + "Checkpoint did not receive a partial writer snapshot."); + } + + private void waitForJobStatus(String jobId, String expected) throws Exception { + final String[] lastStatus = new String[1]; + waitUntil( + () -> { + lastStatus[0] = jobStatus(jobId); + return expected.equals(lastStatus[0]); + }, + "Job " + + jobId + + " did not reach status " + + expected + + ", last status was " + + lastStatus[0] + + ". 
Exceptions: " + + rest("GET", "/jobs/" + jobId + "/exceptions", null)); + } + + private String jobStatus(String jobId) throws Exception { + return stringField(rest("GET", "/jobs/" + jobId, null), "state"); + } + + private void cancel(String jobId) throws Exception { + Container.ExecResult result = + jobManager.execInContainerWithUser("flink", "bin/flink", "cancel", jobId); + assertCommandSucceeded("cancel job", result); + waitForJobStatus(jobId, "CANCELED"); + } + + private String cancelWithSavepoint(String jobId) throws Exception { + String directory = TEST_DATA_DIR + "/savepoints-" + UUID.randomUUID(); + Container.ExecResult mkdir = + jobManager.execInContainerWithUser("flink", "mkdir", "-p", directory); + assertCommandSucceeded("create savepoint directory", mkdir); + + Container.ExecResult result = + jobManager.execInContainerWithUser( + "flink", "bin/flink", "cancel", "-s", directory, jobId); + assertCommandSucceeded("cancel job with savepoint", result); + + String output = result.getStdout() + '\n' + result.getStderr(); + Matcher path = + Pattern.compile("Path:\\s*(\\S+)|Savepoint stored in\\s+(\\S+)\\.").matcher(output); + + if (!path.find()) { + throw new AssertionError( + "Cannot find savepoint path.\nstdout:\n" + + result.getStdout() + + "\nstderr:\n" + + result.getStderr()); + } + + String savepointPath = path.group(1) != null ? 
path.group(1) : path.group(2); + waitForJobStatus(jobId, "CANCELED"); + return savepointPath; + } + + private void assertCommandSucceeded(String command, Container.ExecResult result) { + assertThat(result.getExitCode()) + .withFailMessage( + "%s failed with exit code %s.\nstdout:\n%s\nstderr:\n%s", + command, result.getExitCode(), result.getStdout(), result.getStderr()) + .isZero(); + } + + private String rest(String method, String path, String body) throws Exception { + HttpURLConnection connection = + (HttpURLConnection) new URL(flinkRestUrl() + path).openConnection(); + connection.setRequestMethod(method); + connection.setConnectTimeout(10_000); + connection.setReadTimeout(30_000); + if (body != null) { + connection.setDoOutput(true); + connection.setRequestProperty("Content-Type", "application/json"); + try (OutputStream output = connection.getOutputStream()) { + output.write(body.getBytes(StandardCharsets.UTF_8)); + } + } + + int responseCode = connection.getResponseCode(); + InputStream input = + responseCode >= 200 && responseCode < 300 + ? connection.getInputStream() + : connection.getErrorStream(); + String response = read(input); + connection.disconnect(); + if (responseCode < 200 || responseCode >= 300) { + throw new IOException( + "Flink REST " + + method + + ' ' + + path + + " failed: " + + responseCode + + ' ' + + response); + } + return response; + } + + private static String read(InputStream input) throws IOException { + if (input == null) { + return ""; + } + StringBuilder result = new StringBuilder(); + try (BufferedReader reader = + new BufferedReader(new InputStreamReader(input, StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + result.append(line); + } + } + return result.toString(); + } + + private static Integer firstInteger(String value) { + Matcher matcher = INTEGER_PATTERN.matcher(value); + return matcher.find() ? 
Integer.parseInt(matcher.group(1)) : null; + } + + private static Integer integerField(String value, String field) { + Matcher matcher = + Pattern.compile("\\\"" + Pattern.quote(field) + "\\\"\\s*:\\s*(\\d+)") + .matcher(value); + return matcher.find() ? Integer.parseInt(matcher.group(1)) : null; + } + + private static String stringField(String value, String field) { + Matcher matcher = + Pattern.compile("\\\"" + Pattern.quote(field) + "\\\"\\s*:\\s*\\\"([^\\\"]+)\\\"") + .matcher(value); + return matcher.find() ? matcher.group(1) : null; + } + + private static int countOccurrences(String value, String expected) { + int count = 0; + int index = 0; + while ((index = value.indexOf(expected, index)) >= 0) { + count++; + index += expected.length(); + } + return count; + } + + private static void waitUntil(CheckedBooleanSupplier condition, String failureMessage) + throws Exception { + long deadline = System.currentTimeMillis() + WAIT_TIMEOUT_MS; + Throwable lastFailure = null; + while (System.currentTimeMillis() < deadline) { + try { + if (condition.getAsBoolean()) { + return; + } + } catch (Throwable t) { + lastFailure = t; + } + Thread.sleep(200L); + } + AssertionError error = new AssertionError(failureMessage); + if (lastFailure != null) { + error.initCause(lastFailure); + } + throw error; + } + + @FunctionalInterface + private interface CheckedBooleanSupplier { + + boolean getAsBoolean() throws Exception; + } + + private static class TestContext { + + private final String inputDirectory; + private final String catalogDdl; + private final String sourceDdl; + private final String tableDdl; + private final String tableDirectory; + + private TestContext( + String inputDirectory, + String catalogDdl, + String sourceDdl, + String tableDdl, + String tableDirectory) { + this.inputDirectory = inputDirectory; + this.catalogDdl = catalogDdl; + this.sourceDdl = sourceDdl; + this.tableDdl = tableDdl; + this.tableDirectory = tableDirectory; + } + } + + private static class 
SubtaskAttempt { + + private final int attempt; + private final String status; + private final String host; + + private SubtaskAttempt(int attempt, String status, String host) { + this.attempt = attempt; + this.status = status; + this.host = host; + } + + @Override + public String toString() { + return "SubtaskAttempt{" + + "attempt=" + + attempt + + ", status='" + + status + + '\'' + + ", host='" + + host + + '\'' + + '}'; + } + } +} diff --git a/paimon-e2e-tests/src/test/resources-filtered/docker-compose.yaml b/paimon-e2e-tests/src/test/resources-filtered/docker-compose.yaml index c9d579fb5657..04f7857b0a38 100644 --- a/paimon-e2e-tests/src/test/resources-filtered/docker-compose.yaml +++ b/paimon-e2e-tests/src/test/resources-filtered/docker-compose.yaml @@ -41,7 +41,7 @@ services: /docker-entrypoint.sh jobmanager " env_file: - - ./flink.env + - ./$FLINK_ENV_FILE networks: testnetwork: aliases: @@ -66,7 +66,7 @@ services: /docker-entrypoint.sh taskmanager " env_file: - - ./flink.env + - ./$FLINK_ENV_FILE networks: testnetwork: aliases: diff --git a/paimon-e2e-tests/src/test/resources/flink-pwc.env b/paimon-e2e-tests/src/test/resources/flink-pwc.env new file mode 100644 index 000000000000..a0d1683d9659 --- /dev/null +++ b/paimon-e2e-tests/src/test/resources/flink-pwc.env @@ -0,0 +1,19 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +FLINK_PROPERTIES="jobmanager.rpc.address: jobmanager\ntaskmanager.numberOfTaskSlots: 1\nparallelism.default: 3\ncluster.evenly-spread-out-slots: true\nsql-client.execution.result-mode: TABLEAU\nenv.java.opts.taskmanager: -verbose:gc -Xloggc:/opt/flink/log/gc.log\nexecution.checkpointing.checkpoints-after-tasks-finish.enabled: true" diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkConnectorOptions.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkConnectorOptions.java index 678471ea337c..8577e09e46f5 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkConnectorOptions.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkConnectorOptions.java @@ -365,6 +365,13 @@ public class FlinkConnectorOptions { .withDescription( "Allow sink committer and writer operator to be chained together"); + public static final ConfigOption SINK_COMMITTER_COORDINATOR_OPERATOR_ENABLED = + key("sink.committer-coordinator-operator.enabled") + .booleanType() + .defaultValue(false) + .withDescription( + "Allow coordinator replace committer operator, only support for append table now."); + public static final ConfigOption PARTITION_MARK_DONE_MODE = key("partition.mark-done-action.mode") .enumType(PartitionMarkDoneActionMode.class) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/AppendTableSink.java 
b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/AppendTableSink.java index 6f33a5e45f02..a700757a1974 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/AppendTableSink.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/AppendTableSink.java @@ -32,12 +32,16 @@ import org.apache.flink.api.java.typeutils.TupleTypeInfo; import org.apache.flink.configuration.ExecutionOptions; import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.api.functions.sink.v2.DiscardingSink; import javax.annotation.Nullable; import java.util.Map; +import static org.apache.paimon.flink.FlinkConnectorOptions.PRECOMMIT_COMPACT; +import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_COMMITTER_COORDINATOR_OPERATOR_ENABLED; import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_MANAGED_WRITER_BUFFER_MEMORY; import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_USE_MANAGED_MEMORY; import static org.apache.paimon.flink.utils.ManagedMemoryUtils.declareManagedMemory; @@ -132,4 +136,17 @@ public DataStream doWrite( return written; } + + @Override + public DataStreamSink doCommit(DataStream written, String commitUser) { + Options options = Options.fromMap(table.options()); + if (options.get(SINK_COMMITTER_COORDINATOR_OPERATOR_ENABLED) + && !options.get(PRECOMMIT_COMPACT)) { + return written.sinkTo(new DiscardingSink<>()) + .name("end") + .setParallelism(written.getParallelism()); + } else { + return super.doCommit(written, commitUser); + } + } } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitHandler.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitHandler.java new file mode 100644 index 
000000000000..200e3a9ce150 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitHandler.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.sink; + +import org.apache.paimon.flink.sink.coordinator.CommitCompleteEvent; + +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; + +/** Handles writer-side interactions with commit coordinator. 
*/ +public class CommitHandler { + + public static final CommitHandler EMPTY = new CommitHandler(); + + public void initialize( + StateInitializationContext context, int subtaskId, int attemptNumber, String commitUser) + throws Exception {} + + public void processWatermark(long watermark) {} + + public void snapshot(StateSnapshotContext context) throws Exception {} + + public void handleCommittables(long checkpointId) {} + + public boolean requiresStableCommitUser() { + return false; + } + + public boolean collect(Committable committable) { + return false; + } + + public boolean handleOperatorEvent(OperatorEvent event) { + return event instanceof CommitCompleteEvent; + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitterCoordinatedFactory.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitterCoordinatedFactory.java new file mode 100644 index 000000000000..923037648025 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CommitterCoordinatedFactory.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.flink.sink; + +import org.apache.paimon.flink.sink.coordinator.PaimonWriterCoordinator; + +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.streaming.api.operators.CoordinatedOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; + +/** Factory that installs a JM-side committer coordinator for writer operators. */ +public class CommitterCoordinatedFactory + extends PrepareCommitOperator.Factory + implements CoordinatedOperatorFactory { + + private static final long serialVersionUID = 1L; + + private final boolean streamingCheckpointEnabled; + private final TableWriteOperator.Factory writeFactory; + private final Committer.Factory committerFactory; + private final String initialCommitUser; + private final Long endInputWatermark; + + public CommitterCoordinatedFactory( + boolean streamingCheckpointEnabled, + TableWriteOperator.Factory writeFactory, + Committer.Factory committerFactory, + String initialCommitUser, + Long endInputWatermark) { + super(writeFactory.options); + this.streamingCheckpointEnabled = streamingCheckpointEnabled; + this.writeFactory = writeFactory; + this.committerFactory = committerFactory; + this.initialCommitUser = initialCommitUser; + this.endInputWatermark = endInputWatermark; + } + + @Override + @SuppressWarnings("unchecked") + public > T createStreamOperator( + StreamOperatorParameters parameters) { + OperatorID operatorId = parameters.getStreamConfig().getOperatorID(); + TaskOperatorEventGateway gateway = + parameters + .getContainingTask() + .getEnvironment() + .getOperatorCoordinatorEventGateway(); + TableWriteOperator operator = writeFactory.createStreamOperator(parameters); + operator.setCommitHandler(new 
CoordinatedCommitHandler(gateway, operatorId)); + parameters.getOperatorEventDispatcher().registerEventHandler(operatorId, operator); + return (T) operator; + } + + @Override + @SuppressWarnings("rawtypes") + public Class getStreamOperatorClass(ClassLoader classLoader) { + return writeFactory.getStreamOperatorClass(classLoader); + } + + @Override + public OperatorCoordinator.Provider getCoordinatorProvider( + String operatorName, OperatorID operatorID) { + return new PaimonWriterCoordinator.WriterCoordinatorProvider( + streamingCheckpointEnabled, + operatorName, + operatorID, + initialCommitUser, + committerFactory, + endInputWatermark); + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CoordinatedCommitHandler.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CoordinatedCommitHandler.java new file mode 100644 index 000000000000..258625cebf86 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/CoordinatedCommitHandler.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.flink.sink; + +import org.apache.paimon.flink.sink.coordinator.CommitCompleteEvent; +import org.apache.paimon.flink.sink.coordinator.CoordinatedCommittableState; +import org.apache.paimon.flink.sink.coordinator.CoordinatedFileInfoSender; + +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; + +import java.util.List; + +import static org.apache.paimon.flink.sink.coordinator.CommitterCoordinator.END_INPUT_CHECKPOINT_ID; + +/** PIP-30 writer-side handler for the Paimon writer coordinator. */ +public class CoordinatedCommitHandler extends CommitHandler { + + private final CoordinatedFileInfoSender sender; + + private transient CoordinatedCommittableState state; + + public CoordinatedCommitHandler(TaskOperatorEventGateway gateway, OperatorID operatorId) { + this.sender = new CoordinatedFileInfoSender(gateway, operatorId); + } + + @Override + public void initialize( + StateInitializationContext context, int subtaskId, int attemptNumber, String commitUser) + throws Exception { + sender.setSubtaskId(subtaskId); + sender.setAttemptNumber(attemptNumber); + state = new CoordinatedCommittableState(); + state.initialize(context); + if (context.isRestored()) { + long restoredCheckpointId = + context.getRestoredCheckpointId() + .orElseThrow( + () -> + new IllegalStateException( + "Restored checkpoint id is missing.")); + List committables = state.committables(); + sender.sendRecoveredFileInfoToCoordinator( + restoredCheckpointId, commitUser, committables); + state.markAcknowledged(committables); + } + } + + @Override + public void processWatermark(long watermark) { + sender.processWatermark(watermark); + } + + @Override + public void snapshot(StateSnapshotContext context) 
throws Exception { + if (state != null) { + state.snapshot(context.getCheckpointId()); + } + if (!sender.isEndInput()) { + sendUnacknowledgedCommittables(context.getCheckpointId()); + } + } + + @Override + public void handleCommittables(long checkpointId) { + if (checkpointId == END_INPUT_CHECKPOINT_ID) { + sendUnacknowledgedCommittables(checkpointId); + } + } + + @Override + public boolean requiresStableCommitUser() { + return true; + } + + @Override + public boolean collect(Committable committable) { + if (state != null) { + state.add(committable); + } + return true; + } + + @Override + public boolean handleOperatorEvent(OperatorEvent event) { + if (event instanceof CommitCompleteEvent) { + if (state != null) { + state.markCommittedUpTo(((CommitCompleteEvent) event).checkpointId()); + } + return true; + } + return false; + } + + private void sendUnacknowledgedCommittables(long checkpointId) { + if (state == null) { + return; + } + + List committables = state.unacknowledgedCommittables(); + sender.sendToCoordinator(checkpointId, committables); + state.markAcknowledged(committables); + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSink.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSink.java index 959132ad58e0..edd5fff5e9c3 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSink.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSink.java @@ -126,28 +126,30 @@ public DataStream doWrite( DataStream input, String commitUser, @Nullable Integer parallelism) { StreamExecutionEnvironment env = input.getExecutionEnvironment(); boolean isStreaming = isStreaming(input); + Options options = Options.fromMap(table.options()); + boolean streamingCheckpointEnabled = + isStreaming && env.getCheckpointConfig().isCheckpointingEnabled(); boolean writeOnly = table.coreOptions().writeOnly(); 
SingleOutputStreamOperator written = input.transform( (writeOnly ? WRITER_WRITE_ONLY_NAME : WRITER_NAME) + " : " + table.name(), new CommittableTypeInfo(), - createWriteOperatorFactory( + createWriteCoordinatorFactory( StoreSinkWrite.createWriteProvider( table, env.getCheckpointConfig(), isStreaming, ignorePreviousFiles, hasSinkMaterializer(input)), - commitUser)); + commitUser, + streamingCheckpointEnabled)); if (parallelism == null) { forwardParallelism(written, input); } else { written.setParallelism(parallelism); } - Options options = Options.fromMap(table.options()); - String uidSuffix = options.get(SINK_OPERATOR_UID_SUFFIX); if (options.get(SINK_OPERATOR_UID_SUFFIX) != null) { written = written.uid(generateCustomUid(WRITER_NAME, table.name(), uidSuffix)); @@ -307,6 +309,11 @@ public static void assertBatchAdaptiveParallelism( protected abstract OneInputStreamOperatorFactory createWriteOperatorFactory( StoreSinkWrite.Provider writeProvider, String commitUser); + protected OneInputStreamOperatorFactory createWriteCoordinatorFactory( + StoreSinkWrite.Provider writeProvider, String commitUser, boolean isStreaming) { + return createWriteOperatorFactory(writeProvider, commitUser); + } + protected abstract Committer.Factory createCommitterFactory(); protected abstract CommittableStateManager createCommittableStateManager(); diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/PrepareCommitOperator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/PrepareCommitOperator.java index 35f5ff15b9ae..874b0850a9d2 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/PrepareCommitOperator.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/PrepareCommitOperator.java @@ -112,8 +112,14 @@ public void close() throws Exception { } private void emitCommittables(boolean waitCompaction, long checkpointId) throws IOException { - 
prepareCommit(waitCompaction, checkpointId) - .forEach(committable -> output.collect(new StreamRecord<>(committable))); + prepareCommit(waitCompaction, checkpointId).forEach(this::collect); + handleCommittables(checkpointId); + } + + protected void handleCommittables(long checkpointId) {} + + protected void collect(OUT committable) { + output.collect(new StreamRecord<>(committable)); } protected abstract List prepareCommit(boolean waitCompaction, long checkpointId) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/RowAppendTableSink.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/RowAppendTableSink.java index 6e3272f9e0e9..793cdeaabfad 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/RowAppendTableSink.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/RowAppendTableSink.java @@ -20,12 +20,17 @@ import org.apache.paimon.data.InternalRow; import org.apache.paimon.manifest.ManifestCommittable; +import org.apache.paimon.options.Options; import org.apache.paimon.table.FileStoreTable; import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; import java.util.Map; +import static org.apache.paimon.flink.FlinkConnectorOptions.END_INPUT_WATERMARK; +import static org.apache.paimon.flink.FlinkConnectorOptions.PRECOMMIT_COMPACT; +import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_COMMITTER_COORDINATOR_OPERATOR_ENABLED; + /** An {@link AppendTableSink} which handles {@link InternalRow}. 
*/ public class RowAppendTableSink extends AppendTableSink { @@ -46,4 +51,24 @@ protected OneInputStreamOperatorFactory createWriteOpe protected CommittableStateManager createCommittableStateManager() { return createRestoreOnlyCommittableStateManager(table); } + + @Override + @SuppressWarnings("unchecked") + protected OneInputStreamOperatorFactory createWriteCoordinatorFactory( + StoreSinkWrite.Provider writeProvider, String commitUser, boolean isStreaming) { + Options options = table.coreOptions().toConfiguration(); + boolean coordinatorEnabled = + options.get(SINK_COMMITTER_COORDINATOR_OPERATOR_ENABLED) + && !options.get(PRECOMMIT_COMPACT); + return coordinatorEnabled + ? new CommitterCoordinatedFactory( + isStreaming, + (TableWriteOperator.Factory) + createNoStateRowWriteOperatorFactory( + table, writeProvider, commitUser), + createCommitterFactory(), + commitUser, + options.get(END_INPUT_WATERMARK)) + : createWriteOperatorFactory(writeProvider, commitUser); + } } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/StatelessRowDataStoreWriteOperator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/StatelessRowDataStoreWriteOperator.java index eac883b56d9b..69318a07f75b 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/StatelessRowDataStoreWriteOperator.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/StatelessRowDataStoreWriteOperator.java @@ -47,7 +47,11 @@ protected StoreSinkWriteState createState( } @Override - protected String getCommitUser(StateInitializationContext context) { + protected String getCommitUser(StateInitializationContext context) throws Exception { + if (commitHandler.requiresStableCommitUser()) { + // PWC requires the commit user to remain stable across recovery. 
+ return super.getCommitUser(context); + } // No conflicts will occur in append only unaware bucket writer, so // commitUser does not matter. return commitUser == null ? initialCommitUser : commitUser; diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/TableWriteOperator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/TableWriteOperator.java index f93bdfb560dd..c672604eb5e9 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/TableWriteOperator.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/TableWriteOperator.java @@ -31,12 +31,15 @@ import org.apache.flink.runtime.jobgraph.OperatorID; import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway; import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.coordination.OperatorEventHandler; import org.apache.flink.runtime.state.StateInitializationContext; import org.apache.flink.runtime.state.StateSnapshotContext; import org.apache.flink.streaming.api.operators.CoordinatedOperatorFactory; import org.apache.flink.streaming.api.operators.StreamOperator; import org.apache.flink.streaming.api.operators.StreamOperatorFactory; import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.api.watermark.Watermark; import javax.annotation.Nullable; @@ -44,11 +47,13 @@ import java.util.List; /** An abstract class for table write operator. 
*/ -public abstract class TableWriteOperator extends PrepareCommitOperator { +public abstract class TableWriteOperator extends PrepareCommitOperator + implements OperatorEventHandler { private static final long serialVersionUID = 1L; protected FileStoreTable table; + protected CommitHandler commitHandler = CommitHandler.EMPTY; protected final StoreSinkWrite.Provider storeSinkWriteProvider; protected final String initialCommitUser; @@ -77,6 +82,12 @@ public void initializeState(StateInitializationContext context) throws Exception int numTasks = RuntimeContextUtils.getNumberOfParallelSubtasks(getRuntimeContext()); int subtaskId = RuntimeContextUtils.getIndexOfThisSubtask(getRuntimeContext()); + String currentCommitUser = getCommitUser(context); + commitHandler.initialize( + context, + subtaskId, + RuntimeContextUtils.getAttemptNumber(getRuntimeContext()), + currentCommitUser); StateValueFilter stateFilter = (tableName, partition, bucket) -> subtaskId == ChannelComputer.select(partition, bucket, numTasks); @@ -85,7 +96,7 @@ public void initializeState(StateInitializationContext context) throws Exception write = storeSinkWriteProvider.provide( table, - getCommitUser(context), + currentCommitUser, state, getContainingTask().getEnvironment().getIOManager(), memoryPoolFactory, @@ -100,6 +111,16 @@ public void setWriteRestore(@Nullable WriteRestore writeRestore) { this.writeRestore = writeRestore; } + public void setCommitHandler(CommitHandler commitHandler) { + this.commitHandler = commitHandler; + } + + @Override + public void processWatermark(Watermark mark) throws Exception { + super.processWatermark(mark); + commitHandler.processWatermark(mark.getTimestamp()); + } + protected StoreSinkWriteState createState( int subtaskId, StateInitializationContext context, @@ -127,6 +148,7 @@ public void snapshotState(StateSnapshotContext context) throws Exception { write.snapshotState(); state.snapshotState(); + commitHandler.snapshot(context); } @Override @@ -143,6 +165,26 @@ 
protected List prepareCommit(boolean waitCompaction, long checkpoin return write.prepareCommit(waitCompaction, checkpointId); } + @Override + protected void handleCommittables(long checkpointId) { + commitHandler.handleCommittables(checkpointId); + } + + @Override + protected void collect(Committable committable) { + if (!commitHandler.collect(committable)) { + super.collect(committable); + } + } + + @Override + public void handleOperatorEvent(OperatorEvent event) { + if (commitHandler.handleOperatorEvent(event)) { + return; + } + throw new IllegalArgumentException("Unsupported operator event: " + event.getClass()); + } + @VisibleForTesting public StoreSinkWrite getWrite() { return write; diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitCompleteEvent.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitCompleteEvent.java new file mode 100644 index 000000000000..e26d04e012f8 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitCompleteEvent.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.flink.runtime.operators.coordination.OperatorEvent; + +/** Coordinator event telling a writer that committables up to a checkpoint can be cleaned. */ +public class CommitCompleteEvent implements OperatorEvent { + + private static final long serialVersionUID = 1L; + + private final long checkpointId; + + public CommitCompleteEvent(long checkpointId) { + this.checkpointId = checkpointId; + } + + public long checkpointId() { + return checkpointId; + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitResult.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitResult.java new file mode 100644 index 000000000000..145e81ffcdc5 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitResult.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.sink.coordinator; + +/** Result of a PWC commit attempt. 
*/ +class CommitResult { + + static final CommitResult NONE = new CommitResult(false, 0, -1, false); + + private final boolean committed; + private final int committedCount; + private final long checkpointId; + private final boolean restoredCommit; + + CommitResult(boolean committed, int committedCount, long checkpointId, boolean restoredCommit) { + this.committed = committed; + this.committedCount = committedCount; + this.checkpointId = checkpointId; + this.restoredCommit = restoredCommit; + } + + boolean committed() { + return committed; + } + + int committedCount() { + return committedCount; + } + + long checkpointId() { + return checkpointId; + } + + boolean restoredCommit() { + return restoredCommit; + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitterCoordinator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitterCoordinator.java new file mode 100644 index 000000000000..f73244456898 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CommitterCoordinator.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.paimon.flink.sink.Committer; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.TreeMap; + +import static org.apache.paimon.utils.Preconditions.checkNotNull; + +/** JM-side global committer used by PaimonWriterCoordinator. */ +public class CommitterCoordinator { + + public static final long END_INPUT_CHECKPOINT_ID = Long.MAX_VALUE; + + private final boolean streamingCheckpointEnabled; + private final Committer.Factory committerFactory; + private final Long endInputWatermark; + private final NavigableMap committablesPerCheckpoint; + + private Committer committer; + private long globalWatermark; + private boolean endInput; + + public CommitterCoordinator( + boolean streamingCheckpointEnabled, + Committer.Factory committerFactory, + Long endInputWatermark) { + this.streamingCheckpointEnabled = streamingCheckpointEnabled; + this.committerFactory = checkNotNull(committerFactory); + this.endInputWatermark = endInputWatermark; + this.committablesPerCheckpoint = new TreeMap<>(); + this.globalWatermark = Long.MIN_VALUE; + } + + public void init(int parallelism, String commitUser) throws Exception { + this.globalWatermark = Long.MIN_VALUE; + this.endInput = false; + if (committer == null) { + committer = + committerFactory.create( + Committer.createContext( + commitUser, + null, + streamingCheckpointEnabled, + false, + null, + parallelism, + 0)); + } + } + + public void save(List committables, long checkpointId, long watermark) + throws Exception { + processWatermark(watermark); + pollInputs(committables); + if (checkpointId == END_INPUT_CHECKPOINT_ID) { + endInput(); + } + } + + private void pollInputs(Collection inputs) throws Exception { + Map> grouped = committer.groupByCheckpoint(inputs); + for (Map.Entry> entry : grouped.entrySet()) { + Long checkpoint = 
entry.getKey(); + List committables = entry.getValue(); + if (checkpoint != null + && checkpoint == END_INPUT_CHECKPOINT_ID + && committablesPerCheckpoint.containsKey(checkpoint)) { + GlobalCommitT merged = + committer.combine( + checkpoint, + globalWatermark, + committablesPerCheckpoint.get(checkpoint), + committables); + committablesPerCheckpoint.put(checkpoint, merged); + } else if (committablesPerCheckpoint.containsKey(checkpoint)) { + continue; + } else { + committablesPerCheckpoint.put( + checkpoint, committer.combine(checkpoint, globalWatermark, committables)); + } + } + } + + private void processWatermark(long watermark) { + if (watermark != Long.MAX_VALUE) { + globalWatermark = Math.max(globalWatermark, watermark); + } + } + + private void endInput() throws Exception { + endInput = true; + if (endInputWatermark != null) { + globalWatermark = endInputWatermark; + } + if (!streamingCheckpointEnabled) { + commitUpToCheckpoint(END_INPUT_CHECKPOINT_ID); + } + } + + public boolean isEndInput() { + return endInput && streamingCheckpointEnabled; + } + + public void notifyCheckpointComplete(long checkpointId) throws Exception { + commitUpToCheckpoint(endInput ? 
END_INPUT_CHECKPOINT_ID : checkpointId); + } + + public int filterAndCommitUpToCheckpoint(long checkpointId) throws Exception { + NavigableMap headMap = + committablesPerCheckpoint.headMap(checkpointId, true); + List committables = new ArrayList<>(headMap.values()); + if (committables.isEmpty() && committer.forceCreatingSnapshot()) { + committables = + Collections.singletonList( + committer.combine( + checkpointId, globalWatermark, Collections.emptyList())); + } + int committed = committer.filterAndCommit(committables, true, false); + headMap.clear(); + return committed; + } + + private void commitUpToCheckpoint(long checkpointId) throws Exception { + NavigableMap headMap = + committablesPerCheckpoint.headMap(checkpointId, true); + List committables = new ArrayList<>(headMap.values()); + if (committables.isEmpty() && committer.forceCreatingSnapshot()) { + committables = + Collections.singletonList( + committer.combine( + checkpointId, globalWatermark, Collections.emptyList())); + } + if (checkpointId == END_INPUT_CHECKPOINT_ID) { + committer.filterAndCommit(committables, false, true); + } else { + committer.commit(committables); + } + headMap.clear(); + } + + public void notifyCheckpointAborted(long checkpointId) { + // Checkpoint abort is not committable abort. Keep pending committables for a later + // completed checkpoint. 
+ } + + public void close() throws Exception { + committablesPerCheckpoint.clear(); + if (committer != null) { + committer.close(); + } + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableState.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableState.java new file mode 100644 index 000000000000..421a9315a709 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableState.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.paimon.flink.sink.Committable; +import org.apache.paimon.flink.sink.CommittableSerializer; +import org.apache.paimon.table.sink.CommitMessageSerializer; + +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeutils.base.array.BytePrimitiveArraySerializer; +import org.apache.flink.core.memory.DataInputDeserializer; +import org.apache.flink.core.memory.DataOutputViewStreamWrapper; +import org.apache.flink.runtime.state.StateInitializationContext; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + +/** Writer-side state for committables which have not been committed by PWC. */ +public class CoordinatedCommittableState { + + private static final String STATE_NAME = "pwc_pending_committables"; + + private ListState state; + private CheckpointCommittablesSerializer serializer; + private final NavigableMap> pendingCommittables = new TreeMap<>(); + private final Set acknowledgedCheckpoints = new TreeSet<>(); + + public void initialize(StateInitializationContext context) throws Exception { + CommittableSerializer committableSerializer = + new CommittableSerializer(new CommitMessageSerializer()); + serializer = new CheckpointCommittablesSerializer(committableSerializer); + state = + context.getOperatorStateStore() + .getListState( + new ListStateDescriptor<>( + STATE_NAME, BytePrimitiveArraySerializer.INSTANCE)); + for (byte[] bytes : state.get()) { + CheckpointCommittables checkpoint = serializer.deserialize(bytes); + pendingCommittables + .computeIfAbsent(checkpoint.checkpointId(), ignored -> new ArrayList<>()) + .addAll(checkpoint.committables()); + } + } + + public void add(Committable 
committable) { + pendingCommittables + .computeIfAbsent(committable.checkpointId(), ignored -> new ArrayList<>()) + .add(committable); + } + + public void snapshot(long checkpointId) throws Exception { + pendingCommittables.computeIfAbsent(checkpointId, ignored -> new ArrayList<>()); + List checkpoints = new ArrayList<>(); + for (Map.Entry> entry : pendingCommittables.entrySet()) { + checkpoints.add( + serializer.serialize( + new CheckpointCommittables( + entry.getKey(), new ArrayList<>(entry.getValue())))); + } + state.update(checkpoints); + } + + public Map> pendingCommittables() { + Map> result = new TreeMap<>(); + for (Map.Entry> entry : pendingCommittables.entrySet()) { + result.put(entry.getKey(), new ArrayList<>(entry.getValue())); + } + return result; + } + + public List committables() { + List result = new ArrayList<>(); + for (List committables : pendingCommittables.values()) { + result.addAll(committables); + } + return result; + } + + public List unacknowledgedCommittables() { + List result = new ArrayList<>(); + for (Map.Entry> entry : pendingCommittables.entrySet()) { + if (!acknowledgedCheckpoints.contains(entry.getKey())) { + result.addAll(entry.getValue()); + } + } + return result; + } + + public void markAcknowledged(List committables) { + for (Committable committable : committables) { + acknowledgedCheckpoints.add(committable.checkpointId()); + } + } + + public void markCommittedUpTo(long checkpointId) { + pendingCommittables.headMap(checkpointId, true).clear(); + acknowledgedCheckpoints.removeIf(id -> id <= checkpointId); + } + + public void clear() throws Exception { + pendingCommittables.clear(); + acknowledgedCheckpoints.clear(); + if (state != null) { + state.clear(); + } + } + + private static class CheckpointCommittables { + + private final long checkpointId; + private final List committables; + + private CheckpointCommittables(long checkpointId, List committables) { + this.checkpointId = checkpointId; + this.committables = committables; + 
} + + private long checkpointId() { + return checkpointId; + } + + private List committables() { + return committables; + } + } + + /** Serializer for checkpoint committables. */ + private static class CheckpointCommittablesSerializer { + + private final CommittableSerializer committableSerializer; + + private CheckpointCommittablesSerializer(CommittableSerializer committableSerializer) { + this.committableSerializer = committableSerializer; + } + + private byte[] serialize(CheckpointCommittables checkpoint) throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + DataOutputViewStreamWrapper view = new DataOutputViewStreamWrapper(out); + view.writeLong(checkpoint.checkpointId()); + view.writeInt(checkpoint.committables().size()); + for (Committable committable : checkpoint.committables()) { + byte[] bytes = committableSerializer.serialize(committable); + view.writeInt(bytes.length); + view.write(bytes); + } + return out.toByteArray(); + } + + private CheckpointCommittables deserialize(byte[] serialized) throws IOException { + DataInputDeserializer view = new DataInputDeserializer(serialized); + long checkpointId = view.readLong(); + int count = view.readInt(); + if (count < 0) { + throw new IOException("Negative committable count: " + count); + } + + List committables = new ArrayList<>(count); + for (int i = 0; i < count; i++) { + int length = view.readInt(); + if (length < 0) { + throw new IOException("Negative committable length: " + length); + } + byte[] bytes = new byte[length]; + view.readFully(bytes); + Committable committable = + committableSerializer.deserialize( + committableSerializer.getVersion(), bytes); + if (committable.checkpointId() != checkpointId) { + throw new IOException( + String.format( + "Committable checkpoint %s does not match state checkpoint %s.", + committable.checkpointId(), checkpointId)); + } + committables.add(committable); + } + return new CheckpointCommittables(checkpointId, committables); + } + } +} diff 
--git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSender.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSender.java new file mode 100644 index 000000000000..e5c3c0e70ef9 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSender.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.paimon.flink.sink.Committable; +import org.apache.paimon.flink.sink.CommittableSerializer; +import org.apache.paimon.table.sink.CommitMessageSerializer; + +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway; +import org.apache.flink.runtime.operators.coordination.CoordinationRequest; +import org.apache.flink.util.Preconditions; +import org.apache.flink.util.SerializedValue; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutionException; + +/** Sends writer committables to the Paimon writer coordinator. */ +public class CoordinatedFileInfoSender { + + private static final int LENGTH_FIELD_SIZE = 4; + private static final int COUNT_FIELD_SIZE = 4; + + private final TaskOperatorEventGateway gateway; + private final OperatorID operatorId; + private final CommittableSerializer serializer; + + private int subtaskId; + private int attemptNumber; + private long watermark; + private boolean endInput; + + public CoordinatedFileInfoSender(TaskOperatorEventGateway gateway, OperatorID operatorId) { + this.gateway = gateway; + this.operatorId = operatorId; + this.serializer = new CommittableSerializer(new CommitMessageSerializer()); + this.subtaskId = -1; + this.attemptNumber = -1; + this.watermark = Long.MIN_VALUE; + } + + public void setSubtaskId(int subtaskId) { + this.subtaskId = subtaskId; + } + + public void setAttemptNumber(int attemptNumber) { + this.attemptNumber = attemptNumber; + } + + public void processWatermark(long watermark) { + if (watermark != Long.MAX_VALUE) { + this.watermark = Math.max(this.watermark, watermark); + } + } + + public boolean isEndInput() { + return endInput; + } + + public void sendToCoordinator(long checkpointId, List committables) { + if (checkpointId == 
CommitterCoordinator.END_INPUT_CHECKPOINT_ID) { + endInput = true; + } + byte[] data = serializeCommittables(committables); + FileInfoRequest request = + FileInfoRequest.fileInfo( + checkpointId, + subtaskId, + attemptNumber, + watermark, + data, + committables.size()); + sendRequest(request); + } + + public void sendRecoveredFileInfoToCoordinator( + long checkpointId, String commitUser, List committables) { + byte[] data = serializeCommittables(committables); + sendRequest( + FileInfoRequest.recoveredFileInfo( + checkpointId, + subtaskId, + attemptNumber, + watermark, + data, + committables.size(), + commitUser)); + } + + private void sendRequest(FileInfoRequest request) { + try { + SerializedValue serializedRequest = + new SerializedValue(request); + FileInfoReceivedResponse response = + CoordinationResponseUtils.unwrap( + gateway.sendRequestToCoordinator(operatorId, serializedRequest).get()); + Preconditions.checkState( + response.checkpointId() == request.checkpointId() + && response.subtaskId() == request.subtaskId(), + "Unexpected file info ACK response for checkpoint %s subtask %s: checkpoint %s subtask %s.", + request.checkpointId(), + request.subtaskId(), + response.checkpointId(), + response.subtaskId()); + } catch (IOException | ExecutionException e) { + throw new RuntimeException("Failed to send file info to coordinator.", e); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while sending file info to coordinator.", e); + } + } + + private byte[] serializeCommittables(List committables) { + try { + int totalBytes = COUNT_FIELD_SIZE; + List serializedCommittables = new ArrayList<>(committables.size()); + for (Committable committable : committables) { + Preconditions.checkNotNull(committable, "Committable cannot be null"); + byte[] serialized = serializer.serialize(committable); + serializedCommittables.add(serialized); + totalBytes += LENGTH_FIELD_SIZE + serialized.length; + } + + byte[] 
result = new byte[totalBytes]; + ByteBuffer resultBuffer = ByteBuffer.wrap(result); + resultBuffer.putInt(committables.size()); + for (byte[] serialized : serializedCommittables) { + resultBuffer.putInt(serialized.length); + resultBuffer.put(serialized); + } + return result; + } catch (IOException e) { + throw new RuntimeException("Failed to serialize committable.", e); + } + } + + public static List deserializeCommittables(byte[] data) throws IOException { + ByteBuffer buffer = ByteBuffer.wrap(data == null ? new byte[0] : data); + if (buffer.remaining() < COUNT_FIELD_SIZE) { + throw new IOException("Invalid committable data: missing count field."); + } + + int count = buffer.getInt(); + List result = new ArrayList<>(count); + CommittableSerializer serializer = new CommittableSerializer(new CommitMessageSerializer()); + int version = serializer.getVersion(); + for (int i = 0; i < count; i++) { + if (buffer.remaining() < LENGTH_FIELD_SIZE) { + throw new IOException("Invalid committable data: missing length field."); + } + int length = buffer.getInt(); + if (length < 0 || length > buffer.remaining()) { + throw new IOException("Invalid committable data: corrupted length field."); + } + byte[] bytes = new byte[length]; + buffer.get(bytes); + result.add(serializer.deserialize(version, bytes)); + } + + return result; + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoReceivedResponse.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoReceivedResponse.java new file mode 100644 index 000000000000..3cbaa45374c0 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoReceivedResponse.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.flink.runtime.operators.coordination.CoordinationResponse; + +/** ACK response after PWC handles a writer file info request. */ +public class FileInfoReceivedResponse implements CoordinationResponse { + + private static final long serialVersionUID = 1L; + + private final long checkpointId; + private final int subtaskId; + + public FileInfoReceivedResponse(long checkpointId, int subtaskId) { + this.checkpointId = checkpointId; + this.subtaskId = subtaskId; + } + + public long checkpointId() { + return checkpointId; + } + + public int subtaskId() { + return subtaskId; + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoRequest.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoRequest.java new file mode 100644 index 000000000000..f5f50e9c2ff4 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/FileInfoRequest.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.flink.runtime.operators.coordination.CoordinationRequest; + +import javax.annotation.Nullable; + +import java.util.Arrays; + +/** Request sent from writer subtasks to the Paimon writer coordinator. */ +public class FileInfoRequest implements CoordinationRequest { + + private static final long serialVersionUID = 1L; + + private final long checkpointId; + private final int subtaskId; + private final int attemptNumber; + private final long watermark; + private final byte[] serializedData; + private final int committableCount; + private final boolean recovered; + private final @Nullable String commitUser; + private final int payloadHash; + + public static FileInfoRequest fileInfo( + long checkpointId, + int subtaskId, + int attemptNumber, + long watermark, + byte[] serializedData, + int committableCount) { + return new FileInfoRequest( + checkpointId, + subtaskId, + attemptNumber, + watermark, + serializedData, + committableCount, + false, + null); + } + + public static FileInfoRequest recoveredFileInfo( + long checkpointId, + int subtaskId, + int attemptNumber, + long watermark, + byte[] serializedData, + int committableCount, + String commitUser) { + return new FileInfoRequest( + checkpointId, + 
subtaskId, + attemptNumber, + watermark, + serializedData, + committableCount, + true, + commitUser); + } + + private FileInfoRequest( + long checkpointId, + int subtaskId, + int attemptNumber, + long watermark, + byte[] serializedData, + int committableCount, + boolean recovered, + @Nullable String commitUser) { + this.checkpointId = checkpointId; + this.subtaskId = subtaskId; + this.attemptNumber = attemptNumber; + this.watermark = watermark; + this.serializedData = serializedData == null ? new byte[0] : serializedData; + this.committableCount = committableCount; + this.recovered = recovered; + this.commitUser = commitUser; + this.payloadHash = Arrays.hashCode(this.serializedData); + } + + public long checkpointId() { + return checkpointId; + } + + public int subtaskId() { + return subtaskId; + } + + public int attemptNumber() { + return attemptNumber; + } + + public long watermark() { + return watermark; + } + + public byte[] serializedData() { + return serializedData; + } + + public int committableCount() { + return committableCount; + } + + public boolean recovered() { + return recovered; + } + + public @Nullable String commitUser() { + return commitUser; + } + + public boolean samePayload(FileInfoRequest other) { + return other != null + && payloadHash == other.payloadHash + && committableCount == other.committableCount + && Arrays.equals(serializedData, other.serializedData); + } + + @Override + public String toString() { + if (recovered) { + return String.format( + "FileInfoRequest{checkpoint=%d, recovered=true, subtask=%d, attempt=%d, " + + "count=%d, dataSize=%d bytes, commitUser=%s}", + checkpointId, + subtaskId, + attemptNumber, + committableCount, + serializedData.length, + commitUser); + } + return String.format( + "FileInfoRequest{checkpoint=%d, subtask=%d, attempt=%d, count=%d, dataSize=%d bytes}", + checkpointId, subtaskId, attemptNumber, committableCount, serializedData.length); + } +} diff --git 
a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinator.java new file mode 100644 index 000000000000..c9deac75e088 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinator.java @@ -0,0 +1,425 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.paimon.flink.sink.Committable; +import org.apache.paimon.flink.sink.Committer; +import org.apache.paimon.flink.sink.TableWriteOperator; +import org.apache.paimon.utils.ExceptionUtils; + +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.CoordinationRequest; +import org.apache.flink.runtime.operators.coordination.CoordinationRequestHandler; +import org.apache.flink.runtime.operators.coordination.CoordinationResponse; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.util.ThrowableCatchingRunnable; +import org.apache.flink.util.function.ThrowingRunnable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ThreadFactory; + +import static org.apache.flink.util.Preconditions.checkState; + +/** + * {@link OperatorCoordinator} for {@link TableWriteOperator}. It receives writer file information + * and performs global commits in JobManager. 
+ */ +public class PaimonWriterCoordinator implements OperatorCoordinator, CoordinationRequestHandler { + + private static final Logger LOG = LoggerFactory.getLogger(PaimonWriterCoordinator.class); + + private final PendingSubtask pendingSubtask; + private final CoordinatorExecutorThreadFactory coordinatorThreadFactory; + private final CompletableFuture finalCheckpointCompleted = new CompletableFuture<>(); + + private final OperatorCoordinator.Context context; + private final CommitterCoordinator coordinator; + private final String initialCommitUser; + + private @Nullable String commitUser; + private ScheduledExecutorService coordinatorExecutor; + private boolean started; + private boolean freshInstance = true; + + public PaimonWriterCoordinator( + boolean streamingCheckpointEnabled, + String initialCommitUser, + Committer.Factory committerFactory, + OperatorCoordinator.Context context, + CoordinatorExecutorThreadFactory coordinatorThreadFactory, + Long endInputWatermark) { + this.context = context; + this.coordinatorThreadFactory = coordinatorThreadFactory; + this.initialCommitUser = initialCommitUser; + this.coordinator = + new CommitterCoordinator<>( + streamingCheckpointEnabled, committerFactory, endInputWatermark); + this.pendingSubtask = new PendingSubtask(this.coordinator); + } + + @Override + public void start() throws Exception { + OperatorID operatorId = context.getOperatorId(); + LOG.info("Paimon writer coordinator starting, operatorId={}", operatorId); + if (commitUser == null) { + commitUser = initialCommitUser; + } + started = true; + coordinatorExecutor = Executors.newScheduledThreadPool(1, coordinatorThreadFactory); + int parallelism = context.currentParallelism(); + coordinator.init(parallelism, commitUser); + pendingSubtask.init(parallelism); + } + + @Override + public void executionAttemptReady(int subtask, int attemptNumber, SubtaskGateway gateway) { + runInEventLoop( + () -> pendingSubtask.registerSubtask(subtask, attemptNumber, gateway), + 
"registering subtask %d attempt %d", + subtask, + attemptNumber); + } + + @Override + public void executionAttemptFailed(int subtask, int attemptNumber, Throwable throwable) { + runInEventLoop( + () -> pendingSubtask.unregisterSubtask(subtask, attemptNumber, throwable), + "unregistering subtask %d attempt %d", + subtask, + attemptNumber); + } + + @Override + public void handleEventFromOperator(int subtask, int attemptNumber, OperatorEvent event) { + freshInstance = false; + throw new UnsupportedOperationException( + "PWC only accepts file info through coordination requests."); + } + + @Override + public CompletableFuture handleCoordinationRequest( + CoordinationRequest request) { + freshInstance = false; + if (request instanceof FileInfoRequest) { + return handleFileInfoRequest((FileInfoRequest) request); + } + CompletableFuture result = new CompletableFuture<>(); + result.completeExceptionally( + new IllegalArgumentException("Unsupported request type: " + request.getClass())); + return result; + } + + @Override + public void checkpointCoordinator(long checkpointId, CompletableFuture result) { + freshInstance = false; + LOG.info("PWC snapshot commitUser={}, checkpointId={}", commitUser, checkpointId); + checkState(commitUser != null, "PWC has not been started."); + result.complete(serializeCoordinatorState(commitUser)); + } + + @Override + public void notifyCheckpointComplete(long checkpointId) { + freshInstance = false; + runInEventLoop( + () -> { + handleCommitResult(pendingSubtask.notifyCheckpointComplete(checkpointId)); + if (coordinator.isEndInput()) { + finalCheckpointCompleted.complete(null); + } + }, + "notifying checkpoint %d complete", + checkpointId); + if (coordinator.isEndInput()) { + try { + finalCheckpointCompleted.get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + } + } + + @Override + public void notifyCheckpointAborted(long checkpointId) { + runInEventLoop( + () -> 
pendingSubtask.notifyCheckpointAborted(checkpointId), + "notifying checkpoint %d aborted", + checkpointId); + } + + @Override + public void resetToCheckpoint(long checkpointId, byte[] bytes) throws Exception { + LOG.info("PWC resetToCheckpoint: checkpointId={}, fresh={}", checkpointId, freshInstance); + if (freshInstance && checkpointId >= 0) { + checkState(!started, "PWC can only be restored before it is started."); + commitUser = deserializeCoordinatorState(bytes); + pendingSubtask.restoreCheckpoint(checkpointId); + } + freshInstance = false; + } + + @Override + public void subtaskReset(int subtask, long checkpointId) {} + + @Override + public void close() throws Exception { + pendingSubtask.close(); + coordinator.close(); + if (coordinatorExecutor != null) { + coordinatorExecutor.shutdownNow(); + } + } + + private void handleCommitResult(CommitResult result) { + if (!result.committed()) { + return; + } + if (result.restoredCommit() && result.committedCount() > 0) { + context.failJob( + new RecommitRequiredException(result.checkpointId(), result.committedCount())); + return; + } + sendCommitCompleteEvent(result.checkpointId()); + } + + private void sendCommitCompleteEvent(long checkpointId) { + CommitCompleteEvent event = new CommitCompleteEvent(checkpointId); + for (SubtaskGateway gateway : pendingSubtask.activeGateways()) { + gateway.sendEvent(event) + .whenComplete( + (ignored, error) -> { + if (error != null) { + context.failJob(error); + } + }); + } + } + + private CompletableFuture handleFileInfoRequest(FileInfoRequest request) { + ensureStarted(); + CompletableFuture result = new CompletableFuture<>(); + runInEventLoop( + () -> { + try { + if (!pendingSubtask.isValid(request.subtaskId(), request.attemptNumber())) { + result.completeExceptionally( + new IllegalStateException( + String.format( + "Received file info request from invalid subtask %d attempt %d.", + request.subtaskId(), request.attemptNumber()))); + return; + } + if (request.recovered()) { + 
validateCommitUser(request.commitUser()); + } + handleCommitResult(pendingSubtask.receive(request.subtaskId(), request)); + result.complete( + CoordinationResponseUtils.wrap( + new FileInfoReceivedResponse( + request.checkpointId(), request.subtaskId()))); + } catch (Throwable t) { + result.completeExceptionally(t); + throw t; + } + }, + "handling file info request %s", + request); + return result; + } + + private void validateCommitUser(@Nullable String recoveredCommitUser) { + checkState(commitUser != null, "PWC has not been started."); + checkState(recoveredCommitUser != null, "Recovered writer commit user is null."); + checkState( + commitUser.equals(recoveredCommitUser), + "Writer commit user %s does not match PWC commit user %s.", + recoveredCommitUser, + commitUser); + } + + private static byte[] serializeCoordinatorState(String commitUser) { + byte[] commitUserBytes = commitUser.getBytes(StandardCharsets.UTF_8); + return ByteBuffer.allocate(Integer.BYTES + commitUserBytes.length) + .putInt(commitUserBytes.length) + .put(commitUserBytes) + .array(); + } + + private static String deserializeCoordinatorState(byte[] bytes) { + if (bytes.length < Integer.BYTES) { + throw new IllegalArgumentException("Corrupted PWC coordinator state."); + } + + ByteBuffer buffer = ByteBuffer.wrap(bytes); + int commitUserLength = buffer.getInt(); + if (commitUserLength < 0 || commitUserLength != buffer.remaining()) { + throw new IllegalArgumentException("Corrupted commit user in PWC coordinator state."); + } + byte[] commitUserBytes = new byte[commitUserLength]; + buffer.get(commitUserBytes); + return new String(commitUserBytes, StandardCharsets.UTF_8); + } + + private void runInEventLoop( + final ThrowingRunnable action, + final String actionName, + final Object... 
parameters) { + ensureStarted(); + coordinatorExecutor.execute( + new ThrowableCatchingRunnable( + throwable -> + coordinatorThreadFactory.uncaughtException( + Thread.currentThread(), throwable), + () -> { + try { + action.run(); + } catch (Throwable t) { + ExceptionUtils.rethrowIfFatalErrorOrOOM(t); + LOG.error( + "Uncaught exception in PWC while {}.", + String.format(actionName, parameters), + t); + context.failJob(t); + } + })); + } + + public void runInCoordinatorThread(Runnable runnable) { + ensureStarted(); + coordinatorExecutor.execute(runnable); + } + + private void ensureStarted() { + if (!started) { + throw new IllegalStateException("The coordinator has not started yet."); + } + } + + /** Provider for {@link PaimonWriterCoordinator}. */ + public static class WriterCoordinatorProvider implements OperatorCoordinator.Provider { + + private static final long serialVersionUID = 1L; + + private final boolean streamingCheckpointEnabled; + private final String operatorName; + private final OperatorID operatorId; + private final String initialCommitUser; + private final Committer.Factory committerFactory; + private final Long endInputWatermark; + + public WriterCoordinatorProvider( + boolean streamingCheckpointEnabled, + String operatorName, + OperatorID operatorId, + String initialCommitUser, + Committer.Factory committerFactory, + Long endInputWatermark) { + this.streamingCheckpointEnabled = streamingCheckpointEnabled; + this.operatorName = operatorName; + this.operatorId = operatorId; + this.initialCommitUser = initialCommitUser; + this.committerFactory = committerFactory; + this.endInputWatermark = endInputWatermark; + } + + @Override + public OperatorID getOperatorId() { + return operatorId; + } + + @Override + public OperatorCoordinator create(OperatorCoordinator.Context context) { + CoordinatorExecutorThreadFactory threadFactory = + new CoordinatorExecutorThreadFactory( + "PaimonWriterCoordinator-" + operatorName, context); + return new 
PaimonWriterCoordinator( + streamingCheckpointEnabled, + initialCommitUser, + committerFactory, + context, + threadFactory, + endInputWatermark); + } + } + + /** Thread factory for the single coordinator event loop. */ + public static class CoordinatorExecutorThreadFactory + implements ThreadFactory, Thread.UncaughtExceptionHandler { + + private final String coordinatorThreadName; + private final ClassLoader classLoader; + private final Thread.UncaughtExceptionHandler errorHandler; + + @Nullable private Thread thread; + + public CoordinatorExecutorThreadFactory( + String coordinatorThreadName, OperatorCoordinator.Context context) { + this( + coordinatorThreadName, + context.getUserCodeClassloader(), + (thread, error) -> context.failJob(error)); + } + + CoordinatorExecutorThreadFactory( + String coordinatorThreadName, + ClassLoader classLoader, + Thread.UncaughtExceptionHandler errorHandler) { + this.coordinatorThreadName = coordinatorThreadName; + this.classLoader = classLoader; + this.errorHandler = errorHandler; + } + + @Override + public synchronized Thread newThread(Runnable runnable) { + checkState(thread == null, "CoordinatorExecutorThreadFactory can create one thread."); + thread = new Thread(runnable, coordinatorThreadName); + thread.setContextClassLoader(classLoader); + thread.setUncaughtExceptionHandler(this); + return thread; + } + + @Override + public synchronized void uncaughtException(Thread thread, Throwable error) { + errorHandler.uncaughtException(thread, error); + } + } + + private static class RecommitRequiredException extends RuntimeException { + + private static final long serialVersionUID = 1L; + + private RecommitRequiredException(long checkpointId, int committedCount) { + super( + String.format( + "PWC committed %d restored committable(s) up to checkpoint %d. 
" + + "Triggering global recovery so writers continue from the " + + "latest Paimon snapshot.", + committedCount, checkpointId)); + } + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingCheckpoint.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingCheckpoint.java new file mode 100644 index 000000000000..40a2a6def447 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingCheckpoint.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.paimon.flink.sink.Committable; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; + +/** Pending data and state for one checkpoint in PWC. 
*/ +class PendingCheckpoint { + + private final long checkpointId; + private final Map fileInfos; + private boolean staged; + + PendingCheckpoint(long checkpointId) { + this.checkpointId = checkpointId; + this.fileInfos = new HashMap<>(); + } + + long checkpointId() { + return checkpointId; + } + + boolean receive(int subtask, FileInfoRequest request, List committables) { + SubtaskFileInfo previous = fileInfos.get(subtask); + if (previous != null) { + if (previous.request().samePayload(request)) { + return false; + } + throw new IllegalStateException( + String.format( + "Different FileInfoRequest received for checkpoint %d subtask %d.", + checkpointId, subtask)); + } + + fileInfos.put(subtask, new SubtaskFileInfo(request, committables)); + return true; + } + + void removeSubtask(int subtask) { + fileInfos.remove(subtask); + } + + boolean isEmpty() { + return fileInfos.isEmpty(); + } + + boolean staged() { + return staged; + } + + void markStaged() { + staged = true; + } + + List fileInfos() { + return new ArrayList<>(fileInfos.values()); + } + + List allCommittables() { + List result = new ArrayList<>(); + for (Integer subtask : new TreeSet<>(fileInfos.keySet())) { + result.addAll(fileInfos.get(subtask).committables()); + } + return result; + } + + List committablesAfter(long checkpointId) { + List result = new ArrayList<>(); + for (Committable committable : allCommittables()) { + if (committable.checkpointId() > checkpointId) { + result.add(committable); + } + } + return result; + } + + long maxWatermark() { + long watermark = Long.MIN_VALUE; + for (SubtaskFileInfo fileInfo : fileInfos.values()) { + watermark = Math.max(watermark, fileInfo.request().watermark()); + } + return watermark; + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingSubtask.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingSubtask.java new file mode 100644 index 
000000000000..8e87728363e5 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/PendingSubtask.java @@ -0,0 +1,269 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.paimon.flink.sink.Committable; + +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Set; +import java.util.TreeMap; + +/** Tracks writer subtasks and pending checkpoint file information for PWC. 
*/ +public class PendingSubtask { + + private final Map> registeredSubtasks; + private final NavigableMap checkpoints; + private final Map> pendingEnvelopes; + private final Set abortedCheckpoints; + private final CommitterCoordinator coordinator; + + private int parallelism; + private long maxCommittedCheckpointId; + private long restoredCheckpointId; + + public PendingSubtask(CommitterCoordinator coordinator) { + this.coordinator = coordinator; + this.registeredSubtasks = new HashMap<>(); + this.checkpoints = new TreeMap<>(); + this.pendingEnvelopes = new HashMap<>(); + this.abortedCheckpoints = new HashSet<>(); + this.maxCommittedCheckpointId = Long.MIN_VALUE; + this.restoredCheckpointId = Long.MIN_VALUE; + } + + public void init(int parallelism) { + this.parallelism = parallelism; + } + + public void registerSubtask( + int subtask, int attemptNumber, OperatorCoordinator.SubtaskGateway gateway) { + Map attempts = + registeredSubtasks.computeIfAbsent(subtask, ignored -> new HashMap<>()); + if (!attempts.isEmpty() && !attempts.containsKey(attemptNumber)) { + attempts.clear(); + removePendingSubtask(subtask); + } + attempts.put(attemptNumber, gateway); + } + + public void unregisterSubtask(int subtask, int attemptNumber, Throwable throwable) { + Map attempts = registeredSubtasks.get(subtask); + if (attempts != null) { + attempts.remove(attemptNumber); + } + removePendingSubtask(subtask); + } + + public boolean isValid(int subtask, int attemptNumber) { + Map attempts = registeredSubtasks.get(subtask); + return attempts != null && attempts.containsKey(attemptNumber); + } + + public Collection activeGateways() { + Collection gateways = new ArrayList<>(); + for (Map attempts : + registeredSubtasks.values()) { + gateways.addAll(attempts.values()); + } + return gateways; + } + + public CommitResult receive(int subtask, FileInfoRequest request) throws Exception { + long envelopeCheckpointId = request.checkpointId(); + if (envelopeCheckpointId <= maxCommittedCheckpointId) 
{ + return new CommitResult(true, 0, maxCommittedCheckpointId, false); + } + + recordEnvelope(envelopeCheckpointId, subtask, request); + recordFileInfos(subtask, request); + if (!envelopeAllReceived(envelopeCheckpointId)) { + return CommitResult.NONE; + } + + stageCheckpointsUpTo(envelopeCheckpointId); + + if (envelopeCheckpointId != restoredCheckpointId) { + return CommitResult.NONE; + } + + if (!envelopeAllRecovered(envelopeCheckpointId)) { + throw new IllegalStateException( + String.format( + "Restored checkpoint %d contains non-recovered file info.", + restoredCheckpointId)); + } + + int committedCount = coordinator.filterAndCommitUpToCheckpoint(restoredCheckpointId); + maxCommittedCheckpointId = Math.max(maxCommittedCheckpointId, restoredCheckpointId); + cleanupCommittedCheckpoints(restoredCheckpointId); + return new CommitResult(true, committedCount, restoredCheckpointId, true); + } + + public CommitResult notifyCheckpointComplete(long checkpointId) throws Exception { + if (checkpointId <= maxCommittedCheckpointId) { + return new CommitResult(true, 0, maxCommittedCheckpointId, false); + } + if (!stagedEnvelope(checkpointId)) { + throw new IllegalStateException( + String.format( + "Checkpoint %d completed before PWC staged file info from all subtasks.", + checkpointId)); + } + + coordinator.notifyCheckpointComplete(checkpointId); + maxCommittedCheckpointId = Math.max(maxCommittedCheckpointId, checkpointId); + cleanupCommittedCheckpoints(checkpointId); + return new CommitResult(true, 0, checkpointId, false); + } + + public void notifyCheckpointAborted(long checkpointId) { + abortedCheckpoints.add(checkpointId); + coordinator.notifyCheckpointAborted(checkpointId); + } + + public void restoreCheckpoint(long checkpointId) { + restoredCheckpointId = checkpointId; + } + + private void recordEnvelope(long checkpointId, int subtask, FileInfoRequest request) { + Map envelope = + pendingEnvelopes.computeIfAbsent(checkpointId, ignored -> new HashMap<>()); + if 
(envelope.containsKey(subtask)) { + throw new IllegalStateException( + String.format( + "Repeated file info envelope received for checkpoint %d subtask %d.", + checkpointId, subtask)); + } + envelope.put(subtask, request); + } + + private void recordFileInfos(int subtask, FileInfoRequest request) throws Exception { + Map> committablesByCheckpoint = new TreeMap<>(); + for (Committable committable : + CoordinatedFileInfoSender.deserializeCommittables(request.serializedData())) { + committablesByCheckpoint + .computeIfAbsent(committable.checkpointId(), ignored -> new ArrayList<>()) + .add(committable); + } + for (Map.Entry> entry : committablesByCheckpoint.entrySet()) { + if (entry.getKey() > maxCommittedCheckpointId) { + checkpoint(entry.getKey()).receive(subtask, request, entry.getValue()); + } + } + } + + private PendingCheckpoint checkpoint(long checkpointId) { + return checkpoints.computeIfAbsent( + checkpointId, ignored -> new PendingCheckpoint(checkpointId)); + } + + private boolean envelopeAllReceived(long checkpointId) { + Map receivedSubtasks = pendingEnvelopes.get(checkpointId); + return receivedSubtasks != null + && receivedSubtasks.keySet().containsAll(expectedSubtasks()); + } + + private boolean envelopeAllRecovered(long checkpointId) { + if (!envelopeAllReceived(checkpointId)) { + return false; + } + for (FileInfoRequest request : pendingEnvelopes.get(checkpointId).values()) { + if (!request.recovered()) { + return false; + } + } + for (PendingCheckpoint checkpoint : checkpoints.headMap(checkpointId, true).values()) { + if (!checkpoint.staged()) { + continue; + } + for (SubtaskFileInfo fileInfo : checkpoint.fileInfos()) { + if (!fileInfo.request().recovered()) { + return false; + } + } + } + return true; + } + + private Set expectedSubtasks() { + Set subtasks = new HashSet<>(); + for (int i = 0; i < parallelism; i++) { + subtasks.add(i); + } + return subtasks; + } + + private void stageCheckpointsUpTo(long checkpointId) throws Exception { + for 
(PendingCheckpoint checkpoint : checkpoints.headMap(checkpointId, true).values()) { + if (!checkpoint.staged()) { + saveCheckpoint(checkpoint); + checkpoint.markStaged(); + } + } + } + + private boolean stagedEnvelope(long checkpointId) { + return envelopeAllReceived(checkpointId) + && checkpoints.headMap(checkpointId, true).values().stream() + .allMatch(PendingCheckpoint::staged); + } + + private void saveCheckpoint(PendingCheckpoint checkpoint) throws Exception { + coordinator.save( + checkpoint.committablesAfter(maxCommittedCheckpointId), + checkpoint.checkpointId(), + checkpoint.maxWatermark()); + } + + private void cleanupCommittedCheckpoints(long checkpointId) { + checkpoints.keySet().removeIf(id -> id <= checkpointId); + pendingEnvelopes.keySet().removeIf(id -> id <= checkpointId); + abortedCheckpoints.removeIf(id -> id <= checkpointId); + } + + private void removePendingSubtask(int subtask) { + for (PendingCheckpoint checkpoint : checkpoints.values()) { + if (!checkpoint.staged()) { + checkpoint.removeSubtask(subtask); + } + } + checkpoints + .entrySet() + .removeIf(entry -> !entry.getValue().staged() && entry.getValue().isEmpty()); + for (Map subtasks : pendingEnvelopes.values()) { + subtasks.remove(subtask); + } + pendingEnvelopes.entrySet().removeIf(entry -> entry.getValue().isEmpty()); + } + + public void close() { + registeredSubtasks.clear(); + checkpoints.clear(); + pendingEnvelopes.clear(); + abortedCheckpoints.clear(); + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/SubtaskFileInfo.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/SubtaskFileInfo.java new file mode 100644 index 000000000000..9b95144c5360 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/coordinator/SubtaskFileInfo.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.paimon.flink.sink.Committable; + +import java.util.List; + +/** File information reported by one writer subtask for one checkpoint. */ +class SubtaskFileInfo { + + private final FileInfoRequest request; + private final List committables; + + SubtaskFileInfo(FileInfoRequest request, List committables) { + this.request = request; + this.committables = committables; + } + + FileInfoRequest request() { + return request; + } + + List committables() { + return committables; + } +} diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableStateTest.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableStateTest.java new file mode 100644 index 000000000000..087587b7ff5a --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedCommittableStateTest.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.flink.sink.Committable; +import org.apache.paimon.io.CompactIncrement; +import org.apache.paimon.io.DataIncrement; +import org.apache.paimon.table.sink.CommitMessageImpl; + +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.state.OperatorStateStore; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; +import org.mockito.Mockito; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for {@link CoordinatedCommittableState}. 
*/ +class CoordinatedCommittableStateTest { + + @Test + @SuppressWarnings({"unchecked", "rawtypes"}) + void testSnapshotKeepsEmptyCheckpointForRestore() throws Exception { + StateInitializationContext context = context(Collections.emptyList(), false); + ListState flinkState = committableState(context); + + CoordinatedCommittableState state = new CoordinatedCommittableState(); + state.initialize(context); + state.snapshot(1L); + + assertThat(state.pendingCommittables()).containsOnlyKeys(1L); + assertThat(state.pendingCommittables().get(1L)).isEmpty(); + + ArgumentCaptor> serializedState = ArgumentCaptor.forClass(List.class); + Mockito.verify(flinkState).update(serializedState.capture()); + + CoordinatedCommittableState restored = new CoordinatedCommittableState(); + restored.initialize(context(serializedState.getValue(), true)); + assertThat(restored.pendingCommittables()).containsOnlyKeys(1L); + assertThat(restored.pendingCommittables().get(1L)).isEmpty(); + } + + @Test + void testPendingCheckpointsAreReturnedInAscendingOrder() throws Exception { + StateInitializationContext context = context(Collections.emptyList(), false); + + CoordinatedCommittableState state = new CoordinatedCommittableState(); + state.initialize(context); + state.snapshot(3L); + state.snapshot(1L); + state.snapshot(2L); + + assertThat(new ArrayList<>(state.pendingCommittables().keySet())) + .containsExactly(1L, 2L, 3L); + assertThat(state.pendingCommittables().get(1L)).isEmpty(); + assertThat(state.pendingCommittables().get(2L)).isEmpty(); + assertThat(state.pendingCommittables().get(3L)).isEmpty(); + } + + @Test + void testOnlyUnacknowledgedCommittablesAreReported() throws Exception { + StateInitializationContext context = context(Collections.emptyList(), false); + + CoordinatedCommittableState state = new CoordinatedCommittableState(); + state.initialize(context); + Committable ck1 = committable(1L); + Committable ck2 = committable(2L); + state.add(ck1); + state.add(ck2); + + 
assertThat(state.unacknowledgedCommittables()).containsExactly(ck1, ck2); + + state.markAcknowledged(Collections.singletonList(ck1)); + assertThat(state.unacknowledgedCommittables()).containsExactly(ck2); + + state.markCommittedUpTo(1L); + assertThat(state.pendingCommittables()).containsOnlyKeys(2L); + assertThat(state.unacknowledgedCommittables()).containsExactly(ck2); + } + + @SuppressWarnings({"unchecked", "rawtypes"}) + private StateInitializationContext context(Iterable committables, boolean restored) + throws Exception { + StateInitializationContext context = Mockito.mock(StateInitializationContext.class); + OperatorStateStore operatorStateStore = Mockito.mock(OperatorStateStore.class); + ListState committableState = Mockito.mock(ListState.class); + Mockito.when(context.getOperatorStateStore()).thenReturn(operatorStateStore); + Mockito.when(context.isRestored()).thenReturn(restored); + Mockito.when(committableState.get()).thenReturn(committables); + Mockito.when(operatorStateStore.getListState(Mockito.any(ListStateDescriptor.class))) + .thenAnswer( + invocation -> { + ListStateDescriptor descriptor = invocation.getArgument(0); + if ("pwc_pending_committables".equals(descriptor.getName())) { + return committableState; + } + throw new IllegalArgumentException( + "Unexpected state " + descriptor.getName()); + }); + return context; + } + + @SuppressWarnings({"unchecked", "rawtypes"}) + private ListState committableState(StateInitializationContext context) + throws Exception { + OperatorStateStore operatorStateStore = context.getOperatorStateStore(); + ListStateDescriptor descriptor = + new ListStateDescriptor<>("pwc_pending_committables", byte[].class); + return operatorStateStore.getListState(descriptor); + } + + private Committable committable(long checkpointId) { + return new Committable( + checkpointId, + new CommitMessageImpl( + BinaryRow.EMPTY_ROW, + 0, + null, + DataIncrement.emptyIncrement(), + CompactIncrement.emptyIncrement())); + } +} diff --git 
a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSenderTest.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSenderTest.java new file mode 100644 index 000000000000..41c20a82532a --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/CoordinatedFileInfoSenderTest.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.flink.sink.Committable; +import org.apache.paimon.io.CompactIncrement; +import org.apache.paimon.io.DataIncrement; +import org.apache.paimon.table.sink.CommitMessageImpl; + +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway; +import org.apache.flink.runtime.operators.coordination.CoordinationRequest; +import org.apache.flink.runtime.operators.coordination.CoordinationResponse; +import org.apache.flink.util.SerializedValue; +import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; +import org.mockito.Mockito; + +import java.util.Collections; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** Tests for {@link CoordinatedFileInfoSender}. 
*/ +class CoordinatedFileInfoSenderTest { + + @Test + void testSendWaitsForAckBeforeReturning() throws Exception { + TaskOperatorEventGateway gateway = Mockito.mock(TaskOperatorEventGateway.class); + OperatorID operatorId = new OperatorID(); + CompletableFuture ack = new CompletableFuture<>(); + Mockito.when(gateway.sendRequestToCoordinator(Mockito.eq(operatorId), Mockito.any())) + .thenReturn(ack); + + CoordinatedFileInfoSender sender = new CoordinatedFileInfoSender(gateway, operatorId); + sender.setSubtaskId(3); + sender.setAttemptNumber(4); + + ExecutorService executor = Executors.newSingleThreadExecutor(); + try { + Future send = + executor.submit( + () -> + sender.sendToCoordinator( + 1L, Collections.singletonList(committable(1L)))); + + Thread.sleep(100L); + assertThat(send.isDone()).isFalse(); + + ack.complete(ackResponse(1L, 3)); + send.get(5, TimeUnit.SECONDS); + } finally { + executor.shutdownNow(); + } + } + + @Test + void testSendEmptyFileInfo() throws Exception { + TaskOperatorEventGateway gateway = Mockito.mock(TaskOperatorEventGateway.class); + OperatorID operatorId = new OperatorID(); + Mockito.when(gateway.sendRequestToCoordinator(Mockito.eq(operatorId), Mockito.any())) + .thenReturn(CompletableFuture.completedFuture(ackResponse(1L, 3))); + + CoordinatedFileInfoSender sender = new CoordinatedFileInfoSender(gateway, operatorId); + sender.setSubtaskId(3); + sender.setAttemptNumber(4); + sender.sendToCoordinator(1L, Collections.emptyList()); + + ArgumentCaptor captor = ArgumentCaptor.forClass(SerializedValue.class); + Mockito.verify(gateway).sendRequestToCoordinator(Mockito.eq(operatorId), captor.capture()); + FileInfoRequest request = deserializeRequest(captor.getValue()); + + assertThat(request.checkpointId()).isEqualTo(1L); + assertThat(request.subtaskId()).isEqualTo(3); + assertThat(request.attemptNumber()).isEqualTo(4); + assertThat(request.committableCount()).isEqualTo(0); + 
assertThat(CoordinatedFileInfoSender.deserializeCommittables(request.serializedData())) + .isEmpty(); + } + + @Test + void testFailedSendCanBeRetriedWithSameCommittables() throws Exception { + TaskOperatorEventGateway gateway = Mockito.mock(TaskOperatorEventGateway.class); + OperatorID operatorId = new OperatorID(); + CompletableFuture failed = new CompletableFuture<>(); + failed.completeExceptionally(new RuntimeException("send failed")); + Mockito.when(gateway.sendRequestToCoordinator(Mockito.eq(operatorId), Mockito.any())) + .thenReturn(failed) + .thenReturn(CompletableFuture.completedFuture(ackResponse(1L, 3))); + + CoordinatedFileInfoSender sender = new CoordinatedFileInfoSender(gateway, operatorId); + sender.setSubtaskId(3); + sender.setAttemptNumber(4); + List committables = Collections.singletonList(committable(1L)); + + assertThatThrownBy(() -> sender.sendToCoordinator(1L, committables)) + .isInstanceOf(RuntimeException.class) + .hasRootCauseMessage("send failed"); + + sender.sendToCoordinator(1L, committables); + + ArgumentCaptor captor = ArgumentCaptor.forClass(SerializedValue.class); + Mockito.verify(gateway, Mockito.times(2)) + .sendRequestToCoordinator(Mockito.eq(operatorId), captor.capture()); + FileInfoRequest retryRequest = deserializeRequest(captor.getAllValues().get(1)); + + assertThat(retryRequest.checkpointId()).isEqualTo(1L); + assertThat(retryRequest.subtaskId()).isEqualTo(3); + assertThat(retryRequest.attemptNumber()).isEqualTo(4); + assertThat(retryRequest.committableCount()).isEqualTo(1); + + List retryCommittables = + CoordinatedFileInfoSender.deserializeCommittables(retryRequest.serializedData()); + assertThat(retryCommittables).hasSize(1); + assertThat(retryCommittables.get(0).checkpointId()).isEqualTo(1L); + } + + @Test + void testRecoveredFileInfoUsesSingleRequest() throws Exception { + TaskOperatorEventGateway gateway = Mockito.mock(TaskOperatorEventGateway.class); + OperatorID operatorId = new OperatorID(); + 
Mockito.when(gateway.sendRequestToCoordinator(Mockito.eq(operatorId), Mockito.any())) + .thenReturn(CompletableFuture.completedFuture(ackResponse(1L, 3))); + + CoordinatedFileInfoSender sender = new CoordinatedFileInfoSender(gateway, operatorId); + sender.setSubtaskId(3); + sender.setAttemptNumber(4); + sender.sendRecoveredFileInfoToCoordinator( + 1L, "commit-user", Collections.singletonList(committable(1L))); + + ArgumentCaptor captor = ArgumentCaptor.forClass(SerializedValue.class); + Mockito.verify(gateway).sendRequestToCoordinator(Mockito.eq(operatorId), captor.capture()); + FileInfoRequest request = deserializeRequest(captor.getValue()); + assertThat(request.recovered()).isTrue(); + assertThat(request.checkpointId()).isEqualTo(1L); + assertThat(request.subtaskId()).isEqualTo(3); + assertThat(request.attemptNumber()).isEqualTo(4); + assertThat(request.commitUser()).isEqualTo("commit-user"); + assertThat(request.committableCount()).isEqualTo(1); + assertThat(CoordinatedFileInfoSender.deserializeCommittables(request.serializedData())) + .hasSize(1); + } + + private Committable committable(long checkpointId) { + return new Committable( + checkpointId, + new CommitMessageImpl( + BinaryRow.EMPTY_ROW, + 0, + null, + DataIncrement.emptyIncrement(), + CompactIncrement.emptyIncrement())); + } + + private CoordinationResponse ackResponse(long checkpointId, int subtaskId) { + return CoordinationResponseUtils.wrap( + new FileInfoReceivedResponse(checkpointId, subtaskId)); + } + + @SuppressWarnings("unchecked") + private FileInfoRequest deserializeRequest(SerializedValue serializedValue) throws Exception { + return (FileInfoRequest) + ((SerializedValue) serializedValue) + .deserializeValue(Thread.currentThread().getContextClassLoader()); + } +} diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorITCase.java 
b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorITCase.java new file mode 100644 index 000000000000..9a41c9df01f1 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorITCase.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.paimon.Snapshot; +import org.apache.paimon.flink.CatalogITCaseBase; +import org.apache.paimon.flink.sink.FlinkSinkBuilder; +import org.apache.paimon.flink.source.AbstractNonCoordinatedSource; +import org.apache.paimon.flink.source.AbstractNonCoordinatedSourceReader; +import org.apache.paimon.flink.source.SimpleSourceSplit; + +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.api.connector.source.ReaderOutput; +import org.apache.flink.api.connector.source.SourceReader; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.api.dag.Transformation; +import org.apache.flink.core.execution.JobClient; +import org.apache.flink.core.io.InputStatus; +import org.apache.flink.runtime.execution.ExecutionState; +import org.apache.flink.runtime.minicluster.MiniCluster; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.VarCharType; +import org.apache.flink.types.Row; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Integration tests for {@link PaimonWriterCoordinator}. 
*/ +@SuppressWarnings("BusyWait") +public class PaimonWriterCoordinatorITCase extends CatalogITCaseBase { + + private static final String MINI_CLUSTER_FIELD = "miniCluster"; + private static final RowType ROW_TYPE = + new RowType( + Arrays.asList( + new RowType.RowField("k", new IntType()), + new RowType.RowField("v", new VarCharType()))); + + @Override + protected List ddl() { + return Arrays.asList( + "CREATE TABLE unaware_table (k INT, v STRING) WITH (" + + "'bucket'='-1'," + + "'sink.committer-coordinator-operator.enabled'='true')", + "CREATE TABLE fixed_table (k INT, v STRING) WITH (" + + "'bucket'='1'," + + "'bucket-key'='k'," + + "'sink.committer-coordinator-operator.enabled'='true')", + "CREATE TABLE dynamic_table (k INT PRIMARY KEY NOT ENFORCED, v STRING) WITH (" + + "'bucket'='-1'," + + "'sink.committer-coordinator-operator.enabled'='true')"); + } + + @Test + @Timeout(120) + public void testStreamingCheckpointWriteUnawareTableWithWriterCoordinator() throws Exception { + testStreamingCheckpointWriteWithWriterCoordinator("unaware_table"); + } + + @Test + public void testFixedTableIgnoresWriterCoordinatorOption() throws Exception { + assertUsesGlobalCommitter(buildPaimonSink("fixed_table"), "fixed_table"); + } + + @Test + public void testDynamicTableIgnoresWriterCoordinatorOption() throws Exception { + assertUsesGlobalCommitter(buildPaimonSink("dynamic_table"), "dynamic_table"); + } + + private void testStreamingCheckpointWriteWithWriterCoordinator(String tableName) + throws Exception { + StreamExecutionEnvironment env = buildPaimonSink(tableName); + assertThat(transformationNames(env)).doesNotContain("Global Committer : " + tableName); + + JobClient jobClient = env.executeAsync(); + triggerCheckpointAndWaitForWrites(jobClient, tableName, 4); + jobClient.cancel().get(); + + sqlAssertWithRetry( + "SELECT * FROM " + tableName, + rows -> + rows.containsExactlyInAnyOrder( + Row.of(1, "one"), + Row.of(2, "two"), + Row.of(3, "three"), + Row.of(4, "four"))); + } + 
+ private StreamExecutionEnvironment buildPaimonSink(String tableName) throws Exception { + StreamExecutionEnvironment env = + streamExecutionEnvironmentBuilder() + .streamingMode() + .parallelism(2) + .checkpointIntervalMs(100) + .build(); + + new FlinkSinkBuilder(paimonTable(tableName)) + .forRowData( + env.fromSource( + new EmitOnceAndWaitSource(), + WatermarkStrategy.noWatermarks(), + "EmitOnceAndWaitSource", + InternalTypeInfo.of(ROW_TYPE)) + .setParallelism(1)) + .build(); + return env; + } + + private void assertUsesGlobalCommitter(StreamExecutionEnvironment env, String tableName) { + assertThat(transformationNames(env)).contains("Global Committer : " + tableName); + } + + private List transformationNames(StreamExecutionEnvironment env) { + List names = new ArrayList<>(); + List> pending = new ArrayList<>(env.getTransformations()); + Set visited = new HashSet<>(); + while (!pending.isEmpty()) { + Transformation transformation = pending.remove(pending.size() - 1); + if (visited.add(transformation.getId())) { + names.add(transformation.getName()); + pending.addAll(transformation.getInputs()); + } + } + return names; + } + + @SuppressWarnings("unchecked") + private T reflectGetMiniCluster(Object instance) + throws NoSuchFieldException, IllegalAccessException { + Field field = instance.getClass().getDeclaredField(MINI_CLUSTER_FIELD); + field.setAccessible(true); + return (T) field.get(instance); + } + + private void triggerCheckpointAndWaitForWrites( + JobClient jobClient, String tableName, long totalRecords) throws Exception { + MiniCluster miniCluster = reflectGetMiniCluster(jobClient); + JobID jobID = jobClient.getJobID(); + waitForJobRunning(jobClient, miniCluster, jobID); + + long lastSnapshotId = -1L; + long deadline = System.currentTimeMillis() + 60_000L; + while (System.currentTimeMillis() < deadline) { + miniCluster.triggerCheckpoint(jobID).get(); + Snapshot snapshot = waitForNewSnapshot(tableName, lastSnapshotId, deadline); + lastSnapshotId = 
snapshot.id(); + if (snapshot.totalRecordCount() >= totalRecords) { + return; + } + } + throw new AssertionError("Timed out waiting for records committed by PWC."); + } + + private void waitForJobRunning(JobClient jobClient, MiniCluster miniCluster, JobID jobID) + throws Exception { + JobStatus jobStatus = jobClient.getJobStatus().get(); + while (jobStatus == JobStatus.INITIALIZING || jobStatus == JobStatus.CREATED) { + Thread.sleep(500L); + jobStatus = jobClient.getJobStatus().get(); + } + + if (jobStatus != JobStatus.RUNNING) { + throw new IllegalStateException("Job status is not RUNNING"); + } + + AtomicBoolean allTaskRunning = new AtomicBoolean(false); + while (!allTaskRunning.get()) { + allTaskRunning.set(true); + Thread.sleep(500L); + miniCluster + .getExecutionGraph(jobID) + .thenAccept( + graph -> + graph.getAllExecutionVertices() + .forEach( + vertex -> { + if (vertex.getExecutionState() + != ExecutionState.RUNNING) { + allTaskRunning.set(false); + } + })) + .get(); + } + } + + private Snapshot waitForNewSnapshot(String tableName, long initialSnapshotId, long deadline) + throws InterruptedException { + Snapshot snapshot = findLatestSnapshot(tableName); + while (System.currentTimeMillis() < deadline + && (snapshot == null || snapshot.id() == initialSnapshotId)) { + Thread.sleep(500L); + snapshot = findLatestSnapshot(tableName); + } + if (snapshot == null || snapshot.id() == initialSnapshotId) { + throw new AssertionError("Timed out waiting for a new Paimon snapshot."); + } + return snapshot; + } + + private static class EmitOnceAndWaitSource extends AbstractNonCoordinatedSource { + + private static final long serialVersionUID = 1L; + + @Override + public Boundedness getBoundedness() { + return Boundedness.CONTINUOUS_UNBOUNDED; + } + + @Override + public SourceReader createReader( + SourceReaderContext sourceReaderContext) { + return new Reader(); + } + + private static class Reader extends AbstractNonCoordinatedSourceReader { + + private boolean emitted; + 
+ @Override + public InputStatus pollNext(ReaderOutput output) { + if (!emitted) { + output.collect(row(1, "one")); + output.collect(row(2, "two")); + output.collect(row(3, "three")); + output.collect(row(4, "four")); + emitted = true; + } + return InputStatus.NOTHING_AVAILABLE; + } + } + + private static RowData row(int k, String v) { + return GenericRowData.of(k, StringData.fromString(v)); + } + } +} diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorTest.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorTest.java new file mode 100644 index 000000000000..126e2650b2dc --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/sink/coordinator/PaimonWriterCoordinatorTest.java @@ -0,0 +1,780 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.flink.sink.coordinator; + +import org.apache.paimon.CoreOptions; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.flink.sink.Committable; +import org.apache.paimon.flink.sink.CommittableSerializer; +import org.apache.paimon.flink.sink.Committer; +import org.apache.paimon.flink.sink.StoreCommitter; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.manifest.ManifestCommittable; +import org.apache.paimon.options.Options; +import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.reader.RecordReaderIterator; +import org.apache.paimon.schema.Schema; +import org.apache.paimon.schema.SchemaManager; +import org.apache.paimon.table.FileStoreTable; +import org.apache.paimon.table.FileStoreTableFactory; +import org.apache.paimon.table.sink.CommitMessage; +import org.apache.paimon.table.sink.CommitMessageSerializer; +import org.apache.paimon.table.sink.StreamTableWrite; +import org.apache.paimon.table.source.TableRead; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.CloseableIterator; + +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.CoordinationResponse; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.util.ExceptionUtils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.Mockito; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.CompletableFuture; +import 
java.util.concurrent.ExecutionException; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.Mockito.when; + +/** Tests for {@link PaimonWriterCoordinator}. */ +public class PaimonWriterCoordinatorTest { + + private static final long CK0 = 7L; + private static final long CK1 = 8L; + private static final long CK2 = 9L; + private static final RowType ROW_TYPE = + RowType.of( + new DataType[] {DataTypes.INT(), DataTypes.BIGINT()}, new String[] {"a", "b"}); + + @TempDir public java.nio.file.Path tempDir; + + private Path tablePath; + private OperatorID operatorId; + private String commitUser; + + @BeforeEach + public void before() { + tablePath = new Path(tempDir.toString()); + operatorId = new OperatorID(); + commitUser = UUID.randomUUID().toString(); + } + + // ------------------------------------------------------------------------ + // basic function tests + // ------------------------------------------------------------------------ + + @Test + public void testCommitUserRestoredFromCoordinatorState() throws Exception { + FileStoreTable table = createFileStoreTable(); + String restoredCommitUser = commitUser; + + PaimonWriterCoordinator previous = createCoordinator(table, 1); + previous.start(); + byte[] coordinatorState = checkpointState(previous, CK1); + previous.close(); + + commitUser = UUID.randomUUID().toString(); + PaimonWriterCoordinator restored = createCoordinator(table, 1); + restored.resetToCheckpoint(CK1, coordinatorState); + restored.start(); + register(restored, 0); + + sendRequest( + restored, + recoveredFileInfoRequest(CK1, 0, restoredCommitUser, Collections.emptyList())); + waitForCoordinator(restored); + + sendRequest(restored, fileInfoRequest(table, CK2, 0, GenericRow.of(1, 10L))); + restored.notifyCheckpointComplete(CK2); + waitForCoordinator(restored); + assertThat(table.snapshotManager().latestSnapshot().commitUser()) + 
.isEqualTo(restoredCommitUser); + restored.close(); + } + + @Test + public void testFileInfoRequestAcksAfterReceive() throws Exception { + FileStoreTable table = createFileStoreTable(); + PaimonWriterCoordinator coordinator = createCoordinator(table, 1); + coordinator.start(); + register(coordinator, 0); + + FileInfoRequest request = fileInfoRequest(table, CK1, 0, 0, GenericRow.of(1, 10L)); + CoordinationResponse rawResponse = coordinator.handleCoordinationRequest(request).get(); + FileInfoReceivedResponse response = CoordinationResponseUtils.unwrap(rawResponse); + assertThat(response.checkpointId()).isEqualTo(CK1); + assertThat(response.subtaskId()).isEqualTo(0); + + coordinator.notifyCheckpointComplete(CK1); + waitForCoordinator(coordinator); + assertResults(table, "1, 10"); + + coordinator.close(); + } + + @Test + public void testFileInfoRequestFromStaleAttemptIsIgnored() throws Exception { + FileStoreTable table = createFileStoreTable(); + PaimonWriterCoordinator coordinator = createCoordinator(table, 1); + coordinator.start(); + register(coordinator, 0, 1); + + FileInfoRequest request = fileInfoRequest(table, CK1, 0, 0, GenericRow.of(1, 10L)); + CompletableFuture response = + coordinator.handleCoordinationRequest(request); + assertThatThrownBy(response::get) + .isInstanceOf(ExecutionException.class) + .hasCauseInstanceOf(IllegalStateException.class) + .hasMessageContaining("invalid subtask 0 attempt 0"); + + coordinator.notifyCheckpointComplete(CK1); + waitForCoordinator(coordinator); + assertThat(table.snapshotManager().latestSnapshotId()).isNull(); + + coordinator.close(); + } + + @Test + public void testCheckpointCompleteRequiresStagedFileInfo() throws Exception { + FileStoreTable table = createFileStoreTable(); + OperatorCoordinator.Context context = createContext(2); + PaimonWriterCoordinator coordinator = createCoordinator(table, context); + coordinator.start(); + register(coordinator, 0); + register(coordinator, 1); + + sendRequest(coordinator, 
fileInfoRequest(table, CK1, 0, GenericRow.of(1, 10L))); + coordinator.notifyCheckpointComplete(CK1); + waitForCoordinator(coordinator); + + Mockito.verify(context).failJob(Mockito.any(IllegalStateException.class)); + assertThat(table.snapshotManager().latestSnapshotId()).isNull(); + coordinator.close(); + } + + // ------------------------------------------------------------------------ + // restore and recovered file-info tests + // ------------------------------------------------------------------------ + + @Test + public void testRecoveredFileInfoWithoutCoordinatorRestoreWaitsForCheckpointComplete() + throws Exception { + FileStoreTable table = createFileStoreTable(); + PaimonWriterCoordinator coordinator = createCoordinator(table, 1); + coordinator.start(); + register(coordinator, 0); + + sendRequest( + coordinator, + recoveredFileInfoRequest( + CK1, 0, commitUser, committables(table, CK1, GenericRow.of(1, 10L)))); + waitForCoordinator(coordinator); + assertThat(table.snapshotManager().latestSnapshotId()).isNull(); + + coordinator.notifyCheckpointComplete(CK1); + waitForCoordinator(coordinator); + assertResults(table, "1, 10"); + + coordinator.close(); + } + + @Test + public void testRecoveredFileInfoCommitsPendingCommittables() throws Exception { + FileStoreTable table = createFileStoreTable(); + byte[] coordinatorState = createCoordinatorState(table, CK2); + OperatorCoordinator.Context context = createContext(2); + PaimonWriterCoordinator coordinator = createCoordinator(table, context); + coordinator.resetToCheckpoint(CK2, coordinatorState); + coordinator.start(); + register(coordinator, 0); + register(coordinator, 1); + + sendRequest( + coordinator, + recoveredFileInfoRequest( + CK2, 0, commitUser, committables(table, CK1, GenericRow.of(1, 10L)))); + waitForCoordinator(coordinator); + assertThat(table.snapshotManager().latestSnapshotId()).isNull(); + + sendRequest( + coordinator, + recoveredFileInfoRequest( + CK2, 1, commitUser, committables(table, CK1, 
GenericRow.of(2, 20L)))); + waitForCoordinator(coordinator); + + Mockito.verify(context).failJob(Mockito.any(Throwable.class)); + assertResults(table, "1, 10", "2, 20"); + + coordinator.close(); + } + + /** + * Restored file info is recommitted and triggers the expected recovery failure, after which a + * newer checkpoint is reported and completed normally. + */ + @Test + public void testCheckpointAfterRestoredCommitDoesNotFailJob() throws Exception { + FileStoreTable table = createFileStoreTable(); + byte[] coordinatorState = createCoordinatorState(table, CK1); + OperatorCoordinator.Context context = createContext(1); + PaimonWriterCoordinator coordinator = createCoordinator(table, context); + coordinator.resetToCheckpoint(CK1, coordinatorState); + coordinator.start(); + register(coordinator, 0); + + sendRequest( + coordinator, + recoveredFileInfoRequest( + CK1, 0, commitUser, committables(table, CK1, GenericRow.of(1, 10L)))); + waitForCoordinator(coordinator); + Mockito.verify(context).failJob(Mockito.any(Throwable.class)); + assertResults(table, "1, 10"); + + sendRequest(coordinator, fileInfoRequest(table, CK2, 0, GenericRow.of(2, 20L))); + coordinator.notifyCheckpointComplete(CK2); + waitForCoordinator(coordinator); + + Mockito.verify(context, Mockito.times(1)).failJob(Mockito.any(Throwable.class)); + assertResults(table, "1, 10", "2, 20"); + + coordinator.close(); + } + + /** + * At a restored checkpoint, one subtask reports data and another reports an empty recovered + * file-info payload. 
+     */
+    @Test
+    public void testEmptyRestoredFileInfoCompletesRestoredCheckpoint() throws Exception {
+        FileStoreTable table = createFileStoreTable();
+        byte[] coordinatorState = createCoordinatorState(table, CK1); // state taken at CK1
+        OperatorCoordinator.Context context = createContext(2); // mocked context, parallelism 2
+        PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+        coordinator.resetToCheckpoint(CK1, coordinatorState); // restore before start()
+        coordinator.start();
+        register(coordinator, 0); // attempt 0, mocked gateway
+        register(coordinator, 1);
+
+        sendRequest(
+                coordinator,
+                recoveredFileInfoRequest(
+                        CK1, 0, commitUser, committables(table, CK1, GenericRow.of(1, 10L))));
+        waitForCoordinator(coordinator); // drain the coordinator thread before asserting
+        assertThat(table.snapshotManager().latestSnapshotId()).isNull(); // subtask 0 alone: no commit
+
+        sendRequest(
+                coordinator, recoveredFileInfoRequest(CK1, 1, commitUser, Collections.emptyList()));
+        waitForCoordinator(coordinator);
+
+        Mockito.verify(context).failJob(Mockito.any(Throwable.class)); // exactly one failJob call
+        assertResults(table, "1, 10"); // the empty payload of subtask 1 adds no rows
+
+        coordinator.close();
+    }
+
+    /**
+     * During recovery to CK2, one recovered request cumulatively carries CK1 committables and the
+     * other recovered request is empty. 
+     */
+    @Test
+    public void testEmptyFileInfoCompletesEarlierCheckpointBeforeRestoredCommit() throws Exception {
+        FileStoreTable table = createFileStoreTable();
+        byte[] coordinatorState = createCoordinatorState(table, CK2); // state taken at CK2
+        OperatorCoordinator.Context context = createContext(2); // mocked context, parallelism 2
+        PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+        coordinator.resetToCheckpoint(CK2, coordinatorState); // restore before start()
+        coordinator.start();
+        register(coordinator, 0);
+        register(coordinator, 1);
+
+        List ck1Subtask0 = committables(table, CK1, GenericRow.of(1, 10L)); // tagged with CK1
+        sendRequest(coordinator, recoveredFileInfoRequest(CK2, 0, commitUser, ck1Subtask0));
+        waitForCoordinator(coordinator);
+        assertThat(table.snapshotManager().latestSnapshotId()).isNull(); // subtask 0 alone: no commit
+
+        sendRequest(
+                coordinator, recoveredFileInfoRequest(CK2, 1, commitUser, Collections.emptyList()));
+        waitForCoordinator(coordinator);
+
+        Mockito.verify(context).failJob(Mockito.any(Throwable.class)); // exactly one failJob call
+        assertResults(table, "1, 10"); // the CK1 row carried by subtask 0 is committed
+
+        coordinator.close();
+    }
+
+    // ------------------------------------------------------------------------
+    // abort and late-arrival tests
+    // ------------------------------------------------------------------------
+
+    /**
+     * CK0 commits, CK1 is aborted without failing the task, then CK2 commits. The aborted
+     * checkpoint adds no table data; completing CK2 commits the rows reported for CK1 together
+     * with the newly produced CK2 rows, each exactly once. 
+     */
+    @Test
+    public void testCheckpointAbortDoesNotDropCommittables() throws Exception {
+        FileStoreTable table = createFileStoreTable();
+        PaimonWriterCoordinator coordinator = createCoordinator(table, 1);
+        coordinator.start();
+
+        sendCheckpoint(coordinator, table, CK0, 0, GenericRow.of(0, 0L)); // registers subtask 0 too
+        coordinator.notifyCheckpointComplete(CK0);
+        waitForCoordinator(coordinator); // drain the coordinator thread before asserting
+        assertResults(table, "0, 0");
+
+        List ck1Committables = committables(table, CK1, GenericRow.of(1, 10L));
+        sendRequest(coordinator, fileInfoRequest(CK1, 0, ck1Committables));
+        coordinator.notifyCheckpointAborted(CK1); // abort only; no attempt failure is reported
+        waitForCoordinator(coordinator);
+        assertResults(table, "0, 0"); // the aborted CK1 committed nothing
+
+        sendRequest(
+                coordinator,
+                fileInfoRequest(CK2, 0, committables(table, CK2, GenericRow.of(2, 20L))));
+        coordinator.notifyCheckpointComplete(CK2);
+        waitForCoordinator(coordinator);
+        assertResults(table, "0, 0", "1, 10", "2, 20"); // CK1's row is committed along with CK2
+
+        coordinator.close();
+    }
+
+    /** Only one of two subtasks reports a checkpoint before it is aborted. 
*/
+    @Test
+    public void testPartialCheckpointAbortDoesNotFailJob() throws Exception {
+        FileStoreTable table = createFileStoreTable();
+        OperatorCoordinator.Context context = createContext(2); // mocked context, parallelism 2
+        PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+        coordinator.start();
+
+        register(coordinator, 0);
+        register(coordinator, 1);
+        List subtask0Ck1 = committables(table, CK1, GenericRow.of(1, 10L));
+        sendRequest(coordinator, fileInfoRequest(CK1, 0, subtask0Ck1)); // subtask 1 never reports CK1
+        waitForCoordinator(coordinator);
+
+        coordinator.notifyCheckpointAborted(CK1);
+        waitForCoordinator(coordinator); // drain the coordinator thread before verifying
+        Mockito.verify(context, Mockito.never()).failJob(Mockito.any(Throwable.class));
+        assertThat(table.snapshotManager().latestSnapshotId()).isNull(); // abort commits nothing
+
+        sendRequest(
+                coordinator,
+                fileInfoRequest(CK2, 0, committables(table, CK2, GenericRow.of(2, 20L))));
+        sendRequest(
+                coordinator,
+                fileInfoRequest(CK2, 1, committables(table, CK2, GenericRow.of(3, 30L))));
+        coordinator.notifyCheckpointComplete(CK2); // both subtasks have reported CK2
+        waitForCoordinator(coordinator);
+
+        Mockito.verify(context, Mockito.never()).failJob(Mockito.any(Throwable.class));
+        assertResults(table, "1, 10", "2, 20", "3, 30"); // includes the row from the aborted CK1
+
+        coordinator.close();
+    }
+
+    /**
+     * A file-info request arrives after the corresponding checkpoint has already been aborted. The
+     * late request is retained as reliable pending file info and committed by the next complete
+     * checkpoint envelope. 
+     */
+    @Test
+    public void testFileInfoAfterCheckpointAbortIsCommittedByLaterCheckpoint() throws Exception {
+        FileStoreTable table = createFileStoreTable();
+        OperatorCoordinator.Context context = createContext(2); // mocked context, parallelism 2
+        PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+        coordinator.start();
+
+        register(coordinator, 0);
+        register(coordinator, 1);
+        coordinator.notifyCheckpointAborted(CK1); // abort arrives before any CK1 file info
+        waitForCoordinator(coordinator);
+
+        sendRequest(coordinator, fileInfoRequest(table, CK1, 0, GenericRow.of(1, 10L))); // late CK1
+        waitForCoordinator(coordinator);
+
+        Mockito.verify(context, Mockito.never()).failJob(Mockito.any(Throwable.class));
+        assertThat(table.snapshotManager().latestSnapshotId()).isNull(); // nothing committed yet
+
+        sendRequest(coordinator, fileInfoRequest(table, CK2, 0, GenericRow.of(2, 20L)));
+        sendRequest(coordinator, fileInfoRequest(table, CK2, 1, GenericRow.of(3, 30L)));
+        coordinator.notifyCheckpointComplete(CK2); // both subtasks have reported CK2
+        waitForCoordinator(coordinator);
+
+        Mockito.verify(context, Mockito.never()).failJob(Mockito.any(Throwable.class));
+        assertResults(table, "1, 10", "2, 20", "3, 30"); // the late CK1 row is committed with CK2
+
+        coordinator.close();
+    }
+
+    /**
+     * Scenario: PWC collects a checkpoint's file info, the checkpoint is aborted, and the same
+     * attempt reports the same envelope again. Under ack-based reporting this is a protocol error:
+     * if the first ACK was lost, the writer snapshot cannot complete and the attempt must failover. 
+     */
+    @Test
+    public void testDuplicateFileInfoAfterCollectedAbortIsRejected() throws Exception {
+        FileStoreTable table = createFileStoreTable();
+        OperatorCoordinator.Context context = createContext(1); // mocked context, parallelism 1
+        PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+        coordinator.start();
+        register(coordinator, 0);
+
+        FileInfoRequest request = fileInfoRequest(table, CK1, 0, GenericRow.of(1, 10L));
+        sendRequest(coordinator, request); // first delivery completes normally
+        waitForCoordinator(coordinator);
+        coordinator.notifyCheckpointAborted(CK1);
+        waitForCoordinator(coordinator);
+
+        assertThatThrownBy(() -> sendRequest(coordinator, request)) // same request object again
+                .isInstanceOf(ExecutionException.class)
+                .hasCauseInstanceOf(IllegalStateException.class)
+                .hasMessageContaining("Repeated file info envelope");
+
+        coordinator.close();
+    }
+
+    /**
+     * An earlier checkpoint has file info from only one subtask, while a later checkpoint receives
+     * complete cumulative file info from all subtasks.
+     */
+    @Test
+    public void testLaterCheckpointCompleteCanCommitEarlierPartialFileInfo() throws Exception {
+        FileStoreTable table = createFileStoreTable();
+        OperatorCoordinator.Context context = createContext(2); // mocked context, parallelism 2
+        PaimonWriterCoordinator coordinator = createCoordinator(table, context);
+        coordinator.start();
+
+        register(coordinator, 0);
+        register(coordinator, 1);
+        List ck1Subtask0 = committables(table, CK1, GenericRow.of(1, 10L));
+        sendRequest(coordinator, fileInfoRequest(CK1, 0, ck1Subtask0)); // subtask 1 never reports CK1
+        waitForCoordinator(coordinator); // CK1 is neither completed nor aborted in this test
+
+        sendRequest(
+                coordinator,
+                fileInfoRequest(CK2, 0, committables(table, CK2, GenericRow.of(2, 20L))));
+        sendRequest(
+                coordinator,
+                fileInfoRequest(CK2, 1, committables(table, CK2, GenericRow.of(3, 30L))));
+        coordinator.notifyCheckpointComplete(CK2); // only CK2 is ever completed
+        waitForCoordinator(coordinator);
+
+        Mockito.verify(context, Mockito.never()).failJob(Mockito.any(Throwable.class));
+        assertResults(table, "1, 10", "2, 20", "3, 30"); // the partial CK1 row is committed too
+
+        coordinator.close();
+    }
+
+    // 
------------------------------------------------------------------------
+    // attempt and stale-message tests
+    // ------------------------------------------------------------------------
+
+    @Test
+    public void testSubtaskFailoverReplacesUnstagedPendingFileInfo() throws Exception {
+        FileStoreTable table = createFileStoreTable();
+        PaimonWriterCoordinator coordinator = createCoordinator(table, 2);
+        coordinator.start();
+
+        register(coordinator, 0, 0); // subtask 0, attempt 0
+        register(coordinator, 1, 0); // subtask 1, attempt 0
+        sendRequest(coordinator, fileInfoRequest(table, CK1, 0, 0, GenericRow.of(1, 10L)));
+        waitForCoordinator(coordinator); // CK1 is never completed in this test
+
+        coordinator.executionAttemptFailed(0, 0, new RuntimeException("failover"));
+        waitForCoordinator(coordinator);
+        register(coordinator, 0, 1); // subtask 0 returns as attempt 1
+
+        List recoveredSubtask0 = new ArrayList<>(); // attempt 1 payload: CK1 and CK2 committables
+        recoveredSubtask0.addAll(committables(table, CK1, GenericRow.of(9, 90L)));
+        recoveredSubtask0.addAll(committables(table, CK2, GenericRow.of(2, 20L)));
+        sendRequest(coordinator, fileInfoRequest(CK2, 0, 1, recoveredSubtask0)); // from attempt 1
+        sendRequest(coordinator, fileInfoRequest(table, CK2, 1, 0, GenericRow.of(3, 30L)));
+        coordinator.notifyCheckpointComplete(CK2);
+        waitForCoordinator(coordinator);
+
+        assertResults(table, "2, 20", "3, 30", "9, 90"); // attempt 0's row (1, 10) is absent
+        coordinator.close();
+    }
+
+    /**
+     * A writer resends file info for a checkpoint that PWC has already committed. The table remains
+     * committed once, and the registered gateway receives a second commit-complete event in
+     * response to the stale resend, in addition to the original commit notification. 
+ */ + @Test + public void testStaleFileInfoResendSendsCommitCompleteEvent() throws Exception { + FileStoreTable table = createFileStoreTable(); + PaimonWriterCoordinator coordinator = createCoordinator(table, 1); + coordinator.start(); + OperatorCoordinator.SubtaskGateway gateway = registerAndReturnGateway(coordinator, 0); + + FileInfoRequest request = fileInfoRequest(table, CK1, 0, GenericRow.of(1, 10L)); + sendRequest(coordinator, request); + coordinator.notifyCheckpointComplete(CK1); + waitForCoordinator(coordinator); + assertResults(table, "1, 10"); + + sendRequest(coordinator, request); + waitForCoordinator(coordinator); + + Mockito.verify(gateway, Mockito.times(2)).sendEvent(Mockito.any(CommitCompleteEvent.class)); + + coordinator.close(); + } + + private void sendCheckpoint( + PaimonWriterCoordinator coordinator, + FileStoreTable table, + long checkpointId, + int subtask, + GenericRow... rows) + throws Exception { + register(coordinator, subtask); + sendRequest(coordinator, fileInfoRequest(table, checkpointId, subtask, rows)); + waitForCoordinator(coordinator); + } + + private FileInfoReceivedResponse sendRequest( + PaimonWriterCoordinator coordinator, FileInfoRequest request) throws Exception { + CoordinationResponse rawResponse = coordinator.handleCoordinationRequest(request).get(); + return CoordinationResponseUtils.unwrap(rawResponse); + } + + private byte[] checkpointState(PaimonWriterCoordinator coordinator, long checkpointId) + throws Exception { + CompletableFuture result = new CompletableFuture<>(); + coordinator.checkpointCoordinator(checkpointId, result); + return result.get(); + } + + private byte[] createCoordinatorState(FileStoreTable table, long checkpointId) + throws Exception { + PaimonWriterCoordinator coordinator = createCoordinator(table, 1); + coordinator.start(); + byte[] state = checkpointState(coordinator, checkpointId); + coordinator.close(); + return state; + } + + private void register(PaimonWriterCoordinator coordinator, int 
subtask) { + register(coordinator, subtask, 0); + } + + private void register(PaimonWriterCoordinator coordinator, int subtask, int attemptNumber) { + OperatorCoordinator.SubtaskGateway gateway = + Mockito.mock(OperatorCoordinator.SubtaskGateway.class); + when(gateway.sendEvent(Mockito.any(OperatorEvent.class))) + .thenReturn(CompletableFuture.completedFuture(null)); + coordinator.executionAttemptReady(subtask, attemptNumber, gateway); + } + + private OperatorCoordinator.SubtaskGateway registerAndReturnGateway( + PaimonWriterCoordinator coordinator, int subtask) { + OperatorCoordinator.SubtaskGateway gateway = + Mockito.mock(OperatorCoordinator.SubtaskGateway.class); + when(gateway.sendEvent(Mockito.any(OperatorEvent.class))) + .thenReturn(CompletableFuture.completedFuture(null)); + coordinator.executionAttemptReady(subtask, 0, gateway); + return gateway; + } + + private FileInfoRequest fileInfoRequest( + FileStoreTable table, long checkpointId, int subtask, GenericRow... rows) + throws Exception { + return fileInfoRequest(table, checkpointId, subtask, 0, rows); + } + + private FileInfoRequest fileInfoRequest( + FileStoreTable table, + long checkpointId, + int subtask, + int attemptNumber, + GenericRow... 
rows) + throws Exception { + return fileInfoRequest( + checkpointId, subtask, attemptNumber, committables(table, checkpointId, rows)); + } + + private FileInfoRequest fileInfoRequest( + long checkpointId, int subtask, List committables) throws Exception { + return fileInfoRequest(checkpointId, subtask, 0, committables); + } + + private FileInfoRequest fileInfoRequest( + long checkpointId, int subtask, int attemptNumber, List committables) + throws Exception { + return FileInfoRequest.fileInfo( + checkpointId, + subtask, + attemptNumber, + Long.MIN_VALUE, + serialize(committables), + committables.size()); + } + + private FileInfoRequest recoveredFileInfoRequest( + long checkpointId, + int subtask, + String recoveredCommitUser, + List committables) + throws Exception { + return FileInfoRequest.recoveredFileInfo( + checkpointId, + subtask, + 0, + Long.MIN_VALUE, + serialize(committables), + committables.size(), + recoveredCommitUser); + } + + private List committables( + FileStoreTable table, long checkpointId, GenericRow... 
rows) throws Exception {
+        StreamTableWrite write =
+                table.newStreamWriteBuilder().withCommitUser(commitUser).newWrite();
+        for (GenericRow row : rows) {
+            write.write(row);
+        }
+        List committables = new ArrayList<>();
+        for (CommitMessage message : write.prepareCommit(false, checkpointId)) {
+            committables.add(new Committable(checkpointId, message));
+        }
+        write.close();
+        return committables;
+    }
+
+    /**
+     * Packs the given committables into one byte array: a 4-byte count, then for every
+     * committable a 4-byte length followed by the bytes produced by {@link CommittableSerializer}.
+     *
+     * @param committables committables to encode, written in iteration order
+     * @return the backing array of a buffer sized exactly for the encoded payload
+     * @throws Exception if serializing any committable fails
+     */
+    private byte[] serialize(List<Committable> committables) throws Exception {
+        CommittableSerializer serializer = new CommittableSerializer(new CommitMessageSerializer());
+        // Starts at 4 to account for the leading element count.
+        int total = 4;
+        List<byte[]> bytes = new ArrayList<>();
+        for (Committable committable : committables) {
+            byte[] serialized = serializer.serialize(committable);
+            bytes.add(serialized);
+            // Each entry contributes its 4-byte length prefix plus its payload.
+            total += 4 + serialized.length;
+        }
+        ByteBuffer buffer = ByteBuffer.allocate(total);
+        buffer.putInt(committables.size());
+        for (byte[] serialized : bytes) {
+            buffer.putInt(serialized.length);
+            buffer.put(serialized);
+        }
+        return buffer.array();
+    }
+
+    /** Creates a coordinator for {@code table} backed by a mocked context of the given parallelism. */
+    private PaimonWriterCoordinator createCoordinator(FileStoreTable table, int parallelism) {
+        return createCoordinator(table, createContext(parallelism));
+    }
+
+    /**
+     * Builds a Mockito mock of {@link OperatorCoordinator.Context} that reports {@code operatorId},
+     * the given parallelism and the calling thread's context class loader.
+     */
+    private OperatorCoordinator.Context createContext(int parallelism) {
+        OperatorCoordinator.Context context = Mockito.mock(OperatorCoordinator.Context.class);
+        when(context.getOperatorId()).thenReturn(operatorId);
+        when(context.currentParallelism()).thenReturn(parallelism);
+        when(context.getUserCodeClassloader())
+                .thenReturn(Thread.currentThread().getContextClassLoader());
+        return context;
+    }
+
+    /**
+     * Creates a coordinator for {@code table} on the supplied context. The committer factory
+     * builds a {@link StoreCommitter} whose table commit ignores empty commits whenever the commit
+     * context reports that streaming checkpointing is not enabled.
+     */
+    private PaimonWriterCoordinator createCoordinator(
+            FileStoreTable table, OperatorCoordinator.Context context) {
+        // NOTE(review): Committer.Factory is used as a raw type here; confirm its type arguments
+        // against the Committer declaration before parameterizing it.
+        Committer.Factory factory =
+                commitContext ->
+                        new StoreCommitter(
+                                table,
+                                table.newCommit(commitContext.commitUser())
+                                        .ignoreEmptyCommit(
+                                                !commitContext.streamingCheckpointEnabled()),
+                                commitContext);
+        return new PaimonWriterCoordinator(
+                true,
+                commitUser,
+                factory,
+                context,
+                new PaimonWriterCoordinator.CoordinatorExecutorThreadFactory("PWC", context),
+                null);
+    }
+
+    /**
+     * Submits a no-op task through {@code runInCoordinatorThread} and blocks until it has run.
+     * Presumably this drains work queued earlier on the coordinator thread; that holds only if
+     * the coordinator executes tasks sequentially (verify against PaimonWriterCoordinator).
+     *
+     * @throws AssertionError if the calling thread is interrupted while waiting
+     */
+    private void waitForCoordinator(PaimonWriterCoordinator coordinator) {
+        CompletableFuture<Void> future = new CompletableFuture<>();
+        coordinator.runInCoordinatorThread(() -> future.complete(null));
+        try {
+            future.get();
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag so callers further up the stack can still observe it.
+            Thread.currentThread().interrupt();
+            throw new AssertionError("Interrupted while waiting for coordinator.", e);
+        } catch (ExecutionException e) {
+            ExceptionUtils.rethrow(ExceptionUtils.stripExecutionException(e));
+        }
+    }
+
+    /**
+     * Creates a table schema under {@code tablePath} with {@code bucket=1} and
+     * {@code bucket-key=a}, then opens and returns the table through {@link FileStoreTableFactory}.
+     */
+    private FileStoreTable createFileStoreTable() throws Exception {
+        Options conf = new Options();
+        conf.set(CoreOptions.PATH, tablePath.toString());
+        conf.setString("bucket", "1");
+        conf.setString("bucket-key", "a");
+        new SchemaManager(LocalFileIO.create(), tablePath)
+                .createTable(
+                        new Schema(
+                                ROW_TYPE.getFields(),
+                                Collections.emptyList(),
+                                Collections.emptyList(),
+                                conf.toMap(),
+                                ""));
+        return FileStoreTableFactory.create(LocalFileIO.create(), conf);
+    }
+
+    /**
+     * Reads every split of a fresh scan of {@code table}, renders each row as
+     * {@code "<int column 0>, <long column 1>"}, sorts the rendered rows and asserts that they
+     * equal {@code expected} in the given order.
+     */
+    private void assertResults(FileStoreTable table, String... expected) {
+        TableRead read = table.newReadBuilder().newRead();
+        List<String> actual = new ArrayList<>();
+        table.newReadBuilder()
+                .newScan()
+                .plan()
+                .splits()
+                .forEach(
+                        split -> {
+                            // try-with-resources closes the iterator even when reading fails.
+                            try (CloseableIterator<InternalRow> iterator =
+                                    new RecordReaderIterator<>(read.createReader(split))) {
+                                while (iterator.hasNext()) {
+                                    InternalRow row = iterator.next();
+                                    actual.add(row.getInt(0) + ", " + row.getLong(1));
+                                }
+                            } catch (Exception e) {
+                                throw new RuntimeException(e);
+                            }
+                        });
+        Collections.sort(actual);
+        assertThat(actual).isEqualTo(Arrays.asList(expected));
+    }
+}
diff --git a/paimon-flink/paimon-flink1-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java b/paimon-flink/paimon-flink1-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
index 1087dcb65cc2..9968459b47b9 100644
--- a/paimon-flink/paimon-flink1-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
+++ b/paimon-flink/paimon-flink1-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
@@ -34,6 +34,10 @@ public static int getIndexOfThisSubtask(RuntimeContext context) {
         return context.getIndexOfThisSubtask();
     }
 
+    public static int getAttemptNumber(RuntimeContext context) {
+        return context.getAttemptNumber();
+    }
+
     public static @Nullable Integer getNumberOfParallelSubtasks(FunctionContext context) {
         return null;
     }
diff --git a/paimon-flink/paimon-flink2-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java b/paimon-flink/paimon-flink2-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
index ff5fa868128c..c2a5ae6cc35b 100644
--- a/paimon-flink/paimon-flink2-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
+++ b/paimon-flink/paimon-flink2-common/src/main/java/org/apache/paimon/flink/utils/RuntimeContextUtils.java
@@ -34,6 +34,10 @@ public static int getIndexOfThisSubtask(RuntimeContext context) {
         return context.getTaskInfo().getIndexOfThisSubtask();
     }
 
+    public static int getAttemptNumber(RuntimeContext context) {
+        return context.getTaskInfo().getAttemptNumber();
+    }
+
     public static @Nullable Integer getNumberOfParallelSubtasks(FunctionContext context) {
         return context.getTaskInfo().getNumberOfParallelSubtasks();
     }