diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/BucketedDvMaintainer.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/BucketedDvMaintainer.java index 788ea1cce85d..527d599f5d53 100644 --- a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/BucketedDvMaintainer.java +++ b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/BucketedDvMaintainer.java @@ -35,12 +35,13 @@ public class BucketedDvMaintainer { private final DeletionVectorsIndexFile dvIndexFile; - private final Map deletionVectors; + private final Map deletionVectors; protected final boolean bitmap64; private boolean modified; private BucketedDvMaintainer( - DeletionVectorsIndexFile dvIndexFile, Map deletionVectors) { + DeletionVectorsIndexFile dvIndexFile, + Map deletionVectors) { this.dvIndexFile = dvIndexFile; this.deletionVectors = deletionVectors; this.bitmap64 = dvIndexFile.bitmap64(); @@ -59,8 +60,12 @@ private DeletionVector createNewDeletionVector() { * @param position The row position within the file that has been deleted. 
*/ public void notifyNewDeletion(String fileName, long position) { + notifyNewDeletion(DeletionFileKey.ofFileName(fileName), position); + } + + public void notifyNewDeletion(DeletionFileKey key, long position) { DeletionVector deletionVector = - deletionVectors.computeIfAbsent(fileName, k -> createNewDeletionVector()); + deletionVectors.computeIfAbsent(key, k -> createNewDeletionVector()); if (deletionVector.checkedDelete(position)) { modified = true; } @@ -73,7 +78,11 @@ public void notifyNewDeletion(String fileName, long position) { * @param deletionVector The deletion vector */ public void notifyNewDeletion(String fileName, DeletionVector deletionVector) { - deletionVectors.put(fileName, deletionVector); + notifyNewDeletion(DeletionFileKey.ofFileName(fileName), deletionVector); + } + + public void notifyNewDeletion(DeletionFileKey key, DeletionVector deletionVector) { + deletionVectors.put(key, deletionVector); modified = true; } @@ -85,11 +94,15 @@ public void notifyNewDeletion(String fileName, DeletionVector deletionVector) { * @param deletionVector The deletion vector */ public void mergeNewDeletion(String fileName, DeletionVector deletionVector) { - DeletionVector old = deletionVectors.get(fileName); + mergeNewDeletion(DeletionFileKey.ofFileName(fileName), deletionVector); + } + + public void mergeNewDeletion(DeletionFileKey key, DeletionVector deletionVector) { + DeletionVector old = deletionVectors.get(key); if (old != null) { deletionVector.merge(old); } - deletionVectors.put(fileName, deletionVector); + deletionVectors.put(key, deletionVector); modified = true; } @@ -100,8 +113,12 @@ public void mergeNewDeletion(String fileName, DeletionVector deletionVector) { * @param fileName The name of the file whose deletion vector should be removed. 
*/ public void removeDeletionVectorOf(String fileName) { - if (deletionVectors.containsKey(fileName)) { - deletionVectors.remove(fileName); + removeDeletionVectorOf(DeletionFileKey.ofFileName(fileName)); + } + + public void removeDeletionVectorOf(DeletionFileKey key) { + if (deletionVectors.containsKey(key)) { + deletionVectors.remove(key); modified = true; } } @@ -128,7 +145,11 @@ public Optional writeDeletionVectorsIndex() { * Optional} if not. */ public Optional deletionVectorOf(String fileName) { - return Optional.ofNullable(deletionVectors.get(fileName)); + return deletionVectorOf(DeletionFileKey.ofFileName(fileName)); + } + + public Optional deletionVectorOf(DeletionFileKey key) { + return Optional.ofNullable(deletionVectors.get(key)); } public DeletionVectorsIndexFile dvIndexFile() { @@ -136,7 +157,7 @@ public DeletionVectorsIndexFile dvIndexFile() { } @VisibleForTesting - public Map deletionVectors() { + public Map deletionVectors() { return deletionVectors; } @@ -166,13 +187,15 @@ public BucketedDvMaintainer create( if (restoredFiles == null) { restoredFiles = Collections.emptyList(); } - Map deletionVectors = + Map deletionVectors = new HashMap<>(handler.readAllDeletionVectors(partition, bucket, restoredFiles)); return create(partition, bucket, deletionVectors); } public BucketedDvMaintainer create( - BinaryRow partition, int bucket, Map deletionVectors) { + BinaryRow partition, + int bucket, + Map deletionVectors) { return new BucketedDvMaintainer(handler.dvIndex(partition, bucket), deletionVectors); } } diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DataEvolutionApplyDvReader.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DataEvolutionApplyDvReader.java new file mode 100644 index 000000000000..fe9ad2e10283 --- /dev/null +++ b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DataEvolutionApplyDvReader.java @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or 
more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.deletionvectors; + +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.table.SpecialFields; +import org.apache.paimon.table.source.DeletionFile; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.Preconditions; +import org.apache.paimon.utils.ProjectedRow; +import org.apache.paimon.utils.Range; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +/** + * The RecordReader to apply deletion vectors for data evolution tables. At first, readType will be + * enriched by `_ROW_ID`, then the returned id will be filtered by DVs. + * + *
<p>
This reader assumes that the underlying reader will return monotonically incrementing + * _ROW_IDs, which is guaranteed by the current implementation. + */ +public class DataEvolutionApplyDvReader implements RecordReader { + + private final RecordReader reader; + private final List deletionVectors; + @Nullable private final ProjectedRow projectedRow; + private final int rowIdIndex; + + private long lastRowId = -1; + private int nextDvIndex; + private RowRangeDeletionVector currentDv; + + public DataEvolutionApplyDvReader(RecordReader reader, Info info) { + this.reader = reader; + this.deletionVectors = new ArrayList<>(info.deletionVectors); + this.deletionVectors.sort(Comparator.comparingLong(dv -> dv.range.from)); + this.rowIdIndex = info.rowIdIndex; + this.projectedRow = info.projectedRow; + + this.nextDvIndex = 1; + this.currentDv = deletionVectors.get(0); + } + + @Nullable + @Override + public RecordIterator readBatch() throws IOException { + RecordIterator iterator = reader.readBatch(); + if (iterator == null) { + return null; + } + + return new RecordIterator() { + + @Nullable + @Override + public InternalRow next() throws IOException { + while (true) { + InternalRow row = iterator.next(); + if (row == null) { + return null; + } + + if (!isDeleted(row)) { + if (projectedRow != null) { + return projectedRow.replaceRow(row); + } + return row; + } + } + } + + @Override + public void releaseBatch() { + iterator.releaseBatch(); + } + }; + } + + private boolean isDeleted(InternalRow row) { + long rowId = row.getLong(rowIdIndex); + checkRowIdMonotonicity(rowId); + + moveToPossibleDv(rowId); + + if (currentDv == null || !currentDv.mayContains(rowId)) { + return false; + } + + return currentDv.isDeleted(rowId); + } + + private void checkRowIdMonotonicity(long rowId) { + if (lastRowId >= 0) { + Preconditions.checkState( + rowId > lastRowId, + "This reader works only if underlying reader produces incremental _ROW_IDs."); + } + + lastRowId = rowId; + } + + private void 
moveToPossibleDv(long rowId) { + if (currentDv == null) { + return; + } + + while (rowId > currentDv.range.to) { + if (nextDvIndex >= deletionVectors.size()) { + currentDv = null; + return; + } + currentDv = deletionVectors.get(nextDvIndex); + nextDvIndex++; + } + } + + @Override + public void close() throws IOException { + reader.close(); + } + + public static Info readInfo( + FileIO fileIO, RowType readRowType, Map deletionFiles) + throws IOException { + if (deletionFiles == null || deletionFiles.isEmpty()) { + return Info.noDeletionVectors(readRowType); + } + + List deletionVectors = new ArrayList<>(deletionFiles.size()); + for (Map.Entry entry : deletionFiles.entrySet()) { + DeletionVector deletionVector = DeletionVector.read(fileIO, entry.getValue()); + if (!deletionVector.isEmpty()) { + deletionVectors.add(new RowRangeDeletionVector(entry.getKey(), deletionVector)); + } + } + if (deletionVectors.isEmpty()) { + return Info.noDeletionVectors(readRowType); + } + + int rowIdIndex = readRowType.getFieldIndex(SpecialFields.ROW_ID.name()); + RowType actualReadType = readRowType; + ProjectedRow projectedRow = null; + if (rowIdIndex == -1) { + actualReadType = SpecialFields.rowTypeWithRowId(readRowType); + rowIdIndex = actualReadType.getFieldCount() - 1; + int[] mappings = new int[readRowType.getFieldCount()]; + for (int i = 0; i < readRowType.getFieldCount(); i++) { + mappings[i] = i; + } + projectedRow = ProjectedRow.from(mappings); + } + + return new Info(deletionVectors, rowIdIndex, actualReadType, projectedRow); + } + + /** Information for data evolution deletion vector applying. 
*/ + public static class Info { + + private final List deletionVectors; + private final int rowIdIndex; + public final RowType actualReadType; + @Nullable private final ProjectedRow projectedRow; + + private Info( + List deletionVectors, + int rowIdIndex, + RowType actualReadType, + @Nullable ProjectedRow projectedRow) { + this.deletionVectors = deletionVectors; + this.rowIdIndex = rowIdIndex; + this.actualReadType = actualReadType; + this.projectedRow = projectedRow; + } + + private static Info noDeletionVectors(RowType readRowType) { + return new Info(Collections.emptyList(), -1, readRowType, null); + } + + public boolean hasDeletionVectors() { + return !deletionVectors.isEmpty(); + } + } + + /** Deletion Vector and range pair. */ + private static class RowRangeDeletionVector { + + private final Range range; + private final DeletionVector deletionVector; + + private RowRangeDeletionVector(Range range, DeletionVector deletionVector) { + this.range = range; + this.deletionVector = deletionVector; + } + + boolean mayContains(long rowId) { + return rowId <= range.to && rowId >= range.from; + } + + boolean isDeleted(long rowId) { + return deletionVector.isDeleted(rowId - range.from); + } + } +} diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionFileKey.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionFileKey.java new file mode 100644 index 000000000000..d696fa9386b2 --- /dev/null +++ b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionFileKey.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.deletionvectors; + +import org.apache.paimon.utils.Range; + +import java.io.Serializable; +import java.util.Collection; + +/** Key for associating a data deletion vector with data. */ +public abstract class DeletionFileKey implements Serializable { + + private static final long serialVersionUID = 1L; + + public static FileNameKey ofFileName(String fileName) { + return new FileNameKey(fileName); + } + + public static RowIdRangeKey ofRange(Range range) { + return new RowIdRangeKey(range); + } + + public static Type checkType(Collection keys) { + if (keys == null || keys.isEmpty()) { + throw new RuntimeException("Empty keys."); + } + + Type type = null; + for (DeletionFileKey key : keys) { + if (type == null) { + type = key.type(); + } else if (type != key.type()) { + throw new IllegalStateException( + "All DeletionFileKeys should always be the same type, it's a bug."); + } + } + + return type; + } + + @Override + public abstract boolean equals(Object o); + + @Override + public abstract int hashCode(); + + @Override + public abstract String toString(); + + public abstract Type type(); + + /** Type of this key. 
*/ + public enum Type { + FILE_NAME, + ROW_RANGE + } +} diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionFileWriter.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionFileWriter.java index ca2330664dd8..3f3ef537320f 100644 --- a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionFileWriter.java +++ b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionFileWriter.java @@ -38,7 +38,7 @@ public class DeletionFileWriter implements Closeable { private final Path path; private final boolean isExternalPath; private final DataOutputStream out; - private final LinkedHashMap dvMetas; + private final LinkedHashMap dvMetas; public DeletionFileWriter(IndexPathFactory pathFactory, FileIO fileIO) throws IOException { this.path = pathFactory.newPath(); @@ -53,6 +53,10 @@ public long getPos() { } public void write(String key, DeletionVector deletionVector) throws IOException { + write(DeletionFileKey.ofFileName(key), deletionVector); + } + + public void write(DeletionFileKey key, DeletionVector deletionVector) throws IOException { int start = out.size(); int length = deletionVector.serializeTo(out); dvMetas.put( diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVector.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVector.java index e06dc767a9f6..c28889910ea0 100644 --- a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVector.java +++ b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVector.java @@ -146,7 +146,7 @@ static DeletionVector read(DataInputStream dis, @Nullable Long length) throws IO } static Factory emptyFactory() { - return fileName -> Optional.empty(); + return key -> Optional.empty(); } static Factory factory(@Nullable BucketedDvMaintainer dvMaintainer) { @@ -159,8 +159,8 @@ static Factory factory(@Nullable BucketedDvMaintainer dvMaintainer) { static Factory factory( FileIO fileIO, 
List files, @Nullable List deletionFiles) { DeletionFile.Factory factory = DeletionFile.factory(files, deletionFiles); - return fileName -> { - Optional deletionFile = factory.create(fileName); + return key -> { + Optional deletionFile = factory.create(key); if (deletionFile.isPresent()) { return Optional.of(DeletionVector.read(fileIO, deletionFile.get())); } @@ -191,6 +191,6 @@ static DeletionVector deserializeFromBytes(byte[] bytes) { /** Interface to create {@link DeletionVector}. */ interface Factory { - Optional create(String fileName) throws IOException; + Optional create(DeletionFileKey key) throws IOException; } } diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVectorIndexFileWriter.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVectorIndexFileWriter.java index 6ce9e1c3875e..65e08ca0efe7 100644 --- a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVectorIndexFileWriter.java +++ b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVectorIndexFileWriter.java @@ -51,10 +51,11 @@ public DeletionVectorIndexFileWriter( * *
<p>
TODO: We can consider sending a message to delete the deletion file in the future. */ - public IndexFileMeta writeSingleFile(Map input) throws IOException { + public IndexFileMeta writeSingleFile(Map input) + throws IOException { DeletionFileWriter writer = new DeletionFileWriter(indexPathFactory, fileIO); try { - for (Map.Entry entry : input.entrySet()) { + for (Map.Entry entry : input.entrySet()) { writer.write(entry.getKey(), entry.getValue()); } } finally { @@ -63,25 +64,25 @@ public IndexFileMeta writeSingleFile(Map input) throws I return writer.result(); } - public List writeWithRolling(Map input) + public List writeWithRolling(Map input) throws IOException { if (input.isEmpty()) { return Collections.emptyList(); } List result = new ArrayList<>(); - Iterator> iterator = input.entrySet().iterator(); + Iterator> iterator = input.entrySet().iterator(); while (iterator.hasNext()) { result.add(tryWriter(iterator)); } return result; } - private IndexFileMeta tryWriter(Iterator> iterator) + private IndexFileMeta tryWriter(Iterator> iterator) throws IOException { DeletionFileWriter writer = new DeletionFileWriter(indexPathFactory, fileIO); try { while (iterator.hasNext()) { - Map.Entry entry = iterator.next(); + Map.Entry entry = iterator.next(); writer.write(entry.getKey(), entry.getValue()); if (writer.getPos() > targetSizeInBytes) { break; diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVectorsIndexFile.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVectorsIndexFile.java index e4606adf55d6..83c1c3d089a7 100644 --- a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVectorsIndexFile.java +++ b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/DeletionVectorsIndexFile.java @@ -70,19 +70,22 @@ public boolean bitmap64() { * value is the corresponding DeletionVector object. * @throws UncheckedIOException If an I/O error occurs while reading from the file. 
*/ - public Map readAllDeletionVectors(IndexFileMeta fileMeta) { - LinkedHashMap deletionVectorMetas = fileMeta.dvRanges(); + public Map readAllDeletionVectors(IndexFileMeta fileMeta) { + LinkedHashMap deletionVectorMetas = + fileMeta.dvRanges(); checkNotNull(deletionVectorMetas); - Map deletionVectors = new HashMap<>(); + Map deletionVectors = new HashMap<>(); Path filePath = pathFactory.toPath(fileMeta); try (SeekableInputStream inputStream = fileIO.newInputStream(filePath)) { checkVersion(inputStream); DataInputStream dataInputStream = new DataInputStream(inputStream); - for (DeletionVectorMeta deletionVectorMeta : deletionVectorMetas.values()) { + for (Map.Entry entry : + deletionVectorMetas.entrySet()) { + DeletionVectorMeta deletionVectorMeta = entry.getValue(); inputStream.seek(deletionVectorMeta.offset()); deletionVectors.put( - deletionVectorMeta.dataFileName(), + entry.getKey(), DeletionVector.read(dataInputStream, (long) deletionVectorMeta.length())); } } catch (Exception e) { @@ -96,16 +99,17 @@ public Map readAllDeletionVectors(IndexFileMeta fileMeta return deletionVectors; } - public Map readAllDeletionVectors(List indexFiles) { - Map deletionVectors = new HashMap<>(); + public Map readAllDeletionVectors( + List indexFiles) { + Map deletionVectors = new HashMap<>(); indexFiles.forEach(indexFile -> deletionVectors.putAll(readAllDeletionVectors(indexFile))); return deletionVectors; } /** Reads deletion vectors from a list of DeletionFile which belong to a same index file. 
*/ - public Map readDeletionVector( - Map dataFileToDeletionFiles) { - Map deletionVectors = new HashMap<>(); + public Map readDeletionVector( + Map dataFileToDeletionFiles) { + Map deletionVectors = new HashMap<>(); if (dataFileToDeletionFiles.isEmpty()) { return deletionVectors; } @@ -113,13 +117,13 @@ public Map readDeletionVector( String indexFile = dataFileToDeletionFiles.values().stream().findAny().get().path(); try (SeekableInputStream inputStream = fileIO.newInputStream(new Path(indexFile))) { checkVersion(inputStream); - for (String dataFile : dataFileToDeletionFiles.keySet()) { - DeletionFile deletionFile = dataFileToDeletionFiles.get(dataFile); + for (DeletionFileKey key : dataFileToDeletionFiles.keySet()) { + DeletionFile deletionFile = dataFileToDeletionFiles.get(key); checkArgument(deletionFile.path().equals(indexFile)); inputStream.seek(deletionFile.offset()); DataInputStream dataInputStream = new DataInputStream(inputStream); deletionVectors.put( - dataFile, DeletionVector.read(dataInputStream, deletionFile.length())); + key, DeletionVector.read(dataInputStream, deletionFile.length())); } } catch (Exception e) { throw new RuntimeException("Unable to read deletion vector from file: " + indexFile, e); @@ -140,7 +144,7 @@ public DeletionVector readDeletionVector(DeletionFile deletionFile) { } } - public IndexFileMeta writeSingleFile(Map input) { + public IndexFileMeta writeSingleFile(Map input) { try { return createWriter().writeSingleFile(input); } catch (IOException e) { @@ -148,7 +152,7 @@ public IndexFileMeta writeSingleFile(Map input) { } } - public List writeWithRolling(Map input) { + public List writeWithRolling(Map input) { try { return createWriter().writeWithRolling(input); } catch (IOException e) { diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/FileNameKey.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/FileNameKey.java new file mode 100644 index 000000000000..ae065f829bd1 --- /dev/null +++ 
b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/FileNameKey.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.deletionvectors; + +import java.util.Objects; + +/** A deletion vector key backed by data file name. 
*/ +public final class FileNameKey extends DeletionFileKey { + + private static final long serialVersionUID = 1L; + + private final String fileName; + + public FileNameKey(String fileName) { + this.fileName = fileName; + } + + public String fileName() { + return fileName; + } + + @Override + public boolean equals(Object o) { + if (o == null || getClass() != o.getClass()) { + return false; + } + FileNameKey that = (FileNameKey) o; + return Objects.equals(fileName, that.fileName); + } + + @Override + public int hashCode() { + return Objects.hash(fileName); + } + + @Override + public String toString() { + return "FileNameKey{" + "fileName='" + fileName + '\'' + '}'; + } + + @Override + public Type type() { + return Type.FILE_NAME; + } +} diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/RowIdRangeKey.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/RowIdRangeKey.java new file mode 100644 index 000000000000..5a308dd5a1d7 --- /dev/null +++ b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/RowIdRangeKey.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.deletionvectors; + +import org.apache.paimon.utils.Range; + +import java.util.Objects; + +/** + * A deletion vector key backed by an inclusive row id range. Note that the deletion vector of this + * key will store the relative offset within this range, just aligned with current FileNameKey. + */ +public final class RowIdRangeKey extends DeletionFileKey { + + private static final long serialVersionUID = 1L; + + private final Range range; + + public RowIdRangeKey(Range range) { + this.range = range; + } + + public Range range() { + return range; + } + + @Override + public boolean equals(Object o) { + if (o == null || getClass() != o.getClass()) { + return false; + } + RowIdRangeKey that = (RowIdRangeKey) o; + return Objects.equals(range, that.range); + } + + @Override + public int hashCode() { + return Objects.hash(range); + } + + @Override + public String toString() { + return "RowIdRangeKey{" + "range=" + range + '}'; + } + + @Override + public Type type() { + return Type.ROW_RANGE; + } +} diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/AppendDeleteFileMaintainer.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/AppendDeleteFileMaintainer.java index 70e00bc4adba..45675f410afd 100644 --- a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/AppendDeleteFileMaintainer.java +++ b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/AppendDeleteFileMaintainer.java @@ -20,6 +20,7 @@ import org.apache.paimon.annotation.VisibleForTesting; import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVector; import org.apache.paimon.deletionvectors.DeletionVectorsIndexFile; import org.apache.paimon.fs.Path; @@ -43,18 +44,18 @@ public class AppendDeleteFileMaintainer implements BaseAppendDeleteFileMaintaine private final DeletionVectorsIndexFile dvIndexFile; private final 
BinaryRow partition; - private final Map dataFileToDeletionFile; + private final Map dataFileToDeletionFile; private final Map indexNameToEntry; - private final Map> indexFileToDeletionFiles; - private final Map dataFileToIndexFile; + private final Map> indexFileToDeletionFiles; + private final Map dataFileToIndexFile; private final Set touchedIndexFiles; - private final Map deletionVectors; + private final Map deletionVectors; AppendDeleteFileMaintainer( DeletionVectorsIndexFile dvIndexFile, BinaryRow partition, List manifestEntries, - Map deletionFiles) { + Map deletionFiles) { this.dvIndexFile = dvIndexFile; this.partition = partition; this.dataFileToDeletionFile = new HashMap<>(deletionFiles); @@ -67,8 +68,8 @@ public class AppendDeleteFileMaintainer implements BaseAppendDeleteFileMaintaine this.indexFileToDeletionFiles = new HashMap<>(); this.dataFileToIndexFile = new HashMap<>(); - for (String dataFile : deletionFiles.keySet()) { - DeletionFile deletionFile = deletionFiles.get(dataFile); + for (DeletionFileKey dataFile : dataFileToDeletionFile.keySet()) { + DeletionFile deletionFile = dataFileToDeletionFile.get(dataFile); String indexFileName = new Path(deletionFile.path()).getName(); indexFileToDeletionFiles .computeIfAbsent(indexFileName, k -> new HashMap<>()) @@ -89,15 +90,23 @@ public int getBucket() { } public DeletionFile getDeletionFile(String dataFile) { - return this.dataFileToDeletionFile.get(dataFile); + return getDeletionFile(DeletionFileKey.ofFileName(dataFile)); + } + + public DeletionFile getDeletionFile(DeletionFileKey key) { + return this.dataFileToDeletionFile.get(key); } public void putDeletionFile(String dataFile, DeletionFile deletionFile) { - this.dataFileToDeletionFile.put(dataFile, deletionFile); + this.dataFileToDeletionFile.put(DeletionFileKey.ofFileName(dataFile), deletionFile); } public DeletionVector getDeletionVector(String dataFile) { - DeletionFile deletionFile = getDeletionFile(dataFile); + return 
getDeletionVector(DeletionFileKey.ofFileName(dataFile)); + } + + public DeletionVector getDeletionVector(DeletionFileKey key) { + DeletionFile deletionFile = getDeletionFile(key); if (deletionFile != null) { return dvIndexFile.readDeletionVector(deletionFile); } @@ -105,23 +114,27 @@ public DeletionVector getDeletionVector(String dataFile) { } public DeletionFile notifyRemovedDeletionVector(String dataFile) { - if (dataFileToIndexFile.containsKey(dataFile)) { - String indexFileName = dataFileToIndexFile.get(dataFile); + return notifyRemovedDeletionVector(DeletionFileKey.ofFileName(dataFile)); + } + + public DeletionFile notifyRemovedDeletionVector(DeletionFileKey key) { + if (dataFileToIndexFile.containsKey(key)) { + String indexFileName = dataFileToIndexFile.get(key); touchedIndexFiles.add(indexFileName); if (indexFileToDeletionFiles.containsKey(indexFileName)) { - return indexFileToDeletionFiles.get(indexFileName).remove(dataFile); + return indexFileToDeletionFiles.get(indexFileName).remove(key); } } return null; } @Override - public void notifyNewDeletionVector(String dataFile, DeletionVector deletionVector) { - DeletionFile previous = notifyRemovedDeletionVector(dataFile); + public void notifyNewDeletionVector(DeletionFileKey key, DeletionVector deletionVector) { + DeletionFile previous = notifyRemovedDeletionVector(key); if (previous != null) { deletionVector.merge(dvIndexFile.readDeletionVector(previous)); } - deletionVectors.put(dataFile, deletionVector); + deletionVectors.put(key, deletionVector); } @Override @@ -150,7 +163,7 @@ List writeUnchangedDeletionVector() { IndexManifestEntry oldEntry = indexNameToEntry.get(indexFile); // write unchanged deletion vector. 
- Map dataFileToDeletionFiles = + Map dataFileToDeletionFiles = indexFileToDeletionFiles.get(indexFile); if (!dataFileToDeletionFiles.isEmpty()) { List newIndexFiles = diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/BaseAppendDeleteFileMaintainer.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/BaseAppendDeleteFileMaintainer.java index b61985a17704..1934ea05a97a 100644 --- a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/BaseAppendDeleteFileMaintainer.java +++ b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/BaseAppendDeleteFileMaintainer.java @@ -21,6 +21,7 @@ import org.apache.paimon.Snapshot; import org.apache.paimon.data.BinaryRow; import org.apache.paimon.deletionvectors.BucketedDvMaintainer; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVector; import org.apache.paimon.index.DeletionVectorMeta; import org.apache.paimon.index.IndexFileHandler; @@ -55,7 +56,11 @@ public interface BaseAppendDeleteFileMaintainer { int getBucket(); - void notifyNewDeletionVector(String dataFile, DeletionVector deletionVector); + void notifyNewDeletionVector(DeletionFileKey key, DeletionVector deletionVector); + + default void notifyNewDeletionVector(String dataFile, DeletionVector deletionVector) { + notifyNewDeletionVector(DeletionFileKey.ofFileName(dataFile), deletionVector); + } List persist(); @@ -80,13 +85,15 @@ static AppendDeleteFileMaintainer forUnawareAppend( indexFileHandler.scan(snapshot, DELETION_VECTORS_INDEX).stream() .filter(e -> e.partition().equals(partition)) .collect(Collectors.toList()); - Map deletionFiles = new HashMap<>(); + Map deletionFiles = new HashMap<>(); for (IndexManifestEntry file : manifestEntries) { - LinkedHashMap dvMetas = file.indexFile().dvRanges(); + LinkedHashMap dvMetas = + file.indexFile().dvRanges(); checkNotNull(dvMetas); - for (DeletionVectorMeta dvMeta : dvMetas.values()) { + for 
(Map.Entry entry : dvMetas.entrySet()) { + DeletionVectorMeta dvMeta = entry.getValue(); deletionFiles.put( - dvMeta.dataFileName(), + entry.getKey(), new DeletionFile( indexFileHandler.filePath(file).toString(), dvMeta.offset(), diff --git a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/BucketedAppendDeleteFileMaintainer.java b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/BucketedAppendDeleteFileMaintainer.java index 7245ebfe121e..1a4619f78ac2 100644 --- a/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/BucketedAppendDeleteFileMaintainer.java +++ b/paimon-core/src/main/java/org/apache/paimon/deletionvectors/append/BucketedAppendDeleteFileMaintainer.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryRow; import org.apache.paimon.deletionvectors.BucketedDvMaintainer; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVector; import org.apache.paimon.manifest.FileKind; import org.apache.paimon.manifest.IndexManifestEntry; @@ -52,8 +53,8 @@ public int getBucket() { } @Override - public void notifyNewDeletionVector(String dataFile, DeletionVector deletionVector) { - maintainer.mergeNewDeletion(dataFile, deletionVector); + public void notifyNewDeletionVector(DeletionFileKey key, DeletionVector deletionVector) { + maintainer.mergeNewDeletion(key, deletionVector); } @Override diff --git a/paimon-core/src/main/java/org/apache/paimon/iceberg/IcebergCommitCallback.java b/paimon-core/src/main/java/org/apache/paimon/iceberg/IcebergCommitCallback.java index 4a70b705f72f..0c05f3953f87 100644 --- a/paimon-core/src/main/java/org/apache/paimon/iceberg/IcebergCommitCallback.java +++ b/paimon-core/src/main/java/org/apache/paimon/iceberg/IcebergCommitCallback.java @@ -23,6 +23,8 @@ import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.GenericArray; import org.apache.paimon.data.GenericRow; +import 
org.apache.paimon.deletionvectors.DeletionFileKey; +import org.apache.paimon.deletionvectors.FileNameKey; import org.apache.paimon.factories.FactoryException; import org.apache.paimon.factories.FactoryUtil; import org.apache.paimon.fs.Path; @@ -1309,10 +1311,16 @@ private List createDvManifestFileMetas(Snapshot snapsho return Collections.emptyList(); } for (IndexManifestEntry entry : newIndexes) { - LinkedHashMap dvMetas = entry.indexFile().dvRanges(); + LinkedHashMap dvMetas = + entry.indexFile().dvRanges(); Path bucketPath = fileStorePathFactory.bucketPath(entry.partition(), entry.bucket()); if (dvMetas != null) { - for (DeletionVectorMeta dvMeta : dvMetas.values()) { + for (Map.Entry dvEntry : dvMetas.entrySet()) { + if (!(dvEntry.getKey() instanceof FileNameKey)) { + continue; + } + DeletionVectorMeta dvMeta = dvEntry.getValue(); + String dataFileName = ((FileNameKey) dvEntry.getKey()).fileName(); // Iceberg will check the cardinality between deserialized dv and iceberg // deletion file, so if deletionFile.cardinality() is null, we should stop @@ -1321,7 +1329,7 @@ private List createDvManifestFileMetas(Snapshot snapsho dvMeta.cardinality() != null, "cardinality in DeletionVector is null, stop generate dv for iceberg. 
" + "dataFile path is {}, indexFile path is {}", - new Path(bucketPath, dvMeta.dataFileName()), + new Path(bucketPath, dataFileName), indexFileHandler.filePath(entry).toString()); IcebergDataFileMeta deleteFileMeta = @@ -1332,7 +1340,7 @@ private List createDvManifestFileMetas(Snapshot snapsho entry.partition(), dvMeta.cardinality(), entry.indexFile().fileSize(), - new Path(bucketPath, dvMeta.dataFileName()).toString(), + new Path(bucketPath, dataFileName).toString(), (long) dvMeta.offset(), (long) dvMeta.length()); diff --git a/paimon-core/src/main/java/org/apache/paimon/index/DeletionVectorMeta.java b/paimon-core/src/main/java/org/apache/paimon/index/DeletionVectorMeta.java index 175a263b0ab4..215dd3fbe708 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/DeletionVectorMeta.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/DeletionVectorMeta.java @@ -18,15 +18,23 @@ package org.apache.paimon.index; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalArray; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; +import org.apache.paimon.deletionvectors.FileNameKey; +import org.apache.paimon.deletionvectors.RowIdRangeKey; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.DataField; import org.apache.paimon.types.IntType; import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.Range; import javax.annotation.Nullable; import java.util.Objects; +import static org.apache.paimon.data.BinaryString.fromString; import static org.apache.paimon.utils.SerializationUtils.newStringType; /** Metadata of deletion vector. 
*/ @@ -34,26 +42,94 @@ public class DeletionVectorMeta { public static final RowType SCHEMA = RowType.of( - new DataField(0, "f0", newStringType(false)), + new DataField(0, "f0", newStringType(true)), new DataField(1, "f1", new IntType(false)), new DataField(2, "f2", new IntType(false)), new DataField(3, "_CARDINALITY", new BigIntType(true))); - private final String dataFileName; + public static final RowType ROW_ID_RANGE_SCHEMA = + RowType.of( + new DataField(0, "_ROW_ID_START", new BigIntType(false)), + new DataField(1, "_ROW_ID_END", new BigIntType(false)), + new DataField(2, "_OFFSET", new IntType(false)), + new DataField(3, "_LENGTH", new IntType(false)), + new DataField(4, "_CARDINALITY", new BigIntType(true))); + + private final DeletionFileKey key; private final int offset; private final int length; @Nullable private final Long cardinality; public DeletionVectorMeta( String dataFileName, int start, int length, @Nullable Long cardinality) { - this.dataFileName = dataFileName; + this(DeletionFileKey.ofFileName(dataFileName), start, length, cardinality); + } + + public DeletionVectorMeta( + DeletionFileKey key, int start, int length, @Nullable Long cardinality) { + this.key = key; this.offset = start; this.length = length; this.cardinality = cardinality; } + public GenericRow toRow() { + switch (key.type()) { + case FILE_NAME: + String fileName = ((FileNameKey) key).fileName(); + return GenericRow.of(fromString(fileName), offset, length, cardinality); + case ROW_RANGE: + Range range = ((RowIdRangeKey) key).range(); + return GenericRow.of(range.from, range.to, offset, length, cardinality); + default: + throw new UnsupportedOperationException("Unsupported key type: " + key.type()); + } + } + + public static GenericRow newLegacyMarkerRow() { + return GenericRow.of(null, 0, 0, null); + } + + public static boolean isLegacyMarker(InternalArray metas) { + if (metas != null && metas.size() == 1) { + InternalRow row = metas.getRow(0, SCHEMA.getFieldCount()); + return 
row.isNullAt(0); + } + return false; + } + + public static DeletionVectorMeta fromRow(DeletionFileKey.Type keyType, InternalRow row) { + switch (keyType) { + case FILE_NAME: + DeletionFileKey fileNameKey = + DeletionFileKey.ofFileName(row.getString(0).toString()); + return new DeletionVectorMeta( + fileNameKey, + row.getInt(1), + row.getInt(2), + row.isNullAt(3) ? null : row.getLong(3)); + case ROW_RANGE: + DeletionFileKey rowRangeKey = + DeletionFileKey.ofRange(new Range(row.getLong(0), row.getLong(1))); + return new DeletionVectorMeta( + rowRangeKey, + row.getInt(2), + row.getInt(3), + row.isNullAt(4) ? null : row.getLong(4)); + default: + throw new UnsupportedOperationException("Unsupported key type: " + keyType); + } + } + + public DeletionFileKey key() { + return key; + } + public String dataFileName() { - return dataFileName; + if (key instanceof FileNameKey) { + return ((FileNameKey) key).fileName(); + } + throw new IllegalStateException("Deletion vector key is not file-name based: " + key); } public int offset() { @@ -77,21 +153,20 @@ public boolean equals(Object o) { DeletionVectorMeta that = (DeletionVectorMeta) o; return offset == that.offset && length == that.length - && Objects.equals(dataFileName, that.dataFileName) + && Objects.equals(key, that.key) && Objects.equals(cardinality, that.cardinality); } @Override public int hashCode() { - return Objects.hash(dataFileName, offset, length, cardinality); + return Objects.hash(key, offset, length, cardinality); } @Override public String toString() { return "DeletionVectorMeta{" - + "dataFileName='" - + dataFileName - + '\'' + + "key=" + + key + ", offset=" + offset + ", length=" diff --git a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileHandler.java b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileHandler.java index e848ed205e14..1bf5236fb7a3 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileHandler.java +++ 
b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileHandler.java @@ -21,6 +21,7 @@ import org.apache.paimon.Snapshot; import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVector; import org.apache.paimon.deletionvectors.DeletionVectorsIndexFile; import org.apache.paimon.fs.FileIO; @@ -272,12 +273,12 @@ public void deleteManifest(String indexManifest) { indexManifestFile.delete(indexManifest); } - public Map readAllDeletionVectors( + public Map readAllDeletionVectors( BinaryRow partition, int bucket, List fileMetas) { return dvIndex(partition, bucket).readAllDeletionVectors(fileMetas); } - public Map readAllDeletionVectors(IndexManifestEntry entry) { + public Map readAllDeletionVectors(IndexManifestEntry entry) { return dvIndex(entry.partition(), entry.bucket()).readAllDeletionVectors(entry.indexFile()); } } diff --git a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMeta.java b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMeta.java index a7c257a46e50..84fe081989bc 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMeta.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMeta.java @@ -19,6 +19,7 @@ package org.apache.paimon.index; import org.apache.paimon.annotation.Public; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVectorsIndexFile; import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; @@ -54,7 +55,11 @@ public class IndexFileMeta { "_DELETIONS_VECTORS_RANGES", new ArrayType(true, DeletionVectorMeta.SCHEMA)), new DataField(5, "_EXTERNAL_PATH", newStringType(true)), - new DataField(6, "_GLOBAL_INDEX", GlobalIndexMeta.SCHEMA))); + new DataField(6, "_GLOBAL_INDEX", GlobalIndexMeta.SCHEMA), + new DataField( + 7, + "_DELETION_VECTOR_ROW_ID_RANGES", + new 
ArrayType(true, DeletionVectorMeta.ROW_ID_RANGE_SCHEMA)))); private final String indexType; private final String fileName; @@ -67,7 +72,7 @@ public class IndexFileMeta { * Metadata only used by {@link DeletionVectorsIndexFile}, use LinkedHashMap to ensure that the * order of DeletionVectorMetas and the written DeletionVectors is consistent. */ - private final @Nullable LinkedHashMap dvRanges; + private final @Nullable LinkedHashMap dvRanges; private final @Nullable String externalPath; @@ -76,7 +81,7 @@ public IndexFileMeta( String fileName, long fileSize, long rowCount, - @Nullable LinkedHashMap dvRanges, + @Nullable LinkedHashMap dvRanges, @Nullable String externalPath) { this(indexType, fileName, fileSize, rowCount, dvRanges, externalPath, null); } @@ -86,7 +91,7 @@ public IndexFileMeta( String fileName, long fileSize, long rowCount, - @Nullable LinkedHashMap dvRanges, + @Nullable LinkedHashMap dvRanges, @Nullable String externalPath, @Nullable GlobalIndexMeta globalIndexMeta) { this.indexType = indexType; @@ -129,7 +134,7 @@ public long rowCount() { return rowCount; } - public @Nullable LinkedHashMap dvRanges() { + public @Nullable LinkedHashMap dvRanges() { return dvRanges; } diff --git a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaSerializer.java b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaSerializer.java index 6d98e61248bb..47bc02840020 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaSerializer.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaSerializer.java @@ -22,13 +22,21 @@ import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.utils.ObjectSerializer; import org.apache.paimon.utils.VersionedObjectSerializer; -import java.util.Collection; +import javax.annotation.Nullable; + +import java.util.Collections; 
import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import static org.apache.paimon.data.BinaryString.fromString; +import static org.apache.paimon.deletionvectors.DeletionVectorsIndexFile.DELETION_VECTORS_INDEX; +import static org.apache.paimon.utils.Preconditions.checkState; /** A {@link VersionedObjectSerializer} for {@link IndexFileMeta}. */ public class IndexFileMetaSerializer extends ObjectSerializer { @@ -51,14 +59,17 @@ public InternalRow toRow(IndexFileMeta record) { ? null : new GenericArray(globalIndexMeta.extraFieldIds()), globalIndexMeta.indexMeta()); + LinkedHashMap dvRanges = record.dvRanges(); + return GenericRow.of( fromString(record.indexType()), fromString(record.fileName()), record.fileSize(), record.rowCount(), - dvMetasToRowArrayData(record.dvRanges()), + metasToRowArrayData(dvRanges, DeletionFileKey.Type.FILE_NAME), fromString(record.externalPath()), - globalIndexRow); + globalIndexRow, + metasToRowArrayData(dvRanges, DeletionFileKey.Type.ROW_RANGE)); } @Override @@ -81,46 +92,111 @@ public IndexFileMeta fromRow(InternalRow row) { row.getString(1).toString(), row.getLong(2), row.getLong(3), - row.isNullAt(4) ? null : rowArrayDataToDvMetas(row.getArray(4)), + readDvRanges(row), row.isNullAt(5) ? null : row.getString(5).toString(), globalIndexMeta); } - public static InternalArray dvMetasToRowArrayData( - LinkedHashMap dvRanges) { - if (dvRanges == null) { + // ----------------------- Write methods ------------------------------- + + /** + * Serialize the dvMetas to a GenericArray. Note that we set an invalid marker row to + * fileNameDv field if current rowRangeDv is not empty. This fast-fail path can prevent older
+ */ + public static InternalArray metasToRowArrayData( + Map dvMetas, DeletionFileKey.Type type) { + if (dvMetas == null || dvMetas.isEmpty()) { return null; } - return dvMetasToRowArrayData(dvRanges.values()); - } - public static InternalArray dvMetasToRowArrayData(Collection dvMetas) { - return new GenericArray( - dvMetas.stream() - .map( - dvMeta -> - GenericRow.of( - fromString(dvMeta.dataFileName()), - dvMeta.offset(), - dvMeta.length(), - dvMeta.cardinality())) - .toArray(GenericRow[]::new)); + List rows; + + if (DeletionFileKey.checkType(dvMetas.keySet()) != type) { + if (type == DeletionFileKey.Type.FILE_NAME) { + // If dvRanges are row-range keyed, set a legacy marker row + rows = Collections.singletonList(DeletionVectorMeta.newLegacyMarkerRow()); + } else { + return null; + } + } else { + rows = + dvMetas.values().stream() + .map(DeletionVectorMeta::toRow) + .collect(Collectors.toList()); + } + + return new GenericArray(rows.toArray(new GenericRow[0])); } - public static LinkedHashMap rowArrayDataToDvMetas( + // ------------------------ Read methods ------------------------------- + + public static LinkedHashMap rowArrayDataToFileNameDvMetas( InternalArray arrayData) { - LinkedHashMap dvMetas = new LinkedHashMap<>(arrayData.size()); + return rowArrayDataToDvMetas( + arrayData, + DeletionFileKey.Type.FILE_NAME, + DeletionVectorMeta.SCHEMA.getFieldCount()); + } + + public static LinkedHashMap + rowArrayDataToRowIdRangeDvMetas(InternalArray arrayData) { + return rowArrayDataToDvMetas( + arrayData, + DeletionFileKey.Type.ROW_RANGE, + DeletionVectorMeta.ROW_ID_RANGE_SCHEMA.getFieldCount()); + } + + private static LinkedHashMap rowArrayDataToDvMetas( + InternalArray arrayData, DeletionFileKey.Type keyType, int fieldCount) { + LinkedHashMap dvMetas = + new LinkedHashMap<>(arrayData.size()); for (int i = 0; i < arrayData.size(); i++) { - InternalRow row = arrayData.getRow(i, DeletionVectorMeta.SCHEMA.getFieldCount()); - String dataFileName = 
row.getString(0).toString(); - dvMetas.put( - dataFileName, - new DeletionVectorMeta( - dataFileName, - row.getInt(1), - row.getInt(2), - row.isNullAt(3) ? null : row.getLong(3))); + if (arrayData.isNullAt(i)) { + continue; + } + DeletionVectorMeta dvMeta = + DeletionVectorMeta.fromRow(keyType, arrayData.getRow(i, fieldCount)); + dvMetas.put(dvMeta.key(), dvMeta); } return dvMetas; } + + private static LinkedHashMap readDvRanges( + InternalRow row) { + InternalArray fileNameDvRanges = row.isNullAt(4) ? null : row.getArray(4); + InternalArray rowRangeDvRanges = + row.getFieldCount() > 7 && !row.isNullAt(7) ? row.getArray(7) : null; + return readDvRanges( + row.getString(0).toString(), row.getLong(3), fileNameDvRanges, rowRangeDvRanges); + } + + public static LinkedHashMap readDvRanges( + String indexType, + long rowCount, + @Nullable InternalArray fileNameDvRanges, + @Nullable InternalArray rowRangeDvRanges) { + boolean hasFileNameDvRanges = + fileNameDvRanges != null && !DeletionVectorMeta.isLegacyMarker(fileNameDvRanges); + boolean hasRowRangeDvRanges = rowRangeDvRanges != null; + checkState( + !(hasFileNameDvRanges && hasRowRangeDvRanges), + "File-name deletion vector ranges and row-range deletion vector ranges should not" + + " be both non-null."); + if (hasFileNameDvRanges) { + return rowArrayDataToFileNameDvMetas(fileNameDvRanges); + } else if (hasRowRangeDvRanges) { + return rowArrayDataToRowIdRangeDvMetas(rowRangeDvRanges); + } + + if (!DELETION_VECTORS_INDEX.equals(indexType)) { + return null; + } + + checkState( + rowCount == 0, + "Invalid state, all null dvRanges with non-zero row count: " + rowCount); + + return new LinkedHashMap<>(); + } } diff --git a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV1Deserializer.java b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV1Deserializer.java index 36e5190540c7..031968d652a3 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV1Deserializer.java +++ 
b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV1Deserializer.java @@ -22,6 +22,7 @@ import org.apache.paimon.data.InternalRow; import org.apache.paimon.data.serializer.InternalRowSerializer; import org.apache.paimon.data.serializer.InternalSerializers; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.io.DataInputView; import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; @@ -90,15 +91,14 @@ public IndexFileMeta deserialize(DataInputView in) throws IOException { return fromRow(rowSerializer.deserialize(in)); } - public static LinkedHashMap rowArrayDataToDvMetas( + public static LinkedHashMap rowArrayDataToDvMetas( InternalArray arrayData) { - LinkedHashMap dvMetas = new LinkedHashMap<>(arrayData.size()); + LinkedHashMap dvMetas = + new LinkedHashMap<>(arrayData.size()); for (int i = 0; i < arrayData.size(); i++) { InternalRow row = arrayData.getRow(i, 3); - dvMetas.put( - row.getString(0).toString(), - new DeletionVectorMeta( - row.getString(0).toString(), row.getInt(1), row.getInt(2), null)); + DeletionFileKey key = DeletionFileKey.ofFileName(row.getString(0).toString()); + dvMetas.put(key, new DeletionVectorMeta(key, row.getInt(1), row.getInt(2), null)); } return dvMetas; } diff --git a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV2Deserializer.java b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV2Deserializer.java index 32f192938f19..cf8d8fd8dfca 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV2Deserializer.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV2Deserializer.java @@ -18,9 +18,11 @@ package org.apache.paimon.index; +import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalRow; import org.apache.paimon.data.serializer.InternalRowSerializer; import org.apache.paimon.data.serializer.InternalSerializers; +import 
org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.io.DataInputView; import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; @@ -31,9 +33,9 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; +import java.util.LinkedHashMap; import java.util.List; -import static org.apache.paimon.index.IndexFileMetaSerializer.rowArrayDataToDvMetas; import static org.apache.paimon.utils.SerializationUtils.newStringType; /** Serializer for {@link IndexFileMeta} with 1.2 version. */ @@ -82,4 +84,23 @@ public final List deserializeList(DataInputView source) throws IO public IndexFileMeta deserialize(DataInputView in) throws IOException { return fromRow(rowSerializer.deserialize(in)); } + + public static LinkedHashMap rowArrayDataToDvMetas( + InternalArray arrayData) { + LinkedHashMap dvMetas = + new LinkedHashMap<>(arrayData.size()); + for (int i = 0; i < arrayData.size(); i++) { + InternalRow row = arrayData.getRow(i, DeletionVectorMeta.SCHEMA.getFieldCount()); + String dataFileName = row.getString(0).toString(); + DeletionFileKey key = DeletionFileKey.ofFileName(dataFileName); + dvMetas.put( + key, + new DeletionVectorMeta( + key, + row.getInt(1), + row.getInt(2), + row.isNullAt(3) ? 
null : row.getLong(3))); + } + return dvMetas; + } } diff --git a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV3Deserializer.java b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV3Deserializer.java index ac38b7c11b8c..9325b2fdb4e8 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV3Deserializer.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV3Deserializer.java @@ -18,9 +18,11 @@ package org.apache.paimon.index; +import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalRow; import org.apache.paimon.data.serializer.InternalRowSerializer; import org.apache.paimon.data.serializer.InternalSerializers; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.io.DataInputView; import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; @@ -32,9 +34,9 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; +import java.util.LinkedHashMap; import java.util.List; -import static org.apache.paimon.index.IndexFileMetaSerializer.rowArrayDataToDvMetas; import static org.apache.paimon.utils.SerializationUtils.newStringType; /** A {@link VersionedObjectSerializer} for {@link IndexFileMeta}. 
*/ @@ -84,4 +86,9 @@ public final List deserializeList(DataInputView source) throws IO public IndexFileMeta deserialize(DataInputView in) throws IOException { return fromRow(rowSerializer.deserialize(in)); } + + public static LinkedHashMap rowArrayDataToDvMetas( + InternalArray arrayData) { + return IndexFileMetaV2Deserializer.rowArrayDataToDvMetas(arrayData); + } } diff --git a/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV4Deserializer.java b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV4Deserializer.java new file mode 100644 index 000000000000..e185e7d8142c --- /dev/null +++ b/paimon-core/src/main/java/org/apache/paimon/index/IndexFileMetaV4Deserializer.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.index; + +import org.apache.paimon.data.InternalArray; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.serializer.InternalRowSerializer; +import org.apache.paimon.data.serializer.InternalSerializers; +import org.apache.paimon.deletionvectors.DeletionFileKey; +import org.apache.paimon.io.DataInputView; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.BigIntType; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.RowType; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.List; + +import static org.apache.paimon.utils.SerializationUtils.newStringType; + +/** Deserializer for {@link IndexFileMeta} in commit message version 11. */ +public class IndexFileMetaV4Deserializer implements Serializable { + + private static final long serialVersionUID = 1L; + + public static final RowType SCHEMA = + new RowType( + false, + Arrays.asList( + new DataField(0, "_INDEX_TYPE", newStringType(false)), + new DataField(1, "_FILE_NAME", newStringType(false)), + new DataField(2, "_FILE_SIZE", new BigIntType(false)), + new DataField(3, "_ROW_COUNT", new BigIntType(false)), + new DataField( + 4, + "_DELETIONS_VECTORS_RANGES", + new ArrayType(true, DeletionVectorMeta.SCHEMA)), + new DataField(5, "_EXTERNAL_PATH", newStringType(true)), + new DataField(6, "_GLOBAL_INDEX", GlobalIndexMeta.SCHEMA))); + + protected final InternalRowSerializer rowSerializer; + + public IndexFileMetaV4Deserializer() { + this.rowSerializer = InternalSerializers.create(SCHEMA); + } + + public IndexFileMeta fromRow(InternalRow row) { + GlobalIndexMeta globalIndexMeta = null; + if (!row.isNullAt(6)) { + InternalRow globalIndexRow = row.getRow(6, 5); + globalIndexMeta = + new GlobalIndexMeta( + globalIndexRow.getLong(0), + globalIndexRow.getLong(1), + globalIndexRow.getInt(2), + 
globalIndexRow.isNullAt(3) + ? null + : globalIndexRow.getArray(3).toIntArray(), + globalIndexRow.isNullAt(4) ? null : globalIndexRow.getBinary(4)); + } + + return new IndexFileMeta( + row.getString(0).toString(), + row.getString(1).toString(), + row.getLong(2), + row.getLong(3), + row.isNullAt(4) ? null : rowArrayDataToDvMetas(row.getArray(4)), + row.isNullAt(5) ? null : row.getString(5).toString(), + globalIndexMeta); + } + + public final List deserializeList(DataInputView source) throws IOException { + int size = source.readInt(); + List records = new ArrayList<>(size); + for (int i = 0; i < size; i++) { + records.add(deserialize(source)); + } + return records; + } + + public IndexFileMeta deserialize(DataInputView in) throws IOException { + return fromRow(rowSerializer.deserialize(in)); + } + + public static LinkedHashMap rowArrayDataToDvMetas( + InternalArray arrayData) { + return IndexFileMetaV2Deserializer.rowArrayDataToDvMetas(arrayData); + } +} diff --git a/paimon-core/src/main/java/org/apache/paimon/io/KeyValueFileReaderFactory.java b/paimon-core/src/main/java/org/apache/paimon/io/KeyValueFileReaderFactory.java index 5f7e3741927e..1c56b86238dc 100644 --- a/paimon-core/src/main/java/org/apache/paimon/io/KeyValueFileReaderFactory.java +++ b/paimon-core/src/main/java/org/apache/paimon/io/KeyValueFileReaderFactory.java @@ -23,6 +23,7 @@ import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.InternalRow; import org.apache.paimon.deletionvectors.ApplyDeletionVectorReader; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVector; import org.apache.paimon.format.FileFormatDiscover; import org.apache.paimon.format.FormatKey; @@ -166,7 +167,8 @@ private FileRecordReader createRecordReader( -1, Collections.emptyMap()); - Optional deletionVector = dvFactory.create(file.fileName()); + Optional deletionVector = + dvFactory.create(DeletionFileKey.ofFileName(file.fileName())); if 
(deletionVector.isPresent() && !deletionVector.get().isEmpty()) { fileRecordReader = new ApplyDeletionVectorReader(fileRecordReader, deletionVector.get()); diff --git a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestEntry.java b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestEntry.java index 10ac3a86a2e4..d316676a030c 100644 --- a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestEntry.java +++ b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestEntry.java @@ -61,7 +61,11 @@ public class IndexManifestEntry { "_DELETIONS_VECTORS_RANGES", new ArrayType(true, DeletionVectorMeta.SCHEMA)), new DataField(8, "_EXTERNAL_PATH", newStringType(true)), - new DataField(9, "_GLOBAL_INDEX", GlobalIndexMeta.SCHEMA))); + new DataField(9, "_GLOBAL_INDEX", GlobalIndexMeta.SCHEMA), + new DataField( + 10, + "_DELETION_VECTOR_ROW_ID_RANGES", + new ArrayType(true, DeletionVectorMeta.ROW_ID_RANGE_SCHEMA)))); private final FileKind kind; private final BinaryRow partition; diff --git a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestEntrySerializer.java b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestEntrySerializer.java index 60113adff1c9..e98631fa3774 100644 --- a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestEntrySerializer.java +++ b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestEntrySerializer.java @@ -22,15 +22,18 @@ import org.apache.paimon.data.GenericArray; import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; +import org.apache.paimon.index.DeletionVectorMeta; import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileMeta; import org.apache.paimon.utils.VersionedObjectSerializer; +import java.util.LinkedHashMap; import java.util.function.Function; import static org.apache.paimon.data.BinaryString.fromString; -import 
static org.apache.paimon.index.IndexFileMetaSerializer.dvMetasToRowArrayData; -import static org.apache.paimon.index.IndexFileMetaSerializer.rowArrayDataToDvMetas; +import static org.apache.paimon.index.IndexFileMetaSerializer.metasToRowArrayData; +import static org.apache.paimon.index.IndexFileMetaSerializer.readDvRanges; import static org.apache.paimon.utils.SerializationUtils.deserializeBinaryRow; import static org.apache.paimon.utils.SerializationUtils.serializeBinaryRow; @@ -69,9 +72,10 @@ public InternalRow convertTo(IndexManifestEntry record) { fromString(indexFile.fileName()), indexFile.fileSize(), indexFile.rowCount(), - dvMetasToRowArrayData(indexFile.dvRanges()), + metasToRowArrayData(indexFile.dvRanges(), DeletionFileKey.Type.FILE_NAME), fromString(indexFile.externalPath()), - globalIndexRow); + globalIndexRow, + metasToRowArrayData(indexFile.dvRanges(), DeletionFileKey.Type.ROW_RANGE)); } @Override @@ -94,16 +98,25 @@ public IndexManifestEntry convertFrom(int version, InternalRow row) { rowRangeStart, rowRangeEnd, indexFieldId, extralFields, indexMeta); } + String indexType = row.getString(3).toString(); + long rowCount = row.getLong(6); + LinkedHashMap dvRanges = + readDvRanges( + indexType, + rowCount, + row.isNullAt(7) ? null : row.getArray(7), + row.getFieldCount() > 10 && !row.isNullAt(10) ? row.getArray(10) : null); + return new IndexManifestEntry( FileKind.fromByteValue(row.getByte(0)), deserializeBinaryRow(row.getBinary(1)), row.getInt(2), new IndexFileMeta( - row.getString(3).toString(), + indexType, row.getString(4).toString(), row.getLong(5), - row.getLong(6), - row.isNullAt(7) ? null : rowArrayDataToDvMetas(row.getArray(7)), + rowCount, + dvRanges, row.isNullAt(8) ? 
null : row.getString(8).toString(), globalIndexMeta)); } diff --git a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java index f99278085550..9b68eb9810de 100644 --- a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java +++ b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java @@ -19,6 +19,8 @@ package org.apache.paimon.manifest; import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; +import org.apache.paimon.deletionvectors.RowIdRangeKey; import org.apache.paimon.index.DeletionVectorMeta; import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileMeta; @@ -30,6 +32,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; @@ -121,10 +124,11 @@ static class GlobalCombiner implements IndexManifestFileCombiner { public List combine( List prevIndexFiles, List newIndexFiles) { Map indexEntries = new HashMap<>(); - Set dvDataFiles = new HashSet<>(); + Set dvDataFiles = new HashSet<>(); for (IndexManifestEntry entry : prevIndexFiles) { indexEntries.put(entry.indexFile().fileName(), entry); - LinkedHashMap dvRanges = entry.indexFile().dvRanges(); + LinkedHashMap dvRanges = + entry.indexFile().dvRanges(); if (dvRanges != null) { dvDataFiles.addAll(dvRanges.keySet()); } @@ -132,14 +136,15 @@ public List combine( for (IndexManifestEntry entry : newIndexFiles) { String fileName = entry.indexFile().fileName(); - LinkedHashMap dvRanges = entry.indexFile().dvRanges(); + LinkedHashMap dvRanges = + entry.indexFile().dvRanges(); if (entry.kind() == FileKind.ADD) { checkState( !indexEntries.containsKey(fileName), "Trying to add file %s which is already added.", fileName); if (dvRanges != null) 
{ - for (String dataFile : dvRanges.keySet()) { + for (DeletionFileKey dataFile : dvRanges.keySet()) { checkState( !dvDataFiles.contains(dataFile), "Trying to add dv for data file %s which is already added.", @@ -154,7 +159,7 @@ public List combine( "Trying to delete file %s which is not exists.", fileName); if (dvRanges != null) { - for (String dataFile : dvRanges.keySet()) { + for (DeletionFileKey dataFile : dvRanges.keySet()) { checkState( dvDataFiles.contains(dataFile), "Trying to delete dv for data file %s which is not exists.", @@ -165,6 +170,31 @@ public List combine( indexEntries.remove(fileName); } } + + if (!dvDataFiles.isEmpty()) { + // Check all deletion files with the same key type + if (DeletionFileKey.checkType(dvDataFiles) == DeletionFileKey.Type.ROW_RANGE) { + // Check row range deletion files non-overlapping + List fileRanges = + dvDataFiles.stream() + .map(key -> ((RowIdRangeKey) key).range()) + .sorted(Comparator.comparingLong(range -> range.from)) + .collect(Collectors.toList()); + + Range prevRange = null; + for (Range range : fileRanges) { + if (prevRange != null) { + checkState( + prevRange.to < range.from, + "Found overlapping row range %s and %s for data-evolution deletion files.", + prevRange, + range); + } + prevRange = range; + } + } + } + return new ArrayList<>(indexEntries.values()); } } diff --git a/paimon-core/src/main/java/org/apache/paimon/operation/BlobFallbackRecordReader.java b/paimon-core/src/main/java/org/apache/paimon/operation/BlobFallbackRecordReader.java index 30f214afcf9f..3994296461c0 100644 --- a/paimon-core/src/main/java/org/apache/paimon/operation/BlobFallbackRecordReader.java +++ b/paimon-core/src/main/java/org/apache/paimon/operation/BlobFallbackRecordReader.java @@ -24,6 +24,7 @@ import org.apache.paimon.data.InternalRow; import org.apache.paimon.io.DataFileMeta; import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.table.SpecialFields; import org.apache.paimon.types.RowType; import 
org.apache.paimon.utils.Preconditions; import org.apache.paimon.utils.Range; @@ -55,8 +56,10 @@ public class BlobFallbackRecordReader implements RecordReader { private final List> groupReaders = new ArrayList<>(); - private final int blobIndex; private final int fieldCount; + private final int blobIndex; + private final int rowIdIndex; + private final int seqNumIndex; private boolean returned; BlobFallbackRecordReader( @@ -66,6 +69,8 @@ public class BlobFallbackRecordReader implements RecordReader { RowType readRowType, int blobIndex) { this.blobIndex = blobIndex; + this.rowIdIndex = readRowType.getFieldIndex(SpecialFields.ROW_ID.name()); + this.seqNumIndex = readRowType.getFieldIndex(SpecialFields.SEQUENCE_NUMBER.name()); this.fieldCount = readRowType.getFieldCount(); checkArgument(!files.isEmpty(), "Blob bunch should not be empty."); @@ -104,11 +109,14 @@ public class BlobFallbackRecordReader implements RecordReader { groupReaders.add( new ForceSingleBatchReader( new BlobSequenceGroupRecordReader( + entry.getKey(), groupFiles, readerFactory, rowRanges, readRowType, blobIndex, + rowIdIndex, + seqNumIndex, firstRowId, lastRowId))); } @@ -146,6 +154,7 @@ public RecordIterator readBatch() throws IOException { @Override public InternalRow next() throws IOException { InternalRow result = null; + long rowId = -1L; // We should always move each iterator forward // This may significantly increase memory usage and decrease read efficiency // if `blob-as-descriptor` is disabled and many non-null blobs are updated @@ -172,9 +181,12 @@ public InternalRow next() throws IOException { if (result == null && !isPlaceHolder(row)) { result = row; } + if (rowIdIndex >= 0 && rowId < 0) { + rowId = row.getLong(rowIdIndex); + } } if (result == null) { - result = nullBlobRow(); + result = nullBlobRow(rowId); } return result; } @@ -188,8 +200,16 @@ public void releaseBatch() { }; } - private InternalRow nullBlobRow() { - return new GenericRow(fieldCount); + private InternalRow 
nullBlobRow(long rowId) { + GenericRow row = new GenericRow(fieldCount); + if (rowIdIndex >= 0) { + row.setField(rowIdIndex, rowId); + } + // set the seq num to -1 indicating an all-placeholder row. + if (seqNumIndex >= 0) { + row.setField(seqNumIndex, -1L); + } + return row; } private boolean isPlaceHolder(InternalRow row) { @@ -262,14 +282,18 @@ public void close() throws IOException { */ public static class BlobSequenceGroupRecordReader implements RecordReader { + private final long maxSeq; private final List files; private final BlobFileReaderFactory readerFactory; // pushed row ranges private final List rowRanges; private final RowType readRowType; - private final int blobIndex; private final long lastRowId; + private final int blobIndex; + private final int rowIdIndex; + private final int seqNumIndex; + private RecordReader currentReader; private DataFileMeta currentFile; private int nextFileIndex; @@ -277,28 +301,30 @@ public static class BlobSequenceGroupRecordReader implements RecordReader files, BlobFileReaderFactory readerFactory, List rowRanges, RowType readRowType, int blobIndex, + int rowIdIndex, + int seqNumIndex, long firstRowId, long lastRowId) { + this.maxSeq = maxSeq; this.files = files; this.readerFactory = readerFactory; this.rowRanges = rowRanges == null ? 
null : Range.sortAndMergeOverlap(rowRanges); this.readRowType = readRowType; this.blobIndex = blobIndex; + this.rowIdIndex = rowIdIndex; + this.seqNumIndex = seqNumIndex; this.lastRowId = lastRowId; this.nextFileIndex = 0; this.nextRowRangeIndex = 0; setNextRowId(firstRowId); - - this.placeholderRow = null; } @Nullable @@ -383,7 +409,7 @@ public InternalRow next() { return null; } setNextRowId(rowId + 1); - return placeHolderRow(); + return placeHolderRow(rowId); } @Override @@ -393,13 +419,16 @@ public void releaseBatch() { }; } - private InternalRow placeHolderRow() { - if (placeholderRow == null) { - GenericRow row = new GenericRow(readRowType.getFieldCount()); - row.setField(blobIndex, BlobPlaceholder.INSTANCE); - placeholderRow = row; + private InternalRow placeHolderRow(long rowId) { + GenericRow row = new GenericRow(readRowType.getFieldCount()); + row.setField(blobIndex, BlobPlaceholder.INSTANCE); + if (rowIdIndex >= 0) { + row.setField(rowIdIndex, rowId); + } + if (seqNumIndex >= 0) { + row.setField(seqNumIndex, maxSeq); } - return placeholderRow; + return row; } private long lastRowId(DataFileMeta file) { diff --git a/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionSplitRead.java b/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionSplitRead.java index 145fdc9ad4af..2426a001428a 100644 --- a/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionSplitRead.java +++ b/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionSplitRead.java @@ -23,6 +23,7 @@ import org.apache.paimon.append.ForceSingleBatchReader; import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.deletionvectors.DataEvolutionApplyDvReader; import org.apache.paimon.disk.IOManager; import org.apache.paimon.format.FileFormatDiscover; import org.apache.paimon.format.FormatKey; @@ -82,8 +83,7 @@ /** * A union {@link SplitRead} to read multiple inner files to merge columns, note that 
this class - * does not support filtering push down and deletion vectors, as they can interfere with the process - * of merging columns. + * does not support filtering push down, as it can interfere with the process of merging columns. * *

TODO: Optimize implementation of this class. */ @@ -156,6 +156,19 @@ private RecordReader createReader(DataSplit dataSplit) throws IOExc private RecordReader createReader( DataSplit dataSplit, List rowRanges, RowType readRowType) throws IOException { + DataEvolutionApplyDvReader.Info dvInfo = + DataEvolutionApplyDvReader.readInfo( + fileIO, readRowType, dataSplit.dataEvolutionDeletionFiles().orElse(null)); + RecordReader reader = + createReaderWithoutDeletionVector(dataSplit, rowRanges, dvInfo.actualReadType); + if (!dvInfo.hasDeletionVectors()) { + return reader; + } + return new DataEvolutionApplyDvReader(reader, dvInfo); + } + + private RecordReader createReaderWithoutDeletionVector( + DataSplit dataSplit, List rowRanges, RowType readRowType) throws IOException { List files = dataSplit.dataFiles(); BinaryRow partition = dataSplit.partition(); DataFilePathFactory dataFilePathFactory = diff --git a/paimon-core/src/main/java/org/apache/paimon/operation/RawFileSplitRead.java b/paimon-core/src/main/java/org/apache/paimon/operation/RawFileSplitRead.java index 464d9ea0e5d7..4f51a2007791 100644 --- a/paimon-core/src/main/java/org/apache/paimon/operation/RawFileSplitRead.java +++ b/paimon-core/src/main/java/org/apache/paimon/operation/RawFileSplitRead.java @@ -22,6 +22,7 @@ import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.InternalRow; import org.apache.paimon.deletionvectors.ApplyDeletionVectorReader; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVector; import org.apache.paimon.disk.IOManager; import org.apache.paimon.fileindex.FileIndexResult; @@ -179,7 +180,12 @@ public RecordReader createReader( DeletionVector.Factory dvFactory = DeletionVector.factory(fileIO, files, deletionFiles); Map> dvFactories = new HashMap<>(); for (DataFileMeta file : files) { - dvFactories.put(file.fileName(), () -> dvFactory.create(file.fileName()).orElse(null)); + dvFactories.put( + file.fileName(), + () -> 
+ dvFactory + .create(DeletionFileKey.ofFileName(file.fileName())) + .orElse(null)); } return createReader(partition, bucket, files, dvFactories); } diff --git a/paimon-core/src/main/java/org/apache/paimon/operation/commit/ConflictDetection.java b/paimon-core/src/main/java/org/apache/paimon/operation/commit/ConflictDetection.java index 39bbca3338da..bcbd69c3adfa 100644 --- a/paimon-core/src/main/java/org/apache/paimon/operation/commit/ConflictDetection.java +++ b/paimon-core/src/main/java/org/apache/paimon/operation/commit/ConflictDetection.java @@ -22,6 +22,8 @@ import org.apache.paimon.Snapshot.CommitKind; import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; +import org.apache.paimon.deletionvectors.FileNameKey; import org.apache.paimon.index.DeletionVectorMeta; import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileHandler; @@ -682,13 +684,19 @@ static List buildBaseEntriesWithDV( // Should not attach DELETE type dv index for base file. 
if (!indexManifestEntry.kind().equals(FileKind.DELETE)) { IndexFileMeta indexFile = indexManifestEntry.indexFile(); - LinkedHashMap dvRanges = indexFile.dvRanges(); + LinkedHashMap dvRanges = indexFile.dvRanges(); if (dvRanges != null) { - for (DeletionVectorMeta value : dvRanges.values()) { + for (Map.Entry entry : + dvRanges.entrySet()) { + // todo: check DV range consistency for Data-Evolution tables + if (!(entry.getKey() instanceof FileNameKey)) { + continue; + } + String dataFileName = ((FileNameKey) entry.getKey()).fileName(); checkState( - !fileNameToDVFileName.containsKey(value.dataFileName()), + !fileNameToDVFileName.containsKey(dataFileName), "One file should correspond to only one dv entry."); - fileNameToDVFileName.put(value.dataFileName(), indexFile.fileName()); + fileNameToDVFileName.put(dataFileName, indexFile.fileName()); } } } @@ -718,12 +726,16 @@ static List buildDeltaEntriesWithDV( // create a new one. Map> fileNameToDVEntry = new HashMap<>(); for (IndexManifestEntry deltaIndexEntry : deltaIndexEntries) { - LinkedHashMap dvRanges = + LinkedHashMap dvRanges = deltaIndexEntry.indexFile().dvRanges(); if (dvRanges != null) { - for (DeletionVectorMeta meta : dvRanges.values()) { - fileNameToDVEntry.putIfAbsent(meta.dataFileName(), new ArrayList<>()); - fileNameToDVEntry.get(meta.dataFileName()).add(deltaIndexEntry); + for (DeletionFileKey key : dvRanges.keySet()) { + if (!(key instanceof FileNameKey)) { + continue; + } + String dataFileName = ((FileNameKey) key).fileName(); + fileNameToDVEntry.putIfAbsent(dataFileName, new ArrayList<>()); + fileNameToDVEntry.get(dataFileName).add(deltaIndexEntry); } } } diff --git a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java index fedfecaafb03..cc163d87fcf2 100644 --- a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java +++ 
b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java @@ -889,9 +889,6 @@ private static void validateRowTracking(TableSchema schema, CoreOptions options) checkArgument( rowTrackingEnabled, "Data evolution config must enabled with row-tracking.enabled"); - checkArgument( - !options.deletionVectorsEnabled(), - "Data evolution config must disabled with deletion-vectors.enabled"); checkArgument( !options.clusteringIncrementalEnabled(), "Data evolution config must disabled with clustering.incremental"); diff --git a/paimon-core/src/main/java/org/apache/paimon/table/sink/CommitMessageSerializer.java b/paimon-core/src/main/java/org/apache/paimon/table/sink/CommitMessageSerializer.java index 5e621a9f789c..56d4751e841b 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/sink/CommitMessageSerializer.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/sink/CommitMessageSerializer.java @@ -25,6 +25,7 @@ import org.apache.paimon.index.IndexFileMetaV1Deserializer; import org.apache.paimon.index.IndexFileMetaV2Deserializer; import org.apache.paimon.index.IndexFileMetaV3Deserializer; +import org.apache.paimon.index.IndexFileMetaV4Deserializer; import org.apache.paimon.io.CompactIncrement; import org.apache.paimon.io.DataFileMeta; import org.apache.paimon.io.DataFileMeta08Serializer; @@ -51,7 +52,7 @@ /** {@link VersionedSerializer} for {@link CommitMessage}. 
*/ public class CommitMessageSerializer implements VersionedSerializer { - public static final int CURRENT_VERSION = 11; + public static final int CURRENT_VERSION = 12; private final DataFileMetaSerializer dataFileSerializer; private final IndexFileMetaSerializer indexEntrySerializer; @@ -64,6 +65,7 @@ public class CommitMessageSerializer implements VersionedSerializer> fileDeserializer( private IOExceptionSupplier> indexEntryDeserializer( int version, DataInputView view) { - if (version >= 11) { + if (version >= 12) { return () -> indexEntrySerializer.deserializeList(view); + } else if (version >= 11) { + if (indexEntryV4Deserializer == null) { + indexEntryV4Deserializer = new IndexFileMetaV4Deserializer(); + } + return () -> indexEntryV4Deserializer.deserializeList(view); } else if (version >= 9) { if (indexEntryV3Deserializer == null) { indexEntryV3Deserializer = new IndexFileMetaV3Deserializer(); diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/DataSplit.java b/paimon-core/src/main/java/org/apache/paimon/table/source/DataSplit.java index 2ba4410f34d2..6919ec0e1eb3 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/DataSplit.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/DataSplit.java @@ -39,7 +39,9 @@ import org.apache.paimon.types.DataTypes; import org.apache.paimon.utils.FunctionWithIOException; import org.apache.paimon.utils.InternalRowUtils; +import org.apache.paimon.utils.Range; import org.apache.paimon.utils.RangeHelper; +import org.apache.paimon.utils.RowRangeIndex; import org.apache.paimon.utils.SerializationUtils; import javax.annotation.Nullable; @@ -48,7 +50,9 @@ import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.OptionalLong; @@ -63,7 +67,7 @@ public class DataSplit implements Split { private 
static final long serialVersionUID = 7L; private static final long MAGIC = -2394839472490812314L; - private static final int VERSION = 8; + private static final int VERSION = 9; private long snapshotId = 0; private BinaryRow partition; @@ -73,6 +77,7 @@ public class DataSplit implements Split { private List dataFiles; @Nullable private List dataDeletionFiles; + @Nullable private Map dataEvolutionDeletionFiles; private boolean isStreaming = false; private boolean rawConvertible; @@ -108,12 +113,17 @@ public Optional> deletionFiles() { return Optional.ofNullable(dataDeletionFiles); } + public Optional> dataEvolutionDeletionFiles() { + return Optional.ofNullable(dataEvolutionDeletionFiles); + } + public boolean isStreaming() { return isStreaming; } public boolean rawConvertible() { - return rawConvertible; + return rawConvertible + && (dataEvolutionDeletionFiles == null || dataEvolutionDeletionFiles.isEmpty()); } public OptionalLong earliestFileCreationEpochMillis() { @@ -141,7 +151,7 @@ public OptionalLong mergedRowCount() { } private boolean rawMergedRowCountAvailable() { - return rawConvertible + return rawConvertible() && (dataDeletionFiles == null || dataDeletionFiles.stream() .allMatch(f -> f == null || f.cardinality() != null)); @@ -168,6 +178,23 @@ private boolean dataEvolutionRowCountAvailable() { return false; } } + if (dataEvolutionDeletionFiles != null) { + RowRangeIndex rangeIndex = createRangeIndex(dataFiles); + for (Map.Entry entry : dataEvolutionDeletionFiles.entrySet()) { + // 1. Every cardinality should be non-null + if (entry.getValue().cardinality() == null) { + return false; + } + + // 2. Every deletion vector should be contained by a data file. + // For example, if one data file range is [0, 100], and a + // deletion file range is [50, 120], we cannot directly get the + // merged row count without reading the deletion file content. 
+ if (!rangeIndex.contains(entry.getKey())) { + return false; + } + } + } return true; } @@ -182,6 +209,11 @@ private long dataEvolutionMergedRowCount() { } sum += maxCount; } + if (dataEvolutionDeletionFiles != null) { + for (DeletionFile deletionFile : dataEvolutionDeletionFiles.values()) { + sum -= deletionFile.cardinality(); + } + } return sum; } @@ -245,7 +277,7 @@ public Long nullCount(int fieldIndex, SimpleStatsEvolutions evolutions) { @Override public Optional> convertToRawFiles() { - if (rawConvertible) { + if (rawConvertible()) { return Optional.of( dataFiles.stream() .map(f -> makeRawTableFile(bucketPath, f)) @@ -312,9 +344,36 @@ public Optional filterDataFile(Predicate filter) { split.assign(this); split.dataFiles = filtered; split.dataDeletionFiles = filteredDeletion; + split.dataEvolutionDeletionFiles = + filterDataEvolutionDeletionFiles(dataEvolutionDeletionFiles, filtered); return Optional.of(split); } + @Nullable + private static Map filterDataEvolutionDeletionFiles( + @Nullable Map deletionFiles, List dataFiles) { + if (deletionFiles == null || deletionFiles.isEmpty()) { + return deletionFiles; + } + Map filtered = new LinkedHashMap<>(); + RowRangeIndex index = createRangeIndex(dataFiles); + + for (Map.Entry entry : deletionFiles.entrySet()) { + Range dvRange = entry.getKey(); + if (index.intersects(dvRange.from, dvRange.to)) { + filtered.put(dvRange, entry.getValue()); + } + } + return filtered.isEmpty() ? 
null : filtered; + } + + private static RowRangeIndex createRangeIndex(List files) { + List fileRanges = + files.stream().map(DataFileMeta::nonNullRowIdRange).collect(Collectors.toList()); + + return RowRangeIndex.create(fileRanges); + } + @Override public boolean equals(Object o) { if (this == o) { @@ -332,7 +391,8 @@ public boolean equals(Object o) { && Objects.equals(bucketPath, dataSplit.bucketPath) && Objects.equals(totalBuckets, dataSplit.totalBuckets) && Objects.equals(dataFiles, dataSplit.dataFiles) - && Objects.equals(dataDeletionFiles, dataSplit.dataDeletionFiles); + && Objects.equals(dataDeletionFiles, dataSplit.dataDeletionFiles) + && Objects.equals(dataEvolutionDeletionFiles, dataSplit.dataEvolutionDeletionFiles); } @Override @@ -345,6 +405,7 @@ public int hashCode() { totalBuckets, dataFiles, dataDeletionFiles, + dataEvolutionDeletionFiles, isStreaming, rawConvertible); } @@ -381,6 +442,7 @@ protected void assign(DataSplit other) { this.totalBuckets = other.totalBuckets; this.dataFiles = other.dataFiles; this.dataDeletionFiles = other.dataDeletionFiles; + this.dataEvolutionDeletionFiles = other.dataEvolutionDeletionFiles; this.isStreaming = other.isStreaming; this.rawConvertible = other.rawConvertible; } @@ -415,6 +477,8 @@ public void serialize(DataOutputView out) throws IOException { out.writeBoolean(isStreaming); out.writeBoolean(rawConvertible); + + serializeDataEvolutionDeletionFiles(out, dataEvolutionDeletionFiles); } public static DataSplit deserialize(DataInputView in) throws IOException { @@ -452,6 +516,8 @@ public static DataSplit deserialize(DataInputView in) throws IOException { boolean isStreaming = in.readBoolean(); boolean rawConvertible = in.readBoolean(); + Map dataEvolutionDeletionFiles = + version >= 9 ? 
deserializeDataEvolutionDeletionFiles(in, deletionFileSerde) : null; DataSplit.Builder builder = builder() @@ -467,6 +533,9 @@ public static DataSplit deserialize(DataInputView in) throws IOException { if (dataDeletionFiles != null) { builder.withDataDeletionFiles(dataDeletionFiles); } + if (dataEvolutionDeletionFiles != null) { + builder.withDataEvolutionDeletionFiles(dataEvolutionDeletionFiles); + } return builder.build(); } @@ -488,7 +557,7 @@ private static FunctionWithIOException getFileMetaS DataFileMetaFirstRowIdLegacySerializer serializer = new DataFileMetaFirstRowIdLegacySerializer(); return serializer::deserialize; - } else if (version == 8) { + } else if (version >= 8) { DataFileMetaSerializer serializer = new DataFileMetaSerializer(); return serializer::deserialize; } else { @@ -496,6 +565,42 @@ private static FunctionWithIOException getFileMetaS } } + private static void serializeDataEvolutionDeletionFiles( + DataOutputView out, @Nullable Map deletionFiles) + throws IOException { + if (deletionFiles == null || deletionFiles.isEmpty()) { + out.writeBoolean(false); + return; + } + + out.writeBoolean(true); + out.writeInt(deletionFiles.size()); + for (Map.Entry entry : deletionFiles.entrySet()) { + out.writeLong(entry.getKey().from); + out.writeLong(entry.getKey().to); + DeletionFile.serialize(out, entry.getValue()); + } + } + + @Nullable + private static Map deserializeDataEvolutionDeletionFiles( + DataInputView in, + FunctionWithIOException deletionFileSerde) + throws IOException { + if (!in.readBoolean()) { + return null; + } + + int size = in.readInt(); + Map deletionFiles = new LinkedHashMap<>(); + for (int i = 0; i < size; i++) { + Range key = new Range(in.readLong(), in.readLong()); + DeletionFile deletionFile = deletionFileSerde.apply(in); + deletionFiles.put(key, deletionFile); + } + return deletionFiles; + } + private static FunctionWithIOException getDeletionFileSerde( int version) { if (version >= 1 && version <= 3) { @@ -551,6 +656,12 @@ 
public Builder withDataDeletionFiles(List dataDeletionFiles) { return this; } + public Builder withDataEvolutionDeletionFiles( + Map dataEvolutionDeletionFiles) { + this.split.dataEvolutionDeletionFiles = new LinkedHashMap<>(dataEvolutionDeletionFiles); + return this; + } + public Builder isStreaming(boolean isStreaming) { this.split.isStreaming = isStreaming; return this; diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/DeletionFile.java b/paimon-core/src/main/java/org/apache/paimon/table/source/DeletionFile.java index 5bcf6898ed99..a157f00fc6ba 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/DeletionFile.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/DeletionFile.java @@ -19,6 +19,7 @@ package org.apache.paimon.table.source; import org.apache.paimon.annotation.Public; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.io.DataFileMeta; import org.apache.paimon.io.DataInputView; import org.apache.paimon.io.DataOutputView; @@ -182,7 +183,7 @@ public String toString() { } static Factory emptyFactory() { - return fileName -> Optional.empty(); + return key -> Optional.empty(); } public static Factory factory( @@ -190,21 +191,22 @@ public static Factory factory( if (deletionFiles == null) { return emptyFactory(); } - Map fileToDeletion = new HashMap<>(); + Map fileToDeletion = new HashMap<>(); for (int i = 0; i < files.size(); i++) { DeletionFile deletionFile = deletionFiles.get(i); if (deletionFile != null) { - fileToDeletion.put(files.get(i).fileName(), deletionFile); + fileToDeletion.put( + DeletionFileKey.ofFileName(files.get(i).fileName()), deletionFile); } } - return fileName -> { - DeletionFile deletionFile = fileToDeletion.get(fileName); + return key -> { + DeletionFile deletionFile = fileToDeletion.get(key); return Optional.ofNullable(deletionFile); }; } /** Interface to create {@link DeletionFile}. 
*/ public interface Factory { - Optional create(String fileName) throws IOException; + Optional create(DeletionFileKey key) throws IOException; } } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/snapshot/SnapshotReaderImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/snapshot/SnapshotReaderImpl.java index 0a752f256467..c681be0a4785 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/snapshot/SnapshotReaderImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/snapshot/SnapshotReaderImpl.java @@ -24,7 +24,9 @@ import org.apache.paimon.codegen.RecordComparator; import org.apache.paimon.consumer.ConsumerManager; import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVectorsIndexFile; +import org.apache.paimon.deletionvectors.RowIdRangeKey; import org.apache.paimon.fs.Path; import org.apache.paimon.index.DeletionVectorMeta; import org.apache.paimon.index.IndexFileHandler; @@ -59,6 +61,7 @@ import org.apache.paimon.utils.Filter; import org.apache.paimon.utils.LazyField; import org.apache.paimon.utils.Pair; +import org.apache.paimon.utils.Preconditions; import org.apache.paimon.utils.Range; import org.apache.paimon.utils.RowRangeIndex; import org.apache.paimon.utils.SnapshotManager; @@ -398,7 +401,7 @@ private List generateSplits( Map>> entries) { List splits = new ArrayList<>(); // Read deletion indexes at once to reduce file IO - Map, Map> deletionFilesMap = null; + Map, Map> deletionFilesMap = null; if (!isStreaming) { deletionFilesMap = deletionVectors && snapshot != null @@ -435,12 +438,17 @@ private List generateSplits( .rawConvertible(splitGroup.rawConvertible) .withBucketPath(bucketPath); if (deletionVectors && deletionFilesMap != null) { - builder.withDataDeletionFiles( - getDeletionFiles( - dataFiles, - deletionFilesMap.getOrDefault( - Pair.of(partition, bucket), - Collections.emptyMap()))); 
+ Map deletionFiles = + deletionFilesMap.getOrDefault( + Pair.of(partition, bucket), Collections.emptyMap()); + if (!options.dataEvolutionEnabled()) { + builder.withDataDeletionFiles( + getDeletionFiles(dataFiles, deletionFiles)); + } else { + Map rangeDeletionFiles = + getDataEvolutionDeletionFiles(dataFiles, deletionFiles); + builder.withDataEvolutionDeletionFiles(rangeDeletionFiles); + } } splits.add(builder.build()); } @@ -502,8 +510,10 @@ private Plan toIncrementalPlan( buckets.computeIfAbsent(part, k -> new HashSet<>()) .addAll(bucketMap.keySet())); // Read deletion indexes at once to reduce file IO - Map, Map> beforeDeletionFilesMap = null; - Map, Map> afterDeletionFilesMap = null; + Map, Map> beforeDeletionFilesMap = + null; + Map, Map> afterDeletionFilesMap = + null; if (!isStreaming && deletionVectors) { beforeDeletionFilesMap = beforeSnapshot.get() != null @@ -609,15 +619,39 @@ private RecordComparator partitionComparator() { } private List getDeletionFiles( - List dataFiles, Map deletionFilesMap) { + List dataFiles, Map deletionFilesMap) { List deletionFiles = new ArrayList<>(dataFiles.size()); dataFiles.stream() - .map(DataFileMeta::fileName) - .map(f -> deletionFilesMap == null ? null : deletionFilesMap.get(f)) + .map(f -> deletionFilesMap.get(DeletionFileKey.ofFileName(f.fileName()))) .forEach(deletionFiles::add); return deletionFiles; } + private Map getDataEvolutionDeletionFiles( + List dataFiles, Map deletionFilesMap) { + // Add all deletion files whose row range intersects any data file + // TODO: can we cache ranges of each bucket DVs? 
+ List dataRanges = + dataFiles.stream() + .map(DataFileMeta::nonNullRowIdRange) + .collect(Collectors.toList()); + RowRangeIndex index = RowRangeIndex.create(dataRanges); + + Map deletionFiles = new LinkedHashMap<>(); + for (Map.Entry entry : deletionFilesMap.entrySet()) { + DeletionFileKey key = entry.getKey(); + Preconditions.checkState( + key instanceof RowIdRangeKey, + "Data Evolution tables should only have DVs by RowId Ranges."); + + Range dvRange = ((RowIdRangeKey) key).range(); + if (index.intersects(dvRange.from, dvRange.to)) { + deletionFiles.put(dvRange, entry.getValue()); + } + } + return deletionFiles; + } + private Set> toPartBuckets( Map>> entries) { return entries.entrySet().stream() @@ -628,12 +662,12 @@ private Set> toPartBuckets( .collect(Collectors.toSet()); } - private Map, Map> scanDvIndex( + private Map, Map> scanDvIndex( @Nullable Snapshot snapshot, Set> buckets) { if (snapshot == null || snapshot.indexManifest() == null || buckets.isEmpty()) { return Collections.emptyMap(); } - Map, Map> result = new HashMap<>(); + Map, Map> result = new HashMap<>(); Path indexManifestPath = indexFileHandler.indexManifestFilePath(snapshot.indexManifest()); Set> remainingBuckets = new HashSet<>(buckets); @@ -644,7 +678,7 @@ private Map, Map> scanDvIndex( Pair next = iterator.next(); BinaryRow partition = next.getLeft(); int bucket = next.getRight(); - Map fromCache = + Map fromCache = dvMetaCache.read(indexManifestPath, partition, bucket); if (fromCache != null) { result.put(next, fromCache); @@ -678,7 +712,7 @@ private Map, Map> scanDvIndex( (entry, indexFileMetas) -> { Pair partitionBucket = entry; if (remainingBuckets.contains(entry)) { - Map deletionFiles = + Map deletionFiles = toDeletionFiles(partitionBucket, indexFileMetas); result.put(partitionBucket, deletionFiles); if (dvMetaCache != null) { @@ -703,7 +737,7 @@ private Map, Map> scanDvIndex( private int deletionFileNumber(List fileMetas) { int count = 0; for (IndexFileMeta indexFile : fileMetas) { 
- LinkedHashMap dvRanges = indexFile.dvRanges(); + LinkedHashMap dvRanges = indexFile.dvRanges(); if (dvRanges != null) { count += dvRanges.size(); } @@ -711,18 +745,19 @@ private int deletionFileNumber(List fileMetas) { return count; } - private Map toDeletionFiles( + private Map toDeletionFiles( Pair partitionBucket, List fileMetas) { - Map deletionFiles = new HashMap<>(); + Map deletionFiles = new HashMap<>(); DeletionVectorsIndexFile dvIndex = indexFileHandler.dvIndex(partitionBucket.getLeft(), partitionBucket.getRight()); for (IndexFileMeta indexFile : fileMetas) { - LinkedHashMap dvRanges = indexFile.dvRanges(); + LinkedHashMap dvRanges = indexFile.dvRanges(); String dvFilePath = dvIndex.path(indexFile).toString(); if (dvRanges != null && !dvRanges.isEmpty()) { - for (DeletionVectorMeta dvMeta : dvRanges.values()) { + for (Map.Entry entry : dvRanges.entrySet()) { + DeletionVectorMeta dvMeta = entry.getValue(); deletionFiles.put( - dvMeta.dataFileName(), + entry.getKey(), new DeletionFile( dvFilePath, dvMeta.offset(), diff --git a/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java b/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java index 9ad88e977b3d..c8ffac459db5 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java @@ -22,8 +22,12 @@ import org.apache.paimon.casting.CastExecutor; import org.apache.paimon.casting.CastExecutors; import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.GenericArray; import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; +import org.apache.paimon.deletionvectors.RowIdRangeKey; import org.apache.paimon.disk.IOManager; import org.apache.paimon.fs.FileIO; import 
org.apache.paimon.index.DeletionVectorMeta; @@ -57,6 +61,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; @@ -229,7 +234,7 @@ private InternalRow toRow( IndexManifestEntry indexManifestEntry, CastExecutor partitionCastExecutor, RowType logicalRowType) { - LinkedHashMap dvMetas = + LinkedHashMap dvMetas = indexManifestEntry.indexFile().dvRanges(); GlobalIndexMeta globalMeta = indexManifestEntry.indexFile().globalIndexMeta(); String indexFieldName = null; @@ -254,14 +259,40 @@ private InternalRow toRow( BinaryString.fromString(indexManifestEntry.indexFile().fileName()), indexManifestEntry.indexFile().fileSize(), indexManifestEntry.indexFile().rowCount(), - dvMetas == null - ? null - : IndexFileMetaSerializer.dvMetasToRowArrayData(dvMetas.values()), + deletionVectorMetasToArray(dvMetas), globalMeta != null ? globalMeta.rowRangeStart() : null, globalMeta != null ? globalMeta.rowRangeEnd() : null, globalMeta != null ? globalMeta.indexFieldId() : null, indexFieldName != null ? BinaryString.fromString(indexFieldName) : null); } + + private InternalArray deletionVectorMetasToArray( + LinkedHashMap dvMetas) { + if (dvMetas == null || dvMetas.isEmpty()) { + return null; + } + + DeletionFileKey.Type keyType = DeletionFileKey.checkType(dvMetas.keySet()); + if (keyType == DeletionFileKey.Type.FILE_NAME) { + return IndexFileMetaSerializer.metasToRowArrayData( + dvMetas, DeletionFileKey.Type.FILE_NAME); + } + + // For row-range deletion vectors, reuse the same schema, replacing + // `fileName` field by formatted range string e.g. 
[0, 100] + List rows = new ArrayList<>(); + for (Map.Entry entry : dvMetas.entrySet()) { + DeletionVectorMeta meta = entry.getValue(); + rows.add( + GenericRow.of( + BinaryString.fromString( + ((RowIdRangeKey) entry.getKey()).range().toString()), + meta.offset(), + meta.length(), + meta.cardinality())); + } + return new GenericArray(rows.toArray(new GenericRow[0])); + } } private static List allIndexEntries(FileStoreTable dataTable) { diff --git a/paimon-core/src/main/java/org/apache/paimon/utils/DVMetaCache.java b/paimon-core/src/main/java/org/apache/paimon/utils/DVMetaCache.java index b81241257b0f..40297b79e9fc 100644 --- a/paimon-core/src/main/java/org/apache/paimon/utils/DVMetaCache.java +++ b/paimon-core/src/main/java/org/apache/paimon/utils/DVMetaCache.java @@ -19,6 +19,7 @@ package org.apache.paimon.utils; import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.fs.Path; import org.apache.paimon.table.source.DeletionFile; @@ -51,14 +52,18 @@ private static int weigh(DVMetaCacheKey cacheKey, DVMetaCacheValue cacheValue) { } @Nullable - public Map read(Path manifestPath, BinaryRow partition, int bucket) { + public Map read( + Path manifestPath, BinaryRow partition, int bucket) { DVMetaCacheKey cacheKey = new DVMetaCacheKey(manifestPath, partition, bucket); DVMetaCacheValue cacheValue = this.cache.getIfPresent(cacheKey); return cacheValue == null ? 
null : cacheValue.get(); } public void put( - Path path, BinaryRow partition, int bucket, Map dvFilesMap) { + Path path, + BinaryRow partition, + int bucket, + Map dvFilesMap) { DVMetaCacheKey key = new DVMetaCacheKey(path, partition, bucket); this.cache.put(key, DVMetaCacheValue.eager(dvFilesMap)); } @@ -68,7 +73,7 @@ public void putLazy( BinaryRow partition, int bucket, int valueNumber, - Supplier> dvFilesSupplier) { + Supplier> dvFilesSupplier) { DVMetaCacheKey key = new DVMetaCacheKey(path, partition, bucket); this.cache.put(key, DVMetaCacheValue.lazy(valueNumber, dvFilesSupplier)); } @@ -84,13 +89,14 @@ private DVMetaCacheValue(int weight, DeletionFilesField deletionFilesField) { this.deletionFilesField = deletionFilesField; } - private static DVMetaCacheValue eager(Map deletionFiles) { + private static DVMetaCacheValue eager(Map deletionFiles) { return new DVMetaCacheValue( deletionFiles.size() + 1, new ExistingDeletionFilesField(deletionFiles)); } private static DVMetaCacheValue lazy( - int valueNumber, Supplier> deletionFilesSupplier) { + int valueNumber, + Supplier> deletionFilesSupplier) { return new DVMetaCacheValue( valueNumber + 1, new LazyDeletionFilesField(deletionFilesSupplier)); } @@ -99,40 +105,41 @@ private int weight() { return weight; } - private Map get() { + private Map get() { return deletionFilesField.get(); } } private interface DeletionFilesField { - Map get(); + Map get(); } private static final class ExistingDeletionFilesField implements DeletionFilesField { - private final Map deletionFiles; + private final Map deletionFiles; - private ExistingDeletionFilesField(Map deletionFiles) { + private ExistingDeletionFilesField(Map deletionFiles) { this.deletionFiles = deletionFiles; } @Override - public Map get() { + public Map get() { return deletionFiles; } } private static final class LazyDeletionFilesField implements DeletionFilesField { - private final LazyField> deletionFiles; + private final LazyField> deletionFiles; - private 
LazyDeletionFilesField(Supplier> deletionFilesSupplier) { + private LazyDeletionFilesField( + Supplier> deletionFilesSupplier) { this.deletionFiles = new LazyField<>(deletionFilesSupplier); } @Override - public synchronized Map get() { + public synchronized Map get() { return deletionFiles.get(); } } diff --git a/paimon-core/src/test/java/org/apache/paimon/deletionvectors/BucketedDvMaintainerTest.java b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/BucketedDvMaintainerTest.java index 018d55e93304..0ec973415770 100644 --- a/paimon-core/src/test/java/org/apache/paimon/deletionvectors/BucketedDvMaintainerTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/BucketedDvMaintainerTest.java @@ -88,13 +88,13 @@ public void test0(boolean bitmap64) { assertThat(dvMaintainer.deletionVectorOf("f3")).isEmpty(); IndexFileMeta file = dvMaintainer.writeDeletionVectorsIndex().get(); - Map deletionVectors = + Map deletionVectors = fileHandler.readAllDeletionVectors(partition, 0, Collections.singletonList(file)); - assertThat(deletionVectors.get("f1").isDeleted(1)).isTrue(); - assertThat(deletionVectors.get("f1").isDeleted(2)).isFalse(); - assertThat(deletionVectors.get("f2").isDeleted(1)).isFalse(); - assertThat(deletionVectors.get("f2").isDeleted(2)).isTrue(); - assertThat(deletionVectors.containsKey("f3")).isFalse(); + assertThat(deletionVectors.get(DeletionFileKey.ofFileName("f1")).isDeleted(1)).isTrue(); + assertThat(deletionVectors.get(DeletionFileKey.ofFileName("f1")).isDeleted(2)).isFalse(); + assertThat(deletionVectors.get(DeletionFileKey.ofFileName("f2")).isDeleted(1)).isFalse(); + assertThat(deletionVectors.get(DeletionFileKey.ofFileName("f2")).isDeleted(2)).isTrue(); + assertThat(deletionVectors.containsKey(DeletionFileKey.ofFileName("f3"))).isFalse(); } @ParameterizedTest @@ -246,12 +246,12 @@ public void testReadAndWriteMixedDv(boolean bitmap64) { assertThat(dvMaintainer2.bitmap64()).isEqualTo(!bitmap64); // verify two kinds of dv can 
exist in the same dv maintainer - Map dvs = dvMaintainer2.deletionVectors(); + Map dvs = dvMaintainer2.deletionVectors(); assertThat(dvs.size()).isEqualTo(3); - assertThat(dvs.get("f1").getCardinality()).isEqualTo(3); - assertThat(dvs.get("f2")) + assertThat(dvs.get(DeletionFileKey.ofFileName("f1")).getCardinality()).isEqualTo(3); + assertThat(dvs.get(DeletionFileKey.ofFileName("f2"))) .isInstanceOf(bitmap64 ? Bitmap64DeletionVector.class : BitmapDeletionVector.class); - assertThat(dvs.get("f3")) + assertThat(dvs.get(DeletionFileKey.ofFileName("f3"))) .isInstanceOf(bitmap64 ? BitmapDeletionVector.class : Bitmap64DeletionVector.class); file = dvMaintainer2.writeDeletionVectorsIndex().get(); @@ -271,16 +271,16 @@ public void testReadAndWriteMixedDv(boolean bitmap64) { commit2.commit(Collections.singletonList(commitMessage2)); // test read dv index file which contains two kinds of dv - Map readDvs = + Map readDvs = fileHandler.readAllDeletionVectors( partition, 0, fileHandler.scan( table.latestSnapshot().get(), "DELETION_VECTORS", partition, 0)); assertThat(readDvs.size()).isEqualTo(3); - assertThat(dvs.get("f1").getCardinality()).isEqualTo(3); - assertThat(dvs.get("f2").getCardinality()).isEqualTo(2); - assertThat(dvs.get("f3").getCardinality()).isEqualTo(2); + assertThat(dvs.get(DeletionFileKey.ofFileName("f1")).getCardinality()).isEqualTo(3); + assertThat(dvs.get(DeletionFileKey.ofFileName("f2")).getCardinality()).isEqualTo(2); + assertThat(dvs.get(DeletionFileKey.ofFileName("f3")).getCardinality()).isEqualTo(2); } private DeletionVector createDeletionVector(boolean bitmap64) { @@ -307,7 +307,7 @@ public static BucketedDvMaintainer createOrRestore( : handler.scanEntries(snapshot, DELETION_VECTORS_INDEX, partition).stream() .map(IndexManifestEntry::indexFile) .collect(Collectors.toList()); - Map deletionVectors = + Map deletionVectors = new HashMap<>(handler.readAllDeletionVectors(partition, 0, indexFiles)); return factory.create(partition, 0, deletionVectors); } 
diff --git a/paimon-core/src/test/java/org/apache/paimon/deletionvectors/DataEvolutionApplyDvReaderTest.java b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/DataEvolutionApplyDvReaderTest.java new file mode 100644 index 000000000000..48760fcc4bc0 --- /dev/null +++ b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/DataEvolutionApplyDvReaderTest.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.deletionvectors; + +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.index.DeletionVectorMeta; +import org.apache.paimon.index.IndexFileMeta; +import org.apache.paimon.index.IndexPathFactory; +import org.apache.paimon.options.MemorySize; +import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.table.source.DeletionFile; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.Range; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Test for {@link DataEvolutionApplyDvReader}. 
*/ +public class DataEvolutionApplyDvReaderTest { + + @TempDir java.nio.file.Path tempPath; + + @Test + public void testApplyUnorderedRowRangeDeletionVectors() throws IOException { + Range laterRange = new Range(110, 119); + Range earlierRange = new Range(100, 109); + + Map input = new LinkedHashMap<>(); + input.put(DeletionFileKey.ofRange(earlierRange), deletionVector(3)); + input.put(DeletionFileKey.ofRange(laterRange), deletionVector(0, 4)); + + LocalFileIO fileIO = LocalFileIO.create(); + IndexPathFactory pathFactory = pathFactory(); + DeletionVectorsIndexFile indexFile = + new DeletionVectorsIndexFile( + fileIO, pathFactory, MemorySize.ofBytes(Long.MAX_VALUE), false); + IndexFileMeta indexFileMeta = indexFile.writeSingleFile(input); + + Map deletionFiles = new LinkedHashMap<>(); + deletionFiles.put(laterRange, deletionFile(pathFactory, indexFileMeta, laterRange)); + deletionFiles.put(earlierRange, deletionFile(pathFactory, indexFileMeta, earlierRange)); + + RowType readType = + RowType.of( + new org.apache.paimon.types.DataType[] {DataTypes.INT()}, + new String[] {"value"}); + DataEvolutionApplyDvReader.Info info = + DataEvolutionApplyDvReader.readInfo(fileIO, readType, deletionFiles); + + assertThat(info.actualReadType.getFieldNames()).containsExactly("value", "_ROW_ID"); + + DataEvolutionApplyDvReader reader = + new DataEvolutionApplyDvReader(new MockRecordReader(rows(100, 115)), info); + + assertThat(readValues(reader)) + .containsExactly(100, 101, 102, 104, 105, 106, 107, 108, 109, 111, 112, 113, 115); + } + + @Test + public void testApplySparseReturnedRowRanges() throws IOException { + Range firstRange = new Range(100, 109); + Range secondRange = new Range(1000, 1009); + Range thirdRange = new Range(5000, 5009); + + Map input = new LinkedHashMap<>(); + input.put(DeletionFileKey.ofRange(firstRange), deletionVector(1)); + input.put(DeletionFileKey.ofRange(secondRange), deletionVector(5)); + input.put(DeletionFileKey.ofRange(thirdRange), deletionVector(0, 
9)); + + LocalFileIO fileIO = LocalFileIO.create(); + IndexPathFactory pathFactory = pathFactory(); + DeletionVectorsIndexFile indexFile = + new DeletionVectorsIndexFile( + fileIO, pathFactory, MemorySize.ofBytes(Long.MAX_VALUE), false); + IndexFileMeta indexFileMeta = indexFile.writeSingleFile(input); + + Map deletionFiles = new LinkedHashMap<>(); + deletionFiles.put(firstRange, deletionFile(pathFactory, indexFileMeta, firstRange)); + deletionFiles.put(secondRange, deletionFile(pathFactory, indexFileMeta, secondRange)); + deletionFiles.put(thirdRange, deletionFile(pathFactory, indexFileMeta, thirdRange)); + + RowType readType = + RowType.of( + new org.apache.paimon.types.DataType[] {DataTypes.INT()}, + new String[] {"value"}); + DataEvolutionApplyDvReader.Info info = + DataEvolutionApplyDvReader.readInfo(fileIO, readType, deletionFiles); + + DataEvolutionApplyDvReader reader = + new DataEvolutionApplyDvReader( + new MockRecordReader( + rows( + 100, 101, 109, 500, 999, 1000, 1005, 1009, 3000, 5000, 5001, + 5009, 6000)), + info); + + assertThat(readValues(reader)) + .containsExactly(100, 109, 500, 999, 1000, 1009, 3000, 5001, 6000); + } + + private DeletionFile deletionFile( + IndexPathFactory pathFactory, IndexFileMeta indexFileMeta, Range range) { + DeletionVectorMeta meta = indexFileMeta.dvRanges().get(DeletionFileKey.ofRange(range)); + return new DeletionFile( + pathFactory.toPath(indexFileMeta).toString(), + meta.offset(), + meta.length(), + meta.cardinality()); + } + + private static DeletionVector deletionVector(long... positions) { + DeletionVector deletionVector = new BitmapDeletionVector(); + for (long position : positions) { + deletionVector.delete(position); + } + return deletionVector; + } + + private static List rows(int from, int to) { + List rows = new ArrayList<>(); + for (int i = from; i <= to; i++) { + rows.add(GenericRow.of(i, (long) i)); + } + return rows; + } + + private static List rows(int... 
rowIds) { + List rows = new ArrayList<>(); + for (int rowId : rowIds) { + rows.add(GenericRow.of(rowId, (long) rowId)); + } + return rows; + } + + private static List readValues(RecordReader reader) throws IOException { + List values = new ArrayList<>(); + try { + RecordReader.RecordIterator batch; + while ((batch = reader.readBatch()) != null) { + try { + InternalRow row; + while ((row = batch.next()) != null) { + values.add(row.getInt(0)); + assertThat(row.getFieldCount()).isEqualTo(1); + } + } finally { + batch.releaseBatch(); + } + } + } finally { + reader.close(); + } + return values; + } + + private IndexPathFactory pathFactory() { + Path dir = new Path(tempPath.toUri()); + return new IndexPathFactory() { + + @Override + public Path toPath(String fileName) { + return new Path(dir, fileName); + } + + @Override + public Path newPath() { + return new Path(dir, UUID.randomUUID().toString()); + } + + @Override + public boolean isExternalPath() { + return false; + } + }; + } + + /** Mock RecordReader for testing. */ + private static class MockRecordReader implements RecordReader { + + private final List rows; + private boolean consumed; + + private MockRecordReader(List rows) { + this.rows = rows; + } + + @Nullable + @Override + public RecordIterator readBatch() { + if (consumed) { + return null; + } + consumed = true; + return new MockRecordIterator(rows.iterator()); + } + + @Override + public void close() {} + } + + /** Mock RecordIterator for testing. */ + private static class MockRecordIterator implements RecordReader.RecordIterator { + + private final Iterator iterator; + + private MockRecordIterator(Iterator iterator) { + this.iterator = iterator; + } + + @Nullable + @Override + public InternalRow next() { + return iterator.hasNext() ? 
iterator.next() : null; + } + + @Override + public void releaseBatch() {} + } +} diff --git a/paimon-core/src/test/java/org/apache/paimon/deletionvectors/DeletionFileKeyTest.java b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/DeletionFileKeyTest.java new file mode 100644 index 000000000000..55758e8b4d14 --- /dev/null +++ b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/DeletionFileKeyTest.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.deletionvectors; + +import org.apache.paimon.utils.Range; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Test for {@link DeletionFileKey}. 
*/ +class DeletionFileKeyTest { + + @Test + void testFileNameKeyEquality() { + DeletionFileKey key = DeletionFileKey.ofFileName("f1"); + DeletionFileKey sameKey = DeletionFileKey.ofFileName("f1"); + assertThat(key).isEqualTo(sameKey).isNotEqualTo(DeletionFileKey.ofFileName("f2")); + assertThat(key.hashCode()).isEqualTo(sameKey.hashCode()); + } + + @Test + void testRowIdRangeKeyEquality() { + DeletionFileKey key = DeletionFileKey.ofRange(new Range(1, 10)); + DeletionFileKey sameKey = DeletionFileKey.ofRange(new Range(1, 10)); + assertThat(key).isEqualTo(sameKey).isNotEqualTo(DeletionFileKey.ofRange(new Range(2, 10))); + assertThat(key.hashCode()).isEqualTo(sameKey.hashCode()); + } + + @Test + void testDifferentKeyKindsAreNotEqual() { + assertThat(DeletionFileKey.ofFileName("[1, 10]")) + .isNotEqualTo(DeletionFileKey.ofRange(new Range(1, 10))); + } +} diff --git a/paimon-core/src/test/java/org/apache/paimon/deletionvectors/DeletionVectorsIndexFileTest.java b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/DeletionVectorsIndexFileTest.java index 9a243bf8bf09..9b4495bb78a0 100644 --- a/paimon-core/src/test/java/org/apache/paimon/deletionvectors/DeletionVectorsIndexFileTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/DeletionVectorsIndexFileTest.java @@ -18,18 +18,22 @@ package org.apache.paimon.deletionvectors; +import org.apache.paimon.data.InternalRow; import org.apache.paimon.fs.Path; import org.apache.paimon.fs.local.LocalFileIO; import org.apache.paimon.index.DeletionVectorMeta; import org.apache.paimon.index.IndexFileMeta; +import org.apache.paimon.index.IndexFileMetaSerializer; import org.apache.paimon.index.IndexPathFactory; import org.apache.paimon.options.MemorySize; import org.apache.paimon.table.source.DeletionFile; +import org.apache.paimon.utils.Range; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; -import 
org.junit.jupiter.params.provider.ValueSource; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import java.io.DataInputStream; import java.io.IOException; @@ -53,40 +57,43 @@ public class DeletionVectorsIndexFileTest { @TempDir java.nio.file.Path tempPath; @ParameterizedTest - @ValueSource(booleans = {false, true}) - public void testReadDvIndex(boolean bitmap64) { + @MethodSource("bitmap64AndKeyTypes") + public void testReadDvIndex(boolean bitmap64, KeyType keyType) { IndexPathFactory pathFactory = getPathFactory(); DeletionVectorsIndexFile deletionVectorsIndexFile = deletionVectorsIndexFile(pathFactory, bitmap64); // write - HashMap deleteMap = new HashMap<>(); + Map deleteMap = new HashMap<>(); + DeletionFileKey key1 = keyType.key(1); DeletionVector index1 = createEmptyDV(bitmap64); index1.delete(1); - deleteMap.put("file1.parquet", index1); + deleteMap.put(key1, index1); + DeletionFileKey key2 = keyType.key(2); DeletionVector index2 = createEmptyDV(bitmap64); index2.delete(2); index2.delete(3); - deleteMap.put("file2.parquet", index2); + deleteMap.put(key2, index2); + DeletionFileKey key33 = keyType.key(33); DeletionVector index3 = createEmptyDV(bitmap64); index3.delete(3); - deleteMap.put("file33.parquet", index3); + deleteMap.put(key33, index3); List indexFiles = deletionVectorsIndexFile.writeWithRolling(deleteMap); assertThat(indexFiles.size()).isEqualTo(1); // read IndexFileMeta file = indexFiles.get(0); - Map actualDeleteMap = + Map actualDeleteMap = deletionVectorsIndexFile.readAllDeletionVectors(indexFiles); - assertThat(actualDeleteMap.get("file1.parquet").isDeleted(1)).isTrue(); - assertThat(actualDeleteMap.get("file1.parquet").isDeleted(2)).isFalse(); - assertThat(actualDeleteMap.get("file2.parquet").isDeleted(2)).isTrue(); - assertThat(actualDeleteMap.get("file2.parquet").isDeleted(3)).isTrue(); - assertThat(actualDeleteMap.get("file33.parquet").isDeleted(3)).isTrue(); + 
assertThat(actualDeleteMap.get(key1).isDeleted(1)).isTrue(); + assertThat(actualDeleteMap.get(key1).isDeleted(2)).isFalse(); + assertThat(actualDeleteMap.get(key2).isDeleted(2)).isTrue(); + assertThat(actualDeleteMap.get(key2).isDeleted(3)).isTrue(); + assertThat(actualDeleteMap.get(key33).isDeleted(3)).isTrue(); // delete deletionVectorsIndexFile.delete(file); @@ -94,97 +101,125 @@ public void testReadDvIndex(boolean bitmap64) { } @ParameterizedTest - @ValueSource(booleans = {false, true}) - public void testReadDvIndexWithCopiousDv(boolean bitmap64) { + @MethodSource("bitmap64AndKeyTypes") + public void testReadSingleDvIndex(boolean bitmap64, KeyType keyType) { + IndexPathFactory pathFactory = getPathFactory(); + DeletionVectorsIndexFile deletionVectorsIndexFile = + deletionVectorsIndexFile(pathFactory, bitmap64); + DeletionFileKey key = keyType.key(10); + DeletionVector deletionVector = createEmptyDV(bitmap64); + deletionVector.delete(2); + deletionVector.delete(5); + Map deleteMap = new HashMap<>(); + deleteMap.put(key, deletionVector); + + List indexFiles = deletionVectorsIndexFile.writeWithRolling(deleteMap); + + assertThat(indexFiles).hasSize(1); + Map actual = + deletionVectorsIndexFile.readAllDeletionVectors(indexFiles); + assertThat(actual).containsOnlyKeys(key); + assertThat(actual.get(key).isDeleted(2)).isTrue(); + assertThat(actual.get(key).isDeleted(5)).isTrue(); + assertThat(actual.get(key).isDeleted(6)).isFalse(); + } + + @ParameterizedTest + @MethodSource("bitmap64AndKeyTypes") + public void testReadDvIndexWithCopiousDv(boolean bitmap64, KeyType keyType) { IndexPathFactory pathFactory = getPathFactory(); DeletionVectorsIndexFile deletionVectorsIndexFile = deletionVectorsIndexFile(pathFactory, bitmap64); // write Random random = new Random(); - HashMap deleteMap = new HashMap<>(); - HashMap deleteInteger = new HashMap<>(); + Map deleteMap = new HashMap<>(); + Map deleteInteger = new HashMap<>(); for (int i = 0; i < 100000; i++) { DeletionVector index 
= createEmptyDV(bitmap64); int num = random.nextInt(1000000); index.delete(num); - deleteMap.put(String.format("file%s.parquet", i), index); - deleteInteger.put(String.format("file%s.parquet", i), num); + DeletionFileKey key = keyType.key(i); + deleteMap.put(key, index); + deleteInteger.put(key, num); } // read List indexFiles = deletionVectorsIndexFile.writeWithRolling(deleteMap); assertThat(indexFiles.size()).isEqualTo(1); - Map dvs = + Map dvs = deletionVectorsIndexFile.readAllDeletionVectors(indexFiles); assertThat(dvs.size()).isEqualTo(100000); - for (String file : dvs.keySet()) { - int delete = deleteInteger.get(file); - assertThat(dvs.get(file).isDeleted(delete)).isTrue(); - assertThat(dvs.get(file).isDeleted(delete + 1)).isFalse(); + for (Map.Entry entry : deleteInteger.entrySet()) { + DeletionVector deletionVector = dvs.get(entry.getKey()); + assertThat(deletionVector.isDeleted(entry.getValue())).isTrue(); + assertThat(deletionVector.isDeleted(entry.getValue() + 1)).isFalse(); } } @ParameterizedTest - @ValueSource(booleans = {false, true}) - public void testReadDvIndexWithEnormousDv(boolean bitmap64) { + @MethodSource("bitmap64AndKeyTypes") + public void testReadDvIndexWithEnormousDv(boolean bitmap64, KeyType keyType) { IndexPathFactory pathFactory = getPathFactory(); DeletionVectorsIndexFile deletionVectorsIndexFile = deletionVectorsIndexFile(pathFactory, bitmap64); // write Random random = new Random(); - Map fileToDV = new HashMap<>(); - Map fileToCardinality = new HashMap<>(); + Map fileToDV = new HashMap<>(); + Map fileToCardinality = new HashMap<>(); for (int i = 0; i < 5; i++) { DeletionVector index = createEmptyDV(bitmap64); // the size of dv index file is about 20M for (int j = 0; j < 10000000; j++) { index.delete(random.nextInt(Integer.MAX_VALUE)); } - fileToCardinality.put("f" + i, index.getCardinality()); - fileToDV.put("f" + i, index); + DeletionFileKey key = keyType.key(i); + fileToCardinality.put(key, index.getCardinality()); + 
fileToDV.put(key, index); } List indexFiles = deletionVectorsIndexFile.writeWithRolling(fileToDV); // read assertThat(indexFiles.size()).isEqualTo(1); - Map dvs = + Map dvs = deletionVectorsIndexFile.readAllDeletionVectors(indexFiles); assertThat(dvs.size()).isEqualTo(5); - for (String file : dvs.keySet()) { - assertThat(dvs.get(file).getCardinality()).isEqualTo(fileToCardinality.get(file)); + for (Map.Entry entry : fileToCardinality.entrySet()) { + assertThat(dvs.get(entry.getKey()).getCardinality()).isEqualTo(entry.getValue()); } } @ParameterizedTest - @ValueSource(booleans = {false, true}) - public void testWriteDVIndexWithLimitedTargetSizePerIndexFile(boolean bitmap64) { + @MethodSource("bitmap64AndKeyTypes") + public void testWriteDVIndexWithLimitedTargetSizePerIndexFile( + boolean bitmap64, KeyType keyType) { IndexPathFactory pathFactory = getPathFactory(); DeletionVectorsIndexFile deletionVectorsIndexFile = deletionVectorsIndexFile(pathFactory, MemorySize.parse("2MB"), bitmap64); // write1 Random random = new Random(); - Map fileToDV = new HashMap<>(); - Map fileToCardinality = new HashMap<>(); + Map fileToDV = new HashMap<>(); + Map fileToCardinality = new HashMap<>(); for (int i = 0; i < 5; i++) { DeletionVector index = createEmptyDV(bitmap64); // the size of dv index file is about 1.7M for (int j = 0; j < 750000; j++) { index.delete(random.nextInt(Integer.MAX_VALUE)); } - fileToCardinality.put("f" + i, index.getCardinality()); - fileToDV.put("f" + i, index); + DeletionFileKey key = keyType.key(i); + fileToCardinality.put(key, index.getCardinality()); + fileToDV.put(key, index); } List indexFiles = deletionVectorsIndexFile.writeWithRolling(fileToDV); // assert 1 assertThat(indexFiles.size()).isEqualTo(3); - Map dvs = + Map dvs = deletionVectorsIndexFile.readAllDeletionVectors(indexFiles); - for (String file : dvs.keySet()) { - assertThat(dvs.get(file).getCardinality()).isEqualTo(fileToCardinality.get(file)); + for (Map.Entry entry : 
fileToCardinality.entrySet()) { + assertThat(dvs.get(entry.getKey()).getCardinality()).isEqualTo(entry.getValue()); } // write2 @@ -196,21 +231,23 @@ public void testWriteDVIndexWithLimitedTargetSizePerIndexFile(boolean bitmap64) for (int j = 0; j < 100000; j++) { index.delete(random.nextInt(Integer.MAX_VALUE)); } - fileToCardinality.put("f" + i, index.getCardinality()); - fileToDV.put("f" + i, index); + DeletionFileKey key = keyType.key(i); + fileToCardinality.put(key, index.getCardinality()); + fileToDV.put(key, index); } indexFiles = deletionVectorsIndexFile.writeWithRolling(fileToDV); // assert 2 assertThat(indexFiles.size()).isGreaterThan(1); dvs = deletionVectorsIndexFile.readAllDeletionVectors(indexFiles); - for (String file : dvs.keySet()) { - assertThat(dvs.get(file).getCardinality()).isEqualTo(fileToCardinality.get(file)); + for (Map.Entry entry : fileToCardinality.entrySet()) { + assertThat(dvs.get(entry.getKey()).getCardinality()).isEqualTo(entry.getValue()); } } - @Test - public void testReadV1AndV2() { + @ParameterizedTest + @MethodSource("keyTypes") + public void testReadV1AndV2(KeyType keyType) { IndexPathFactory pathFactory = getPathFactory(); DeletionVectorsIndexFile v1DeletionVectorsIndexFile = deletionVectorsIndexFile(pathFactory, false); @@ -219,27 +256,29 @@ public void testReadV1AndV2() { // write v1 dv Random random = new Random(); - HashMap deleteInteger = new HashMap<>(); + Map deleteInteger = new HashMap<>(); - HashMap deleteMap1 = new HashMap<>(); + Map deleteMap1 = new HashMap<>(); for (int i = 0; i < 50000; i++) { DeletionVector index = createEmptyDV(false); int num = random.nextInt(1000000); index.delete(num); - deleteMap1.put(String.format("file%s.parquet", i), index); - deleteInteger.put(String.format("file%s.parquet", i), num); + DeletionFileKey key = keyType.key(i); + deleteMap1.put(key, index); + deleteInteger.put(key, num); } List indexFiles1 = v1DeletionVectorsIndexFile.writeWithRolling(deleteMap1); 
assertThat(indexFiles1.size()).isEqualTo(1); // write v2 dv - HashMap deleteMap2 = new HashMap<>(); + Map deleteMap2 = new HashMap<>(); for (int i = 50000; i < 100000; i++) { DeletionVector index = createEmptyDV(true); int num = random.nextInt(1000000); index.delete(num); - deleteMap2.put(String.format("file%s.parquet", i), index); - deleteInteger.put(String.format("file%s.parquet", i), num); + DeletionFileKey key = keyType.key(i); + deleteMap2.put(key, index); + deleteInteger.put(key, num); } List indexFiles2 = v2DeletionVectorsIndexFile.writeWithRolling(deleteMap2); assertThat(indexFiles2.size()).isEqualTo(1); @@ -248,37 +287,39 @@ public void testReadV1AndV2() { Stream.concat(indexFiles1.stream(), indexFiles2.stream()) .collect(Collectors.toList()); // read when writeVersionID is V1 - Map dvs1 = + Map dvs1 = v1DeletionVectorsIndexFile.readAllDeletionVectors(totalIndexFiles); assertThat(dvs1.size()).isEqualTo(100000); - for (String file : dvs1.keySet()) { - int delete = deleteInteger.get(file); - assertThat(dvs1.get(file).isDeleted(delete)).isTrue(); - assertThat(dvs1.get(file).isDeleted(delete + 1)).isFalse(); + for (Map.Entry entry : deleteInteger.entrySet()) { + DeletionVector deletionVector = dvs1.get(entry.getKey()); + assertThat(deletionVector.isDeleted(entry.getValue())).isTrue(); + assertThat(deletionVector.isDeleted(entry.getValue() + 1)).isFalse(); } // read when writeVersionID is V2 - Map dvs2 = + Map dvs2 = v2DeletionVectorsIndexFile.readAllDeletionVectors(totalIndexFiles); assertThat(dvs2.size()).isEqualTo(100000); } @ParameterizedTest - @ValueSource(booleans = {false, true}) - public void testReadAllDeletionVectorsWithOutOfOrderDvRanges(boolean bitmap64) { + @MethodSource("bitmap64AndKeyTypes") + public void testReadAllDeletionVectorsWithOutOfOrderDvRanges( + boolean bitmap64, KeyType keyType) { IndexPathFactory pathFactory = getPathFactory(); DeletionVectorsIndexFile deletionVectorsIndexFile = deletionVectorsIndexFile(pathFactory, bitmap64); // 
write multiple DVs so they are stored sequentially in the index file - HashMap deleteMap = new HashMap<>(); - Map expected = new HashMap<>(); + Map deleteMap = new HashMap<>(); + Map expected = new HashMap<>(); for (int i = 0; i < 10; i++) { DeletionVector dv = createEmptyDV(bitmap64); dv.delete(i * 100); dv.delete(i * 100 + 1); - deleteMap.put("file" + i + ".parquet", dv); - expected.put("file" + i + ".parquet", i * 100); + DeletionFileKey key = keyType.key(i); + deleteMap.put(key, dv); + expected.put(key, i * 100); } List indexFiles = deletionVectorsIndexFile.writeWithRolling(deleteMap); @@ -287,12 +328,12 @@ public void testReadAllDeletionVectorsWithOutOfOrderDvRanges(boolean bitmap64) { // build a new IndexFileMeta with dvRanges in reverse offset order, // simulating compaction merging dvRanges from multiple sources - LinkedHashMap originalRanges = original.dvRanges(); - List> entries = + LinkedHashMap originalRanges = original.dvRanges(); + List> entries = new ArrayList<>(originalRanges.entrySet()); Collections.reverse(entries); - LinkedHashMap reversedRanges = new LinkedHashMap<>(); - for (Map.Entry entry : entries) { + LinkedHashMap reversedRanges = new LinkedHashMap<>(); + for (Map.Entry entry : entries) { reversedRanges.put(entry.getKey(), entry.getValue()); } @@ -306,35 +347,54 @@ public void testReadAllDeletionVectorsWithOutOfOrderDvRanges(boolean bitmap64) { original.externalPath()); // read with out-of-order dvRanges — this would fail without the seek fix - Map result = + Map result = deletionVectorsIndexFile.readAllDeletionVectors(reordered); assertThat(result).hasSize(10); - for (Map.Entry e : expected.entrySet()) { - assertThat(result.get(e.getKey()).isDeleted(e.getValue())).isTrue(); - assertThat(result.get(e.getKey()).isDeleted(e.getValue() + 1)).isTrue(); - assertThat(result.get(e.getKey()).isDeleted(e.getValue() + 2)).isFalse(); + for (Map.Entry e : expected.entrySet()) { + DeletionVector deletionVector = result.get(e.getKey()); + 
assertThat(deletionVector.isDeleted(e.getValue())).isTrue(); + assertThat(deletionVector.isDeleted(e.getValue() + 1)).isTrue(); + assertThat(deletionVector.isDeleted(e.getValue() + 2)).isFalse(); } } + @Test + public void testReadEmptyDeletionVectorsAfterSerialization() { + IndexPathFactory pathFactory = getPathFactory(); + DeletionVectorsIndexFile deletionVectorsIndexFile = + deletionVectorsIndexFile(pathFactory, false); + IndexFileMeta emptyFile = deletionVectorsIndexFile.writeSingleFile(Collections.emptyMap()); + IndexFileMetaSerializer serializer = new IndexFileMetaSerializer(); + InternalRow row = serializer.toRow(emptyFile); + IndexFileMeta serializedEmptyFile = serializer.fromRow(row); + + assertThat(emptyFile.dvRanges()).isEmpty(); + assertThat(row.isNullAt(4)).isTrue(); + assertThat(row.isNullAt(7)).isTrue(); + assertThat(serializedEmptyFile.dvRanges()).isEmpty(); + assertThat(deletionVectorsIndexFile.readAllDeletionVectors(serializedEmptyFile)).isEmpty(); + } + @ParameterizedTest - @ValueSource(booleans = {false, true}) - public void testReadDeletionFile(boolean bitmap64) throws IOException { + @MethodSource("bitmap64AndKeyTypes") + public void testReadDeletionFile(boolean bitmap64, KeyType keyType) throws IOException { IndexPathFactory pathFactory = getPathFactory(); DeletionVectorsIndexFile deletionVectorsIndexFile = deletionVectorsIndexFile(pathFactory, bitmap64); - HashMap deleteMap = new HashMap<>(); + Map deleteMap = new HashMap<>(); + DeletionFileKey key = keyType.key(1); DeletionVector index1 = createEmptyDV(bitmap64); index1.delete(1); index1.delete(10); index1.delete(100); - deleteMap.put("file1.parquet", index1); + deleteMap.put(key, index1); List indexFiles = deletionVectorsIndexFile.writeWithRolling(deleteMap); assertThat(indexFiles.size()).isEqualTo(1); IndexFileMeta indexFileMeta = indexFiles.get(0); - DeletionVectorMeta deletionVectorMeta = indexFileMeta.dvRanges().get("file1.parquet"); + DeletionVectorMeta deletionVectorMeta = 
indexFileMeta.dvRanges().get(key); DeletionFile deletionFile = new DeletionFile( @@ -391,6 +451,36 @@ public void testReadOldDeletionVector64Bit() throws IOException { } } + private static Stream bitmap64AndKeyTypes() { + return Stream.of(false, true) + .flatMap( + bitmap64 -> + Stream.of(KeyType.values()) + .map(keyType -> Arguments.of(bitmap64, keyType))); + } + + private static Stream keyTypes() { + return Stream.of(KeyType.values()); + } + + private enum KeyType { + FILE_NAME { + @Override + DeletionFileKey key(int id) { + return DeletionFileKey.ofFileName("file" + id + ".parquet"); + } + }, + ROW_ID_RANGE { + @Override + DeletionFileKey key(int id) { + long start = id * 10L; + return DeletionFileKey.ofRange(new Range(start, start + 9)); + } + }; + + abstract DeletionFileKey key(int id); + } + private DeletionVector createEmptyDV(boolean bitmap64) { return bitmap64 ? new Bitmap64DeletionVector() : new BitmapDeletionVector(); } diff --git a/paimon-core/src/test/java/org/apache/paimon/deletionvectors/append/AppendDeletionFileMaintainerHelper.java b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/append/AppendDeletionFileMaintainerHelper.java index e84d09ae7260..d933b332d122 100644 --- a/paimon-core/src/test/java/org/apache/paimon/deletionvectors/append/AppendDeletionFileMaintainerHelper.java +++ b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/append/AppendDeletionFileMaintainerHelper.java @@ -19,11 +19,13 @@ package org.apache.paimon.deletionvectors.append; import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.fs.Path; import org.apache.paimon.index.IndexFileHandler; import org.apache.paimon.manifest.IndexManifestEntry; import org.apache.paimon.table.source.DeletionFile; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -49,10 +51,15 @@ public static AppendDeleteFileMaintainer fromDeletionFiles( 
touchedIndexFileNames.contains( indexManifestEntry.indexFile().fileName())) .collect(Collectors.toList()); + Map convertedFiles = new HashMap<>(); + deletionFiles.forEach( + (fileName, file) -> { + convertedFiles.put(DeletionFileKey.ofFileName(fileName), file); + }); return new AppendDeleteFileMaintainer( indexFileHandler.dvIndex(partition, UNAWARE_BUCKET), partition, manifests, - deletionFiles); + convertedFiles); } } diff --git a/paimon-core/src/test/java/org/apache/paimon/deletionvectors/append/AppendDeletionFileMaintainerTest.java b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/append/AppendDeletionFileMaintainerTest.java index 3a03985c885d..04a5e75f4fde 100644 --- a/paimon-core/src/test/java/org/apache/paimon/deletionvectors/append/AppendDeletionFileMaintainerTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/deletionvectors/append/AppendDeletionFileMaintainerTest.java @@ -21,7 +21,9 @@ import org.apache.paimon.CoreOptions; import org.apache.paimon.TestAppendFileStore; import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVector; +import org.apache.paimon.deletionvectors.FileNameKey; import org.apache.paimon.fs.FileIO; import org.apache.paimon.fs.local.LocalFileIO; import org.apache.paimon.index.DeletionVectorMeta; @@ -42,6 +44,7 @@ import java.util.List; import java.util.Map; +import static org.apache.paimon.table.BucketMode.UNAWARE_BUCKET; import static org.assertj.core.api.Assertions.assertThat; class AppendDeletionFileMaintainerTest { @@ -58,16 +61,17 @@ public void test(boolean bitmap64) throws Exception { Map> dvs = new HashMap<>(); dvs.put("f1", Arrays.asList(1, 3, 5)); dvs.put("f2", Arrays.asList(2, 4, 6)); - CommitMessageImpl commitMessage1 = store.writeDVIndexFiles(BinaryRow.EMPTY_ROW, 0, dvs); + CommitMessageImpl commitMessage1 = + store.writeDVIndexFiles(BinaryRow.EMPTY_ROW, UNAWARE_BUCKET, dvs); CommitMessageImpl commitMessage2 = 
store.writeDVIndexFiles( BinaryRow.EMPTY_ROW, - 1, + UNAWARE_BUCKET, Collections.singletonMap("f3", Arrays.asList(1, 2, 3))); store.commit(commitMessage1, commitMessage2); IndexPathFactory indexPathFactory = - store.pathFactory().indexFileFactory(BinaryRow.EMPTY_ROW, 0); + store.pathFactory().indexFileFactory(BinaryRow.EMPTY_ROW, UNAWARE_BUCKET); Map dataFileToDeletionFiles = new HashMap<>(); dataFileToDeletionFiles.putAll( createDeletionFileMapFromIndexFileMetas( @@ -91,6 +95,7 @@ public void test(boolean bitmap64) throws Exception { res = dvIFMaintainer.writeUnchangedDeletionVector(); assertThat(res.size()).isEqualTo(1); assertThat(res.get(0).kind()).isEqualTo(FileKind.DELETE); + assertThat(res.get(0).bucket()).isEqualTo(UNAWARE_BUCKET); // the dv of f1 and f2 are in one index file, and the dv of f1 is updated. // the dv of f2 need to be rewritten, and this index file should be marked as REMOVE. @@ -100,11 +105,20 @@ public void test(boolean bitmap64) throws Exception { assertThat(res.size()).isEqualTo(3); IndexManifestEntry entry = res.stream().filter(file -> file.kind() == FileKind.ADD).findAny().get(); - assertThat(entry.indexFile().dvRanges().containsKey("f2")).isTrue(); + assertThat(entry.indexFile().dvRanges().containsKey(DeletionFileKey.ofFileName("f2"))) + .isTrue(); + assertThat(res).allSatisfy(file -> assertThat(file.bucket()).isEqualTo(UNAWARE_BUCKET)); entry = res.stream() .filter(file -> file.kind() == FileKind.DELETE) - .filter(file -> file.bucket() == 0) + .filter( + file -> + file.indexFile() + .equals( + commitMessage1 + .newFilesIncrement() + .newIndexFiles() + .get(0))) .findAny() .get(); assertThat(entry.indexFile()) @@ -112,7 +126,14 @@ public void test(boolean bitmap64) throws Exception { entry = res.stream() .filter(file -> file.kind() == FileKind.DELETE) - .filter(file -> file.bucket() == 1) + .filter( + file -> + file.indexFile() + .equals( + commitMessage2 + .newFilesIncrement() + .newIndexFiles() + .get(0))) .findAny() .get(); 
assertThat(entry.indexFile()) @@ -123,10 +144,13 @@ private Map createDeletionFileMapFromIndexFileMetas( IndexPathFactory indexPathFactory, List fileMetas) { Map dataFileToDeletionFiles = new HashMap<>(); for (IndexFileMeta indexFileMeta : fileMetas) { - for (Map.Entry dvMeta : + for (Map.Entry dvMeta : indexFileMeta.dvRanges().entrySet()) { + if (!(dvMeta.getKey() instanceof FileNameKey)) { + continue; + } dataFileToDeletionFiles.put( - dvMeta.getKey(), + ((FileNameKey) dvMeta.getKey()).fileName(), new DeletionFile( indexPathFactory.toPath(indexFileMeta).toString(), dvMeta.getValue().offset(), diff --git a/paimon-core/src/test/java/org/apache/paimon/index/IndexFileHandlerTest.java b/paimon-core/src/test/java/org/apache/paimon/index/IndexFileHandlerTest.java index 70d5881e3405..ac5e256ebac3 100644 --- a/paimon-core/src/test/java/org/apache/paimon/index/IndexFileHandlerTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/index/IndexFileHandlerTest.java @@ -22,6 +22,7 @@ import org.apache.paimon.Snapshot; import org.apache.paimon.TestAppendFileStore; import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.fs.FileIO; import org.apache.paimon.fs.Path; import org.apache.paimon.fs.local.LocalFileIO; @@ -163,7 +164,10 @@ void testScanBucketsOnlyReturnsRequestedBuckets() throws Exception { assertThat(scanned).containsOnlyKeys(Pair.of(BinaryRow.EMPTY_ROW, 1)); assertThat(scanned.get(Pair.of(BinaryRow.EMPTY_ROW, 1))) .extracting(IndexFileMeta::dvRanges) - .allSatisfy(dvRanges -> assertThat(dvRanges).containsOnlyKeys("f1")); + .allSatisfy( + dvRanges -> + assertThat(dvRanges) + .containsOnlyKeys(DeletionFileKey.ofFileName("f1"))); assertThat( indexFileHandler.scanBuckets( diff --git a/paimon-core/src/test/java/org/apache/paimon/index/IndexFileMetaSerializerTest.java b/paimon-core/src/test/java/org/apache/paimon/index/IndexFileMetaSerializerTest.java index 7e4fe92c8d32..9c7ad1134b93 100644 --- 
a/paimon-core/src/test/java/org/apache/paimon/index/IndexFileMetaSerializerTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/index/IndexFileMetaSerializerTest.java @@ -18,13 +18,24 @@ package org.apache.paimon.index; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVectorsIndexFile; +import org.apache.paimon.io.DataInputDeserializer; +import org.apache.paimon.io.DataOutputSerializer; import org.apache.paimon.utils.ObjectSerializer; import org.apache.paimon.utils.ObjectSerializerTestBase; +import org.apache.paimon.utils.Range; +import org.junit.jupiter.api.Test; + +import java.io.IOException; import java.util.LinkedHashMap; import java.util.Random; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + /** Test for {@link org.apache.paimon.index.IndexFileMetaSerializer}. */ public class IndexFileMetaSerializerTest extends ObjectSerializerTestBase { @@ -61,13 +72,13 @@ public static IndexFileMeta randomHashIndexFile() { public static IndexFileMeta randomDeletionVectorIndexFile() { Random rnd = new Random(); - LinkedHashMap dvRanges = new LinkedHashMap<>(); + LinkedHashMap dvRanges = new LinkedHashMap<>(); dvRanges.put( - "my_file_name1", + DeletionFileKey.ofFileName("my_file_name1"), new DeletionVectorMeta( "my_file_name1", rnd.nextInt(), rnd.nextInt(), rnd.nextLong())); dvRanges.put( - "my_file_name2", + DeletionFileKey.ofFileName("my_file_name2"), new DeletionVectorMeta( "my_file_name2", rnd.nextInt(), rnd.nextInt(), rnd.nextLong())); return new IndexFileMeta( @@ -78,4 +89,58 @@ public static IndexFileMeta randomDeletionVectorIndexFile() { dvRanges, null); } + + public static IndexFileMeta rowIdRangeDeletionVectorIndexFile() { + DeletionFileKey rowIdRangeKey = DeletionFileKey.ofRange(new Range(10, 19)); + LinkedHashMap dvRanges = new LinkedHashMap<>(); + dvRanges.put(rowIdRangeKey, 
new DeletionVectorMeta(rowIdRangeKey, 4, 5, 6L)); + return new IndexFileMeta( + DeletionVectorsIndexFile.DELETION_VECTORS_INDEX, + "deletion_vectors_index_file_name", + 100, + 9, + dvRanges, + null); + } + + @Test + public void testRowIdRangeDeletionVectorsRoundTrip() { + IndexFileMeta indexFile = rowIdRangeDeletionVectorIndexFile(); + + IndexFileMeta actual = serializer().fromRow(serializer().toRow(indexFile)); + + assertThat(actual).isEqualTo(indexFile); + assertThat(actual.dvRanges()).containsOnlyKeys(DeletionFileKey.ofRange(new Range(10, 19))); + } + + @Test + public void testRowIdRangeDeletionVectorsSerializeRoundTrip() throws IOException { + IndexFileMeta indexFile = rowIdRangeDeletionVectorIndexFile(); + IndexFileMetaSerializer serializer = new IndexFileMetaSerializer(); + DataOutputSerializer out = new DataOutputSerializer(128); + + serializer.serialize(indexFile, out); + IndexFileMeta actual = + serializer.deserialize(new DataInputDeserializer(out.getCopyOfBuffer())); + + assertThat(actual).isEqualTo(indexFile); + } + + @Test + public void testRowIdRangeDeletionVectorsLegacyMarker() { + IndexFileMeta indexFile = rowIdRangeDeletionVectorIndexFile(); + IndexFileMetaSerializer serializer = new IndexFileMetaSerializer(); + + InternalRow row = serializer.toRow(indexFile); + + assertThat(serializer.fromRow(row)).isEqualTo(indexFile); + assertThat(DeletionVectorMeta.isLegacyMarker(row.getArray(4))).isTrue(); + assertThat(row.isNullAt(7)).isFalse(); + // ensure that old path will fast-fail + assertThatThrownBy( + () -> + IndexFileMetaSerializer.rowArrayDataToFileNameDvMetas( + row.getArray(4))) + .isInstanceOf(NullPointerException.class); + } } diff --git a/paimon-core/src/test/java/org/apache/paimon/manifest/IndexManifestEntrySerializerTest.java b/paimon-core/src/test/java/org/apache/paimon/manifest/IndexManifestEntrySerializerTest.java index 0429b8dae3e0..3dd6fbf7fb40 100644 --- 
a/paimon-core/src/test/java/org/apache/paimon/manifest/IndexManifestEntrySerializerTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/manifest/IndexManifestEntrySerializerTest.java @@ -18,13 +18,25 @@ package org.apache.paimon.manifest; +import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; +import org.apache.paimon.deletionvectors.DeletionVectorsIndexFile; +import org.apache.paimon.index.DeletionVectorMeta; +import org.apache.paimon.index.IndexFileMeta; import org.apache.paimon.utils.ObjectSerializer; import org.apache.paimon.utils.ObjectSerializerTestBase; +import org.apache.paimon.utils.Range; +import org.junit.jupiter.api.Test; + +import java.util.LinkedHashMap; import java.util.Random; import static org.apache.paimon.index.IndexFileMetaSerializerTest.randomIndexFile; +import static org.apache.paimon.index.IndexFileMetaSerializerTest.rowIdRangeDeletionVectorIndexFile; import static org.apache.paimon.io.DataFileTestUtils.row; +import static org.assertj.core.api.Assertions.assertThat; /** Test for {@link IndexManifestEntrySerializer}. 
*/ public class IndexManifestEntrySerializerTest extends ObjectSerializerTestBase { @@ -47,4 +59,59 @@ public static IndexManifestEntry randomIndexEntry() { rnd.nextInt(), randomIndexFile()); } + + @Test + public void testRowIdRangeDeletionVectorsRoundTrip() { + IndexManifestEntrySerializer serializer = new IndexManifestEntrySerializer(); + IndexManifestEntry entry = + new IndexManifestEntry( + FileKind.ADD, row(1), 2, rowIdRangeDeletionVectorIndexFile()); + + IndexManifestEntry actual = serializer.fromRow(serializer.toRow(entry)); + IndexFileMeta actualIndexFile = actual.indexFile(); + + assertThat(serializer.getVersion()).isEqualTo(1); + assertThat(actual).isEqualTo(entry); + assertThat(actualIndexFile.dvRanges()) + .containsOnlyKeys(DeletionFileKey.ofRange(new Range(10, 19))); + } + + @Test + public void testRowFilterGetters() { + IndexManifestEntrySerializer serializer = new IndexManifestEntrySerializer(); + BinaryRow partition = row(1); + IndexManifestEntry entry = + new IndexManifestEntry(FileKind.ADD, partition, 2, randomIndexFile()); + + InternalRow row = serializer.toRow(entry); + + assertThat(IndexManifestEntrySerializer.partitionGetter().apply(row)).isEqualTo(partition); + assertThat(IndexManifestEntrySerializer.bucketGetter().apply(row)).isEqualTo(2); + assertThat(IndexManifestEntrySerializer.indexTypeGetter().apply(row)) + .isEqualTo(entry.indexFile().indexType()); + } + + @Test + public void testEmptyDeletionVectorsRoundTrip() { + IndexManifestEntrySerializer serializer = new IndexManifestEntrySerializer(); + IndexManifestEntry entry = + new IndexManifestEntry( + FileKind.ADD, + row(1), + 2, + new IndexFileMeta( + DeletionVectorsIndexFile.DELETION_VECTORS_INDEX, + "empty-dv", + 1, + 0, + new LinkedHashMap(), + null)); + + InternalRow row = serializer.convertTo(entry); + IndexManifestEntry actual = serializer.convertFrom(serializer.getVersion(), row); + + assertThat(row.isNullAt(7)).isTrue(); + assertThat(row.isNullAt(10)).isTrue(); + 
assertThat(actual.indexFile().dvRanges()).isEmpty(); + } } diff --git a/paimon-core/src/test/java/org/apache/paimon/manifest/ManifestCommittableSerializerCompatibilityTest.java b/paimon-core/src/test/java/org/apache/paimon/manifest/ManifestCommittableSerializerCompatibilityTest.java index c3a6ef964986..1910567cadab 100644 --- a/paimon-core/src/test/java/org/apache/paimon/manifest/ManifestCommittableSerializerCompatibilityTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/manifest/ManifestCommittableSerializerCompatibilityTest.java @@ -19,6 +19,7 @@ package org.apache.paimon.manifest; import org.apache.paimon.data.Timestamp; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.index.DeletionVectorMeta; import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileMeta; @@ -94,9 +95,11 @@ public void testCompatibilityToV5CommitV11() throws IOException { null, globalIndexMeta); - LinkedHashMap dvRanges = new LinkedHashMap<>(); - dvRanges.put("dv_key1", new DeletionVectorMeta("dv_key1", 1, 2, 3L)); - dvRanges.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, 5L)); + LinkedHashMap dvRanges = new LinkedHashMap<>(); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key1"), new DeletionVectorMeta("dv_key1", 1, 2, 3L)); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key2"), new DeletionVectorMeta("dv_key2", 3, 4, 5L)); IndexFileMeta devIndexFile = new IndexFileMeta( "my_index_type", @@ -193,9 +196,11 @@ public void testCompatibilityToV4CommitV11() throws IOException { null, globalIndexMeta); - LinkedHashMap dvRanges = new LinkedHashMap<>(); - dvRanges.put("dv_key1", new DeletionVectorMeta("dv_key1", 1, 2, 3L)); - dvRanges.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, 5L)); + LinkedHashMap dvRanges = new LinkedHashMap<>(); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key1"), new DeletionVectorMeta("dv_key1", 1, 2, 3L)); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key2"), new 
DeletionVectorMeta("dv_key2", 3, 4, 5L)); IndexFileMeta devIndexFile = new IndexFileMeta( "my_index_type", @@ -284,9 +289,11 @@ public void testCompatibilityToV4CommitV10() throws IOException { new IndexFileMeta( "my_index_type", "my_index_file", 1024 * 100, 1002, null, null, null); - LinkedHashMap dvRanges = new LinkedHashMap<>(); - dvRanges.put("dv_key1", new DeletionVectorMeta("dv_key1", 1, 2, 3L)); - dvRanges.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, 5L)); + LinkedHashMap dvRanges = new LinkedHashMap<>(); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key1"), new DeletionVectorMeta("dv_key1", 1, 2, 3L)); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key2"), new DeletionVectorMeta("dv_key2", 3, 4, 5L)); IndexFileMeta devIndexFile = new IndexFileMeta( "my_index_type", @@ -370,9 +377,11 @@ public void testCompatibilityToV4CommitV9() throws IOException { Arrays.asList("asdf", "qwer", "zxcv")); List dataFiles = Collections.singletonList(dataFile); - LinkedHashMap dvRanges = new LinkedHashMap<>(); - dvRanges.put("dv_key1", new DeletionVectorMeta("dv_key1", 1, 2, 3L)); - dvRanges.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, 5L)); + LinkedHashMap dvRanges = new LinkedHashMap<>(); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key1"), new DeletionVectorMeta("dv_key1", 1, 2, 3L)); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key2"), new DeletionVectorMeta("dv_key2", 3, 4, 5L)); IndexFileMeta indexFile = new IndexFileMeta( "my_index_type", @@ -454,9 +463,11 @@ public void testCompatibilityToV4CommitV8() throws IOException { null); List dataFiles = Collections.singletonList(dataFile); - LinkedHashMap dvMetas = new LinkedHashMap<>(); - dvMetas.put("dv_key1", new DeletionVectorMeta("dv_key1", 1, 2, 3L)); - dvMetas.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, 5L)); + LinkedHashMap dvMetas = new LinkedHashMap<>(); + dvMetas.put( + DeletionFileKey.ofFileName("dv_key1"), new DeletionVectorMeta("dv_key1", 1, 2, 3L)); + dvMetas.put( + 
DeletionFileKey.ofFileName("dv_key2"), new DeletionVectorMeta("dv_key2", 3, 4, 5L)); IndexFileMeta indexFile = new IndexFileMeta( "my_index_type", "my_index_file", 1024 * 100, 1002, dvMetas, null); @@ -533,9 +544,11 @@ public void testCompatibilityToV4CommitV7() throws IOException { null); List dataFiles = Collections.singletonList(dataFile); - LinkedHashMap dvRanges = new LinkedHashMap<>(); - dvRanges.put("dv_key1", new DeletionVectorMeta("dv_key1", 1, 2, 3L)); - dvRanges.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, 5L)); + LinkedHashMap dvRanges = new LinkedHashMap<>(); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key1"), new DeletionVectorMeta("dv_key1", 1, 2, 3L)); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key2"), new DeletionVectorMeta("dv_key2", 3, 4, 5L)); IndexFileMeta indexFile = new IndexFileMeta( "my_index_type", "my_index_file", 1024 * 100, 1002, dvRanges, null); @@ -611,9 +624,11 @@ public void testCompatibilityToV3CommitV7() throws IOException { null); List dataFiles = Collections.singletonList(dataFile); - LinkedHashMap dvRanges = new LinkedHashMap<>(); - dvRanges.put("dv_key1", new DeletionVectorMeta("dv_key1", 1, 2, 3L)); - dvRanges.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, 5L)); + LinkedHashMap dvRanges = new LinkedHashMap<>(); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key1"), new DeletionVectorMeta("dv_key1", 1, 2, 3L)); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key2"), new DeletionVectorMeta("dv_key2", 3, 4, 5L)); IndexFileMeta indexFile = new IndexFileMeta( "my_index_type", "my_index_file", 1024 * 100, 1002, dvRanges, null); @@ -686,9 +701,11 @@ public void testCompatibilityToV3CommitV6() throws IOException { null); List dataFiles = Collections.singletonList(dataFile); - LinkedHashMap dvRanges = new LinkedHashMap<>(); - dvRanges.put("dv_key1", new DeletionVectorMeta("dv_key1", 1, 2, 3L)); - dvRanges.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, 5L)); + LinkedHashMap dvRanges = new 
LinkedHashMap<>(); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key1"), new DeletionVectorMeta("dv_key1", 1, 2, 3L)); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key2"), new DeletionVectorMeta("dv_key2", 3, 4, 5L)); IndexFileMeta indexFile = new IndexFileMeta( "my_index_type", "my_index_file", 1024 * 100, 1002, dvRanges, null); @@ -761,9 +778,11 @@ public void testCompatibilityToV3CommitV5() throws IOException { null); List dataFiles = Collections.singletonList(dataFile); - LinkedHashMap dvRanges = new LinkedHashMap<>(); - dvRanges.put("dv_key1", new DeletionVectorMeta("dv_key1", 1, 2, 3L)); - dvRanges.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, 5L)); + LinkedHashMap dvRanges = new LinkedHashMap<>(); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key1"), new DeletionVectorMeta("dv_key1", 1, 2, 3L)); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key2"), new DeletionVectorMeta("dv_key2", 3, 4, 5L)); IndexFileMeta indexFile = new IndexFileMeta( "my_index_type", "my_index_file", 1024 * 100, 1002, dvRanges, null); @@ -835,9 +854,13 @@ public void testCompatibilityToV3CommitV4() throws IOException { null); List dataFiles = Collections.singletonList(dataFile); - LinkedHashMap dvRanges = new LinkedHashMap<>(); - dvRanges.put("dv_key1", new DeletionVectorMeta("dv_key1", 1, 2, null)); - dvRanges.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, null)); + LinkedHashMap dvRanges = new LinkedHashMap<>(); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key1"), + new DeletionVectorMeta("dv_key1", 1, 2, null)); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key2"), + new DeletionVectorMeta("dv_key2", 3, 4, null)); IndexFileMeta indexFile = new IndexFileMeta( "my_index_type", "my_index_file", 1024 * 100, 1002, dvRanges, null); @@ -910,9 +933,13 @@ public void testCompatibilityToV3CommitV3() throws IOException { null); List dataFiles = Collections.singletonList(dataFile); - LinkedHashMap dvRanges = new LinkedHashMap<>(); - dvRanges.put("dv_key1", new 
DeletionVectorMeta("dv_key1", 1, 2, null)); - dvRanges.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, null)); + LinkedHashMap dvRanges = new LinkedHashMap<>(); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key1"), + new DeletionVectorMeta("dv_key1", 1, 2, null)); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key2"), + new DeletionVectorMeta("dv_key2", 3, 4, null)); IndexFileMeta indexFile = new IndexFileMeta( "my_index_type", "my_index_file", 1024 * 100, 1002, dvRanges, null); @@ -985,9 +1012,13 @@ public void testCompatibilityToV2CommitV2() throws IOException { null); List dataFiles = Collections.singletonList(dataFile); - LinkedHashMap dvRanges = new LinkedHashMap<>(); - dvRanges.put("dv_key1", new DeletionVectorMeta("dv_key1", 1, 2, null)); - dvRanges.put("dv_key2", new DeletionVectorMeta("dv_key2", 3, 4, null)); + LinkedHashMap dvRanges = new LinkedHashMap<>(); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key1"), + new DeletionVectorMeta("dv_key1", 1, 2, null)); + dvRanges.put( + DeletionFileKey.ofFileName("dv_key2"), + new DeletionVectorMeta("dv_key2", 3, 4, null)); IndexFileMeta indexFile = new IndexFileMeta( "my_index_type", "my_index_file", 1024 * 100, 1002, dvRanges, null); diff --git a/paimon-core/src/test/java/org/apache/paimon/operation/BlobFallbackRecordReaderTest.java b/paimon-core/src/test/java/org/apache/paimon/operation/BlobFallbackRecordReaderTest.java index 596dbcd955fa..f996f499a90c 100644 --- a/paimon-core/src/test/java/org/apache/paimon/operation/BlobFallbackRecordReaderTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/operation/BlobFallbackRecordReaderTest.java @@ -58,6 +58,17 @@ public class BlobFallbackRecordReaderTest { new DataField(1, SpecialFields.ROW_ID.name(), DataTypes.BIGINT()), new DataField( 2, SpecialFields.SEQUENCE_NUMBER.name(), DataTypes.BIGINT()))); + private static final RowType READ_ROW_TYPE_WITH_SEQUENCE_ONLY = + new RowType( + Arrays.asList( + new DataField(BLOB_INDEX, BLOB_FIELD, 
DataTypes.BLOB()), + new DataField( + 1, SpecialFields.SEQUENCE_NUMBER.name(), DataTypes.BIGINT()))); + private static final RowType READ_ROW_TYPE_WITH_ROW_ID_ONLY = + new RowType( + Arrays.asList( + new DataField(BLOB_INDEX, BLOB_FIELD, DataTypes.BLOB()), + new DataField(1, SpecialFields.ROW_ID.name(), DataTypes.BIGINT()))); @Test public void testBlobSequenceGroupReaderWithRowRanges() throws Exception { @@ -144,6 +155,48 @@ public void testBlobFallbackRecordReaderReturnsNullIfAllRowsArePlaceholders() th placeholderRows(newFile, 0, oldFile, 0)); assertThat(rows.rowIds).isEmpty(); + assertThat(rows.nullBlobRowIds).containsExactly(0L); + assertThat(rows.nullBlobSequenceNumbers).containsExactly(-1L); + assertThat(rows.nullBlobRowCount).isEqualTo(1); + assertThat(rows.placeholderRowCount).isEqualTo(0); + } + + @Test + public void testBlobFallbackRecordReaderReturnsSequenceIfAllRowsArePlaceholdersWithoutRowId() + throws Exception { + DataFileMeta newFile = blobFile("new-placeholder-file", 0, 1, 2); + DataFileMeta oldFile = blobFile("old-placeholder-file", 0, 1, 1); + + ReadResult rows = + readFallback( + Arrays.asList(newFile, oldFile), + null, + placeholderRows(newFile, 0, oldFile, 0), + READ_ROW_TYPE_WITH_SEQUENCE_ONLY); + + assertThat(rows.rowIds).isEmpty(); + assertThat(rows.nullBlobRowIds).isEmpty(); + assertThat(rows.nullBlobSequenceNumbers).containsExactly(-1L); + assertThat(rows.nullBlobRowCount).isEqualTo(1); + assertThat(rows.placeholderRowCount).isEqualTo(0); + } + + @Test + public void testBlobFallbackRecordReaderReturnsRowIdIfAllRowsArePlaceholdersWithoutSequence() + throws Exception { + DataFileMeta newFile = blobFile("new-placeholder-file", 0, 1, 2); + DataFileMeta oldFile = blobFile("old-placeholder-file", 0, 1, 1); + + ReadResult rows = + readFallback( + Arrays.asList(newFile, oldFile), + null, + placeholderRows(newFile, 0, oldFile, 0), + READ_ROW_TYPE_WITH_ROW_ID_ONLY); + + assertThat(rows.rowIds).isEmpty(); + 
assertThat(rows.nullBlobRowIds).containsExactly(0L); + assertThat(rows.nullBlobSequenceNumbers).isEmpty(); assertThat(rows.nullBlobRowCount).isEqualTo(1); assertThat(rows.placeholderRowCount).isEqualTo(0); } @@ -171,13 +224,23 @@ public void testBlobFallbackRecordReaderWithRowRanges() throws Exception { private static ReadResult readFallback( List files, List rowRanges, Set placeholderRows) throws Exception { + return readFallback(files, rowRanges, placeholderRows, READ_ROW_TYPE); + } + + private static ReadResult readFallback( + List files, + List rowRanges, + Set placeholderRows, + RowType readRowType) + throws Exception { return ReadResult.read( new BlobFallbackRecordReader( files, file -> oneRowPerBatchReader(fileRows(file, rowRanges, placeholderRows)), rowRanges, - READ_ROW_TYPE, - BLOB_INDEX)); + readRowType, + BLOB_INDEX), + readRowType); } private static ReadResult readSequenceGroup( @@ -189,13 +252,17 @@ private static ReadResult readSequenceGroup( throws Exception { return ReadResult.read( new BlobSequenceGroupRecordReader( + sequenceNumber, files, file -> oneRowPerBatchReader(fileRows(file, rowRanges)), rowRanges, READ_ROW_TYPE, BLOB_INDEX, + READ_ROW_TYPE.getFieldIndex(SpecialFields.ROW_ID.name()), + READ_ROW_TYPE.getFieldIndex(SpecialFields.SEQUENCE_NUMBER.name()), firstRowId, - lastRowId)); + lastRowId), + READ_ROW_TYPE); } private static DataFileMeta blobFile( @@ -359,15 +426,25 @@ public void close() {} } private static class ReadResult { + private final int rowIdIndex; + private final int sequenceNumberIndex; + final List rowIds = new ArrayList<>(); final List sequenceNumbers = new ArrayList<>(); + final List nullBlobRowIds = new ArrayList<>(); + final List nullBlobSequenceNumbers = new ArrayList<>(); final List batchSizes = new ArrayList<>(); int placeholderRowCount; int nullBlobRowCount; - static ReadResult read(RecordReader reader) throws Exception { + private ReadResult(RowType rowType) { + this.rowIdIndex = 
rowType.getFieldIndex(SpecialFields.ROW_ID.name()); + this.sequenceNumberIndex = rowType.getFieldIndex(SpecialFields.SEQUENCE_NUMBER.name()); + } + + static ReadResult read(RecordReader reader, RowType rowType) throws Exception { try { - ReadResult result = new ReadResult(); + ReadResult result = new ReadResult(rowType); RecordIterator batch; while ((batch = reader.readBatch()) != null) { int batchSize = 0; @@ -388,11 +465,21 @@ static ReadResult read(RecordReader reader) throws Exception { private void add(InternalRow row) { if (row.isNullAt(BLOB_INDEX)) { nullBlobRowCount++; + if (rowIdIndex >= 0) { + nullBlobRowIds.add(row.getLong(rowIdIndex)); + } + if (sequenceNumberIndex >= 0) { + nullBlobSequenceNumbers.add(row.getLong(sequenceNumberIndex)); + } } else if (row.getBlob(BLOB_INDEX) == BlobPlaceholder.INSTANCE) { placeholderRowCount++; } else { - rowIds.add(row.getLong(1)); - sequenceNumbers.add(row.getLong(2)); + if (rowIdIndex >= 0) { + rowIds.add(row.getLong(rowIdIndex)); + } + if (sequenceNumberIndex >= 0) { + sequenceNumbers.add(row.getLong(sequenceNumberIndex)); + } } } } diff --git a/paimon-core/src/test/java/org/apache/paimon/operation/FileStoreCommitTest.java b/paimon-core/src/test/java/org/apache/paimon/operation/FileStoreCommitTest.java index 5a386947c172..267684a2aea7 100644 --- a/paimon-core/src/test/java/org/apache/paimon/operation/FileStoreCommitTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/operation/FileStoreCommitTest.java @@ -28,6 +28,7 @@ import org.apache.paimon.catalog.SnapshotCommit; import org.apache.paimon.data.BinaryRow; import org.apache.paimon.deletionvectors.BucketedDvMaintainer; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVector; import org.apache.paimon.fs.Path; import org.apache.paimon.fs.local.LocalFileIO; @@ -928,11 +929,11 @@ public void testDVIndexFiles(boolean bitmap64) throws Exception { // assert 1 assertThat(store.scanDVIndexFiles(partition, 
0).size()).isEqualTo(2); BucketedDvMaintainer maintainer = store.createOrRestoreDVMaintainer(partition, 0); - Map dvs = maintainer.deletionVectors(); + Map dvs = maintainer.deletionVectors(); assertThat(dvs.size()).isEqualTo(2); - assertThat(dvs.get("f2").isDeleted(2)).isTrue(); - assertThat(dvs.get("f2").isDeleted(3)).isFalse(); - assertThat(dvs.get("f2").isDeleted(4)).isTrue(); + assertThat(dvs.get(DeletionFileKey.ofFileName("f2")).isDeleted(2)).isTrue(); + assertThat(dvs.get(DeletionFileKey.ofFileName("f2")).isDeleted(3)).isFalse(); + assertThat(dvs.get(DeletionFileKey.ofFileName("f2")).isDeleted(4)).isTrue(); // commit 2 List deleted = @@ -949,8 +950,8 @@ public void testDVIndexFiles(boolean bitmap64) throws Exception { maintainer = store.createOrRestoreDVMaintainer(partition, 0); dvs = maintainer.deletionVectors(); assertThat(dvs.size()).isEqualTo(2); - assertThat(dvs.get("f1").isDeleted(3)).isTrue(); - assertThat(dvs.get("f2").isDeleted(3)).isTrue(); + assertThat(dvs.get(DeletionFileKey.ofFileName("f1")).isDeleted(3)).isTrue(); + assertThat(dvs.get(DeletionFileKey.ofFileName("f2")).isDeleted(3)).isTrue(); } @Test diff --git a/paimon-core/src/test/java/org/apache/paimon/operation/commit/ConflictDetectionTest.java b/paimon-core/src/test/java/org/apache/paimon/operation/commit/ConflictDetectionTest.java index c3371e1c19b9..20757432554a 100644 --- a/paimon-core/src/test/java/org/apache/paimon/operation/commit/ConflictDetectionTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/operation/commit/ConflictDetectionTest.java @@ -20,6 +20,7 @@ import org.apache.paimon.Snapshot; import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.index.DeletionVectorMeta; import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileMeta; @@ -321,9 +322,9 @@ private SimpleFileEntryWithDV createFileEntryWithDV( private IndexManifestEntry createDvIndexEntry( String fileName, FileKind 
kind, List fileNames) { - LinkedHashMap dvRanges = new LinkedHashMap<>(); + LinkedHashMap dvRanges = new LinkedHashMap<>(); for (String name : fileNames) { - dvRanges.put(name, new DeletionVectorMeta(name, 1, 1, 1L)); + dvRanges.put(DeletionFileKey.ofFileName(name), new DeletionVectorMeta(name, 1, 1, 1L)); } return new IndexManifestEntry( kind, diff --git a/paimon-core/src/test/java/org/apache/paimon/schema/SchemaValidationTest.java b/paimon-core/src/test/java/org/apache/paimon/schema/SchemaValidationTest.java index ee3e25846a83..7005f11c302d 100644 --- a/paimon-core/src/test/java/org/apache/paimon/schema/SchemaValidationTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/schema/SchemaValidationTest.java @@ -858,6 +858,31 @@ public void testMergeOnReadCoexistsWithVisibilityCallbackAndPostponeBucket() { .doesNotThrowAnyException(); } + @Test + public void testDataEvolutionCoexistsWithDeletionVectors() { + List fields = + Arrays.asList( + new DataField(0, "f0", DataTypes.INT()), + new DataField(1, "f1", DataTypes.INT())); + Map options = new HashMap<>(); + options.put(CoreOptions.ROW_TRACKING_ENABLED.key(), "true"); + options.put(DATA_EVOLUTION_ENABLED.key(), "true"); + options.put("deletion-vectors.enabled", "true"); + options.put(BUCKET.key(), String.valueOf(-1)); + assertThatCode( + () -> + validateTableSchema( + new TableSchema( + 1, + fields, + 10, + singletonList("f0"), + emptyList(), + options, + ""))) + .doesNotThrowAnyException(); + } + @Test public void testBucketAppendBackwardCompatibility() { List fields = diff --git a/paimon-core/src/test/java/org/apache/paimon/separated/ClusteringTableTest.java b/paimon-core/src/test/java/org/apache/paimon/separated/ClusteringTableTest.java index eaf48f79c8e8..35396307e800 100644 --- a/paimon-core/src/test/java/org/apache/paimon/separated/ClusteringTableTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/separated/ClusteringTableTest.java @@ -27,6 +27,8 @@ import org.apache.paimon.data.BinaryString; 
import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.deletionvectors.DeletionFileKey; +import org.apache.paimon.deletionvectors.FileNameKey; import org.apache.paimon.disk.IOManager; import org.apache.paimon.fs.Path; import org.apache.paimon.io.DataFileMeta; @@ -456,7 +458,11 @@ public void testDeletionVectorCleanupAfterMerge() throws Exception { Set dvReferencedFiles = new HashSet<>(); for (IndexManifestEntry indexEntry : table.indexManifestFileReader().read(snapshot.indexManifest())) { - dvReferencedFiles.addAll(indexEntry.indexFile().dvRanges().keySet()); + for (DeletionFileKey key : indexEntry.indexFile().dvRanges().keySet()) { + if (key instanceof FileNameKey) { + dvReferencedFiles.add(((FileNameKey) key).fileName()); + } + } } // Every DV-referenced file must be an active data file diff --git a/paimon-core/src/test/java/org/apache/paimon/table/DataEvolutionDeletionVectorTest.java b/paimon-core/src/test/java/org/apache/paimon/table/DataEvolutionDeletionVectorTest.java new file mode 100644 index 000000000000..508f5f5cca99 --- /dev/null +++ b/paimon-core/src/test/java/org/apache/paimon/table/DataEvolutionDeletionVectorTest.java @@ -0,0 +1,560 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.table; + +import org.apache.paimon.CoreOptions; +import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.BlobData; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalArray; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.deletionvectors.BitmapDeletionVector; +import org.apache.paimon.deletionvectors.DeletionFileKey; +import org.apache.paimon.deletionvectors.DeletionVector; +import org.apache.paimon.deletionvectors.append.AppendDeleteFileMaintainer; +import org.apache.paimon.deletionvectors.append.BaseAppendDeleteFileMaintainer; +import org.apache.paimon.format.blob.BlobFileFormat; +import org.apache.paimon.globalindex.IndexedSplit; +import org.apache.paimon.index.DeletionVectorMeta; +import org.apache.paimon.index.IndexFileMeta; +import org.apache.paimon.io.CompactIncrement; +import org.apache.paimon.io.DataFileMeta; +import org.apache.paimon.io.DataIncrement; +import org.apache.paimon.manifest.FileKind; +import org.apache.paimon.manifest.IndexManifestEntry; +import org.apache.paimon.manifest.ManifestEntry; +import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.schema.Schema; +import org.apache.paimon.table.sink.BatchTableCommit; +import org.apache.paimon.table.sink.BatchTableWrite; +import org.apache.paimon.table.sink.BatchWriteBuilder; +import org.apache.paimon.table.sink.CommitMessage; +import org.apache.paimon.table.sink.CommitMessageImpl; +import org.apache.paimon.table.source.DataSplit; +import org.apache.paimon.table.source.ReadBuilder; +import org.apache.paimon.table.source.Split; +import org.apache.paimon.table.source.TableScan; +import org.apache.paimon.table.system.TableIndexesTable; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; +import 
org.apache.paimon.utils.Range; + +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.paimon.table.BucketMode.UNAWARE_BUCKET; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** Tests row-range deletion vectors for data evolution tables. */ +public class DataEvolutionDeletionVectorTest extends DataEvolutionTestBase { + + @Test + public void testReadWithRowRangeDeletionVectors() throws Exception { + FileStoreTable table = prepareTableWithStructuredUpdateAndDeletionVectors(); + + ReadBuilder readBuilder = + table.newReadBuilder().withRowRanges(Collections.singletonList(new Range(0, 14))); + TableScan.Plan plan = readBuilder.newScan().plan(); + assertThat(plan.splits()).hasSize(1); + + DataSplit dataSplit = toDataSplit(plan.splits().get(0)); + assertRegularFileRowRanges( + dataSplit.dataFiles(), + Arrays.asList( + new Range(0, 4), + new Range(0, 4), + new Range(5, 9), + new Range(5, 9), + new Range(10, 14), + new Range(10, 14))); + assertThat(dataSplit.dataEvolutionDeletionFiles()).isPresent(); + Map deletionFiles = dataSplit.dataEvolutionDeletionFiles().get(); + assertThat(deletionFiles.keySet()) + .containsExactlyInAnyOrder(new Range(0, 3), new Range(4, 10), new Range(11, 12)); + + assertThat(readRows(readBuilder, plan)) + .containsExactly( + "0|name-0|updated-0|0", + "2|name-2|updated-2|2", + "3|name-3|updated-3|3", + "5|name-5|updated-5|5", + "7|name-7|updated-7|7", + "8|name-8|updated-8|8", + "9|name-9|updated-9|9", + "11|name-11|updated-11|11", + "13|name-13|updated-13|13", + "14|name-14|updated-14|14"); + } + + @Test + public void testReadSingleRowIdWithRowRangeDeletionVectors() throws Exception { + FileStoreTable table = 
prepareTableWithStructuredUpdateAndDeletionVectors(); + + assertThat( + readRows( + table.newReadBuilder() + .withRowRanges(Collections.singletonList(new Range(4, 4))))) + .isEmpty(); + assertThat( + readRows( + table.newReadBuilder() + .withRowRanges(Collections.singletonList(new Range(7, 7))))) + .containsExactly("7|name-7|updated-7|7"); + } + + @Test + public void testReadProjectedColumnWithRowRangeDeletionVectors() throws Exception { + FileStoreTable table = prepareTableWithStructuredUpdateAndDeletionVectors(); + + ReadBuilder structuredColumnRead = + table.newReadBuilder() + .withProjection(new int[] {2}) + .withRowRanges(Collections.singletonList(new Range(0, 14))); + assertThat(readProjectedStrings(structuredColumnRead)) + .containsExactly( + "updated-0", + "updated-2", + "updated-3", + "updated-5", + "updated-7", + "updated-8", + "updated-9", + "updated-11", + "updated-13", + "updated-14"); + + ReadBuilder blobColumnRead = + table.newReadBuilder() + .withProjection(new int[] {3}) + .withRowRanges(Collections.singletonList(new Range(0, 14))); + assertThat(readProjectedBlobValues(blobColumnRead)) + .containsExactly(0, 2, 3, 5, 7, 8, 9, 11, 13, 14); + } + + @Test + public void testReadWithoutRowRangePushdownWithRowRangeDeletionVectors() throws Exception { + FileStoreTable table = prepareTableWithStructuredUpdateAndDeletionVectors(); + + assertThat(readRows(table.newReadBuilder())) + .containsExactly( + "0|name-0|updated-0|0", + "2|name-2|updated-2|2", + "3|name-3|updated-3|3", + "5|name-5|updated-5|5", + "7|name-7|updated-7|7", + "8|name-8|updated-8|8", + "9|name-9|updated-9|9", + "11|name-11|updated-11|11", + "13|name-13|updated-13|13", + "14|name-14|updated-14|14"); + + assertThat(readProjectedStrings(table.newReadBuilder().withProjection(new int[] {2}))) + .containsExactly( + "updated-0", + "updated-2", + "updated-3", + "updated-5", + "updated-7", + "updated-8", + "updated-9", + "updated-11", + "updated-13", + "updated-14"); + 
assertThat(readProjectedBlobValues(table.newReadBuilder().withProjection(new int[] {3}))) + .containsExactly(0, 2, 3, 5, 7, 8, 9, 11, 13, 14); + } + + @Test + public void testMergedRowCountWithSpanningRowRangeDeletionVector() throws Exception { + FileStoreTable table = prepareTableWithStructuredUpdateAndDeletionVectors(); + + DataSplit dataSplit = planDataSplit(table, new Range(0, 14)); + + assertThat(dataSplit.mergedRowCount()).hasValue(10L); + } + + @Test + public void testMergedRowCountIsEmptyWhenDeletionVectorExceedsPlannedDataFiles() + throws Exception { + FileStoreTable table = prepareTableWithStructuredUpdateAndDeletionVectors(); + + DataSplit dataSplit = planDataSplit(table, new Range(0, 4)); + + assertThat(dataSplit.mergedRowCount()).isEmpty(); + } + + @Test + public void testMergedRowCountWithContainedRowRangeDeletionVectors() throws Exception { + FileStoreTable table = + prepareTableWithStructuredUpdateAndDeletionVectors( + Arrays.asList( + new DvSpec(new Range(0, 3), 1), + new DvSpec(new Range(5, 9), 6), + new DvSpec(new Range(11, 12), 12))); + + DataSplit dataSplit = planDataSplit(table, new Range(0, 14)); + + assertThat(dataSplit.mergedRowCount()).hasValue(12L); + } + + @Test + public void testReadAfterRemovingAllRowRangeDeletionVectors() throws Exception { + FileStoreTable table = prepareTableWithStructuredUpdateAndDeletionVectors(); + ReadBuilder readWithDvs = + table.newReadBuilder().withRowRanges(Collections.singletonList(new Range(0, 14))); + assertThat(readRows(readWithDvs)).hasSize(10); + + AppendDeleteFileMaintainer maintainer = + BaseAppendDeleteFileMaintainer.forUnawareAppend( + table.store().newIndexFileHandler(), + table.latestSnapshot().get(), + BinaryRow.EMPTY_ROW); + maintainer.notifyRemovedDeletionVector(DeletionFileKey.ofRange(new Range(0, 3))); + maintainer.notifyRemovedDeletionVector(DeletionFileKey.ofRange(new Range(4, 10))); + maintainer.notifyRemovedDeletionVector(DeletionFileKey.ofRange(new Range(11, 12))); + + List 
newIndexFiles = new ArrayList<>(); + List deletedIndexFiles = new ArrayList<>(); + for (IndexManifestEntry entry : maintainer.persist()) { + if (entry.kind() == FileKind.ADD) { + newIndexFiles.add(entry.indexFile()); + } else if (entry.kind() == FileKind.DELETE) { + deletedIndexFiles.add(entry.indexFile()); + } + } + commitDefault( + Collections.singletonList( + new CommitMessageImpl( + BinaryRow.EMPTY_ROW, + UNAWARE_BUCKET, + null, + new DataIncrement( + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyList(), + newIndexFiles, + deletedIndexFiles), + CompactIncrement.emptyIncrement()))); + table = getTableDefault(); + + ReadBuilder readBuilder = + table.newReadBuilder().withRowRanges(Collections.singletonList(new Range(0, 14))); + TableScan.Plan plan = readBuilder.newScan().plan(); + assertThat(plan.splits()).hasSize(1); + DataSplit dataSplit = toDataSplit(plan.splits().get(0)); + + assertThat(dataSplit.dataEvolutionDeletionFiles()) + .hasValueSatisfying(deletionFiles -> assertThat(deletionFiles).isEmpty()); + assertThat(readRows(readBuilder, plan)) + .containsExactly( + "0|name-0|updated-0|0", + "1|name-1|updated-1|1", + "2|name-2|updated-2|2", + "3|name-3|updated-3|3", + "4|name-4|updated-4|4", + "5|name-5|updated-5|5", + "6|name-6|updated-6|6", + "7|name-7|updated-7|7", + "8|name-8|updated-8|8", + "9|name-9|updated-9|9", + "10|name-10|updated-10|10", + "11|name-11|updated-11|11", + "12|name-12|updated-12|12", + "13|name-13|updated-13|13", + "14|name-14|updated-14|14"); + } + + @Test + public void testCommitOverlappingRowRangeDeletionVectorsFails() throws Exception { + createTableDefault(); + + FileStoreTable table = getTableDefault(); + writeBaseRows(table); + + assertThatThrownBy( + () -> + commitDeletionVectors( + table, + Arrays.asList( + new DvSpec(new Range(0, 5), 1), + new DvSpec(new Range(5, 10), 6)))) + .isInstanceOf(RuntimeException.class) + .hasRootCauseInstanceOf(IllegalStateException.class) + .hasStackTraceContaining( + "Found 
overlapping row range [0, 5] and [5, 10] for data-evolution deletion files."); + } + + @Test + public void testTableIndexesShowsRowRangeDeletionVectorKeys() throws Exception { + FileStoreTable table = prepareTableWithStructuredUpdateAndDeletionVectors(); + TableIndexesTable indexesTable = new TableIndexesTable(table); + + ReadBuilder readBuilder = indexesTable.newReadBuilder(); + List dvKeys = new ArrayList<>(); + try (RecordReader reader = + readBuilder.newRead().createReader(readBuilder.newScan().plan())) { + reader.forEachRemaining( + row -> { + if (!row.isNullAt(6)) { + InternalArray dvRanges = row.getArray(6); + for (int i = 0; i < dvRanges.size(); i++) { + dvKeys.add( + dvRanges.getRow( + i, + DeletionVectorMeta.SCHEMA.getFieldCount()) + .getString(0) + .toString()); + } + } + }); + } + + assertThat(dvKeys).containsExactlyInAnyOrder("[0, 3]", "[4, 10]", "[11, 12]"); + } + + @Override + protected Schema schemaDefault() { + Schema.Builder schemaBuilder = Schema.newBuilder(); + schemaBuilder.column("f0", DataTypes.INT()); + schemaBuilder.column("f1", DataTypes.STRING()); + schemaBuilder.column("f2", DataTypes.STRING()); + schemaBuilder.column("f3", DataTypes.BLOB()); + schemaBuilder.option(CoreOptions.TARGET_FILE_SIZE.key(), "128 MB"); + schemaBuilder.option(CoreOptions.BLOB_TARGET_FILE_SIZE.key(), "1 b"); + schemaBuilder.option(CoreOptions.ROW_TRACKING_ENABLED.key(), "true"); + schemaBuilder.option(CoreOptions.DATA_EVOLUTION_ENABLED.key(), "true"); + schemaBuilder.option(CoreOptions.DELETION_VECTORS_ENABLED.key(), "true"); + return schemaBuilder.build(); + } + + private FileStoreTable prepareTableWithStructuredUpdateAndDeletionVectors() throws Exception { + return prepareTableWithStructuredUpdateAndDeletionVectors( + Arrays.asList( + new DvSpec(new Range(0, 3), 1), + new DvSpec(new Range(4, 10), 4, 6, 10), + new DvSpec(new Range(11, 12), 12))); + } + + private FileStoreTable prepareTableWithStructuredUpdateAndDeletionVectors( + List deletionVectorSpecs) throws 
Exception { + createTableDefault(); + + FileStoreTable table = getTableDefault(); + writeBaseRows(table); + assertRegularFileRowRanges( + table.store().newScan().plan().files().stream() + .map(ManifestEntry::file) + .collect(Collectors.toList()), + Arrays.asList(new Range(0, 4), new Range(5, 9), new Range(10, 14))); + assertFirstBlobFileRowRanges( + table, Arrays.asList(new Range(0, 0), new Range(1, 1), new Range(2, 2)), 15); + + updateStructuredColumn(table); + commitDeletionVectors(table, deletionVectorSpecs); + return getTableDefault(); + } + + private void writeBaseRows(FileStoreTable table) throws Exception { + for (int batch = 0; batch < 3; batch++) { + BatchWriteBuilder builder = table.newBatchWriteBuilder(); + try (BatchTableWrite write = builder.newWrite(); + BatchTableCommit commit = builder.newCommit()) { + for (int rowId = batch * 5; rowId < batch * 5 + 5; rowId++) { + write.write( + GenericRow.of( + rowId, + BinaryString.fromString("name-" + rowId), + BinaryString.fromString("base-" + rowId), + new BlobData(new byte[] {(byte) rowId}))); + } + commit.commit(write.prepareCommit()); + } + } + } + + private void updateStructuredColumn(FileStoreTable table) throws Exception { + RowType writeType = table.rowType().project(Collections.singletonList("f2")); + for (int batch = 0; batch < 3; batch++) { + BatchWriteBuilder builder = table.newBatchWriteBuilder(); + try (BatchTableWrite write = builder.newWrite().withWriteType(writeType); + BatchTableCommit commit = builder.newCommit()) { + long firstRowId = batch * 5L; + for (int rowId = batch * 5; rowId < batch * 5 + 5; rowId++) { + write.write(GenericRow.of(BinaryString.fromString("updated-" + rowId))); + } + List commitables = write.prepareCommit(); + setFirstRowId(commitables, firstRowId); + commit.commit(commitables); + } + } + } + + private void commitDeletionVectors(FileStoreTable table, List deletionVectorSpecs) + throws Exception { + AppendDeleteFileMaintainer maintainer = + 
BaseAppendDeleteFileMaintainer.forUnawareAppend( + table.store().newIndexFileHandler(), + table.latestSnapshot().get(), + BinaryRow.EMPTY_ROW); + + for (DvSpec spec : deletionVectorSpecs) { + DeletionVector deletionVector = new BitmapDeletionVector(); + for (long rowId : spec.deletedRowIds) { + deletionVector.delete(rowId - spec.range.from); + } + maintainer.notifyNewDeletionVector(DeletionFileKey.ofRange(spec.range), deletionVector); + } + + List newIndexFiles = new ArrayList<>(); + List deletedIndexFiles = new ArrayList<>(); + for (IndexManifestEntry entry : maintainer.persist()) { + if (entry.kind() == FileKind.ADD) { + newIndexFiles.add(entry.indexFile()); + } else if (entry.kind() == FileKind.DELETE) { + deletedIndexFiles.add(entry.indexFile()); + } + } + + commitDefault( + Collections.singletonList( + new CommitMessageImpl( + BinaryRow.EMPTY_ROW, + UNAWARE_BUCKET, + null, + new DataIncrement( + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyList(), + newIndexFiles, + deletedIndexFiles), + CompactIncrement.emptyIncrement()))); + } + + private static DataSplit planDataSplit(FileStoreTable table, Range range) { + ReadBuilder readBuilder = + table.newReadBuilder().withRowRanges(Collections.singletonList(range)); + TableScan.Plan plan = readBuilder.newScan().plan(); + assertThat(plan.splits()).hasSize(1); + return toDataSplit(plan.splits().get(0)); + } + + private static List readRows(ReadBuilder readBuilder, TableScan.Plan plan) + throws IOException { + List rows = new ArrayList<>(); + try (RecordReader reader = readBuilder.newRead().createReader(plan)) { + reader.forEachRemaining(row -> rows.add(formatRow(row))); + } + rows.sort((left, right) -> Integer.compare(rowId(left), rowId(right))); + return rows; + } + + private static List readRows(ReadBuilder readBuilder) throws IOException { + return readRows(readBuilder, readBuilder.newScan().plan()); + } + + private static List readProjectedStrings(ReadBuilder readBuilder) throws IOException 
{ + List rows = new ArrayList<>(); + try (RecordReader reader = + readBuilder.newRead().createReader(readBuilder.newScan().plan())) { + reader.forEachRemaining(row -> rows.add(row.getString(0).toString())); + } + rows.sort((left, right) -> Integer.compare(projectedRowId(left), projectedRowId(right))); + return rows; + } + + private static List readProjectedBlobValues(ReadBuilder readBuilder) + throws IOException { + List rows = new ArrayList<>(); + try (RecordReader reader = + readBuilder.newRead().createReader(readBuilder.newScan().plan())) { + reader.forEachRemaining(row -> rows.add(row.getBlob(0).toData()[0] & 0xFF)); + } + Collections.sort(rows); + return rows; + } + + private static int rowId(String row) { + return Integer.parseInt(row.substring(0, row.indexOf('|'))); + } + + private static int projectedRowId(String row) { + return Integer.parseInt(row.substring(row.lastIndexOf('-') + 1)); + } + + private static String formatRow(InternalRow row) { + return row.getInt(0) + + "|" + + row.getString(1) + + "|" + + row.getString(2) + + "|" + + (row.getBlob(3).toData()[0] & 0xFF); + } + + private static DataSplit toDataSplit(Split split) { + if (split instanceof IndexedSplit) { + return ((IndexedSplit) split).dataSplit(); + } + return (DataSplit) split; + } + + private static void assertRegularFileRowRanges( + List dataFiles, List expected) { + List actual = + dataFiles.stream() + .filter(file -> !BlobFileFormat.isBlobFile(file.fileName())) + .map(DataFileMeta::nonNullRowIdRange) + .sorted((left, right) -> Long.compare(left.from, right.from)) + .collect(Collectors.toList()); + assertThat(actual).isEqualTo(expected); + } + + private static void assertFirstBlobFileRowRanges( + FileStoreTable table, List expectedFirstRanges, int expectedCount) { + List actual = + table.store().newScan().plan().files().stream() + .map(ManifestEntry::file) + .filter(file -> BlobFileFormat.isBlobFile(file.fileName())) + .map(DataFileMeta::nonNullRowIdRange) + .sorted((left, right) -> 
Long.compare(left.from, right.from)) + .collect(Collectors.toList()); + assertThat(actual).hasSize(expectedCount); + assertThat(actual.subList(0, expectedFirstRanges.size())).isEqualTo(expectedFirstRanges); + } + + private static class DvSpec { + + private final Range range; + private final long[] deletedRowIds; + + private DvSpec(Range range, long... deletedRowIds) { + this.range = range; + this.deletedRowIds = deletedRowIds; + } + } +} diff --git a/paimon-core/src/test/java/org/apache/paimon/table/source/DataSplitCompatibleTest.java b/paimon-core/src/test/java/org/apache/paimon/table/source/DataSplitCompatibleTest.java index dcd888b7f1ca..77b1898b6ced 100644 --- a/paimon-core/src/test/java/org/apache/paimon/table/source/DataSplitCompatibleTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/table/source/DataSplitCompatibleTest.java @@ -38,6 +38,7 @@ import org.apache.paimon.types.TimestampType; import org.apache.paimon.utils.IOUtils; import org.apache.paimon.utils.InstantiationUtil; +import org.apache.paimon.utils.Range; import org.junit.jupiter.api.Test; @@ -50,6 +51,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.concurrent.ThreadLocalRandom; @@ -90,6 +92,31 @@ public void testSplitMergedRowCount() { assertThat(split.mergedRowCount()).hasValue(5700L); } + @Test + public void testDataEvolutionDeletionFilesSerialize() throws Exception { + List dataFiles = + Collections.singletonList( + newDataFile(10, SimpleStats.EMPTY_STATS, null).assignFirstRowId(0)); + Range key = new Range(0, 9); + Map dataEvolutionDeletionFiles = new LinkedHashMap<>(); + dataEvolutionDeletionFiles.put(key, new DeletionFile("p", 1, 2, 3L)); + DataSplit split = + DataSplit.builder() + .withSnapshot(1) + .withPartition(BinaryRow.EMPTY_ROW) + .withBucket(1) + .withBucketPath("my path") + .rawConvertible(true) + .withDataFiles(dataFiles) + 
.withDataEvolutionDeletionFiles(dataEvolutionDeletionFiles) + .build(); + + DataSplit actual = InstantiationUtil.clone(split); + + assertThat(actual.dataEvolutionDeletionFiles()).hasValue(dataEvolutionDeletionFiles); + assertThat(actual.mergedRowCount()).hasValue(7L); + } + @Test public void testSplitMinMaxValue() { Map> schemas = new HashMap<>(); diff --git a/paimon-core/src/test/java/org/apache/paimon/utils/DVMetaCacheTest.java b/paimon-core/src/test/java/org/apache/paimon/utils/DVMetaCacheTest.java index ab1ae19cfbd9..98ace86b028c 100644 --- a/paimon-core/src/test/java/org/apache/paimon/utils/DVMetaCacheTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/utils/DVMetaCacheTest.java @@ -22,6 +22,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.serializer.InternalRowSerializer; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.fs.Path; import org.apache.paimon.table.source.DeletionFile; import org.apache.paimon.types.DataTypes; @@ -50,44 +51,46 @@ public void testPutAndRead() { BinaryRow partition = partition("year=2023/month=12"); // Put data for bucket 1 with multiple files - Map dvFiles1 = new HashMap<>(); + Map dvFiles1 = new HashMap<>(); dvFiles1.put( - "data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-1.parquet", + key("data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-1.parquet"), new DeletionFile("index-a1b2c3d4-e5f6-7890-abcd-ef1234567890-1", 0L, 100L, 42L)); dvFiles1.put( - "data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-2.parquet", + key("data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-2.parquet"), new DeletionFile("index-a1b2c3d4-e5f6-7890-abcd-ef1234567890-1", 100L, 500L, null)); cache.put(path, partition, 1, dvFiles1); // Put data for bucket 2 with single file - Map dvFiles2 = new HashMap<>(); + Map dvFiles2 = new HashMap<>(); dvFiles2.put( - "data-b2c3d4e5-f6g7-8901-bcde-f23456789012-1.parquet", + key("data-b2c3d4e5-f6g7-8901-bcde-f23456789012-1.parquet"), new 
DeletionFile("index-b2c3d4e5-f6g7-8901-bcde-f23456789012-1", 0L, 300L, 12L)); cache.put(path, partition, 2, dvFiles2); // Read bucket 1 - verify multiple files - Map result1 = cache.read(path, partition, 1); + Map result1 = cache.read(path, partition, 1); assertThat(result1).isNotNull().hasSize(2); assertThat(result1) .containsKeys( - "data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-1.parquet", - "data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-2.parquet"); + key("data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-1.parquet"), + key("data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-2.parquet")); - DeletionFile file1 = result1.get("data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-1.parquet"); + DeletionFile file1 = + result1.get(key("data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-1.parquet")); assertThat(file1.path()).isEqualTo("index-a1b2c3d4-e5f6-7890-abcd-ef1234567890-1"); assertThat(file1.offset()).isEqualTo(0L); assertThat(file1.length()).isEqualTo(100L); assertThat(file1.cardinality()).isEqualTo(42L); - DeletionFile file2 = result1.get("data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-2.parquet"); + DeletionFile file2 = + result1.get(key("data-a1b2c3d4-e5f6-7890-abcd-ef1234567890-2.parquet")); assertThat(file2.path()).isEqualTo("index-a1b2c3d4-e5f6-7890-abcd-ef1234567890-1"); assertThat(file2.cardinality()).isNull(); // Read bucket 2 - verify single file - Map result2 = cache.read(path, partition, 2); + Map result2 = cache.read(path, partition, 2); assertThat(result2).isNotNull().hasSize(1); - assertThat(result2).containsKey("data-b2c3d4e5-f6g7-8901-bcde-f23456789012-1.parquet"); + assertThat(result2).containsKey(key("data-b2c3d4e5-f6g7-8901-bcde-f23456789012-1.parquet")); // Read non-existent key assertThat(cache.read(path, partition("year=2024/month=01"), 0)).isNull(); @@ -102,7 +105,7 @@ public void testEmptyMap() { cache.put(path, partition, 1, new HashMap<>()); // Should return empty map, not null - Map result = cache.read(path, partition, 1); + Map result = cache.read(path, partition, 1); 
assertThat(result).isNotNull().isEmpty(); } @@ -120,9 +123,9 @@ public void testLazyValue() { 1, () -> { invoked.incrementAndGet(); - Map dvFiles = new HashMap<>(); + Map dvFiles = new HashMap<>(); dvFiles.put( - "data-d4e5f6g7-h8i9-0123-defg-456789012345-1.parquet", + key("data-d4e5f6g7-h8i9-0123-defg-456789012345-1.parquet"), new DeletionFile( "index-d4e5f6g7-h8i9-0123-defg-456789012345-1", 0L, 100L, 1L)); return dvFiles; @@ -130,11 +133,11 @@ public void testLazyValue() { assertThat(invoked).hasValue(0); - Map result1 = cache.read(path, partition, 1); + Map result1 = cache.read(path, partition, 1); assertThat(result1).isNotNull().hasSize(1); assertThat(invoked).hasValue(1); - Map result2 = cache.read(path, partition, 1); + Map result2 = cache.read(path, partition, 1); assertThat(result2).isSameAs(result1); assertThat(invoked).hasValue(1); } @@ -147,9 +150,9 @@ public void testLazyValueInitializedOnceConcurrently() throws Exception { AtomicInteger invoked = new AtomicInteger(); CountDownLatch supplierEntered = new CountDownLatch(1); CountDownLatch releaseSupplier = new CountDownLatch(1); - Map dvFiles = new HashMap<>(); + Map dvFiles = new HashMap<>(); dvFiles.put( - "data-d4e5f6g7-h8i9-0123-defg-456789012345-1.parquet", + key("data-d4e5f6g7-h8i9-0123-defg-456789012345-1.parquet"), new DeletionFile("index-d4e5f6g7-h8i9-0123-defg-456789012345-1", 0L, 100L, 1L)); cache.putLazy( @@ -171,11 +174,11 @@ public void testLazyValueInitializedOnceConcurrently() throws Exception { ExecutorService executor = Executors.newFixedThreadPool(2); try { - Future> first = + Future> first = executor.submit(() -> cache.read(path, partition, 1)); assertThat(supplierEntered.await(5, TimeUnit.SECONDS)).isTrue(); - Future> second = + Future> second = executor.submit(() -> cache.read(path, partition, 1)); releaseSupplier.countDown(); @@ -195,18 +198,18 @@ public void testCacheEviction() { BinaryRow partition = partition("year=2023/month=08"); // Fill cache to capacity - Map dvFiles1 = new 
HashMap<>(); + Map dvFiles1 = new HashMap<>(); dvFiles1.put( - "data-e5f6g7h8-i9j0-1234-efgh-567890123456-1.parquet", + key("data-e5f6g7h8-i9j0-1234-efgh-567890123456-1.parquet"), new DeletionFile("index-e5f6g7h8-i9j0-1234-efgh-567890123456-1", 0L, 100L, 1L)); dvFiles1.put( - "data-e5f6g7h8-i9j0-1234-efgh-567890123456-2.parquet", + key("data-e5f6g7h8-i9j0-1234-efgh-567890123456-2.parquet"), new DeletionFile("index-e5f6g7h8-i9j0-1234-efgh-567890123456-1", 100L, 200L, 2L)); cache.put(path, partition, 1, dvFiles1); - Map dvFiles2 = new HashMap<>(); + Map dvFiles2 = new HashMap<>(); dvFiles2.put( - "data-f6g7h8i9-j0k1-2345-fghi-678901234567-1.parquet", + key("data-f6g7h8i9-j0k1-2345-fghi-678901234567-1.parquet"), new DeletionFile("index-f6g7h8i9-j0k1-2345-fghi-678901234567-1", 0L, 100L, 2L)); cache.put(path, partition, 2, dvFiles2); @@ -215,16 +218,16 @@ public void testCacheEviction() { assertThat(cache.read(path, partition, 2)).isNotNull(); // Add third entry, should evict itself - Map dvFiles3 = new HashMap<>(); + Map dvFiles3 = new HashMap<>(); dvFiles3.put( - "data-g7h8i9j0-k1l2-3456-ghij-789012345678-1.parquet", + key("data-g7h8i9j0-k1l2-3456-ghij-789012345678-1.parquet"), new DeletionFile("index-g7h8i9j0-k1l2-3456-ghij-789012345678-1", 0L, 100L, 3L)); cache.put(path, partition, 3, dvFiles3); // Add forth entry, should evict first one - Map dvFiles4 = new HashMap<>(); + Map dvFiles4 = new HashMap<>(); dvFiles4.put( - "data-g7h8i9j0-k1l2-311456-ghij-789012345678-1.parquet", + key("data-g7h8i9j0-k1l2-311456-ghij-789012345678-1.parquet"), new DeletionFile("index-g117h8i9j0-k1l2-3456-ghij-789012345678-1", 0L, 100L, 3L)); cache.put(path, partition, 4, dvFiles4); @@ -244,4 +247,8 @@ private BinaryRow partition(String partitionValue) { .toBinaryRow(GenericRow.of(BinaryString.fromString(partitionValue))) .copy(); } + + private DeletionFileKey key(String fileName) { + return DeletionFileKey.ofFileName(fileName); + } } diff --git 
a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/BatchFileStoreITCase.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/BatchFileStoreITCase.java index a3cbc264b96c..2b6c96a3e6e7 100644 --- a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/BatchFileStoreITCase.java +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/BatchFileStoreITCase.java @@ -21,6 +21,7 @@ import org.apache.paimon.CoreOptions; import org.apache.paimon.Snapshot; import org.apache.paimon.catalog.Catalog; +import org.apache.paimon.deletionvectors.DeletionFileKey; import org.apache.paimon.deletionvectors.DeletionVector; import org.apache.paimon.flink.util.AbstractTestBase; import org.apache.paimon.manifest.IndexManifestEntry; @@ -155,7 +156,8 @@ public void testFullCompactionNoDv() throws Catalog.TableNotExistException { assertThat(batchSql("SELECT * FROM T1 WHERE a = 1")).containsExactly(Row.of(1, "66", "77")); } - private Map deletionVectors(FileStoreTable table, Snapshot snapshot) { + private Map deletionVectors( + FileStoreTable table, Snapshot snapshot) { assertThat(snapshot.indexManifest()).isNotNull(); List indexManifestEntries = table.indexManifestFileReader().read(snapshot.indexManifest()); diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/SparkDataFileMeta.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/SparkDataFileMeta.scala index 5a9f68328ce2..6041fe788146 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/SparkDataFileMeta.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/SparkDataFileMeta.scala @@ -19,6 +19,7 @@ package org.apache.paimon.spark.commands import org.apache.paimon.data.BinaryRow +import org.apache.paimon.deletionvectors.DeletionFileKey import org.apache.paimon.io.DataFileMeta import 
org.apache.paimon.spark.PaimonImplicits import org.apache.paimon.table.source.{DataSplit, DeletionFile} @@ -59,7 +60,7 @@ object SparkDataFileMeta { dataSplit.bucket, totalBuckets, file, - dvFactory.create(file.fileName())) + dvFactory.create(DeletionFileKey.ofFileName(file.fileName()))) } }.toSeq diff --git a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/DeletionVectorTest.scala b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/DeletionVectorTest.scala index 6a34236572c4..1fd63719d1d4 100644 --- a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/DeletionVectorTest.scala +++ b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/DeletionVectorTest.scala @@ -19,7 +19,7 @@ package org.apache.paimon.spark.sql import org.apache.paimon.data.BinaryRow -import org.apache.paimon.deletionvectors.{BucketedDvMaintainer, BucketedDvMaintainerTest, DeletionVector} +import org.apache.paimon.deletionvectors.{BucketedDvMaintainer, BucketedDvMaintainerTest, DeletionVector, FileNameKey} import org.apache.paimon.fs.Path import org.apache.paimon.spark.PaimonSparkTestBase import org.apache.paimon.spark.read.PaimonSplitScan @@ -753,6 +753,7 @@ class DeletionVectorTest extends PaimonSparkTestBase with AdaptiveSparkPlanHelpe ) .deletionVectors() .asScala + .map { case (key: FileNameKey, dv) => key.fileName() -> dv } }.toMap }