Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 18 additions & 12 deletions docs/generated/core_configuration.html
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,24 @@
<td>Boolean</td>
<td>Whether enable data evolution for row tracking table.</td>
</tr>
<tr>
<td><h5>data-evolution.merge-into.file-pruning</h5></td>
<td style="word-wrap: break-word;">true</td>
<td>Boolean</td>
<td>If true, enables the file-level pruning step for MergeInto partial column update on data-evolution tables. Set this to false when most files in the target partition are expected to be updated, so that the overhead of collecting touched file IDs outweighs the benefit of pruning untouched files.</td>
</tr>
<tr>
<td><h5>data-evolution.merge-into.source-persist</h5></td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>Whether to persist source when process merge into action on data evolution table.</td>
</tr>
<tr>
<td><h5>data-evolution.nested-field.enabled</h5></td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>Whether to enable sub-field-level data evolution for nested (struct) columns. When enabled, an update that only touches some sub-fields of a nested column writes an incremental file containing just those sub-fields (aligned by row id); when disabled, the whole top-level column is rewritten. Requires data-evolution.enabled=true.</td>
</tr>
<tr>
<td><h5>data-evolution.row-sidecar.enabled</h5></td>
<td style="word-wrap: break-word;">false</td>
Expand All @@ -494,18 +512,6 @@
<td>Double</td>
<td>Maximum selected row ratio for reading a row-store sidecar file. The value must be in (0, 1]. The sidecar is used only when the selected row ratio is no more than this value and the selected row count is no more than data-evolution.row-sidecar.max-selected-rows.</td>
</tr>
<tr>
<td><h5>data-evolution.merge-into.file-pruning</h5></td>
<td style="word-wrap: break-word;">true</td>
<td>Boolean</td>
<td>If true, enables the file-level pruning step for MergeInto partial column update on data-evolution tables. Set this to false when most files in the target partition are expected to be updated, so that the overhead of collecting touched file IDs outweighs the benefit of pruning untouched files.</td>
</tr>
<tr>
<td><h5>data-evolution.merge-into.source-persist</h5></td>
<td style="word-wrap: break-word;">false</td>
<td>Boolean</td>
<td>Whether to persist source when process merge into action on data evolution table.</td>
</tr>
<tr>
<td><h5>data-file.external-paths</h5></td>
<td style="word-wrap: break-word;">(none)</td>
Expand Down
16 changes: 16 additions & 0 deletions paimon-api/src/main/java/org/apache/paimon/CoreOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -2354,6 +2354,18 @@ public String toString() {
.defaultValue(false)
.withDescription("Whether enable data evolution for row tracking table.");

// Opt-in switch (default false) for sub-field-level data evolution of nested struct columns.
public static final ConfigOption<Boolean> DATA_EVOLUTION_NESTED_FIELD_ENABLED =
        key("data-evolution.nested-field.enabled")
                .booleanType()
                .defaultValue(false)
                .withDescription(
                        "Whether to enable sub-field-level data evolution for nested (struct) columns. "
                                + "When enabled, an update that only touches some sub-fields of a nested column "
                                + "writes an incremental file containing just those sub-fields (aligned by row id); "
                                + "when disabled, the whole top-level column is rewritten. "
                                + "Requires data-evolution.enabled=true.");

public static final ConfigOption<Boolean> DATA_EVOLUTION_ROW_SIDECAR_ENABLED =
key("data-evolution.row-sidecar.enabled")
.booleanType()
Expand Down Expand Up @@ -3958,6 +3970,10 @@ public boolean dataEvolutionEnabled() {
return options.get(DATA_EVOLUTION_ENABLED);
}

/** Returns the value configured for {@link #DATA_EVOLUTION_NESTED_FIELD_ENABLED}. */
public boolean dataEvolutionNestedFieldEnabled() {
    final boolean nestedFieldEnabled = options.get(DATA_EVOLUTION_NESTED_FIELD_ENABLED);
    return nestedFieldEnabled;
}

/** Returns the value configured for {@link #DATA_EVOLUTION_ROW_SIDECAR_ENABLED}. */
public boolean dataEvolutionRowSidecarEnabled() {
    final boolean rowSidecarEnabled = options.get(DATA_EVOLUTION_ROW_SIDECAR_ENABLED);
    return rowSidecarEnabled;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,9 @@ public TableSchema project(@Nullable List<String> writeCols) {
return new TableSchema(
version,
id,
new RowType(fields).project(writeCols).getFields(),
// writeCols may contain nested dotted paths (e.g. "nest.a") for sub-field-level
// data evolution; projectByPaths handles both plain top-level names and paths
new RowType(fields).projectByPaths(writeCols).getFields(),
highestFieldId,
partitionKeys,
primaryKeys,
Expand Down
159 changes: 159 additions & 0 deletions paimon-api/src/main/java/org/apache/paimon/types/RowType.java
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,165 @@ public RowType project(String... names) {
return project(Arrays.asList(names));
}

/**
 * Projects this row type onto a list of paths, where each path is either a plain field name or
 * a dotted path into a nested {@link RowType} (for example {@code ["f0", "nest.a"]}).
 *
 * <p>A path that names a field of this type exactly keeps that field whole. A dotted path that
 * does not name a field directly keeps only the addressed sub-field of the nested row. The
 * result lists fields in this type's declaration order, not in the order of {@code paths}.
 *
 * @param paths plain field names and/or dotted sub-field paths to keep
 * @return the projected row type, carrying the same nullability as this type
 * @throws IllegalArgumentException if a path names an unknown field, or addresses a sub-field
 *     of a field that is not a {@link RowType}
 */
public RowType projectByPaths(List<String> paths) {
    RowType projected = projectTypeByPaths(this, paths);
    return projected;
}

/**
 * Recursive worker behind {@code projectByPaths}: prunes {@code type} down to the fields and
 * sub-fields addressed by {@code paths}.
 *
 * @param type the row type to prune
 * @param paths plain names and/or dotted paths, relative to {@code type}
 * @return a row type with the nullability of {@code type} and the selected fields in
 *     declaration order
 * @throws IllegalArgumentException if a path names an unknown field, or a dotted path descends
 *     into a field that is not a {@link RowType}
 */
private static RowType projectTypeByPaths(RowType type, List<String> paths) {
    Set<String> declaredNames = new HashSet<>();
    for (DataField declared : type.getFields()) {
        declaredNames.add(declared.name());
    }

    // head name -> tails requested below it; a head selected as a whole field is registered
    // here too (possibly with an empty list) and additionally recorded in wholeSelections
    Map<String, List<String>> tailsByHead = new HashMap<>();
    Set<String> wholeSelections = new HashSet<>();
    for (String path : paths) {
        int separator = path.indexOf('.');
        // An exact field-name match always wins, so a column whose own name contains a dot is
        // still selected whole, exactly as the name-based project(List) would select it. Only
        // a dotted path that names no field directly is split into head and tail.
        boolean addressesSubField = separator >= 0 && !declaredNames.contains(path);
        if (addressesSubField) {
            tailsByHead
                    .computeIfAbsent(path.substring(0, separator), k -> new ArrayList<>())
                    .add(path.substring(separator + 1));
            continue;
        }
        tailsByHead.computeIfAbsent(path, k -> new ArrayList<>());
        wholeSelections.add(path);
    }

    // every requested head starts out unresolved and is ticked off once a field matches it
    Set<String> unresolved = new HashSet<>(tailsByHead.keySet());
    List<DataField> kept = new ArrayList<>();
    for (DataField field : type.getFields()) {
        String name = field.name();
        List<String> tails = tailsByHead.get(name);
        if (tails == null) {
            // not requested at all
            continue;
        }
        unresolved.remove(name);

        if (wholeSelections.contains(name) || tails.isEmpty()) {
            kept.add(field);
            continue;
        }

        if (!(field.type() instanceof RowType)) {
            // a dotted path descends into a non-ROW field; fail instead of silently keeping
            // the whole field so that invalid paths surface early
            throw new IllegalArgumentException(
                    "Cannot project sub-field(s) "
                            + tails
                            + " of non-ROW field '"
                            + field.name()
                            + "' in "
                            + type);
        }

        RowType prunedChild =
                projectTypeByPaths((RowType) field.type(), tails)
                        .copy(field.type().isNullable());
        kept.add(field.newType(prunedChild));
    }

    if (!unresolved.isEmpty()) {
        throw new IllegalArgumentException(
                "Cannot project by paths, unknown field(s) " + unresolved + " in " + type);
    }
    return new RowType(type.isNullable(), kept);
}

/**
 * Describes this (possibly partially nested) write type as a list of paths relative to {@code
 * fullType}.
 *
 * <p>A field is emitted by its plain name unless both it and its counterpart in {@code
 * fullType} are {@link RowType}s and this side does not cover every field of the counterpart;
 * such a partial struct is expanded into one dotted path per written sub-field. Fields whose id
 * is not present in {@code fullType} are emitted by name as well.
 *
 * @param fullType the complete row type that this write type is compared against
 * @return the paths, in the field order of this type
 * @throws UnsupportedOperationException if a field name containing {@code '.'} would have to
 *     take part in a dotted path, or if a partial struct is nested inside another partial
 *     struct
 */
public List<String> leafPaths(RowType fullType) {
    List<String> collected = new ArrayList<>();
    collectLeafPaths(this.getFields(), fullType, "", collected);
    return collected;
}

/**
 * Appends to {@code out} the path of every field in {@code writeFields}, expanding a partially
 * written struct into dotted sub-field paths.
 *
 * @param writeFields the fields actually written at this nesting level
 * @param fullType the reference row type for this nesting level
 * @param prefix dotted path of the enclosing field, or the empty string at top level
 * @param out accumulator receiving the paths in field order
 * @throws UnsupportedOperationException if a field name containing {@code '.'} would take part
 *     in a multi-segment path, or if a partial struct occurs below the top level
 */
private static void collectLeafPaths(
        List<DataField> writeFields, RowType fullType, String prefix, List<String> out) {
    for (DataField written : writeFields) {
        boolean belowTopLevel = !prefix.isEmpty();
        String name = written.name();
        String path = belowTopLevel ? prefix + "." + name : name;

        // A field the reference type does not know (e.g. the _ROW_ID / _SEQUENCE_NUMBER special
        // fields added by row tracking, which are not part of the table's logical row type)
        // cannot be split into sub-fields: emit it whole by name.
        if (!fullType.containsField(written.id())) {
            out.add(path);
            continue;
        }

        DataField reference = fullType.getField(written.id());
        boolean partialStruct = false;
        if (written.type() instanceof RowType && reference.type() instanceof RowType) {
            partialStruct = !coversFully((RowType) written.type(), (RowType) reference.type());
        }

        // A dotted path is unambiguous only if no name segment contains a literal '.'. Such a
        // name is fine when emitted whole at top level, but not as part of a multi-segment
        // nested path.
        boolean dottedName = name.indexOf('.') >= 0;
        if (dottedName && (belowTopLevel || partialStruct)) {
            throw new UnsupportedOperationException(
                    "Sub-field-level data evolution does not support a nested field whose name "
                            + "contains '.': "
                            + path);
        }

        if (!partialStruct) {
            out.add(path);
            continue;
        }

        // A partial struct inside another partial struct (a path deeper than one level, e.g.
        // nest.sub.x) cannot be composed back on read — the data-evolution read path only
        // assembles one nested level. Reject it here so such a file is never written or
        // committed.
        if (belowTopLevel) {
            throw new UnsupportedOperationException(
                    "Sub-field-level data evolution supports only one level of partial "
                            + "nesting; the nested sub-field '"
                            + path
                            + "' cannot be partially written. Write the whole '"
                            + path
                            + "' sub-field instead.");
        }

        collectLeafPaths(
                ((RowType) written.type()).getFields(), (RowType) reference.type(), path, out);
    }
}

/**
 * Checks whether {@code part} contains every field of {@code full}, matched by field id and
 * compared recursively wherever both sides are {@link RowType}s.
 *
 * @param part the candidate (possibly pruned) row type
 * @param full the reference row type
 * @return {@code true} if the field counts match and each field of {@code full} has a
 *     counterpart in {@code part} that, when both are rows, covers it fully as well
 */
private static boolean coversFully(RowType part, RowType full) {
    if (part.getFieldCount() != full.getFieldCount()) {
        return false;
    }
    for (DataField expected : full.getFields()) {
        int id = expected.id();
        if (!part.containsField(id)) {
            return false;
        }
        DataField actual = part.getField(id);
        boolean bothRows =
                actual.type() instanceof RowType && expected.type() instanceof RowType;
        if (bothRows && !coversFully((RowType) actual.type(), (RowType) expected.type())) {
            return false;
        }
    }
    return true;
}

private Map<String, DataField> nameToField() {
Map<String, DataField> nameToField = this.laziedNameToField;
if (nameToField == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,18 @@ public class DataEvolutionFileReader implements RecordReader<InternalRow> {
private final int[] rowOffsets;
private final int[] fieldOffsets;
private final RecordReader<InternalRow>[] readers;
@Nullable private final DataEvolutionRow.NestedField[] nested;

/**
 * Creates a reader without nested-field descriptors. Delegates to the four-argument
 * constructor, passing {@code null} for {@code nested}.
 */
public DataEvolutionFileReader(
int[] rowOffsets, int[] fieldOffsets, RecordReader<InternalRow>[] readers) {
this(rowOffsets, fieldOffsets, readers, null);
}

public DataEvolutionFileReader(
int[] rowOffsets,
int[] fieldOffsets,
RecordReader<InternalRow>[] readers,
@Nullable DataEvolutionRow.NestedField[] nested) {
checkArgument(rowOffsets != null, "Row offsets must not be null");
checkArgument(fieldOffsets != null, "Field offsets must not be null");
checkArgument(
Expand All @@ -70,12 +79,14 @@ public DataEvolutionFileReader(
this.rowOffsets = rowOffsets;
this.fieldOffsets = fieldOffsets;
this.readers = readers;
this.nested = nested;
}

@Override
@Nullable
public RecordIterator<InternalRow> readBatch() throws IOException {
DataEvolutionRow row = new DataEvolutionRow(readers.length, rowOffsets, fieldOffsets);
row.setNested(nested);
RecordIterator<InternalRow>[] iterators = new RecordIterator[readers.length];
for (int i = 0; i < readers.length; i++) {
RecordReader<InternalRow> reader = readers[i];
Expand Down
Loading
Loading