From 486cc1f63ec65cae6189649b92ba98a9fdc927b6 Mon Sep 17 00:00:00 2001
From: Adrian Stratulat
Date: Fri, 24 Jan 2025 08:44:19 +0100
Subject: [PATCH] Spark 3.5: Select for rewriting the files belonging to old
 partitioning schemas

---
 .../iceberg/actions/SizeBasedDataRewriter.java | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java
index 61b90d9fc6e3..adb4d7032cce 100644
--- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java
+++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java
@@ -75,13 +75,20 @@ protected Iterable<FileScanTask> filterFiles(Iterable<FileScanTask> tasks) {
   }
 
   private boolean shouldRewrite(FileScanTask task) {
-    return wronglySized(task) || tooManyDeletes(task) || tooHighDeleteRatio(task);
+    return wronglySized(task)
+        || tooManyDeletes(task)
+        || tooHighDeleteRatio(task)
+        || oldPartitioning(task);
   }
 
   private boolean tooManyDeletes(FileScanTask task) {
     return task.deletes() != null && task.deletes().size() >= deleteFileThreshold;
   }
 
+  private boolean oldPartitioning(FileScanTask task) {
+    return task.file().specId() != table().spec().specId();
+  }
+
   @Override
   protected Iterable<List<FileScanTask>> filterFileGroups(List<List<FileScanTask>> groups) {
     return Iterables.filter(groups, this::shouldRewrite);
@@ -92,7 +99,12 @@ private boolean shouldRewrite(List<FileScanTask> group) {
         || enoughContent(group)
         || tooMuchContent(group)
         || anyTaskHasTooManyDeletes(group)
-        || anyTaskHasTooHighDeleteRatio(group);
+        || anyTaskHasTooHighDeleteRatio(group)
+        || anyTaskHasOldPartitioning(group);
+  }
+
+  private boolean anyTaskHasOldPartitioning(List<FileScanTask> group) {
+    return group.stream().anyMatch(this::oldPartitioning);
   }
 
   private boolean anyTaskHasTooManyDeletes(List<FileScanTask> group) {