iceberg: keep split bins within target size (#8640)

This commit is contained in:
Chris Lu
2026-03-15 12:48:31 -07:00
committed by GitHub
parent d9d6707401
commit 5f19e3259f
2 changed files with 86 additions and 31 deletions

View File

@@ -363,9 +363,9 @@ func buildCompactionBins(entries []iceberg.ManifestEntry, targetSize int64, minF
} }
// splitOversizedBin splits a bin whose total size exceeds targetSize into // splitOversizedBin splits a bin whose total size exceeds targetSize into
// sub-bins that each stay under targetSize while meeting minFiles. // sub-bins that stay under targetSize. Bins that cannot reach minFiles
// Entries are sorted by size descending before splitting so that large // without violating targetSize are left uncompacted rather than merged into
// files are placed first, improving bin packing efficiency. // oversized bins.
func splitOversizedBin(bin compactionBin, targetSize int64, minFiles int) []compactionBin { func splitOversizedBin(bin compactionBin, targetSize int64, minFiles int) []compactionBin {
// Sort largest-first for better packing. // Sort largest-first for better packing.
sorted := make([]iceberg.ManifestEntry, len(bin.Entries)) sorted := make([]iceberg.ManifestEntry, len(bin.Entries))
@@ -394,41 +394,43 @@ func splitOversizedBin(bin compactionBin, targetSize int64, minFiles int) []comp
bins = append(bins, current) bins = append(bins, current)
} }
// Merge any bin with fewer than minFiles entries into its neighbor. var valid []compactionBin
// Repeat until no more merges are needed. This handles runts at the var pending []compactionBin
// end (from the split) and small leading bins (from largest-first for _, candidate := range bins {
// sorting placing single large files into their own bins). if len(candidate.Entries) >= minFiles {
for changed := true; changed && len(bins) > 1; { valid = append(valid, candidate)
changed = false continue
for i := 0; i < len(bins); i++ { }
if len(bins[i].Entries) >= minFiles { pending = append(pending, candidate)
continue }
// Try to fold entries from underfilled bins into valid bins when they fit.
for _, runt := range pending {
for _, entry := range runt.Entries {
bestIdx := -1
bestRemaining := int64(-1)
entrySize := entry.DataFile().FileSizeBytes()
for i := range valid {
remaining := targetSize - valid[i].TotalSize - entrySize
if remaining < 0 {
continue
}
if bestIdx == -1 || remaining < bestRemaining {
bestIdx = i
bestRemaining = remaining
}
} }
// Pick the better neighbor to merge into. if bestIdx >= 0 {
var target int valid[bestIdx].Entries = append(valid[bestIdx].Entries, entry)
if i == 0 { valid[bestIdx].TotalSize += entrySize
target = 1
} else if i == len(bins)-1 {
target = i - 1
} else if bins[i-1].TotalSize <= bins[i+1].TotalSize {
target = i - 1
} else {
target = i + 1
} }
bins[target].Entries = append(bins[target].Entries, bins[i].Entries...)
bins[target].TotalSize += bins[i].TotalSize
bins = append(bins[:i], bins[i+1:]...)
changed = true
break // restart scan after structural change
} }
} }
// Final guard: if a single remaining bin has fewer than minFiles if len(valid) == 0 {
// (entire input too small), return nothing.
if len(bins) == 1 && len(bins[0].Entries) < minFiles {
return nil return nil
} }
return bins return valid
} }
// partitionKey creates a string key from a partition map for grouping. // partitionKey creates a string key from a partition map for grouping.

View File

@@ -515,6 +515,59 @@ func TestBuildCompactionBinsMultiplePartitions(t *testing.T) {
} }
} }
func TestSplitOversizedBinRespectsTargetSize(t *testing.T) {
targetSize := int64(100)
minFiles := 2
entries := makeTestEntries(t, []testEntrySpec{
{path: "data/f1.parquet", size: 80, partition: map[int]any{}},
{path: "data/f2.parquet", size: 80, partition: map[int]any{}},
{path: "data/f3.parquet", size: 10, partition: map[int]any{}},
{path: "data/f4.parquet", size: 10, partition: map[int]any{}},
})
bins := splitOversizedBin(compactionBin{
PartitionKey: "__unpartitioned__",
Partition: map[int]any{},
Entries: entries,
TotalSize: 180,
}, targetSize, minFiles)
if len(bins) == 0 {
t.Fatal("expected at least one valid bin")
}
for i, bin := range bins {
if bin.TotalSize > targetSize {
t.Fatalf("bin %d exceeds target size: got %d want <= %d", i, bin.TotalSize, targetSize)
}
if len(bin.Entries) < minFiles {
t.Fatalf("bin %d does not meet minFiles: got %d want >= %d", i, len(bin.Entries), minFiles)
}
}
}
func TestSplitOversizedBinDropsImpossibleRunts(t *testing.T) {
targetSize := int64(100)
minFiles := 2
entries := makeTestEntries(t, []testEntrySpec{
{path: "data/f1.parquet", size: 60, partition: map[int]any{}},
{path: "data/f2.parquet", size: 60, partition: map[int]any{}},
{path: "data/f3.parquet", size: 60, partition: map[int]any{}},
})
bins := splitOversizedBin(compactionBin{
PartitionKey: "__unpartitioned__",
Partition: map[int]any{},
Entries: entries,
TotalSize: 180,
}, targetSize, minFiles)
if len(bins) != 0 {
t.Fatalf("expected no valid bins, got %d", len(bins))
}
}
type testEntrySpec struct { type testEntrySpec struct {
path string path string
size int64 size int64