iceberg: add delete file rewrite maintenance (#8664)

* iceberg: add delete file rewrite maintenance

* iceberg: preserve untouched delete files during rewrites

* iceberg: share detection threshold defaults

* iceberg: add partition-scoped maintenance filters (#8665)

* iceberg: add partition-scoped maintenance filters

* iceberg: tighten where-filter partition matching
This commit is contained in:
Chris Lu
2026-03-16 21:11:09 -07:00
committed by GitHub
parent a3717cd4b5
commit e5c0889473
11 changed files with 2138 additions and 115 deletions

View File

@@ -323,6 +323,10 @@ func (h *Handler) rewriteManifests(
if err != nil {
return "", nil, fmt.Errorf("load metadata: %w", err)
}
predicate, err := parsePartitionPredicate(config.Where, meta)
if err != nil {
return "", nil, err
}
currentSnap := meta.CurrentSnapshot()
if currentSnap == nil || currentSnap.ManifestList == "" {
@@ -349,10 +353,6 @@ func (h *Handler) rewriteManifests(
}
}
if int64(len(dataManifests)) < config.MinManifestsToRewrite {
return fmt.Sprintf("only %d data manifests, below threshold of %d", len(dataManifests), config.MinManifestsToRewrite), nil, nil
}
// Collect all entries from data manifests, grouped by partition spec ID
// so we write one merged manifest per spec (required for spec-evolved tables).
type specEntries struct {
@@ -363,10 +363,9 @@ func (h *Handler) rewriteManifests(
specMap := make(map[int32]*specEntries)
// Build a lookup from spec ID to PartitionSpec
specByID := make(map[int]iceberg.PartitionSpec)
for _, ps := range meta.PartitionSpecs() {
specByID[ps.ID()] = ps
}
specByID := specByID(meta)
var carriedDataManifests []iceberg.ManifestFile
var manifestsRewritten int64
for _, mf := range dataManifests {
manifestData, err := loadFileByIcebergPath(ctx, filerClient, bucketName, tablePath, mf.FilePath())
@@ -378,6 +377,28 @@ func (h *Handler) rewriteManifests(
return "", nil, fmt.Errorf("parse manifest %s: %w", mf.FilePath(), err)
}
if predicate != nil {
spec, found := specByID[int(mf.PartitionSpecID())]
if !found {
return "", nil, fmt.Errorf("partition spec %d not found in table metadata", mf.PartitionSpecID())
}
allMatch := len(entries) > 0
for _, entry := range entries {
match, err := predicate.Matches(spec, entry.DataFile().Partition())
if err != nil {
return "", nil, err
}
if !match {
allMatch = false
break
}
}
if !allMatch {
carriedDataManifests = append(carriedDataManifests, mf)
continue
}
}
sid := mf.PartitionSpecID()
se, ok := specMap[sid]
if !ok {
@@ -389,6 +410,11 @@ func (h *Handler) rewriteManifests(
specMap[sid] = se
}
se.entries = append(se.entries, entries...)
manifestsRewritten++
}
if manifestsRewritten < config.MinManifestsToRewrite {
return fmt.Sprintf("only %d data manifests, below threshold of %d", manifestsRewritten, config.MinManifestsToRewrite), nil, nil
}
if len(specMap) == 0 {
@@ -425,6 +451,7 @@ func (h *Handler) rewriteManifests(
// Write one merged manifest per partition spec
var newManifests []iceberg.ManifestFile
newManifests = append(newManifests, carriedDataManifests...)
totalEntries := 0
for _, se := range specMap {
totalEntries += len(se.entries)
@@ -514,11 +541,11 @@ func (h *Handler) rewriteManifests(
committed = true
metrics := map[string]int64{
MetricManifestsRewritten: int64(len(dataManifests)),
MetricManifestsRewritten: manifestsRewritten,
MetricEntriesTotal: int64(totalEntries),
MetricDurationMs: time.Since(start).Milliseconds(),
}
return fmt.Sprintf("rewrote %d manifests into %d (%d entries)", len(dataManifests), len(specMap), totalEntries), metrics, nil
return fmt.Sprintf("rewrote %d manifests into %d (%d entries)", manifestsRewritten, len(specMap), totalEntries), metrics, nil
}
// ---------------------------------------------------------------------------