iceberg: handle filer-backed compaction inputs (#8638)

* iceberg: handle filer-backed compaction inputs

* iceberg: preserve upsert creation times

* iceberg: align compaction test schema

* iceberg: tighten compact output assertion

* iceberg: document compact output match

* iceberg: clear stale chunks in upsert helper

* iceberg: strengthen compaction integration coverage
This commit is contained in:
Chris Lu
2026-03-15 17:46:06 -07:00
committed by GitHub
parent 0afc675a55
commit e24630251c
3 changed files with 358 additions and 9 deletions

View File

@@ -109,6 +109,22 @@ func (h *Handler) compactDataFiles(
var deletedManifestEntries []iceberg.ManifestEntry
totalMerged := 0
// entrySeqNum returns a pointer to the entry's data sequence number, or
// nil when it is not set (a negative value is treated as "unset" here —
// presumably the iceberg-go sentinel; TODO confirm against the library).
entrySeqNum := func(entry iceberg.ManifestEntry) *int64 {
seqNum := entry.SequenceNum()
if seqNum < 0 {
return nil
}
return &seqNum
}
// entryFileSeqNum returns the entry's file sequence number when recorded,
// copying the value so the returned pointer does not alias library
// internals; otherwise it falls back to the data sequence number.
entryFileSeqNum := func(entry iceberg.ManifestEntry) *int64 {
if fileSeqNum := entry.FileSequenceNum(); fileSeqNum != nil {
value := *fileSeqNum
return &value
}
return entrySeqNum(entry)
}
metaDir := path.Join(s3tables.TablesPath, bucketName, tablePath, "metadata")
dataDir := path.Join(s3tables.TablesPath, bucketName, tablePath, "data")
@@ -189,7 +205,7 @@ func (h *Handler) compactDataFiles(
delEntry := iceberg.NewManifestEntry(
iceberg.EntryStatusDELETED,
&newSnapID,
nil, nil,
entrySeqNum(entry), entryFileSeqNum(entry),
entry.DataFile(),
)
deletedManifestEntries = append(deletedManifestEntries, delEntry)
@@ -221,7 +237,7 @@ func (h *Handler) compactDataFiles(
existingEntry := iceberg.NewManifestEntry(
iceberg.EntryStatusEXISTING,
func() *int64 { id := entry.SnapshotID(); return &id }(),
nil, nil,
entrySeqNum(entry), entryFileSeqNum(entry),
entry.DataFile(),
)
manifestEntries = append(manifestEntries, existingEntry)

View File

@@ -9,12 +9,15 @@ import (
"io"
"path"
"strings"
"sync"
"time"
"github.com/apache/iceberg-go/table"
"github.com/seaweedfs/seaweedfs/weed/filer"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
"github.com/seaweedfs/seaweedfs/weed/s3api/s3tables"
util_http "github.com/seaweedfs/seaweedfs/weed/util/http"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
@@ -25,6 +28,30 @@ type filerFileEntry struct {
Entry *filer_pb.Entry
}
// initGlobalHTTPClientOnce guards the lazy one-time call to
// util_http.InitGlobalHttpClient, performed just before chunked file
// content is first read through filer.NewFileReader.
var initGlobalHTTPClientOnce sync.Once
// singleFilerClient adapts an already-established filer gRPC client to the
// client interface expected by filer.NewFileReader, which needs a way to
// (re)obtain a client and to resolve volume-server locations.
type singleFilerClient struct {
	client filer_pb.SeaweedFilerClient
}

// WithFilerClient invokes fn with the wrapped client. The streaming flag is
// ignored: there is only the single pre-built connection to hand out.
func (c singleFilerClient) WithFilerClient(_ bool, fn func(filer_pb.SeaweedFilerClient) error) error {
	return fn(c.client)
}

// AdjustedUrl chooses the address used to fetch chunk data from a volume
// server, preferring the advertised public URL when present.
func (c singleFilerClient) AdjustedUrl(location *filer_pb.Location) string {
	switch {
	case location == nil:
		return ""
	case location.PublicUrl != "":
		return location.PublicUrl
	default:
		return location.Url
	}
}

// GetDataCenter reports no data-center affinity for this ad-hoc client.
func (c singleFilerClient) GetDataCenter() string {
	return ""
}
// listFilerEntries lists all entries in a directory.
func listFilerEntries(ctx context.Context, client filer_pb.SeaweedFilerClient, dir, prefix string) ([]*filer_pb.Entry, error) {
var entries []*filer_pb.Entry
@@ -174,15 +201,20 @@ func loadFileByIcebergPath(ctx context.Context, client filer_pb.SeaweedFilerClie
return nil, fmt.Errorf("file not found: %s/%s", dir, fileName)
}
// Inline content is available for small files (metadata, manifests, and
// manifest lists written by saveFilerFile). Larger files uploaded via S3
// are stored as chunks with empty Content — detect this and return a
// clear error rather than silently returning empty data.
if len(resp.Entry.Content) == 0 && len(resp.Entry.Chunks) > 0 {
return nil, fmt.Errorf("file %s/%s is stored in chunks; only inline content is supported", dir, fileName)
if len(resp.Entry.Content) > 0 || len(resp.Entry.Chunks) == 0 {
return resp.Entry.Content, nil
}
return resp.Entry.Content, nil
initGlobalHTTPClientOnce.Do(util_http.InitGlobalHttpClient)
reader := filer.NewFileReader(singleFilerClient{client: client}, resp.Entry)
if closer, ok := reader.(io.Closer); ok {
defer closer.Close()
}
data, err := io.ReadAll(reader)
if err != nil {
return nil, fmt.Errorf("read chunked file %s/%s: %w", dir, fileName, err)
}
return data, nil
}
// normalizeIcebergPath converts an Iceberg path (which may be an S3 URL, an