s3: fix remote object not caching (#7790)

* s3: fix remote object not caching

* s3: address review comments for remote object caching

- Fix leading slash in object name by using strings.TrimPrefix
- Return cached entry from CacheRemoteObjectToLocalCluster to get updated local chunk locations
- Reuse existing helper function instead of inline gRPC call

* s3/filer: add singleflight deduplication for remote object caching

- Add singleflight.Group to FilerServer to deduplicate concurrent cache operations
- Wrap CacheRemoteObjectToLocalCluster with singleflight to ensure only one
  caching operation runs per object when multiple clients request the same file
- Add early-return check for already-cached objects
- S3 API calls filer gRPC with timeout and graceful fallback on error
- Clear negative bucket cache when bucket is created via weed shell
- Add integration tests for remote cache with singleflight deduplication

This benefits all clients (S3, HTTP, Hadoop) accessing remote-mounted objects
by preventing redundant cache operations and improving concurrent access performance.

Fixes: https://github.com/seaweedfs/seaweedfs/discussions/7599

* fix: data race in concurrent remote object caching

- Add mutex to protect chunks slice from concurrent append
- Add mutex to protect fetchAndWriteErr from concurrent read/write
- Fix incorrect error check (was checking assignResult.Error instead of parseErr)
- Rename inner variable to avoid shadowing fetchAndWriteErr

* fix: address code review comments

- Remove duplicate remote caching block in GetObjectHandler, keep only singleflight version
- Add mutex protection for concurrent chunk slice and error access (data race fix)
- Use lazy initialization for S3 client in tests to avoid panic during package load
- Fix markdown linting: add language specifier to code fence, blank lines around tables
- Add 'all' target to Makefile as alias for test-with-server
- Remove unused 'util' import

* style: remove emojis from test files

* fix: add defensive checks and sort chunks by offset

- Add nil check and type assertion check for singleflight result
- Sort chunks by offset after concurrent fetching to maintain file order

* fix: improve test diagnostics and path normalization

- runWeedShell now returns error for better test diagnostics
- Add all targets to .PHONY in Makefile (logs-primary, logs-remote, health)
- Strip leading slash from normalizedObject to avoid double slashes in path

---------

Co-authored-by: chrislu <chris.lu@gmail.com>
Co-authored-by: Chris Lu <chrislusf@users.noreply.github.com>
This commit is contained in:
G-OD
2025-12-16 20:41:04 +00:00
committed by GitHub
parent 697b56003d
commit 504b258258
13 changed files with 992 additions and 36 deletions

View File

@@ -25,12 +25,20 @@ func MapRemoteStorageLocationPathToFullPath(localMountedDir util.FullPath, remot
return localMountedDir.Child(remoteLocationPath[len(remoteMountedLocation.Path):])
}
func CacheRemoteObjectToLocalCluster(filerClient filer_pb.FilerClient, remoteConf *remote_pb.RemoteConf, remoteLocation *remote_pb.RemoteStorageLocation, parent util.FullPath, entry *filer_pb.Entry) error {
return filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
_, err := client.CacheRemoteObjectToLocalCluster(context.Background(), &filer_pb.CacheRemoteObjectToLocalClusterRequest{
// CacheRemoteObjectToLocalCluster caches a remote object to the local cluster.
// It returns the updated entry with local chunk locations.
// Parameters remoteConf and remoteLocation are kept for backward compatibility but are not used.
func CacheRemoteObjectToLocalCluster(filerClient filer_pb.FilerClient, remoteConf *remote_pb.RemoteConf, remoteLocation *remote_pb.RemoteStorageLocation, parent util.FullPath, entry *filer_pb.Entry) (*filer_pb.Entry, error) {
var cachedEntry *filer_pb.Entry
err := filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
resp, cacheErr := client.CacheRemoteObjectToLocalCluster(context.Background(), &filer_pb.CacheRemoteObjectToLocalClusterRequest{
Directory: string(parent),
Name: entry.Name,
})
return err
if cacheErr == nil && resp != nil {
cachedEntry = resp.Entry
}
return cacheErr
})
return cachedEntry, err
}

View File

@@ -196,6 +196,9 @@ func (s3a *S3ApiServer) updateBucketConfigCacheFromEntry(entry *filer_pb.Entry)
// Update cache
glog.V(3).Infof("updateBucketConfigCacheFromEntry: updating cache for bucket %s, ObjectLockConfig=%+v", bucket, config.ObjectLockConfig)
s3a.bucketConfigCache.Set(bucket, config)
// Remove from negative cache since bucket now exists
// This is important for buckets created via weed shell or other external means
s3a.bucketConfigCache.RemoveNegativeCache(bucket)
}
// invalidateBucketConfigCache removes a bucket from the configuration cache

View File

@@ -3,14 +3,15 @@ package s3api
import (
"context"
"encoding/json"
"math"
"sync"
"github.com/aws/aws-sdk-go/service/s3"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
"github.com/seaweedfs/seaweedfs/weed/s3api/s3_constants"
"github.com/seaweedfs/seaweedfs/weed/s3api/s3err"
"github.com/seaweedfs/seaweedfs/weed/util"
"math"
"sync"
)
var loadBucketMetadataFromFiler = func(r *BucketRegistry, bucketName string) (*BucketMetaData, error) {
@@ -85,8 +86,10 @@ func (r *BucketRegistry) init() error {
func (r *BucketRegistry) LoadBucketMetadata(entry *filer_pb.Entry) {
bucketMetadata := buildBucketMetadata(r.s3a.iam, entry)
r.metadataCacheLock.Lock()
defer r.metadataCacheLock.Unlock()
r.metadataCache[entry.Name] = bucketMetadata
r.metadataCacheLock.Unlock()
// Remove from notFound cache since bucket now exists
r.unMarkNotFound(entry.Name)
}
func buildBucketMetadata(accountManager AccountManager, entry *filer_pb.Entry) *BucketMetaData {

View File

@@ -659,6 +659,13 @@ func (s3a *S3ApiServer) GetObjectHandler(w http.ResponseWriter, r *http.Request)
return
}
// Handle remote storage objects: cache to local cluster if object is remote-only
// This uses singleflight to deduplicate concurrent caching requests for the same object
// On cache error, gracefully falls back to streaming from remote
if objectEntryForSSE.IsInRemoteOnly() {
objectEntryForSSE = s3a.cacheRemoteObjectWithDedup(r.Context(), bucket, object, objectEntryForSSE)
}
// Re-check bucket policy with object entry for tag-based conditions (e.g., s3:ExistingObjectTag)
if errCode := s3a.recheckPolicyWithObjectEntry(r, bucket, object, string(s3_constants.ACTION_READ), objectEntryForSSE.Extended, "GetObjectHandler"); errCode != s3err.ErrNone {
s3err.WriteErrorResponse(w, r, errCode)
@@ -3319,3 +3326,63 @@ func (s3a *S3ApiServer) getMultipartInfo(entry *filer_pb.Entry, partNumber int)
// No part boundaries metadata or part not found
return partsCount, nil
}
// cacheRemoteObjectWithDedup caches a remote-only object to the local cluster.
// The filer server handles singleflight deduplication, so all clients (S3, HTTP, Hadoop) benefit.
// On cache error, returns the original entry (streaming from remote will still work).
// Uses a bounded timeout to avoid blocking requests indefinitely.
//
// Parameters:
//   - ctx: the request context; the cache call inherits its cancellation.
//   - bucket, object: the S3 bucket name and object key as received from the request.
//   - entry: the filer entry for the object; must have a non-nil RemoteEntry
//     (callers gate on IsInRemoteOnly() — NOTE(review): confirm all call sites do).
//
// Returns the entry to serve from: the original entry on any failure, or the
// original entry with its Chunks replaced by the freshly cached local chunks.
func (s3a *S3ApiServer) cacheRemoteObjectWithDedup(ctx context.Context, bucket, object string, entry *filer_pb.Entry) *filer_pb.Entry {
// Use a bounded timeout for caching to avoid blocking requests indefinitely
// 30 seconds should be enough for most objects; large objects may timeout but will still stream
const cacheTimeout = 30 * time.Second
cacheCtx, cancel := context.WithTimeout(ctx, cacheTimeout)
defer cancel()
// Build the full path for the object
// Normalize object path: remove duplicate slashes and leading slash to avoid double slashes in path
dir := s3a.option.BucketsPath + "/" + bucket
normalizedObject := strings.TrimPrefix(removeDuplicateSlashes(object), "/")
// Split the normalized key into parent directory and base name.
// idx > 0 (not >= 0) so a key that is a bare name is not split into an
// empty directory component; after TrimPrefix the string cannot start with "/".
if idx := strings.LastIndex(normalizedObject, "/"); idx > 0 {
dir = dir + "/" + normalizedObject[:idx]
normalizedObject = normalizedObject[idx+1:]
}
glog.V(2).Infof("cacheRemoteObjectWithDedup: caching %s/%s (remote size: %d)", bucket, object, entry.RemoteEntry.RemoteSize)
// Call the filer's CacheRemoteObjectToLocalCluster via gRPC
// The filer handles singleflight deduplication internally
var cachedEntry *filer_pb.Entry
err := s3a.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
resp, cacheErr := client.CacheRemoteObjectToLocalCluster(cacheCtx, &filer_pb.CacheRemoteObjectToLocalClusterRequest{
Directory: dir,
Name: normalizedObject,
})
if cacheErr != nil {
return cacheErr
}
// Capture the updated entry (with local chunk locations) for use below.
if resp != nil && resp.Entry != nil {
cachedEntry = resp.Entry
}
return nil
})
if err != nil {
// Caching failed - log and return original entry
// Streaming from remote storage will still work via filer proxy
if errors.Is(err, context.DeadlineExceeded) {
glog.V(1).Infof("cacheRemoteObjectWithDedup: timeout caching %s/%s after %v (will stream from remote)", bucket, object, cacheTimeout)
} else {
glog.Warningf("cacheRemoteObjectWithDedup: failed to cache %s/%s: %v (will stream from remote)", bucket, object, err)
}
return entry
}
// If caching succeeded and we got chunks, use the cached entry's chunks
// Only the chunk list is swapped in; all other metadata on the original
// entry (Extended attributes, SSE headers, etc.) is preserved as-is.
if cachedEntry != nil && len(cachedEntry.GetChunks()) > 0 {
glog.V(1).Infof("cacheRemoteObjectWithDedup: successfully cached %s/%s (%d chunks)", bucket, object, len(cachedEntry.GetChunks()))
// Preserve original entry metadata but use new chunks
entry.Chunks = cachedEntry.Chunks
}
return entry
}

View File

@@ -70,7 +70,7 @@ type S3ApiServer struct {
inFlightDataSize int64
inFlightUploads int64
inFlightDataLimitCond *sync.Cond
embeddedIam *EmbeddedIamApi // Embedded IAM API server (when enabled)
embeddedIam *EmbeddedIamApi // Embedded IAM API server (when enabled)
}
func NewS3ApiServer(router *mux.Router, option *S3ApiServerOption) (s3ApiServer *S3ApiServer, err error) {

View File

@@ -3,11 +3,13 @@ package weed_server
import (
"context"
"fmt"
"sort"
"strings"
"sync"
"time"
"github.com/seaweedfs/seaweedfs/weed/filer"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/operation"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
@@ -19,6 +21,59 @@ import (
)
// CacheRemoteObjectToLocalCluster handles the gRPC request to pull a
// remote-mounted object's data into the local cluster. Concurrent requests
// for the same Directory/Name collapse into a single underlying cache
// operation via singleflight; all waiters receive the shared result.
//
// NOTE(review): singleflight runs the operation with the FIRST caller's ctx;
// if that caller cancels, later waiters sharing the flight receive its
// cancellation error — confirm this is acceptable for long-running caches.
func (fs *FilerServer) CacheRemoteObjectToLocalCluster(ctx context.Context, req *filer_pb.CacheRemoteObjectToLocalClusterRequest) (*filer_pb.CacheRemoteObjectToLocalClusterResponse, error) {
// Use singleflight to deduplicate concurrent caching requests for the same object
// This benefits all clients: S3 API, filer HTTP, Hadoop, etc.
cacheKey := req.Directory + "/" + req.Name
result, err, shared := fs.remoteCacheGroup.Do(cacheKey, func() (interface{}, error) {
return fs.doCacheRemoteObjectToLocalCluster(ctx, req)
})
if shared {
glog.V(2).Infof("CacheRemoteObjectToLocalCluster: shared result for %s", cacheKey)
}
if err != nil {
return nil, err
}
// Defensive checks: doCacheRemoteObjectToLocalCluster always returns a
// non-nil *CacheRemoteObjectToLocalClusterResponse on success, so these
// branches guard against future refactors rather than expected states.
if result == nil {
return nil, fmt.Errorf("unexpected nil result from singleflight")
}
resp, ok := result.(*filer_pb.CacheRemoteObjectToLocalClusterResponse)
if !ok {
return nil, fmt.Errorf("unexpected result type from singleflight")
}
return resp, nil
}
// doCacheRemoteObjectToLocalCluster performs the actual caching operation.
// This is called from singleflight, so only one instance runs per object.
func (fs *FilerServer) doCacheRemoteObjectToLocalCluster(ctx context.Context, req *filer_pb.CacheRemoteObjectToLocalClusterRequest) (*filer_pb.CacheRemoteObjectToLocalClusterResponse, error) {
// find the entry first to check if already cached
entry, err := fs.filer.FindEntry(ctx, util.JoinPath(req.Directory, req.Name))
if err == filer_pb.ErrNotFound {
return nil, err
}
if err != nil {
return nil, fmt.Errorf("find entry %s/%s: %v", req.Directory, req.Name, err)
}
resp := &filer_pb.CacheRemoteObjectToLocalClusterResponse{}
// Early return if not a remote-only object or already cached
if entry.Remote == nil || entry.Remote.RemoteSize == 0 {
resp.Entry = entry.ToProtoEntry()
return resp, nil
}
if len(entry.GetChunks()) > 0 {
// Already has local chunks - already cached
glog.V(2).Infof("CacheRemoteObjectToLocalCluster: %s/%s already cached (%d chunks)", req.Directory, req.Name, len(entry.GetChunks()))
resp.Entry = entry.ToProtoEntry()
return resp, nil
}
glog.V(1).Infof("CacheRemoteObjectToLocalCluster: caching %s/%s (remote size: %d)", req.Directory, req.Name, entry.Remote.RemoteSize)
// load all mappings
mappingEntry, err := fs.filer.FindEntry(ctx, util.JoinPath(filer.DirectoryEtcRemote, filer.REMOTE_STORAGE_MOUNT_FILE))
@@ -52,17 +107,6 @@ func (fs *FilerServer) CacheRemoteObjectToLocalCluster(ctx context.Context, req
return nil, fmt.Errorf("unmarshal remote storage conf %s/%s: %v", filer.DirectoryEtcRemote, remoteStorageMountedLocation.Name+filer.REMOTE_STORAGE_CONF_SUFFIX, unMarshalErr)
}
// find the entry
entry, err := fs.filer.FindEntry(ctx, util.JoinPath(req.Directory, req.Name))
if err == filer_pb.ErrNotFound {
return nil, err
}
resp := &filer_pb.CacheRemoteObjectToLocalClusterResponse{}
if entry.Remote == nil || entry.Remote.RemoteSize == 0 {
return resp, nil
}
// detect storage option
so, err := fs.detectStorageOption(ctx, req.Directory, "", "", 0, "", "", "", "")
if err != nil {
@@ -81,6 +125,7 @@ func (fs *FilerServer) CacheRemoteObjectToLocalCluster(ctx context.Context, req
dest := util.FullPath(remoteStorageMountedLocation.Path).Child(string(util.FullPath(req.Directory).Child(req.Name))[len(localMountedDir):])
var chunks []*filer_pb.FileChunk
var chunksMu sync.Mutex
var fetchAndWriteErr error
var wg sync.WaitGroup
@@ -99,16 +144,28 @@ func (fs *FilerServer) CacheRemoteObjectToLocalCluster(ctx context.Context, req
// assign one volume server
assignResult, err := operation.Assign(ctx, fs.filer.GetMaster, fs.grpcDialOption, assignRequest, altRequest)
if err != nil {
fetchAndWriteErr = err
chunksMu.Lock()
if fetchAndWriteErr == nil {
fetchAndWriteErr = err
}
chunksMu.Unlock()
return
}
if assignResult.Error != "" {
fetchAndWriteErr = fmt.Errorf("assign: %v", assignResult.Error)
chunksMu.Lock()
if fetchAndWriteErr == nil {
fetchAndWriteErr = fmt.Errorf("assign: %v", assignResult.Error)
}
chunksMu.Unlock()
return
}
fileId, parseErr := needle.ParseFileIdFromString(assignResult.Fid)
if assignResult.Error != "" {
fetchAndWriteErr = fmt.Errorf("unrecognized file id %s: %v", assignResult.Fid, parseErr)
if parseErr != nil {
chunksMu.Lock()
if fetchAndWriteErr == nil {
fetchAndWriteErr = fmt.Errorf("unrecognized file id %s: %v", assignResult.Fid, parseErr)
}
chunksMu.Unlock()
return
}
@@ -125,7 +182,7 @@ func (fs *FilerServer) CacheRemoteObjectToLocalCluster(ctx context.Context, req
assignedServerAddress := pb.NewServerAddressWithGrpcPort(assignResult.Url, assignResult.GrpcPort)
var etag string
err = operation.WithVolumeServerClient(false, assignedServerAddress, fs.grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
resp, fetchAndWriteErr := volumeServerClient.FetchAndWriteNeedle(context.Background(), &volume_server_pb.FetchAndWriteNeedleRequest{
resp, fetchErr := volumeServerClient.FetchAndWriteNeedle(context.Background(), &volume_server_pb.FetchAndWriteNeedleRequest{
VolumeId: uint32(fileId.VolumeId),
NeedleId: uint64(fileId.Key),
Cookie: uint32(fileId.Cookie),
@@ -140,21 +197,23 @@ func (fs *FilerServer) CacheRemoteObjectToLocalCluster(ctx context.Context, req
Path: string(dest),
},
})
if fetchAndWriteErr != nil {
return fmt.Errorf("volume server %s fetchAndWrite %s: %v", assignResult.Url, dest, fetchAndWriteErr)
} else {
etag = resp.ETag
if fetchErr != nil {
return fmt.Errorf("volume server %s fetchAndWrite %s: %v", assignResult.Url, dest, fetchErr)
}
etag = resp.ETag
return nil
})
if err != nil && fetchAndWriteErr == nil {
fetchAndWriteErr = err
if err != nil {
chunksMu.Lock()
if fetchAndWriteErr == nil {
fetchAndWriteErr = err
}
chunksMu.Unlock()
return
}
chunks = append(chunks, &filer_pb.FileChunk{
chunk := &filer_pb.FileChunk{
FileId: assignResult.Fid,
Offset: localOffset,
Size: uint64(size),
@@ -165,13 +224,24 @@ func (fs *FilerServer) CacheRemoteObjectToLocalCluster(ctx context.Context, req
FileKey: uint64(fileId.Key),
Cookie: uint32(fileId.Cookie),
},
})
}
chunksMu.Lock()
chunks = append(chunks, chunk)
chunksMu.Unlock()
})
}
wg.Wait()
if fetchAndWriteErr != nil {
return nil, fetchAndWriteErr
chunksMu.Lock()
err = fetchAndWriteErr
// Sort chunks by offset to maintain file order
sort.Slice(chunks, func(i, j int) bool {
return chunks[i].Offset < chunks[j].Offset
})
chunksMu.Unlock()
if err != nil {
return nil, err
}
garbage := entry.GetChunks()

View File

@@ -11,6 +11,7 @@ import (
"time"
"github.com/seaweedfs/seaweedfs/weed/stats"
"golang.org/x/sync/singleflight"
"google.golang.org/grpc"
@@ -108,6 +109,9 @@ type FilerServer struct {
// track known metadata listeners
knownListenersLock sync.Mutex
knownListeners map[int32]int32
// deduplicates concurrent remote object caching operations
remoteCacheGroup singleflight.Group
}
func NewFilerServer(defaultMux, readonlyMux *http.ServeMux, option *FilerOption) (fs *FilerServer, err error) {

View File

@@ -345,7 +345,7 @@ func (c *commandRemoteCache) doComprehensiveSync(commandEnv *CommandEnv, writer
fmt.Fprintf(writer, "Caching %s... ", pathToCacheCopy)
if err := filer.CacheRemoteObjectToLocalCluster(commandEnv, remoteConf, remoteLocation, util.FullPath(dir), localEntry); err != nil {
if _, err := filer.CacheRemoteObjectToLocalCluster(commandEnv, remoteConf, remoteLocation, util.FullPath(dir), localEntry); err != nil {
fmt.Fprintf(writer, "failed: %v\n", err)
if executionErr == nil {
executionErr = err