s3: fix remote object not caching (#7790)
* s3: fix remote object not caching

* s3: address review comments for remote object caching

  - Fix leading slash in object name by using strings.TrimPrefix
  - Return cached entry from CacheRemoteObjectToLocalCluster to get updated local chunk locations
  - Reuse existing helper function instead of inline gRPC call

* s3/filer: add singleflight deduplication for remote object caching

  - Add singleflight.Group to FilerServer to deduplicate concurrent cache operations
  - Wrap CacheRemoteObjectToLocalCluster with singleflight to ensure only one caching operation runs per object when multiple clients request the same file
  - Add early-return check for already-cached objects
  - S3 API calls filer gRPC with timeout and graceful fallback on error
  - Clear negative bucket cache when bucket is created via weed shell
  - Add integration tests for remote cache with singleflight deduplication

  This benefits all clients (S3, HTTP, Hadoop) accessing remote-mounted objects by preventing redundant cache operations and improving concurrent access performance.

  Fixes: https://github.com/seaweedfs/seaweedfs/discussions/7599

* fix: data race in concurrent remote object caching

  - Add mutex to protect chunks slice from concurrent append
  - Add mutex to protect fetchAndWriteErr from concurrent read/write
  - Fix incorrect error check (was checking assignResult.Error instead of parseErr)
  - Rename inner variable to avoid shadowing fetchAndWriteErr

* fix: address code review comments

  - Remove duplicate remote caching block in GetObjectHandler, keep only singleflight version
  - Add mutex protection for concurrent chunk slice and error access (data race fix)
  - Use lazy initialization for S3 client in tests to avoid panic during package load
  - Fix markdown linting: add language specifier to code fence, blank lines around tables
  - Add 'all' target to Makefile as alias for test-with-server
  - Remove unused 'util' import

* style: remove emojis from test files

* fix: add defensive checks and sort chunks by offset

  - Add nil check and type assertion check for singleflight result
  - Sort chunks by offset after concurrent fetching to maintain file order

* fix: improve test diagnostics and path normalization

  - runWeedShell now returns error for better test diagnostics
  - Add all targets to .PHONY in Makefile (logs-primary, logs-remote, health)
  - Strip leading slash from normalizedObject to avoid double slashes in path

---------

Co-authored-by: chrislu <chris.lu@gmail.com>
Co-authored-by: Chris Lu <chrislusf@users.noreply.github.com>
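For readers unfamiliar with the pattern, the sketch below shows how golang.org/x/sync/singleflight deduplicates concurrent operations on the same key, which is the mechanism the commit applies to remote object caching. It is a minimal, self-contained illustration; remoteCacheGroup, cacheObjectLocally, and getWithDedup are invented names for this sketch, not the actual SeaweedFS code.

```go
package main

import (
	"fmt"
	"sync"

	"golang.org/x/sync/singleflight"
)

// remoteCacheGroup collapses concurrent calls that share the same key.
var remoteCacheGroup singleflight.Group

// cacheObjectLocally stands in for the expensive remote-to-local caching work.
func cacheObjectLocally(path string) (string, error) {
	fmt.Println("caching", path) // runs once per key, however many callers arrive
	return "cached:" + path, nil
}

// getWithDedup ensures only one caching operation runs per object path;
// concurrent callers block and receive the shared result.
func getWithDedup(path string) (string, error) {
	v, err, _ := remoteCacheGroup.Do(path, func() (interface{}, error) {
		return cacheObjectLocally(path)
	})
	if err != nil {
		return "", err
	}
	return v.(string), nil
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			res, _ := getWithDedup("/buckets/b/object.txt")
			fmt.Println(res)
		}()
	}
	wg.Wait()
}
```

However many goroutines call getWithDedup with the same path at once, singleflight runs the caching function once and hands the shared result to every waiter, which is the property the commit relies on to avoid redundant cache operations.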
@@ -196,6 +196,9 @@ func (s3a *S3ApiServer) updateBucketConfigCacheFromEntry(entry *filer_pb.Entry)
 	// Update cache
 	glog.V(3).Infof("updateBucketConfigCacheFromEntry: updating cache for bucket %s, ObjectLockConfig=%+v", bucket, config.ObjectLockConfig)
 	s3a.bucketConfigCache.Set(bucket, config)
+	// Remove from negative cache since bucket now exists
+	// This is important for buckets created via weed shell or other external means
+	s3a.bucketConfigCache.RemoveNegativeCache(bucket)
 }
 
 // invalidateBucketConfigCache removes a bucket from the configuration cache
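The hunk above clears the negative (not-found) entry for a bucket as soon as its configuration is cached. The sketch below shows why that matters, using invented names (bucketCache, MarkNotFound) rather than the real bucketConfigCache API: a stale negative entry would keep reporting a bucket as missing even after it was created out-of-band, e.g. via weed shell.

```go
package main

import (
	"fmt"
	"sync"
)

// bucketCache memoizes "bucket does not exist" results to avoid repeated lookups.
type bucketCache struct {
	mu       sync.Mutex
	notFound map[string]bool // negative entries
}

// MarkNotFound records a failed lookup.
func (c *bucketCache) MarkNotFound(bucket string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.notFound[bucket] = true
}

// RemoveNegativeCache mirrors the call added in the hunk above: clear the
// stale negative entry once the bucket is known to exist.
func (c *bucketCache) RemoveNegativeCache(bucket string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.notFound, bucket)
}

// Exists consults only the negative cache in this simplified sketch.
func (c *bucketCache) Exists(bucket string) bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	return !c.notFound[bucket]
}

func main() {
	c := &bucketCache{notFound: map[string]bool{}}
	c.MarkNotFound("new-bucket")        // an earlier lookup failed
	fmt.Println(c.Exists("new-bucket")) // false: stale negative entry
	c.RemoveNegativeCache("new-bucket") // bucket was just created externally
	fmt.Println(c.Exists("new-bucket")) // true
}
```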
@@ -3,14 +3,15 @@ package s3api
 import (
 	"context"
 	"encoding/json"
+	"math"
+	"sync"
+
 	"github.com/aws/aws-sdk-go/service/s3"
 	"github.com/seaweedfs/seaweedfs/weed/glog"
 	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
 	"github.com/seaweedfs/seaweedfs/weed/s3api/s3_constants"
 	"github.com/seaweedfs/seaweedfs/weed/s3api/s3err"
 	"github.com/seaweedfs/seaweedfs/weed/util"
-	"math"
-	"sync"
 )
 
 var loadBucketMetadataFromFiler = func(r *BucketRegistry, bucketName string) (*BucketMetaData, error) {
@@ -85,8 +86,10 @@ func (r *BucketRegistry) init() error {
 func (r *BucketRegistry) LoadBucketMetadata(entry *filer_pb.Entry) {
 	bucketMetadata := buildBucketMetadata(r.s3a.iam, entry)
 	r.metadataCacheLock.Lock()
+	defer r.metadataCacheLock.Unlock()
 	r.metadataCache[entry.Name] = bucketMetadata
-	r.metadataCacheLock.Unlock()
+	// Remove from notFound cache since bucket now exists
+	r.unMarkNotFound(entry.Name)
 }
 
 func buildBucketMetadata(accountManager AccountManager, entry *filer_pb.Entry) *BucketMetaData {
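The hunk above swaps a manual Unlock for a defer before adding the unMarkNotFound bookkeeping. A tiny sketch of the rationale, with illustrative names rather than the BucketRegistry code: with defer, the lock is released on every exit path, so statements can be appended to the function later without re-auditing where the unlock sits.

```go
package main

import "sync"

// registry is a minimal stand-in for a metadata cache guarded by a mutex.
type registry struct {
	mu    sync.Mutex
	cache map[string]string
}

func (r *registry) load(name, meta string) {
	r.mu.Lock()
	defer r.mu.Unlock() // released when load returns, on every path
	r.cache[name] = meta
	// further bookkeeping can be added here without touching the unlock
}

func main() {
	r := &registry{cache: map[string]string{}}
	r.load("bucket-a", "metadata")
}
```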
@@ -659,6 +659,13 @@ func (s3a *S3ApiServer) GetObjectHandler(w http.ResponseWriter, r *http.Request)
 		return
 	}
 
+	// Handle remote storage objects: cache to local cluster if object is remote-only
+	// This uses singleflight to deduplicate concurrent caching requests for the same object
+	// On cache error, gracefully falls back to streaming from remote
+	if objectEntryForSSE.IsInRemoteOnly() {
+		objectEntryForSSE = s3a.cacheRemoteObjectWithDedup(r.Context(), bucket, object, objectEntryForSSE)
+	}
+
 	// Re-check bucket policy with object entry for tag-based conditions (e.g., s3:ExistingObjectTag)
 	if errCode := s3a.recheckPolicyWithObjectEntry(r, bucket, object, string(s3_constants.ACTION_READ), objectEntryForSSE.Extended, "GetObjectHandler"); errCode != s3err.ErrNone {
 		s3err.WriteErrorResponse(w, r, errCode)
@@ -3319,3 +3326,63 @@ func (s3a *S3ApiServer) getMultipartInfo(entry *filer_pb.Entry, partNumber int)
 	// No part boundaries metadata or part not found
 	return partsCount, nil
 }
+
+// cacheRemoteObjectWithDedup caches a remote-only object to the local cluster.
+// The filer server handles singleflight deduplication, so all clients (S3, HTTP, Hadoop) benefit.
+// On cache error, returns the original entry (streaming from remote will still work).
+// Uses a bounded timeout to avoid blocking requests indefinitely.
+func (s3a *S3ApiServer) cacheRemoteObjectWithDedup(ctx context.Context, bucket, object string, entry *filer_pb.Entry) *filer_pb.Entry {
+	// Use a bounded timeout for caching to avoid blocking requests indefinitely
+	// 30 seconds should be enough for most objects; large objects may timeout but will still stream
+	const cacheTimeout = 30 * time.Second
+	cacheCtx, cancel := context.WithTimeout(ctx, cacheTimeout)
+	defer cancel()
+
+	// Build the full path for the object
+	// Normalize object path: remove duplicate slashes and leading slash to avoid double slashes in path
+	dir := s3a.option.BucketsPath + "/" + bucket
+	normalizedObject := strings.TrimPrefix(removeDuplicateSlashes(object), "/")
+	if idx := strings.LastIndex(normalizedObject, "/"); idx > 0 {
+		dir = dir + "/" + normalizedObject[:idx]
+		normalizedObject = normalizedObject[idx+1:]
+	}
+
+	glog.V(2).Infof("cacheRemoteObjectWithDedup: caching %s/%s (remote size: %d)", bucket, object, entry.RemoteEntry.RemoteSize)
+
+	// Call the filer's CacheRemoteObjectToLocalCluster via gRPC
+	// The filer handles singleflight deduplication internally
+	var cachedEntry *filer_pb.Entry
+	err := s3a.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
+		resp, cacheErr := client.CacheRemoteObjectToLocalCluster(cacheCtx, &filer_pb.CacheRemoteObjectToLocalClusterRequest{
+			Directory: dir,
+			Name:      normalizedObject,
+		})
+		if cacheErr != nil {
+			return cacheErr
+		}
+		if resp != nil && resp.Entry != nil {
+			cachedEntry = resp.Entry
+		}
+		return nil
+	})
+
+	if err != nil {
+		// Caching failed - log and return original entry
+		// Streaming from remote storage will still work via filer proxy
+		if errors.Is(err, context.DeadlineExceeded) {
+			glog.V(1).Infof("cacheRemoteObjectWithDedup: timeout caching %s/%s after %v (will stream from remote)", bucket, object, cacheTimeout)
+		} else {
+			glog.Warningf("cacheRemoteObjectWithDedup: failed to cache %s/%s: %v (will stream from remote)", bucket, object, err)
+		}
+		return entry
+	}
+
+	// If caching succeeded and we got chunks, use the cached entry's chunks
+	if cachedEntry != nil && len(cachedEntry.GetChunks()) > 0 {
+		glog.V(1).Infof("cacheRemoteObjectWithDedup: successfully cached %s/%s (%d chunks)", bucket, object, len(cachedEntry.GetChunks()))
+		// Preserve original entry metadata but use new chunks
+		entry.Chunks = cachedEntry.Chunks
+	}
+
+	return entry
+}
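The commit message also describes a filer-side data-race fix that is not visible in the hunk above: chunks fetched concurrently are appended under a mutex and then sorted by offset to restore file order. The sketch below illustrates that pattern under those assumptions; chunk and fetchChunksConcurrently are invented names, not the SeaweedFS implementation.

```go
package main

import (
	"fmt"
	"sort"
	"sync"
)

// chunk is a simplified stand-in for a file chunk with its position in the file.
type chunk struct {
	Offset int64
	Size   int64
}

// fetchChunksConcurrently fetches one chunk per offset in parallel, guarding
// the shared slice with a mutex, then sorts by offset to maintain file order.
func fetchChunksConcurrently(offsets []int64) []chunk {
	var (
		mu     sync.Mutex // protects chunks from concurrent append
		chunks []chunk
		wg     sync.WaitGroup
	)
	for _, off := range offsets {
		wg.Add(1)
		go func(off int64) {
			defer wg.Done()
			c := chunk{Offset: off, Size: 4 << 20} // stand-in for a real fetch
			mu.Lock()
			chunks = append(chunks, c)
			mu.Unlock()
		}(off)
	}
	wg.Wait()
	// Goroutines finish in arbitrary order; restore file order by offset.
	sort.Slice(chunks, func(i, j int) bool { return chunks[i].Offset < chunks[j].Offset })
	return chunks
}

func main() {
	fmt.Println(fetchChunksConcurrently([]int64{8 << 20, 0, 4 << 20}))
}
```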
@@ -70,7 +70,7 @@ type S3ApiServer struct {
 	inFlightDataSize      int64
 	inFlightUploads       int64
 	inFlightDataLimitCond *sync.Cond
-	embeddedIam           *EmbeddedIamApi // Embedded IAM API server (when enabled)
+	embeddedIam           *EmbeddedIamApi // Embedded IAM API server (when enabled)
 }
 
 func NewS3ApiServer(router *mux.Router, option *S3ApiServerOption) (s3ApiServer *S3ApiServer, err error) {