Files
seaweedFS/weed/remote_storage/s3/s3_storage_client.go
Chris Lu f3c5ba3cd6 feat(filer): add lazy directory listing for remote mounts (#8615)
* feat(filer): add lazy directory listing for remote mounts

Directory listings on remote mounts previously only queried the local
filer store. With lazy mounts the listing was empty; with eager mounts
it went stale over time.

Add on-demand directory listing that fetches from remote and caches
results with a 5-minute TTL:

- Add `ListDirectory` to `RemoteStorageClient` interface (delimiter-based,
  single-level listing, separate from recursive `Traverse`)
- Implement in S3, GCS, and Azure backends using each platform's
  hierarchical listing API
- Add `maybeLazyListFromRemote` to filer: before each directory listing,
  check if the directory is under a remote mount with an expired cache,
  fetch from remote, persist entries to the local store, then let existing
  listing logic run on the populated store
- Use singleflight to deduplicate concurrent requests for the same directory
- Skip local-only entries (no RemoteEntry) to avoid overwriting unsynced uploads
- Errors are logged and swallowed (availability over consistency)

* refactor: extract xattr key to constant xattrRemoteListingSyncedAt

* feat: make listing cache TTL configurable per mount via listing_cache_ttl_seconds

Add listing_cache_ttl_seconds field to RemoteStorageLocation protobuf.
When 0 (default), lazy directory listing is disabled for that mount.
When >0, enables on-demand directory listing with the specified TTL.

Expose as -listingCacheTTL flag on remote.mount command.

* refactor: address review feedback for lazy directory listing

- Add context.Context to ListDirectory interface and all implementations
- Capture startTime before remote call for accurate TTL tracking
- Simplify S3 ListDirectory using ListObjectsV2PagesWithContext
- Make maybeLazyListFromRemote return void (errors always swallowed)
- Remove redundant trailing-slash path manipulation in caller
- Update tests to match new signatures

* When an existing entry has Remote != nil, we should merge remote metadata   into it rather than replacing it.

* fix(gcs): wrap ListDirectory iterator error with context

The raw iterator error was returned without bucket/path context,
making it harder to debug. Wrap it consistently with the S3 pattern.

* fix(s3): guard against nil pointer dereference in Traverse and ListDirectory

Some S3-compatible backends may return nil for LastModified, Size, or
ETag fields. Check for nil before dereferencing to prevent panics.

* fix(filer): remove blanket 2-minute timeout from lazy listing context

Individual SDK operations (S3, GCS, Azure) already have per-request
timeouts and retry policies. The blanket timeout could cut off large
directory listings mid-operation even though individual pages were
succeeding.

* fix(filer): preserve trace context in lazy listing with WithoutCancel

Use context.WithoutCancel(ctx) instead of context.Background() so
trace/span values from the incoming request are retained for
distributed tracing, while still decoupling cancellation.

* fix(filer): use Store.FindEntry for internal lookups, add Uid/Gid to files, fix updateDirectoryListingSyncedAt

- Use f.Store.FindEntry instead of f.FindEntry for staleness check and
  child lookups to avoid unnecessary lazy-fetch overhead
- Set OS_UID/OS_GID on new file entries for consistency with directories
- In updateDirectoryListingSyncedAt, use Store.UpdateEntry for existing
  directories instead of CreateEntry to avoid deleteChunksIfNotNew and
  NotifyUpdateEvent side effects

* fix(filer): distinguish not-found from store errors in lazy listing

Previously, any error from Store.FindEntry was treated as "not found,"
which could cause entry recreation/overwrite on transient DB failures.
Now check for filer_pb.ErrNotFound explicitly and skip entries or
bail out on real store errors.

* refactor(filer): use errors.Is for ErrNotFound comparisons
2026-03-13 09:36:54 -07:00

379 lines
11 KiB
Go

package s3
import (
"context"
"fmt"
"io"
"net/http"
"reflect"
"strings"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/aws/credentials"
"github.com/aws/aws-sdk-go/aws/request"
"github.com/aws/aws-sdk-go/aws/session"
v4 "github.com/aws/aws-sdk-go/aws/signer/v4"
"github.com/aws/aws-sdk-go/service/s3"
"github.com/aws/aws-sdk-go/service/s3/s3iface"
"github.com/aws/aws-sdk-go/service/s3/s3manager"
"github.com/seaweedfs/seaweedfs/weed/filer"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/remote_pb"
"github.com/seaweedfs/seaweedfs/weed/remote_storage"
"github.com/seaweedfs/seaweedfs/weed/util"
"github.com/seaweedfs/seaweedfs/weed/util/version"
)
func init() {
remote_storage.RemoteStorageClientMakers["s3"] = new(s3RemoteStorageMaker)
}
type s3RemoteStorageMaker struct{}
func (s s3RemoteStorageMaker) HasBucket() bool {
return true
}
func (s s3RemoteStorageMaker) Make(conf *remote_pb.RemoteConf) (remote_storage.RemoteStorageClient, error) {
client := &s3RemoteStorageClient{
supportTagging: true,
conf: conf,
}
config := &aws.Config{
Region: aws.String(conf.S3Region),
Endpoint: aws.String(conf.S3Endpoint),
S3ForcePathStyle: aws.Bool(conf.S3ForcePathStyle),
S3DisableContentMD5Validation: aws.Bool(true),
}
if conf.S3AccessKey != "" && conf.S3SecretKey != "" {
config.Credentials = credentials.NewStaticCredentials(conf.S3AccessKey, conf.S3SecretKey, "")
} else if conf.S3AccessKey == "" && conf.S3SecretKey == "" {
// Explicitly disable signing for public buckets.
config.Credentials = credentials.AnonymousCredentials
}
sess, err := session.NewSession(config)
if err != nil {
return nil, fmt.Errorf("create aws session: %w", err)
}
if conf.S3V4Signature {
sess.Handlers.Sign.PushBackNamed(v4.SignRequestHandler)
}
sess.Handlers.Build.PushBack(func(r *request.Request) {
r.HTTPRequest.Header.Set("User-Agent", "SeaweedFS/"+version.VERSION_NUMBER)
})
sess.Handlers.Build.PushFront(skipSha256PayloadSigning)
client.conn = s3.New(sess)
return client, nil
}
type s3RemoteStorageClient struct {
conf *remote_pb.RemoteConf
conn s3iface.S3API
supportTagging bool
}
var _ = remote_storage.RemoteStorageClient(&s3RemoteStorageClient{supportTagging: true})
func (s *s3RemoteStorageClient) Traverse(remote *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) (err error) {
pathKey := remote.Path[1:]
listInput := &s3.ListObjectsV2Input{
Bucket: aws.String(remote.Bucket),
ContinuationToken: nil,
Delimiter: nil, // not aws.String("/"), iterate through all entries
EncodingType: nil,
ExpectedBucketOwner: nil,
FetchOwner: nil,
MaxKeys: nil, // aws.Int64(1000),
Prefix: aws.String(pathKey),
RequestPayer: nil,
StartAfter: nil,
}
isLastPage := false
for !isLastPage && err == nil {
var localErr error
listErr := s.conn.ListObjectsV2Pages(listInput, func(page *s3.ListObjectsV2Output, lastPage bool) bool {
for _, content := range page.Contents {
key := *content.Key
key = "/" + key
dir, name := util.FullPath(key).DirAndName()
remoteEntry := &filer_pb.RemoteEntry{
StorageName: s.conf.Name,
}
if content.LastModified != nil {
remoteEntry.RemoteMtime = content.LastModified.Unix()
}
if content.Size != nil {
remoteEntry.RemoteSize = *content.Size
}
if content.ETag != nil {
remoteEntry.RemoteETag = *content.ETag
}
if err := visitFn(dir, name, false, remoteEntry); err != nil {
localErr = err
return false
}
}
listInput.ContinuationToken = page.NextContinuationToken
isLastPage = lastPage
return true
})
if listErr != nil {
err = fmt.Errorf("list %v: %w", remote, listErr)
}
if localErr != nil {
err = fmt.Errorf("process %v: %w", remote, localErr)
}
}
return
}
func (s *s3RemoteStorageClient) ListDirectory(ctx context.Context, loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) error {
pathKey := loc.Path[1:]
if pathKey != "" && !strings.HasSuffix(pathKey, "/") {
pathKey += "/"
}
listInput := &s3.ListObjectsV2Input{
Bucket: aws.String(loc.Bucket),
Prefix: aws.String(pathKey),
Delimiter: aws.String("/"),
}
var localErr error
listErr := s.conn.ListObjectsV2PagesWithContext(ctx, listInput, func(page *s3.ListObjectsV2Output, lastPage bool) bool {
for _, prefix := range page.CommonPrefixes {
if prefix.Prefix == nil {
continue
}
dirKey := "/" + strings.TrimSuffix(*prefix.Prefix, "/")
dir, name := util.FullPath(dirKey).DirAndName()
if err := visitFn(dir, name, true, nil); err != nil {
localErr = err
return false
}
}
for _, content := range page.Contents {
key := "/" + *content.Key
if strings.HasSuffix(key, "/") {
continue // skip directory markers
}
dir, name := util.FullPath(key).DirAndName()
remoteEntry := &filer_pb.RemoteEntry{
StorageName: s.conf.Name,
}
if content.LastModified != nil {
remoteEntry.RemoteMtime = content.LastModified.Unix()
}
if content.Size != nil {
remoteEntry.RemoteSize = *content.Size
}
if content.ETag != nil {
remoteEntry.RemoteETag = *content.ETag
}
if err := visitFn(dir, name, false, remoteEntry); err != nil {
localErr = err
return false
}
}
return true
})
if listErr != nil {
return fmt.Errorf("list directory %v: %w", loc, listErr)
}
if localErr != nil {
return fmt.Errorf("process directory %v: %w", loc, localErr)
}
return nil
}
func (s *s3RemoteStorageClient) StatFile(loc *remote_pb.RemoteStorageLocation) (remoteEntry *filer_pb.RemoteEntry, err error) {
resp, err := s.conn.HeadObject(&s3.HeadObjectInput{
Bucket: aws.String(loc.Bucket),
Key: aws.String(loc.Path[1:]),
})
if err != nil {
if reqErr, ok := err.(awserr.RequestFailure); ok && reqErr.StatusCode() == http.StatusNotFound {
return nil, remote_storage.ErrRemoteObjectNotFound
}
return nil, fmt.Errorf("stat s3 %s%s: %w", loc.Bucket, loc.Path, err)
}
remoteEntry = &filer_pb.RemoteEntry{
StorageName: s.conf.Name,
}
if resp.ContentLength != nil {
remoteEntry.RemoteSize = *resp.ContentLength
}
if resp.LastModified != nil {
remoteEntry.RemoteMtime = resp.LastModified.Unix()
}
if resp.ETag != nil {
remoteEntry.RemoteETag = *resp.ETag
}
return remoteEntry, nil
}
func (s *s3RemoteStorageClient) ReadFile(loc *remote_pb.RemoteStorageLocation, offset int64, size int64) (data []byte, err error) {
downloader := s3manager.NewDownloaderWithClient(s.conn, func(u *s3manager.Downloader) {
u.PartSize = int64(4 * 1024 * 1024)
u.Concurrency = 1
})
dataSlice := make([]byte, int(size))
writerAt := aws.NewWriteAtBuffer(dataSlice)
_, err = downloader.Download(writerAt, &s3.GetObjectInput{
Bucket: aws.String(loc.Bucket),
Key: aws.String(loc.Path[1:]),
Range: aws.String(fmt.Sprintf("bytes=%d-%d", offset, offset+size-1)),
})
if err != nil {
return nil, fmt.Errorf("failed to download file %s%s: %v", loc.Bucket, loc.Path, err)
}
return writerAt.Bytes(), nil
}
func (s *s3RemoteStorageClient) WriteDirectory(loc *remote_pb.RemoteStorageLocation, entry *filer_pb.Entry) (err error) {
return nil
}
func (s *s3RemoteStorageClient) RemoveDirectory(loc *remote_pb.RemoteStorageLocation) (err error) {
return nil
}
func (s *s3RemoteStorageClient) WriteFile(loc *remote_pb.RemoteStorageLocation, entry *filer_pb.Entry, reader io.Reader) (remoteEntry *filer_pb.RemoteEntry, err error) {
fileSize := int64(filer.FileSize(entry))
partSize := int64(8 * 1024 * 1024) // The minimum/default allowed part size is 5MB
for partSize*1000 < fileSize {
partSize *= 4
}
// Create an uploader with the session and custom options
uploader := s3manager.NewUploaderWithClient(s.conn, func(u *s3manager.Uploader) {
u.PartSize = partSize
u.Concurrency = 1
})
// process tagging
tags := ""
var awsTags *string
// openstack swift doesn't support s3 object tagging
if s.conf.S3SupportTagging {
for k, v := range entry.Extended {
if len(tags) > 0 {
tags = tags + "&"
}
tags = tags + k + "=" + string(v)
}
awsTags = aws.String(tags)
}
// Upload the file to S3.
_, err = uploader.Upload(&s3manager.UploadInput{
Bucket: aws.String(loc.Bucket),
Key: aws.String(loc.Path[1:]),
Body: reader,
Tagging: awsTags,
StorageClass: aws.String(s.conf.S3StorageClass),
})
//in case it fails to upload
if err != nil {
return nil, fmt.Errorf("upload to %s/%s%s: %v", loc.Name, loc.Bucket, loc.Path, err)
}
// read back the remote entry
return s.readFileRemoteEntry(loc)
}
func toTagging(attributes map[string][]byte) *s3.Tagging {
tagging := &s3.Tagging{}
for k, v := range attributes {
tagging.TagSet = append(tagging.TagSet, &s3.Tag{
Key: aws.String(k),
Value: aws.String(string(v)),
})
}
return tagging
}
func (s *s3RemoteStorageClient) readFileRemoteEntry(loc *remote_pb.RemoteStorageLocation) (*filer_pb.RemoteEntry, error) {
return s.StatFile(loc)
}
func (s *s3RemoteStorageClient) UpdateFileMetadata(loc *remote_pb.RemoteStorageLocation, oldEntry *filer_pb.Entry, newEntry *filer_pb.Entry) (err error) {
if reflect.DeepEqual(oldEntry.Extended, newEntry.Extended) {
return nil
}
tagging := toTagging(newEntry.Extended)
if len(tagging.TagSet) > 0 {
_, err = s.conn.PutObjectTagging(&s3.PutObjectTaggingInput{
Bucket: aws.String(loc.Bucket),
Key: aws.String(loc.Path[1:]),
Tagging: toTagging(newEntry.Extended),
})
} else {
_, err = s.conn.DeleteObjectTagging(&s3.DeleteObjectTaggingInput{
Bucket: aws.String(loc.Bucket),
Key: aws.String(loc.Path[1:]),
})
}
return
}
func (s *s3RemoteStorageClient) DeleteFile(loc *remote_pb.RemoteStorageLocation) (err error) {
_, err = s.conn.DeleteObject(&s3.DeleteObjectInput{
Bucket: aws.String(loc.Bucket),
Key: aws.String(loc.Path[1:]),
})
return
}
func (s *s3RemoteStorageClient) ListBuckets() (buckets []*remote_storage.Bucket, err error) {
resp, err := s.conn.ListBuckets(&s3.ListBucketsInput{})
if err != nil {
return nil, fmt.Errorf("list buckets: %w", err)
}
for _, b := range resp.Buckets {
buckets = append(buckets, &remote_storage.Bucket{
Name: *b.Name,
CreatedAt: *b.CreationDate,
})
}
return
}
func (s *s3RemoteStorageClient) CreateBucket(name string) (err error) {
_, err = s.conn.CreateBucket(&s3.CreateBucketInput{
ACL: nil,
Bucket: aws.String(name),
CreateBucketConfiguration: nil,
GrantFullControl: nil,
GrantRead: nil,
GrantReadACP: nil,
GrantWrite: nil,
GrantWriteACP: nil,
ObjectLockEnabledForBucket: nil,
})
if err != nil {
return fmt.Errorf("%s create bucket %s: %v", s.conf.Name, name, err)
}
return
}
func (s *s3RemoteStorageClient) DeleteBucket(name string) (err error) {
_, err = s.conn.DeleteBucket(&s3.DeleteBucketInput{
Bucket: aws.String(name),
})
if err != nil {
return fmt.Errorf("delete bucket %s: %v", name, err)
}
return
}