* improve large file sync throughput for remote.cache and filer.sync
Three main throughput improvements:
1. Adaptive chunk sizing for remote.cache: targets ~32 chunks per file
instead of always starting at 5MB. A 500MB file now uses ~16MB chunks
(32 chunks) instead of 5MB chunks (100 chunks), cutting the number of
per-chunk operations (volume assign, gRPC call, needle write) by roughly 3x.
2. Configurable concurrency at every layer:
- remote.cache chunk concurrency: -chunkConcurrency flag (default 8)
- remote.cache S3 download concurrency: -downloadConcurrency flag
(default raised from 1 to 5 per chunk)
- filer.sync chunk concurrency: -chunkConcurrency flag (default 32)
3. S3 multipart download concurrency raised from 1 to 5: the S3 manager
downloader was using Concurrency=1, serializing all part downloads
within each chunk. This alone can speed up per-chunk downloads by up to 5x.
The concurrency values flow through the gRPC request chain:
shell command → CacheRemoteObjectToLocalClusterRequest →
FetchAndWriteNeedleRequest → S3 downloader
Zero values in the request mean "use server defaults", maintaining
full backward compatibility with existing callers.
Ref #8481
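
A minimal sketch of the adaptive sizing described above, with hypothetical
names (the actual SeaweedFS helper may be named and placed differently):
the chunk size is derived from the remote file size to target ~32 chunks,
clamped between the old 5MB floor and the full -maxMB ceiling, and grown
further only if needed to stay under 1000 chunks.

    // chooseChunkSize is an illustrative sketch, not the exact implementation.
    func chooseChunkSize(remoteSize int64, maxMB int) int64 {
        const targetChunks = 32
        minChunk := int64(5 * 1024 * 1024)     // previous fixed chunk size
        maxChunk := int64(maxMB) * 1024 * 1024 // full -maxMB, per review feedback

        chunkSize := remoteSize / targetChunks
        if chunkSize < minChunk {
            chunkSize = minChunk
        }
        if chunkSize > maxChunk {
            chunkSize = maxChunk
        }
        // Safety net: allow growth past maxChunk so the file never splits into
        // more than 1000 chunks, computed directly instead of a doubling loop.
        if minForCap := (remoteSize + 999) / 1000; chunkSize < minForCap {
            chunkSize = minForCap
        }
        return chunkSize
    }

With these numbers a 500MB file lands on ~16MB chunks (32 chunks), matching
the example above.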
* fix: use full maxMB for chunk size cap and remove loop guard
Address review feedback:
- Use full maxMB instead of maxMB/2 for maxChunkSize to avoid
unnecessarily limiting chunk size for very large files.
- Remove chunkSize < maxChunkSize guard from the safety loop so it
can always grow past maxChunkSize when needed to stay under 1000
chunks (e.g., extremely large files with small maxMB).
* address review feedback: help text, validation, naming, docs
- Fix help text for -chunkConcurrency and -downloadConcurrency flags
to say "0 = server default" instead of advertising specific numeric
defaults that could drift from the server implementation.
- Validate chunkConcurrency and downloadConcurrency are within int32
range before narrowing, returning a user-facing error if out of range.
- Rename ReadRemoteErr to readRemoteErr to follow Go naming conventions.
- Add doc comment to SetChunkConcurrency noting it must be called
during initialization before replication goroutines start.
- Replace doubling loop in chunk size safety check with direct
ceil(remoteSize/1000) computation to guarantee the 1000-chunk cap.
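
A sketch of the int32-range validation mentioned above, assuming a helper of
this shape (uses only "fmt" and "math"; the real remote.cache command may
word the error differently):

    // validateConcurrency rejects values that would overflow when narrowed to
    // the int32 fields in the gRPC request. Name and wording are illustrative.
    func validateConcurrency(flagName string, v int) (int32, error) {
        if v < 0 || v > math.MaxInt32 {
            return 0, fmt.Errorf("-%s must be between 0 and %d, got %d", flagName, math.MaxInt32, v)
        }
        return int32(v), nil // 0 still means "use server default"
    }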
* address Copilot review: clamp concurrency, fix chunk count, clarify proto docs
- Use ceiling division for chunk count check to avoid overcounting
when file size is an exact multiple of chunk size.
- Clamp chunkConcurrency (max 1024) and downloadConcurrency (max 1024
at filer, max 64 at volume server) to prevent excessive goroutines.
- Always use ReadFileWithConcurrency when the client supports it,
falling back to the implementation's default when value is 0.
- Clarify proto comments that download_concurrency only applies when
the remote storage client supports it (currently S3).
- Include specific server defaults in help text (e.g., "0 = server
default 8") so users see the actual values in -h output.
* fix data race on executionErr and use %w for error wrapping
- Protect concurrent writes to executionErr in remote.cache worker
goroutines with a sync.Mutex to eliminate the data race.
- Use %w instead of %v in volume_grpc_remote.go error formatting
to preserve the error chain for errors.Is/errors.As callers.
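
An illustrative sketch of the data-race fix and the %w wrapping described
above (uses "fmt" and "sync"; cacheChunk is a placeholder, and the worker
pool limiting via -chunkConcurrency is omitted for brevity):

    // cacheAllChunks records the first worker failure under a mutex instead of
    // letting goroutines write executionErr concurrently.
    func cacheAllChunks(chunkCount int, cacheChunk func(i int) error) error {
        var (
            executionErr error
            errLock      sync.Mutex // protects executionErr across workers
            wg           sync.WaitGroup
        )
        for i := 0; i < chunkCount; i++ {
            wg.Add(1)
            go func(i int) {
                defer wg.Done()
                if err := cacheChunk(i); err != nil {
                    errLock.Lock()
                    if executionErr == nil {
                        // %w keeps the chain intact for errors.Is / errors.As.
                        executionErr = fmt.Errorf("cache chunk %d: %w", i, err)
                    }
                    errLock.Unlock()
                }
            }(i)
        }
        wg.Wait()
        return executionErr
    }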
Go · 389 lines · 11 KiB
package s3

import (
    "context"
    "fmt"
    "io"
    "net/http"
    "reflect"
    "strings"

    "github.com/aws/aws-sdk-go/aws"
    "github.com/aws/aws-sdk-go/aws/awserr"
    "github.com/aws/aws-sdk-go/aws/credentials"
    "github.com/aws/aws-sdk-go/aws/request"
    "github.com/aws/aws-sdk-go/aws/session"
    v4 "github.com/aws/aws-sdk-go/aws/signer/v4"
    "github.com/aws/aws-sdk-go/service/s3"
    "github.com/aws/aws-sdk-go/service/s3/s3iface"
    "github.com/aws/aws-sdk-go/service/s3/s3manager"
    "github.com/seaweedfs/seaweedfs/weed/filer"
    "github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
    "github.com/seaweedfs/seaweedfs/weed/pb/remote_pb"
    "github.com/seaweedfs/seaweedfs/weed/remote_storage"
    "github.com/seaweedfs/seaweedfs/weed/util"
    "github.com/seaweedfs/seaweedfs/weed/util/version"
)

func init() {
    remote_storage.RemoteStorageClientMakers["s3"] = new(s3RemoteStorageMaker)
}

type s3RemoteStorageMaker struct{}

func (s s3RemoteStorageMaker) HasBucket() bool {
    return true
}

func (s s3RemoteStorageMaker) Make(conf *remote_pb.RemoteConf) (remote_storage.RemoteStorageClient, error) {
    client := &s3RemoteStorageClient{
        supportTagging: true,
        conf:           conf,
    }
    config := &aws.Config{
        Region:                        aws.String(conf.S3Region),
        Endpoint:                      aws.String(conf.S3Endpoint),
        S3ForcePathStyle:              aws.Bool(conf.S3ForcePathStyle),
        S3DisableContentMD5Validation: aws.Bool(true),
    }
    if conf.S3AccessKey != "" && conf.S3SecretKey != "" {
        config.Credentials = credentials.NewStaticCredentials(conf.S3AccessKey, conf.S3SecretKey, "")
    } else if conf.S3AccessKey == "" && conf.S3SecretKey == "" {
        // Explicitly disable signing for public buckets.
        config.Credentials = credentials.AnonymousCredentials
    }

    sess, err := session.NewSession(config)
    if err != nil {
        return nil, fmt.Errorf("create aws session: %w", err)
    }
    if conf.S3V4Signature {
        sess.Handlers.Sign.PushBackNamed(v4.SignRequestHandler)
    }
    sess.Handlers.Build.PushBack(func(r *request.Request) {
        r.HTTPRequest.Header.Set("User-Agent", "SeaweedFS/"+version.VERSION_NUMBER)
    })
    sess.Handlers.Build.PushFront(skipSha256PayloadSigning)
    client.conn = s3.New(sess)
    return client, nil
}

type s3RemoteStorageClient struct {
    conf           *remote_pb.RemoteConf
    conn           s3iface.S3API
    supportTagging bool
}

var _ = remote_storage.RemoteStorageClient(&s3RemoteStorageClient{supportTagging: true})

func (s *s3RemoteStorageClient) Traverse(remote *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) (err error) {

    pathKey := remote.Path[1:]

    listInput := &s3.ListObjectsV2Input{
        Bucket:              aws.String(remote.Bucket),
        ContinuationToken:   nil,
        Delimiter:           nil, // not aws.String("/"), iterate through all entries
        EncodingType:        nil,
        ExpectedBucketOwner: nil,
        FetchOwner:          nil,
        MaxKeys:             nil, // aws.Int64(1000),
        Prefix:              aws.String(pathKey),
        RequestPayer:        nil,
        StartAfter:          nil,
    }
    isLastPage := false
    for !isLastPage && err == nil {
        var localErr error
        listErr := s.conn.ListObjectsV2Pages(listInput, func(page *s3.ListObjectsV2Output, lastPage bool) bool {
            for _, content := range page.Contents {
                key := *content.Key
                key = "/" + key
                dir, name := util.FullPath(key).DirAndName()
                remoteEntry := &filer_pb.RemoteEntry{
                    StorageName: s.conf.Name,
                }
                if content.LastModified != nil {
                    remoteEntry.RemoteMtime = content.LastModified.Unix()
                }
                if content.Size != nil {
                    remoteEntry.RemoteSize = *content.Size
                }
                if content.ETag != nil {
                    remoteEntry.RemoteETag = *content.ETag
                }
                if err := visitFn(dir, name, false, remoteEntry); err != nil {
                    localErr = err
                    return false
                }
            }
            listInput.ContinuationToken = page.NextContinuationToken
            isLastPage = lastPage
            return true
        })
        if listErr != nil {
            err = fmt.Errorf("list %v: %w", remote, listErr)
        }
        if localErr != nil {
            err = fmt.Errorf("process %v: %w", remote, localErr)
        }
    }
    return
}

func (s *s3RemoteStorageClient) ListDirectory(ctx context.Context, loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) error {
    pathKey := loc.Path[1:]
    if pathKey != "" && !strings.HasSuffix(pathKey, "/") {
        pathKey += "/"
    }

    listInput := &s3.ListObjectsV2Input{
        Bucket:    aws.String(loc.Bucket),
        Prefix:    aws.String(pathKey),
        Delimiter: aws.String("/"),
    }

    var localErr error
    listErr := s.conn.ListObjectsV2PagesWithContext(ctx, listInput, func(page *s3.ListObjectsV2Output, lastPage bool) bool {
        for _, prefix := range page.CommonPrefixes {
            if prefix.Prefix == nil {
                continue
            }
            dirKey := "/" + strings.TrimSuffix(*prefix.Prefix, "/")
            dir, name := util.FullPath(dirKey).DirAndName()
            if err := visitFn(dir, name, true, nil); err != nil {
                localErr = err
                return false
            }
        }
        for _, content := range page.Contents {
            key := "/" + *content.Key
            if strings.HasSuffix(key, "/") {
                continue // skip directory markers
            }
            dir, name := util.FullPath(key).DirAndName()
            remoteEntry := &filer_pb.RemoteEntry{
                StorageName: s.conf.Name,
            }
            if content.LastModified != nil {
                remoteEntry.RemoteMtime = content.LastModified.Unix()
            }
            if content.Size != nil {
                remoteEntry.RemoteSize = *content.Size
            }
            if content.ETag != nil {
                remoteEntry.RemoteETag = *content.ETag
            }
            if err := visitFn(dir, name, false, remoteEntry); err != nil {
                localErr = err
                return false
            }
        }
        return true
    })
    if listErr != nil {
        return fmt.Errorf("list directory %v: %w", loc, listErr)
    }
    if localErr != nil {
        return fmt.Errorf("process directory %v: %w", loc, localErr)
    }
    return nil
}

func (s *s3RemoteStorageClient) StatFile(loc *remote_pb.RemoteStorageLocation) (remoteEntry *filer_pb.RemoteEntry, err error) {
    resp, err := s.conn.HeadObject(&s3.HeadObjectInput{
        Bucket: aws.String(loc.Bucket),
        Key:    aws.String(loc.Path[1:]),
    })
    if err != nil {
        if reqErr, ok := err.(awserr.RequestFailure); ok && reqErr.StatusCode() == http.StatusNotFound {
            return nil, remote_storage.ErrRemoteObjectNotFound
        }
        return nil, fmt.Errorf("stat s3 %s%s: %w", loc.Bucket, loc.Path, err)
    }
    remoteEntry = &filer_pb.RemoteEntry{
        StorageName: s.conf.Name,
    }
    if resp.ContentLength != nil {
        remoteEntry.RemoteSize = *resp.ContentLength
    }
    if resp.LastModified != nil {
        remoteEntry.RemoteMtime = resp.LastModified.Unix()
    }
    if resp.ETag != nil {
        remoteEntry.RemoteETag = *resp.ETag
    }
    return remoteEntry, nil
}

func (s *s3RemoteStorageClient) ReadFile(loc *remote_pb.RemoteStorageLocation, offset int64, size int64) (data []byte, err error) {
    return s.ReadFileWithConcurrency(loc, offset, size, 5)
}

func (s *s3RemoteStorageClient) ReadFileWithConcurrency(loc *remote_pb.RemoteStorageLocation, offset int64, size int64, concurrency int) (data []byte, err error) {
    if concurrency <= 0 {
        concurrency = 5
    }
    downloader := s3manager.NewDownloaderWithClient(s.conn, func(u *s3manager.Downloader) {
        u.PartSize = int64(4 * 1024 * 1024)
        u.Concurrency = concurrency
    })

    dataSlice := make([]byte, int(size))
    writerAt := aws.NewWriteAtBuffer(dataSlice)

    _, err = downloader.Download(writerAt, &s3.GetObjectInput{
        Bucket: aws.String(loc.Bucket),
        Key:    aws.String(loc.Path[1:]),
        Range:  aws.String(fmt.Sprintf("bytes=%d-%d", offset, offset+size-1)),
    })
    if err != nil {
        return nil, fmt.Errorf("failed to download file %s%s: %v", loc.Bucket, loc.Path, err)
    }

    return writerAt.Bytes(), nil
}

func (s *s3RemoteStorageClient) WriteDirectory(loc *remote_pb.RemoteStorageLocation, entry *filer_pb.Entry) (err error) {
    return nil
}

func (s *s3RemoteStorageClient) RemoveDirectory(loc *remote_pb.RemoteStorageLocation) (err error) {
    return nil
}

func (s *s3RemoteStorageClient) WriteFile(loc *remote_pb.RemoteStorageLocation, entry *filer_pb.Entry, reader io.Reader) (remoteEntry *filer_pb.RemoteEntry, err error) {

    fileSize := int64(filer.FileSize(entry))

    partSize := int64(8 * 1024 * 1024) // The minimum/default allowed part size is 5MB
    for partSize*1000 < fileSize {
        partSize *= 4
    }

    // Create an uploader with the session and custom options
    uploader := s3manager.NewUploaderWithClient(s.conn, func(u *s3manager.Uploader) {
        u.PartSize = partSize
        u.Concurrency = 1
    })

    // process tagging
    tags := ""
    var awsTags *string
    // openstack swift doesn't support s3 object tagging
    if s.conf.S3SupportTagging {
        for k, v := range entry.Extended {
            if len(tags) > 0 {
                tags = tags + "&"
            }
            tags = tags + k + "=" + string(v)
        }
        awsTags = aws.String(tags)
    }

    // Upload the file to S3.
    uploadInput := &s3manager.UploadInput{
        Bucket:  aws.String(loc.Bucket),
        Key:     aws.String(loc.Path[1:]),
        Body:    reader,
        Tagging: awsTags,
    }
    if s.conf.S3StorageClass != "" {
        uploadInput.StorageClass = aws.String(s.conf.S3StorageClass)
    }
    _, err = uploader.Upload(uploadInput)

    // in case it fails to upload
    if err != nil {
        return nil, fmt.Errorf("upload to %s/%s%s: %v", loc.Name, loc.Bucket, loc.Path, err)
    }

    // read back the remote entry
    return s.readFileRemoteEntry(loc)
}

func toTagging(attributes map[string][]byte) *s3.Tagging {
    tagging := &s3.Tagging{}
    for k, v := range attributes {
        tagging.TagSet = append(tagging.TagSet, &s3.Tag{
            Key:   aws.String(k),
            Value: aws.String(string(v)),
        })
    }
    return tagging
}

func (s *s3RemoteStorageClient) readFileRemoteEntry(loc *remote_pb.RemoteStorageLocation) (*filer_pb.RemoteEntry, error) {
    return s.StatFile(loc)
}

func (s *s3RemoteStorageClient) UpdateFileMetadata(loc *remote_pb.RemoteStorageLocation, oldEntry *filer_pb.Entry, newEntry *filer_pb.Entry) (err error) {
    if reflect.DeepEqual(oldEntry.Extended, newEntry.Extended) {
        return nil
    }
    tagging := toTagging(newEntry.Extended)
    if len(tagging.TagSet) > 0 {
        _, err = s.conn.PutObjectTagging(&s3.PutObjectTaggingInput{
            Bucket:  aws.String(loc.Bucket),
            Key:     aws.String(loc.Path[1:]),
            Tagging: toTagging(newEntry.Extended),
        })
    } else {
        _, err = s.conn.DeleteObjectTagging(&s3.DeleteObjectTaggingInput{
            Bucket: aws.String(loc.Bucket),
            Key:    aws.String(loc.Path[1:]),
        })
    }
    return
}

func (s *s3RemoteStorageClient) DeleteFile(loc *remote_pb.RemoteStorageLocation) (err error) {
    _, err = s.conn.DeleteObject(&s3.DeleteObjectInput{
        Bucket: aws.String(loc.Bucket),
        Key:    aws.String(loc.Path[1:]),
    })
    return
}

func (s *s3RemoteStorageClient) ListBuckets() (buckets []*remote_storage.Bucket, err error) {
    resp, err := s.conn.ListBuckets(&s3.ListBucketsInput{})
    if err != nil {
        return nil, fmt.Errorf("list buckets: %w", err)
    }
    for _, b := range resp.Buckets {
        buckets = append(buckets, &remote_storage.Bucket{
            Name:      *b.Name,
            CreatedAt: *b.CreationDate,
        })
    }
    return
}

func (s *s3RemoteStorageClient) CreateBucket(name string) (err error) {
    _, err = s.conn.CreateBucket(&s3.CreateBucketInput{
        ACL:                        nil,
        Bucket:                     aws.String(name),
        CreateBucketConfiguration:  nil,
        GrantFullControl:           nil,
        GrantRead:                  nil,
        GrantReadACP:               nil,
        GrantWrite:                 nil,
        GrantWriteACP:              nil,
        ObjectLockEnabledForBucket: nil,
    })
    if err != nil {
        return fmt.Errorf("%s create bucket %s: %v", s.conf.Name, name, err)
    }
    return
}

func (s *s3RemoteStorageClient) DeleteBucket(name string) (err error) {
    _, err = s.conn.DeleteBucket(&s3.DeleteBucketInput{
        Bucket: aws.String(name),
    })
    if err != nil {
        return fmt.Errorf("delete bucket %s: %v", name, err)
    }
    return
}