s3api: make conditional mutations atomic and AWS-compatible (#8802)

* s3api: serialize conditional write finalization

* s3api: add conditional delete mutation checks

* s3api: enforce destination conditions for copy

* s3api: revalidate multipart completion under lock

* s3api: rollback failed put finalization hooks

* s3api: report delete-marker version deletions

* s3api: fix copy destination versioning edge cases

* s3api: make versioned multipart completion idempotent

* test/s3: cover conditional mutation regressions

* s3api: rollback failed copy version finalization

* s3api: resolve suspended delete conditions via latest entry

* s3api: remove copy test null-version injection

* s3api: reject out-of-order multipart completions

* s3api: preserve multipart replay version metadata

* s3api: surface copy destination existence errors

* s3api: simplify delete condition target resolution

* test/s3: make conditional delete assertions order independent

* test/s3: add distributed lock gateway integration

* s3api: fail closed multipart versioned completion

* s3api: harden copy metadata and overwrite paths

* s3api: create delete markers for suspended deletes

* s3api: allow duplicate multipart completion parts
This commit is contained in:
Chris Lu
2026-03-27 19:22:26 -07:00
committed by GitHub
parent bf2a2d2538
commit 0adb78bc6b
19 changed files with 2545 additions and 688 deletions

View File

@@ -292,7 +292,7 @@ func (s3a *S3ApiServer) PutObjectHandler(w http.ResponseWriter, r *http.Request)
dataReader = mimeDetect(r, dataReader)
}
etag, errCode, sseMetadata := s3a.putToFiler(r, filePath, dataReader, bucket, 1)
etag, errCode, sseMetadata := s3a.putToFiler(r, filePath, dataReader, bucket, object, 1, nil)
if errCode != s3err.ErrNone {
s3err.WriteErrorResponse(w, r, errCode)
@@ -312,7 +312,42 @@ func (s3a *S3ApiServer) PutObjectHandler(w http.ResponseWriter, r *http.Request)
writeSuccessResponseEmpty(w, r)
}
func (s3a *S3ApiServer) putToFiler(r *http.Request, filePath string, dataReader io.Reader, bucket string, partNumber int) (etag string, code s3err.ErrorCode, sseMetadata SSEResponseMetadata) {
// withObjectWriteLock runs preconditionFn and then fn while holding the
// per-object write lock, when one is available. The lock is skipped (but the
// precondition is still checked) when the object key is empty, when no lock
// factory is configured, or when the factory returns nil — in all cases the
// precondition runs before fn, and fn only runs if the precondition passes.
func (s3a *S3ApiServer) withObjectWriteLock(bucket, object string, preconditionFn func() s3err.ErrorCode, fn func() s3err.ErrorCode) s3err.ErrorCode {
	// Acquire the short-lived object lock if we can; the deferred release
	// fires at function return, covering both the precondition and fn.
	if object != "" && s3a.newObjectWriteLock != nil {
		if lock := s3a.newObjectWriteLock(bucket, object); lock != nil {
			defer func() {
				if releaseErr := lock.StopShortLivedLock(); releaseErr != nil {
					glog.Warningf("withObjectWriteLock: failed to release lock for %s/%s: %v", bucket, object, releaseErr)
				}
			}()
		}
	}
	// Validate the precondition (if any) before performing the mutation.
	if preconditionFn != nil {
		if errCode := preconditionFn(); errCode != s3err.ErrNone {
			return errCode
		}
	}
	return fn()
}
func (s3a *S3ApiServer) putToFiler(r *http.Request, filePath string, dataReader io.Reader, bucket string, object string, partNumber int, afterCreate func(entry *filer_pb.Entry) s3err.ErrorCode) (etag string, code s3err.ErrorCode, sseMetadata SSEResponseMetadata) {
// NEW OPTIMIZATION: Write directly to volume servers, bypassing filer proxy
// This eliminates the filer proxy overhead for PUT operations
// Note: filePath is now passed directly instead of URL (no parsing needed)
@@ -598,12 +633,8 @@ func (s3a *S3ApiServer) putToFiler(r *http.Request, filePath string, dataReader
// Store ETag in Extended attribute for future retrieval (e.g. multipart parts)
entry.Extended[s3_constants.ExtETagKey] = []byte(etag)
// Set object owner
amzAccountId := r.Header.Get(s3_constants.AmzAccountId)
if amzAccountId != "" {
entry.Extended[s3_constants.ExtAmzOwnerKey] = []byte(amzAccountId)
glog.V(2).Infof("putToFiler: setting owner %s for object %s", amzAccountId, filePath)
}
// Set object owner according to bucket ownership settings.
s3a.setObjectOwnerFromRequest(r, bucket, entry)
// Set version ID if present
if versionIdHeader := r.Header.Get(s3_constants.ExtVersionIdKey); versionIdHeader != "" {
@@ -611,6 +642,16 @@ func (s3a *S3ApiServer) putToFiler(r *http.Request, filePath string, dataReader
glog.V(3).Infof("putToFiler: setting version ID %s for object %s", versionIdHeader, filePath)
}
for _, metadataHeader := range []string{
s3_constants.ExtObjectLockModeKey,
s3_constants.ExtRetentionUntilDateKey,
s3_constants.ExtLegalHoldKey,
} {
if value := r.Header.Get(metadataHeader); value != "" {
entry.Extended[metadataHeader] = []byte(value)
}
}
// Set TTL-based S3 expiry flag only if object has a TTL
if entry.Attributes.TtlSec > 0 {
entry.Extended[s3_constants.SeaweedFSExpiresS3] = []byte("true")
@@ -699,30 +740,57 @@ func (s3a *S3ApiServer) putToFiler(r *http.Request, filePath string, dataReader
// This matches the chunk upload behavior and prevents orphaned chunks
glog.V(3).Infof("putToFiler: About to create entry - dir=%s, name=%s, chunks=%d, extended keys=%d",
path.Dir(filePath), path.Base(filePath), len(entry.Chunks), len(entry.Extended))
createErr := s3a.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
req := &filer_pb.CreateEntryRequest{
Directory: path.Dir(filePath),
Entry: entry,
var createErr error
var rollbackErr error
entryCreated := false
preconditionFn := func() s3err.ErrorCode {
if object == "" {
return s3err.ErrNone
}
glog.V(3).Infof("putToFiler: Calling CreateEntry for %s", filePath)
if err := filer_pb.CreateEntry(context.Background(), client, req); err != nil {
glog.Errorf("putToFiler: CreateEntry returned error: %v", err)
return err
return s3a.checkConditionalHeaders(r, bucket, object)
}
createCode := s3a.withObjectWriteLock(bucket, object, preconditionFn, func() s3err.ErrorCode {
createErr = s3a.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
req := &filer_pb.CreateEntryRequest{
Directory: path.Dir(filePath),
Entry: entry,
}
glog.V(3).Infof("putToFiler: Calling CreateEntry for %s", filePath)
if err := filer_pb.CreateEntry(context.Background(), client, req); err != nil {
glog.Errorf("putToFiler: CreateEntry returned error: %v", err)
return err
}
return nil
})
if createErr != nil {
return filerErrorToS3Error(createErr)
}
return nil
entryCreated = true
if afterCreate != nil {
if afterCreateCode := afterCreate(entry); afterCreateCode != s3err.ErrNone {
rollbackErr = s3a.rmObject(path.Dir(filePath), path.Base(filePath), true, false)
if rollbackErr != nil {
glog.Errorf("putToFiler: failed to rollback created entry for %s after post-create error: %v", filePath, rollbackErr)
} else {
entryCreated = false
}
return afterCreateCode
}
}
return s3err.ErrNone
})
if createErr != nil {
glog.Errorf("putToFiler: failed to create entry for %s: %v", filePath, createErr)
if createCode != s3err.ErrNone {
if createErr != nil {
glog.Errorf("putToFiler: failed to create entry for %s: %v", filePath, createErr)
}
// CRITICAL: Cleanup orphaned chunks before returning error
// If CreateEntry fails, the uploaded chunks are orphaned and must be deleted
// to prevent resource leaks and wasted storage
if len(chunkResult.FileChunks) > 0 {
glog.Warningf("putToFiler: CreateEntry failed, attempting to cleanup %d orphaned chunks", len(chunkResult.FileChunks))
// If the entry was never created, the uploaded chunks are orphaned and must be deleted.
if !entryCreated && len(chunkResult.FileChunks) > 0 {
glog.Warningf("putToFiler: finalization failed, attempting to cleanup %d orphaned chunks", len(chunkResult.FileChunks))
s3a.deleteOrphanedChunks(chunkResult.FileChunks)
}
return "", filerErrorToS3Error(createErr), SSEResponseMetadata{}
return "", createCode, SSEResponseMetadata{}
}
glog.V(3).Infof("putToFiler: CreateEntry SUCCESS for %s", filePath)
@@ -982,7 +1050,7 @@ func (s3a *S3ApiServer) putSuspendedVersioningObject(r *http.Request, bucket, ob
}
// Upload the file using putToFiler - this will create the file with version metadata
etag, errCode, sseMetadata = s3a.putToFiler(r, filePath, body, bucket, 1)
etag, errCode, sseMetadata = s3a.putToFiler(r, filePath, body, bucket, normalizedObject, 1, nil)
if errCode != s3err.ErrNone {
glog.Errorf("putSuspendedVersioningObject: failed to upload object: %v", errCode)
return "", errCode, SSEResponseMetadata{}
@@ -1088,8 +1156,6 @@ func (s3a *S3ApiServer) putVersionedObject(r *http.Request, bucket, object strin
// We need to construct the object path relative to the bucket
versionObjectPath := normalizedObject + s3_constants.VersionsFolder + "/" + versionFileName
versionFilePath := s3a.toFilerPath(bucket, versionObjectPath)
bucketDir := s3a.bucketDir(bucket)
body := dataReader
if objectContentType == "" {
body = mimeDetect(r, body)
@@ -1097,71 +1163,55 @@ func (s3a *S3ApiServer) putVersionedObject(r *http.Request, bucket, object strin
glog.V(2).Infof("putVersionedObject: uploading %s/%s version %s to %s", bucket, object, versionId, versionFilePath)
etag, errCode, sseMetadata = s3a.putToFiler(r, versionFilePath, body, bucket, 1)
r.Header.Set(s3_constants.ExtVersionIdKey, versionId)
defer r.Header.Del(s3_constants.ExtVersionIdKey)
explicitMode := r.Header.Get(s3_constants.AmzObjectLockMode)
explicitRetainUntilDate := r.Header.Get(s3_constants.AmzObjectLockRetainUntilDate)
if explicitMode != "" {
r.Header.Set(s3_constants.ExtObjectLockModeKey, explicitMode)
defer r.Header.Del(s3_constants.ExtObjectLockModeKey)
}
if explicitRetainUntilDate != "" {
parsedTime, parseErr := time.Parse(time.RFC3339, explicitRetainUntilDate)
if parseErr != nil {
glog.Errorf("putVersionedObject: failed to parse retention until date: %v", parseErr)
return "", "", s3err.ErrInvalidRequest, SSEResponseMetadata{}
}
r.Header.Set(s3_constants.ExtRetentionUntilDateKey, strconv.FormatInt(parsedTime.Unix(), 10))
defer r.Header.Del(s3_constants.ExtRetentionUntilDateKey)
}
if legalHold := r.Header.Get(s3_constants.AmzObjectLockLegalHold); legalHold != "" {
r.Header.Set(s3_constants.ExtLegalHoldKey, legalHold)
defer r.Header.Del(s3_constants.ExtLegalHoldKey)
}
if explicitMode == "" && explicitRetainUntilDate == "" {
tempEntry := &filer_pb.Entry{Extended: make(map[string][]byte)}
if err := s3a.applyBucketDefaultRetention(bucket, tempEntry); err == nil {
if modeBytes, ok := tempEntry.Extended[s3_constants.ExtObjectLockModeKey]; ok {
r.Header.Set(s3_constants.ExtObjectLockModeKey, string(modeBytes))
defer r.Header.Del(s3_constants.ExtObjectLockModeKey)
}
if dateBytes, ok := tempEntry.Extended[s3_constants.ExtRetentionUntilDateKey]; ok {
r.Header.Set(s3_constants.ExtRetentionUntilDateKey, string(dateBytes))
defer r.Header.Del(s3_constants.ExtRetentionUntilDateKey)
}
}
}
etag, errCode, sseMetadata = s3a.putToFiler(r, versionFilePath, body, bucket, normalizedObject, 1, func(versionEntry *filer_pb.Entry) s3err.ErrorCode {
if err := s3a.updateLatestVersionInDirectory(bucket, normalizedObject, versionId, versionFileName, versionEntry); err != nil {
glog.Errorf("putVersionedObject: failed to update latest version in directory: %v", err)
return s3err.ErrInternalError
}
return s3err.ErrNone
})
if errCode != s3err.ErrNone {
glog.Errorf("putVersionedObject: failed to upload version: %v", errCode)
return "", "", errCode, SSEResponseMetadata{}
}
// Get the uploaded entry to add versioning metadata
// Use retry logic to handle filer consistency delays
var versionEntry *filer_pb.Entry
var err error
maxRetries := 8
for attempt := 1; attempt <= maxRetries; attempt++ {
versionEntry, err = s3a.getEntry(bucketDir, versionObjectPath)
if err == nil {
break
}
if attempt < maxRetries {
// Exponential backoff: 10ms, 20ms, 40ms, 80ms, 160ms, 320ms, 640ms
delay := time.Millisecond * time.Duration(10*(1<<(attempt-1)))
time.Sleep(delay)
}
}
if err != nil {
glog.Errorf("putVersionedObject: failed to get version entry after %d attempts: %v", maxRetries, err)
return "", "", s3err.ErrInternalError, SSEResponseMetadata{}
}
// Add versioning metadata to this version
if versionEntry.Extended == nil {
versionEntry.Extended = make(map[string][]byte)
}
versionEntry.Extended[s3_constants.ExtVersionIdKey] = []byte(versionId)
// Store ETag (unquoted) in Extended attribute
versionEntry.Extended[s3_constants.ExtETagKey] = []byte(etag)
// Set object owner for versioned objects
s3a.setObjectOwnerFromRequest(r, bucket, versionEntry)
// Extract and store object lock metadata from request headers
if err := s3a.extractObjectLockMetadataFromRequest(r, versionEntry); err != nil {
glog.Errorf("putVersionedObject: failed to extract object lock metadata: %v", err)
return "", "", s3err.ErrInvalidRequest, SSEResponseMetadata{}
}
// Update the version entry with metadata
err = s3a.mkFile(bucketDir, versionObjectPath, versionEntry.Chunks, func(updatedEntry *filer_pb.Entry) {
updatedEntry.Extended = versionEntry.Extended
updatedEntry.Attributes = versionEntry.Attributes
updatedEntry.Chunks = versionEntry.Chunks
})
if err != nil {
glog.Errorf("putVersionedObject: failed to update version metadata: %v", err)
return "", "", s3err.ErrInternalError, SSEResponseMetadata{}
}
// Update the .versions directory metadata to indicate this is the latest version
// Pass versionEntry to cache its metadata for single-scan list efficiency
err = s3a.updateLatestVersionInDirectory(bucket, normalizedObject, versionId, versionFileName, versionEntry)
if err != nil {
glog.Errorf("putVersionedObject: failed to update latest version in directory: %v", err)
return "", "", s3err.ErrInternalError, SSEResponseMetadata{}
}
glog.V(2).Infof("putVersionedObject: successfully created version %s for %s/%s (normalized: %s)", versionId, bucket, object, normalizedObject)
return versionId, etag, s3err.ErrNone, sseMetadata
}
@@ -1685,12 +1735,25 @@ func (s3a *S3ApiServer) etagMatches(headerValue, objectETag string) bool {
return false
}
// normalizeConditionalTargetEntry maps a delete-marker entry to nil so that
// conditional-header evaluation treats a delete marker the same as a missing
// object. Any other entry (including nil) is returned unchanged.
func normalizeConditionalTargetEntry(entry *filer_pb.Entry) *filer_pb.Entry {
	if entry == nil {
		return nil
	}
	// Indexing a nil map yields the zero value, so no explicit nil check on
	// Extended is needed before the lookup.
	if marker, exists := entry.Extended[s3_constants.ExtDeleteMarkerKey]; exists && string(marker) == "true" {
		return nil
	}
	return entry
}
// validateConditionalHeaders checks conditional headers against the provided entry
func (s3a *S3ApiServer) validateConditionalHeaders(r *http.Request, headers conditionalHeaders, entry *filer_pb.Entry, bucket, object string) s3err.ErrorCode {
if !headers.isSet {
return s3err.ErrNone
}
entry = normalizeConditionalTargetEntry(entry)
objectExists := entry != nil
// For PUT requests, all specified conditions must be met.
@@ -1812,7 +1875,7 @@ func (s3a *S3ApiServer) checkConditionalHeaders(r *http.Request, bucket, object
// This ensures we check conditions against the LATEST version, not a null version.
entry, err := s3a.resolveObjectEntry(bucket, object)
if err != nil {
if errors.Is(err, filer_pb.ErrNotFound) {
if errors.Is(err, filer_pb.ErrNotFound) || errors.Is(err, ErrDeleteMarker) {
entry = nil
} else {
glog.Errorf("checkConditionalHeaders: error resolving object entry for %s/%s: %v", bucket, object, err)
@@ -1828,6 +1891,7 @@ func (s3a *S3ApiServer) validateConditionalHeadersForReads(r *http.Request, head
return ConditionalHeaderResult{ErrorCode: s3err.ErrNone, Entry: entry}
}
entry = normalizeConditionalTargetEntry(entry)
objectExists := entry != nil
// If object doesn't exist, fail for If-Match and If-Unmodified-Since
@@ -1954,7 +2018,7 @@ func (s3a *S3ApiServer) checkConditionalHeadersForReads(r *http.Request, bucket,
// This ensures we check conditions against the LATEST version, not a null version.
entry, err := s3a.resolveObjectEntry(bucket, object)
if err != nil {
if errors.Is(err, filer_pb.ErrNotFound) {
if errors.Is(err, filer_pb.ErrNotFound) || errors.Is(err, ErrDeleteMarker) {
entry = nil
} else {
glog.Errorf("checkConditionalHeadersForReads: error resolving object entry for %s/%s: %v", bucket, object, err)