Fix chown Input/output error on large file sets (#7996)

* Fix chown Input/output error on large file sets (Fixes #7911)

Implemented retry logic for MySQL/MariaDB backend to handle transient errors like deadlocks and timeouts.

* Fix syntax error: missing closing brace

* Refactor: Use %w for error wrapping and errors.As for extraction

* Fix: Disable retry logic inside transactions
This commit is contained in:
Chris Lu
2026-01-09 18:02:59 -08:00
committed by GitHub
parent 88e9e2c471
commit 379c032868
4 changed files with 216 additions and 92 deletions

View File

@@ -32,6 +32,7 @@ type AbstractSqlStore struct {
SupportBucketTable bool SupportBucketTable bool
dbs map[string]bool dbs map[string]bool
dbsLock sync.Mutex dbsLock sync.Mutex
RetryableErrorCallback func(err error) bool
} }
var _ filer.BucketAware = (*AbstractSqlStore)(nil) var _ filer.BucketAware = (*AbstractSqlStore)(nil)
@@ -151,15 +152,18 @@ func (store *AbstractSqlStore) getTxOrDB(ctx context.Context, fullpath util.Full
func (store *AbstractSqlStore) InsertEntry(ctx context.Context, entry *filer.Entry) (err error) { func (store *AbstractSqlStore) InsertEntry(ctx context.Context, entry *filer.Entry) (err error) {
// define the work to be done
var doInsert func() error
doInsert = func() error {
db, bucket, shortPath, err := store.getTxOrDB(ctx, entry.FullPath, false) db, bucket, shortPath, err := store.getTxOrDB(ctx, entry.FullPath, false)
if err != nil { if err != nil {
return fmt.Errorf("findDB %s : %v", entry.FullPath, err) return fmt.Errorf("findDB %s : %w", entry.FullPath, err)
} }
dir, name := shortPath.DirAndName() dir, name := shortPath.DirAndName()
meta, err := entry.EncodeAttributesAndChunks() meta, err := entry.EncodeAttributesAndChunks()
if err != nil { if err != nil {
return fmt.Errorf("encode %s: %s", entry.FullPath, err) return fmt.Errorf("encode %s: %w", entry.FullPath, err)
} }
if len(entry.GetChunks()) > filer.CountEntryChunksForGzip { if len(entry.GetChunks()) > filer.CountEntryChunksForGzip {
@@ -174,42 +178,62 @@ func (store *AbstractSqlStore) InsertEntry(ctx context.Context, entry *filer.Ent
res, err = db.ExecContext(ctx, store.GetSqlUpdate(bucket), meta, util.HashStringToLong(dir), name, dir) res, err = db.ExecContext(ctx, store.GetSqlUpdate(bucket), meta, util.HashStringToLong(dir), name, dir)
} }
if err != nil { if err != nil {
return fmt.Errorf("%s %s: %s", sqlInsert, entry.FullPath, err) return fmt.Errorf("%s %s: %w", sqlInsert, entry.FullPath, err)
} }
_, err = res.RowsAffected() _, err = res.RowsAffected()
if err != nil { if err != nil {
return fmt.Errorf("%s %s but no rows affected: %s", sqlInsert, entry.FullPath, err) return fmt.Errorf("%s %s but no rows affected: %w", sqlInsert, entry.FullPath, err)
} }
return nil return nil
} }
if store.RetryableErrorCallback != nil {
if ctx.Value("tx") != nil {
return doInsert()
}
return util.RetryUntil("InsertEntry", doInsert, store.RetryableErrorCallback)
}
return doInsert()
}
func (store *AbstractSqlStore) UpdateEntry(ctx context.Context, entry *filer.Entry) (err error) { func (store *AbstractSqlStore) UpdateEntry(ctx context.Context, entry *filer.Entry) (err error) {
var doUpdate func() error
doUpdate = func() error {
db, bucket, shortPath, err := store.getTxOrDB(ctx, entry.FullPath, false) db, bucket, shortPath, err := store.getTxOrDB(ctx, entry.FullPath, false)
if err != nil { if err != nil {
return fmt.Errorf("findDB %s : %v", entry.FullPath, err) return fmt.Errorf("findDB %s : %w", entry.FullPath, err)
} }
dir, name := shortPath.DirAndName() dir, name := shortPath.DirAndName()
meta, err := entry.EncodeAttributesAndChunks() meta, err := entry.EncodeAttributesAndChunks()
if err != nil { if err != nil {
return fmt.Errorf("encode %s: %s", entry.FullPath, err) return fmt.Errorf("encode %s: %w", entry.FullPath, err)
} }
res, err := db.ExecContext(ctx, store.GetSqlUpdate(bucket), meta, util.HashStringToLong(dir), name, dir) res, err := db.ExecContext(ctx, store.GetSqlUpdate(bucket), meta, util.HashStringToLong(dir), name, dir)
if err != nil { if err != nil {
return fmt.Errorf("update %s: %s", entry.FullPath, err) return fmt.Errorf("update %s: %w", entry.FullPath, err)
} }
_, err = res.RowsAffected() _, err = res.RowsAffected()
if err != nil { if err != nil {
return fmt.Errorf("update %s but no rows affected: %s", entry.FullPath, err) return fmt.Errorf("update %s but no rows affected: %w", entry.FullPath, err)
} }
return nil return nil
} }
if store.RetryableErrorCallback != nil {
if ctx.Value("tx") != nil {
return doUpdate()
}
return util.RetryUntil("UpdateEntry", doUpdate, store.RetryableErrorCallback)
}
return doUpdate()
}
func (store *AbstractSqlStore) FindEntry(ctx context.Context, fullpath util.FullPath) (*filer.Entry, error) { func (store *AbstractSqlStore) FindEntry(ctx context.Context, fullpath util.FullPath) (*filer.Entry, error) {
db, bucket, shortPath, err := store.getTxOrDB(ctx, fullpath, false) db, bucket, shortPath, err := store.getTxOrDB(ctx, fullpath, false)
@@ -240,31 +264,43 @@ func (store *AbstractSqlStore) FindEntry(ctx context.Context, fullpath util.Full
func (store *AbstractSqlStore) DeleteEntry(ctx context.Context, fullpath util.FullPath) error { func (store *AbstractSqlStore) DeleteEntry(ctx context.Context, fullpath util.FullPath) error {
var doDelete func() error
doDelete = func() error {
db, bucket, shortPath, err := store.getTxOrDB(ctx, fullpath, false) db, bucket, shortPath, err := store.getTxOrDB(ctx, fullpath, false)
if err != nil { if err != nil {
return fmt.Errorf("findDB %s : %v", fullpath, err) return fmt.Errorf("findDB %s : %w", fullpath, err)
} }
dir, name := shortPath.DirAndName() dir, name := shortPath.DirAndName()
res, err := db.ExecContext(ctx, store.GetSqlDelete(bucket), util.HashStringToLong(dir), name, dir) res, err := db.ExecContext(ctx, store.GetSqlDelete(bucket), util.HashStringToLong(dir), name, dir)
if err != nil { if err != nil {
return fmt.Errorf("delete %s: %s", fullpath, err) return fmt.Errorf("delete %s: %w", fullpath, err)
} }
_, err = res.RowsAffected() _, err = res.RowsAffected()
if err != nil { if err != nil {
return fmt.Errorf("delete %s but no rows affected: %s", fullpath, err) return fmt.Errorf("delete %s but no rows affected: %w", fullpath, err)
}
return nil
} }
return nil if store.RetryableErrorCallback != nil {
if ctx.Value("tx") != nil {
return doDelete()
}
return util.RetryUntil("DeleteEntry", doDelete, store.RetryableErrorCallback)
}
return doDelete()
} }
func (store *AbstractSqlStore) DeleteFolderChildren(ctx context.Context, fullpath util.FullPath) error { func (store *AbstractSqlStore) DeleteFolderChildren(ctx context.Context, fullpath util.FullPath) error {
var doDeleteFolderChildren func() error
doDeleteFolderChildren = func() error {
db, bucket, shortPath, err := store.getTxOrDB(ctx, fullpath, true) db, bucket, shortPath, err := store.getTxOrDB(ctx, fullpath, true)
if err != nil { if err != nil {
return fmt.Errorf("findDB %s : %v", fullpath, err) return fmt.Errorf("findDB %s : %w", fullpath, err)
} }
if isValidBucket(bucket) && shortPath == "/" { if isValidBucket(bucket) && shortPath == "/" {
@@ -281,16 +317,25 @@ func (store *AbstractSqlStore) DeleteFolderChildren(ctx context.Context, fullpat
glog.V(4).InfofCtx(ctx, "delete %s SQL %s %d", string(shortPath), store.GetSqlDeleteFolderChildren(bucket), util.HashStringToLong(string(shortPath))) glog.V(4).InfofCtx(ctx, "delete %s SQL %s %d", string(shortPath), store.GetSqlDeleteFolderChildren(bucket), util.HashStringToLong(string(shortPath)))
res, err := db.ExecContext(ctx, store.GetSqlDeleteFolderChildren(bucket), util.HashStringToLong(string(shortPath)), string(shortPath)) res, err := db.ExecContext(ctx, store.GetSqlDeleteFolderChildren(bucket), util.HashStringToLong(string(shortPath)), string(shortPath))
if err != nil { if err != nil {
return fmt.Errorf("deleteFolderChildren %s: %s", fullpath, err) return fmt.Errorf("deleteFolderChildren %s: %w", fullpath, err)
} }
_, err = res.RowsAffected() _, err = res.RowsAffected()
if err != nil { if err != nil {
return fmt.Errorf("deleteFolderChildren %s but no rows affected: %s", fullpath, err) return fmt.Errorf("deleteFolderChildren %s but no rows affected: %w", fullpath, err)
} }
return nil return nil
} }
if store.RetryableErrorCallback != nil {
if ctx.Value("tx") != nil {
return doDeleteFolderChildren()
}
return util.RetryUntil("DeleteFolderChildren", doDeleteFolderChildren, store.RetryableErrorCallback)
}
return doDeleteFolderChildren()
}
func (store *AbstractSqlStore) ListDirectoryPrefixedEntries(ctx context.Context, dirPath util.FullPath, startFileName string, includeStartFile bool, limit int64, prefix string, eachEntryFunc filer.ListEachEntryFunc) (lastFileName string, err error) { func (store *AbstractSqlStore) ListDirectoryPrefixedEntries(ctx context.Context, dirPath util.FullPath, startFileName string, includeStartFile bool, limit int64, prefix string, eachEntryFunc filer.ListEachEntryFunc) (lastFileName string, err error) {
db, bucket, shortPath, err := store.getTxOrDB(ctx, dirPath, true) db, bucket, shortPath, err := store.getTxOrDB(ctx, dirPath, true)

View File

@@ -4,6 +4,7 @@ import (
"crypto/tls" "crypto/tls"
"crypto/x509" "crypto/x509"
"database/sql" "database/sql"
"errors"
"fmt" "fmt"
"os" "os"
"strings" "strings"
@@ -67,6 +68,19 @@ func (store *MysqlStore) initialize(dsn string, upsertQuery string, enableUpsert
UpsertQueryTemplate: upsertQuery, UpsertQueryTemplate: upsertQuery,
} }
store.RetryableErrorCallback = func(err error) bool {
var mysqlError *mysql.MySQLError
if errors.As(err, &mysqlError) {
if mysqlError.Number == 1213 { // ER_LOCK_DEADLOCK
return true
}
if mysqlError.Number == 1205 { // ER_LOCK_WAIT_TIMEOUT
return true
}
}
return false
}
if enableTls { if enableTls {
rootCertPool := x509.NewCertPool() rootCertPool := x509.NewCertPool()
pem, err := os.ReadFile(caCrtDir) pem, err := os.ReadFile(caCrtDir)

View File

@@ -58,13 +58,13 @@ func MultiRetry(name string, errList []string, job func() error) (err error) {
} }
// RetryUntil retries until the job returns no error or onErrFn returns false // RetryUntil retries until the job returns no error or onErrFn returns false
func RetryUntil(name string, job func() error, onErrFn func(err error) (shouldContinue bool)) { func RetryUntil(name string, job func() error, onErrFn func(err error) (shouldContinue bool)) error {
waitTime := time.Second waitTime := time.Second
for { for {
err := job() err := job()
if err == nil { if err == nil {
waitTime = time.Second waitTime = time.Second
break return nil
} }
if onErrFn(err) { if onErrFn(err) {
if strings.Contains(err.Error(), "transport") || strings.Contains(err.Error(), "ResourceExhausted") || strings.Contains(err.Error(), "Unavailable") { if strings.Contains(err.Error(), "transport") || strings.Contains(err.Error(), "ResourceExhausted") || strings.Contains(err.Error(), "Unavailable") {
@@ -76,7 +76,7 @@ func RetryUntil(name string, job func() error, onErrFn func(err error) (shouldCo
} }
continue continue
} else { } else {
break return err
} }
} }
} }

65
weed/util/retry_test.go Normal file
View File

@@ -0,0 +1,65 @@
package util
import (
"errors"
"testing"
)
func TestRetryUntil(t *testing.T) {
// Test case 1: Function succeeds immediately
t.Run("SucceedsImmediately", func(t *testing.T) {
callCount := 0
err := RetryUntil("test", func() error {
callCount++
return nil
}, func(err error) bool {
return false
})
if err != nil {
t.Errorf("Expected no error, got %v", err)
}
if callCount != 1 {
t.Errorf("Expected 1 call, got %d", callCount)
}
})
// Test case 2: Function fails with retryable error, then succeeds
t.Run("SucceedsAfterRetry", func(t *testing.T) {
callCount := 0
err := RetryUntil("test", func() error {
callCount++
if callCount < 3 {
return errors.New("retryable error")
}
return nil
}, func(err error) bool {
return err.Error() == "retryable error"
})
if err != nil {
t.Errorf("Expected no error, got %v", err)
}
if callCount != 3 {
t.Errorf("Expected 3 calls, got %d", callCount)
}
})
// Test case 3: Function fails with non-retryable error
t.Run("FailsNonRetryable", func(t *testing.T) {
callCount := 0
err := RetryUntil("test", func() error {
callCount++
return errors.New("fatal error")
}, func(err error) bool {
return err.Error() == "retryable error"
})
if err == nil || err.Error() != "fatal error" {
t.Errorf("Expected 'fatal error', got %v", err)
}
if callCount != 1 {
t.Errorf("Expected 1 call, got %d", callCount)
}
})
}