* filer.sync: add exponential backoff on unexpected EOF during replication When the source volume server drops connections under high traffic, filer.sync retries aggressively (every 1-6s), hammering the already overloaded source. This adds a longer exponential backoff (10s to 2min) specifically for "unexpected EOF" errors, reducing pressure on the source while still retrying indefinitely until success. Also adds more logging throughout the replication path: - Log source URL and error at V(0) when ReadPart or io.ReadAll fails - Log content-length and byte counts at V(4) on success - Log backoff duration in retry messages Fixes #8542 * filer.sync: extract backoff helper and fix 2-minute cap - Extract nextEofBackoff() and isEofError() helpers to deduplicate the backoff logic between fetchAndWrite and uploadManifestChunk - Fix the cap: previously 80s would double to 160s and pass the < 2min check uncapped. Now doubles first, then clamps to 2min. * filer.sync: log source URL instead of empty upload URL on read errors UploadUrl is not populated until after the reader is consumed, so the V(0) and V(4) logs were printing an empty string. Add SourceUrl field to UploadOption and populate it from the HTTP response in fetchAndWrite. * filer.sync: guard isEofError against nil error * filer.sync: use errors.Is for EOF detection, fix log wording - Replace broad substring matching ("read input", "unexpected EOF") with errors.Is(err, io.ErrUnexpectedEOF) and errors.Is(err, io.EOF) so only actual EOF errors trigger the longer backoff - Fix awkward log phrasing: "interrupted replicate" → "interrupted while replicating" * filer.sync: remove EOF backoff from uploadManifestChunk uploadManifestChunk reads from an in-memory bytes.Reader, so any EOF errors there are from the destination side, not a broken source stream. The long source-oriented backoff is inappropriate; let RetryUntil handle destination retries at its normal cadence. --------- Co-authored-by: Copilot <copilot@github.com>
420 lines
12 KiB
Go
420 lines
12 KiB
Go
package filersink
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/schollz/progressbar/v3"
|
|
"google.golang.org/protobuf/proto"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/filer"
|
|
"github.com/seaweedfs/seaweedfs/weed/util"
|
|
|
|
"google.golang.org/grpc"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/seaweedfs/seaweedfs/weed/operation"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
|
|
util_http "github.com/seaweedfs/seaweedfs/weed/util/http"
|
|
)
|
|
|
|
// replicateChunks copies every chunk in sourceChunks to the destination and
// returns the replicated chunks in the same order. Manifest chunks are
// handled synchronously (they recurse back into replicateChunks); regular
// chunks are replicated concurrently through fs.executor. The first error
// stops scheduling of further chunks and is returned.
func (fs *FilerSink) replicateChunks(ctx context.Context, sourceChunks []*filer_pb.FileChunk, path string, sourceMtime int64) (replicatedChunks []*filer_pb.FileChunk, err error) {
	if len(sourceChunks) == 0 {
		return
	}
	if ctx == nil {
		ctx = context.Background()
	}

	// a simple progress bar. Not ideal. Fix me.
	// Only created when there is more than one chunk to copy.
	var bar *progressbar.ProgressBar
	if len(sourceChunks) > 1 {
		name := filepath.Base(path)
		bar = progressbar.NewOptions64(int64(len(sourceChunks)),
			progressbar.OptionClearOnFinish(),
			progressbar.OptionOnCompletion(func() {
				fmt.Fprint(os.Stderr, "\n")
			}),
			progressbar.OptionFullWidth(),
			progressbar.OptionSetDescription(name),
		)
	}

	replicatedChunks = make([]*filer_pb.FileChunk, len(sourceChunks))
	// The named return err is shared with the worker goroutines below, so all
	// access goes through setError/hasError under errLock; only the first
	// error is kept.
	var errLock sync.Mutex
	setError := func(e error) {
		if e == nil {
			return
		}
		errLock.Lock()
		if err == nil {
			err = e
		}
		errLock.Unlock()
	}
	hasError := func() bool {
		errLock.Lock()
		defer errLock.Unlock()
		return err != nil
	}

	var wg sync.WaitGroup
	for chunkIndex, sourceChunk := range sourceChunks {
		// Stop scheduling new work once any chunk has failed.
		if hasError() {
			break
		}

		if sourceChunk.IsChunkManifest {
			// Manifest chunks fan out into their own recursive replication,
			// so they are processed inline rather than on the executor.
			replicatedChunk, replicateErr := fs.replicateOneManifestChunk(ctx, sourceChunk, path, sourceMtime)
			if replicateErr != nil {
				setError(replicateErr)
				break
			}
			replicatedChunks[chunkIndex] = replicatedChunk
			if bar != nil {
				bar.Add(1)
			}
			continue
		}

		wg.Add(1)
		// Per-iteration copies captured by the goroutine closure.
		index, source := chunkIndex, sourceChunk
		fs.executor.Execute(func() {
			defer wg.Done()
			var replicatedChunk *filer_pb.FileChunk
			retryErr := util.Retry("replicate chunks", func() error {
				chunk, e := fs.replicateOneChunk(source, path, sourceMtime)
				if e != nil {
					return e
				}
				replicatedChunk = chunk
				return nil
			})
			if retryErr != nil {
				setError(retryErr)
				return
			}

			// Each worker writes only its own slot, so no lock is needed here.
			replicatedChunks[index] = replicatedChunk
			if bar != nil {
				bar.Add(1)
			}
		})
	}
	wg.Wait()

	return
}
|
|
|
|
func (fs *FilerSink) replicateOneChunk(sourceChunk *filer_pb.FileChunk, path string, sourceMtime int64) (*filer_pb.FileChunk, error) {
|
|
|
|
fileId, err := fs.fetchAndWrite(sourceChunk, path, sourceMtime)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("copy %s: %v", sourceChunk.GetFileIdString(), err)
|
|
}
|
|
|
|
return &filer_pb.FileChunk{
|
|
FileId: fileId,
|
|
Offset: sourceChunk.Offset,
|
|
Size: sourceChunk.Size,
|
|
ModifiedTsNs: sourceChunk.ModifiedTsNs,
|
|
ETag: sourceChunk.ETag,
|
|
SourceFileId: sourceChunk.GetFileIdString(),
|
|
CipherKey: sourceChunk.CipherKey,
|
|
IsCompressed: sourceChunk.IsCompressed,
|
|
SseType: sourceChunk.SseType,
|
|
SseMetadata: sourceChunk.SseMetadata,
|
|
}, nil
|
|
}
|
|
|
|
// replicateOneManifestChunk replicates a manifest chunk: it resolves the
// manifest into its data chunks, replicates each data chunk, rebuilds the
// manifest to reference the replicated chunks, uploads the new manifest, and
// returns a chunk entry describing the uploaded manifest.
func (fs *FilerSink) replicateOneManifestChunk(ctx context.Context, sourceChunk *filer_pb.FileChunk, path string, sourceMtime int64) (*filer_pb.FileChunk, error) {
	resolvedChunks, err := filer.ResolveOneChunkManifest(ctx, fs.filerSource.LookupFileId, sourceChunk)
	if err != nil {
		return nil, fmt.Errorf("resolve manifest %s: %w", sourceChunk.GetFileIdString(), err)
	}

	replicatedResolvedChunks, err := fs.replicateChunks(ctx, resolvedChunks, path, sourceMtime)
	if err != nil {
		return nil, fmt.Errorf("replicate manifest data chunks %s: %w", sourceChunk.GetFileIdString(), err)
	}

	// Work on copies so BeforeEntrySerialization does not mutate the chunks
	// returned by replicateChunks.
	// NOTE(review): these are shallow value copies of protobuf messages;
	// proto.Clone is the documented-safe way to copy proto messages — confirm
	// shallow copy is sufficient for the fields BeforeEntrySerialization touches.
	manifestDataChunks := make([]*filer_pb.FileChunk, len(replicatedResolvedChunks))
	for i, chunk := range replicatedResolvedChunks {
		copied := *chunk
		manifestDataChunks[i] = &copied
	}
	filer_pb.BeforeEntrySerialization(manifestDataChunks)
	manifestData, err := proto.Marshal(&filer_pb.FileChunkManifest{
		Chunks: manifestDataChunks,
	})
	if err != nil {
		return nil, fmt.Errorf("marshal manifest %s: %w", sourceChunk.GetFileIdString(), err)
	}

	manifestFileId, err := fs.uploadManifestChunk(path, sourceMtime, sourceChunk.GetFileIdString(), manifestData)
	if err != nil {
		return nil, err
	}

	// The returned chunk keeps the source chunk's position and size but points
	// at the freshly uploaded manifest on the destination.
	return &filer_pb.FileChunk{
		FileId:          manifestFileId,
		Offset:          sourceChunk.Offset,
		Size:            sourceChunk.Size,
		ModifiedTsNs:    sourceChunk.ModifiedTsNs,
		ETag:            sourceChunk.ETag,
		SourceFileId:    sourceChunk.GetFileIdString(),
		IsChunkManifest: true,
	}, nil
}
|
|
|
|
func (fs *FilerSink) uploadManifestChunk(path string, sourceMtime int64, sourceFileId string, manifestData []byte) (fileId string, err error) {
|
|
uploader, err := operation.NewUploader()
|
|
if err != nil {
|
|
glog.V(0).Infof("upload manifest data %v: %v", sourceFileId, err)
|
|
return "", fmt.Errorf("upload manifest data: %w", err)
|
|
}
|
|
|
|
retryName := fmt.Sprintf("replicate manifest chunk %s", sourceFileId)
|
|
err = util.RetryUntil(retryName, func() error {
|
|
currentFileId, uploadResult, uploadErr, _ := uploader.UploadWithRetry(
|
|
fs,
|
|
&filer_pb.AssignVolumeRequest{
|
|
Count: 1,
|
|
Replication: fs.replication,
|
|
Collection: fs.collection,
|
|
TtlSec: fs.ttlSec,
|
|
DataCenter: fs.dataCenter,
|
|
DiskType: fs.diskType,
|
|
Path: path,
|
|
},
|
|
&operation.UploadOption{
|
|
Filename: "",
|
|
Cipher: false,
|
|
IsInputCompressed: false,
|
|
MimeType: "application/octet-stream",
|
|
PairMap: nil,
|
|
RetryForever: false,
|
|
},
|
|
func(host, fileId string) string {
|
|
return fs.buildUploadUrl(host, fileId)
|
|
},
|
|
bytes.NewReader(manifestData),
|
|
)
|
|
if uploadErr != nil {
|
|
return fmt.Errorf("upload manifest data: %w", uploadErr)
|
|
}
|
|
if uploadResult.Error != "" {
|
|
return fmt.Errorf("upload manifest result: %v", uploadResult.Error)
|
|
}
|
|
|
|
fileId = currentFileId
|
|
return nil
|
|
}, func(uploadErr error) (shouldContinue bool) {
|
|
if fs.hasSourceNewerVersion(path, sourceMtime) {
|
|
glog.V(1).Infof("skip retrying stale source manifest %s for %s: %v", sourceFileId, path, uploadErr)
|
|
return false
|
|
}
|
|
glog.V(0).Infof("replicate manifest %s for %s: %v", sourceFileId, path, uploadErr)
|
|
return true
|
|
})
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return fileId, nil
|
|
}
|
|
|
|
// fetchAndWrite streams one chunk from the source filer (via ReadPart)
// directly into an upload against the destination, retrying until it
// succeeds or the source entry is found to have a newer version. It returns
// the destination file id of the written chunk. EOF-style failures — which
// indicate the source volume server dropped the connection — get an extra,
// growing backoff so an overloaded source is not hammered.
func (fs *FilerSink) fetchAndWrite(sourceChunk *filer_pb.FileChunk, path string, sourceMtime int64) (fileId string, err error) {
	uploader, err := operation.NewUploader()
	if err != nil {
		glog.V(0).Infof("upload source data %v: %v", sourceChunk.GetFileIdString(), err)
		return "", fmt.Errorf("upload data: %w", err)
	}

	// Backoff applied only for EOF errors; grown by nextEofBackoff on each
	// such failure and cleared on success.
	eofBackoff := time.Duration(0)
	retryName := fmt.Sprintf("replicate chunk %s", sourceChunk.GetFileIdString())
	err = util.RetryUntil(retryName, func() error {
		filename, header, resp, readErr := fs.filerSource.ReadPart(sourceChunk.GetFileIdString())
		if readErr != nil {
			return fmt.Errorf("read part %s: %w", sourceChunk.GetFileIdString(), readErr)
		}
		// This defer runs at the end of each attempt (the closure), not at
		// the end of fetchAndWrite.
		defer util_http.CloseResponse(resp)

		// Capture the source URL for diagnostics; resp.Request may be absent
		// on some responses.
		sourceUrl := ""
		if resp.Request != nil && resp.Request.URL != nil {
			sourceUrl = resp.Request.URL.String()
		}

		currentFileId, uploadResult, uploadErr, _ := uploader.UploadWithRetry(
			fs,
			&filer_pb.AssignVolumeRequest{
				Count:       1,
				Replication: fs.replication,
				Collection:  fs.collection,
				TtlSec:      fs.ttlSec,
				DataCenter:  fs.dataCenter,
				DiskType:    fs.diskType,
				Path:        path,
			},
			&operation.UploadOption{
				Filename:          filename,
				Cipher:            false,
				IsInputCompressed: "gzip" == header.Get("Content-Encoding"),
				MimeType:          header.Get("Content-Type"),
				PairMap:           nil,
				RetryForever:      false,
				SourceUrl:         sourceUrl,
			},
			func(host, fileId string) string {
				fileUrl := fs.buildUploadUrl(host, fileId)
				glog.V(4).Infof("replicating %s to %s header:%+v", filename, fileUrl, header)
				return fileUrl
			},
			resp.Body,
		)
		if uploadErr != nil {
			return fmt.Errorf("upload data: %w", uploadErr)
		}
		if uploadResult.Error != "" {
			return fmt.Errorf("upload result: %v", uploadResult.Error)
		}

		// Success: reset the EOF backoff and publish the destination file id.
		eofBackoff = 0
		fileId = currentFileId
		return nil
	}, func(uploadErr error) (shouldContinue bool) {
		if fs.hasSourceNewerVersion(path, sourceMtime) {
			glog.V(1).Infof("skip retrying stale source %s for %s: %v", sourceChunk.GetFileIdString(), path, uploadErr)
			return false
		}
		if isEofError(uploadErr) {
			// The source dropped the connection mid-stream; sleep longer than
			// the normal retry cadence before the next attempt.
			eofBackoff = nextEofBackoff(eofBackoff)
			glog.V(0).Infof("source connection interrupted while replicating %s for %s, backing off %v: %v", sourceChunk.GetFileIdString(), path, eofBackoff, uploadErr)
			time.Sleep(eofBackoff)
		} else {
			glog.V(0).Infof("replicate %s for %s: %v", sourceChunk.GetFileIdString(), path, uploadErr)
		}
		return true
	})
	if err != nil {
		return "", err
	}

	return fileId, nil
}
|
|
|
|
const maxEofBackoff = 2 * time.Minute

// nextEofBackoff returns the next backoff duration for unexpected EOF errors.
// It starts at 10s, doubles each time, and caps at 2 minutes.
func nextEofBackoff(current time.Duration) time.Duration {
	const initialEofBackoff = 10 * time.Second
	if current < initialEofBackoff {
		return initialEofBackoff
	}
	doubled := 2 * current
	if doubled > maxEofBackoff {
		return maxEofBackoff
	}
	return doubled
}
|
|
|
|
func isEofError(err error) bool {
|
|
if err == nil {
|
|
return false
|
|
}
|
|
return errors.Is(err, io.ErrUnexpectedEOF) || errors.Is(err, io.EOF)
|
|
}
|
|
|
|
func (fs *FilerSink) buildUploadUrl(host, fileId string) string {
|
|
if fs.writeChunkByFiler {
|
|
return fmt.Sprintf("http://%s/?proxyChunkId=%s", fs.address, fileId)
|
|
}
|
|
return fmt.Sprintf("http://%s/%s", host, fileId)
|
|
}
|
|
|
|
func (fs *FilerSink) hasSourceNewerVersion(targetPath string, sourceMtime int64) bool {
|
|
if sourceMtime <= 0 || fs.filerSource == nil {
|
|
return false
|
|
}
|
|
|
|
sourcePath, ok := fs.targetPathToSourcePath(targetPath)
|
|
if !ok {
|
|
return false
|
|
}
|
|
|
|
sourceEntry, err := filer_pb.GetEntry(context.Background(), fs.filerSource, sourcePath)
|
|
if err != nil {
|
|
glog.V(1).Infof("lookup source entry %s: %v", sourcePath, err)
|
|
return false
|
|
}
|
|
if sourceEntry == nil {
|
|
glog.V(1).Infof("source entry %s no longer exists", sourcePath)
|
|
return true
|
|
}
|
|
|
|
return sourceEntry.Attributes != nil && sourceEntry.Attributes.Mtime > sourceMtime
|
|
}
|
|
|
|
func (fs *FilerSink) targetPathToSourcePath(targetPath string) (util.FullPath, bool) {
|
|
if fs.filerSource == nil {
|
|
return "", false
|
|
}
|
|
|
|
normalizePath := func(p string) string {
|
|
p = strings.TrimSuffix(p, "/")
|
|
if p == "" {
|
|
return "/"
|
|
}
|
|
return p
|
|
}
|
|
|
|
sourceRoot := normalizePath(fs.filerSource.Dir)
|
|
targetRoot := normalizePath(fs.dir)
|
|
targetPath = normalizePath(targetPath)
|
|
|
|
var relative string
|
|
switch {
|
|
case targetRoot == "/":
|
|
relative = strings.TrimPrefix(targetPath, "/")
|
|
case targetPath == targetRoot:
|
|
relative = ""
|
|
case strings.HasPrefix(targetPath, targetRoot+"/"):
|
|
relative = targetPath[len(targetRoot)+1:]
|
|
default:
|
|
return "", false
|
|
}
|
|
|
|
if relative == "" {
|
|
return util.FullPath(sourceRoot), true
|
|
}
|
|
return util.FullPath(sourceRoot).Child(relative), true
|
|
}
|
|
|
|
var _ = filer_pb.FilerClient(&FilerSink{})
|
|
|
|
func (fs *FilerSink) WithFilerClient(streamingMode bool, fn func(filer_pb.SeaweedFilerClient) error) error {
|
|
|
|
return pb.WithGrpcClient(streamingMode, fs.signature, func(grpcConnection *grpc.ClientConn) error {
|
|
client := filer_pb.NewSeaweedFilerClient(grpcConnection)
|
|
return fn(client)
|
|
}, fs.grpcAddress, false, fs.grpcDialOption)
|
|
|
|
}
|
|
|
|
// AdjustedUrl returns the volume location URL unchanged; FilerSink does not
// rewrite volume server addresses.
func (fs *FilerSink) AdjustedUrl(location *filer_pb.Location) string {
	return location.Url
}
|
|
|
|
// GetDataCenter returns the data center this sink is configured to write to.
func (fs *FilerSink) GetDataCenter() string {
	return fs.dataCenter
}
|