filer.sync: show active chunk transfers when sync progress stalls
When the sync watermark is not advancing, print each in-progress chunk transfer with its file path, bytes received so far, and current status (downloading, uploading, or waiting with backoff duration). This helps diagnose which files are blocking progress during replication. Closes #8542
This commit is contained in:
@@ -359,6 +359,7 @@ func doSubscribeFilerMetaChanges(clientId int32, clientEpoch int32, sourceGrpcDi
|
|||||||
}
|
}
|
||||||
|
|
||||||
var lastLogTsNs = time.Now().UnixNano()
|
var lastLogTsNs = time.Now().UnixNano()
|
||||||
|
var lastProgressedTsNs int64
|
||||||
var clientName = fmt.Sprintf("syncFrom_%s_To_%s", string(sourceFiler), string(targetFiler))
|
var clientName = fmt.Sprintf("syncFrom_%s_To_%s", string(sourceFiler), string(targetFiler))
|
||||||
processEventFnWithOffset := pb.AddOffsetFunc(func(resp *filer_pb.SubscribeMetadataResponse) error {
|
processEventFnWithOffset := pb.AddOffsetFunc(func(resp *filer_pb.SubscribeMetadataResponse) error {
|
||||||
processor.AddSyncJob(resp)
|
processor.AddSyncJob(resp)
|
||||||
@@ -372,6 +373,13 @@ func doSubscribeFilerMetaChanges(clientId int32, clientEpoch int32, sourceGrpcDi
|
|||||||
now := time.Now().UnixNano()
|
now := time.Now().UnixNano()
|
||||||
glog.V(0).Infof("sync %s to %s progressed to %v %0.2f/sec", sourceFiler, targetFiler, time.Unix(0, offsetTsNs), float64(counter)/(float64(now-lastLogTsNs)/1e9))
|
glog.V(0).Infof("sync %s to %s progressed to %v %0.2f/sec", sourceFiler, targetFiler, time.Unix(0, offsetTsNs), float64(counter)/(float64(now-lastLogTsNs)/1e9))
|
||||||
lastLogTsNs = now
|
lastLogTsNs = now
|
||||||
|
if offsetTsNs == lastProgressedTsNs {
|
||||||
|
for _, t := range filerSink.ActiveTransfers() {
|
||||||
|
glog.V(0).Infof(" %s %s: %d bytes received, %s",
|
||||||
|
t.ChunkFileId, t.Path, t.BytesReceived, t.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lastProgressedTsNs = offsetTsNs
|
||||||
// collect synchronous offset
|
// collect synchronous offset
|
||||||
statsCollect.FilerSyncOffsetGauge.WithLabelValues(sourceFiler.String(), targetFiler.String(), clientName, sourcePath).Set(float64(offsetTsNs))
|
statsCollect.FilerSyncOffsetGauge.WithLabelValues(sourceFiler.String(), targetFiler.String(), clientName, sourcePath).Set(float64(offsetTsNs))
|
||||||
return setOffset(targetGrpcDialOption, targetFiler, getSignaturePrefixByPath(sourcePath), sourceFilerSignature, offsetTsNs)
|
return setOffset(targetGrpcDialOption, targetFiler, getSignaturePrefixByPath(sourcePath), sourceFilerSignature, offsetTsNs)
|
||||||
|
|||||||
@@ -241,6 +241,14 @@ func (fs *FilerSink) fetchAndWrite(sourceChunk *filer_pb.FileChunk, path string,
|
|||||||
return "", fmt.Errorf("upload data: %w", err)
|
return "", fmt.Errorf("upload data: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
transferStatus := &ChunkTransferStatus{
|
||||||
|
ChunkFileId: sourceChunk.GetFileIdString(),
|
||||||
|
Path: path,
|
||||||
|
Status: "downloading",
|
||||||
|
}
|
||||||
|
fs.activeTransfers.Store(sourceChunk.GetFileIdString(), transferStatus)
|
||||||
|
defer fs.activeTransfers.Delete(sourceChunk.GetFileIdString())
|
||||||
|
|
||||||
eofBackoff := time.Duration(0)
|
eofBackoff := time.Duration(0)
|
||||||
var partialData []byte
|
var partialData []byte
|
||||||
var savedFilename string
|
var savedFilename string
|
||||||
@@ -282,6 +290,9 @@ func (fs *FilerSink) fetchAndWrite(sourceChunk *filer_pb.FileChunk, path string,
|
|||||||
fullData = data
|
fullData = data
|
||||||
}
|
}
|
||||||
|
|
||||||
|
transferStatus.BytesReceived = int64(len(fullData))
|
||||||
|
transferStatus.Status = "uploading"
|
||||||
|
|
||||||
currentFileId, uploadResult, uploadErr, _ := uploader.UploadWithRetry(
|
currentFileId, uploadResult, uploadErr, _ := uploader.UploadWithRetry(
|
||||||
fs,
|
fs,
|
||||||
&filer_pb.AssignVolumeRequest{
|
&filer_pb.AssignVolumeRequest{
|
||||||
@@ -326,9 +337,12 @@ func (fs *FilerSink) fetchAndWrite(sourceChunk *filer_pb.FileChunk, path string,
|
|||||||
}
|
}
|
||||||
if isEofError(retryErr) {
|
if isEofError(retryErr) {
|
||||||
eofBackoff = nextEofBackoff(eofBackoff)
|
eofBackoff = nextEofBackoff(eofBackoff)
|
||||||
|
transferStatus.BytesReceived = int64(len(partialData))
|
||||||
|
transferStatus.Status = fmt.Sprintf("waiting %v", eofBackoff)
|
||||||
glog.V(0).Infof("source connection interrupted while replicating %s for %s (%d bytes received so far), backing off %v: %v",
|
glog.V(0).Infof("source connection interrupted while replicating %s for %s (%d bytes received so far), backing off %v: %v",
|
||||||
sourceChunk.GetFileIdString(), path, len(partialData), eofBackoff, retryErr)
|
sourceChunk.GetFileIdString(), path, len(partialData), eofBackoff, retryErr)
|
||||||
time.Sleep(eofBackoff)
|
time.Sleep(eofBackoff)
|
||||||
|
transferStatus.Status = "downloading"
|
||||||
} else {
|
} else {
|
||||||
glog.V(0).Infof("replicate %s for %s: %v", sourceChunk.GetFileIdString(), path, retryErr)
|
glog.V(0).Infof("replicate %s for %s: %v", sourceChunk.GetFileIdString(), path, retryErr)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/seaweedfs/seaweedfs/weed/pb"
|
"github.com/seaweedfs/seaweedfs/weed/pb"
|
||||||
"github.com/seaweedfs/seaweedfs/weed/wdclient"
|
"github.com/seaweedfs/seaweedfs/weed/wdclient"
|
||||||
@@ -20,6 +21,14 @@ import (
|
|||||||
"github.com/seaweedfs/seaweedfs/weed/util"
|
"github.com/seaweedfs/seaweedfs/weed/util"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// ChunkTransferStatus tracks the progress of a single chunk being replicated.
|
||||||
|
type ChunkTransferStatus struct {
|
||||||
|
ChunkFileId string
|
||||||
|
Path string
|
||||||
|
BytesReceived int64
|
||||||
|
Status string // "downloading", "uploading", or "waiting 10s" etc.
|
||||||
|
}
|
||||||
|
|
||||||
type FilerSink struct {
|
type FilerSink struct {
|
||||||
filerSource *source.FilerSource
|
filerSource *source.FilerSource
|
||||||
grpcAddress string
|
grpcAddress string
|
||||||
@@ -35,6 +44,7 @@ type FilerSink struct {
|
|||||||
isIncremental bool
|
isIncremental bool
|
||||||
executor *util.LimitedConcurrentExecutor
|
executor *util.LimitedConcurrentExecutor
|
||||||
signature int32
|
signature int32
|
||||||
|
activeTransfers sync.Map // chunkFileId -> *ChunkTransferStatus
|
||||||
}
|
}
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
@@ -101,6 +111,16 @@ func (fs *FilerSink) SetChunkConcurrency(concurrency int) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ActiveTransfers returns a snapshot of all in-progress chunk transfers.
|
||||||
|
func (fs *FilerSink) ActiveTransfers() []*ChunkTransferStatus {
|
||||||
|
var transfers []*ChunkTransferStatus
|
||||||
|
fs.activeTransfers.Range(func(key, value any) bool {
|
||||||
|
transfers = append(transfers, value.(*ChunkTransferStatus))
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
return transfers
|
||||||
|
}
|
||||||
|
|
||||||
func (fs *FilerSink) DeleteEntry(key string, isDirectory, deleteIncludeChunks bool, signatures []int32) error {
|
func (fs *FilerSink) DeleteEntry(key string, isDirectory, deleteIncludeChunks bool, signatures []int32) error {
|
||||||
|
|
||||||
dir, name := util.FullPath(key).DirAndName()
|
dir, name := util.FullPath(key).DirAndName()
|
||||||
|
|||||||
Reference in New Issue
Block a user