filer.sync: show active chunk transfers when sync progress stalls (#8889)
* filer.sync: show active chunk transfers when sync progress stalls

  When the sync watermark is not advancing, print each in-progress chunk transfer with its file path, bytes received so far, and current status (downloading, uploading, or waiting with backoff duration). This helps diagnose which files are blocking progress during replication.

  Closes #8542

* filer.sync: include last error in stall diagnostics

* filer.sync: fix data races in ChunkTransferStatus

  Add sync.RWMutex to ChunkTransferStatus and lock around all field mutations in fetchAndWrite. ActiveTransfers now returns value copies under RLock so callers get immutable snapshots.
This commit is contained in:
@@ -241,6 +241,14 @@ func (fs *FilerSink) fetchAndWrite(sourceChunk *filer_pb.FileChunk, path string,
|
||||
return "", fmt.Errorf("upload data: %w", err)
|
||||
}
|
||||
|
||||
transferStatus := &ChunkTransferStatus{
|
||||
ChunkFileId: sourceChunk.GetFileIdString(),
|
||||
Path: path,
|
||||
Status: "downloading",
|
||||
}
|
||||
fs.activeTransfers.Store(sourceChunk.GetFileIdString(), transferStatus)
|
||||
defer fs.activeTransfers.Delete(sourceChunk.GetFileIdString())
|
||||
|
||||
eofBackoff := time.Duration(0)
|
||||
var partialData []byte
|
||||
var savedFilename string
|
||||
@@ -282,6 +290,11 @@ func (fs *FilerSink) fetchAndWrite(sourceChunk *filer_pb.FileChunk, path string,
|
||||
fullData = data
|
||||
}
|
||||
|
||||
transferStatus.mu.Lock()
|
||||
transferStatus.BytesReceived = int64(len(fullData))
|
||||
transferStatus.Status = "uploading"
|
||||
transferStatus.mu.Unlock()
|
||||
|
||||
currentFileId, uploadResult, uploadErr, _ := uploader.UploadWithRetry(
|
||||
fs,
|
||||
&filer_pb.AssignVolumeRequest{
|
||||
@@ -324,11 +337,21 @@ func (fs *FilerSink) fetchAndWrite(sourceChunk *filer_pb.FileChunk, path string,
|
||||
glog.V(1).Infof("skip retrying stale source %s for %s: %v", sourceChunk.GetFileIdString(), path, retryErr)
|
||||
return false
|
||||
}
|
||||
transferStatus.mu.Lock()
|
||||
transferStatus.LastErr = retryErr.Error()
|
||||
transferStatus.mu.Unlock()
|
||||
if isEofError(retryErr) {
|
||||
eofBackoff = nextEofBackoff(eofBackoff)
|
||||
transferStatus.mu.Lock()
|
||||
transferStatus.BytesReceived = int64(len(partialData))
|
||||
transferStatus.Status = fmt.Sprintf("waiting %v", eofBackoff)
|
||||
transferStatus.mu.Unlock()
|
||||
glog.V(0).Infof("source connection interrupted while replicating %s for %s (%d bytes received so far), backing off %v: %v",
|
||||
sourceChunk.GetFileIdString(), path, len(partialData), eofBackoff, retryErr)
|
||||
time.Sleep(eofBackoff)
|
||||
transferStatus.mu.Lock()
|
||||
transferStatus.Status = "downloading"
|
||||
transferStatus.mu.Unlock()
|
||||
} else {
|
||||
glog.V(0).Infof("replicate %s for %s: %v", sourceChunk.GetFileIdString(), path, retryErr)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user