fix: skip log files with deleted volumes in filer backup (#7692)
fix: skip log files with deleted volumes in filer backup (#3720) When filer.backup or filer.meta.backup resumes after being stopped, it may encounter persisted log files stored on volumes that have since been deleted (via volume.deleteEmpty -force). Previously, this caused the backup to get stuck in an infinite retry loop with 'volume X not found' errors. This fix catches 'volume not found' errors when reading log files and skips the problematic file instead of failing. The backup will now: - Log a warning about the missing volume - Skip the problematic log file - Continue with the next log file, allowing progress The VolumeNotFoundPattern regex was already defined but never used - this change puts it to use. Fixes #3720
This commit is contained in:
@@ -152,9 +152,21 @@ func (f *Filer) logFlushFunc(logBuffer *log_buffer.LogBuffer, startTime, stopTim
|
|||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
VolumeNotFoundPattern = regexp.MustCompile(`volume \d+? not found`)
|
volumeNotFoundPattern = regexp.MustCompile(`volume \d+? not found`)
|
||||||
|
chunkNotFoundPattern = regexp.MustCompile(`(urls not found|File Not Found)`)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// isChunkNotFoundError checks if the error indicates that a volume or chunk
|
||||||
|
// has been deleted and is no longer available. These errors can be skipped
|
||||||
|
// when reading persisted log files since the data is unrecoverable.
|
||||||
|
func isChunkNotFoundError(err error) bool {
|
||||||
|
if err == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
errMsg := err.Error()
|
||||||
|
return volumeNotFoundPattern.MatchString(errMsg) || chunkNotFoundPattern.MatchString(errMsg)
|
||||||
|
}
|
||||||
|
|
||||||
func (f *Filer) ReadPersistedLogBuffer(startPosition log_buffer.MessagePosition, stopTsNs int64, eachLogEntryFn log_buffer.EachLogEntryFuncType) (lastTsNs int64, isDone bool, err error) {
|
func (f *Filer) ReadPersistedLogBuffer(startPosition log_buffer.MessagePosition, stopTsNs int64, eachLogEntryFn log_buffer.EachLogEntryFuncType) (lastTsNs int64, isDone bool, err error) {
|
||||||
|
|
||||||
visitor, visitErr := f.collectPersistedLogBuffer(startPosition, stopTsNs)
|
visitor, visitErr := f.collectPersistedLogBuffer(startPosition, stopTsNs)
|
||||||
|
|||||||
@@ -265,11 +265,12 @@ func (c *LogFileEntryCollector) collectMore(v *OrderedLogVisitor) (err error) {
|
|||||||
// ----------
|
// ----------
|
||||||
|
|
||||||
type LogFileQueueIterator struct {
|
type LogFileQueueIterator struct {
|
||||||
q *util.Queue[*LogFileEntry]
|
q *util.Queue[*LogFileEntry]
|
||||||
masterClient *wdclient.MasterClient
|
masterClient *wdclient.MasterClient
|
||||||
startTsNs int64
|
startTsNs int64
|
||||||
stopTsNs int64
|
stopTsNs int64
|
||||||
currentFileIterator *LogFileIterator
|
pendingEntries []*filer_pb.LogEntry
|
||||||
|
pendingIndex int
|
||||||
}
|
}
|
||||||
|
|
||||||
func newLogFileQueueIterator(masterClient *wdclient.MasterClient, q *util.Queue[*LogFileEntry], startTsNs, stopTsNs int64) *LogFileQueueIterator {
|
func newLogFileQueueIterator(masterClient *wdclient.MasterClient, q *util.Queue[*LogFileEntry], startTsNs, stopTsNs int64) *LogFileQueueIterator {
|
||||||
@@ -284,13 +285,17 @@ func newLogFileQueueIterator(masterClient *wdclient.MasterClient, q *util.Queue[
|
|||||||
// getNext will return io.EOF when done
|
// getNext will return io.EOF when done
|
||||||
func (iter *LogFileQueueIterator) getNext(v *OrderedLogVisitor) (logEntry *filer_pb.LogEntry, err error) {
|
func (iter *LogFileQueueIterator) getNext(v *OrderedLogVisitor) (logEntry *filer_pb.LogEntry, err error) {
|
||||||
for {
|
for {
|
||||||
if iter.currentFileIterator != nil {
|
// return pending entries first
|
||||||
logEntry, err = iter.currentFileIterator.getNext()
|
if iter.pendingIndex < len(iter.pendingEntries) {
|
||||||
if err != io.EOF {
|
logEntry = iter.pendingEntries[iter.pendingIndex]
|
||||||
return
|
iter.pendingIndex++
|
||||||
}
|
return logEntry, nil
|
||||||
}
|
}
|
||||||
// now either iter.currentFileIterator is nil or err is io.EOF
|
// reset for next file
|
||||||
|
iter.pendingEntries = nil
|
||||||
|
iter.pendingIndex = 0
|
||||||
|
|
||||||
|
// read entries from next file
|
||||||
if iter.q.Len() == 0 {
|
if iter.q.Len() == 0 {
|
||||||
return nil, io.EOF
|
return nil, io.EOF
|
||||||
}
|
}
|
||||||
@@ -313,7 +318,38 @@ func (iter *LogFileQueueIterator) getNext(v *OrderedLogVisitor) (logEntry *filer
|
|||||||
if next != nil && next.TsNs <= iter.startTsNs {
|
if next != nil && next.TsNs <= iter.startTsNs {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
iter.currentFileIterator = newLogFileIterator(iter.masterClient, t.FileEntry, iter.startTsNs, iter.stopTsNs)
|
|
||||||
|
// read all entries from this file
|
||||||
|
iter.pendingEntries, err = iter.readFileEntries(t.FileEntry)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// readFileEntries reads all log entries from a single file
|
||||||
|
func (iter *LogFileQueueIterator) readFileEntries(fileEntry *Entry) (entries []*filer_pb.LogEntry, err error) {
|
||||||
|
fileIterator := newLogFileIterator(iter.masterClient, fileEntry, iter.startTsNs, iter.stopTsNs)
|
||||||
|
defer func() {
|
||||||
|
if closeErr := fileIterator.Close(); closeErr != nil && err == nil {
|
||||||
|
err = closeErr
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
for {
|
||||||
|
logEntry, err := fileIterator.getNext()
|
||||||
|
if err == io.EOF {
|
||||||
|
return entries, nil
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
if isChunkNotFoundError(err) {
|
||||||
|
// Volume or chunk was deleted, skip the rest of this log file
|
||||||
|
glog.Warningf("skipping rest of %s: %v", fileIterator.filePath, err)
|
||||||
|
return entries, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
entries = append(entries, logEntry)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -324,6 +360,7 @@ type LogFileIterator struct {
|
|||||||
sizeBuf []byte
|
sizeBuf []byte
|
||||||
startTsNs int64
|
startTsNs int64
|
||||||
stopTsNs int64
|
stopTsNs int64
|
||||||
|
filePath string
|
||||||
}
|
}
|
||||||
|
|
||||||
func newLogFileIterator(masterClient *wdclient.MasterClient, fileEntry *Entry, startTsNs, stopTsNs int64) *LogFileIterator {
|
func newLogFileIterator(masterClient *wdclient.MasterClient, fileEntry *Entry, startTsNs, stopTsNs int64) *LogFileIterator {
|
||||||
@@ -332,9 +369,17 @@ func newLogFileIterator(masterClient *wdclient.MasterClient, fileEntry *Entry, s
|
|||||||
sizeBuf: make([]byte, 4),
|
sizeBuf: make([]byte, 4),
|
||||||
startTsNs: startTsNs,
|
startTsNs: startTsNs,
|
||||||
stopTsNs: stopTsNs,
|
stopTsNs: stopTsNs,
|
||||||
|
filePath: string(fileEntry.FullPath),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (iter *LogFileIterator) Close() error {
|
||||||
|
if r, ok := iter.r.(io.Closer); ok {
|
||||||
|
return r.Close()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// getNext will return io.EOF when done
|
// getNext will return io.EOF when done
|
||||||
func (iter *LogFileIterator) getNext() (logEntry *filer_pb.LogEntry, err error) {
|
func (iter *LogFileIterator) getNext() (logEntry *filer_pb.LogEntry, err error) {
|
||||||
var n int
|
var n int
|
||||||
|
|||||||
@@ -245,6 +245,7 @@ type ChunkStreamReader struct {
|
|||||||
|
|
||||||
var _ = io.ReadSeeker(&ChunkStreamReader{})
|
var _ = io.ReadSeeker(&ChunkStreamReader{})
|
||||||
var _ = io.ReaderAt(&ChunkStreamReader{})
|
var _ = io.ReaderAt(&ChunkStreamReader{})
|
||||||
|
var _ = io.Closer(&ChunkStreamReader{})
|
||||||
|
|
||||||
func doNewChunkStreamReader(ctx context.Context, lookupFileIdFn wdclient.LookupFileIdFunctionType, chunks []*filer_pb.FileChunk) *ChunkStreamReader {
|
func doNewChunkStreamReader(ctx context.Context, lookupFileIdFn wdclient.LookupFileIdFunctionType, chunks []*filer_pb.FileChunk) *ChunkStreamReader {
|
||||||
|
|
||||||
@@ -403,8 +404,13 @@ func (c *ChunkStreamReader) fetchChunkToBuffer(chunkView *ChunkView) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *ChunkStreamReader) Close() {
|
func (c *ChunkStreamReader) Close() error {
|
||||||
// TODO try to release and reuse buffer
|
c.bufferLock.Lock()
|
||||||
|
defer c.bufferLock.Unlock()
|
||||||
|
c.buffer = nil
|
||||||
|
c.head = nil
|
||||||
|
c.chunkView = nil
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func VolumeId(fileId string) string {
|
func VolumeId(fileId string) string {
|
||||||
|
|||||||
Reference in New Issue
Block a user