fix: keep metadata subscriptions progressing (#8730) (#8746)

* fix: keep metadata subscriptions progressing (#8730)

* test: cancel slow metadata writers with parent context

* filer: ignore missing persisted log chunks
Chris Lu
2026-03-23 15:26:54 -07:00
committed by GitHub
parent d5ee35c8df
commit 6bf654c25c
10 changed files with 368 additions and 27 deletions
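As a hedged sketch of the parent-context cancellation named in the second bullet: derive each writer's context from the subscriber's, so a disconnect cancels in-flight work. All identifiers below are illustrative stand-ins, not SeaweedFS APIs.

package main

import (
	"context"
	"fmt"
	"time"
)

// slowWrite simulates a metadata write that honors cancellation.
func slowWrite(ctx context.Context) error {
	select {
	case <-time.After(10 * time.Second): // the slow write itself
		return nil
	case <-ctx.Done(): // parent cancelled: abandon the write
		return ctx.Err()
	}
}

func main() {
	parent, cancel := context.WithCancel(context.Background())
	go func() {
		time.Sleep(100 * time.Millisecond)
		cancel() // subscriber disconnects; derived contexts see it
	}()
	fmt.Println(slowWrite(parent)) // prints "context canceled"
}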

View File

@@ -606,6 +606,22 @@ func (logBuffer *LogBuffer) invalidateAllDiskCacheChunks() {
 func (logBuffer *LogBuffer) GetEarliestTime() time.Time {
 	return logBuffer.startTime
 }
+
+func (logBuffer *LogBuffer) HasData() bool {
+	logBuffer.RLock()
+	defer logBuffer.RUnlock()
+	if logBuffer.pos > 0 {
+		return true
+	}
+	for _, buf := range logBuffer.prevBuffers.buffers {
+		if buf.size > 0 {
+			return true
+		}
+	}
+	return false
+}
+
 func (logBuffer *LogBuffer) GetEarliestPosition() MessagePosition {
 	return MessagePosition{
 		Time: logBuffer.startTime,
@@ -771,7 +787,9 @@ func (logBuffer *LogBuffer) ReadFromBuffer(lastReadPosition MessagePosition) (bu
 			glog.Errorf("ReadFromBuffer: buffer corruption in prevBuffer: %v", err)
 			return nil, -1, fmt.Errorf("%w: %v", ErrBufferCorrupted, err)
 		}
-		return copiedBytes(buf.buf[pos:buf.size]), buf.offset, nil
+		if pos < buf.size {
+			return copiedBytes(buf.buf[pos:buf.size]), buf.offset, nil
+		}
 		}
 	}
 	// If current buffer is not empty, return it
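The new guard (pos < buf.size) handles a boundary case: a reader whose position sits exactly at the end of a sealed previous buffer would otherwise be handed an empty slice at that buffer's offset and never advance to newer data. A minimal standalone sketch of the boundary, using plain slices rather than the package's types:

package main

import "fmt"

func main() {
	buf := []byte("abcdef")
	pos, size := 6, 6 // the reader already consumed every byte

	// buf[pos:size] is legal Go here, but it yields zero bytes, so an
	// unguarded return would keep serving an empty result at the same
	// offset forever.
	fmt.Printf("unguarded: %q (len %d)\n", buf[pos:size], len(buf[pos:size]))

	if pos < size {
		fmt.Printf("remaining: %q\n", buf[pos:size])
	} else {
		fmt.Println("fully consumed; fall through to the next buffer")
	}
}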

View File

@@ -1,6 +1,7 @@
 package log_buffer

 import (
+	"bytes"
 	"crypto/rand"
 	"fmt"
 	"io"
@@ -67,6 +68,48 @@ func TestNewLogBufferFirstBuffer(t *testing.T) {
 	}
 }

+func TestReadFromBufferTimestampBased_AfterFlushReturnsNewerData(t *testing.T) {
+	lb := NewLogBuffer("test", time.Hour, nil, nil, nil)
+	defer lb.ShutdownLogBuffer()
+
+	payload := bytes.Repeat([]byte("x"), 4096)
+	var sealed *MemBuffer
+	for i := 0; i < 5000; i++ {
+		if err := lb.AddDataToBuffer([]byte("k"), payload, int64(i+1)); err != nil {
+			t.Fatalf("AddDataToBuffer(%d): %v", i, err)
+		}
+		candidate := lb.prevBuffers.buffers[len(lb.prevBuffers.buffers)-1]
+		if candidate.size > 0 {
+			sealed = &MemBuffer{
+				size:      candidate.size,
+				startTime: candidate.startTime,
+				stopTime:  candidate.stopTime,
+				offset:    candidate.offset,
+			}
+			break
+		}
+	}
+	if sealed == nil {
+		t.Fatal("expected first buffer flush to produce a sealed buffer")
+	}
+
+	for i := 5000; i < 5100; i++ {
+		if err := lb.AddDataToBuffer([]byte("k"), payload, int64(i+1)); err != nil {
+			t.Fatalf("AddDataToBuffer(%d): %v", i, err)
+		}
+	}
+
+	buf, _, err := lb.ReadFromBuffer(NewMessagePosition(sealed.stopTime.UnixNano(), sealed.offset))
+	if err != nil {
+		t.Fatalf("ReadFromBuffer returned error: %v", err)
+	}
+	if buf == nil || buf.Len() == 0 {
+		t.Fatalf("expected newer data after the first sealed buffer, got %v", buf)
+	}
+}
+
 // TestReadFromBuffer_OldOffsetReturnsResumeFromDiskError tests that requesting an old offset
 // that has been flushed to disk properly returns ResumeFromDiskError instead of hanging forever.
 // This reproduces the bug where Schema Registry couldn't read the _schemas topic.

View File

@@ -77,6 +77,7 @@ func (logBuffer *LogBuffer) LoopProcessLogData(readerName string, startPosition
 		if err == ResumeFromDiskError {
 			// Try to read from disk if readFromDiskFn is available
 			if logBuffer.ReadFromDiskFn != nil {
+				prevReadPosition := lastReadPosition
 				lastReadPosition, isDone, err = logBuffer.ReadFromDiskFn(lastReadPosition, stopTsNs, eachLogDataFn)
 				if err != nil {
 					return lastReadPosition, isDone, err
@@ -84,6 +85,11 @@ func (logBuffer *LogBuffer) LoopProcessLogData(readerName string, startPosition
 				if isDone {
 					return lastReadPosition, isDone, nil
 				}
+				if lastReadPosition != prevReadPosition {
+					continue
+				}
+			} else if logBuffer.HasData() {
+				return lastReadPosition, isDone, ResumeFromDiskError
 			}
 			// CRITICAL: Check if client is still connected
@@ -261,6 +267,7 @@ func (logBuffer *LogBuffer) LoopProcessLogDataWithOffset(readerName string, star
 		if err == ResumeFromDiskError {
 			// Try to read from disk if readFromDiskFn is available
 			if logBuffer.ReadFromDiskFn != nil {
+				prevReadPosition := lastReadPosition
 				// Wrap eachLogDataFn to match the expected signature
 				diskReadFn := func(logEntry *filer_pb.LogEntry) (bool, error) {
 					return eachLogDataFn(logEntry, logEntry.Offset)
@@ -272,7 +279,11 @@ func (logBuffer *LogBuffer) LoopProcessLogDataWithOffset(readerName string, star
 				if isDone {
 					return lastReadPosition, isDone, nil
 				}
 				// Continue to next iteration after disk read
+				if lastReadPosition != prevReadPosition {
+					continue
+				}
+			} else if logBuffer.HasData() {
+				return lastReadPosition, isDone, ResumeFromDiskError
 			}
 			// CRITICAL: Check if client is still connected after disk read
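Both loops now follow the same progress rule: record the position before the disk read and continue only if it advanced; otherwise fall through to the client-liveness check rather than spinning on the same position. A condensed, self-contained sketch of that control flow (readFromDisk is a placeholder, not the package's ReadFromDiskFn):

package main

import "fmt"

// readLoop retries disk reads only while they move the position forward.
func readLoop(pos int64, readFromDisk func(int64) (int64, bool, error)) (int64, error) {
	for {
		prev := pos
		next, done, err := readFromDisk(pos)
		if err != nil || done {
			return next, err
		}
		pos = next
		if pos != prev {
			continue // made progress: go re-check the in-memory buffer
		}
		// No progress: return so the caller can run its liveness check
		// instead of busy-waiting on a disk that has nothing new.
		return pos, nil
	}
}

func main() {
	calls := 0
	fake := func(p int64) (int64, bool, error) {
		calls++
		if calls == 1 {
			return p + 10, false, nil // first read advances the position
		}
		return p, false, nil // second read finds nothing new
	}
	pos, _ := readLoop(0, fake)
	fmt.Println(pos, calls) // 10 2
}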

View File

@@ -307,6 +307,47 @@ func TestLoopProcessLogDataWithOffset_StopTime(t *testing.T) {
 	t.Logf("Loop correctly exited for past stopTsNs in %v (waitForDataFn called %d times)", elapsed, callCount)
 }

+func TestLoopProcessLogData_SlowConsumerFallsBehind(t *testing.T) {
+	flushFn := func(logBuffer *LogBuffer, startTime, stopTime time.Time, buf []byte, minOffset, maxOffset int64) {}
+	logBuffer := NewLogBuffer("test", 1*time.Minute, flushFn, nil, nil)
+	defer logBuffer.ShutdownLogBuffer()
+
+	baseTime := time.Now()
+	for i := 0; i < 1000; i++ {
+		ts := baseTime.Add(time.Duration(i) * time.Millisecond)
+		if err := logBuffer.AddDataToBuffer([]byte("key"), []byte("value"), ts.UnixNano()); err != nil {
+			t.Fatalf("AddDataToBuffer(%d): %v", i, err)
+		}
+	}
+
+	oldPosition := NewMessagePosition(baseTime.Add(-10*time.Second).UnixNano(), 1)
+	waitForDataFn := func() bool {
+		t.Errorf("waitForDataFn should not be called for a slow consumer that has fallen behind")
+		return false
+	}
+	eachLogEntryFn := func(logEntry *filer_pb.LogEntry) (bool, error) {
+		return false, nil
+	}
+
+	done := make(chan struct{})
+	var err error
+	go func() {
+		_, _, err = logBuffer.LoopProcessLogData("slow-consumer", oldPosition, 0, waitForDataFn, eachLogEntryFn)
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		if err != ResumeFromDiskError {
+			t.Fatalf("expected ResumeFromDiskError, got %v", err)
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("LoopProcessLogData blocked instead of returning ResumeFromDiskError")
+	}
+}
+
 // BenchmarkLoopProcessLogDataWithOffset_EmptyBuffer benchmarks the performance
 // of the loop with an empty buffer to ensure no busy-waiting
 func BenchmarkLoopProcessLogDataWithOffset_EmptyBuffer(b *testing.B) {

View File

@@ -32,7 +32,7 @@ func newSealedBuffers(size int) *SealedBuffers {
 }

 func (sbs *SealedBuffers) SealBuffer(startTime, stopTime time.Time, buf []byte, pos int, startOffset int64, endOffset int64) (newBuf []byte) {
-	oldMemBuffer := sbs.buffers[0]
+	oldBuf := sbs.buffers[0].buf
 	size := len(sbs.buffers)
 	for i := 0; i < size-1; i++ {
 		sbs.buffers[i].buf = sbs.buffers[i+1].buf
@@ -48,12 +48,12 @@ func (sbs *SealedBuffers) SealBuffer(startTime, stopTime time.Time, buf []byte,
 	sbs.buffers[size-1].stopTime = stopTime
 	sbs.buffers[size-1].startOffset = startOffset
 	sbs.buffers[size-1].offset = endOffset
-	return oldMemBuffer.buf
+	return oldBuf
 }

 func (mb *MemBuffer) locateByTs(lastReadTime time.Time) (pos int, err error) {
 	lastReadTs := lastReadTime.UnixNano()
-	for pos < len(mb.buf) {
+	for pos < mb.size {
 		size, t, readErr := readTs(mb.buf, pos)
 		if readErr != nil {
 			// Return error if buffer is corrupted
@@ -64,7 +64,7 @@ func (mb *MemBuffer) locateByTs(lastReadTime time.Time) (pos int, err error) {
 		}
 		pos += size + 4
 	}
-	return len(mb.buf), nil
+	return mb.size, nil
 }

 func (mb *MemBuffer) String() string {
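The locateByTs change matters because SealBuffer recycles backing arrays between generations: len(mb.buf) is the length of the reused array, while mb.size is the number of bytes valid in the current generation, so scanning to len(mb.buf) could parse leftover bytes from an earlier buffer. A small illustration with a simplified stand-in struct (not the package's MemBuffer):

package main

import "fmt"

// memBuffer mimics a recycled buffer: a reused backing slice plus the
// count of bytes that are valid in the current generation.
type memBuffer struct {
	buf  []byte
	size int
}

func main() {
	b := memBuffer{buf: make([]byte, 1024)}
	copy(b.buf, []byte("stale entries from the last generation"))

	// The buffer is reused: only 5 bytes are valid now, but the stale
	// tail is still physically present in the backing array.
	copy(b.buf, []byte("fresh"))
	b.size = 5

	fmt.Println(len(b.buf))            // 1024: includes stale/zero bytes
	fmt.Println(b.size)                // 5: the only safe scan bound
	fmt.Printf("%q\n", b.buf[:b.size]) // "fresh"
}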