fix: keep metadata subscriptions progressing (#8730) (#8746)

* fix: keep metadata subscriptions progressing (#8730)

* test: cancel slow metadata writers with parent context

* filer: ignore missing persisted log chunks
This commit is contained in:
Chris Lu
2026-03-23 15:26:54 -07:00
committed by GitHub
parent d5ee35c8df
commit 6bf654c25c
10 changed files with 368 additions and 27 deletions

View File

@@ -3,6 +3,7 @@ package metadata_subscribe
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"mime/multipart"
@@ -27,6 +28,8 @@ import (
"google.golang.org/grpc/credentials/insecure"
)
// slowConsumerMetadataPayloadSize is the size in bytes of the synthetic file
// payload written by the slow-consumer regression test.
const slowConsumerMetadataPayloadSize = 4096
// TestMetadataSubscribeBasic tests basic metadata subscription functionality
func TestMetadataSubscribeBasic(t *testing.T) {
if testing.Short() {
@@ -680,6 +683,108 @@ func TestMetadataSubscribeMillionUpdates(t *testing.T) {
})
}
// TestMetadataSubscribeSlowConsumerKeepsProgressing is a regression test
// verifying that a deliberately slow metadata subscriber keeps receiving
// events while a heavy write load is in progress, rather than stalling
// (per the commit title: "keep metadata subscriptions progressing").
func TestMetadataSubscribeSlowConsumerKeepsProgressing(t *testing.T) {
if testing.Short() {
t.Skip("Skipping integration test in short mode")
}
// Scratch directory for the embedded cluster's data; removed at teardown.
testDir, err := os.MkdirTemp("", "seaweedfs_slow_consumer_test_")
require.NoError(t, err)
defer os.RemoveAll(testDir)
// Parent context bounds the whole test, including writers and subscriber.
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
defer cancel()
cluster, err := startSeaweedFSCluster(ctx, testDir)
require.NoError(t, err)
defer cluster.Stop()
// Wait for the master (9333), volume (8080), and filer (8888) HTTP endpoints.
require.NoError(t, waitForHTTPServer("http://127.0.0.1:9333", 30*time.Second))
require.NoError(t, waitForHTTPServer("http://127.0.0.1:8080", 30*time.Second))
require.NoError(t, waitForHTTPServer("http://127.0.0.1:8888", 30*time.Second))
t.Logf("Cluster started for slow consumer regression test")
t.Run("single_filer_slow_consumer", func(t *testing.T) {
var receivedCount int64 // updated atomically by the subscriber callback
phaseOneEntries := 6000
phaseTwoEntries := 14000
totalEntries := phaseOneEntries + phaseTwoEntries
// The subscriber only needs to demonstrate steady progress, not full
// delivery, so the success bar is set below totalEntries.
minExpected := int64(12000)
errChan := make(chan error, 1)
subCtx, subCancel := context.WithCancel(ctx)
defer subCancel()
// Start a subscriber that sleeps 1ms per event, emulating a slow consumer.
go func() {
err := followMetadataSlowly(
subCtx,
"127.0.0.1:8888",
"/slow-consumer/",
time.Now().Add(-5*time.Second).UnixNano(),
time.Millisecond,
func(resp *filer_pb.SubscribeMetadataResponse) {
if resp.GetEventNotification() == nil {
return
}
// Count only file creations; directories are ignored.
entry := resp.GetEventNotification().GetNewEntry()
if entry == nil || entry.IsDirectory {
return
}
atomic.AddInt64(&receivedCount, 1)
},
)
// Cancellation at teardown is expected; any other error fails the test.
if err != nil && !errors.Is(err, context.Canceled) {
errChan <- err
}
}()
// Give the subscription time to establish before writing.
time.Sleep(2 * time.Second)
payload := bytes.Repeat([]byte("x"), slowConsumerMetadataPayloadSize)
startTime := time.Now()
require.NoError(t, createMetadataEntries(ctx, "127.0.0.1:8888", 0, phaseOneEntries, payload))
t.Logf("Created phase 1 with %d entries in %v", phaseOneEntries, time.Since(startTime))
// Pause between phases so the slow consumer falls behind the write front.
time.Sleep(2 * time.Second)
require.NoError(t, createMetadataEntries(ctx, "127.0.0.1:8888", phaseOneEntries, phaseTwoEntries, payload))
t.Logf("Created phase 2 with %d entries", phaseTwoEntries)
// Poll progress every 2s; fail fast if the counter stops moving for
// 4 consecutive checks (~8s), or after 45s overall.
checkTicker := time.NewTicker(2 * time.Second)
defer checkTicker.Stop()
deadline := time.NewTimer(45 * time.Second)
defer deadline.Stop()
lastReceived := int64(-1)
stableChecks := 0
for {
select {
case err := <-errChan:
t.Fatalf("slow consumer subscription error: %v", err)
case <-deadline.C:
t.Fatalf("timed out waiting for slow consumer progress, received %d/%d", atomic.LoadInt64(&receivedCount), totalEntries)
case <-checkTicker.C:
received := atomic.LoadInt64(&receivedCount)
t.Logf("Slow consumer progress: %d/%d", received, totalEntries)
if received >= minExpected {
return
}
if received == lastReceived {
stableChecks++
if stableChecks >= 4 {
t.Fatalf("slow consumer stalled at %d/%d after writes completed", received, totalEntries)
}
} else {
stableChecks = 0
}
lastReceived = received
}
}
})
}
// Helper types and functions
type TestCluster struct {
@@ -915,3 +1020,106 @@ func subscribeToMetadataWithOptions(ctx context.Context, filerGrpcAddress, pathP
}
})
}
// followMetadataSlowly subscribes to filer metadata changes under pathPrefix,
// invoking onEvent for each response and then pausing for delay to emulate a
// slow consumer. It returns when ctx is cancelled or the stream fails.
func followMetadataSlowly(
	ctx context.Context,
	filerGrpcAddress, pathPrefix string,
	sinceNs int64,
	delay time.Duration,
	onEvent func(resp *filer_pb.SubscribeMetadataResponse),
) error {
	// Fall back to plaintext credentials when no client TLS is configured.
	dialOption := security.LoadClientTLS(util.GetViper(), "grpc.client")
	if dialOption == nil {
		dialOption = grpc.WithTransportCredentials(insecure.NewCredentials())
	}
	followOption := &pb.MetadataFollowOption{
		ClientName:     "slow_consumer_test",
		ClientId:       util.RandomInt32(),
		ClientEpoch:    int32(time.Now().Unix()),
		PathPrefix:     pathPrefix,
		StartTsNs:      sinceNs,
		EventErrorType: pb.DontLogError,
	}
	handler := func(resp *filer_pb.SubscribeMetadataResponse) error {
		// Bail out promptly if the caller has already cancelled.
		if err := ctx.Err(); err != nil {
			return err
		}
		onEvent(resp)
		// Simulate the slow consumer while staying responsive to cancellation.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(delay):
			return nil
		}
	}
	return pb.FollowMetadata(pb.ServerAddress(filerGrpcAddress), dialOption, followOption, handler)
}
// createMetadataEntries writes total file entries under /slow-consumer/ using
// a small pool of concurrent gRPC clients. Entry indices run from startIndex
// through startIndex+total-1, each worker handling a disjoint strided subset.
// The first error reported by any worker is returned.
func createMetadataEntries(ctx context.Context, filerGrpcAddress string, startIndex, total int, payload []byte) error {
	const workers = 10
	dialOption := grpc.WithTransportCredentials(insecure.NewCredentials())
	workerErrs := make(chan error, workers)
	var wg sync.WaitGroup
	for w := 0; w < workers; w++ {
		wg.Add(1)
		go func(offset int) {
			defer wg.Done()
			writeErr := pb.WithFilerClient(false, 0, pb.ServerAddress(filerGrpcAddress), dialOption, func(client filer_pb.SeaweedFilerClient) error {
				// Stride by the worker count so goroutines never collide on an index.
				for idx := startIndex + offset; idx < startIndex+total; idx += workers {
					// Stop writing as soon as the parent context is cancelled.
					if err := ctx.Err(); err != nil {
						return err
					}
					// Spread entries across six buckets for directory fan-out.
					dir := fmt.Sprintf("/slow-consumer/bucket-%02d", idx%6)
					name := fmt.Sprintf("entry-%05d", idx)
					if _, err := client.CreateEntry(ctx, &filer_pb.CreateEntryRequest{
						Directory: dir,
						Entry: &filer_pb.Entry{
							Name:        name,
							IsDirectory: false,
							Attributes: &filer_pb.FuseAttributes{
								FileSize: uint64(len(payload)),
								Mtime:    time.Now().Unix(),
								FileMode: 0644,
								Uid:      1000,
								Gid:      1000,
							},
							Extended: map[string][]byte{
								"payload": payload,
							},
						},
					}); err != nil {
						return err
					}
				}
				return nil
			})
			if writeErr != nil {
				workerErrs <- writeErr
			}
		}(w)
	}
	wg.Wait()
	close(workerErrs)
	// Only failures are sent on the channel; surface the first one, if any.
	if err, ok := <-workerErrs; ok {
		return err
	}
	return nil
}