fix(s3): start KeepConnectedToMaster for filer discovery with filerGroup (#7732)

Fixes #7721

When S3 server is configured with a filerGroup, it creates a MasterClient
to enable dynamic filer discovery. However, the KeepConnectedToMaster()
background goroutine was never started, causing GetMaster() to block
indefinitely in WaitUntilConnected().

This resulted in the log message:
  WaitUntilConnected still waiting for master connection (attempt N)...

being logged repeatedly every ~20 seconds.

The fix adds the missing 'go masterClient.KeepConnectedToMaster(ctx)'
call to properly establish the connection to master servers.

Also adds unit tests to verify WaitUntilConnected respects context
cancellation.
This commit is contained in:
Chris Lu
2025-12-13 12:10:15 -08:00
committed by GitHub
parent 36b8b2147b
commit d80d8be012
2 changed files with 106 additions and 0 deletions

View File

@@ -115,6 +115,8 @@ func NewS3ApiServerWithStore(router *mux.Router, option *S3ApiServerOption, expl
masterMap[fmt.Sprintf("master%d", i)] = addr
}
masterClient := wdclient.NewMasterClient(option.GrpcDialOption, option.FilerGroup, cluster.S3Type, "", "", "", *pb.NewServiceDiscoveryFromMap(masterMap))
// Start the master client connection loop - required for GetMaster() to work
go masterClient.KeepConnectedToMaster(context.Background())
filerClient = wdclient.NewFilerClient(option.Filers, option.GrpcDialOption, option.DataCenter, &wdclient.FilerClientOption{
MasterClient: masterClient,

View File

@@ -0,0 +1,104 @@
package wdclient
import (
"context"
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/pb"
"google.golang.org/grpc"
)
// TestWaitUntilConnectedWithoutKeepConnected verifies that WaitUntilConnected
// respects context cancellation when KeepConnectedToMaster is not running.
// This tests the fix for https://github.com/seaweedfs/seaweedfs/issues/7721
func TestWaitUntilConnectedWithoutKeepConnected(t *testing.T) {
mc := NewMasterClient(grpc.EmptyDialOption{}, "test-group", "test-client", "", "", "", pb.ServerDiscovery{})
// Without KeepConnectedToMaster running, WaitUntilConnected should
// respect context cancellation and not block forever
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer cancel()
start := time.Now()
mc.WaitUntilConnected(ctx)
elapsed := time.Since(start)
// Should have returned due to context timeout, not blocked forever
if elapsed > 200*time.Millisecond {
t.Errorf("WaitUntilConnected blocked for %v, expected to return on context timeout", elapsed)
}
// GetMaster should return empty since we never connected
master := mc.getCurrentMaster()
if master != "" {
t.Errorf("Expected empty master, got %s", master)
}
}
// TestWaitUntilConnectedReturnsImmediatelyWhenConnected verifies that
// WaitUntilConnected returns immediately when a master is already set.
func TestWaitUntilConnectedReturnsImmediatelyWhenConnected(t *testing.T) {
mc := NewMasterClient(grpc.EmptyDialOption{}, "test-group", "test-client", "", "", "", pb.ServerDiscovery{})
// Simulate that KeepConnectedToMaster has already established a connection
mc.setCurrentMaster("localhost:9333")
ctx := context.Background()
start := time.Now()
mc.WaitUntilConnected(ctx)
elapsed := time.Since(start)
// Should return almost immediately (< 10ms)
if elapsed > 10*time.Millisecond {
t.Errorf("WaitUntilConnected took %v when master was already set, expected immediate return", elapsed)
}
// Verify master is returned
master := mc.getCurrentMaster()
if master != "localhost:9333" {
t.Errorf("Expected master localhost:9333, got %s", master)
}
}
// TestGetMasterRespectsContextCancellation verifies that GetMaster
// respects context cancellation and doesn't block forever.
func TestGetMasterRespectsContextCancellation(t *testing.T) {
mc := NewMasterClient(grpc.EmptyDialOption{}, "test-group", "test-client", "", "", "", pb.ServerDiscovery{})
// GetMaster calls WaitUntilConnected internally
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer cancel()
start := time.Now()
master := mc.GetMaster(ctx)
elapsed := time.Since(start)
// Should return on context timeout
if elapsed > 200*time.Millisecond {
t.Errorf("GetMaster blocked for %v, expected to return on context timeout", elapsed)
}
// Master should be empty since we never connected
if master != "" {
t.Errorf("Expected empty master when not connected, got %s", master)
}
}
// TestMasterClientFilerGroupLogging verifies the FilerGroup is properly set
// and would be logged correctly (regression test for issue #7721 log message format)
func TestMasterClientFilerGroupLogging(t *testing.T) {
filerGroup := "filer_1"
clientType := "s3"
mc := NewMasterClient(grpc.EmptyDialOption{}, filerGroup, clientType, "", "", "", pb.ServerDiscovery{})
if mc.FilerGroup != filerGroup {
t.Errorf("Expected FilerGroup %s, got %s", filerGroup, mc.FilerGroup)
}
if mc.clientType != clientType {
t.Errorf("Expected clientType %s, got %s", clientType, mc.clientType)
}
}