* fix(kafka): resolve consumer group resumption timeout in e2e tests

  Three issues caused ConsumerGroupResumption to time out when the second consumer tried to resume from committed offsets:

  1. ForceCompleteRebalance deadlock: performCleanup() held group.Mu.Lock then called ForceCompleteRebalance() which tried to acquire the same lock — a guaranteed deadlock on Go's non-reentrant sync.Mutex. Fixed by requiring callers to hold the lock (matching actual call sites).
  2. Unbounded fallback fetch: when the multi-batch fetch timed out, the fallback GetStoredRecords call used the connection context (no deadline). A slow broker gRPC call could block the data-plane goroutine indefinitely, causing head-of-line blocking for all responses on that connection. Fixed with a 10-second timeout.
  3. HWM lookup failure caused empty responses: after a consumer leaves and the partition is deactivated, GetLatestOffset can fail. The fetch handler treated this as "no data" and entered the long-poll loop (up to 10s × 4 retries = 40s timeout). Fixed by assuming data may exist when HWM lookup fails, so the actual fetch determines availability.

* fix(kafka): address review feedback on HWM sentinel and fallback timeout

  - Don't expose synthetic HWM (requestedOffset+1) to clients; keep result.highWaterMark at 0 when the real HWM lookup fails.
  - Tie fallback timeout to client's MaxWaitTime instead of a fixed 10s, so one slow partition doesn't hold the reader beyond the request budget.

* fix(kafka): use large HWM sentinel and clamp fallback timeout

  - Use requestedOffset+10000 as sentinel HWM instead of +1, so FetchMultipleBatches doesn't artificially limit to 1 record.
  - Add 2s floor to fallback timeout so disk reads via gRPC have a reasonable chance even when maxWaitMs is small or zero.

* fix(kafka): use MaxInt64 sentinel and derive HWM from fetch result

  - Use math.MaxInt64 as HWM sentinel to avoid integer overflow risk (previously requestedOffset+10000 could wrap on large offsets).
  - After the fetch, derive a meaningful HWM from newOffset so the client never sees MaxInt64 or 0 in the response.

* fix(kafka): use remaining time budget for fallback fetch

  The fallback was restarting the full maxWaitMs budget even though the multi-batch fetch already consumed part of it. Now compute remaining time from either the parent context deadline or maxWaitMs minus elapsed, skip the fallback if budget is exhausted, and clamp to [2s, 10s] bounds.
331 lines
9.5 KiB
Go
331 lines
9.5 KiB
Go
package consumer
|
|
|
|
import (
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
func TestRebalanceTimeoutManager_CheckRebalanceTimeouts(t *testing.T) {
|
|
coordinator := NewGroupCoordinator()
|
|
defer coordinator.Close()
|
|
|
|
rtm := coordinator.rebalanceTimeoutManager
|
|
|
|
// Create a group with a member that has a short rebalance timeout
|
|
group := coordinator.GetOrCreateGroup("test-group")
|
|
group.Mu.Lock()
|
|
group.State = GroupStatePreparingRebalance
|
|
|
|
member := &GroupMember{
|
|
ID: "member1",
|
|
ClientID: "client1",
|
|
SessionTimeout: 30000, // 30 seconds
|
|
RebalanceTimeout: 1000, // 1 second (very short for testing)
|
|
State: MemberStatePending,
|
|
LastHeartbeat: time.Now(),
|
|
JoinedAt: time.Now().Add(-2 * time.Second), // Joined 2 seconds ago
|
|
}
|
|
group.Members["member1"] = member
|
|
group.Mu.Unlock()
|
|
|
|
// Check timeouts - member should be evicted
|
|
rtm.CheckRebalanceTimeouts()
|
|
|
|
group.Mu.RLock()
|
|
if len(group.Members) != 0 {
|
|
t.Errorf("Expected member to be evicted due to rebalance timeout, but %d members remain", len(group.Members))
|
|
}
|
|
|
|
if group.State != GroupStateEmpty {
|
|
t.Errorf("Expected group state to be Empty after member eviction, got %s", group.State.String())
|
|
}
|
|
group.Mu.RUnlock()
|
|
}
|
|
|
|
func TestRebalanceTimeoutManager_SessionTimeoutFallback(t *testing.T) {
|
|
coordinator := NewGroupCoordinator()
|
|
defer coordinator.Close()
|
|
|
|
rtm := coordinator.rebalanceTimeoutManager
|
|
|
|
// Create a group with a member that has exceeded session timeout
|
|
group := coordinator.GetOrCreateGroup("test-group")
|
|
group.Mu.Lock()
|
|
group.State = GroupStatePreparingRebalance
|
|
|
|
member := &GroupMember{
|
|
ID: "member1",
|
|
ClientID: "client1",
|
|
SessionTimeout: 1000, // 1 second
|
|
RebalanceTimeout: 30000, // 30 seconds
|
|
State: MemberStatePending,
|
|
LastHeartbeat: time.Now().Add(-2 * time.Second), // Last heartbeat 2 seconds ago
|
|
JoinedAt: time.Now(),
|
|
}
|
|
group.Members["member1"] = member
|
|
group.Mu.Unlock()
|
|
|
|
// Check timeouts - member should be evicted due to session timeout
|
|
rtm.CheckRebalanceTimeouts()
|
|
|
|
group.Mu.RLock()
|
|
if len(group.Members) != 0 {
|
|
t.Errorf("Expected member to be evicted due to session timeout, but %d members remain", len(group.Members))
|
|
}
|
|
group.Mu.RUnlock()
|
|
}
|
|
|
|
func TestRebalanceTimeoutManager_LeaderEviction(t *testing.T) {
|
|
coordinator := NewGroupCoordinator()
|
|
defer coordinator.Close()
|
|
|
|
rtm := coordinator.rebalanceTimeoutManager
|
|
|
|
// Create a group with leader and another member
|
|
group := coordinator.GetOrCreateGroup("test-group")
|
|
group.Mu.Lock()
|
|
group.State = GroupStatePreparingRebalance
|
|
group.Leader = "member1"
|
|
|
|
// Leader with expired rebalance timeout
|
|
leader := &GroupMember{
|
|
ID: "member1",
|
|
ClientID: "client1",
|
|
SessionTimeout: 30000,
|
|
RebalanceTimeout: 1000,
|
|
State: MemberStatePending,
|
|
LastHeartbeat: time.Now(),
|
|
JoinedAt: time.Now().Add(-2 * time.Second),
|
|
}
|
|
group.Members["member1"] = leader
|
|
|
|
// Another member that's still valid
|
|
member2 := &GroupMember{
|
|
ID: "member2",
|
|
ClientID: "client2",
|
|
SessionTimeout: 30000,
|
|
RebalanceTimeout: 30000,
|
|
State: MemberStatePending,
|
|
LastHeartbeat: time.Now(),
|
|
JoinedAt: time.Now(),
|
|
}
|
|
group.Members["member2"] = member2
|
|
group.Mu.Unlock()
|
|
|
|
// Check timeouts - leader should be evicted, new leader selected
|
|
rtm.CheckRebalanceTimeouts()
|
|
|
|
group.Mu.RLock()
|
|
if len(group.Members) != 1 {
|
|
t.Errorf("Expected 1 member to remain after leader eviction, got %d", len(group.Members))
|
|
}
|
|
|
|
if group.Leader != "member2" {
|
|
t.Errorf("Expected member2 to become new leader, got %s", group.Leader)
|
|
}
|
|
|
|
if group.State != GroupStatePreparingRebalance {
|
|
t.Errorf("Expected group to restart rebalancing after leader eviction, got %s", group.State.String())
|
|
}
|
|
group.Mu.RUnlock()
|
|
}
|
|
|
|
func TestRebalanceTimeoutManager_IsRebalanceStuck(t *testing.T) {
|
|
coordinator := NewGroupCoordinator()
|
|
defer coordinator.Close()
|
|
|
|
rtm := coordinator.rebalanceTimeoutManager
|
|
|
|
// Create a group that's been rebalancing for a while
|
|
group := coordinator.GetOrCreateGroup("test-group")
|
|
group.Mu.Lock()
|
|
group.State = GroupStatePreparingRebalance
|
|
group.LastActivity = time.Now().Add(-15 * time.Minute) // 15 minutes ago
|
|
group.Mu.Unlock()
|
|
|
|
// Check if rebalance is stuck (max 10 minutes)
|
|
maxDuration := 10 * time.Minute
|
|
if !rtm.IsRebalanceStuck(group, maxDuration) {
|
|
t.Error("Expected rebalance to be detected as stuck")
|
|
}
|
|
|
|
// Test with a group that's not stuck
|
|
group.Mu.Lock()
|
|
group.LastActivity = time.Now().Add(-5 * time.Minute) // 5 minutes ago
|
|
group.Mu.Unlock()
|
|
|
|
if rtm.IsRebalanceStuck(group, maxDuration) {
|
|
t.Error("Expected rebalance to not be detected as stuck")
|
|
}
|
|
|
|
// Test with stable group (should not be stuck)
|
|
group.Mu.Lock()
|
|
group.State = GroupStateStable
|
|
group.LastActivity = time.Now().Add(-15 * time.Minute)
|
|
group.Mu.Unlock()
|
|
|
|
if rtm.IsRebalanceStuck(group, maxDuration) {
|
|
t.Error("Stable group should not be detected as stuck")
|
|
}
|
|
}
|
|
|
|
func TestRebalanceTimeoutManager_ForceCompleteRebalance(t *testing.T) {
|
|
coordinator := NewGroupCoordinator()
|
|
defer coordinator.Close()
|
|
|
|
rtm := coordinator.rebalanceTimeoutManager
|
|
|
|
// Test forcing completion from PreparingRebalance
|
|
group := coordinator.GetOrCreateGroup("test-group")
|
|
group.Mu.Lock()
|
|
group.State = GroupStatePreparingRebalance
|
|
|
|
member := &GroupMember{
|
|
ID: "member1",
|
|
State: MemberStatePending,
|
|
}
|
|
group.Members["member1"] = member
|
|
|
|
// ForceCompleteRebalance expects the caller to hold group.Mu.Lock()
|
|
rtm.ForceCompleteRebalance(group)
|
|
|
|
if group.State != GroupStateCompletingRebalance {
|
|
t.Errorf("Expected group state to be CompletingRebalance, got %s", group.State.String())
|
|
}
|
|
group.Mu.Unlock()
|
|
|
|
// Test forcing completion from CompletingRebalance
|
|
group.Mu.Lock()
|
|
rtm.ForceCompleteRebalance(group)
|
|
|
|
if group.State != GroupStateStable {
|
|
t.Errorf("Expected group state to be Stable, got %s", group.State.String())
|
|
}
|
|
|
|
if member.State != MemberStateStable {
|
|
t.Errorf("Expected member state to be Stable, got %s", member.State.String())
|
|
}
|
|
group.Mu.Unlock()
|
|
}
|
|
|
|
func TestRebalanceTimeoutManager_GetRebalanceStatus(t *testing.T) {
|
|
coordinator := NewGroupCoordinator()
|
|
defer coordinator.Close()
|
|
|
|
rtm := coordinator.rebalanceTimeoutManager
|
|
|
|
// Test with non-existent group
|
|
status := rtm.GetRebalanceStatus("non-existent")
|
|
if status != nil {
|
|
t.Error("Expected nil status for non-existent group")
|
|
}
|
|
|
|
// Create a group with members
|
|
group := coordinator.GetOrCreateGroup("test-group")
|
|
group.Mu.Lock()
|
|
group.State = GroupStatePreparingRebalance
|
|
group.Generation = 5
|
|
group.Leader = "member1"
|
|
group.LastActivity = time.Now().Add(-2 * time.Minute)
|
|
|
|
member1 := &GroupMember{
|
|
ID: "member1",
|
|
State: MemberStatePending,
|
|
LastHeartbeat: time.Now().Add(-30 * time.Second),
|
|
JoinedAt: time.Now().Add(-2 * time.Minute),
|
|
SessionTimeout: 30000, // 30 seconds
|
|
RebalanceTimeout: 300000, // 5 minutes
|
|
}
|
|
group.Members["member1"] = member1
|
|
|
|
member2 := &GroupMember{
|
|
ID: "member2",
|
|
State: MemberStatePending,
|
|
LastHeartbeat: time.Now().Add(-10 * time.Second),
|
|
JoinedAt: time.Now().Add(-1 * time.Minute),
|
|
SessionTimeout: 60000, // 1 minute
|
|
RebalanceTimeout: 180000, // 3 minutes
|
|
}
|
|
group.Members["member2"] = member2
|
|
group.Mu.Unlock()
|
|
|
|
// Get status
|
|
status = rtm.GetRebalanceStatus("test-group")
|
|
|
|
if status == nil {
|
|
t.Fatal("Expected non-nil status")
|
|
}
|
|
|
|
if status.GroupID != "test-group" {
|
|
t.Errorf("Expected group ID 'test-group', got %s", status.GroupID)
|
|
}
|
|
|
|
if status.State != GroupStatePreparingRebalance {
|
|
t.Errorf("Expected state PreparingRebalance, got %s", status.State.String())
|
|
}
|
|
|
|
if status.Generation != 5 {
|
|
t.Errorf("Expected generation 5, got %d", status.Generation)
|
|
}
|
|
|
|
if status.MemberCount != 2 {
|
|
t.Errorf("Expected 2 members, got %d", status.MemberCount)
|
|
}
|
|
|
|
if status.Leader != "member1" {
|
|
t.Errorf("Expected leader 'member1', got %s", status.Leader)
|
|
}
|
|
|
|
if !status.IsRebalancing {
|
|
t.Error("Expected IsRebalancing to be true")
|
|
}
|
|
|
|
if len(status.Members) != 2 {
|
|
t.Errorf("Expected 2 member statuses, got %d", len(status.Members))
|
|
}
|
|
|
|
// Check member timeout calculations
|
|
for _, memberStatus := range status.Members {
|
|
if memberStatus.SessionTimeRemaining < 0 {
|
|
t.Errorf("Session time remaining should not be negative for member %s", memberStatus.MemberID)
|
|
}
|
|
|
|
if memberStatus.RebalanceTimeRemaining < 0 {
|
|
t.Errorf("Rebalance time remaining should not be negative for member %s", memberStatus.MemberID)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestRebalanceTimeoutManager_DefaultRebalanceTimeout(t *testing.T) {
|
|
coordinator := NewGroupCoordinator()
|
|
defer coordinator.Close()
|
|
|
|
rtm := coordinator.rebalanceTimeoutManager
|
|
|
|
// Create a group with a member that has no rebalance timeout set (0)
|
|
group := coordinator.GetOrCreateGroup("test-group")
|
|
group.Mu.Lock()
|
|
group.State = GroupStatePreparingRebalance
|
|
|
|
member := &GroupMember{
|
|
ID: "member1",
|
|
ClientID: "client1",
|
|
SessionTimeout: 30000, // 30 seconds
|
|
RebalanceTimeout: 0, // Not set, should use default
|
|
State: MemberStatePending,
|
|
LastHeartbeat: time.Now(),
|
|
JoinedAt: time.Now().Add(-6 * time.Minute), // Joined 6 minutes ago
|
|
}
|
|
group.Members["member1"] = member
|
|
group.Mu.Unlock()
|
|
|
|
// Default rebalance timeout is 5 minutes (300000ms), so member should be evicted
|
|
rtm.CheckRebalanceTimeouts()
|
|
|
|
group.Mu.RLock()
|
|
if len(group.Members) != 0 {
|
|
t.Errorf("Expected member to be evicted using default rebalance timeout, but %d members remain", len(group.Members))
|
|
}
|
|
group.Mu.RUnlock()
|
|
}
|