* fix: clear raft vote state file on non-resume startup

The seaweedfs/raft library v1.1.7 added a persistent `state` file for currentTerm and votedFor. When RaftResumeState=false (the default), the log, conf, and snapshot directories are cleared, but this state file was not. On repeated restarts, different masters accumulate divergent terms, causing AppendEntries rejections and preventing leader election. (A hedged sketch of the cleanup idea follows after this list.)

Fixes #8690

* fix: recover TopologyId from snapshot before clearing raft state

When RaftResumeState=false clears log/conf/snapshot, the TopologyId (used for license validation) was lost. Now extract it from the latest snapshot before cleanup and restore it on the topology. Both seaweedfs/raft and hashicorp/raft paths are handled, with a shared recoverTopologyIdFromState helper in raft_common.go.

* fix: stagger multi-master bootstrap delay by peer index

Previously all masters used a fixed 1500ms delay before the bootstrap check. Now the delay is proportional to the peer's sorted index, with randomization (matching the hashicorp raft path), giving the designated bootstrap node (peer 0) a head start while later peers wait for gRPC servers to be ready. (See the delay sketch after this list.) Also adds diagnostic logging showing why DoJoinCommand was or wasn't called, making leader election issues easier to diagnose from logs.

* fix: skip unreachable masters during leader reconnection

When a master leader goes down, non-leader masters still redirect clients to the stale leader address. The masterClient would follow these redirects, fail, and retry, wasting round-trips each cycle. Now tryAllMasters tracks which masters failed within a cycle and skips redirects pointing to them, reducing log spam and connection overhead during leader failover. (See the redirect-skipping sketch after this list.)

* fix: take snapshot after TopologyId generation for recovery

After generating a new TopologyId on the leader, immediately take a raft snapshot so the ID can be recovered from the snapshot on future restarts with RaftResumeState=false. Without this, short-lived clusters would lose the TopologyId on restart, since no automatic snapshot had been taken yet.

* test: add multi-master raft failover integration tests

Integration test framework and 5 test scenarios for 3-node master clusters:

- TestLeaderConsistencyAcrossNodes: all nodes agree on the leader and TopologyId
- TestLeaderDownAndRecoverQuickly: leader stops, new leader elected, old leader rejoins as follower
- TestLeaderDownSlowRecover: leader gone for an extended period, cluster continues with 2/3 quorum
- TestTwoMastersDownAndRestart: quorum lost (2/3 down), recovered when both restart
- TestAllMastersDownAndRestart: full cluster restart, leader elected, all nodes agree on TopologyId

* fix: address PR review comments

- peerIndex: return -1 (not 0) when self not found, add a warning log
- recoverTopologyIdFromSnapshot: defer dir.Close()
- tests: check GetTopologyId errors instead of discarding them

* fix: address review comments on failover tests

- Assert no leader after quorum loss (was only logging)
- Verify follower cs.Leader matches the expected leader via ServerAddress.ToHttpAddress() comparison
- Check GetTopologyId error in TestTwoMastersDownAndRestart
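A minimal sketch of the non-resume cleanup described in the first commit. The layout assumed here (a raft directory with `log`, `conf`, and `snapshot` subdirectories plus a `state` file) follows the description above, but the function name, parameters, and exact paths are illustrative rather than the actual SeaweedFS code.

```go
package raftsketch

import (
	"os"
	"path/filepath"
)

// clearRaftStateOnFreshStart is a hypothetical helper: when raft state is not
// resumed, remove the persisted raft data, including the seaweedfs/raft v1.1.7
// "state" file (currentTerm, votedFor) that was previously left behind.
func clearRaftStateOnFreshStart(raftDir string, resumeState bool) error {
	if resumeState {
		return nil // RaftResumeState=true: keep everything and resume from disk
	}
	for _, sub := range []string{"log", "conf", "snapshot"} {
		if err := os.RemoveAll(filepath.Join(raftDir, sub)); err != nil {
			return err
		}
	}
	// Without this step, restarted masters keep divergent terms and reject
	// each other's AppendEntries, stalling leader election.
	if err := os.Remove(filepath.Join(raftDir, "state")); err != nil && !os.IsNotExist(err) {
		return err
	}
	return nil
}
```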
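A sketch of the staggered bootstrap delay. The real change lives in the master startup path; `bootstrapDelay`, its parameters, the fallback for a missing self entry, and the jitter formula are assumptions for illustration. Only the idea comes from the commit message: the delay grows with the peer's sorted index, peer 0 starts first, and randomization is added.

```go
package raftsketch

import (
	"math/rand"
	"sort"
	"time"
)

// bootstrapDelay is a hypothetical helper: later peers wait longer before the
// bootstrap check so the designated bootstrap node (sorted index 0) gets a
// head start while the others' gRPC servers come up.
func bootstrapDelay(self string, peers []string, base time.Duration) time.Duration {
	sorted := append([]string(nil), peers...)
	sort.Strings(sorted)

	idx := -1 // per the review fix, "not found" must not masquerade as peer 0
	for i, p := range sorted {
		if p == self {
			idx = i
			break
		}
	}
	if idx < 0 {
		// Caller should log a warning; fall back to the largest delay rather
		// than racing the bootstrap node.
		idx = len(sorted)
	}

	// Proportional delay plus jitter so peers don't all probe at the same instant.
	jitter := time.Duration(0)
	if half := int64(base / 2); half > 0 {
		jitter = time.Duration(rand.Int63n(half))
	}
	return time.Duration(idx)*base + jitter
}
```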
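A sketch of the redirect-skipping behavior. The real tryAllMasters operates on SeaweedFS masterClient structures; here `connectToMaster` is a stand-in callback and the control flow is simplified. Only the per-cycle failed set and the skipped redirects reflect the described fix.

```go
package raftsketch

// tryAllMastersSketch illustrates the per-cycle bookkeeping: masters that
// already failed in this cycle are remembered, and redirects that point back
// at them are not followed. connectToMaster returns the redirect target
// ("" when we stayed connected to the leader) or an error.
func tryAllMastersSketch(masters []string, connectToMaster func(addr string) (redirectTo string, err error)) {
	failed := make(map[string]bool) // reset at the start of every cycle

	for _, m := range masters {
		addr := m
		for addr != "" && !failed[addr] {
			next, err := connectToMaster(addr)
			if err != nil {
				failed[addr] = true // remember, so later redirects here are skipped
				break
			}
			if next == "" {
				return // connected to the current leader; cycle is done
			}
			if failed[next] {
				break // redirect points at a master that already failed this cycle
			}
			addr = next // follow the redirect to the announced leader
		}
	}
}
```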
package multi_master

import (
	"testing"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/pb"
)

const (
	// Election timeout is 3s in our cluster config; allow generous margin.
	leaderElectionTimeout = 20 * time.Second
)

// TestLeaderDownAndRecoverQuickly verifies that when the leader is stopped and
// restarted quickly, the cluster re-elects a leader and the restarted node
// rejoins as a follower. TopologyId must be consistent across all nodes.
func TestLeaderDownAndRecoverQuickly(t *testing.T) {
	mc := StartMasterCluster(t)

	// Record initial state.
	leaderIdx, leaderAddr := mc.FindLeader()
	if leaderIdx < 0 {
		t.Fatal("no leader found after cluster start")
	}
	t.Logf("initial leader: node %d at %s", leaderIdx, leaderAddr)

	topologyId, err := mc.GetTopologyId(leaderIdx)
	if err != nil || topologyId == "" {
		t.Fatalf("failed to get initial TopologyId: %v", err)
	}
	t.Logf("initial TopologyId: %s", topologyId)

	// Stop the leader.
	mc.StopNode(leaderIdx)
	t.Logf("stopped leader node %d", leaderIdx)

	// Wait for a new leader from the remaining 2 nodes.
	newLeaderIdx, newLeaderAddr, err := mc.WaitForNewLeader(leaderAddr, leaderElectionTimeout)
	if err != nil {
		mc.DumpLogs()
		t.Fatalf("new leader not elected after stopping old leader: %v", err)
	}
	t.Logf("new leader: node %d at %s", newLeaderIdx, newLeaderAddr)

	// Restart the old leader quickly.
	mc.StartNode(leaderIdx)
	if err := mc.WaitForNodeReady(leaderIdx, waitTimeout); err != nil {
		mc.DumpLogs()
		t.Fatalf("restarted node %d not ready: %v", leaderIdx, err)
	}
	t.Logf("restarted node %d", leaderIdx)

	// Give raft time to settle.
	time.Sleep(3 * time.Second)

	// Verify leader is stable.
	finalLeaderIdx, _ := mc.FindLeader()
	if finalLeaderIdx < 0 {
		mc.DumpLogs()
		t.Fatal("no leader after restarting old leader node")
	}

	// Verify TopologyId is consistent across all nodes.
	assertTopologyIdConsistent(t, mc, topologyId)
}

// TestLeaderDownSlowRecover verifies that when the leader goes down and takes
// a long time to come back, the remaining 2 nodes elect a new leader and the
// cluster continues to function. When the slow node returns, it rejoins.
func TestLeaderDownSlowRecover(t *testing.T) {
	mc := StartMasterCluster(t)

	leaderIdx, leaderAddr := mc.FindLeader()
	if leaderIdx < 0 {
		t.Fatal("no leader found")
	}
	topologyId, err := mc.GetTopologyId(leaderIdx)
	if err != nil || topologyId == "" {
		t.Fatalf("failed to get initial TopologyId: %v", err)
	}
	t.Logf("initial leader: node %d, TopologyId: %s", leaderIdx, topologyId)

	// Stop the leader.
	mc.StopNode(leaderIdx)

	// Wait for a new leader.
	newLeaderIdx, _, err := mc.WaitForNewLeader(leaderAddr, leaderElectionTimeout)
	if err != nil {
		mc.DumpLogs()
		t.Fatalf("new leader not elected: %v", err)
	}
	t.Logf("new leader: node %d", newLeaderIdx)

	// Verify cluster functions with only 2 nodes (quorum is 2/3).
	cs, err := mc.GetClusterStatus(newLeaderIdx)
	if err != nil {
		mc.DumpLogs()
		t.Fatalf("cannot get cluster status from new leader: %v", err)
	}
	if !cs.IsLeader {
		t.Fatalf("node %d claims not to be leader", newLeaderIdx)
	}

	// Simulate slow recovery: wait significantly longer than election timeout.
	t.Log("simulating slow recovery (10 seconds)...")
	time.Sleep(10 * time.Second)

	// Verify leader is still stable during the outage.
	stableLeaderIdx, _ := mc.FindLeader()
	if stableLeaderIdx < 0 {
		mc.DumpLogs()
		t.Fatal("leader lost during extended outage of one node")
	}

	// Restart the downed node.
	mc.StartNode(leaderIdx)
	if err := mc.WaitForNodeReady(leaderIdx, waitTimeout); err != nil {
		mc.DumpLogs()
		t.Fatalf("slow-recovered node %d not ready: %v", leaderIdx, err)
	}

	time.Sleep(3 * time.Second)
	assertTopologyIdConsistent(t, mc, topologyId)
}

// TestTwoMastersDownAndRestart verifies that when 2 of 3 masters go down
// (losing quorum), the cluster cannot elect a leader. When both restart,
// a leader is elected and TopologyId is preserved.
func TestTwoMastersDownAndRestart(t *testing.T) {
	mc := StartMasterCluster(t)

	leaderIdx, _ := mc.FindLeader()
	if leaderIdx < 0 {
		t.Fatal("no leader found")
	}
	topologyId, err := mc.GetTopologyId(leaderIdx)
	if err != nil || topologyId == "" {
		t.Fatalf("failed to get initial TopologyId: %v", err)
	}
	t.Logf("initial TopologyId: %s", topologyId)

	// Determine which 2 nodes to stop (stop the leader + one follower).
	down1 := leaderIdx
	down2 := (leaderIdx + 1) % 3
	survivor := (leaderIdx + 2) % 3
	t.Logf("stopping nodes %d and %d, keeping node %d", down1, down2, survivor)

	mc.StopNode(down1)
	mc.StopNode(down2)

	// The surviving node alone cannot form a quorum — no leader expected.
	// Wait long enough for any stale leadership to expire (election timeout
	// is 3s in our config, quorum check fires every election timeout).
	time.Sleep(5 * time.Second)
	soloLeaderIdx, _ := mc.FindLeader()
	if soloLeaderIdx >= 0 {
		// It's possible the survivor briefly thinks it's leader before stepping down.
		// Give it time to realize it lost quorum.
		time.Sleep(5 * time.Second)
		soloLeaderIdx, _ = mc.FindLeader()
	}
	if soloLeaderIdx >= 0 {
		mc.DumpLogs()
		t.Fatalf("expected no leader with only 1 of 3 nodes, but node %d claims leadership", soloLeaderIdx)
	}

	// Restart both downed nodes.
	mc.StartNode(down1)
	mc.StartNode(down2)
	for _, i := range []int{down1, down2} {
		if err := mc.WaitForNodeReady(i, waitTimeout); err != nil {
			mc.DumpLogs()
			t.Fatalf("restarted node %d not ready: %v", i, err)
		}
	}

	// Wait for leader election.
	if err := mc.WaitForLeader(leaderElectionTimeout); err != nil {
		mc.DumpLogs()
		t.Fatalf("no leader after restarting 2 downed nodes: %v", err)
	}

	time.Sleep(3 * time.Second)
	assertTopologyIdConsistent(t, mc, topologyId)
}

// TestAllMastersDownAndRestart verifies that when all 3 masters are stopped
// and restarted, the cluster elects a leader and all nodes agree on a
// TopologyId. With RaftResumeState=false (default), raft state is cleared on
// restart. The TopologyId is recovered from snapshots when available; on a
// short-lived cluster that hasn't taken snapshots on all nodes, a new
// TopologyId may be generated — but all nodes must still agree.
func TestAllMastersDownAndRestart(t *testing.T) {
	mc := StartMasterCluster(t)

	leaderIdx, _ := mc.FindLeader()
	if leaderIdx < 0 {
		t.Fatal("no leader found")
	}
	topologyId, err := mc.GetTopologyId(leaderIdx)
	if err != nil || topologyId == "" {
		t.Fatalf("no TopologyId on initial leader: %v", err)
	}
	t.Logf("initial TopologyId: %s", topologyId)

	// Stop all nodes.
	for i := range 3 {
		mc.StopNode(i)
	}
	t.Log("all nodes stopped")

	time.Sleep(2 * time.Second)

	// Restart all nodes.
	for i := range 3 {
		mc.StartNode(i)
	}
	for i := range 3 {
		if err := mc.WaitForNodeReady(i, waitTimeout); err != nil {
			mc.DumpLogs()
			t.Fatalf("node %d not ready after full restart: %v", i, err)
		}
	}

	// Wait for leader.
	if err := mc.WaitForLeader(leaderElectionTimeout); err != nil {
		mc.DumpLogs()
		t.Fatalf("no leader after full cluster restart: %v", err)
	}

	newLeaderIdx, _ := mc.FindLeader()
	t.Logf("leader after full restart: node %d", newLeaderIdx)

	time.Sleep(3 * time.Second)

	// All nodes must agree on a TopologyId (may differ from original if
	// snapshots were not yet taken on all nodes before shutdown).
	newTopologyId, err := mc.GetTopologyId(newLeaderIdx)
	if err != nil || newTopologyId == "" {
		mc.DumpLogs()
		t.Fatalf("no TopologyId after full restart: %v", err)
	}
	if newTopologyId == topologyId {
		t.Logf("TopologyId preserved across full restart: %s", topologyId)
	} else {
		t.Logf("TopologyId changed (expected for short-lived cluster without snapshots): %s -> %s", topologyId, newTopologyId)
	}
	assertTopologyIdConsistent(t, mc, newTopologyId)
}

// TestLeaderConsistencyAcrossNodes verifies that all nodes agree on who the
// leader is and report the same TopologyId.
func TestLeaderConsistencyAcrossNodes(t *testing.T) {
	mc := StartMasterCluster(t)

	// Allow cluster to stabilize.
	time.Sleep(3 * time.Second)

	leaderIdx, leaderAddr := mc.FindLeader()
	if leaderIdx < 0 {
		t.Fatal("no leader found")
	}
	t.Logf("leader: node %d at %s", leaderIdx, leaderAddr)

	// Every node should agree on the leader.
	for i := range 3 {
		cs, err := mc.GetClusterStatus(i)
		if err != nil {
			t.Fatalf("node %d cluster/status error: %v", i, err)
		}
		if i == leaderIdx {
			if !cs.IsLeader {
				t.Errorf("node %d should be leader but IsLeader=false", i)
			}
		} else {
			if cs.IsLeader {
				t.Errorf("node %d should not be leader but IsLeader=true", i)
			}
			// cs.Leader is a ServerAddress like "127.0.0.1:10000.20000";
			// convert to HTTP address for comparison with leaderAddr.
			leaderHttp := pb.ServerAddress(cs.Leader).ToHttpAddress()
			if leaderHttp != leaderAddr {
				t.Errorf("node %d reports leader %q (http: %s), expected %q", i, cs.Leader, leaderHttp, leaderAddr)
			}
		}
	}

	// All nodes should have the same TopologyId.
	topologyId, err := mc.GetTopologyId(leaderIdx)
	if err != nil || topologyId == "" {
		t.Fatalf("leader has no TopologyId: %v", err)
	}
	assertTopologyIdConsistent(t, mc, topologyId)
}

// assertTopologyIdConsistent verifies that all running nodes report the expected TopologyId.
func assertTopologyIdConsistent(t *testing.T, mc *MasterCluster, expectedId string) {
	t.Helper()
	for i := range 3 {
		if !mc.IsNodeRunning(i) {
			continue
		}
		id, err := mc.GetTopologyId(i)
		if err != nil {
			t.Errorf("node %d: failed to get TopologyId: %v", i, err)
			continue
		}
		if id != expectedId {
			t.Errorf("node %d: TopologyId=%q, expected %q", i, id, expectedId)
		}
	}
}