Files
seaweedFS/test/multi_master/failover_test.go
Chris Lu 15f4a97029 fix: improve raft leader election reliability and failover speed (#8692)
* fix: clear raft vote state file on non-resume startup

The seaweedfs/raft library v1.1.7 added a persistent `state` file for
currentTerm and votedFor. When RaftResumeState=false (the default), the
log, conf, and snapshot directories are cleared but this state file was
not. On repeated restarts, different masters accumulate divergent terms,
causing AppendEntries rejections and preventing leader election.

Fixes #8690

* fix: recover TopologyId from snapshot before clearing raft state

When RaftResumeState=false clears log/conf/snapshot, the TopologyId
(used for license validation) was lost. Now extract it from the latest
snapshot before cleanup and restore it on the topology.

Both seaweedfs/raft and hashicorp/raft paths are handled, with a shared
recoverTopologyIdFromState helper in raft_common.go.

* fix: stagger multi-master bootstrap delay by peer index

Previously all masters used a fixed 1500ms delay before the bootstrap
check. Now the delay is proportional to the peer's sorted index with
randomization (matching the hashicorp raft path), giving the designated
bootstrap node (peer 0) a head start while later peers wait for gRPC
servers to be ready.

Also adds diagnostic logging showing why DoJoinCommand was or wasn't
called, making leader election issues easier to diagnose from logs.

* fix: skip unreachable masters during leader reconnection

When a master leader goes down, non-leader masters still redirect
clients to the stale leader address. The masterClient would follow
these redirects, fail, and retry — wasting round-trips each cycle.

Now tryAllMasters tracks which masters failed within a cycle and skips
redirects pointing to them, reducing log spam and connection overhead
during leader failover.

* fix: take snapshot after TopologyId generation for recovery

After generating a new TopologyId on the leader, immediately take a raft
snapshot so the ID can be recovered from the snapshot on future restarts
with RaftResumeState=false. Without this, short-lived clusters would
lose the TopologyId on restart since no automatic snapshot had been
taken yet.

* test: add multi-master raft failover integration tests

Integration test framework and 5 test scenarios for 3-node master
clusters:

- TestLeaderConsistencyAcrossNodes: all nodes agree on leader and
  TopologyId
- TestLeaderDownAndRecoverQuickly: leader stops, new leader elected,
  old leader rejoins as follower
- TestLeaderDownSlowRecover: leader gone for extended period, cluster
  continues with 2/3 quorum
- TestTwoMastersDownAndRestart: quorum lost (2/3 down), recovered
  when both restart
- TestAllMastersDownAndRestart: full cluster restart, leader elected,
  all nodes agree on TopologyId

* fix: address PR review comments

- peerIndex: return -1 (not 0) when self not found, add warning log
- recoverTopologyIdFromSnapshot: defer dir.Close()
- tests: check GetTopologyId errors instead of discarding them

* fix: address review comments on failover tests

- Assert no leader after quorum loss (was only logging)
- Verify follower cs.Leader matches expected leader via
  ServerAddress.ToHttpAddress() comparison
- Check GetTopologyId error in TestTwoMastersDownAndRestart
2026-03-18 23:28:07 -07:00

314 lines
9.4 KiB
Go

package multi_master
import (
"testing"
"time"
"github.com/seaweedfs/seaweedfs/weed/pb"
)
const (
// Election timeout is 3s in our cluster config; allow generous margin.
leaderElectionTimeout = 20 * time.Second
)
// TestLeaderDownAndRecoverQuickly verifies that when the leader is stopped and
// restarted quickly, the cluster re-elects a leader and the restarted node
// rejoins as a follower. TopologyId must be consistent across all nodes.
func TestLeaderDownAndRecoverQuickly(t *testing.T) {
	mc := StartMasterCluster(t)

	// Capture the initial leader and TopologyId before disrupting anything.
	oldIdx, oldAddr := mc.FindLeader()
	if oldIdx < 0 {
		t.Fatal("no leader found after cluster start")
	}
	t.Logf("initial leader: node %d at %s", oldIdx, oldAddr)

	initialId, err := mc.GetTopologyId(oldIdx)
	if err != nil || initialId == "" {
		t.Fatalf("failed to get initial TopologyId: %v", err)
	}
	t.Logf("initial TopologyId: %s", initialId)

	// Take the leader down; the two survivors retain quorum (2/3) and
	// must elect a replacement.
	mc.StopNode(oldIdx)
	t.Logf("stopped leader node %d", oldIdx)

	newIdx, newAddr, err := mc.WaitForNewLeader(oldAddr, leaderElectionTimeout)
	if err != nil {
		mc.DumpLogs()
		t.Fatalf("new leader not elected after stopping old leader: %v", err)
	}
	t.Logf("new leader: node %d at %s", newIdx, newAddr)

	// Bring the old leader back right away — the "quick recovery" case.
	mc.StartNode(oldIdx)
	if err := mc.WaitForNodeReady(oldIdx, waitTimeout); err != nil {
		mc.DumpLogs()
		t.Fatalf("restarted node %d not ready: %v", oldIdx, err)
	}
	t.Logf("restarted node %d", oldIdx)

	// Let raft membership settle before checking stability.
	time.Sleep(3 * time.Second)

	if idx, _ := mc.FindLeader(); idx < 0 {
		mc.DumpLogs()
		t.Fatal("no leader after restarting old leader node")
	}

	// Every running node must still report the original TopologyId.
	assertTopologyIdConsistent(t, mc, initialId)
}
// TestLeaderDownSlowRecover verifies that when the leader goes down and takes
// a long time to come back, the remaining 2 nodes elect a new leader and the
// cluster continues to function. When the slow node returns, it rejoins.
func TestLeaderDownSlowRecover(t *testing.T) {
	mc := StartMasterCluster(t)

	downIdx, downAddr := mc.FindLeader()
	if downIdx < 0 {
		t.Fatal("no leader found")
	}

	tid, err := mc.GetTopologyId(downIdx)
	if err != nil || tid == "" {
		t.Fatalf("failed to get initial TopologyId: %v", err)
	}
	t.Logf("initial leader: node %d, TopologyId: %s", downIdx, tid)

	// Stop the leader and wait for the survivors to elect a replacement.
	mc.StopNode(downIdx)
	electedIdx, _, err := mc.WaitForNewLeader(downAddr, leaderElectionTimeout)
	if err != nil {
		mc.DumpLogs()
		t.Fatalf("new leader not elected: %v", err)
	}
	t.Logf("new leader: node %d", electedIdx)

	// The cluster must still serve requests with only 2 nodes (quorum is 2/3),
	// and the elected node must actually consider itself leader.
	cs, err := mc.GetClusterStatus(electedIdx)
	if err != nil {
		mc.DumpLogs()
		t.Fatalf("cannot get cluster status from new leader: %v", err)
	}
	if !cs.IsLeader {
		t.Fatalf("node %d claims not to be leader", electedIdx)
	}

	// Simulate slow recovery: wait significantly longer than election timeout.
	t.Log("simulating slow recovery (10 seconds)...")
	time.Sleep(10 * time.Second)

	// Leadership must not have been lost during the extended outage.
	if idx, _ := mc.FindLeader(); idx < 0 {
		mc.DumpLogs()
		t.Fatal("leader lost during extended outage of one node")
	}

	// Finally bring the slow node back and verify it rejoins cleanly.
	mc.StartNode(downIdx)
	if err := mc.WaitForNodeReady(downIdx, waitTimeout); err != nil {
		mc.DumpLogs()
		t.Fatalf("slow-recovered node %d not ready: %v", downIdx, err)
	}
	time.Sleep(3 * time.Second)

	assertTopologyIdConsistent(t, mc, tid)
}
// TestTwoMastersDownAndRestart verifies that when 2 of 3 masters go down
// (losing quorum), the cluster cannot elect a leader. When both restart,
// a leader is elected and TopologyId is preserved.
func TestTwoMastersDownAndRestart(t *testing.T) {
	mc := StartMasterCluster(t)

	leaderIdx, _ := mc.FindLeader()
	if leaderIdx < 0 {
		t.Fatal("no leader found")
	}
	tid, err := mc.GetTopologyId(leaderIdx)
	if err != nil || tid == "" {
		t.Fatalf("failed to get initial TopologyId: %v", err)
	}
	t.Logf("initial TopologyId: %s", tid)

	// Stop the leader plus one follower, leaving a single survivor.
	down1, down2 := leaderIdx, (leaderIdx+1)%3
	survivor := (leaderIdx + 2) % 3
	t.Logf("stopping nodes %d and %d, keeping node %d", down1, down2, survivor)
	mc.StopNode(down1)
	mc.StopNode(down2)

	// With 1 of 3 nodes, there is no quorum — no leader should exist.
	// Wait long enough for any stale leadership to expire (election timeout
	// is 3s in our config, quorum check fires every election timeout).
	time.Sleep(5 * time.Second)
	soloIdx, _ := mc.FindLeader()
	if soloIdx >= 0 {
		// The survivor may briefly think it is leader before stepping down;
		// give it one more grace period to notice the lost quorum.
		time.Sleep(5 * time.Second)
		soloIdx, _ = mc.FindLeader()
	}
	if soloIdx >= 0 {
		mc.DumpLogs()
		t.Fatalf("expected no leader with only 1 of 3 nodes, but node %d claims leadership", soloIdx)
	}

	// Restore quorum by restarting both downed nodes.
	mc.StartNode(down1)
	mc.StartNode(down2)
	for _, nodeIdx := range []int{down1, down2} {
		if err := mc.WaitForNodeReady(nodeIdx, waitTimeout); err != nil {
			mc.DumpLogs()
			t.Fatalf("restarted node %d not ready: %v", nodeIdx, err)
		}
	}

	// A leader must now be electable again.
	if err := mc.WaitForLeader(leaderElectionTimeout); err != nil {
		mc.DumpLogs()
		t.Fatalf("no leader after restarting 2 downed nodes: %v", err)
	}
	time.Sleep(3 * time.Second)

	assertTopologyIdConsistent(t, mc, tid)
}
// TestAllMastersDownAndRestart verifies that when all 3 masters are stopped
// and restarted, the cluster elects a leader and all nodes agree on a
// TopologyId. With RaftResumeState=false (default), raft state is cleared on
// restart. The TopologyId is recovered from snapshots when available; on a
// short-lived cluster that hasn't taken snapshots on all nodes, a new
// TopologyId may be generated — but all nodes must still agree.
func TestAllMastersDownAndRestart(t *testing.T) {
	mc := StartMasterCluster(t)
	leaderIdx, _ := mc.FindLeader()
	if leaderIdx < 0 {
		t.Fatal("no leader found")
	}
	// Check the error instead of discarding it, matching the other tests:
	// a transport failure would otherwise be indistinguishable from an
	// empty TopologyId.
	topologyId, err := mc.GetTopologyId(leaderIdx)
	if err != nil || topologyId == "" {
		t.Fatalf("failed to get initial TopologyId: %v", err)
	}
	t.Logf("initial TopologyId: %s", topologyId)
	// Stop all nodes.
	for i := range 3 {
		mc.StopNode(i)
	}
	t.Log("all nodes stopped")
	time.Sleep(2 * time.Second)
	// Restart all nodes.
	for i := range 3 {
		mc.StartNode(i)
	}
	for i := range 3 {
		if err := mc.WaitForNodeReady(i, waitTimeout); err != nil {
			mc.DumpLogs()
			t.Fatalf("node %d not ready after full restart: %v", i, err)
		}
	}
	// Wait for leader.
	if err := mc.WaitForLeader(leaderElectionTimeout); err != nil {
		mc.DumpLogs()
		t.Fatalf("no leader after full cluster restart: %v", err)
	}
	newLeaderIdx, _ := mc.FindLeader()
	t.Logf("leader after full restart: node %d", newLeaderIdx)
	time.Sleep(3 * time.Second)
	// All nodes must agree on a TopologyId (may differ from original if
	// snapshots were not yet taken on all nodes before shutdown).
	newTopologyId, err := mc.GetTopologyId(newLeaderIdx)
	if err != nil || newTopologyId == "" {
		mc.DumpLogs()
		// Include the error so a failed RPC is diagnosable from the log.
		t.Fatalf("no TopologyId after full restart: %v", err)
	}
	if newTopologyId == topologyId {
		t.Logf("TopologyId preserved across full restart: %s", topologyId)
	} else {
		t.Logf("TopologyId changed (expected for short-lived cluster without snapshots): %s -> %s", topologyId, newTopologyId)
	}
	assertTopologyIdConsistent(t, mc, newTopologyId)
}
// TestLeaderConsistencyAcrossNodes verifies that all nodes agree on who the
// leader is and report the same TopologyId.
func TestLeaderConsistencyAcrossNodes(t *testing.T) {
	mc := StartMasterCluster(t)
	// Allow cluster to stabilize.
	time.Sleep(3 * time.Second)
	leaderIdx, leaderAddr := mc.FindLeader()
	if leaderIdx < 0 {
		t.Fatal("no leader found")
	}
	t.Logf("leader: node %d at %s", leaderIdx, leaderAddr)
	// Every node should agree on the leader.
	for i := range 3 {
		cs, err := mc.GetClusterStatus(i)
		if err != nil {
			t.Fatalf("node %d cluster/status error: %v", i, err)
		}
		if i == leaderIdx {
			if !cs.IsLeader {
				t.Errorf("node %d should be leader but IsLeader=false", i)
			}
		} else {
			if cs.IsLeader {
				t.Errorf("node %d should not be leader but IsLeader=true", i)
			}
			// cs.Leader is a ServerAddress like "127.0.0.1:10000.20000";
			// convert to HTTP address for comparison with leaderAddr.
			leaderHttp := pb.ServerAddress(cs.Leader).ToHttpAddress()
			if leaderHttp != leaderAddr {
				t.Errorf("node %d reports leader %q (http: %s), expected %q", i, cs.Leader, leaderHttp, leaderAddr)
			}
		}
	}
	// All nodes should have the same TopologyId. Check the error instead of
	// discarding it so an RPC failure is not mistaken for a missing ID.
	topologyId, err := mc.GetTopologyId(leaderIdx)
	if err != nil {
		t.Fatalf("failed to get leader TopologyId: %v", err)
	}
	if topologyId == "" {
		t.Fatal("leader has no TopologyId")
	}
	assertTopologyIdConsistent(t, mc, topologyId)
}
// assertTopologyIdConsistent verifies that all running nodes report the expected TopologyId.
func assertTopologyIdConsistent(t *testing.T, mc *MasterCluster, expectedId string) {
	t.Helper()
	for node := 0; node < 3; node++ {
		// Stopped nodes cannot be queried; only running ones must agree.
		if !mc.IsNodeRunning(node) {
			continue
		}
		got, err := mc.GetTopologyId(node)
		if err != nil {
			t.Errorf("node %d: failed to get TopologyId: %v", node, err)
			continue
		}
		if got != expectedId {
			t.Errorf("node %d: TopologyId=%q, expected %q", node, got, expectedId)
		}
	}
}