fix: improve raft leader election reliability and failover speed (#8692)

* fix: clear raft vote state file on non-resume startup

The seaweedfs/raft library v1.1.7 added a persistent `state` file for
currentTerm and votedFor. When RaftResumeState=false (the default), the
log, conf, and snapshot directories are cleared but this state file was
not. On repeated restarts, different masters accumulate divergent terms,
causing AppendEntries rejections and preventing leader election.

Fixes #8690

* fix: recover TopologyId from snapshot before clearing raft state

When RaftResumeState=false clears log/conf/snapshot, the TopologyId
(used for license validation) was lost. Now extract it from the latest
snapshot before cleanup and restore it on the topology.

Both seaweedfs/raft and hashicorp/raft paths are handled, with a shared
recoverTopologyIdFromState helper in raft_common.go.

* fix: stagger multi-master bootstrap delay by peer index

Previously all masters used a fixed 1500ms delay before the bootstrap
check. Now the delay is proportional to the peer's sorted index with
randomization (matching the hashicorp raft path), giving the designated
bootstrap node (peer 0) a head start while later peers wait for gRPC
servers to be ready.

Also adds diagnostic logging showing why DoJoinCommand was or wasn't
called, making leader election issues easier to diagnose from logs.

* fix: skip unreachable masters during leader reconnection

When a master leader goes down, non-leader masters still redirect
clients to the stale leader address. The masterClient would follow
these redirects, fail, and retry — wasting round-trips each cycle.

Now tryAllMasters tracks which masters failed within a cycle and skips
redirects pointing to them, reducing log spam and connection overhead
during leader failover.

* fix: take snapshot after TopologyId generation for recovery

After generating a new TopologyId on the leader, immediately take a raft
snapshot so the ID can be recovered from the snapshot on future restarts
with RaftResumeState=false. Without this, short-lived clusters would
lose the TopologyId on restart since no automatic snapshot had been
taken yet.

* test: add multi-master raft failover integration tests

Integration test framework and 5 test scenarios for 3-node master
clusters:

- TestLeaderConsistencyAcrossNodes: all nodes agree on leader and
  TopologyId
- TestLeaderDownAndRecoverQuickly: leader stops, new leader elected,
  old leader rejoins as follower
- TestLeaderDownSlowRecover: leader gone for extended period, cluster
  continues with 2/3 quorum
- TestTwoMastersDownAndRestart: quorum lost (2/3 down), recovered
  when both restart
- TestAllMastersDownAndRestart: full cluster restart, leader elected,
  all nodes agree on TopologyId

* fix: address PR review comments

- peerIndex: return -1 (not 0) when self not found, add warning log
- recoverTopologyIdFromSnapshot: defer dir.Close()
- tests: check GetTopologyId errors instead of discarding them

* fix: address review comments on failover tests

- Assert no leader after quorum loss (was only logging)
- Verify follower cs.Leader matches expected leader via
  ServerAddress.ToHttpAddress() comparison
- Check GetTopologyId error in TestTwoMastersDownAndRestart
This commit is contained in:
Chris Lu
2026-03-18 23:28:07 -07:00
committed by GitHub
parent c197206897
commit 15f4a97029
9 changed files with 908 additions and 11 deletions

View File

@@ -297,11 +297,28 @@ func (ms *MasterServer) ensureTopologyId() {
currentId := ms.Topo.GetTopologyId()
glog.V(1).Infof("ensureTopologyId: current TopologyId after barrier: %s", currentId)
prevId := ms.Topo.GetTopologyId()
EnsureTopologyId(ms.Topo, func() bool {
return ms.Topo.IsLeader()
}, func(topologyId string) error {
return ms.syncRaftForTopologyId(topologyId)
})
// If a new TopologyId was generated, take a snapshot so it survives
// raft state cleanup on future non-resume restarts.
if prevId == "" && ms.Topo.GetTopologyId() != "" {
ms.Topo.RaftServerAccessLock.RLock()
if ms.Topo.RaftServer != nil {
if err := ms.Topo.RaftServer.TakeSnapshot(); err != nil {
glog.Warningf("snapshot after TopologyId generation: %v", err)
} else {
glog.V(0).Infof("snapshot taken to persist TopologyId %s", ms.Topo.GetTopologyId())
}
}
// Hashicorp raft snapshots are handled automatically.
ms.Topo.RaftServerAccessLock.RUnlock()
}
}
func (ms *MasterServer) proxyToLeader(f http.HandlerFunc) http.HandlerFunc {

View File

@@ -1,6 +1,7 @@
package weed_server
import (
"encoding/json"
"time"
"github.com/google/uuid"
@@ -8,6 +9,23 @@ import (
"github.com/seaweedfs/seaweedfs/weed/topology"
)
// recoverTopologyIdFromState restores the TopologyId from serialized FSM
// state bytes (a JSON-encoded MaxVolumeIdCommand). Both raft implementations
// call this after reading their snapshot in their own on-disk format.
// Recovery is best-effort: an already-set id is never overwritten, and any
// decode failure leaves the topology untouched.
func recoverTopologyIdFromState(fsmState []byte, topo *topology.Topology) {
	// Never clobber an id that is already present on the topology.
	if existing := topo.GetTopologyId(); existing != "" {
		return
	}
	var cmd topology.MaxVolumeIdCommand
	if json.Unmarshal(fsmState, &cmd) != nil {
		return
	}
	if id := cmd.TopologyId; id != "" {
		topo.SetTopologyId(id)
		glog.V(0).Infof("Recovered TopologyId from snapshot: %s", id)
	}
}
// EnsureTopologyId ensures that a TopologyId is generated and persisted if it's currently missing.
// It uses the provided checkLeaderFn to verify leadership and persistFn to save the new ID.
func EnsureTopologyId(topo *topology.Topology, checkLeaderFn func() bool, persistFn func(string) error) {

View File

@@ -6,6 +6,7 @@ package weed_server
import (
"encoding/json"
"fmt"
"io"
"math/rand/v2"
"os"
"path"
@@ -56,6 +57,28 @@ func raftServerID(server pb.ServerAddress) string {
return server.ToHttpAddress()
}
// recoverTopologyIdFromHashicorpSnapshot reads the TopologyId from the latest
// hashicorp raft snapshot before state cleanup. Best-effort: every failure
// path returns silently, leaving the topology unchanged.
func recoverTopologyIdFromHashicorpSnapshot(dataDir string, topo *topology.Topology) {
	store, err := raft.NewFileSnapshotStore(dataDir, 1, io.Discard)
	if err != nil {
		return
	}
	metas, err := store.List()
	if err != nil || len(metas) == 0 {
		return
	}
	// List returns snapshots sorted newest-first, so metas[0] is the latest.
	_, reader, err := store.Open(metas[0].ID)
	if err != nil {
		return
	}
	defer reader.Close()
	data, err := io.ReadAll(reader)
	if err != nil {
		return
	}
	recoverTopologyIdFromState(data, topo)
}
func (s *RaftServer) AddPeersConfiguration() (cfg raft.Configuration) {
for _, peer := range s.peers {
cfg.Servers = append(cfg.Servers, raft.Server{
@@ -168,6 +191,8 @@ func NewHashicorpRaftServer(option *RaftServerOption) (*RaftServer, error) {
}
if option.RaftBootstrap {
recoverTopologyIdFromHashicorpSnapshot(s.dataDir, option.Topo)
os.RemoveAll(path.Join(s.dataDir, ldbFile))
os.RemoveAll(path.Join(s.dataDir, sdbFile))
os.RemoveAll(path.Join(s.dataDir, "snapshots"))

View File

@@ -2,10 +2,13 @@ package weed_server
import (
"encoding/json"
"fmt"
"hash/crc32"
"io"
"math/rand/v2"
"os"
"path"
"sort"
"sync"
"time"
@@ -142,11 +145,20 @@ func NewRaftServer(option *RaftServerOption) (*RaftServer, error) {
glog.V(0).Infof("Starting RaftServer with %v", option.ServerAddr)
if !option.RaftResumeState {
// Recover the TopologyId from the snapshot before clearing state.
// The TopologyId is a cluster identity used for license validation
// and must survive raft state cleanup.
recoverTopologyIdFromSnapshot(s.dataDir, option.Topo)
// clear previous log to ensure fresh start
os.RemoveAll(path.Join(s.dataDir, "log"))
// always clear previous metadata
os.RemoveAll(path.Join(s.dataDir, "conf"))
os.RemoveAll(path.Join(s.dataDir, "snapshot"))
// clear persisted vote state (currentTerm/votedFor) so that stale
// terms from previous runs cannot cause election conflicts when the
// log has been wiped.
os.Remove(path.Join(s.dataDir, "state"))
}
if err := os.MkdirAll(path.Join(s.dataDir, "snapshot"), os.ModePerm); err != nil {
return nil, err
@@ -208,6 +220,47 @@ func (s *RaftServer) Peers() (members []string) {
return
}
// recoverTopologyIdFromSnapshot reads the TopologyId from the latest
// seaweedfs/raft snapshot before state cleanup.
//
// Best-effort: every failure path returns silently, leaving the topology
// unchanged, so a missing or corrupt snapshot can never block startup.
func recoverTopologyIdFromSnapshot(dataDir string, topo *topology.Topology) {
	snapshotDir := path.Join(dataDir, "snapshot")
	dir, err := os.Open(snapshotDir)
	if err != nil {
		return
	}
	defer dir.Close()
	filenames, err := dir.Readdirnames(-1)
	if err != nil || len(filenames) == 0 {
		return
	}
	// Pick the lexically greatest filename as "latest".
	// NOTE(review): this assumes snapshot filenames sort chronologically as
	// plain strings (e.g. zero-padded term/index components) — confirm
	// against the seaweedfs/raft snapshot naming scheme.
	sort.Strings(filenames)
	file, err := os.Open(path.Join(snapshotDir, filenames[len(filenames)-1]))
	if err != nil {
		return
	}
	defer file.Close()
	// Snapshot format: 8-hex-digit CRC32 checksum, newline, JSON body.
	// Fscanf consumes exactly the header, leaving the file positioned at
	// the start of the JSON body for the ReadAll below.
	var checksum uint32
	if _, err := fmt.Fscanf(file, "%08x\n", &checksum); err != nil {
		return
	}
	// Verify the body against the stored checksum before trusting it.
	b, err := io.ReadAll(file)
	if err != nil || crc32.ChecksumIEEE(b) != checksum {
		return
	}
	// The snapshot JSON wraps the FSM state in a "state" field; only that
	// inner payload holds the JSON-encoded MaxVolumeIdCommand.
	var snap struct {
		State json.RawMessage `json:"state"`
	}
	if err := json.Unmarshal(b, &snap); err != nil || len(snap.State) == 0 {
		return
	}
	recoverTopologyIdFromState(snap.State, topo)
}
func (s *RaftServer) DoJoinCommand() {
glog.V(0).Infoln("Initializing new cluster")