From 4efe0acaf5c88db8702b20c6ca39b76c4934b935 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Sat, 4 Apr 2026 14:15:56 -0700 Subject: [PATCH] fix(master): fast resume state and default resumeState to true (#8925) * fix(master): fast resume state and default resumeState to true When resumeState is enabled in single-master mode, the raft server had existing log entries so the self-join path couldn't promote to leader. The server waited the full election timeout (10-20s) before self-electing. Fix by temporarily setting election timeout to 1ms before Start() when in single-master + resumeState mode with existing log, then restoring the original timeout after leader election. This makes resume near-instant. Also change the default for resumeState from false to true across all CLI commands (master, mini, server) so state is preserved by default. * fix(master): prevent fastResume goroutine from hanging forever Use defer to guarantee election timeout is always restored, and bound the polling loop with a timeout so it cannot spin indefinitely if leader election never succeeds. * fix(master): use ticker instead of time.After in fastResume polling loop --- weed/command/master.go | 3 ++- weed/command/mini.go | 2 +- weed/command/server.go | 2 +- weed/server/raft_server.go | 30 ++++++++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/weed/command/master.go b/weed/command/master.go index ad81fe7a0..3ea8c96fc 100644 --- a/weed/command/master.go +++ b/weed/command/master.go @@ -96,7 +96,7 @@ func init() { m.metricsIntervalSec = cmdMaster.Flag.Int("metrics.intervalSeconds", 15, "Prometheus push interval in seconds") m.metricsHttpPort = cmdMaster.Flag.Int("metricsPort", 0, "Prometheus metrics listen port") m.metricsHttpIp = cmdMaster.Flag.String("metricsIp", "", "metrics listen ip. If empty, default to same as -ip.bind option.") - m.raftResumeState = cmdMaster.Flag.Bool("resumeState", false, "resume previous state on start master server") + m.raftResumeState = cmdMaster.Flag.Bool("resumeState", true, "resume previous state on start master server") m.heartbeatInterval = cmdMaster.Flag.Duration("heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)") m.electionTimeout = cmdMaster.Flag.Duration("electionTimeout", 10*time.Second, "election timeout of master servers") m.raftHashicorp = cmdMaster.Flag.Bool("raftHashicorp", false, "use hashicorp raft") @@ -208,6 +208,7 @@ func startMaster(masterOption MasterOptions, masterWhiteList []string) { DataDir: util.ResolvePath(metaDir), Topo: ms.Topo, RaftResumeState: *masterOption.raftResumeState, + SingleMaster: isSingleMaster, HeartbeatInterval: *masterOption.heartbeatInterval, ElectionTimeout: *masterOption.electionTimeout, RaftBootstrap: *masterOption.raftBootstrap, diff --git a/weed/command/mini.go b/weed/command/mini.go index f23797b5e..35c7adeda 100644 --- a/weed/command/mini.go +++ b/weed/command/mini.go @@ -162,7 +162,7 @@ func initMiniMasterFlags() { miniMasterOptions.garbageThreshold = cmdMini.Flag.Float64("master.garbageThreshold", 0.3, "threshold to vacuum and reclaim spaces") miniMasterOptions.metricsAddress = cmdMini.Flag.String("master.metrics.address", "", "Prometheus gateway address") miniMasterOptions.metricsIntervalSec = cmdMini.Flag.Int("master.metrics.intervalSeconds", 15, "Prometheus push interval in seconds") - miniMasterOptions.raftResumeState = cmdMini.Flag.Bool("master.resumeState", false, "resume previous state on start master server") + miniMasterOptions.raftResumeState = cmdMini.Flag.Bool("master.resumeState", true, "resume previous state on start master server") miniMasterOptions.heartbeatInterval = cmdMini.Flag.Duration("master.heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)") miniMasterOptions.electionTimeout = cmdMini.Flag.Duration("master.electionTimeout", 10*time.Second, "election timeout of master servers") miniMasterOptions.raftHashicorp = cmdMini.Flag.Bool("master.raftHashicorp", false, "use hashicorp raft") diff --git a/weed/command/server.go b/weed/command/server.go index 9533c967e..caa456478 100644 --- a/weed/command/server.go +++ b/weed/command/server.go @@ -102,7 +102,7 @@ func init() { masterOptions.garbageThreshold = cmdServer.Flag.Float64("master.garbageThreshold", 0.3, "threshold to vacuum and reclaim spaces") masterOptions.metricsAddress = cmdServer.Flag.String("master.metrics.address", "", "Prometheus gateway address") masterOptions.metricsIntervalSec = cmdServer.Flag.Int("master.metrics.intervalSeconds", 15, "Prometheus push interval in seconds") - masterOptions.raftResumeState = cmdServer.Flag.Bool("master.resumeState", false, "resume previous state on start master server") + masterOptions.raftResumeState = cmdServer.Flag.Bool("master.resumeState", true, "resume previous state on start master server") masterOptions.raftHashicorp = cmdServer.Flag.Bool("master.raftHashicorp", false, "use hashicorp raft") masterOptions.raftBootstrap = cmdServer.Flag.Bool("master.raftBootstrap", false, "Whether to bootstrap the Raft cluster") masterOptions.heartbeatInterval = cmdServer.Flag.Duration("master.heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)") diff --git a/weed/server/raft_server.go b/weed/server/raft_server.go index 938219325..b42ac0671 100644 --- a/weed/server/raft_server.go +++ b/weed/server/raft_server.go @@ -32,6 +32,7 @@ type RaftServerOption struct { DataDir string Topo *topology.Topology RaftResumeState bool + SingleMaster bool HeartbeatInterval time.Duration ElectionTimeout time.Duration RaftBootstrap bool @@ -176,10 +177,38 @@ func NewRaftServer(option *RaftServerOption) (*RaftServer, error) { if err := s.raftServer.LoadSnapshot(); err != nil { return nil, err } + + // In single-master mode resuming state, the log is not empty so the + // normal self-join path won't promote to leader. The server will + // self-elect after the election timeout, so use a tiny timeout to + // make this near-instant, then restore the original after election. + fastResume := option.SingleMaster && option.RaftResumeState && !s.raftServer.IsLogEmpty() + if fastResume { + s.raftServer.SetElectionTimeout(time.Millisecond) + } + if err := s.raftServer.Start(); err != nil { return nil, err } + if fastResume { + go func() { + defer s.raftServer.SetElectionTimeout(option.ElectionTimeout) + ticker := time.NewTicker(100 * time.Millisecond) + defer ticker.Stop() + timeout := time.After(option.ElectionTimeout) + for s.raftServer.Leader() == "" { + select { + case <-timeout: + glog.Warningf("Fast resume timed out waiting for leader election, restoring election timeout to %v", option.ElectionTimeout) + return + case <-ticker.C: + } + } + glog.V(0).Infof("Resumed as leader with election timeout restored to %v", option.ElectionTimeout) + }() + } + for name, peer := range s.peers { if err := s.raftServer.AddPeer(name, peer.ToGrpcAddress()); err != nil { return nil, err @@ -273,3 +302,4 @@ func (s *RaftServer) DoJoinCommand() { } } +