fix(master): fast resume state and default resumeState to true (#8925)

* fix(master): fast resume state and default resumeState to true

When resumeState is enabled in single-master mode, the raft server had
existing log entries so the self-join path couldn't promote to leader.
The server waited the full election timeout (10-20s) before self-electing.

Fix by temporarily setting election timeout to 1ms before Start() when
in single-master + resumeState mode with existing log, then restoring
the original timeout after leader election. This makes resume near-instant.

Also change the default for resumeState from false to true across all
CLI commands (master, mini, server) so state is preserved by default.

* fix(master): prevent fastResume goroutine from hanging forever

Use defer to guarantee election timeout is always restored, and bound
the polling loop with a timeout so it cannot spin indefinitely if
leader election never succeeds.

* fix(master): use ticker instead of time.After in fastResume polling loop
This commit is contained in:
Chris Lu
2026-04-04 14:15:56 -07:00
committed by GitHub
parent 0da1794856
commit 4efe0acaf5
4 changed files with 34 additions and 3 deletions

View File

@@ -96,7 +96,7 @@ func init() {
m.metricsIntervalSec = cmdMaster.Flag.Int("metrics.intervalSeconds", 15, "Prometheus push interval in seconds")
m.metricsHttpPort = cmdMaster.Flag.Int("metricsPort", 0, "Prometheus metrics listen port")
m.metricsHttpIp = cmdMaster.Flag.String("metricsIp", "", "metrics listen ip. If empty, default to same as -ip.bind option.")
m.raftResumeState = cmdMaster.Flag.Bool("resumeState", false, "resume previous state on start master server")
m.raftResumeState = cmdMaster.Flag.Bool("resumeState", true, "resume previous state on start master server")
m.heartbeatInterval = cmdMaster.Flag.Duration("heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)")
m.electionTimeout = cmdMaster.Flag.Duration("electionTimeout", 10*time.Second, "election timeout of master servers")
m.raftHashicorp = cmdMaster.Flag.Bool("raftHashicorp", false, "use hashicorp raft")
@@ -208,6 +208,7 @@ func startMaster(masterOption MasterOptions, masterWhiteList []string) {
DataDir: util.ResolvePath(metaDir),
Topo: ms.Topo,
RaftResumeState: *masterOption.raftResumeState,
SingleMaster: isSingleMaster,
HeartbeatInterval: *masterOption.heartbeatInterval,
ElectionTimeout: *masterOption.electionTimeout,
RaftBootstrap: *masterOption.raftBootstrap,

View File

@@ -162,7 +162,7 @@ func initMiniMasterFlags() {
miniMasterOptions.garbageThreshold = cmdMini.Flag.Float64("master.garbageThreshold", 0.3, "threshold to vacuum and reclaim spaces")
miniMasterOptions.metricsAddress = cmdMini.Flag.String("master.metrics.address", "", "Prometheus gateway address")
miniMasterOptions.metricsIntervalSec = cmdMini.Flag.Int("master.metrics.intervalSeconds", 15, "Prometheus push interval in seconds")
miniMasterOptions.raftResumeState = cmdMini.Flag.Bool("master.resumeState", false, "resume previous state on start master server")
miniMasterOptions.raftResumeState = cmdMini.Flag.Bool("master.resumeState", true, "resume previous state on start master server")
miniMasterOptions.heartbeatInterval = cmdMini.Flag.Duration("master.heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)")
miniMasterOptions.electionTimeout = cmdMini.Flag.Duration("master.electionTimeout", 10*time.Second, "election timeout of master servers")
miniMasterOptions.raftHashicorp = cmdMini.Flag.Bool("master.raftHashicorp", false, "use hashicorp raft")

View File

@@ -102,7 +102,7 @@ func init() {
masterOptions.garbageThreshold = cmdServer.Flag.Float64("master.garbageThreshold", 0.3, "threshold to vacuum and reclaim spaces")
masterOptions.metricsAddress = cmdServer.Flag.String("master.metrics.address", "", "Prometheus gateway address")
masterOptions.metricsIntervalSec = cmdServer.Flag.Int("master.metrics.intervalSeconds", 15, "Prometheus push interval in seconds")
masterOptions.raftResumeState = cmdServer.Flag.Bool("master.resumeState", false, "resume previous state on start master server")
masterOptions.raftResumeState = cmdServer.Flag.Bool("master.resumeState", true, "resume previous state on start master server")
masterOptions.raftHashicorp = cmdServer.Flag.Bool("master.raftHashicorp", false, "use hashicorp raft")
masterOptions.raftBootstrap = cmdServer.Flag.Bool("master.raftBootstrap", false, "Whether to bootstrap the Raft cluster")
masterOptions.heartbeatInterval = cmdServer.Flag.Duration("master.heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)")