fix(master): fast resume state and default resumeState to true (#8925)
* fix(master): fast resume state and default resumeState to true

  When resumeState is enabled in single-master mode, the raft server has existing log entries, so the self-join path cannot promote it to leader. The server waited the full election timeout (10-20s) before self-electing.

  Fix by temporarily setting the election timeout to 1ms before Start() when in single-master + resumeState mode with an existing log, then restoring the original timeout after leader election. This makes resume near-instant.

  Also change the default for resumeState from false to true across all CLI commands (master, mini, server) so state is preserved by default.

* fix(master): prevent fastResume goroutine from hanging forever

  Use defer to guarantee the election timeout is always restored, and bound the polling loop with a timeout so it cannot spin indefinitely if leader election never succeeds.

* fix(master): use ticker instead of time.After in fastResume polling loop
This commit is contained in:
@@ -96,7 +96,7 @@ func init() {
|
|||||||
m.metricsIntervalSec = cmdMaster.Flag.Int("metrics.intervalSeconds", 15, "Prometheus push interval in seconds")
|
m.metricsIntervalSec = cmdMaster.Flag.Int("metrics.intervalSeconds", 15, "Prometheus push interval in seconds")
|
||||||
m.metricsHttpPort = cmdMaster.Flag.Int("metricsPort", 0, "Prometheus metrics listen port")
|
m.metricsHttpPort = cmdMaster.Flag.Int("metricsPort", 0, "Prometheus metrics listen port")
|
||||||
m.metricsHttpIp = cmdMaster.Flag.String("metricsIp", "", "metrics listen ip. If empty, default to same as -ip.bind option.")
|
m.metricsHttpIp = cmdMaster.Flag.String("metricsIp", "", "metrics listen ip. If empty, default to same as -ip.bind option.")
|
||||||
m.raftResumeState = cmdMaster.Flag.Bool("resumeState", false, "resume previous state on start master server")
|
m.raftResumeState = cmdMaster.Flag.Bool("resumeState", true, "resume previous state on start master server")
|
||||||
m.heartbeatInterval = cmdMaster.Flag.Duration("heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)")
|
m.heartbeatInterval = cmdMaster.Flag.Duration("heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)")
|
||||||
m.electionTimeout = cmdMaster.Flag.Duration("electionTimeout", 10*time.Second, "election timeout of master servers")
|
m.electionTimeout = cmdMaster.Flag.Duration("electionTimeout", 10*time.Second, "election timeout of master servers")
|
||||||
m.raftHashicorp = cmdMaster.Flag.Bool("raftHashicorp", false, "use hashicorp raft")
|
m.raftHashicorp = cmdMaster.Flag.Bool("raftHashicorp", false, "use hashicorp raft")
|
||||||
@@ -208,6 +208,7 @@ func startMaster(masterOption MasterOptions, masterWhiteList []string) {
|
|||||||
DataDir: util.ResolvePath(metaDir),
|
DataDir: util.ResolvePath(metaDir),
|
||||||
Topo: ms.Topo,
|
Topo: ms.Topo,
|
||||||
RaftResumeState: *masterOption.raftResumeState,
|
RaftResumeState: *masterOption.raftResumeState,
|
||||||
|
SingleMaster: isSingleMaster,
|
||||||
HeartbeatInterval: *masterOption.heartbeatInterval,
|
HeartbeatInterval: *masterOption.heartbeatInterval,
|
||||||
ElectionTimeout: *masterOption.electionTimeout,
|
ElectionTimeout: *masterOption.electionTimeout,
|
||||||
RaftBootstrap: *masterOption.raftBootstrap,
|
RaftBootstrap: *masterOption.raftBootstrap,
|
||||||
|
|||||||
@@ -162,7 +162,7 @@ func initMiniMasterFlags() {
|
|||||||
miniMasterOptions.garbageThreshold = cmdMini.Flag.Float64("master.garbageThreshold", 0.3, "threshold to vacuum and reclaim spaces")
|
miniMasterOptions.garbageThreshold = cmdMini.Flag.Float64("master.garbageThreshold", 0.3, "threshold to vacuum and reclaim spaces")
|
||||||
miniMasterOptions.metricsAddress = cmdMini.Flag.String("master.metrics.address", "", "Prometheus gateway address")
|
miniMasterOptions.metricsAddress = cmdMini.Flag.String("master.metrics.address", "", "Prometheus gateway address")
|
||||||
miniMasterOptions.metricsIntervalSec = cmdMini.Flag.Int("master.metrics.intervalSeconds", 15, "Prometheus push interval in seconds")
|
miniMasterOptions.metricsIntervalSec = cmdMini.Flag.Int("master.metrics.intervalSeconds", 15, "Prometheus push interval in seconds")
|
||||||
miniMasterOptions.raftResumeState = cmdMini.Flag.Bool("master.resumeState", false, "resume previous state on start master server")
|
miniMasterOptions.raftResumeState = cmdMini.Flag.Bool("master.resumeState", true, "resume previous state on start master server")
|
||||||
miniMasterOptions.heartbeatInterval = cmdMini.Flag.Duration("master.heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)")
|
miniMasterOptions.heartbeatInterval = cmdMini.Flag.Duration("master.heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)")
|
||||||
miniMasterOptions.electionTimeout = cmdMini.Flag.Duration("master.electionTimeout", 10*time.Second, "election timeout of master servers")
|
miniMasterOptions.electionTimeout = cmdMini.Flag.Duration("master.electionTimeout", 10*time.Second, "election timeout of master servers")
|
||||||
miniMasterOptions.raftHashicorp = cmdMini.Flag.Bool("master.raftHashicorp", false, "use hashicorp raft")
|
miniMasterOptions.raftHashicorp = cmdMini.Flag.Bool("master.raftHashicorp", false, "use hashicorp raft")
|
||||||
|
|||||||
@@ -102,7 +102,7 @@ func init() {
|
|||||||
masterOptions.garbageThreshold = cmdServer.Flag.Float64("master.garbageThreshold", 0.3, "threshold to vacuum and reclaim spaces")
|
masterOptions.garbageThreshold = cmdServer.Flag.Float64("master.garbageThreshold", 0.3, "threshold to vacuum and reclaim spaces")
|
||||||
masterOptions.metricsAddress = cmdServer.Flag.String("master.metrics.address", "", "Prometheus gateway address")
|
masterOptions.metricsAddress = cmdServer.Flag.String("master.metrics.address", "", "Prometheus gateway address")
|
||||||
masterOptions.metricsIntervalSec = cmdServer.Flag.Int("master.metrics.intervalSeconds", 15, "Prometheus push interval in seconds")
|
masterOptions.metricsIntervalSec = cmdServer.Flag.Int("master.metrics.intervalSeconds", 15, "Prometheus push interval in seconds")
|
||||||
masterOptions.raftResumeState = cmdServer.Flag.Bool("master.resumeState", false, "resume previous state on start master server")
|
masterOptions.raftResumeState = cmdServer.Flag.Bool("master.resumeState", true, "resume previous state on start master server")
|
||||||
masterOptions.raftHashicorp = cmdServer.Flag.Bool("master.raftHashicorp", false, "use hashicorp raft")
|
masterOptions.raftHashicorp = cmdServer.Flag.Bool("master.raftHashicorp", false, "use hashicorp raft")
|
||||||
masterOptions.raftBootstrap = cmdServer.Flag.Bool("master.raftBootstrap", false, "Whether to bootstrap the Raft cluster")
|
masterOptions.raftBootstrap = cmdServer.Flag.Bool("master.raftBootstrap", false, "Whether to bootstrap the Raft cluster")
|
||||||
masterOptions.heartbeatInterval = cmdServer.Flag.Duration("master.heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)")
|
masterOptions.heartbeatInterval = cmdServer.Flag.Duration("master.heartbeatInterval", 300*time.Millisecond, "heartbeat interval of master servers, and will be randomly multiplied by [1, 1.25)")
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ type RaftServerOption struct {
|
|||||||
DataDir string
|
DataDir string
|
||||||
Topo *topology.Topology
|
Topo *topology.Topology
|
||||||
RaftResumeState bool
|
RaftResumeState bool
|
||||||
|
SingleMaster bool
|
||||||
HeartbeatInterval time.Duration
|
HeartbeatInterval time.Duration
|
||||||
ElectionTimeout time.Duration
|
ElectionTimeout time.Duration
|
||||||
RaftBootstrap bool
|
RaftBootstrap bool
|
||||||
@@ -176,10 +177,38 @@ func NewRaftServer(option *RaftServerOption) (*RaftServer, error) {
|
|||||||
if err := s.raftServer.LoadSnapshot(); err != nil {
|
if err := s.raftServer.LoadSnapshot(); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// In single-master mode resuming state, the log is not empty so the
|
||||||
|
// normal self-join path won't promote to leader. The server will
|
||||||
|
// self-elect after the election timeout, so use a tiny timeout to
|
||||||
|
// make this near-instant, then restore the original after election.
|
||||||
|
fastResume := option.SingleMaster && option.RaftResumeState && !s.raftServer.IsLogEmpty()
|
||||||
|
if fastResume {
|
||||||
|
s.raftServer.SetElectionTimeout(time.Millisecond)
|
||||||
|
}
|
||||||
|
|
||||||
if err := s.raftServer.Start(); err != nil {
|
if err := s.raftServer.Start(); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if fastResume {
|
||||||
|
go func() {
|
||||||
|
defer s.raftServer.SetElectionTimeout(option.ElectionTimeout)
|
||||||
|
ticker := time.NewTicker(100 * time.Millisecond)
|
||||||
|
defer ticker.Stop()
|
||||||
|
timeout := time.After(option.ElectionTimeout)
|
||||||
|
for s.raftServer.Leader() == "" {
|
||||||
|
select {
|
||||||
|
case <-timeout:
|
||||||
|
glog.Warningf("Fast resume timed out waiting for leader election, restoring election timeout to %v", option.ElectionTimeout)
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
glog.V(0).Infof("Resumed as leader with election timeout restored to %v", option.ElectionTimeout)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
for name, peer := range s.peers {
|
for name, peer := range s.peers {
|
||||||
if err := s.raftServer.AddPeer(name, peer.ToGrpcAddress()); err != nil {
|
if err := s.raftServer.AddPeer(name, peer.ToGrpcAddress()); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -273,3 +302,4 @@ func (s *RaftServer) DoJoinCommand() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user