fix(master): fast resume state and default resumeState to true (#8925)
* fix(master): fast resume state and default resumeState to true

  When resumeState is enabled in single-master mode, the raft server had existing log entries, so the self-join path couldn't promote to leader. The server waited the full election timeout (10-20s) before self-electing.

  Fix by temporarily setting election timeout to 1ms before Start() when in single-master + resumeState mode with existing log, then restoring the original timeout after leader election. This makes resume near-instant.

  Also change the default for resumeState from false to true across all CLI commands (master, mini, server) so state is preserved by default.

* fix(master): prevent fastResume goroutine from hanging forever

  Use defer to guarantee election timeout is always restored, and bound the polling loop with a timeout so it cannot spin indefinitely if leader election never succeeds.

* fix(master): use ticker instead of time.After in fastResume polling loop
This commit is contained in:
@@ -32,6 +32,7 @@ type RaftServerOption struct {
|
||||
DataDir string
|
||||
Topo *topology.Topology
|
||||
RaftResumeState bool
|
||||
SingleMaster bool
|
||||
HeartbeatInterval time.Duration
|
||||
ElectionTimeout time.Duration
|
||||
RaftBootstrap bool
|
||||
@@ -176,10 +177,38 @@ func NewRaftServer(option *RaftServerOption) (*RaftServer, error) {
|
||||
if err := s.raftServer.LoadSnapshot(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// In single-master mode with resumed state, the log is not empty, so the
|
||||
// normal self-join path won't promote to leader. The server will
|
||||
// self-elect after the election timeout, so use a tiny timeout to
|
||||
// make this near-instant, then restore the original after election.
|
||||
fastResume := option.SingleMaster && option.RaftResumeState && !s.raftServer.IsLogEmpty()
|
||||
if fastResume {
|
||||
s.raftServer.SetElectionTimeout(time.Millisecond)
|
||||
}
|
||||
|
||||
if err := s.raftServer.Start(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if fastResume {
|
||||
go func() {
|
||||
defer s.raftServer.SetElectionTimeout(option.ElectionTimeout)
|
||||
ticker := time.NewTicker(100 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
timeout := time.After(option.ElectionTimeout)
|
||||
for s.raftServer.Leader() == "" {
|
||||
select {
|
||||
case <-timeout:
|
||||
glog.Warningf("Fast resume timed out waiting for leader election, restoring election timeout to %v", option.ElectionTimeout)
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
glog.V(0).Infof("Resumed as leader with election timeout restored to %v", option.ElectionTimeout)
|
||||
}()
|
||||
}
|
||||
|
||||
for name, peer := range s.peers {
|
||||
if err := s.raftServer.AddPeer(name, peer.ToGrpcAddress()); err != nil {
|
||||
return nil, err
|
||||
@@ -273,3 +302,4 @@ func (s *RaftServer) DoJoinCommand() {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user