* proto: add RaftLeadershipTransfer RPC for forced leader change Add new gRPC RPC and messages for leadership transfer: - RaftLeadershipTransferRequest: optional target_id and target_address - RaftLeadershipTransferResponse: previous_leader and new_leader This enables graceful leadership transfer before master maintenance, reducing errors in filers during planned maintenance windows. Ref: https://github.com/seaweedfs/seaweedfs/issues/7527 * proto: regenerate Go files for RaftLeadershipTransfer Generated from master.proto changes. * master: implement RaftLeadershipTransfer gRPC handler Add gRPC handler for leadership transfer with support for: - Transfer to any eligible follower (when target_id is empty) - Transfer to a specific server (when target_id and target_address are provided) Uses hashicorp/raft LeadershipTransfer() and LeadershipTransferToServer() APIs. Returns the previous and new leader in the response. * shell: add cluster.raft.leader.transfer command Add weed shell command for graceful leadership transfer: - Displays current cluster status before transfer - Supports auto-selection of target (any eligible follower) - Supports targeted transfer with -id and -address flags - Provides clear feedback on success/failure with troubleshooting tips Usage: cluster.raft.leader.transfer cluster.raft.leader.transfer -id <server_id> -address <grpc_address> * master: add unit tests for raft gRPC handlers Add tests covering: - RaftLeadershipTransfer with no raft initialized - RaftLeadershipTransfer with target_id but no address - RaftListClusterServers with no raft initialized - RaftAddServer with no raft initialized - RaftRemoveServer with no raft initialized These tests verify error handling when raft is not configured. * shell: add tests for cluster.raft.leader.transfer command Add tests covering: - Command name and help text validation - HasTag returns false for ResourceHeavy - Validation of -id without -address - Argument parsing with unknown flags * master: clarify that leadership transfer requires -raftHashicorp The default raft implementation (seaweedfs/raft, a goraft fork) does not support graceful leadership transfer. This feature is only available when using hashicorp raft (-raftHashicorp=true). Update error messages and help text to make this requirement clear: - gRPC handler returns specific error for goraft users - Shell command help text notes the requirement - Added test for goraft case * test: use strings.Contains instead of custom helper Replace custom contains/containsHelper functions with the standard library strings.Contains for better maintainability. * shell: return flag parsing errors instead of swallowing them - Return the error from flag.Parse() instead of returning nil - Update test to explicitly assert error for unknown flags * test: document integration test scenarios for Raft leadership transfer Add comments explaining: - Why these unit tests only cover 'Raft not initialized' scenarios - What integration tests should cover (with multi-master cluster) - hashicorp/raft uses concrete types that cannot be easily mocked * fix: address reviewer feedback on tests and leader routing - Remove misleading tests that couldn't properly validate their documented behavior without a real Raft cluster: - TestRaftLeadershipTransfer_GoraftNotSupported - TestRaftLeadershipTransfer_ValidationTargetIdWithoutAddress - Change WithClient(false) to WithClient(true) for RaftLeadershipTransfer RPC to ensure the request is routed to the current leader * Improve cluster.raft.transferLeader command - Rename command from cluster.raft.leader.transfer to cluster.raft.transferLeader - Add symmetric validation: -id and -address must be specified together - Handle case where same leader is re-elected after transfer - Add test for -address without -id validation - Add docker compose file for 5-master raft cluster testing
140 lines
4.4 KiB
Go
140 lines
4.4 KiB
Go
package weed_server
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
|
|
"github.com/hashicorp/raft"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/cluster"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
|
)
|
|
|
|
func (ms *MasterServer) RaftListClusterServers(ctx context.Context, req *master_pb.RaftListClusterServersRequest) (*master_pb.RaftListClusterServersResponse, error) {
|
|
resp := &master_pb.RaftListClusterServersResponse{}
|
|
|
|
ms.Topo.RaftServerAccessLock.RLock()
|
|
if ms.Topo.HashicorpRaft == nil {
|
|
ms.Topo.RaftServerAccessLock.RUnlock()
|
|
return resp, nil
|
|
}
|
|
|
|
servers := ms.Topo.HashicorpRaft.GetConfiguration().Configuration().Servers
|
|
_, leaderId := ms.Topo.HashicorpRaft.LeaderWithID()
|
|
ms.Topo.RaftServerAccessLock.RUnlock()
|
|
|
|
for _, server := range servers {
|
|
resp.ClusterServers = append(resp.ClusterServers, &master_pb.RaftListClusterServersResponse_ClusterServers{
|
|
Id: string(server.ID),
|
|
Address: string(server.Address),
|
|
Suffrage: server.Suffrage.String(),
|
|
IsLeader: server.ID == leaderId,
|
|
})
|
|
}
|
|
return resp, nil
|
|
}
|
|
|
|
func (ms *MasterServer) RaftAddServer(ctx context.Context, req *master_pb.RaftAddServerRequest) (*master_pb.RaftAddServerResponse, error) {
|
|
resp := &master_pb.RaftAddServerResponse{}
|
|
|
|
ms.Topo.RaftServerAccessLock.RLock()
|
|
defer ms.Topo.RaftServerAccessLock.RUnlock()
|
|
|
|
if ms.Topo.HashicorpRaft == nil {
|
|
return resp, nil
|
|
}
|
|
|
|
if ms.Topo.HashicorpRaft.State() != raft.Leader {
|
|
return nil, fmt.Errorf("raft add server %s failed: %s is no current leader", req.Id, ms.Topo.HashicorpRaft.String())
|
|
}
|
|
|
|
var idxFuture raft.IndexFuture
|
|
if req.Voter {
|
|
idxFuture = ms.Topo.HashicorpRaft.AddVoter(raft.ServerID(req.Id), raft.ServerAddress(req.Address), 0, 0)
|
|
} else {
|
|
idxFuture = ms.Topo.HashicorpRaft.AddNonvoter(raft.ServerID(req.Id), raft.ServerAddress(req.Address), 0, 0)
|
|
}
|
|
|
|
if err := idxFuture.Error(); err != nil {
|
|
return nil, err
|
|
}
|
|
return resp, nil
|
|
}
|
|
|
|
func (ms *MasterServer) RaftRemoveServer(ctx context.Context, req *master_pb.RaftRemoveServerRequest) (*master_pb.RaftRemoveServerResponse, error) {
|
|
resp := &master_pb.RaftRemoveServerResponse{}
|
|
|
|
ms.Topo.RaftServerAccessLock.RLock()
|
|
defer ms.Topo.RaftServerAccessLock.RUnlock()
|
|
|
|
if ms.Topo.HashicorpRaft == nil {
|
|
return resp, nil
|
|
}
|
|
|
|
if ms.Topo.HashicorpRaft.State() != raft.Leader {
|
|
return nil, fmt.Errorf("raft remove server %s failed: %s is no current leader", req.Id, ms.Topo.HashicorpRaft.String())
|
|
}
|
|
|
|
if !req.Force {
|
|
ms.clientChansLock.RLock()
|
|
_, ok := ms.clientChans[fmt.Sprintf("%s@%s", cluster.MasterType, req.Id)]
|
|
ms.clientChansLock.RUnlock()
|
|
if ok {
|
|
return resp, fmt.Errorf("raft remove server %s failed: client connection to master exists", req.Id)
|
|
}
|
|
}
|
|
|
|
idxFuture := ms.Topo.HashicorpRaft.RemoveServer(raft.ServerID(req.Id), 0, 0)
|
|
if err := idxFuture.Error(); err != nil {
|
|
return nil, err
|
|
}
|
|
return resp, nil
|
|
}
|
|
|
|
func (ms *MasterServer) RaftLeadershipTransfer(ctx context.Context, req *master_pb.RaftLeadershipTransferRequest) (*master_pb.RaftLeadershipTransferResponse, error) {
|
|
resp := &master_pb.RaftLeadershipTransferResponse{}
|
|
|
|
ms.Topo.RaftServerAccessLock.RLock()
|
|
defer ms.Topo.RaftServerAccessLock.RUnlock()
|
|
|
|
// Leadership transfer is only supported with hashicorp raft (-raftHashicorp=true)
|
|
// The default seaweedfs/raft (goraft) implementation does not support this feature
|
|
if ms.Topo.HashicorpRaft == nil {
|
|
if ms.Topo.RaftServer != nil {
|
|
return nil, fmt.Errorf("leadership transfer requires -raftHashicorp=true; the default raft implementation does not support this feature")
|
|
}
|
|
return nil, fmt.Errorf("raft not initialized (single master mode)")
|
|
}
|
|
|
|
if ms.Topo.HashicorpRaft.State() != raft.Leader {
|
|
leaderAddr, _ := ms.Topo.HashicorpRaft.LeaderWithID()
|
|
return nil, fmt.Errorf("this server is not the leader; current leader is %s", leaderAddr)
|
|
}
|
|
|
|
// Record previous leader
|
|
_, previousLeaderId := ms.Topo.HashicorpRaft.LeaderWithID()
|
|
resp.PreviousLeader = string(previousLeaderId)
|
|
|
|
var future raft.Future
|
|
if req.TargetId != "" && req.TargetAddress != "" {
|
|
// Transfer to specific server
|
|
future = ms.Topo.HashicorpRaft.LeadershipTransferToServer(
|
|
raft.ServerID(req.TargetId),
|
|
raft.ServerAddress(req.TargetAddress),
|
|
)
|
|
} else {
|
|
// Transfer to any eligible follower
|
|
future = ms.Topo.HashicorpRaft.LeadershipTransfer()
|
|
}
|
|
|
|
if err := future.Error(); err != nil {
|
|
return nil, fmt.Errorf("leadership transfer failed: %v", err)
|
|
}
|
|
|
|
// Get new leader info
|
|
_, newLeaderId := ms.Topo.HashicorpRaft.LeaderWithID()
|
|
resp.NewLeader = string(newLeaderId)
|
|
|
|
return resp, nil
|
|
}
|