Add cluster.raft.leader.transfer command for graceful leader change (#7819)

* proto: add RaftLeadershipTransfer RPC for graceful leader change

Add new gRPC RPC and messages for leadership transfer:
- RaftLeadershipTransferRequest: optional target_id and target_address
- RaftLeadershipTransferResponse: previous_leader and new_leader

This enables graceful leadership transfer before master maintenance,
reducing errors in filers during planned maintenance windows.

Ref: https://github.com/seaweedfs/seaweedfs/issues/7527

* proto: regenerate Go files for RaftLeadershipTransfer

Generated from master.proto changes.

* master: implement RaftLeadershipTransfer gRPC handler

Add gRPC handler for leadership transfer with support for:
- Transfer to any eligible follower (when target_id is empty)
- Transfer to a specific server (when target_id and target_address are provided)

Uses hashicorp/raft LeadershipTransfer() and LeadershipTransferToServer() APIs.

Returns the previous and new leader in the response.

* shell: add cluster.raft.leader.transfer command

Add weed shell command for graceful leadership transfer:
- Displays current cluster status before transfer
- Supports auto-selection of target (any eligible follower)
- Supports targeted transfer with -id and -address flags
- Provides clear feedback on success/failure with troubleshooting tips

Usage:
  cluster.raft.leader.transfer
  cluster.raft.leader.transfer -id <server_id> -address <grpc_address>

* master: add unit tests for raft gRPC handlers

Add tests covering:
- RaftLeadershipTransfer with no raft initialized
- RaftLeadershipTransfer with target_id but no address
- RaftListClusterServers with no raft initialized
- RaftAddServer with no raft initialized
- RaftRemoveServer with no raft initialized

These tests verify error handling when raft is not configured.

* shell: add tests for cluster.raft.leader.transfer command

Add tests covering:
- Command name and help text validation
- HasTag returns false for ResourceHeavy
- Validation of -id without -address
- Argument parsing with unknown flags

* master: clarify that leadership transfer requires -raftHashicorp

The default raft implementation (seaweedfs/raft, a goraft fork) does not
support graceful leadership transfer. This feature is only available when
using hashicorp raft (-raftHashicorp=true).

Update error messages and help text to make this requirement clear:
- gRPC handler returns specific error for goraft users
- Shell command help text notes the requirement
- Added test for goraft case

* test: use strings.Contains instead of custom helper

Replace custom contains/containsHelper functions with the standard
library strings.Contains for better maintainability.

* shell: return flag parsing errors instead of swallowing them

- Return the error from flag.Parse() instead of returning nil
- Update test to explicitly assert error for unknown flags

* test: document integration test scenarios for Raft leadership transfer

Add comments explaining:
- Why these unit tests only cover 'Raft not initialized' scenarios
- What integration tests should cover (with multi-master cluster)
- hashicorp/raft uses concrete types that cannot be easily mocked

* fix: address reviewer feedback on tests and leader routing

- Remove misleading tests that couldn't properly validate their
  documented behavior without a real Raft cluster:
  - TestRaftLeadershipTransfer_GoraftNotSupported
  - TestRaftLeadershipTransfer_ValidationTargetIdWithoutAddress

- Change WithClient(false) to WithClient(true) for RaftLeadershipTransfer
  RPC to ensure the request is routed to the current leader

* Improve cluster.raft.transferLeader command

- Rename command from cluster.raft.leader.transfer to cluster.raft.transferLeader
- Add symmetric validation: -id and -address must be specified together
- Handle case where same leader is re-elected after transfer
- Add test for -address without -id validation
- Add docker compose file for 5-master raft cluster testing
This commit is contained in:
Chris Lu
2025-12-19 00:15:39 -08:00
committed by GitHub
parent 134fd6a1ae
commit f4cdfcc5fd
7 changed files with 621 additions and 67 deletions

View File

@@ -41,6 +41,7 @@ const (
Seaweed_RaftListClusterServers_FullMethodName = "/master_pb.Seaweed/RaftListClusterServers"
Seaweed_RaftAddServer_FullMethodName = "/master_pb.Seaweed/RaftAddServer"
Seaweed_RaftRemoveServer_FullMethodName = "/master_pb.Seaweed/RaftRemoveServer"
Seaweed_RaftLeadershipTransfer_FullMethodName = "/master_pb.Seaweed/RaftLeadershipTransfer"
Seaweed_VolumeGrow_FullMethodName = "/master_pb.Seaweed/VolumeGrow"
)
@@ -70,6 +71,7 @@ type SeaweedClient interface {
RaftListClusterServers(ctx context.Context, in *RaftListClusterServersRequest, opts ...grpc.CallOption) (*RaftListClusterServersResponse, error)
RaftAddServer(ctx context.Context, in *RaftAddServerRequest, opts ...grpc.CallOption) (*RaftAddServerResponse, error)
RaftRemoveServer(ctx context.Context, in *RaftRemoveServerRequest, opts ...grpc.CallOption) (*RaftRemoveServerResponse, error)
RaftLeadershipTransfer(ctx context.Context, in *RaftLeadershipTransferRequest, opts ...grpc.CallOption) (*RaftLeadershipTransferResponse, error)
VolumeGrow(ctx context.Context, in *VolumeGrowRequest, opts ...grpc.CallOption) (*VolumeGrowResponse, error)
}
@@ -310,6 +312,16 @@ func (c *seaweedClient) RaftRemoveServer(ctx context.Context, in *RaftRemoveServ
return out, nil
}
// RaftLeadershipTransfer performs the unary RaftLeadershipTransfer RPC over
// the client's connection and returns the decoded response.
//
// grpc.StaticMethod() is placed ahead of the caller-supplied opts, so the
// caller's options follow it in the final option list. On any Invoke error
// the response is discarded and only the error is returned.
func (c *seaweedClient) RaftLeadershipTransfer(ctx context.Context, in *RaftLeadershipTransferRequest, opts ...grpc.CallOption) (*RaftLeadershipTransferResponse, error) {
	callOpts := make([]grpc.CallOption, 0, len(opts)+1)
	callOpts = append(callOpts, grpc.StaticMethod())
	callOpts = append(callOpts, opts...)

	resp := &RaftLeadershipTransferResponse{}
	if err := c.cc.Invoke(ctx, Seaweed_RaftLeadershipTransfer_FullMethodName, in, resp, callOpts...); err != nil {
		return nil, err
	}
	return resp, nil
}
func (c *seaweedClient) VolumeGrow(ctx context.Context, in *VolumeGrowRequest, opts ...grpc.CallOption) (*VolumeGrowResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(VolumeGrowResponse)
@@ -346,6 +358,7 @@ type SeaweedServer interface {
RaftListClusterServers(context.Context, *RaftListClusterServersRequest) (*RaftListClusterServersResponse, error)
RaftAddServer(context.Context, *RaftAddServerRequest) (*RaftAddServerResponse, error)
RaftRemoveServer(context.Context, *RaftRemoveServerRequest) (*RaftRemoveServerResponse, error)
RaftLeadershipTransfer(context.Context, *RaftLeadershipTransferRequest) (*RaftLeadershipTransferResponse, error)
VolumeGrow(context.Context, *VolumeGrowRequest) (*VolumeGrowResponse, error)
mustEmbedUnimplementedSeaweedServer()
}
@@ -423,6 +436,9 @@ func (UnimplementedSeaweedServer) RaftAddServer(context.Context, *RaftAddServerR
// RaftRemoveServer is the placeholder implementation: it ignores its
// arguments and always fails with a codes.Unimplemented status error.
func (UnimplementedSeaweedServer) RaftRemoveServer(context.Context, *RaftRemoveServerRequest) (*RaftRemoveServerResponse, error) {
	// The message is a constant with no format verbs, so no formatting is needed.
	err := status.Error(codes.Unimplemented, "method RaftRemoveServer not implemented")
	return nil, err
}
// RaftLeadershipTransfer is the placeholder implementation: it ignores its
// arguments and always fails with a codes.Unimplemented status error.
func (UnimplementedSeaweedServer) RaftLeadershipTransfer(context.Context, *RaftLeadershipTransferRequest) (*RaftLeadershipTransferResponse, error) {
	// The message is a constant with no format verbs, so no formatting is needed.
	err := status.Error(codes.Unimplemented, "method RaftLeadershipTransfer not implemented")
	return nil, err
}
// VolumeGrow is the placeholder implementation: it ignores its arguments and
// always fails with a codes.Unimplemented status error.
func (UnimplementedSeaweedServer) VolumeGrow(context.Context, *VolumeGrowRequest) (*VolumeGrowResponse, error) {
	// The message is a constant with no format verbs, so no formatting is needed.
	err := status.Error(codes.Unimplemented, "method VolumeGrow not implemented")
	return nil, err
}
@@ -810,6 +826,24 @@ func _Seaweed_RaftRemoveServer_Handler(srv interface{}, ctx context.Context, dec
return interceptor(ctx, in, info, handler)
}
// _Seaweed_RaftLeadershipTransfer_Handler is the server-side entry point for
// the RaftLeadershipTransfer RPC.
//
// It decodes the incoming message into a RaftLeadershipTransferRequest via
// dec, returning the decode error unchanged on failure. When an interceptor
// is supplied, the call is routed through it together with the method's
// UnaryServerInfo; otherwise the SeaweedServer implementation is invoked
// directly.
func _Seaweed_RaftLeadershipTransfer_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	request := &RaftLeadershipTransferRequest{}
	if decodeErr := dec(request); decodeErr != nil {
		return nil, decodeErr
	}

	if interceptor != nil {
		// srv is asserted to SeaweedServer only when the interceptor actually
		// calls this function, not up front.
		invoke := func(ctx context.Context, msg interface{}) (interface{}, error) {
			return srv.(SeaweedServer).RaftLeadershipTransfer(ctx, msg.(*RaftLeadershipTransferRequest))
		}
		methodInfo := &grpc.UnaryServerInfo{
			Server:     srv,
			FullMethod: Seaweed_RaftLeadershipTransfer_FullMethodName,
		}
		return interceptor(ctx, request, methodInfo, invoke)
	}

	return srv.(SeaweedServer).RaftLeadershipTransfer(ctx, request)
}
func _Seaweed_VolumeGrow_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(VolumeGrowRequest)
if err := dec(in); err != nil {
@@ -911,6 +945,10 @@ var Seaweed_ServiceDesc = grpc.ServiceDesc{
MethodName: "RaftRemoveServer",
Handler: _Seaweed_RaftRemoveServer_Handler,
},
{
MethodName: "RaftLeadershipTransfer",
Handler: _Seaweed_RaftLeadershipTransfer_Handler,
},
{
MethodName: "VolumeGrow",
Handler: _Seaweed_VolumeGrow_Handler,