iceberg: validate filer failover targets (#8637)
* iceberg: validate filer failover targets
* iceberg: tighten filer liveness checks
* iceberg: relax filer test readiness deadline
This commit is contained in:
@@ -153,8 +153,23 @@ func (f *fakeFilerServer) DeleteEntry(_ context.Context, req *filer_pb.DeleteEnt
|
|||||||
return &filer_pb.DeleteEntryResponse{}, nil
|
return &filer_pb.DeleteEntryResponse{}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f *fakeFilerServer) Ping(_ context.Context, _ *filer_pb.PingRequest) (*filer_pb.PingResponse, error) {
|
||||||
|
now := time.Now().UnixNano()
|
||||||
|
return &filer_pb.PingResponse{
|
||||||
|
StartTimeNs: now,
|
||||||
|
RemoteTimeNs: now,
|
||||||
|
StopTimeNs: now,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
// startFakeFiler starts a gRPC server and returns a connected client.
|
// startFakeFiler starts a gRPC server and returns a connected client.
|
||||||
func startFakeFiler(t *testing.T) (*fakeFilerServer, filer_pb.SeaweedFilerClient) {
|
func startFakeFiler(t *testing.T) (*fakeFilerServer, filer_pb.SeaweedFilerClient) {
|
||||||
|
t.Helper()
|
||||||
|
fakeServer, client, _ := startFakeFilerWithAddress(t)
|
||||||
|
return fakeServer, client
|
||||||
|
}
|
||||||
|
|
||||||
|
func startFakeFilerWithAddress(t *testing.T) (*fakeFilerServer, filer_pb.SeaweedFilerClient, string) {
|
||||||
t.Helper()
|
t.Helper()
|
||||||
fakeServer := newFakeFilerServer()
|
fakeServer := newFakeFilerServer()
|
||||||
|
|
||||||
@@ -175,7 +190,26 @@ func startFakeFiler(t *testing.T) (*fakeFilerServer, filer_pb.SeaweedFilerClient
|
|||||||
}
|
}
|
||||||
t.Cleanup(func() { conn.Close() })
|
t.Cleanup(func() { conn.Close() })
|
||||||
|
|
||||||
return fakeServer, filer_pb.NewSeaweedFilerClient(conn)
|
client := filer_pb.NewSeaweedFilerClient(conn)
|
||||||
|
deadline := time.Now().Add(5 * time.Second)
|
||||||
|
for {
|
||||||
|
pingCtx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
|
||||||
|
_, err := client.Ping(pingCtx, &filer_pb.PingRequest{})
|
||||||
|
cancel()
|
||||||
|
if err == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
t.Fatalf("filer not ready: %v", err)
|
||||||
|
}
|
||||||
|
code := status.Code(err)
|
||||||
|
if code != codes.Unavailable && code != codes.DeadlineExceeded && code != codes.Canceled {
|
||||||
|
t.Fatalf("unexpected filer readiness error: %v", err)
|
||||||
|
}
|
||||||
|
time.Sleep(10 * time.Millisecond)
|
||||||
|
}
|
||||||
|
|
||||||
|
return fakeServer, client, listener.Addr().String()
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@@ -860,6 +894,44 @@ func TestDetectWithFilters(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestConnectToFilerSkipsUnreachableAddresses(t *testing.T) {
|
||||||
|
handler := NewHandler(grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||||
|
_, _, liveAddr := startFakeFilerWithAddress(t)
|
||||||
|
|
||||||
|
deadListener, err := net.Listen("tcp", "127.0.0.1:0")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("listen for dead address: %v", err)
|
||||||
|
}
|
||||||
|
deadAddr := deadListener.Addr().String()
|
||||||
|
_ = deadListener.Close()
|
||||||
|
|
||||||
|
addr, conn, err := handler.connectToFiler(context.Background(), []string{deadAddr, liveAddr})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("connectToFiler failed: %v", err)
|
||||||
|
}
|
||||||
|
defer conn.Close()
|
||||||
|
|
||||||
|
if addr != liveAddr {
|
||||||
|
t.Fatalf("expected live address %q, got %q", liveAddr, addr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConnectToFilerFailsWhenAllAddressesAreUnreachable(t *testing.T) {
|
||||||
|
handler := NewHandler(grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||||
|
|
||||||
|
deadListener, err := net.Listen("tcp", "127.0.0.1:0")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("listen for dead address: %v", err)
|
||||||
|
}
|
||||||
|
deadAddr := deadListener.Addr().String()
|
||||||
|
_ = deadListener.Close()
|
||||||
|
|
||||||
|
_, _, err = handler.connectToFiler(context.Background(), []string{deadAddr})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected connectToFiler to fail")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestStalePlanGuard(t *testing.T) {
|
func TestStalePlanGuard(t *testing.T) {
|
||||||
fs, client := startFakeFiler(t)
|
fs, client := startFakeFiler(t)
|
||||||
|
|
||||||
|
|||||||
@@ -5,8 +5,10 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"path"
|
"path"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/seaweedfs/seaweedfs/weed/glog"
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
||||||
|
"github.com/seaweedfs/seaweedfs/weed/pb"
|
||||||
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
|
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
|
||||||
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
|
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
|
||||||
pluginworker "github.com/seaweedfs/seaweedfs/weed/plugin/worker"
|
pluginworker "github.com/seaweedfs/seaweedfs/weed/plugin/worker"
|
||||||
@@ -31,6 +33,8 @@ type Handler struct {
|
|||||||
grpcDialOption grpc.DialOption
|
grpcDialOption grpc.DialOption
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// filerConnectTimeout is the deadline dialFiler applies, via
// context.WithTimeout, to the combined gRPC dial and Ping of one filer address.
const filerConnectTimeout = 5 * time.Second
|
||||||
|
|
||||||
// NewHandler creates a new handler for iceberg table maintenance.
|
// NewHandler creates a new handler for iceberg table maintenance.
|
||||||
func NewHandler(grpcDialOption grpc.DialOption) *Handler {
|
func NewHandler(grpcDialOption grpc.DialOption) *Handler {
|
||||||
return &Handler{grpcDialOption: grpcDialOption}
|
return &Handler{grpcDialOption: grpcDialOption}
|
||||||
@@ -205,7 +209,7 @@ func (h *Handler) Descriptor() *plugin_pb.JobTypeDescriptor {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
DefaultValues: map[string]*plugin_pb.ConfigValue{
|
DefaultValues: map[string]*plugin_pb.ConfigValue{
|
||||||
"target_file_size_mb": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultTargetFileSizeMB}},
|
"target_file_size_mb": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultTargetFileSizeMB}},
|
||||||
"min_input_files": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultMinInputFiles}},
|
"min_input_files": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultMinInputFiles}},
|
||||||
"min_manifests_to_rewrite": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultMinManifestsToRewrite}},
|
"min_manifests_to_rewrite": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultMinManifestsToRewrite}},
|
||||||
"snapshot_retention_hours": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultSnapshotRetentionHours}},
|
"snapshot_retention_hours": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultSnapshotRetentionHours}},
|
||||||
@@ -227,7 +231,7 @@ func (h *Handler) Descriptor() *plugin_pb.JobTypeDescriptor {
|
|||||||
JobTypeMaxRuntimeSeconds: 3600, // 1 hour max
|
JobTypeMaxRuntimeSeconds: 3600, // 1 hour max
|
||||||
},
|
},
|
||||||
WorkerDefaultValues: map[string]*plugin_pb.ConfigValue{
|
WorkerDefaultValues: map[string]*plugin_pb.ConfigValue{
|
||||||
"target_file_size_mb": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultTargetFileSizeMB}},
|
"target_file_size_mb": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultTargetFileSizeMB}},
|
||||||
"min_input_files": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultMinInputFiles}},
|
"min_input_files": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultMinInputFiles}},
|
||||||
"snapshot_retention_hours": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultSnapshotRetentionHours}},
|
"snapshot_retention_hours": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultSnapshotRetentionHours}},
|
||||||
"max_snapshots_to_keep": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultMaxSnapshotsToKeep}},
|
"max_snapshots_to_keep": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultMaxSnapshotsToKeep}},
|
||||||
@@ -272,7 +276,7 @@ func (h *Handler) Detect(ctx context.Context, request *plugin_pb.RunDetectionReq
|
|||||||
tableFilter := strings.TrimSpace(readStringConfig(request.GetAdminConfigValues(), "table_filter", ""))
|
tableFilter := strings.TrimSpace(readStringConfig(request.GetAdminConfigValues(), "table_filter", ""))
|
||||||
|
|
||||||
// Connect to filer — try each address until one succeeds.
|
// Connect to filer — try each address until one succeeds.
|
||||||
filerAddress, conn, err := h.connectToFiler(filerAddresses)
|
filerAddress, conn, err := h.connectToFiler(ctx, filerAddresses)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("connect to filer: %w", err)
|
return fmt.Errorf("connect to filer: %w", err)
|
||||||
}
|
}
|
||||||
@@ -382,7 +386,7 @@ func (h *Handler) Execute(ctx context.Context, request *plugin_pb.ExecuteJobRequ
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Connect to filer
|
// Connect to filer
|
||||||
conn, err := grpc.NewClient(filerAddress, h.grpcDialOption)
|
conn, err := h.dialFiler(ctx, filerAddress)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("connect to filer %s: %w", filerAddress, err)
|
return fmt.Errorf("connect to filer %s: %w", filerAddress, err)
|
||||||
}
|
}
|
||||||
@@ -488,13 +492,30 @@ func (h *Handler) sendEmptyDetection(sender pluginworker.DetectionSender) error
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *Handler) dialFiler(ctx context.Context, address string) (*grpc.ClientConn, error) {
|
||||||
|
opCtx, opCancel := context.WithTimeout(ctx, filerConnectTimeout)
|
||||||
|
defer opCancel()
|
||||||
|
|
||||||
|
conn, err := pb.GrpcDial(opCtx, address, false, h.grpcDialOption)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
client := filer_pb.NewSeaweedFilerClient(conn)
|
||||||
|
if _, err := client.Ping(opCtx, &filer_pb.PingRequest{}); err != nil {
|
||||||
|
_ = conn.Close()
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return conn, nil
|
||||||
|
}
|
||||||
|
|
||||||
// connectToFiler tries each filer address in order and returns the first
|
// connectToFiler tries each filer address in order and returns the first
|
||||||
// successful gRPC connection. If all addresses fail, it returns a
|
// address whose gRPC connection and Ping request succeed.
|
||||||
// consolidated error.
|
func (h *Handler) connectToFiler(ctx context.Context, addresses []string) (string, *grpc.ClientConn, error) {
|
||||||
func (h *Handler) connectToFiler(addresses []string) (string, *grpc.ClientConn, error) {
|
|
||||||
var lastErr error
|
var lastErr error
|
||||||
for _, addr := range addresses {
|
for _, addr := range addresses {
|
||||||
conn, err := grpc.NewClient(addr, h.grpcDialOption)
|
conn, err := h.dialFiler(ctx, addr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
lastErr = fmt.Errorf("filer %s: %w", addr, err)
|
lastErr = fmt.Errorf("filer %s: %w", addr, err)
|
||||||
continue
|
continue
|
||||||
|
|||||||
Reference in New Issue
Block a user