Add plugin worker integration tests for erasure coding (#8450)
* test: add plugin worker integration harness
* test: add erasure coding detection integration tests
* test: add erasure coding execution integration tests
* ci: add plugin worker integration workflow
* test: extend fake volume server for vacuum and balance
* test: expand erasure coding detection topologies
* test: add large erasure coding detection topology
* test: add vacuum plugin worker integration tests
* test: add volume balance plugin worker integration tests
* ci: run plugin worker tests per worker
* fixes
* erasure coding: stop after placement failures
* erasure coding: record hasMore when early stopping
* erasure coding: relax large topology expectations
This commit is contained in:
285
test/plugin_workers/erasure_coding/detection_test.go
Normal file
285
test/plugin_workers/erasure_coding/detection_test.go
Normal file
@@ -0,0 +1,285 @@
|
||||
package erasure_coding_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
pluginworkers "github.com/seaweedfs/seaweedfs/test/plugin_workers"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
|
||||
pluginworker "github.com/seaweedfs/seaweedfs/weed/plugin/worker"
|
||||
ecstorage "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
|
||||
"github.com/stretchr/testify/require"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
"google.golang.org/protobuf/proto"
|
||||
)
|
||||
|
||||
// topologySpec describes a synthetic cluster layout used to fabricate a fake
// master VolumeListResponse for erasure coding detection tests.
// Zero values for diskTypes, replicas, and collection are replaced with
// defaults by buildVolumeListResponse.
type topologySpec struct {
	name         string   // human-readable label for the topology
	dataCenters  int      // number of data centers to generate
	racksPerDC   int      // racks generated inside each data center
	nodesPerRack int      // volume server nodes generated inside each rack
	diskTypes    []string // disk types cycled across nodes (defaults to ["hdd"])
	replicas     int      // copies of the single test volume to place (defaults to 1)
	collection   string   // collection name of the test volume (defaults to "ec-test")
}
|
||||
|
||||
// detectionCase pairs a synthetic topology with the expected erasure coding
// detection outcome for that layout.
type detectionCase struct {
	name                  string       // sub-test name
	topology              topologySpec // cluster layout served by the fake master
	adminCollectionFilter string       // optional collection filter pushed through admin job-type config ("" = none)
	expectProposals       bool         // whether detection is expected to emit proposals
}
|
||||
|
||||
// TestErasureCodingDetectionAcrossTopologies drives erasure coding detection
// against a fake master serving a range of synthetic cluster topologies and
// checks whether proposals are (or are not) produced for each layout.
// Each topology contains a single volume (ID 7) that is large and old enough
// to be EC-eligible, as built by buildVolumeListResponse.
func TestErasureCodingDetectionAcrossTopologies(t *testing.T) {
	cases := []detectionCase{
		{
			name: "single-dc-multi-rack",
			topology: topologySpec{
				name:         "single-dc-multi-rack",
				dataCenters:  1,
				racksPerDC:   2,
				nodesPerRack: 7,
				diskTypes:    []string{"hdd"},
				replicas:     1,
				collection:   "ec-test",
			},
			expectProposals: true,
		},
		{
			name: "multi-dc",
			topology: topologySpec{
				name:         "multi-dc",
				dataCenters:  2,
				racksPerDC:   1,
				nodesPerRack: 7,
				diskTypes:    []string{"hdd"},
				replicas:     1,
				collection:   "ec-test",
			},
			expectProposals: true,
		},
		{
			name: "multi-dc-multi-rack",
			topology: topologySpec{
				name:         "multi-dc-multi-rack",
				dataCenters:  2,
				racksPerDC:   2,
				nodesPerRack: 4,
				diskTypes:    []string{"hdd"},
				replicas:     1,
				collection:   "ec-test",
			},
			expectProposals: true,
		},
		{
			name: "mixed-disk-types",
			topology: topologySpec{
				name:         "mixed-disk-types",
				dataCenters:  1,
				racksPerDC:   2,
				nodesPerRack: 7,
				diskTypes:    []string{"hdd", "ssd"},
				replicas:     1,
				collection:   "ec-test",
			},
			expectProposals: true,
		},
		{
			name: "multi-replica-volume",
			topology: topologySpec{
				name:         "multi-replica-volume",
				dataCenters:  1,
				racksPerDC:   2,
				nodesPerRack: 7,
				diskTypes:    []string{"hdd"},
				replicas:     3,
				collection:   "ec-test",
			},
			expectProposals: true,
		},
		{
			// Filter matches the volume's collection, so detection still fires.
			name: "collection-filter-match",
			topology: topologySpec{
				name:         "collection-filter-match",
				dataCenters:  1,
				racksPerDC:   2,
				nodesPerRack: 7,
				diskTypes:    []string{"hdd"},
				replicas:     1,
				collection:   "filtered",
			},
			adminCollectionFilter: "filtered",
			expectProposals:       true,
		},
		{
			// Filter excludes the volume's collection, so no proposals.
			name: "collection-filter-mismatch",
			topology: topologySpec{
				name:         "collection-filter-mismatch",
				dataCenters:  1,
				racksPerDC:   2,
				nodesPerRack: 7,
				diskTypes:    []string{"hdd"},
				replicas:     1,
				collection:   "filtered",
			},
			adminCollectionFilter: "other",
			expectProposals:       false,
		},
		{
			// Too few disks to place all EC shards; no proposals expected.
			name: "insufficient-disks",
			topology: topologySpec{
				name:         "insufficient-disks",
				dataCenters:  1,
				racksPerDC:   1,
				nodesPerRack: 2,
				diskTypes:    []string{"hdd"},
				replicas:     1,
				collection:   "ec-test",
			},
			expectProposals: false,
		},
	}

	for _, tc := range cases {
		tc := tc // capture range variable for the subtest closure (pre-Go 1.22 idiom)
		t.Run(tc.name, func(t *testing.T) {
			// Fake master serving the synthetic topology with one eligible volume.
			volumeID := uint32(7)
			response := buildVolumeListResponse(t, tc.topology, volumeID)
			master := pluginworkers.NewMasterServer(t, response)

			// Stand up the plugin worker harness with the EC handler registered,
			// and wait until the job type is advertised.
			dialOption := grpc.WithTransportCredentials(insecure.NewCredentials())
			handler := pluginworker.NewErasureCodingHandler(dialOption, t.TempDir())
			harness := pluginworkers.NewHarness(t, pluginworkers.HarnessConfig{
				WorkerOptions: pluginworker.WorkerOptions{
					GrpcDialOption: dialOption,
				},
				Handlers: []pluginworker.JobHandler{handler},
			})
			harness.WaitForJobType("erasure_coding")

			// Optionally narrow detection to one collection via admin config.
			if tc.adminCollectionFilter != "" {
				err := harness.Plugin().SaveJobTypeConfig(&plugin_pb.PersistedJobTypeConfig{
					JobType: "erasure_coding",
					AdminConfigValues: map[string]*plugin_pb.ConfigValue{
						"collection_filter": {
							Kind: &plugin_pb.ConfigValue_StringValue{StringValue: tc.adminCollectionFilter},
						},
					},
				})
				require.NoError(t, err)
			}

			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
			defer cancel()

			// Run detection against the fake master, capped at 10 proposals.
			proposals, err := harness.Plugin().RunDetection(ctx, "erasure_coding", &plugin_pb.ClusterContext{
				MasterGrpcAddresses: []string{master.Address()},
			}, 10)
			require.NoError(t, err)

			if !tc.expectProposals {
				require.Empty(t, proposals)
				return
			}

			require.NotEmpty(t, proposals)

			// Decode the embedded task parameters and confirm the plan names
			// at least one source and a target for every EC shard.
			proposal := proposals[0]
			require.Equal(t, "erasure_coding", proposal.JobType)
			paramsValue := proposal.Parameters["task_params_pb"]
			require.NotNil(t, paramsValue)

			params := &worker_pb.TaskParams{}
			require.NoError(t, proto.Unmarshal(paramsValue.GetBytesValue(), params))
			require.NotEmpty(t, params.Sources)
			require.Len(t, params.Targets, ecstorage.TotalShardsCount)
		})
	}
}
|
||||
|
||||
func buildVolumeListResponse(t *testing.T, spec topologySpec, volumeID uint32) *master_pb.VolumeListResponse {
|
||||
t.Helper()
|
||||
|
||||
volumeSizeLimitMB := uint64(100)
|
||||
volumeSize := uint64(90) * 1024 * 1024
|
||||
volumeModifiedAt := time.Now().Add(-10 * time.Minute).Unix()
|
||||
|
||||
diskTypes := spec.diskTypes
|
||||
if len(diskTypes) == 0 {
|
||||
diskTypes = []string{"hdd"}
|
||||
}
|
||||
replicas := spec.replicas
|
||||
if replicas <= 0 {
|
||||
replicas = 1
|
||||
}
|
||||
collection := spec.collection
|
||||
if collection == "" {
|
||||
collection = "ec-test"
|
||||
}
|
||||
|
||||
var dataCenters []*master_pb.DataCenterInfo
|
||||
nodeIndex := 0
|
||||
replicasPlaced := 0
|
||||
|
||||
for dc := 0; dc < spec.dataCenters; dc++ {
|
||||
var racks []*master_pb.RackInfo
|
||||
for rack := 0; rack < spec.racksPerDC; rack++ {
|
||||
var nodes []*master_pb.DataNodeInfo
|
||||
for n := 0; n < spec.nodesPerRack; n++ {
|
||||
nodeIndex++
|
||||
address := fmt.Sprintf("127.0.0.1:%d", 20000+nodeIndex)
|
||||
diskType := diskTypes[(nodeIndex-1)%len(diskTypes)]
|
||||
|
||||
diskInfo := &master_pb.DiskInfo{
|
||||
DiskId: 0,
|
||||
MaxVolumeCount: 100,
|
||||
VolumeCount: 0,
|
||||
VolumeInfos: []*master_pb.VolumeInformationMessage{},
|
||||
}
|
||||
|
||||
if replicasPlaced < replicas {
|
||||
diskInfo.VolumeCount = 1
|
||||
diskInfo.VolumeInfos = append(diskInfo.VolumeInfos, &master_pb.VolumeInformationMessage{
|
||||
Id: volumeID,
|
||||
Collection: collection,
|
||||
DiskId: 0,
|
||||
Size: volumeSize,
|
||||
DeletedByteCount: 0,
|
||||
ModifiedAtSecond: volumeModifiedAt,
|
||||
ReplicaPlacement: 1,
|
||||
ReadOnly: false,
|
||||
})
|
||||
replicasPlaced++
|
||||
}
|
||||
|
||||
nodes = append(nodes, &master_pb.DataNodeInfo{
|
||||
Id: address,
|
||||
Address: address,
|
||||
DiskInfos: map[string]*master_pb.DiskInfo{diskType: diskInfo},
|
||||
})
|
||||
}
|
||||
|
||||
racks = append(racks, &master_pb.RackInfo{
|
||||
Id: fmt.Sprintf("rack-%d", rack+1),
|
||||
DataNodeInfos: nodes,
|
||||
})
|
||||
}
|
||||
|
||||
dataCenters = append(dataCenters, &master_pb.DataCenterInfo{
|
||||
Id: fmt.Sprintf("dc-%d", dc+1),
|
||||
RackInfos: racks,
|
||||
})
|
||||
}
|
||||
|
||||
return &master_pb.VolumeListResponse{
|
||||
VolumeSizeLimitMb: volumeSizeLimitMB,
|
||||
TopologyInfo: &master_pb.TopologyInfo{
|
||||
DataCenterInfos: dataCenters,
|
||||
},
|
||||
}
|
||||
}
|
||||
83
test/plugin_workers/erasure_coding/execution_test.go
Normal file
83
test/plugin_workers/erasure_coding/execution_test.go
Normal file
@@ -0,0 +1,83 @@
|
||||
package erasure_coding_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
pluginworkers "github.com/seaweedfs/seaweedfs/test/plugin_workers"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
|
||||
pluginworker "github.com/seaweedfs/seaweedfs/weed/plugin/worker"
|
||||
ecstorage "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
|
||||
"github.com/stretchr/testify/require"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
)
|
||||
|
||||
func TestErasureCodingExecutionEncodesShards(t *testing.T) {
|
||||
volumeID := uint32(123)
|
||||
datSize := 1 * 1024 * 1024
|
||||
|
||||
dialOption := grpc.WithTransportCredentials(insecure.NewCredentials())
|
||||
handler := pluginworker.NewErasureCodingHandler(dialOption, t.TempDir())
|
||||
harness := pluginworkers.NewHarness(t, pluginworkers.HarnessConfig{
|
||||
WorkerOptions: pluginworker.WorkerOptions{
|
||||
GrpcDialOption: dialOption,
|
||||
},
|
||||
Handlers: []pluginworker.JobHandler{handler},
|
||||
})
|
||||
harness.WaitForJobType("erasure_coding")
|
||||
|
||||
sourceServer := pluginworkers.NewVolumeServer(t, "")
|
||||
pluginworkers.WriteTestVolumeFiles(t, sourceServer.BaseDir(), volumeID, datSize)
|
||||
|
||||
targetServers := make([]*pluginworkers.VolumeServer, 0, ecstorage.TotalShardsCount)
|
||||
targetAddresses := make([]string, 0, ecstorage.TotalShardsCount)
|
||||
for i := 0; i < ecstorage.TotalShardsCount; i++ {
|
||||
target := pluginworkers.NewVolumeServer(t, "")
|
||||
targetServers = append(targetServers, target)
|
||||
targetAddresses = append(targetAddresses, target.Address())
|
||||
}
|
||||
|
||||
job := &plugin_pb.JobSpec{
|
||||
JobId: fmt.Sprintf("ec-job-%d", volumeID),
|
||||
JobType: "erasure_coding",
|
||||
Parameters: map[string]*plugin_pb.ConfigValue{
|
||||
"volume_id": {
|
||||
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: int64(volumeID)},
|
||||
},
|
||||
"collection": {
|
||||
Kind: &plugin_pb.ConfigValue_StringValue{StringValue: "ec-test"},
|
||||
},
|
||||
"source_server": {
|
||||
Kind: &plugin_pb.ConfigValue_StringValue{StringValue: sourceServer.Address()},
|
||||
},
|
||||
"target_servers": {
|
||||
Kind: &plugin_pb.ConfigValue_StringList{StringList: &plugin_pb.StringList{Values: targetAddresses}},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
result, err := harness.Plugin().ExecuteJob(ctx, job, nil, 1)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, result)
|
||||
require.True(t, result.Success)
|
||||
|
||||
require.GreaterOrEqual(t, sourceServer.MarkReadonlyCount(), 1)
|
||||
require.GreaterOrEqual(t, len(sourceServer.DeleteRequests()), 1)
|
||||
|
||||
for shardID := 0; shardID < ecstorage.TotalShardsCount; shardID++ {
|
||||
targetIndex := shardID % len(targetServers)
|
||||
target := targetServers[targetIndex]
|
||||
expected := filepath.Join(target.BaseDir(), fmt.Sprintf("%d.ec%02d", volumeID, shardID))
|
||||
info, err := os.Stat(expected)
|
||||
require.NoErrorf(t, err, "missing shard file %s", expected)
|
||||
require.Greater(t, info.Size(), int64(0))
|
||||
}
|
||||
}
|
||||
123
test/plugin_workers/erasure_coding/large_topology_test.go
Normal file
123
test/plugin_workers/erasure_coding/large_topology_test.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package erasure_coding_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
pluginworkers "github.com/seaweedfs/seaweedfs/test/plugin_workers"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
|
||||
pluginworker "github.com/seaweedfs/seaweedfs/weed/plugin/worker"
|
||||
"github.com/stretchr/testify/require"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
)
|
||||
|
||||
// TestErasureCodingDetectionLargeTopology builds a single-DC topology with
// 100 racks, 1000 servers, and 300 volumes per node (300k volumes total, half
// of them EC-eligible by size) and checks that detection completes within the
// deadline and produces proposals. Detection may legitimately stop before
// covering every eligible volume (the test only logs the shortfall), so just
// a lower bound on the proposal count is asserted.
func TestErasureCodingDetectionLargeTopology(t *testing.T) {
	const (
		rackCount       = 100
		serverCount     = 1000
		volumesPerNode  = 300
		volumeSizeLimit = uint64(100) // MB
	)

	// nodesPerRack must divide evenly so every rack has the same node count.
	if serverCount%rackCount != 0 {
		t.Fatalf("serverCount (%d) must be divisible by rackCount (%d)", serverCount, rackCount)
	}

	nodesPerRack := serverCount / rackCount
	eligibleSize := uint64(90) * 1024 * 1024   // 90% of the limit: EC-eligible
	ineligibleSize := uint64(10) * 1024 * 1024 // far below the limit: skipped
	modifiedAt := time.Now().Add(-10 * time.Minute).Unix()

	volumeID := uint32(1)
	dataCenters := make([]*master_pb.DataCenterInfo, 0, 1)

	racks := make([]*master_pb.RackInfo, 0, rackCount)
	for rack := 0; rack < rackCount; rack++ {
		nodes := make([]*master_pb.DataNodeInfo, 0, nodesPerRack)
		for node := 0; node < nodesPerRack; node++ {
			address := fmt.Sprintf("10.0.%d.%d:8080", rack, node+1)
			volumes := make([]*master_pb.VolumeInformationMessage, 0, volumesPerNode)
			for v := 0; v < volumesPerNode; v++ {
				// Alternate by volume ID parity: even IDs are eligible,
				// so exactly half of all volumes qualify.
				size := ineligibleSize
				if volumeID%2 == 0 {
					size = eligibleSize
				}
				volumes = append(volumes, &master_pb.VolumeInformationMessage{
					Id:               volumeID,
					Collection:       "ec-bulk",
					DiskId:           0,
					Size:             size,
					DeletedByteCount: 0,
					ModifiedAtSecond: modifiedAt,
					ReplicaPlacement: 1,
					ReadOnly:         false,
				})
				volumeID++
			}

			diskInfo := &master_pb.DiskInfo{
				DiskId:         0,
				MaxVolumeCount: int64(volumesPerNode + 10),
				VolumeCount:    int64(volumesPerNode),
				VolumeInfos:    volumes,
			}

			nodes = append(nodes, &master_pb.DataNodeInfo{
				Id:        address,
				Address:   address,
				DiskInfos: map[string]*master_pb.DiskInfo{"hdd": diskInfo},
			})
		}

		racks = append(racks, &master_pb.RackInfo{
			Id:            fmt.Sprintf("rack-%d", rack+1),
			DataNodeInfos: nodes,
		})
	}

	dataCenters = append(dataCenters, &master_pb.DataCenterInfo{
		Id:        "dc-1",
		RackInfos: racks,
	})

	response := &master_pb.VolumeListResponse{
		VolumeSizeLimitMb: volumeSizeLimit,
		TopologyInfo: &master_pb.TopologyInfo{
			DataCenterInfos: dataCenters,
		},
	}

	master := pluginworkers.NewMasterServer(t, response)

	// Plugin worker harness with the EC handler registered.
	dialOption := grpc.WithTransportCredentials(insecure.NewCredentials())
	handler := pluginworker.NewErasureCodingHandler(dialOption, t.TempDir())
	harness := pluginworkers.NewHarness(t, pluginworkers.HarnessConfig{
		WorkerOptions: pluginworker.WorkerOptions{
			GrpcDialOption: dialOption,
		},
		Handlers: []pluginworker.JobHandler{handler},
	})
	harness.WaitForJobType("erasure_coding")

	totalVolumes := serverCount * volumesPerNode
	expectedEligible := totalVolumes / 2

	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
	defer cancel()

	// Run detection with no proposal cap (0) and time the full pass.
	start := time.Now()
	proposals, err := harness.Plugin().RunDetection(ctx, "erasure_coding", &plugin_pb.ClusterContext{
		MasterGrpcAddresses: []string{master.Address()},
	}, 0)
	duration := time.Since(start)
	require.NoError(t, err)
	require.GreaterOrEqual(t, len(proposals), 10, "should detect at least some proposals")
	t.Logf("large topology detection completed in %s (proposals=%d, eligible=%d)", duration, len(proposals), expectedEligible)
	if len(proposals) < expectedEligible {
		// Early stop is acceptable; record it rather than failing.
		t.Logf("large topology detection stopped early: %d proposals vs %d eligible", len(proposals), expectedEligible)
	}
}
|
||||
Reference in New Issue
Block a user