Make EC detection context aware (#8449)

* Make EC detection context aware

* Update register.go

* Speed up EC detection planning

* Add tests for EC detection planner

* optimizations

detection.go: extracted an exported ParseCollectionFilter and fed it into the detection loop so that detection and tracing share the same parsing/whitelisting logic. The detection loop now iterates over a sorted list of volume IDs, checks the context on every iteration, and sets hasMore only when unprocessed groups remain after maxResults is hit. This keeps runtime bounded while still scheduling the planned tasks before the results are returned.
erasure_coding_handler.go: dropped the duplicated inline filter parsing in emitErasureCodingDetectionDecisionTrace in favor of reusing erasurecodingtask.ParseCollectionFilter; the summary suffix logic now only accounts for the hasMore case that can actually happen.
detection_test.go: updated the helper topology builder to use master_pb.VolumeInformationMessage (matching the current protobuf types) and tightened the cancellation/max-results tests so they reliably exercise the detection logic (the context is cancelled before Detection is called, and enough disks are provided so that one result is produced before the limit is reached).

* use working directory

* fix compilation

* fix compilation

* rename

* go vet

* fix getenv

* address comments, fix error
This commit is contained in:
Chris Lu
2026-02-25 18:02:35 -08:00
committed by GitHub
parent 7f6e58b791
commit d2b92938ee
12 changed files with 559 additions and 129 deletions

View File

@@ -3,7 +3,6 @@ package pluginworker
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"time"
@@ -27,10 +26,11 @@ type erasureCodingWorkerConfig struct {
// ErasureCodingHandler is the plugin job handler for erasure coding.
type ErasureCodingHandler struct {
grpcDialOption grpc.DialOption
workingDir string
}
func NewErasureCodingHandler(grpcDialOption grpc.DialOption) *ErasureCodingHandler {
return &ErasureCodingHandler{grpcDialOption: grpcDialOption}
func NewErasureCodingHandler(grpcDialOption grpc.DialOption, workingDir string) *ErasureCodingHandler {
return &ErasureCodingHandler{grpcDialOption: grpcDialOption, workingDir: strings.TrimSpace(workingDir)}
}
func (h *ErasureCodingHandler) Capability() *plugin_pb.JobTypeCapability {
@@ -228,24 +228,21 @@ func (h *ErasureCodingHandler) Detect(
}
clusterInfo := &workertypes.ClusterInfo{ActiveTopology: activeTopology}
results, err := erasurecodingtask.Detection(metrics, clusterInfo, workerConfig.TaskConfig)
maxResults := int(request.MaxResults)
if maxResults < 0 {
maxResults = 0
}
results, hasMore, err := erasurecodingtask.Detection(ctx, metrics, clusterInfo, workerConfig.TaskConfig, maxResults)
if err != nil {
return err
}
if traceErr := emitErasureCodingDetectionDecisionTrace(sender, metrics, workerConfig.TaskConfig, results); traceErr != nil {
if traceErr := emitErasureCodingDetectionDecisionTrace(sender, metrics, workerConfig.TaskConfig, results, maxResults, hasMore); traceErr != nil {
glog.Warningf("Plugin worker failed to emit erasure_coding detection trace: %v", traceErr)
}
maxResults := int(request.MaxResults)
hasMore := false
if maxResults > 0 && len(results) > maxResults {
hasMore = true
results = results[:maxResults]
}
proposals := make([]*plugin_pb.JobProposal, 0, len(results))
for _, result := range results {
proposal, proposalErr := buildErasureCodingProposal(result)
proposal, proposalErr := buildErasureCodingProposal(result, h.workingDir)
if proposalErr != nil {
glog.Warningf("Plugin worker skip invalid erasure_coding proposal: %v", proposalErr)
continue
@@ -273,6 +270,8 @@ func emitErasureCodingDetectionDecisionTrace(
metrics []*workertypes.VolumeHealthMetrics,
taskConfig *erasurecodingtask.Config,
results []*workertypes.TaskDetectionResult,
maxResults int,
hasMore bool,
) error {
if sender == nil || taskConfig == nil {
return nil
@@ -280,15 +279,7 @@ func emitErasureCodingDetectionDecisionTrace(
quietThreshold := time.Duration(taskConfig.QuietForSeconds) * time.Second
minSizeBytes := uint64(taskConfig.MinSizeMB) * 1024 * 1024
allowedCollections := make(map[string]bool)
if strings.TrimSpace(taskConfig.CollectionFilter) != "" {
for _, collection := range strings.Split(taskConfig.CollectionFilter, ",") {
trimmed := strings.TrimSpace(collection)
if trimmed != "" {
allowedCollections[trimmed] = true
}
}
}
allowedCollections := erasurecodingtask.ParseCollectionFilter(taskConfig.CollectionFilter)
volumeGroups := make(map[uint32][]*workertypes.VolumeHealthMetrics)
for _, metric := range metrics {
@@ -341,11 +332,16 @@ func emitErasureCodingDetectionDecisionTrace(
}
totalVolumes := len(metrics)
summarySuffix := ""
if hasMore {
summarySuffix = fmt.Sprintf(" (max_results=%d reached; remaining volumes not evaluated)", maxResults)
}
summaryMessage := ""
if len(results) == 0 {
summaryMessage = fmt.Sprintf(
"EC detection: No tasks created for %d volumes (skipped: %d already EC, %d too small, %d filtered, %d not quiet, %d not full)",
"EC detection: No tasks created for %d volumes%s (skipped: %d already EC, %d too small, %d filtered, %d not quiet, %d not full)",
totalVolumes,
summarySuffix,
skippedAlreadyEC,
skippedTooSmall,
skippedCollectionFilter,
@@ -354,8 +350,9 @@ func emitErasureCodingDetectionDecisionTrace(
)
} else {
summaryMessage = fmt.Sprintf(
"EC detection: Created %d task(s) from %d volumes (skipped: %d already EC, %d too small, %d filtered, %d not quiet, %d not full)",
"EC detection: Created %d task(s)%s from %d volumes (skipped: %d already EC, %d too small, %d filtered, %d not quiet, %d not full)",
len(results),
summarySuffix,
totalVolumes,
skippedAlreadyEC,
skippedTooSmall,
@@ -372,6 +369,12 @@ func emitErasureCodingDetectionDecisionTrace(
"selected_tasks": {
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: int64(len(results))},
},
"max_results": {
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: int64(maxResults)},
},
"has_more": {
Kind: &plugin_pb.ConfigValue_BoolValue{BoolValue: hasMore},
},
"skipped_already_ec": {
Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: int64(skippedAlreadyEC)},
},
@@ -470,7 +473,7 @@ func (h *ErasureCodingHandler) Execute(
return err
}
applyErasureCodingExecutionDefaults(params, request.GetClusterContext())
applyErasureCodingExecutionDefaults(params, request.GetClusterContext(), h.workingDir)
if len(params.Sources) == 0 || strings.TrimSpace(params.Sources[0].Node) == "" {
return fmt.Errorf("erasure coding source node is required")
@@ -607,6 +610,7 @@ func deriveErasureCodingWorkerConfig(values map[string]*plugin_pb.ConfigValue) *
func buildErasureCodingProposal(
result *workertypes.TaskDetectionResult,
baseWorkingDir string,
) (*plugin_pb.JobProposal, error) {
if result == nil {
return nil, fmt.Errorf("task detection result is nil")
@@ -615,7 +619,7 @@ func buildErasureCodingProposal(
return nil, fmt.Errorf("missing typed params for volume %d", result.VolumeID)
}
params := proto.Clone(result.TypedParams).(*worker_pb.TaskParams)
applyErasureCodingExecutionDefaults(params, nil)
applyErasureCodingExecutionDefaults(params, nil, baseWorkingDir)
paramsPayload, err := proto.Marshal(params)
if err != nil {
@@ -766,6 +770,7 @@ func decodeErasureCodingTaskParams(job *plugin_pb.JobSpec) (*worker_pb.TaskParam
func applyErasureCodingExecutionDefaults(
params *worker_pb.TaskParams,
clusterContext *plugin_pb.ClusterContext,
baseWorkingDir string,
) {
if params == nil {
return
@@ -786,7 +791,7 @@ func applyErasureCodingExecutionDefaults(
if ecParams.ParityShards <= 0 {
ecParams.ParityShards = ecstorage.ParityShardsCount
}
ecParams.WorkingDir = defaultErasureCodingWorkingDir()
ecParams.WorkingDir = defaultErasureCodingWorkingDir(baseWorkingDir)
ecParams.CleanupSource = true
if strings.TrimSpace(ecParams.MasterClient) == "" && clusterContext != nil && len(clusterContext.MasterGrpcAddresses) > 0 {
ecParams.MasterClient = clusterContext.MasterGrpcAddresses[0]
@@ -897,6 +902,10 @@ func assignECShardIDs(totalShards int, targetCount int) [][]uint32 {
return assignments
}
func defaultErasureCodingWorkingDir() string {
return filepath.Join(os.TempDir(), "seaweedfs-ec")
func defaultErasureCodingWorkingDir(baseWorkingDir string) string {
dir := strings.TrimSpace(baseWorkingDir)
if dir == "" {
return filepath.Join(".", "seaweedfs-ec")
}
return filepath.Join(dir, "seaweedfs-ec")
}

View File

@@ -179,7 +179,7 @@ func TestBuildErasureCodingProposal(t *testing.T) {
TypedParams: params,
}
proposal, err := buildErasureCodingProposal(result)
proposal, err := buildErasureCodingProposal(result, "")
if err != nil {
t.Fatalf("buildErasureCodingProposal() err = %v", err)
}
@@ -195,7 +195,7 @@ func TestBuildErasureCodingProposal(t *testing.T) {
}
func TestErasureCodingHandlerRejectsUnsupportedJobType(t *testing.T) {
handler := NewErasureCodingHandler(nil)
handler := NewErasureCodingHandler(nil, "")
err := handler.Detect(context.Background(), &plugin_pb.RunDetectionRequest{
JobType: "vacuum",
}, noopDetectionSender{})
@@ -212,7 +212,7 @@ func TestErasureCodingHandlerRejectsUnsupportedJobType(t *testing.T) {
}
func TestErasureCodingHandlerDetectSkipsByMinInterval(t *testing.T) {
handler := NewErasureCodingHandler(nil)
handler := NewErasureCodingHandler(nil, "")
sender := &recordingDetectionSender{}
err := handler.Detect(context.Background(), &plugin_pb.RunDetectionRequest{
JobType: "erasure_coding",
@@ -269,7 +269,7 @@ func TestEmitErasureCodingDetectionDecisionTraceNoTasks(t *testing.T) {
},
}
if err := emitErasureCodingDetectionDecisionTrace(sender, metrics, config, nil); err != nil {
if err := emitErasureCodingDetectionDecisionTrace(sender, metrics, config, nil, 0, false); err != nil {
t.Fatalf("emitErasureCodingDetectionDecisionTrace error: %v", err)
}
if len(sender.events) < 4 {
@@ -288,7 +288,7 @@ func TestEmitErasureCodingDetectionDecisionTraceNoTasks(t *testing.T) {
}
func TestErasureCodingDescriptorOmitsLocalExecutionFields(t *testing.T) {
descriptor := NewErasureCodingHandler(nil).Descriptor()
descriptor := NewErasureCodingHandler(nil, "").Descriptor()
if descriptor == nil || descriptor.WorkerConfigForm == nil {
t.Fatalf("expected worker config form in descriptor")
}
@@ -301,6 +301,7 @@ func TestErasureCodingDescriptorOmitsLocalExecutionFields(t *testing.T) {
}
func TestApplyErasureCodingExecutionDefaultsForcesLocalFields(t *testing.T) {
baseWorkingDir := "/var/lib/seaweedfs-worker"
params := &worker_pb.TaskParams{
TaskId: "ec-test",
VolumeId: 100,
@@ -314,14 +315,14 @@ func TestApplyErasureCodingExecutionDefaultsForcesLocalFields(t *testing.T) {
},
}
applyErasureCodingExecutionDefaults(params, nil)
applyErasureCodingExecutionDefaults(params, nil, baseWorkingDir)
ecParams := params.GetErasureCodingParams()
if ecParams == nil {
t.Fatalf("expected erasure coding params")
}
if ecParams.WorkingDir != defaultErasureCodingWorkingDir() {
t.Fatalf("expected local working_dir %q, got %q", defaultErasureCodingWorkingDir(), ecParams.WorkingDir)
if ecParams.WorkingDir != defaultErasureCodingWorkingDir(baseWorkingDir) {
t.Fatalf("expected local working_dir %q, got %q", defaultErasureCodingWorkingDir(baseWorkingDir), ecParams.WorkingDir)
}
if !ecParams.CleanupSource {
t.Fatalf("expected cleanup_source true")