plugin scheduler: run iceberg and lifecycle lanes concurrently (#8821)
* plugin scheduler: run iceberg and lifecycle lanes concurrently

  The default lane serialises job types under a single admin lock because volume-management operations share global state. Iceberg and lifecycle lanes have no such constraint, so run each of their job types independently in separate goroutines.

* Fix concurrent lane scheduler status

* plugin scheduler: address review feedback

  - Extract collectDueJobTypes helper to deduplicate policy loading between locked and concurrent iteration paths.
  - Use atomic.Bool instead of sync.Mutex for hadJobs in the concurrent path.
  - Set lane loop state to "busy" before launching concurrent goroutines so the lane is not reported as idle while work runs.
  - Convert TestLaneRequiresLock to table-driven style.
  - Add TestRunLaneSchedulerIterationLockBehavior to verify the scheduler acquires the admin lock only for lanes that require it.
  - Fix flaky TestGetLaneSchedulerStatusShowsActiveConcurrentLaneWork by not starting background scheduler goroutines that race with the direct runJobTypeIteration call.
This commit is contained in:
@@ -1,7 +1,9 @@
|
||||
package plugin
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/plugin_pb"
|
||||
)
|
||||
@@ -62,3 +64,84 @@ func TestGetSchedulerStatusIncludesLastDetectionCount(t *testing.T) {
|
||||
t.Fatalf("expected job type status for %s", jobType)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetLaneSchedulerStatusShowsActiveConcurrentLaneWork(t *testing.T) {
|
||||
clusterContextStarted := make(chan struct{})
|
||||
releaseClusterContext := make(chan struct{})
|
||||
|
||||
// Create the Plugin without a ClusterContextProvider so no background
|
||||
// scheduler goroutines are started; they would race with the direct
|
||||
// runJobTypeIteration call below.
|
||||
pluginSvc, err := New(Options{})
|
||||
if err != nil {
|
||||
t.Fatalf("New: %v", err)
|
||||
}
|
||||
defer pluginSvc.Shutdown()
|
||||
|
||||
// Set the provider after construction so runJobTypeIteration can use it.
|
||||
pluginSvc.clusterContextProvider = func(context.Context) (*plugin_pb.ClusterContext, error) {
|
||||
close(clusterContextStarted)
|
||||
<-releaseClusterContext
|
||||
return nil, context.Canceled
|
||||
}
|
||||
|
||||
const jobType = "s3_lifecycle"
|
||||
err = pluginSvc.SaveJobTypeConfig(&plugin_pb.PersistedJobTypeConfig{
|
||||
JobType: jobType,
|
||||
AdminRuntime: &plugin_pb.AdminRuntimeConfig{
|
||||
Enabled: true,
|
||||
DetectionIntervalSeconds: 30,
|
||||
DetectionTimeoutSeconds: 15,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("SaveJobTypeConfig: %v", err)
|
||||
}
|
||||
|
||||
policy, enabled, err := pluginSvc.loadSchedulerPolicy(jobType)
|
||||
if err != nil {
|
||||
t.Fatalf("loadSchedulerPolicy: %v", err)
|
||||
}
|
||||
if !enabled {
|
||||
t.Fatalf("expected enabled policy")
|
||||
}
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
defer close(done)
|
||||
pluginSvc.runJobTypeIteration(jobType, policy)
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-clusterContextStarted:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatalf("timed out waiting for job type iteration to start")
|
||||
}
|
||||
|
||||
var laneStatus SchedulerStatus
|
||||
var aggregateStatus SchedulerStatus
|
||||
deadline := time.Now().Add(time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
laneStatus = pluginSvc.GetLaneSchedulerStatus(LaneLifecycle)
|
||||
aggregateStatus = pluginSvc.GetSchedulerStatus()
|
||||
if laneStatus.CurrentJobType == jobType && laneStatus.CurrentPhase == "detecting" &&
|
||||
aggregateStatus.CurrentJobType == jobType && aggregateStatus.CurrentPhase == "detecting" {
|
||||
break
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
|
||||
if laneStatus.CurrentJobType != jobType || laneStatus.CurrentPhase != "detecting" {
|
||||
t.Fatalf("unexpected lane status while work is active: job=%q phase=%q", laneStatus.CurrentJobType, laneStatus.CurrentPhase)
|
||||
}
|
||||
if aggregateStatus.CurrentJobType != jobType || aggregateStatus.CurrentPhase != "detecting" {
|
||||
t.Fatalf("unexpected aggregate status while work is active: job=%q phase=%q", aggregateStatus.CurrentJobType, aggregateStatus.CurrentPhase)
|
||||
}
|
||||
|
||||
close(releaseClusterContext)
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatalf("timed out waiting for job type iteration to finish")
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user