plugin scheduler: run iceberg and lifecycle lanes concurrently (#8821)
* plugin scheduler: run iceberg and lifecycle lanes concurrently

  The default lane serialises job types under a single admin lock because
  volume-management operations share global state. Iceberg and lifecycle
  lanes have no such constraint, so run each of their job types
  independently in separate goroutines.

* Fix concurrent lane scheduler status

* plugin scheduler: address review feedback

  - Extract collectDueJobTypes helper to deduplicate policy loading between
    locked and concurrent iteration paths.
  - Use atomic.Bool instead of sync.Mutex for hadJobs in the concurrent path.
  - Set lane loop state to "busy" before launching concurrent goroutines so
    the lane is not reported as idle while work runs.
  - Convert TestLaneRequiresLock to table-driven style.
  - Add TestRunLaneSchedulerIterationLockBehavior to verify the scheduler
    acquires the admin lock only for lanes that require it.
  - Fix flaky TestGetLaneSchedulerStatusShowsActiveConcurrentLaneWork by not
    starting background scheduler goroutines that race with the direct
    runJobTypeIteration call.
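As a reading aid, here is a minimal, self-contained sketch of the fan-out pattern the concurrent lanes use, with atomic.Bool standing in for a mutex-guarded hadJobs flag. The runConcurrentLane name and its signature are illustrative assumptions; only collectDueJobTypes, runJobTypeIteration, hadJobs, and the "busy" lane state are names taken from the commit message above.

package plugin

import (
	"context"
	"sync"
	"sync/atomic"
)

// runConcurrentLane is an illustrative sketch: it fans each due job type
// out to its own goroutine (the iceberg/lifecycle case) and reports via an
// atomic.Bool whether any job type had work. The caller is expected to mark
// the lane "busy" before invoking it and to gather the due job types with
// collectDueJobTypes; the default lane keeps its serialised, locked path.
func runConcurrentLane(ctx context.Context, jobTypes []string,
	run func(ctx context.Context, jobType string) bool) bool {

	var hadJobs atomic.Bool // replaces a sync.Mutex-guarded flag
	var wg sync.WaitGroup
	for _, jt := range jobTypes {
		wg.Add(1)
		go func(jobType string) {
			defer wg.Done()
			if run(ctx, jobType) { // e.g. a per-job-type pass such as runJobTypeIteration
				hadJobs.Store(true)
			}
		}(jt)
	}
	wg.Wait()
	return hadJobs.Load()
}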
@@ -3,6 +3,7 @@ package plugin
import (
	"context"
	"fmt"
	"sync"
	"testing"
	"time"

@@ -594,3 +595,81 @@ func TestPickDetectorReassignsWhenLeaseIsStale(t *testing.T) {
		t.Fatalf("expected detector lease to be updated to worker-a, got=%s", lease)
	}
}

// trackingLockManager records whether Acquire was called and how many times.
type trackingLockManager struct {
	mu       sync.Mutex
	acquired int
}

func (m *trackingLockManager) Acquire(reason string) (func(), error) {
	m.mu.Lock()
	m.acquired++
	m.mu.Unlock()
	return func() {}, nil
}

func (m *trackingLockManager) count() int {
	m.mu.Lock()
	defer m.mu.Unlock()
	return m.acquired
}

func TestRunLaneSchedulerIterationLockBehavior(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name     string
		lane     SchedulerLane
		jobType  string
		wantLock bool
	}{
		{"Default", LaneDefault, "vacuum", true},
		{"Iceberg", LaneIceberg, "iceberg_maintenance", false},
		{"Lifecycle", LaneLifecycle, "s3_lifecycle", false},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			lm := &trackingLockManager{}
			pluginSvc, err := New(Options{
				LockManager: lm,
				ClusterContextProvider: func(context.Context) (*plugin_pb.ClusterContext, error) {
					return &plugin_pb.ClusterContext{}, nil
				},
			})
			if err != nil {
				t.Fatalf("New: %v", err)
			}
			defer pluginSvc.Shutdown()

			// Register a detectable worker for the job type.
			pluginSvc.registry.UpsertFromHello(&plugin_pb.WorkerHello{
				WorkerId: "worker-a",
				Capabilities: []*plugin_pb.JobTypeCapability{
					{JobType: tt.jobType, CanDetect: true},
				},
			})

			// Enable the job type so the scheduler picks it up.
			err = pluginSvc.SaveJobTypeConfig(&plugin_pb.PersistedJobTypeConfig{
				JobType: tt.jobType,
				AdminRuntime: &plugin_pb.AdminRuntimeConfig{
					Enabled:                  true,
					DetectionIntervalSeconds: 1,
				},
			})
			if err != nil {
				t.Fatalf("SaveJobTypeConfig: %v", err)
			}

			ls := pluginSvc.lanes[tt.lane]
			pluginSvc.runLaneSchedulerIteration(ls)

			if got := lm.count(); (got > 0) != tt.wantLock {
				t.Errorf("lock acquired %d times, wantLock=%v", got, tt.wantLock)
			}
		})
	}
}
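For context, the behaviour that table asserts on could be gated roughly as in the sketch below. The requiresLock helper, the adminLockManager interface, and runLaneIteration are assumptions inferred from TestLaneRequiresLock and the trackingLockManager above, not the actual implementation; only the Acquire signature and the lane constants come from the diff.

package plugin

// Sketch only: the default lane serialises under the admin lock, matching
// the wantLock column in the test table above.

type adminLockManager interface {
	Acquire(reason string) (func(), error) // same shape as trackingLockManager
}

func requiresLock(lane SchedulerLane) bool {
	return lane == LaneDefault // LaneIceberg and LaneLifecycle skip the lock
}

func runLaneIteration(lane SchedulerLane, lm adminLockManager, runJobTypes func()) error {
	if requiresLock(lane) {
		release, err := lm.Acquire("plugin scheduler iteration")
		if err != nil {
			return err
		}
		defer release()
	}
	runJobTypes() // serialised under the lock for LaneDefault, concurrent otherwise
	return nil
}

With a gate of that shape, the trackingLockManager count stays at zero for the iceberg and lifecycle rows and becomes positive only for the default row, which is exactly what the (got > 0) != tt.wantLock check asserts.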