plugin scheduler: run iceberg and lifecycle lanes concurrently (#8821)

* plugin scheduler: run iceberg and lifecycle lanes concurrently

The default lane serialises job types under a single admin lock
because volume-management operations share global state. Iceberg
and lifecycle lanes have no such constraint, so run each of their
job types independently in separate goroutines.

* Fix concurrent lane scheduler status

* plugin scheduler: address review feedback

- Extract collectDueJobTypes helper to deduplicate policy loading
  between locked and concurrent iteration paths.
- Use atomic.Bool instead of sync.Mutex for hadJobs in the concurrent
  path.
- Set lane loop state to "busy" before launching concurrent goroutines
  so the lane is not reported as idle while work runs.
- Convert TestLaneRequiresLock to table-driven style.
- Add TestRunLaneSchedulerIterationLockBehavior to verify the scheduler
  acquires the admin lock only for lanes that require it.
- Fix flaky TestGetLaneSchedulerStatusShowsActiveConcurrentLaneWork by
  not starting background scheduler goroutines that race with the
  direct runJobTypeIteration call.
This commit is contained in:
Chris Lu
2026-03-29 00:06:20 -07:00
committed by GitHub
parent e8a6fcaafb
commit a95b8396e4
6 changed files with 292 additions and 17 deletions

View File

@@ -3,6 +3,7 @@ package plugin
import (
"context"
"fmt"
"sync"
"testing"
"time"
@@ -594,3 +595,81 @@ func TestPickDetectorReassignsWhenLeaseIsStale(t *testing.T) {
t.Fatalf("expected detector lease to be updated to worker-a, got=%s", lease)
}
}
// trackingLockManager records whether Acquire was called and how many times.
// It stands in for the plugin service's lock-manager dependency so tests can
// observe admin-lock acquisition without taking a real lock.
type trackingLockManager struct {
	mu       sync.Mutex // guards acquired
	acquired int        // number of Acquire calls observed so far
}
// Acquire satisfies the lock-manager contract: it bumps the call counter and
// always succeeds, returning a no-op release function.
func (m *trackingLockManager) Acquire(reason string) (func(), error) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.acquired++
	return func() {}, nil
}
// count reports how many times Acquire has been invoked.
func (m *trackingLockManager) count() int {
	m.mu.Lock()
	n := m.acquired
	m.mu.Unlock()
	return n
}
// TestRunLaneSchedulerIterationLockBehavior verifies that a single scheduler
// iteration acquires the admin lock only for lanes that require it: the
// default lane must lock, while the iceberg and lifecycle lanes run without
// touching the lock manager.
func TestRunLaneSchedulerIterationLockBehavior(t *testing.T) {
	t.Parallel()
	tests := []struct {
		name     string
		lane     SchedulerLane
		jobType  string
		wantLock bool
	}{
		{"Default", LaneDefault, "vacuum", true},
		{"Iceberg", LaneIceberg, "iceberg_maintenance", false},
		{"Lifecycle", LaneLifecycle, "s3_lifecycle", false},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()
			// Fresh tracking lock manager per subtest so counts don't bleed
			// across parallel cases.
			lm := &trackingLockManager{}
			pluginSvc, err := New(Options{
				LockManager: lm,
				ClusterContextProvider: func(context.Context) (*plugin_pb.ClusterContext, error) {
					return &plugin_pb.ClusterContext{}, nil
				},
			})
			if err != nil {
				t.Fatalf("New: %v", err)
			}
			defer pluginSvc.Shutdown()
			// Register a detectable worker for the job type.
			pluginSvc.registry.UpsertFromHello(&plugin_pb.WorkerHello{
				WorkerId: "worker-a",
				Capabilities: []*plugin_pb.JobTypeCapability{
					{JobType: tt.jobType, CanDetect: true},
				},
			})
			// Enable the job type so the scheduler picks it up.
			err = pluginSvc.SaveJobTypeConfig(&plugin_pb.PersistedJobTypeConfig{
				JobType: tt.jobType,
				AdminRuntime: &plugin_pb.AdminRuntimeConfig{
					Enabled:                  true,
					DetectionIntervalSeconds: 1,
				},
			})
			if err != nil {
				t.Fatalf("SaveJobTypeConfig: %v", err)
			}
			// Drive exactly one iteration directly (no background scheduler
			// goroutines) and check whether the admin lock was acquired.
			ls := pluginSvc.lanes[tt.lane]
			pluginSvc.runLaneSchedulerIteration(ls)
			if got := lm.count(); (got > 0) != tt.wantLock {
				t.Errorf("lock acquired %d times, wantLock=%v", got, tt.wantLock)
			}
		})
	}
}