feat(plugin): enhanced collection filtering for volume balance (#8620)

* feat(plugin): enhanced collection filtering for volume balance

Replace wildcard matching with three collection filter modes:
- ALL_COLLECTIONS (default): treat all volumes as one pool
- EACH_COLLECTION: run detection separately per collection
- Regex pattern: filter volumes by matching collection names

The EACH_COLLECTION mode extracts distinct collections from metrics
and calls Detection() per collection, sharing the maxResults budget
and clusterInfo (with ActiveTopology) across all calls.

* address PR review: fix wildcard→regexp replacement, optimize EACH_COLLECTION

* address nitpick: fail fast on config errors (invalid regex)

Add configError type so invalid collection_filter regex returns
immediately instead of retrying across all masters with the same
bad config. Transient errors still retry.

* address review: constants, unbounded maxResults, wildcard compat

- Define collectionFilterAll/collectionFilterEach constants to
  eliminate magic strings across handler and metrics code
- Fix EACH_COLLECTION budget loop to treat maxResults <= 0 as
  unbounded, matching Detection's existing semantics
- Treat "*" as ALL_COLLECTIONS for backward compatibility with the previous wildcard matching

* address review: nil guard in EACH_COLLECTION grouping loop

* remove useless descriptor string test
This commit is contained in:
Chris Lu
2026-03-13 17:02:59 -07:00
committed by GitHub
parent 577a8459c9
commit 00ce1c6eba
3 changed files with 194 additions and 8 deletions

View File

@@ -23,6 +23,10 @@ import (
const (
// defaultBalanceTimeoutSeconds is ten minutes expressed in seconds (600), typed as int32.
defaultBalanceTimeoutSeconds = int32(10 * 60)
// maxProposalStringLength is 200; presumably an upper bound on the length of
// proposal strings — its use is not visible in this hunk, confirm at the call site.
maxProposalStringLength = 200
// Collection filter mode constants: reserved values of the "collection_filter"
// config field. Per that field's description, any other value is a regex
// pattern matched against collection names.
// collectionFilterAll (the default) treats all volumes as one pool.
collectionFilterAll = "ALL_COLLECTIONS"
// collectionFilterEach runs detection separately for each collection.
collectionFilterEach = "EACH_COLLECTION"
)
func init() {
@@ -85,8 +89,8 @@ func (h *VolumeBalanceHandler) Descriptor() *plugin_pb.JobTypeDescriptor {
{
Name: "collection_filter",
Label: "Collection Filter",
Description: "Only detect balance opportunities in this collection when set.",
Placeholder: "all collections",
Description: "Filter collections for balance detection. Use ALL_COLLECTIONS (default) to treat all volumes as one pool, EACH_COLLECTION to run detection separately per collection, or a regex pattern to match specific collections.",
Placeholder: "ALL_COLLECTIONS",
FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_STRING,
Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_TEXT,
},
@@ -286,9 +290,54 @@ func (h *VolumeBalanceHandler) Detect(
clusterInfo := &workertypes.ClusterInfo{ActiveTopology: activeTopology}
maxResults := int(request.MaxResults)
results, hasMore, err := balancetask.Detection(metrics, clusterInfo, workerConfig.TaskConfig, maxResults)
if err != nil {
return err
var results []*workertypes.TaskDetectionResult
var hasMore bool
if collectionFilter == collectionFilterEach {
// Group metrics by collection in a single pass (O(N) instead of O(C*N))
metricsByCollection := make(map[string][]*workertypes.VolumeHealthMetrics)
for _, m := range metrics {
if m == nil {
continue
}
metricsByCollection[m.Collection] = append(metricsByCollection[m.Collection], m)
}
collections := make([]string, 0, len(metricsByCollection))
for c := range metricsByCollection {
collections = append(collections, c)
}
sort.Strings(collections)
budget := maxResults
unlimitedBudget := budget <= 0
for _, collection := range collections {
if !unlimitedBudget && budget <= 0 {
hasMore = true
break
}
perCollectionLimit := budget
if unlimitedBudget {
perCollectionLimit = 0 // Detection treats <= 0 as unbounded
}
perResults, perHasMore, perErr := balancetask.Detection(metricsByCollection[collection], clusterInfo, workerConfig.TaskConfig, perCollectionLimit)
if perErr != nil {
return perErr
}
results = append(results, perResults...)
if !unlimitedBudget {
budget -= len(perResults)
}
if perHasMore {
hasMore = true
}
}
} else {
var err error
results, hasMore, err = balancetask.Detection(metrics, clusterInfo, workerConfig.TaskConfig, maxResults)
if err != nil {
return err
}
}
if traceErr := emitVolumeBalanceDetectionDecisionTrace(sender, metrics, activeTopology, workerConfig.TaskConfig, results); traceErr != nil {