Persist S3 bucket counter metrics across idle periods (#8595)

* Stop deleting counter metrics during bucket TTL cleanup

Counter metrics (traffic bytes, request counts, object counts) are
monotonically increasing by design. Deleting them after 10 minutes of
bucket inactivity causes them to vanish from /metrics output and reset
to zero when traffic resumes, breaking Prometheus rate()/increase()
queries and making historical traffic reporting impossible.

Only delete gauges and histograms in the TTL cleanup loop, as these
represent current state and are safely re-populated on next activity.

Fixes https://github.com/seaweedfs/seaweedfs/issues/8521

* Clean up all bucket metrics on bucket deletion

Add DeleteBucketMetrics() to delete all metrics (including counters)
for a bucket when it is explicitly deleted. This prevents unbounded
label cardinality from accumulating for buckets that no longer exist.

Called from DeleteBucketHandler after successful bucket deletion.

* Reduce mutex scope in bucket metrics TTL sweep

Collect expired bucket names under the lock, then release before
calling DeletePartialMatch on Prometheus metrics. This prevents
RecordBucketActiveTime from blocking during the expensive cleanup.
This commit is contained in:
Chris Lu
2026-03-10 19:00:40 -07:00
committed by GitHub
parent 0a2dac1e56
commit 0a5c5ed4ce
2 changed files with 27 additions and 7 deletions

View File

@@ -20,6 +20,7 @@ import (
"github.com/seaweedfs/seaweedfs/weed/filer"
"github.com/seaweedfs/seaweedfs/weed/s3api/s3_constants"
stats_collect "github.com/seaweedfs/seaweedfs/weed/stats"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
@@ -396,8 +397,9 @@ func (s3a *S3ApiServer) DeleteBucketHandler(w http.ResponseWriter, r *http.Reque
return
}
// Clean up bucket-related caches and locks after successful deletion
// Clean up bucket-related caches, locks, and metrics after successful deletion
s3a.invalidateBucketConfigCache(bucket)
stats_collect.DeleteBucketMetrics(bucket)
s3err.WriteEmptyResponse(w, r, http.StatusNoContent)
}

View File

@@ -573,6 +573,26 @@ func RecordBucketActiveTime(bucket string) {
bucketLastActiveLock.Unlock()
}
func DeleteBucketMetrics(bucket string) {
bucketLastActiveLock.Lock()
delete(bucketLastActiveTsNs, bucket)
bucketLastActiveLock.Unlock()
labels := prometheus.Labels{"bucket": bucket}
c := S3RequestCounter.DeletePartialMatch(labels)
c += S3RequestHistogram.DeletePartialMatch(labels)
c += S3TimeToFirstByteHistogram.DeletePartialMatch(labels)
c += S3BucketTrafficReceivedBytesCounter.DeletePartialMatch(labels)
c += S3BucketTrafficSentBytesCounter.DeletePartialMatch(labels)
c += S3DeletedObjectsCounter.DeletePartialMatch(labels)
c += S3UploadedObjectsCounter.DeletePartialMatch(labels)
c += S3BucketSizeBytesGauge.DeletePartialMatch(labels)
c += S3BucketPhysicalSizeBytesGauge.DeletePartialMatch(labels)
c += S3BucketObjectCountGauge.DeletePartialMatch(labels)
glog.V(0).Infof("delete bucket metrics, %s: %d", bucket, c)
}
func DeleteCollectionMetrics(collection string) {
labels := prometheus.Labels{"collection": collection}
c := MasterReplicaPlacementMismatch.DeletePartialMatch(labels)
@@ -605,13 +625,11 @@ func bucketMetricTTLControl() {
for _, bucket := range expiredBuckets {
labels := prometheus.Labels{"bucket": bucket}
c := S3RequestCounter.DeletePartialMatch(labels)
c += S3RequestHistogram.DeletePartialMatch(labels)
// Only delete gauges and histograms, which represent current state.
// Counters (traffic, requests, objects) must persist for the process
// lifetime so that Prometheus rate()/increase() queries work correctly.
c := S3RequestHistogram.DeletePartialMatch(labels)
c += S3TimeToFirstByteHistogram.DeletePartialMatch(labels)
c += S3BucketTrafficReceivedBytesCounter.DeletePartialMatch(labels)
c += S3BucketTrafficSentBytesCounter.DeletePartialMatch(labels)
c += S3DeletedObjectsCounter.DeletePartialMatch(labels)
c += S3UploadedObjectsCounter.DeletePartialMatch(labels)
c += S3BucketSizeBytesGauge.DeletePartialMatch(labels)
c += S3BucketPhysicalSizeBytesGauge.DeletePartialMatch(labels)
c += S3BucketObjectCountGauge.DeletePartialMatch(labels)