Persist S3 bucket counter metrics across idle periods (#8595)
* Stop deleting counter metrics during bucket TTL cleanup

  Counter metrics (traffic bytes, request counts, object counts) are monotonically increasing by design. Deleting them after 10 minutes of bucket inactivity causes them to vanish from /metrics output and reset to zero when traffic resumes, breaking Prometheus rate()/increase() queries and making historical traffic reporting impossible.

  Only delete gauges and histograms in the TTL cleanup loop, as these represent current state and are safely re-populated on next activity.

  Fixes https://github.com/seaweedfs/seaweedfs/issues/8521

* Clean up all bucket metrics on bucket deletion

  Add DeleteBucketMetrics() to delete all metrics (including counters) for a bucket when it is explicitly deleted. This prevents unbounded label cardinality from accumulating for buckets that no longer exist. Called from DeleteBucketHandler after successful bucket deletion.

* Reduce mutex scope in bucket metrics TTL sweep

  Collect expired bucket names under the lock, then release before calling DeletePartialMatch on Prometheus metrics. This prevents RecordBucketActiveTime from blocking during the expensive cleanup.
This commit is contained in:
@@ -573,6 +573,26 @@ func RecordBucketActiveTime(bucket string) {
|
||||
bucketLastActiveLock.Unlock()
|
||||
}
|
||||
|
||||
func DeleteBucketMetrics(bucket string) {
|
||||
bucketLastActiveLock.Lock()
|
||||
delete(bucketLastActiveTsNs, bucket)
|
||||
bucketLastActiveLock.Unlock()
|
||||
|
||||
labels := prometheus.Labels{"bucket": bucket}
|
||||
c := S3RequestCounter.DeletePartialMatch(labels)
|
||||
c += S3RequestHistogram.DeletePartialMatch(labels)
|
||||
c += S3TimeToFirstByteHistogram.DeletePartialMatch(labels)
|
||||
c += S3BucketTrafficReceivedBytesCounter.DeletePartialMatch(labels)
|
||||
c += S3BucketTrafficSentBytesCounter.DeletePartialMatch(labels)
|
||||
c += S3DeletedObjectsCounter.DeletePartialMatch(labels)
|
||||
c += S3UploadedObjectsCounter.DeletePartialMatch(labels)
|
||||
c += S3BucketSizeBytesGauge.DeletePartialMatch(labels)
|
||||
c += S3BucketPhysicalSizeBytesGauge.DeletePartialMatch(labels)
|
||||
c += S3BucketObjectCountGauge.DeletePartialMatch(labels)
|
||||
|
||||
glog.V(0).Infof("delete bucket metrics, %s: %d", bucket, c)
|
||||
}
|
||||
|
||||
func DeleteCollectionMetrics(collection string) {
|
||||
labels := prometheus.Labels{"collection": collection}
|
||||
c := MasterReplicaPlacementMismatch.DeletePartialMatch(labels)
|
||||
@@ -605,13 +625,11 @@ func bucketMetricTTLControl() {
|
||||
|
||||
for _, bucket := range expiredBuckets {
|
||||
labels := prometheus.Labels{"bucket": bucket}
|
||||
c := S3RequestCounter.DeletePartialMatch(labels)
|
||||
c += S3RequestHistogram.DeletePartialMatch(labels)
|
||||
// Only delete gauges and histograms, which represent current state.
|
||||
// Counters (traffic, requests, objects) must persist for the process
|
||||
// lifetime so that Prometheus rate()/increase() queries work correctly.
|
||||
c := S3RequestHistogram.DeletePartialMatch(labels)
|
||||
c += S3TimeToFirstByteHistogram.DeletePartialMatch(labels)
|
||||
c += S3BucketTrafficReceivedBytesCounter.DeletePartialMatch(labels)
|
||||
c += S3BucketTrafficSentBytesCounter.DeletePartialMatch(labels)
|
||||
c += S3DeletedObjectsCounter.DeletePartialMatch(labels)
|
||||
c += S3UploadedObjectsCounter.DeletePartialMatch(labels)
|
||||
c += S3BucketSizeBytesGauge.DeletePartialMatch(labels)
|
||||
c += S3BucketPhysicalSizeBytesGauge.DeletePartialMatch(labels)
|
||||
c += S3BucketObjectCountGauge.DeletePartialMatch(labels)
|
||||
|
||||
Reference in New Issue
Block a user