feat: add S3 bucket size and object count metrics (#7776)

* feat: add S3 bucket size and object count metrics

Adds periodic collection of bucket size metrics:
- SeaweedFS_s3_bucket_size_bytes: logical size (deduplicated across replicas)
- SeaweedFS_s3_bucket_physical_size_bytes: physical size (including replicas)
- SeaweedFS_s3_bucket_object_count: object count (deduplicated)

Collection runs every 1 minute via background goroutine that queries filer Statistics RPC for each bucket's collection.

Also adds Grafana dashboard panels for:
- S3 Bucket Size (logical vs physical)
- S3 Bucket Object Count

* address PR comments: fix bucket size metrics collection

1. Fix collectCollectionInfoFromMaster to use master VolumeList API
- Now properly queries master for topology info
- Uses WithMasterClient to get volume list from master
- Correctly calculates logical vs physical size based on replication

2. Return error when filerClient is nil to trigger fallback
- Changed from 'return nil, nil' to 'return nil, error'
- Ensures fallback to filer stats is properly triggered

3. Implement pagination in listBucketNames
- Added listBucketPageSize constant (1000)
- Uses StartFromFileName for pagination
- Continues fetching until fewer entries than limit returned

4. Handle NewReplicaPlacementFromByte error and prevent division by zero
- Check error return from NewReplicaPlacementFromByte
- Default to 1 copy if error occurs
- Add explicit check for copyCount == 0

* simplify bucket size metrics: remove filer fallback, align with quota enforcement

- Remove fallback to filer Statistics RPC
- Use only master topology for collection info (same as s3.bucket.quota.enforce)
- Updated comments to clarify this runs the same collection logic as quota enforcement
- Simplified code by removing collectBucketSizeFromFilerStats

* use s3a.option.Masters directly instead of querying filer

* address PR comments: fix dashboard overlaps and improve metrics collection

Grafana dashboard fixes:
- Fix overlapping panels 55 and 59 in grafana_seaweedfs.json (moved 59 to y=30)
- Fix grid collision in k8s dashboard (moved panel 72 to y=48)
- Aggregate bucket metrics with max() by (bucket) for multi-instance S3 gateways

Go code improvements:
- Add graceful shutdown support via context cancellation
- Use ticker instead of time.Sleep for better shutdown responsiveness
- Distinguish EOF from actual errors in stream handling

* improve bucket size metrics: multi-master failover and proper error handling

- Initial delay now respects context cancellation using select with time.After
- Use WithOneOfGrpcMasterClients for multi-master failover instead of hardcoding Masters[0]
- Properly propagate stream errors instead of just logging them (EOF vs real errors)

* improve bucket size metrics: distributed lock and volume ID deduplication

- Add distributed lock (LiveLock) so only one S3 instance collects metrics at a time
- Add IsLocked() method to LiveLock for checking lock status
- Fix deduplication: use volume ID tracking instead of dividing by copyCount
- Previous approach gave wrong results if replicas were missing
- Now tracks seen volume IDs and counts each volume only once
- Physical size still includes all replicas for accurate disk usage reporting

* rename lock to s3.leader

* simplify: remove StartBucketSizeMetricsCollection wrapper function

* fix data race: use atomic operations for LiveLock.isLocked field

- Change isLocked from bool to int32
- Use atomic.LoadInt32/StoreInt32 for all reads/writes
- Sync shared isLocked field in StartLongLivedLock goroutine

* add nil check for topology info to prevent panic

* fix bucket metrics: use Ticker for consistent intervals, fix pagination logic

- Use time.Ticker instead of time.After for consistent interval execution
- Fix pagination: count all entries (not just directories) for proper termination
- Update lastFileName for all entries to prevent pagination issues

* address PR comments: remove redundant atomic store, propagate context

- Remove redundant atomic.StoreInt32 in StartLongLivedLock (AttemptToLock already sets it)
- Propagate context through metrics collection for proper cancellation on shutdown
- collectAndUpdateBucketSizeMetrics now accepts ctx
- collectCollectionInfoFromMaster uses ctx for VolumeList RPC
- listBucketNames uses ctx for ListEntries RPC
2025-12-15 19:23:25 -08:00
parent 4dcd33bbc8
commit f5c666052e
9 changed files with 1100 additions and 9 deletions
--- a/weed/cluster/lock_client.go
+++ b/weed/cluster/lock_client.go
@@ -3,6 +3,7 @@ package cluster
 import (
 	"context"
 	"fmt"
+	"sync/atomic"
 	"time"

 	"github.com/seaweedfs/seaweedfs/weed/cluster/lock_manager"
@@ -36,7 +37,7 @@ type LiveLock struct {
 	hostFiler      pb.ServerAddress
 	cancelCh       chan struct{}
 	grpcDialOption grpc.DialOption
-	isLocked       bool
+	isLocked       int32 // 0 = unlocked, 1 = locked; use atomic operations
 	self           string
 	lc             *LockClient
 	owner          string
@@ -84,10 +85,12 @@ func (lc *LockClient) StartLongLivedLock(key string, owner string, onLockOwnerCh
 				if err := lock.AttemptToLock(lock_manager.LiveLockTTL); err != nil {
 					glog.V(0).Infof("Lost lock %s: %v", key, err)
 					isLocked = false
+					atomic.StoreInt32(&lock.isLocked, 0)
 				}
 			} else {
 				if err := lock.AttemptToLock(lock_manager.LiveLockTTL); err == nil {
 					isLocked = true
+					// Note: AttemptToLock already sets lock.isLocked atomically on success
 				}
 			}
 			if lockOwner != lock.LockOwner() && lock.LockOwner() != "" {
@@ -130,20 +133,20 @@ func (lock *LiveLock) AttemptToLock(lockDuration time.Duration) error {
 		time.Sleep(time.Second)
 		return fmt.Errorf("%v", errorMessage)
 	}
-	if !lock.isLocked {
+	if atomic.LoadInt32(&lock.isLocked) == 0 {
 		// Only log when transitioning from unlocked to locked
 		glog.V(1).Infof("LOCK: Successfully acquired key=%s owner=%s", lock.key, lock.self)
 	}
-	lock.isLocked = true
+	atomic.StoreInt32(&lock.isLocked, 1)
 	return nil
 }

 func (lock *LiveLock) StopShortLivedLock() error {
-	if !lock.isLocked {
+	if atomic.LoadInt32(&lock.isLocked) == 0 {
 		return nil
 	}
 	defer func() {
-		lock.isLocked = false
+		atomic.StoreInt32(&lock.isLocked, 0)
 	}()
 	return pb.WithFilerClient(false, 0, lock.hostFiler, lock.grpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
 		_, err := client.DistributedUnlock(context.Background(), &filer_pb.UnlockRequest{
@@ -228,3 +231,8 @@ func (lock *LiveLock) doLock(lockDuration time.Duration) (errorMessage string, e
 func (lock *LiveLock) LockOwner() string {
 	return lock.owner
 }
+
+// IsLocked reports whether this instance currently holds the lock, using an atomic read of the isLocked flag (1 = locked).
+func (lock *LiveLock) IsLocked() bool {
+	return atomic.LoadInt32(&lock.isLocked) == 1
+}