seaweedFS/weed/topology/node.go

package topology

import (
	"errors"
	"fmt"
	"math/rand/v2"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/stats"
	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
	"github.com/seaweedfs/seaweedfs/weed/storage/types"
)

// NodeId identifies a node; NodeImpl uses it as the key of its children map.
type NodeId string

// CapacityReservation represents a temporary reservation of capacity.
// Entries are created by CapacityReservations.doAddReservation and dropped
// either explicitly (removeReservation) or once they are older than the
// expiration duration passed to cleanExpiredReservations.
type CapacityReservation struct {
	// reservationId is the key under which this entry is stored.
	reservationId string
	// diskType is the disk type whose reserved total this entry contributes to.
	diskType      types.DiskType
	// count is the reserved amount, in the same unit that the available-space
	// callback given to tryReserveAtomic returns.
	count         int64
	// createdAt is the creation time used for expiry checks.
	createdAt     time.Time
}

// CapacityReservations manages capacity reservations for a node.
// The embedded RWMutex guards both maps; helpers whose comment says the lock
// is already held (doAddReservation) and decrementCount do not lock themselves.
type CapacityReservations struct {
	sync.RWMutex
	// reservations holds every live reservation, keyed by reservation id.
	reservations   map[string]*CapacityReservation
	// reservedCounts is the running sum of reservation counts per disk type;
	// entries that drop to zero or below are deleted by decrementCount.
	reservedCounts map[types.DiskType]int64
}

// newCapacityReservations returns an empty reservation tracker with both of
// its maps allocated, so callers can write to them without a nil-map check.
func newCapacityReservations() *CapacityReservations {
	cr := new(CapacityReservations)
	cr.reservations = make(map[string]*CapacityReservation)
	cr.reservedCounts = make(map[types.DiskType]int64)
	return cr
}

// removeReservation deletes the reservation with the given id and subtracts
// its count from the per-disk-type total. It returns false when no such
// reservation exists. The write lock is taken for the whole operation.
func (cr *CapacityReservations) removeReservation(reservationId string) bool {
	cr.Lock()
	defer cr.Unlock()

	entry, found := cr.reservations[reservationId]
	if !found {
		return false
	}
	delete(cr.reservations, reservationId)
	cr.decrementCount(entry.diskType, entry.count)
	return true
}

// getReservedCount returns the total currently reserved for diskType, or 0
// when nothing is reserved for it. The read lock is held only for the lookup.
func (cr *CapacityReservations) getReservedCount(diskType types.DiskType) int64 {
	cr.RLock()
	reserved := cr.reservedCounts[diskType]
	cr.RUnlock()
	return reserved
}

// decrementCount subtracts count from the reserved total of diskType and
// removes the map entry once the total is zero or negative, so the map does
// not accumulate dead keys. It does not lock; callers in this file hold the
// write lock when invoking it.
func (cr *CapacityReservations) decrementCount(diskType types.DiskType, count int64) {
	remaining := cr.reservedCounts[diskType] - count
	cr.reservedCounts[diskType] = remaining
	if remaining <= 0 {
		delete(cr.reservedCounts, diskType)
	}
}

// doAddReservation records a new reservation and returns its id. The caller
// must already hold the write lock.
//
// The id is built from the disk type, the count, the creation time in
// nanoseconds and a random int64.
func (cr *CapacityReservations) doAddReservation(diskType types.DiskType, count int64) string {
	createdAt := time.Now()
	id := fmt.Sprintf("%s-%d-%d-%d", diskType, count, createdAt.UnixNano(), rand.Int64())

	entry := &CapacityReservation{
		reservationId: id,
		diskType:      diskType,
		count:         count,
		createdAt:     createdAt,
	}
	cr.reservations[id] = entry
	cr.reservedCounts[diskType] += count
	return id
}

// tryReserveAtomic checks capacity and records a reservation as one step
// under the write lock.
//
// availableSpaceFunc is invoked while the lock is held; the amount already
// reserved for diskType is subtracted from its result. When what is left is
// at least count, a reservation is added and its id is returned with
// success=true; otherwise ("", false) is returned and nothing changes.
func (cr *CapacityReservations) tryReserveAtomic(diskType types.DiskType, count int64, availableSpaceFunc func() int64) (reservationId string, success bool) {
	cr.Lock()
	defer cr.Unlock()

	alreadyReserved := cr.reservedCounts[diskType]
	if availableSpaceFunc()-alreadyReserved < count {
		return "", false
	}

	// Enough room: register the reservation before releasing the lock.
	return cr.doAddReservation(diskType, count), true
}

// cleanExpiredReservations drops every reservation whose age is strictly
// greater than expirationDuration and subtracts its count from the
// per-disk-type total. The write lock is held for the whole sweep.
func (cr *CapacityReservations) cleanExpiredReservations(expirationDuration time.Duration) {
	cr.Lock()
	defer cr.Unlock()

	now := time.Now()
	for id, entry := range cr.reservations {
		if now.Sub(entry.createdAt) <= expirationDuration {
			continue // still fresh
		}
		delete(cr.reservations, id)
		cr.decrementCount(entry.diskType, entry.count)
		glog.V(1).Infof("Cleaned up expired capacity reservation: %s", id)
	}
}

// Node is the common interface of the elements that form the topology tree.
// NodeImpl in this file provides the shared implementation; the IsDataNode /
// IsRack / IsDataCenter predicates distinguish the concrete kinds.
type Node interface {
	Id() NodeId
	String() string
	// AvailableSpaceFor returns the free capacity for the option's disk type,
	// ignoring outstanding reservations.
	AvailableSpaceFor(option *VolumeGrowOption) int64
	// ReserveOneVolume and ReserveOneVolumeForReservation walk down the tree
	// to select a *DataNode; the latter measures free space net of
	// outstanding reservations.
	ReserveOneVolume(r int64, option *VolumeGrowOption) (*DataNode, error)
	ReserveOneVolumeForReservation(r int64, option *VolumeGrowOption) (*DataNode, error)
	// UpAdjustDiskUsageDelta applies a usage delta to this node and propagates
	// it to its ancestors.
	UpAdjustDiskUsageDelta(diskType types.DiskType, diskUsage *DiskUsageCounts)
	// UpAdjustMaxVolumeId raises the recorded max volume id on this node and,
	// when it was raised, on its ancestors.
	UpAdjustMaxVolumeId(vid needle.VolumeId)
	GetDiskUsages() *DiskUsages

	// Capacity reservation methods for avoiding race conditions
	TryReserveCapacity(diskType types.DiskType, count int64) (reservationId string, success bool)
	ReleaseReservedCapacity(reservationId string)
	AvailableSpaceForReservation(option *VolumeGrowOption) int64

	GetMaxVolumeId() needle.VolumeId
	SetParent(Node)
	LinkChildNode(node Node)
	UnlinkChildNode(nodeId NodeId)
	CollectDeadNodeAndFullVolumes(freshThreshHold int64, volumeSizeLimit uint64, growThreshold float64)

	IsDataNode() bool
	IsRack() bool
	IsDataCenter() bool
	// IsLocked reports whether the node's lock could not be read-acquired
	// at the moment of the call.
	IsLocked() bool
	Children() []Node
	Parent() Node

	GetValue() interface{} // get a reference to the concrete topology, dc, rack, or datanode
}

// NodeImpl is the shared implementation behind the Node interface. It keeps
// the node's disk usage counters, its place in the tree (parent / children)
// and the capacity reservations made against it.
type NodeImpl struct {
	diskUsages   *DiskUsages
	id           NodeId
	// parent is nil for a node that is not linked under another node
	// (see SetParent / UnlinkChildNode).
	parent       Node
	sync.RWMutex // lock children
	children     map[NodeId]Node
	// maxVolumeId is the largest volume id reported through UpAdjustMaxVolumeId.
	maxVolumeId  needle.VolumeId

	//for rack, data center, topology
	// nodeType is compared against "DataNode", "Rack" and "DataCenter" by the
	// Is* predicates; value is what GetValue returns.
	nodeType string
	value    interface{}

	// capacity reservations to prevent race conditions during volume creation
	capacityReservations *CapacityReservations
}

// GetDiskUsages returns the node's DiskUsages pointer as stored; no copy is made.
func (n *NodeImpl) GetDiskUsages() *DiskUsages {
	return n.diskUsages
}

// PickNodesByWeight selects numberOfNodes children at random, weighted by
// their free capacity for option.
//
// Only children with AvailableSpaceFor(option) > 0 are candidates. The
// candidates are ordered by repeated weighted draws without replacement, so
// nodes with more free space tend to come first. The first node in that order
// that satisfies filterFirstNodeFn (returns nil) becomes firstNode; restNodes
// holds numberOfNodes-1 other candidates taken from the front of the order.
//
// An error is returned when there are fewer candidates than numberOfNodes, or
// when no candidate passes filterFirstNodeFn (the per-node rejection reasons
// are appended to the message).
func (n *NodeImpl) PickNodesByWeight(numberOfNodes int, option *VolumeGrowOption, filterFirstNodeFn func(dn Node) error) (firstNode Node, restNodes []Node, err error) {
	var totalWeights int64
	var errs []string
	n.RLock()
	candidates := make([]Node, 0, len(n.children))
	candidatesWeights := make([]int64, 0, len(n.children))
	//pick nodes which has enough free volumes as candidates, and use free volumes number as node weight.
	for _, node := range n.children {
		// Sample the free space exactly once per child. The counters behind
		// AvailableSpaceFor can change concurrently; reading them several
		// times could make totalWeights disagree with the sum of
		// candidatesWeights, letting a draw below select nothing and leaving
		// sortedCandidates shorter than numberOfNodes.
		weight := node.AvailableSpaceFor(option)
		if weight <= 0 {
			continue
		}
		totalWeights += weight
		candidates = append(candidates, node)
		candidatesWeights = append(candidatesWeights, weight)
	}
	n.RUnlock()
	if len(candidates) < numberOfNodes {
		glog.V(0).Infoln(n.Id(), "failed to pick", numberOfNodes, "from ", len(candidates), "node candidates")
		return nil, nil, errors.New("Not enough data nodes found!")
	}

	//pick nodes randomly by weights, the node picked earlier has higher final weights
	sortedCandidates := make([]Node, 0, len(candidates))
	for i := 0; i < len(candidates); i++ {
		// Break if no more weights available to prevent panic in rand.Int64N
		if totalWeights <= 0 {
			break
		}
		weightsInterval := rand.Int64N(totalWeights)
		lastWeights := int64(0)
		for k, weights := range candidatesWeights {
			if (weightsInterval >= lastWeights) && (weightsInterval < lastWeights+weights) {
				sortedCandidates = append(sortedCandidates, candidates[k])
				// Zero the weight so this candidate cannot be drawn again.
				candidatesWeights[k] = 0
				totalWeights -= weights
				break
			}
			lastWeights += weights
		}
	}

	restNodes = make([]Node, 0, numberOfNodes-1)
	ret := false
	n.RLock()
	for k, node := range sortedCandidates {
		if err := filterFirstNodeFn(node); err == nil {
			firstNode = node
			if k >= numberOfNodes-1 {
				// The chosen node lies beyond the first numberOfNodes-1
				// entries, so those entries are the rest as-is.
				restNodes = sortedCandidates[:numberOfNodes-1]
			} else {
				// Take the first numberOfNodes entries and skip the chosen one.
				restNodes = append(restNodes, sortedCandidates[:k]...)
				restNodes = append(restNodes, sortedCandidates[k+1:numberOfNodes]...)
			}
			ret = true
			break
		} else {
			errs = append(errs, string(node.Id())+":"+err.Error())
		}
	}
	n.RUnlock()
	if !ret {
		return nil, nil, errors.New("No matching data node found! \n" + strings.Join(errs, "\n"))
	}
	return firstNode, restNodes, nil
}

// IsDataNode reports whether nodeType is "DataNode".
func (n *NodeImpl) IsDataNode() bool {
	return n.nodeType == "DataNode"
}

// IsRack reports whether nodeType is "Rack".
func (n *NodeImpl) IsRack() bool {
	return n.nodeType == "Rack"
}

// IsDataCenter reports whether nodeType is "DataCenter".
func (n *NodeImpl) IsDataCenter() bool {
	return n.nodeType == "DataCenter"
}

// IsLocked reports whether the node's RWMutex could NOT be acquired for
// reading right now. It probes with TryRLock and releases immediately on
// success, so the answer is only a snapshot of the moment of the call.
func (n *NodeImpl) IsLocked() (isTryLock bool) {
	acquired := n.TryRLock()
	if acquired {
		n.RUnlock()
	}
	return !acquired
}

// String returns the node's path from the root: the ids of all ancestors and
// of the node itself, joined with ":". A node without a parent prints as its
// bare id.
func (n *NodeImpl) String() string {
	if n.parent == nil {
		return string(n.id)
	}
	return n.parent.String() + ":" + string(n.id)
}

// Id returns the node's identifier.
func (n *NodeImpl) Id() NodeId {
	return n.id
}

// getOrCreateDisk returns the usage counters for diskType by delegating to
// DiskUsages.getOrCreateDisk.
// NOTE(review): presumably that helper creates the entry when it is missing,
// as its name suggests — confirm against its definition.
func (n *NodeImpl) getOrCreateDisk(diskType types.DiskType) *DiskUsageCounts {
	return n.diskUsages.getOrCreateDisk(diskType)
}

// AvailableSpaceFor returns the number of free volume slots on this node for
// option.DiskType, computed as maxVolumeCount + remoteVolumeCount -
// volumeCount. When the disk holds EC shards, the result is further reduced
// by ecShardCount/erasure_coding.DataShardsCount plus one. Outstanding
// capacity reservations are not taken into account. Each counter is read
// with an atomic load; the result may be zero or negative.
func (n *NodeImpl) AvailableSpaceFor(option *VolumeGrowOption) int64 {
	disk := n.getOrCreateDisk(option.DiskType)

	maxVolumes := atomic.LoadInt64(&disk.maxVolumeCount)
	remoteVolumes := atomic.LoadInt64(&disk.remoteVolumeCount)
	usedVolumes := atomic.LoadInt64(&disk.volumeCount)
	freeSlots := maxVolumes + remoteVolumes - usedVolumes

	if ecShards := atomic.LoadInt64(&disk.ecShardCount); ecShards > 0 {
		freeSlots -= ecShards/erasure_coding.DataShardsCount + 1
	}
	return freeSlots
}

// AvailableSpaceForReservation returns AvailableSpaceFor(option) minus the
// amount currently reserved for option.DiskType on this node.
func (n *NodeImpl) AvailableSpaceForReservation(option *VolumeGrowOption) int64 {
	return n.AvailableSpaceFor(option) - n.capacityReservations.getReservedCount(option.DiskType)
}

// TryReserveCapacity tries to reserve count units of capacity for diskType
// on this node.
//
// Reservations older than five minutes are purged first. The capacity check
// and the bookkeeping then happen as one step inside tryReserveAtomic, using
// AvailableSpaceFor as the measure of free space. On success the new
// reservation id is returned with success=true.
func (n *NodeImpl) TryReserveCapacity(diskType types.DiskType, count int64) (reservationId string, success bool) {
	const reservationTimeout = 5 * time.Minute // TODO: make this configurable

	// Drop stale reservations so they do not block new ones.
	n.capacityReservations.cleanExpiredReservations(reservationTimeout)

	availableSpace := func() int64 {
		return n.AvailableSpaceFor(&VolumeGrowOption{DiskType: diskType})
	}
	reservationId, success = n.capacityReservations.tryReserveAtomic(diskType, count, availableSpace)
	if !success {
		return reservationId, false
	}

	glog.V(1).Infof("Reserved %d capacity for diskType %s on node %s: %s", count, diskType, n.Id(), reservationId)
	return reservationId, true
}

// ReleaseReservedCapacity removes the reservation with the given id from
// this node. Releasing an unknown id is not an error; it is only logged.
func (n *NodeImpl) ReleaseReservedCapacity(reservationId string) {
	if !n.capacityReservations.removeReservation(reservationId) {
		glog.V(1).Infof("Attempted to release non-existent reservation on node %s: %s", n.Id(), reservationId)
		return
	}
	glog.V(1).Infof("Released capacity reservation on node %s: %s", n.Id(), reservationId)
}
// SetParent records node as this node's parent; passing nil detaches it.
// The assignment is made without taking the node's lock.
func (n *NodeImpl) SetParent(node Node) {
	n.parent = node
}

// Children returns a snapshot slice of the node's direct children, taken
// under the read lock. The order follows map iteration and is therefore
// unspecified. A node without children yields a nil slice.
func (n *NodeImpl) Children() (ret []Node) {
	n.RLock()
	defer n.RUnlock()
	if len(n.children) == 0 {
		return nil
	}
	// The final length is known, so allocate once instead of growing on append.
	ret = make([]Node, 0, len(n.children))
	for _, c := range n.children {
		ret = append(ret, c)
	}
	return ret
}

// Parent returns the node's parent, or nil when none is set.
func (n *NodeImpl) Parent() Node {
	return n.parent
}

// GetValue returns the value stored on the node. GetTopology type-asserts
// the root's value to *Topology.
func (n *NodeImpl) GetValue() interface{} {
	return n.value
}

// ReserveOneVolume selects a data node below n by delegating to
// reserveOneVolumeInternal with useReservations=false, i.e. free space is
// measured with AvailableSpaceFor and outstanding reservations are ignored.
func (n *NodeImpl) ReserveOneVolume(r int64, option *VolumeGrowOption) (assignedNode *DataNode, err error) {
	return n.reserveOneVolumeInternal(r, option, false)
}

// ReserveOneVolumeForReservation selects a node using reservation-aware capacity checks.
// It delegates to reserveOneVolumeInternal with useReservations=true, so free
// space is measured with AvailableSpaceForReservation.
func (n *NodeImpl) ReserveOneVolumeForReservation(r int64, option *VolumeGrowOption) (assignedNode *DataNode, err error) {
	return n.reserveOneVolumeInternal(r, option, true)
}

// reserveOneVolumeInternal walks the children of n to pick one data node.
//
// r is consumed as an offset into the children's free space: each child with
// free space no greater than r is skipped and its free space subtracted from
// r. The first child whose free space exceeds r is chosen: a data node that
// still reports free space is returned directly (unless it is terminating, in
// which case the scan continues); any other child is asked to resolve the
// remaining r recursively. When useReservations is true, free space is
// measured net of outstanding reservations and the recursion uses
// ReserveOneVolumeForReservation.
//
// The read lock on n is held for the whole walk. An error is returned when no
// child yields a data node.
func (n *NodeImpl) reserveOneVolumeInternal(r int64, option *VolumeGrowOption, useReservations bool) (assignedNode *DataNode, err error) {
	// spaceOf measures a child's free space in the requested mode.
	spaceOf := func(child Node) int64 {
		if useReservations {
			return child.AvailableSpaceForReservation(option)
		}
		return child.AvailableSpaceFor(option)
	}
	// descend continues the selection inside a child in the requested mode.
	descend := func(child Node, remaining int64) (*DataNode, error) {
		if useReservations {
			return child.ReserveOneVolumeForReservation(remaining, option)
		}
		return child.ReserveOneVolume(remaining, option)
	}

	n.RLock()
	defer n.RUnlock()
	for _, child := range n.children {
		freeSpace := spaceOf(child)
		if freeSpace <= 0 {
			continue
		}
		if r >= freeSpace {
			// The offset lies beyond this child; move past it.
			r -= freeSpace
			continue
		}

		// Free space is measured again here, so a child that filled up since
		// the first reading is not returned directly.
		if child.IsDataNode() && spaceOf(child) > 0 {
			dn := child.(*DataNode)
			if dn.IsTerminating {
				continue
			}
			return dn, nil
		}

		assignedNode, err = descend(child, r)
		if err == nil {
			return assignedNode, nil
		}
	}
	return nil, errors.New("No free volume slot found!")
}

// UpAdjustDiskUsageDelta adds diskUsage to this node's counters for diskType
// and then applies the same delta to every ancestor. The delta may be
// negative; UnlinkChildNode passes negated usages to undo a child's
// contribution.
func (n *NodeImpl) UpAdjustDiskUsageDelta(diskType types.DiskType, diskUsage *DiskUsageCounts) {
	n.getOrCreateDisk(diskType).addDiskUsageCounts(diskUsage)
	if parent := n.parent; parent != nil {
		parent.UpAdjustDiskUsageDelta(diskType, diskUsage)
	}
}
// UpAdjustMaxVolumeId raises the node's recorded maximum volume id to vid
// when vid is larger, and in that case propagates vid to the parent.
// Propagation stops at the first node whose maximum is already >= vid.
func (n *NodeImpl) UpAdjustMaxVolumeId(vid needle.VolumeId) {
	if vid <= n.maxVolumeId {
		return
	}
	n.maxVolumeId = vid
	if n.parent != nil {
		n.parent.UpAdjustMaxVolumeId(vid)
	}
}
// GetMaxVolumeId returns the largest volume id recorded on this node via
// UpAdjustMaxVolumeId.
func (n *NodeImpl) GetMaxVolumeId() needle.VolumeId {
	return n.maxVolumeId
}

// LinkChildNode attaches node as a child of n. It takes the write lock and
// delegates to doLinkChildNode, which is a no-op when a child with the same
// id is already present.
func (n *NodeImpl) LinkChildNode(node Node) {
	n.Lock()
	defer n.Unlock()
	n.doLinkChildNode(node)
}

// doLinkChildNode attaches node under n unless a child with the same id is
// already registered. On attach it folds the child's disk usages and max
// volume id into n (and, through the UpAdjust* calls, into n's ancestors)
// and sets n as the child's parent. It does not lock; LinkChildNode calls it
// with the write lock held.
func (n *NodeImpl) doLinkChildNode(node Node) {
	childId := node.Id()
	if n.children[childId] != nil {
		return
	}
	n.children[childId] = node
	for diskType, usage := range node.GetDiskUsages().usages {
		n.UpAdjustDiskUsageDelta(diskType, usage)
	}
	n.UpAdjustMaxVolumeId(node.GetMaxVolumeId())
	node.SetParent(n)
	glog.V(0).Infoln(n, "adds child", node.Id())
}

// UnlinkChildNode detaches the child with the given id, if present. The
// child's parent is cleared, it is removed from the children map, and its
// disk usages are subtracted from n and n's ancestors by applying the negated
// counts. The write lock is held throughout. Unknown ids are ignored.
func (n *NodeImpl) UnlinkChildNode(nodeId NodeId) {
	n.Lock()
	defer n.Unlock()

	child := n.children[nodeId]
	if child == nil {
		return
	}
	child.SetParent(nil)
	delete(n.children, child.Id())
	for diskType, negated := range child.GetDiskUsages().negative().usages {
		n.UpAdjustDiskUsageDelta(diskType, negated)
	}
	glog.V(0).Infoln(n, "removes", child.Id())
}

// CollectDeadNodeAndFullVolumes inspects every volume below n.
//
// Non-rack nodes simply recurse into their children. A rack treats each child
// as a *DataNode and, for each of its volumes:
//   - sends the volume on the topology's chanFullVolumes when its size has
//     reached volumeSizeLimit (unless it was vacuumed within the last 20s);
//   - otherwise sends it on chanCrowdedVolumes when its size exceeds
//     volumeSizeLimit*growThreshold;
//   - for replicated volumes (copy count > 1), sets the
//     MasterReplicaPlacementMismatch gauge to 1 when fewer locations than
//     copies are known, else to 0.
//
// freshThreshHoldUnixTime is only forwarded to the recursive calls here.
func (n *NodeImpl) CollectDeadNodeAndFullVolumes(freshThreshHoldUnixTime int64, volumeSizeLimit uint64, growThreshold float64) {
	if !n.IsRack() {
		for _, child := range n.Children() {
			child.CollectDeadNodeAndFullVolumes(freshThreshHoldUnixTime, volumeSizeLimit, growThreshold)
		}
		return
	}

	for _, child := range n.Children() {
		dataNode := child.(*DataNode) //can not cast n to DataNode
		for _, vol := range dataNode.GetVolumes() {
			topo := n.GetTopology()
			layout := topo.GetVolumeLayout(vol.Collection, vol.ReplicaPlacement, vol.Ttl, types.ToDiskType(vol.DiskType))

			switch {
			case vol.Size >= volumeSizeLimit:
				layout.accessLock.RLock()
				vacuumTime, vacuumed := layout.vacuumedVolumes[vol.Id]
				layout.accessLock.RUnlock()

				// If a volume has been vacuumed in the past 20 seconds, we do not check whether it has reached full capacity.
				// After 20s(grpc timeout), theoretically all the heartbeats of the volume server have reached the master,
				// the volume size should be correct, not the size before the vacuum.
				if !vacuumed || time.Now().Add(-20*time.Second).After(vacuumTime) {
					topo.chanFullVolumes <- vol
				}
			case float64(vol.Size) > float64(volumeSizeLimit)*growThreshold:
				topo.chanCrowdedVolumes <- vol
			}

			copyCount := vol.ReplicaPlacement.GetCopyCount()
			if copyCount <= 1 {
				continue
			}
			knownLocations := len(topo.Lookup(vol.Collection, vol.Id))
			mismatchGauge := stats.MasterReplicaPlacementMismatch.WithLabelValues(vol.Collection, vol.Id.String())
			if copyCount > knownLocations {
				mismatchGauge.Set(1)
			} else {
				mismatchGauge.Set(0)
			}
		}
	}
}

// GetTopology climbs the parent chain to the root node and returns the root's
// value as a *Topology. The type assertion is unchecked, so it panics when
// the root's value is not a *Topology.
func (n *NodeImpl) GetTopology() *Topology {
	var current Node = n
	for {
		parent := current.Parent()
		if parent == nil {
			break
		}
		current = parent
	}
	return current.GetValue().(*Topology)
}