seaweedfs/weed/topology/node.go
Chris Lu 995dfc4d5d chore: remove ~50k lines of unreachable dead code (#8913)
* chore: remove unreachable dead code across the codebase

Remove ~50,000 lines of unreachable code identified by static analysis.

Major removals:
- weed/filer/redis_lua: entire unused Redis Lua filer store implementation
- weed/wdclient/net2, resource_pool: unused connection/resource pool packages
- weed/plugin/worker/lifecycle: unused lifecycle plugin worker
- weed/s3api: unused S3 policy templates, presigned URL IAM, streaming copy,
  multipart IAM, key rotation, and various SSE helper functions
- weed/mq/kafka: unused partition mapping, compression, schema, and protocol functions
- weed/mq/offset: unused SQL storage and migration code
- weed/worker: unused registry, task, and monitoring functions
- weed/query: unused SQL engine, parquet scanner, and type functions
- weed/shell: unused EC proportional rebalance functions
- weed/storage/erasure_coding/distribution: unused distribution analysis functions
- Individual unreachable functions removed from 150+ files across admin,
  credential, filer, iam, kms, mount, mq, operation, pb, s3api, server,
  shell, storage, topology, and util packages

* fix(s3): reset shared memory store in IAM test to prevent flaky failure

TestLoadIAMManagerFromConfig_EmptyConfigWithFallbackKey was flaky because
the MemoryStore credential backend is a singleton registered via init().
Earlier tests that create anonymous identities pollute the shared store,
causing LookupAnonymous() to unexpectedly return true.

Fix by calling Reset() on the memory store before the test runs.

* style: run gofmt on changed files

* fix: restore KMS functions used by integration tests

* fix(plugin): prevent panic on send to closed worker session channel

The Plugin.sendToWorker method could panic with "send on closed channel"
when a worker disconnected while a message was being sent. The race was
between streamSession.close() closing the outgoing channel and sendToWorker
writing to it concurrently.

Add a done channel to streamSession that is closed before the outgoing
channel, and check it in sendToWorker's select to safely detect closed
sessions without panicking.
2026-04-03 16:04:27 -07:00
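
The done-channel fix in the last bullet is a standard Go idiom worth illustrating. Below is a minimal, runnable sketch of the pattern with hypothetical stand-in types (message and streamSession here are simplified, not the actual weed/plugin types); this variant leaves the outgoing channel open on shutdown, whereas the commit's fix additionally closes it after done.

package main

import (
	"errors"
	"sync"
)

type message struct{ payload []byte }

var errSessionClosed = errors.New("worker session closed")

// streamSession is a simplified stand-in for the plugin worker session.
type streamSession struct {
	outgoing  chan *message
	done      chan struct{}
	closeOnce sync.Once
}

// close signals shutdown by closing done. Because senders select on done,
// they observe the closed session instead of panicking with
// "send on closed channel".
func (s *streamSession) close() {
	s.closeOnce.Do(func() { close(s.done) })
}

// sendToWorker races the send against session shutdown.
func (s *streamSession) sendToWorker(m *message) error {
	select {
	case <-s.done:
		return errSessionClosed
	case s.outgoing <- m:
		return nil
	}
}

func main() {
	s := &streamSession{
		outgoing: make(chan *message),
		done:     make(chan struct{}),
	}
	s.close()
	// Without the done case this send would block forever (or panic if
	// outgoing were closed); with it, the error is returned immediately.
	if err := s.sendToWorker(&message{payload: []byte("hi")}); err != nil {
		println(err.Error()) // worker session closed
	}
}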

package topology
import (
"errors"
"fmt"
"math/rand/v2"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/stats"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
)
type NodeId string
// CapacityReservation represents a temporary reservation of capacity
type CapacityReservation struct {
reservationId string
diskType types.DiskType
count int64
createdAt time.Time
}
// CapacityReservations manages capacity reservations for a node
type CapacityReservations struct {
sync.RWMutex
reservations map[string]*CapacityReservation
reservedCounts map[types.DiskType]int64
}
func newCapacityReservations() *CapacityReservations {
return &CapacityReservations{
reservations: make(map[string]*CapacityReservation),
reservedCounts: make(map[types.DiskType]int64),
}
}
func (cr *CapacityReservations) removeReservation(reservationId string) bool {
cr.Lock()
defer cr.Unlock()
if reservation, exists := cr.reservations[reservationId]; exists {
delete(cr.reservations, reservationId)
cr.decrementCount(reservation.diskType, reservation.count)
return true
}
return false
}
func (cr *CapacityReservations) getReservedCount(diskType types.DiskType) int64 {
cr.RLock()
defer cr.RUnlock()
return cr.reservedCounts[diskType]
}
// decrementCount is a helper to decrement reserved count and clean up zero entries
func (cr *CapacityReservations) decrementCount(diskType types.DiskType, count int64) {
cr.reservedCounts[diskType] -= count
// Clean up zero counts to prevent map growth
if cr.reservedCounts[diskType] <= 0 {
delete(cr.reservedCounts, diskType)
}
}
// doAddReservation is a helper to add a reservation, assuming the lock is already held
func (cr *CapacityReservations) doAddReservation(diskType types.DiskType, count int64) string {
now := time.Now()
reservationId := fmt.Sprintf("%s-%d-%d-%d", diskType, count, now.UnixNano(), rand.Int64())
cr.reservations[reservationId] = &CapacityReservation{
reservationId: reservationId,
diskType: diskType,
count: count,
createdAt: now,
}
cr.reservedCounts[diskType] += count
return reservationId
}
// tryReserveAtomic atomically checks available space and reserves if possible
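// The availableSpaceFunc callback is evaluated while the lock is held, so the
// check-then-reserve sequence cannot interleave with a concurrent reservation.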
func (cr *CapacityReservations) tryReserveAtomic(diskType types.DiskType, count int64, availableSpaceFunc func() int64) (reservationId string, success bool) {
cr.Lock()
defer cr.Unlock()
// Check available space under lock
currentReserved := cr.reservedCounts[diskType]
availableSpace := availableSpaceFunc() - currentReserved
if availableSpace >= count {
// Create and add reservation atomically
return cr.doAddReservation(diskType, count), true
}
return "", false
}
func (cr *CapacityReservations) cleanExpiredReservations(expirationDuration time.Duration) {
cr.Lock()
defer cr.Unlock()
now := time.Now()
for id, reservation := range cr.reservations {
if now.Sub(reservation.createdAt) > expirationDuration {
delete(cr.reservations, id)
cr.decrementCount(reservation.diskType, reservation.count)
glog.V(1).Infof("Cleaned up expired capacity reservation: %s", id)
}
}
}
type Node interface {
Id() NodeId
String() string
AvailableSpaceFor(option *VolumeGrowOption) int64
ReserveOneVolume(r int64, option *VolumeGrowOption) (*DataNode, error)
ReserveOneVolumeForReservation(r int64, option *VolumeGrowOption) (*DataNode, error)
UpAdjustDiskUsageDelta(diskType types.DiskType, diskUsage *DiskUsageCounts)
UpAdjustMaxVolumeId(vid needle.VolumeId)
GetDiskUsages() *DiskUsages
// Capacity reservation methods for avoiding race conditions
TryReserveCapacity(diskType types.DiskType, count int64) (reservationId string, success bool)
ReleaseReservedCapacity(reservationId string)
AvailableSpaceForReservation(option *VolumeGrowOption) int64
GetMaxVolumeId() needle.VolumeId
SetParent(Node)
LinkChildNode(node Node)
UnlinkChildNode(nodeId NodeId)
CollectDeadNodeAndFullVolumes(freshThreshold int64, volumeSizeLimit uint64, growThreshold float64)
IsDataNode() bool
IsRack() bool
IsDataCenter() bool
IsLocked() bool
Children() []Node
Parent() Node
GetValue() interface{} // get a reference to the topology, dc, rack, or data node
}
type NodeImpl struct {
diskUsages *DiskUsages
id NodeId
parent Node
sync.RWMutex // lock children
children map[NodeId]Node
maxVolumeId needle.VolumeId
//for rack, data center, topology
nodeType string
value interface{}
// capacity reservations to prevent race conditions during volume creation
capacityReservations *CapacityReservations
}
func (n *NodeImpl) GetDiskUsages() *DiskUsages {
return n.diskUsages
}
// the first node must satisfy filterFirstNodeFn(); the remaining nodes must each have at least one free volume slot
func (n *NodeImpl) PickNodesByWeight(numberOfNodes int, option *VolumeGrowOption, filterFirstNodeFn func(dn Node) error) (firstNode Node, restNodes []Node, err error) {
var totalWeights int64
var errs []string
n.RLock()
candidates := make([]Node, 0, len(n.children))
candidatesWeights := make([]int64, 0, len(n.children))
// pick nodes that have free volume slots as candidates, using the free slot count as each node's weight
for _, node := range n.children {
if node.AvailableSpaceFor(option) <= 0 {
continue
}
totalWeights += node.AvailableSpaceFor(option)
candidates = append(candidates, node)
candidatesWeights = append(candidatesWeights, node.AvailableSpaceFor(option))
}
n.RUnlock()
if len(candidates) < numberOfNodes {
glog.V(0).Infoln(n.Id(), "failed to pick", numberOfNodes, "from", len(candidates), "node candidates")
return nil, nil, errors.New("Not enough data nodes found!")
}
// pick nodes randomly by weight; a node picked earlier effectively carries a higher final weight
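// Each iteration draws a point in [0, totalWeights), walks the cumulative
// weight intervals to find the candidate that owns that point, appends it,
// and zeroes its weight so it cannot be drawn again (weighted sampling
// without replacement).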
sortedCandidates := make([]Node, 0, len(candidates))
for i := 0; i < len(candidates); i++ {
// Break if no more weights available to prevent panic in rand.Int64N
if totalWeights <= 0 {
break
}
weightsInterval := rand.Int64N(totalWeights)
lastWeights := int64(0)
for k, weights := range candidatesWeights {
if (weightsInterval >= lastWeights) && (weightsInterval < lastWeights+weights) {
sortedCandidates = append(sortedCandidates, candidates[k])
candidatesWeights[k] = 0
totalWeights -= weights
break
}
lastWeights += weights
}
}
restNodes = make([]Node, 0, numberOfNodes-1)
ret := false
n.RLock()
for k, node := range sortedCandidates {
if err := filterFirstNodeFn(node); err == nil {
firstNode = node
if k >= numberOfNodes-1 {
restNodes = sortedCandidates[:numberOfNodes-1]
} else {
restNodes = append(restNodes, sortedCandidates[:k]...)
restNodes = append(restNodes, sortedCandidates[k+1:numberOfNodes]...)
}
ret = true
break
} else {
errs = append(errs, string(node.Id())+":"+err.Error())
}
}
n.RUnlock()
if !ret {
return nil, nil, errors.New("No matching data node found! \n" + strings.Join(errs, "\n"))
}
return
}
func (n *NodeImpl) IsDataNode() bool {
return n.nodeType == "DataNode"
}
func (n *NodeImpl) IsRack() bool {
return n.nodeType == "Rack"
}
func (n *NodeImpl) IsDataCenter() bool {
return n.nodeType == "DataCenter"
}
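// IsLocked reports whether the node is currently write-locked by probing with
// TryRLock: if the read lock cannot be acquired, a writer holds (or is waiting
// for) the lock.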
func (n *NodeImpl) IsLocked() (isTryLock bool) {
if isTryLock = n.TryRLock(); isTryLock {
n.RUnlock()
}
return !isTryLock
}
func (n *NodeImpl) String() string {
if n.parent != nil {
return n.parent.String() + ":" + string(n.id)
}
return string(n.id)
}
func (n *NodeImpl) Id() NodeId {
return n.id
}
func (n *NodeImpl) getOrCreateDisk(diskType types.DiskType) *DiskUsageCounts {
return n.diskUsages.getOrCreateDisk(diskType)
}
func (n *NodeImpl) AvailableSpaceFor(option *VolumeGrowOption) int64 {
t := n.getOrCreateDisk(option.DiskType)
freeVolumeSlotCount := atomic.LoadInt64(&t.maxVolumeCount) + atomic.LoadInt64(&t.remoteVolumeCount) - atomic.LoadInt64(&t.volumeCount)
ecShardCount := atomic.LoadInt64(&t.ecShardCount)
if ecShardCount > 0 {
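// EC shards occupy volume slots too: every DataShardsCount shards account
// for roughly one logical volume, plus one slot of headroom. E.g. with the
// default of 10 data shards, 25 EC shards subtract 25/10 + 1 = 3 slots.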
freeVolumeSlotCount = freeVolumeSlotCount - ecShardCount/erasure_coding.DataShardsCount - 1
}
return freeVolumeSlotCount
}
// AvailableSpaceForReservation returns available space considering existing reservations
func (n *NodeImpl) AvailableSpaceForReservation(option *VolumeGrowOption) int64 {
baseAvailable := n.AvailableSpaceFor(option)
reservedCount := n.capacityReservations.getReservedCount(option.DiskType)
return baseAvailable - reservedCount
}
// TryReserveCapacity attempts to atomically reserve capacity for volume creation
func (n *NodeImpl) TryReserveCapacity(diskType types.DiskType, count int64) (reservationId string, success bool) {
const reservationTimeout = 5 * time.Minute // TODO: make this configurable
// Clean up any expired reservations first
n.capacityReservations.cleanExpiredReservations(reservationTimeout)
// Atomically check and reserve space
option := &VolumeGrowOption{DiskType: diskType}
reservationId, success = n.capacityReservations.tryReserveAtomic(diskType, count, func() int64 {
return n.AvailableSpaceFor(option)
})
if success {
glog.V(1).Infof("Reserved %d capacity for diskType %s on node %s: %s", count, diskType, n.Id(), reservationId)
}
return reservationId, success
}
// ReleaseReservedCapacity releases a previously reserved capacity
func (n *NodeImpl) ReleaseReservedCapacity(reservationId string) {
if n.capacityReservations.removeReservation(reservationId) {
glog.V(1).Infof("Released capacity reservation on node %s: %s", n.Id(), reservationId)
} else {
glog.V(1).Infof("Attempted to release non-existent reservation on node %s: %s", n.Id(), reservationId)
}
}
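// A typical caller pairs the two calls and releases on every exit path
// (hypothetical sketch, not code from this file):
//
//	if id, ok := n.TryReserveCapacity(diskType, 1); ok {
//		defer n.ReleaseReservedCapacity(id)
//		// ... create the volume ...
//	}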
func (n *NodeImpl) SetParent(node Node) {
n.parent = node
}
func (n *NodeImpl) Children() (ret []Node) {
n.RLock()
defer n.RUnlock()
for _, c := range n.children {
ret = append(ret, c)
}
return ret
}
func (n *NodeImpl) Parent() Node {
return n.parent
}
func (n *NodeImpl) GetValue() interface{} {
return n.value
}
func (n *NodeImpl) ReserveOneVolume(r int64, option *VolumeGrowOption) (assignedNode *DataNode, err error) {
return n.reserveOneVolumeInternal(r, option, false)
}
// ReserveOneVolumeForReservation selects a node using reservation-aware capacity checks
func (n *NodeImpl) ReserveOneVolumeForReservation(r int64, option *VolumeGrowOption) (assignedNode *DataNode, err error) {
return n.reserveOneVolumeInternal(r, option, true)
}
func (n *NodeImpl) reserveOneVolumeInternal(r int64, option *VolumeGrowOption, useReservations bool) (assignedNode *DataNode, err error) {
n.RLock()
defer n.RUnlock()
for _, node := range n.children {
var freeSpace int64
if useReservations {
freeSpace = node.AvailableSpaceForReservation(option)
} else {
freeSpace = node.AvailableSpaceFor(option)
}
// fmt.Println("r =", r, ", node =", node, ", freeSpace =", freeSpace)
if freeSpace <= 0 {
continue
}
if r >= freeSpace {
r -= freeSpace
} else {
var hasSpace bool
if useReservations {
hasSpace = node.IsDataNode() && node.AvailableSpaceForReservation(option) > 0
} else {
hasSpace = node.IsDataNode() && node.AvailableSpaceFor(option) > 0
}
if hasSpace {
// fmt.Println("vid =", vid, " assigned to node =", node, ", freeSpace =", node.FreeSpace())
dn := node.(*DataNode)
if dn.IsTerminating {
continue
}
return dn, nil
}
if useReservations {
assignedNode, err = node.ReserveOneVolumeForReservation(r, option)
} else {
assignedNode, err = node.ReserveOneVolume(r, option)
}
if err == nil {
return
}
}
}
return nil, errors.New("No free volume slot found!")
}
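// UpAdjustDiskUsageDelta applies a (possibly negative) usage delta to this
// node's disk counters and recursively propagates it to the root, keeping
// ancestor aggregates in sync.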
func (n *NodeImpl) UpAdjustDiskUsageDelta(diskType types.DiskType, diskUsage *DiskUsageCounts) { //can be negative
existingDisk := n.getOrCreateDisk(diskType)
existingDisk.addDiskUsageCounts(diskUsage)
if n.parent != nil {
n.parent.UpAdjustDiskUsageDelta(diskType, diskUsage)
}
}
func (n *NodeImpl) UpAdjustMaxVolumeId(vid needle.VolumeId) { // only ever increases the maximum
if n.maxVolumeId < vid {
n.maxVolumeId = vid
if n.parent != nil {
n.parent.UpAdjustMaxVolumeId(vid)
}
}
}
func (n *NodeImpl) GetMaxVolumeId() needle.VolumeId {
return n.maxVolumeId
}
func (n *NodeImpl) LinkChildNode(node Node) {
n.Lock()
defer n.Unlock()
n.doLinkChildNode(node)
}
func (n *NodeImpl) doLinkChildNode(node Node) {
if n.children[node.Id()] == nil {
n.children[node.Id()] = node
for dt, du := range node.GetDiskUsages().usages {
n.UpAdjustDiskUsageDelta(dt, du)
}
n.UpAdjustMaxVolumeId(node.GetMaxVolumeId())
node.SetParent(n)
glog.V(0).Infoln(n, "adds child", node.Id())
}
}
func (n *NodeImpl) UnlinkChildNode(nodeId NodeId) {
n.Lock()
defer n.Unlock()
node := n.children[nodeId]
if node != nil {
node.SetParent(nil)
delete(n.children, node.Id())
for dt, du := range node.GetDiskUsages().negative().usages {
n.UpAdjustDiskUsageDelta(dt, du)
}
glog.V(0).Infoln(n, "removes", node.Id())
}
}
func (n *NodeImpl) CollectDeadNodeAndFullVolumes(freshThresholdUnixTime int64, volumeSizeLimit uint64, growThreshold float64) {
if n.IsRack() {
for _, c := range n.Children() {
dn := c.(*DataNode) // children of a rack are data nodes (n itself is the rack, not a DataNode)
for _, v := range dn.GetVolumes() {
topo := n.GetTopology()
diskType := types.ToDiskType(v.DiskType)
vl := topo.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
if v.Size >= volumeSizeLimit {
vl.accessLock.RLock()
vacuumTime, ok := vl.vacuumedVolumes[v.Id]
vl.accessLock.RUnlock()
// If a volume was vacuumed within the past 20 seconds, skip the full-capacity check.
// After 20s (the gRPC timeout), all volume server heartbeats should have reached the
// master, so the reported size is the post-vacuum size rather than the stale one.
if !ok || time.Now().Add(-20*time.Second).After(vacuumTime) {
//fmt.Println("volume",v.Id,"size",v.Size,">",volumeSizeLimit)
topo.chanFullVolumes <- v
}
} else if float64(v.Size) > float64(volumeSizeLimit)*growThreshold {
topo.chanCrowdedVolumes <- v
}
copyCount := v.ReplicaPlacement.GetCopyCount()
if copyCount > 1 {
if copyCount > len(topo.Lookup(v.Collection, v.Id)) {
stats.MasterReplicaPlacementMismatch.WithLabelValues(v.Collection, v.Id.String()).Set(1)
} else {
stats.MasterReplicaPlacementMismatch.WithLabelValues(v.Collection, v.Id.String()).Set(0)
}
}
}
}
} else {
for _, c := range n.Children() {
c.CollectDeadNodeAndFullVolumes(freshThresholdUnixTime, volumeSizeLimit, growThreshold)
}
}
}
func (n *NodeImpl) GetTopology() *Topology {
var p Node
p = n
for p.Parent() != nil {
p = p.Parent()
}
return p.GetValue().(*Topology)
}