* chore: remove unreachable dead code across the codebase Remove ~50,000 lines of unreachable code identified by static analysis. Major removals: - weed/filer/redis_lua: entire unused Redis Lua filer store implementation - weed/wdclient/net2, resource_pool: unused connection/resource pool packages - weed/plugin/worker/lifecycle: unused lifecycle plugin worker - weed/s3api: unused S3 policy templates, presigned URL IAM, streaming copy, multipart IAM, key rotation, and various SSE helper functions - weed/mq/kafka: unused partition mapping, compression, schema, and protocol functions - weed/mq/offset: unused SQL storage and migration code - weed/worker: unused registry, task, and monitoring functions - weed/query: unused SQL engine, parquet scanner, and type functions - weed/shell: unused EC proportional rebalance functions - weed/storage/erasure_coding/distribution: unused distribution analysis functions - Individual unreachable functions removed from 150+ files across admin, credential, filer, iam, kms, mount, mq, operation, pb, s3api, server, shell, storage, topology, and util packages * fix(s3): reset shared memory store in IAM test to prevent flaky failure TestLoadIAMManagerFromConfig_EmptyConfigWithFallbackKey was flaky because the MemoryStore credential backend is a singleton registered via init(). Earlier tests that create anonymous identities pollute the shared store, causing LookupAnonymous() to unexpectedly return true. Fix by calling Reset() on the memory store before the test runs. * style: run gofmt on changed files * fix: restore KMS functions used by integration tests * fix(plugin): prevent panic on send to closed worker session channel The Plugin.sendToWorker method could panic with "send on closed channel" when a worker disconnected while a message was being sent. The race was between streamSession.close() closing the outgoing channel and sendToWorker writing to it concurrently. 
Add a done channel to streamSession that is closed before the outgoing channel, and check it in sendToWorker's select to safely detect closed sessions without panicking.
487 lines
14 KiB
Go
487 lines
14 KiB
Go
package topology
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"math/rand/v2"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/seaweedfs/seaweedfs/weed/stats"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/types"
|
|
)
|
|
|
|
// NodeId is the string identifier of a node; it is what Node.Id returns and
// the key under which a parent stores its children.
type NodeId string
|
|
|
|
// CapacityReservation represents a temporary reservation of capacity
|
|
type CapacityReservation struct {
|
|
reservationId string
|
|
diskType types.DiskType
|
|
count int64
|
|
createdAt time.Time
|
|
}
|
|
|
|
// CapacityReservations manages capacity reservations for a node
|
|
type CapacityReservations struct {
|
|
sync.RWMutex
|
|
reservations map[string]*CapacityReservation
|
|
reservedCounts map[types.DiskType]int64
|
|
}
|
|
|
|
func newCapacityReservations() *CapacityReservations {
|
|
return &CapacityReservations{
|
|
reservations: make(map[string]*CapacityReservation),
|
|
reservedCounts: make(map[types.DiskType]int64),
|
|
}
|
|
}
|
|
|
|
func (cr *CapacityReservations) removeReservation(reservationId string) bool {
|
|
cr.Lock()
|
|
defer cr.Unlock()
|
|
|
|
if reservation, exists := cr.reservations[reservationId]; exists {
|
|
delete(cr.reservations, reservationId)
|
|
cr.decrementCount(reservation.diskType, reservation.count)
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (cr *CapacityReservations) getReservedCount(diskType types.DiskType) int64 {
|
|
cr.RLock()
|
|
defer cr.RUnlock()
|
|
|
|
return cr.reservedCounts[diskType]
|
|
}
|
|
|
|
// decrementCount is a helper to decrement reserved count and clean up zero entries
|
|
func (cr *CapacityReservations) decrementCount(diskType types.DiskType, count int64) {
|
|
cr.reservedCounts[diskType] -= count
|
|
// Clean up zero counts to prevent map growth
|
|
if cr.reservedCounts[diskType] <= 0 {
|
|
delete(cr.reservedCounts, diskType)
|
|
}
|
|
}
|
|
|
|
// doAddReservation is a helper to add a reservation, assuming the lock is already held
|
|
func (cr *CapacityReservations) doAddReservation(diskType types.DiskType, count int64) string {
|
|
now := time.Now()
|
|
reservationId := fmt.Sprintf("%s-%d-%d-%d", diskType, count, now.UnixNano(), rand.Int64())
|
|
cr.reservations[reservationId] = &CapacityReservation{
|
|
reservationId: reservationId,
|
|
diskType: diskType,
|
|
count: count,
|
|
createdAt: now,
|
|
}
|
|
cr.reservedCounts[diskType] += count
|
|
return reservationId
|
|
}
|
|
|
|
// tryReserveAtomic atomically checks available space and reserves if possible
|
|
func (cr *CapacityReservations) tryReserveAtomic(diskType types.DiskType, count int64, availableSpaceFunc func() int64) (reservationId string, success bool) {
|
|
cr.Lock()
|
|
defer cr.Unlock()
|
|
|
|
// Check available space under lock
|
|
currentReserved := cr.reservedCounts[diskType]
|
|
availableSpace := availableSpaceFunc() - currentReserved
|
|
|
|
if availableSpace >= count {
|
|
// Create and add reservation atomically
|
|
return cr.doAddReservation(diskType, count), true
|
|
}
|
|
|
|
return "", false
|
|
}
|
|
|
|
func (cr *CapacityReservations) cleanExpiredReservations(expirationDuration time.Duration) {
|
|
cr.Lock()
|
|
defer cr.Unlock()
|
|
|
|
now := time.Now()
|
|
for id, reservation := range cr.reservations {
|
|
if now.Sub(reservation.createdAt) > expirationDuration {
|
|
delete(cr.reservations, id)
|
|
cr.decrementCount(reservation.diskType, reservation.count)
|
|
glog.V(1).Infof("Cleaned up expired capacity reservation: %s", id)
|
|
}
|
|
}
|
|
}
|
|
|
|
type Node interface {
|
|
Id() NodeId
|
|
String() string
|
|
AvailableSpaceFor(option *VolumeGrowOption) int64
|
|
ReserveOneVolume(r int64, option *VolumeGrowOption) (*DataNode, error)
|
|
ReserveOneVolumeForReservation(r int64, option *VolumeGrowOption) (*DataNode, error)
|
|
UpAdjustDiskUsageDelta(diskType types.DiskType, diskUsage *DiskUsageCounts)
|
|
UpAdjustMaxVolumeId(vid needle.VolumeId)
|
|
GetDiskUsages() *DiskUsages
|
|
|
|
// Capacity reservation methods for avoiding race conditions
|
|
TryReserveCapacity(diskType types.DiskType, count int64) (reservationId string, success bool)
|
|
ReleaseReservedCapacity(reservationId string)
|
|
AvailableSpaceForReservation(option *VolumeGrowOption) int64
|
|
|
|
GetMaxVolumeId() needle.VolumeId
|
|
SetParent(Node)
|
|
LinkChildNode(node Node)
|
|
UnlinkChildNode(nodeId NodeId)
|
|
CollectDeadNodeAndFullVolumes(freshThreshHold int64, volumeSizeLimit uint64, growThreshold float64)
|
|
|
|
IsDataNode() bool
|
|
IsRack() bool
|
|
IsDataCenter() bool
|
|
IsLocked() bool
|
|
Children() []Node
|
|
Parent() Node
|
|
|
|
GetValue() interface{} //get reference to the topology,dc,rack,datanode
|
|
}
|
|
|
|
type NodeImpl struct {
|
|
diskUsages *DiskUsages
|
|
id NodeId
|
|
parent Node
|
|
sync.RWMutex // lock children
|
|
children map[NodeId]Node
|
|
maxVolumeId needle.VolumeId
|
|
|
|
//for rack, data center, topology
|
|
nodeType string
|
|
value interface{}
|
|
|
|
// capacity reservations to prevent race conditions during volume creation
|
|
capacityReservations *CapacityReservations
|
|
}
|
|
|
|
func (n *NodeImpl) GetDiskUsages() *DiskUsages {
|
|
return n.diskUsages
|
|
}
|
|
|
|
// the first node must satisfy filterFirstNodeFn(), the rest nodes must have one free slot
|
|
func (n *NodeImpl) PickNodesByWeight(numberOfNodes int, option *VolumeGrowOption, filterFirstNodeFn func(dn Node) error) (firstNode Node, restNodes []Node, err error) {
|
|
var totalWeights int64
|
|
var errs []string
|
|
n.RLock()
|
|
candidates := make([]Node, 0, len(n.children))
|
|
candidatesWeights := make([]int64, 0, len(n.children))
|
|
//pick nodes which has enough free volumes as candidates, and use free volumes number as node weight.
|
|
for _, node := range n.children {
|
|
if node.AvailableSpaceFor(option) <= 0 {
|
|
continue
|
|
}
|
|
totalWeights += node.AvailableSpaceFor(option)
|
|
candidates = append(candidates, node)
|
|
candidatesWeights = append(candidatesWeights, node.AvailableSpaceFor(option))
|
|
}
|
|
n.RUnlock()
|
|
if len(candidates) < numberOfNodes {
|
|
glog.V(0).Infoln(n.Id(), "failed to pick", numberOfNodes, "from ", len(candidates), "node candidates")
|
|
return nil, nil, errors.New("Not enough data nodes found!")
|
|
}
|
|
|
|
//pick nodes randomly by weights, the node picked earlier has higher final weights
|
|
sortedCandidates := make([]Node, 0, len(candidates))
|
|
for i := 0; i < len(candidates); i++ {
|
|
// Break if no more weights available to prevent panic in rand.Int64N
|
|
if totalWeights <= 0 {
|
|
break
|
|
}
|
|
weightsInterval := rand.Int64N(totalWeights)
|
|
lastWeights := int64(0)
|
|
for k, weights := range candidatesWeights {
|
|
if (weightsInterval >= lastWeights) && (weightsInterval < lastWeights+weights) {
|
|
sortedCandidates = append(sortedCandidates, candidates[k])
|
|
candidatesWeights[k] = 0
|
|
totalWeights -= weights
|
|
break
|
|
}
|
|
lastWeights += weights
|
|
}
|
|
}
|
|
|
|
restNodes = make([]Node, 0, numberOfNodes-1)
|
|
ret := false
|
|
n.RLock()
|
|
for k, node := range sortedCandidates {
|
|
if err := filterFirstNodeFn(node); err == nil {
|
|
firstNode = node
|
|
if k >= numberOfNodes-1 {
|
|
restNodes = sortedCandidates[:numberOfNodes-1]
|
|
} else {
|
|
restNodes = append(restNodes, sortedCandidates[:k]...)
|
|
restNodes = append(restNodes, sortedCandidates[k+1:numberOfNodes]...)
|
|
}
|
|
ret = true
|
|
break
|
|
} else {
|
|
errs = append(errs, string(node.Id())+":"+err.Error())
|
|
}
|
|
}
|
|
n.RUnlock()
|
|
if !ret {
|
|
return nil, nil, errors.New("No matching data node found! \n" + strings.Join(errs, "\n"))
|
|
}
|
|
return
|
|
}
|
|
|
|
func (n *NodeImpl) IsDataNode() bool {
|
|
return n.nodeType == "DataNode"
|
|
}
|
|
|
|
func (n *NodeImpl) IsRack() bool {
|
|
return n.nodeType == "Rack"
|
|
}
|
|
|
|
func (n *NodeImpl) IsDataCenter() bool {
|
|
return n.nodeType == "DataCenter"
|
|
}
|
|
|
|
func (n *NodeImpl) IsLocked() (isTryLock bool) {
|
|
if isTryLock = n.TryRLock(); isTryLock {
|
|
n.RUnlock()
|
|
}
|
|
return !isTryLock
|
|
}
|
|
|
|
func (n *NodeImpl) String() string {
|
|
if n.parent != nil {
|
|
return n.parent.String() + ":" + string(n.id)
|
|
}
|
|
return string(n.id)
|
|
}
|
|
|
|
func (n *NodeImpl) Id() NodeId {
|
|
return n.id
|
|
}
|
|
|
|
func (n *NodeImpl) getOrCreateDisk(diskType types.DiskType) *DiskUsageCounts {
|
|
return n.diskUsages.getOrCreateDisk(diskType)
|
|
}
|
|
|
|
func (n *NodeImpl) AvailableSpaceFor(option *VolumeGrowOption) int64 {
|
|
t := n.getOrCreateDisk(option.DiskType)
|
|
freeVolumeSlotCount := atomic.LoadInt64(&t.maxVolumeCount) + atomic.LoadInt64(&t.remoteVolumeCount) - atomic.LoadInt64(&t.volumeCount)
|
|
ecShardCount := atomic.LoadInt64(&t.ecShardCount)
|
|
if ecShardCount > 0 {
|
|
freeVolumeSlotCount = freeVolumeSlotCount - ecShardCount/erasure_coding.DataShardsCount - 1
|
|
}
|
|
return freeVolumeSlotCount
|
|
}
|
|
|
|
// AvailableSpaceForReservation returns available space considering existing reservations
|
|
func (n *NodeImpl) AvailableSpaceForReservation(option *VolumeGrowOption) int64 {
|
|
baseAvailable := n.AvailableSpaceFor(option)
|
|
reservedCount := n.capacityReservations.getReservedCount(option.DiskType)
|
|
return baseAvailable - reservedCount
|
|
}
|
|
|
|
// TryReserveCapacity attempts to atomically reserve capacity for volume creation
|
|
func (n *NodeImpl) TryReserveCapacity(diskType types.DiskType, count int64) (reservationId string, success bool) {
|
|
const reservationTimeout = 5 * time.Minute // TODO: make this configurable
|
|
|
|
// Clean up any expired reservations first
|
|
n.capacityReservations.cleanExpiredReservations(reservationTimeout)
|
|
|
|
// Atomically check and reserve space
|
|
option := &VolumeGrowOption{DiskType: diskType}
|
|
reservationId, success = n.capacityReservations.tryReserveAtomic(diskType, count, func() int64 {
|
|
return n.AvailableSpaceFor(option)
|
|
})
|
|
|
|
if success {
|
|
glog.V(1).Infof("Reserved %d capacity for diskType %s on node %s: %s", count, diskType, n.Id(), reservationId)
|
|
}
|
|
|
|
return reservationId, success
|
|
}
|
|
|
|
// ReleaseReservedCapacity releases a previously reserved capacity
|
|
func (n *NodeImpl) ReleaseReservedCapacity(reservationId string) {
|
|
if n.capacityReservations.removeReservation(reservationId) {
|
|
glog.V(1).Infof("Released capacity reservation on node %s: %s", n.Id(), reservationId)
|
|
} else {
|
|
glog.V(1).Infof("Attempted to release non-existent reservation on node %s: %s", n.Id(), reservationId)
|
|
}
|
|
}
|
|
func (n *NodeImpl) SetParent(node Node) {
|
|
n.parent = node
|
|
}
|
|
|
|
func (n *NodeImpl) Children() (ret []Node) {
|
|
n.RLock()
|
|
defer n.RUnlock()
|
|
for _, c := range n.children {
|
|
ret = append(ret, c)
|
|
}
|
|
return ret
|
|
}
|
|
|
|
func (n *NodeImpl) Parent() Node {
|
|
return n.parent
|
|
}
|
|
|
|
func (n *NodeImpl) GetValue() interface{} {
|
|
return n.value
|
|
}
|
|
|
|
func (n *NodeImpl) ReserveOneVolume(r int64, option *VolumeGrowOption) (assignedNode *DataNode, err error) {
|
|
return n.reserveOneVolumeInternal(r, option, false)
|
|
}
|
|
|
|
// ReserveOneVolumeForReservation selects a node using reservation-aware capacity checks
|
|
func (n *NodeImpl) ReserveOneVolumeForReservation(r int64, option *VolumeGrowOption) (assignedNode *DataNode, err error) {
|
|
return n.reserveOneVolumeInternal(r, option, true)
|
|
}
|
|
|
|
func (n *NodeImpl) reserveOneVolumeInternal(r int64, option *VolumeGrowOption, useReservations bool) (assignedNode *DataNode, err error) {
|
|
n.RLock()
|
|
defer n.RUnlock()
|
|
for _, node := range n.children {
|
|
var freeSpace int64
|
|
if useReservations {
|
|
freeSpace = node.AvailableSpaceForReservation(option)
|
|
} else {
|
|
freeSpace = node.AvailableSpaceFor(option)
|
|
}
|
|
// fmt.Println("r =", r, ", node =", node, ", freeSpace =", freeSpace)
|
|
if freeSpace <= 0 {
|
|
continue
|
|
}
|
|
if r >= freeSpace {
|
|
r -= freeSpace
|
|
} else {
|
|
var hasSpace bool
|
|
if useReservations {
|
|
hasSpace = node.IsDataNode() && node.AvailableSpaceForReservation(option) > 0
|
|
} else {
|
|
hasSpace = node.IsDataNode() && node.AvailableSpaceFor(option) > 0
|
|
}
|
|
if hasSpace {
|
|
// fmt.Println("vid =", vid, " assigned to node =", node, ", freeSpace =", node.FreeSpace())
|
|
dn := node.(*DataNode)
|
|
if dn.IsTerminating {
|
|
continue
|
|
}
|
|
return dn, nil
|
|
}
|
|
if useReservations {
|
|
assignedNode, err = node.ReserveOneVolumeForReservation(r, option)
|
|
} else {
|
|
assignedNode, err = node.ReserveOneVolume(r, option)
|
|
}
|
|
if err == nil {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
return nil, errors.New("No free volume slot found!")
|
|
}
|
|
|
|
func (n *NodeImpl) UpAdjustDiskUsageDelta(diskType types.DiskType, diskUsage *DiskUsageCounts) { //can be negative
|
|
existingDisk := n.getOrCreateDisk(diskType)
|
|
existingDisk.addDiskUsageCounts(diskUsage)
|
|
if n.parent != nil {
|
|
n.parent.UpAdjustDiskUsageDelta(diskType, diskUsage)
|
|
}
|
|
}
|
|
func (n *NodeImpl) UpAdjustMaxVolumeId(vid needle.VolumeId) { //can be negative
|
|
if n.maxVolumeId < vid {
|
|
n.maxVolumeId = vid
|
|
if n.parent != nil {
|
|
n.parent.UpAdjustMaxVolumeId(vid)
|
|
}
|
|
}
|
|
}
|
|
func (n *NodeImpl) GetMaxVolumeId() needle.VolumeId {
|
|
return n.maxVolumeId
|
|
}
|
|
|
|
func (n *NodeImpl) LinkChildNode(node Node) {
|
|
n.Lock()
|
|
defer n.Unlock()
|
|
n.doLinkChildNode(node)
|
|
}
|
|
|
|
func (n *NodeImpl) doLinkChildNode(node Node) {
|
|
if n.children[node.Id()] == nil {
|
|
n.children[node.Id()] = node
|
|
for dt, du := range node.GetDiskUsages().usages {
|
|
n.UpAdjustDiskUsageDelta(dt, du)
|
|
}
|
|
n.UpAdjustMaxVolumeId(node.GetMaxVolumeId())
|
|
node.SetParent(n)
|
|
glog.V(0).Infoln(n, "adds child", node.Id())
|
|
}
|
|
}
|
|
|
|
func (n *NodeImpl) UnlinkChildNode(nodeId NodeId) {
|
|
n.Lock()
|
|
defer n.Unlock()
|
|
node := n.children[nodeId]
|
|
if node != nil {
|
|
node.SetParent(nil)
|
|
delete(n.children, node.Id())
|
|
for dt, du := range node.GetDiskUsages().negative().usages {
|
|
n.UpAdjustDiskUsageDelta(dt, du)
|
|
}
|
|
glog.V(0).Infoln(n, "removes", node.Id())
|
|
}
|
|
}
|
|
|
|
func (n *NodeImpl) CollectDeadNodeAndFullVolumes(freshThreshHoldUnixTime int64, volumeSizeLimit uint64, growThreshold float64) {
|
|
if n.IsRack() {
|
|
for _, c := range n.Children() {
|
|
dn := c.(*DataNode) //can not cast n to DataNode
|
|
for _, v := range dn.GetVolumes() {
|
|
topo := n.GetTopology()
|
|
diskType := types.ToDiskType(v.DiskType)
|
|
vl := topo.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
|
|
|
|
if v.Size >= volumeSizeLimit {
|
|
vl.accessLock.RLock()
|
|
vacuumTime, ok := vl.vacuumedVolumes[v.Id]
|
|
vl.accessLock.RUnlock()
|
|
|
|
// If a volume has been vacuumed in the past 20 seconds, we do not check whether it has reached full capacity.
|
|
// After 20s(grpc timeout), theoretically all the heartbeats of the volume server have reached the master,
|
|
// the volume size should be correct, not the size before the vacuum.
|
|
if !ok || time.Now().Add(-20*time.Second).After(vacuumTime) {
|
|
//fmt.Println("volume",v.Id,"size",v.Size,">",volumeSizeLimit)
|
|
topo.chanFullVolumes <- v
|
|
}
|
|
} else if float64(v.Size) > float64(volumeSizeLimit)*growThreshold {
|
|
topo.chanCrowdedVolumes <- v
|
|
}
|
|
copyCount := v.ReplicaPlacement.GetCopyCount()
|
|
if copyCount > 1 {
|
|
if copyCount > len(topo.Lookup(v.Collection, v.Id)) {
|
|
stats.MasterReplicaPlacementMismatch.WithLabelValues(v.Collection, v.Id.String()).Set(1)
|
|
} else {
|
|
stats.MasterReplicaPlacementMismatch.WithLabelValues(v.Collection, v.Id.String()).Set(0)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
for _, c := range n.Children() {
|
|
c.CollectDeadNodeAndFullVolumes(freshThreshHoldUnixTime, volumeSizeLimit, growThreshold)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (n *NodeImpl) GetTopology() *Topology {
|
|
var p Node
|
|
p = n
|
|
for p.Parent() != nil {
|
|
p = p.Parent()
|
|
}
|
|
return p.GetValue().(*Topology)
|
|
}
|