seaweedFS/weed/server/volume_grpc_client_to_master.go
Chris Lu 995dfc4d5d chore: remove ~50k lines of unreachable dead code (#8913)
* chore: remove unreachable dead code across the codebase

Remove ~50,000 lines of unreachable code identified by static analysis.

Major removals:
- weed/filer/redis_lua: entire unused Redis Lua filer store implementation
- weed/wdclient/net2, resource_pool: unused connection/resource pool packages
- weed/plugin/worker/lifecycle: unused lifecycle plugin worker
- weed/s3api: unused S3 policy templates, presigned URL IAM, streaming copy,
  multipart IAM, key rotation, and various SSE helper functions
- weed/mq/kafka: unused partition mapping, compression, schema, and protocol functions
- weed/mq/offset: unused SQL storage and migration code
- weed/worker: unused registry, task, and monitoring functions
- weed/query: unused SQL engine, parquet scanner, and type functions
- weed/shell: unused EC proportional rebalance functions
- weed/storage/erasure_coding/distribution: unused distribution analysis functions
- Individual unreachable functions removed from 150+ files across admin,
  credential, filer, iam, kms, mount, mq, operation, pb, s3api, server,
  shell, storage, topology, and util packages

* fix(s3): reset shared memory store in IAM test to prevent flaky failure

TestLoadIAMManagerFromConfig_EmptyConfigWithFallbackKey was flaky because
the MemoryStore credential backend is a singleton registered via init().
Earlier tests that create anonymous identities pollute the shared store,
causing LookupAnonymous() to unexpectedly return true.

Fix by calling Reset() on the memory store before the test runs.
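
As a hedged illustration of the pattern behind that fix (the memoryStore type, the shared variable, and the bool-returning LookupAnonymous below are stand-ins for the real SeaweedFS credential API, not copies of it): a package-level singleton keeps state across tests, so a test that depends on a clean store resets it before doing anything else.

package store

import "testing"

// memoryStore stands in for the singleton credential backend registered via init().
type memoryStore struct{ identities map[string]bool }

// shared mimics the process-wide instance that earlier tests may have polluted.
var shared = &memoryStore{identities: map[string]bool{}}

func (s *memoryStore) Reset()                { s.identities = map[string]bool{} }
func (s *memoryStore) AddAnonymous()         { s.identities["anonymous"] = true }
func (s *memoryStore) LookupAnonymous() bool { return s.identities["anonymous"] }

func TestEmptyConfigWithFallbackKey(t *testing.T) {
	shared.AddAnonymous() // simulate pollution left behind by an earlier test
	shared.Reset()        // the fix: clear the shared store before the test body runs
	if shared.LookupAnonymous() {
		t.Fatal("expected no anonymous identity after Reset()")
	}
}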

* style: run gofmt on changed files

* fix: restore KMS functions used by integration tests

* fix(plugin): prevent panic on send to closed worker session channel

The Plugin.sendToWorker method could panic with "send on closed channel"
when a worker disconnected while a message was being sent. The race was
between streamSession.close() closing the outgoing channel and sendToWorker
writing to it concurrently.

Add a done channel to streamSession that is closed before the outgoing
channel, and check it in sendToWorker's select to safely detect closed
sessions without panicking.
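
A minimal, self-contained sketch of that done-channel pattern (the streamSession type, its fields, and the bool return value are illustrative assumptions, not the actual plugin worker code): close done before outgoing, and have the sender consult done so a closed session yields a clean failure instead of a panic.

package main

import "fmt"

// streamSession sketches the shape described above; names are illustrative.
type streamSession struct {
	outgoing chan string
	done     chan struct{}
}

// close signals shutdown: done is closed before outgoing so senders can
// observe the shutdown and bail out instead of writing to a closed channel.
func (s *streamSession) close() {
	close(s.done)
	close(s.outgoing)
}

// sendToWorker refuses to send on a closed session and returns false
// rather than panicking with "send on closed channel".
func (s *streamSession) sendToWorker(msg string) bool {
	select {
	case <-s.done: // session already closed
		return false
	default:
	}
	select {
	case <-s.done:
		return false
	case s.outgoing <- msg:
		return true
	}
}

func main() {
	s := &streamSession{outgoing: make(chan string, 1), done: make(chan struct{})}
	fmt.Println(s.sendToWorker("hello")) // true
	s.close()
	fmt.Println(s.sendToWorker("bye")) // false, no panic
}

Because select chooses randomly among ready cases, the sketch checks done up front; closing done before outgoing is what lets a sender observe the shutdown before the outgoing channel becomes unsafe to write to.
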
2026-04-03 16:04:27 -07:00

322 lines · 11 KiB · Go

package weed_server

import (
	"context"
	"fmt"
	"os"
	"strings"
	"time"

	"google.golang.org/grpc"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/operation"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
	"github.com/seaweedfs/seaweedfs/weed/security"
	"github.com/seaweedfs/seaweedfs/weed/storage/backend"
	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/util"
)

func (vs *VolumeServer) GetMaster(ctx context.Context) pb.ServerAddress {
	return vs.currentMaster
}

func (vs *VolumeServer) checkWithMaster() (err error) {
	for {
		for _, master := range vs.SeedMasterNodes {
			err = operation.WithMasterServerClient(false, master, vs.grpcDialOption, func(masterClient master_pb.SeaweedClient) error {
				resp, err := masterClient.GetMasterConfiguration(context.Background(), &master_pb.GetMasterConfigurationRequest{})
				if err != nil {
					return fmt.Errorf("get master %s configuration: %v", master, err)
				}
				vs.metricsAddress, vs.metricsIntervalSec = resp.MetricsAddress, int(resp.MetricsIntervalSeconds)
				backend.LoadFromPbStorageBackends(resp.StorageBackends)
				return nil
			})
			if err == nil {
				return
			} else {
				glog.V(0).Infof("checkWithMaster %s: %v", master, err)
			}
		}
		time.Sleep(1790 * time.Millisecond)
	}
}

func (vs *VolumeServer) heartbeat() {
	glog.V(0).Infof("Volume server start with seed master nodes: %v", vs.SeedMasterNodes)
	vs.store.SetDataCenter(vs.dataCenter)
	vs.store.SetRack(vs.rack)

	grpcDialOption := security.LoadClientTLS(util.GetViper(), "grpc.volume")

	var err error
	var newLeader pb.ServerAddress
	duplicateRetryCount := 0
	for vs.isHeartbeating {
		for _, master := range vs.SeedMasterNodes {
			if newLeader != "" {
				// the new leader may actually be the same master,
				// so wait a bit before adding itself
				time.Sleep(3 * time.Second)
				master = newLeader
			}
			vs.store.MasterAddress = master
			newLeader, err = vs.doHeartbeatWithRetry(master, grpcDialOption, vs.pulsePeriod, duplicateRetryCount)
			if err != nil {
				glog.V(0).Infof("heartbeat to %s error: %v", master, err)
				// Check if this is a duplicate UUID retry error
				if strings.Contains(err.Error(), "duplicate UUIDs detected, retrying connection") {
					duplicateRetryCount++
					retryDelay := time.Duration(1<<(duplicateRetryCount-1)) * 2 * time.Second // exponential backoff: 2s, 4s, 8s
					glog.V(0).Infof("Waiting %v before retrying due to duplicate UUID detection...", retryDelay)
					time.Sleep(retryDelay)
				} else {
					// Regular error, reset duplicate retry count
					duplicateRetryCount = 0
					time.Sleep(vs.pulsePeriod)
				}
				newLeader = ""
				vs.store.MasterAddress = ""
			} else {
				// Successful connection, reset retry count
				duplicateRetryCount = 0
			}
			if !vs.isHeartbeating {
				break
			}
		}
	}
}

func (vs *VolumeServer) StopHeartbeat() (isAlreadyStopping bool) {
	if !vs.isHeartbeating {
		return true
	}
	vs.isHeartbeating = false
	close(vs.stopChan)
	return false
}

func (vs *VolumeServer) doHeartbeatWithRetry(masterAddress pb.ServerAddress, grpcDialOption grpc.DialOption, sleepInterval time.Duration, duplicateRetryCount int) (newLeader pb.ServerAddress, err error) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	grpcConnection, err := pb.GrpcDial(ctx, masterAddress.ToGrpcAddress(), false, grpcDialOption)
	if err != nil {
		return "", fmt.Errorf("fail to dial %s : %v", masterAddress, err)
	}
	defer grpcConnection.Close()

	client := master_pb.NewSeaweedClient(grpcConnection)
	stream, err := client.SendHeartbeat(ctx)
	if err != nil {
		glog.V(0).Infof("SendHeartbeat to %s: %v", masterAddress, err)
		return "", err
	}
	glog.V(0).Infof("Heartbeat to: %v", masterAddress)
	vs.currentMaster = masterAddress

	doneChan := make(chan error, 1)
	go func() {
		for {
			in, err := stream.Recv()
			if err != nil {
				doneChan <- err
				return
			}
			if len(in.DuplicatedUuids) > 0 {
				var duplicateDir []string
				for _, loc := range vs.store.Locations {
					for _, uuid := range in.DuplicatedUuids {
						if uuid == loc.DirectoryUuid {
							duplicateDir = append(duplicateDir, loc.Directory)
						}
					}
				}
				// Implement retry logic for potential race conditions
				const maxRetries = 3
				if duplicateRetryCount < maxRetries {
					retryDelay := time.Duration(1<<duplicateRetryCount) * 2 * time.Second // exponential backoff: 2s, 4s, 8s
					glog.Errorf("Master reported duplicate volume directories: %v (retry %d/%d)", duplicateDir, duplicateRetryCount+1, maxRetries)
					glog.Errorf("This might be due to a race condition during reconnection. Waiting %v before retrying...", retryDelay)
					// Return error to trigger retry with increased count
					doneChan <- fmt.Errorf("duplicate UUIDs detected, retrying connection (attempt %d/%d)", duplicateRetryCount+1, maxRetries)
					return
				} else {
					// After max retries, this is likely a real duplicate
					glog.Errorf("Shut down Volume Server due to persistent duplicate volume directories after %d retries: %v", maxRetries, duplicateDir)
					glog.Errorf("Please check if another volume server is using the same directory")
					os.Exit(1)
				}
			}
			volumeOptsChanged := false
			if vs.store.GetPreallocate() != in.GetPreallocate() {
				vs.store.SetPreallocate(in.GetPreallocate())
				volumeOptsChanged = true
			}
			if in.GetVolumeSizeLimit() != 0 && vs.store.GetVolumeSizeLimit() != in.GetVolumeSizeLimit() {
				vs.store.SetVolumeSizeLimit(in.GetVolumeSizeLimit())
				volumeOptsChanged = true
			}
			if volumeOptsChanged {
				if vs.store.MaybeAdjustVolumeMax() {
					if err = stream.Send(vs.store.CollectHeartbeat()); err != nil {
						glog.V(0).Infof("Volume Server Failed to talk with master %s: %v", vs.currentMaster, err)
						return
					}
				}
			}
			if in.GetLeader() != "" && string(vs.currentMaster) != in.GetLeader() {
				glog.V(0).Infof("Volume Server found a new master newLeader: %v instead of %v", in.GetLeader(), vs.currentMaster)
				newLeader = pb.ServerAddress(in.GetLeader())
				doneChan <- nil
				return
			}
		}
	}()

	if err = stream.Send(vs.store.CollectHeartbeat()); err != nil {
		glog.V(0).Infof("Volume Server Failed to talk with master %s: %v", masterAddress, err)
		return "", err
	}

	if err = stream.Send(vs.store.CollectErasureCodingHeartbeat()); err != nil {
		glog.V(0).Infof("Volume Server Failed to talk with master %s: %v", masterAddress, err)
		return "", err
	}

	volumeTickChan := time.NewTicker(sleepInterval)
	defer volumeTickChan.Stop()
	ecShardTickChan := time.NewTicker(17 * sleepInterval)
	defer ecShardTickChan.Stop()
	dataCenter := vs.store.GetDataCenter()
	rack := vs.store.GetRack()
	ip := vs.store.Ip
	port := uint32(vs.store.Port)

	for {
		select {
		case stateMessage := <-vs.store.StateUpdateChan:
			stateBeat := &master_pb.Heartbeat{
				Ip:         ip,
				Port:       port,
				DataCenter: dataCenter,
				Rack:       rack,
				State:      stateMessage,
			}
			glog.V(0).Infof("volume server %s:%d updates state to %v", vs.store.Ip, vs.store.Port, stateMessage)
			if err = stream.Send(stateBeat); err != nil {
				glog.V(0).Infof("Volume Server Failed to update state to master %s: %v", masterAddress, err)
				return "", err
			}
		case volumeMessage := <-vs.store.NewVolumesChan:
			deltaBeat := &master_pb.Heartbeat{
				Ip:         ip,
				Port:       port,
				DataCenter: dataCenter,
				Rack:       rack,
				NewVolumes: []*master_pb.VolumeShortInformationMessage{
					&volumeMessage, // volumeMessage is already a copy from the channel receive
				},
			}
			glog.V(0).Infof("volume server %s:%d adds volume %d", vs.store.Ip, vs.store.Port, volumeMessage.Id)
			if err = stream.Send(deltaBeat); err != nil {
				glog.V(0).Infof("Volume Server Failed to update to master %s: %v", masterAddress, err)
				return "", err
			}
		case ecShardMessage := <-vs.store.NewEcShardsChan:
			deltaBeat := &master_pb.Heartbeat{
				Ip:         ip,
				Port:       port,
				DataCenter: dataCenter,
				Rack:       rack,
				NewEcShards: []*master_pb.VolumeEcShardInformationMessage{
					&ecShardMessage, // ecShardMessage is already a copy from the channel receive
				},
			}
			si := erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(&ecShardMessage)
			glog.V(0).Infof("volume server %s:%d adds ec shards to %d [%s]", vs.store.Ip, vs.store.Port, ecShardMessage.Id, si.String())
			if err = stream.Send(deltaBeat); err != nil {
				glog.V(0).Infof("Volume Server Failed to update to master %s: %v", masterAddress, err)
				return "", err
			}
		case volumeMessage := <-vs.store.DeletedVolumesChan:
			deltaBeat := &master_pb.Heartbeat{
				Ip:         ip,
				Port:       port,
				DataCenter: dataCenter,
				Rack:       rack,
				DeletedVolumes: []*master_pb.VolumeShortInformationMessage{
					&volumeMessage, // volumeMessage is already a copy from the channel receive
				},
			}
			glog.V(0).Infof("volume server %s:%d deletes volume %d", vs.store.Ip, vs.store.Port, volumeMessage.Id)
			if err = stream.Send(deltaBeat); err != nil {
				glog.V(0).Infof("Volume Server Failed to update to master %s: %v", masterAddress, err)
				return "", err
			}
		case ecShardMessage := <-vs.store.DeletedEcShardsChan:
			deltaBeat := &master_pb.Heartbeat{
				Ip:         ip,
				Port:       port,
				DataCenter: dataCenter,
				Rack:       rack,
				DeletedEcShards: []*master_pb.VolumeEcShardInformationMessage{
					&ecShardMessage, // ecShardMessage is already a copy from the channel receive
				},
			}
			si := erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(&ecShardMessage)
			glog.V(0).Infof("volume server %s:%d deletes ec shards from %d [%s]", vs.store.Ip, vs.store.Port, ecShardMessage.Id, si.String())
			if err = stream.Send(deltaBeat); err != nil {
				glog.V(0).Infof("Volume Server Failed to update to master %s: %v", masterAddress, err)
				return "", err
			}
		case <-volumeTickChan.C:
			glog.V(4).Infof("volume server %s:%d heartbeat", vs.store.Ip, vs.store.Port)
			vs.store.MaybeAdjustVolumeMax()
			if err = stream.Send(vs.store.CollectHeartbeat()); err != nil {
				glog.V(0).Infof("Volume Server Failed to talk with master %s: %v", masterAddress, err)
				return "", err
			}
		case <-ecShardTickChan.C:
			glog.V(4).Infof("volume server %s:%d ec heartbeat", vs.store.Ip, vs.store.Port)
			if err = stream.Send(vs.store.CollectErasureCodingHeartbeat()); err != nil {
				glog.V(0).Infof("Volume Server Failed to talk with master %s: %v", masterAddress, err)
				return "", err
			}
		case err = <-doneChan:
			return
		case <-vs.stopChan:
			var volumeMessages []*master_pb.VolumeInformationMessage
			emptyBeat := &master_pb.Heartbeat{
				Ip:           ip,
				Port:         port,
				PublicUrl:    vs.store.PublicUrl,
				MaxFileKey:   uint64(0),
				DataCenter:   dataCenter,
				Rack:         rack,
				Volumes:      volumeMessages,
				HasNoVolumes: len(volumeMessages) == 0,
			}
			glog.V(1).Infof("volume server %s:%d stops and deletes all volumes", vs.store.Ip, vs.store.Port)
			if err = stream.Send(emptyBeat); err != nil {
				glog.V(0).Infof("Volume Server Failed to update to master %s: %v", masterAddress, err)
				return "", err
			}
			return
		}
	}
}