Files
seaweedFS/weed/shell/command_cluster_status.go
Lisandro Pin 221bd237c4 Fix file stat collection metric bug for the cluster.status command. (#8302)
When the `--files` flag is present, `cluster.status` will scrape file metrics
from volume servers to provide detailed stats on those. The progress indicator
was not being updated properly though, so the command would complete before
it read 100%.
2026-02-11 13:34:20 -08:00

523 lines
14 KiB
Go

package shell
import (
"context"
"flag"
"fmt"
"math"
"strings"
"sync"
"github.com/dustin/go-humanize"
"github.com/dustin/go-humanize/english"
"github.com/seaweedfs/seaweedfs/weed/operation"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"io"
)
// init registers the cluster.status command with the shell's command list.
func init() {
	Commands = append(Commands, &commandClusterStatus{})
}
// VolumeReplicaStats holds file/size statistics for one replica of a regular
// volume, as reported by the volume server hosting it.
type VolumeReplicaStats struct {
	Id           string // ID of the volume server node hosting this replica
	VolumeId     uint32 // volume this replica belongs to
	Files        uint64 // total file entries, including deleted ones
	FilesDeleted uint64 // deleted file entries
	TotalSize    uint64 // replica size in bytes
}

// RegularVolumesStats is a map of volume_id -> [volume replicas] with stat details.
type RegularVolumesStats map[uint32][]*VolumeReplicaStats
// EcVolumeStats holds file/size statistics for a single EC (erasure-coded)
// volume; EC volumes are counted once cluster-wide, not per shard.
type EcVolumeStats struct {
	VolumeId     uint32 // EC volume ID
	Files        uint64 // total file entries, including deleted ones
	FilesDeleted uint64 // deleted file entries
	TotalSize    uint64 // logical volume size in bytes
}

// EcVolumesStats is a map of ec_volume_id -> stat details.
type EcVolumesStats map[uint32]*EcVolumeStats
// commandClusterStatus implements the cluster.status shell command.
type commandClusterStatus struct{}
// ClusterStatusPrinter gathers cluster-wide information and renders the
// sections of the cluster.status report.
type ClusterStatusPrinter struct {
	writer             io.Writer
	writerMu           sync.Mutex // serializes writes to writer
	humanize           bool       // render numbers/sizes human-readable when true
	maxParallelization int        // cap on concurrent volume-server requests
	locked             bool       // whether the shell currently holds the cluster lock
	collections        []string
	topology           *master_pb.TopologyInfo
	volumeSizeLimitMb  uint64

	// Populated only when the --files flag is given; see loadFileStats.
	regularVolumesStats RegularVolumesStats
	ecVolumesStats      EcVolumesStats
}
// Name returns the shell command name.
func (c *commandClusterStatus) Name() string {
	return "cluster.status"
}
// Help returns the one-line help text shown by the shell.
func (c *commandClusterStatus) Help() string {
	return `outputs a quick overview of the cluster status`
}
// HasTag reports whether the command carries the given tag; this command has none.
func (c *commandClusterStatus) HasTag(CommandTag) bool {
	return false
}
// Do parses the command flags, collects collection names and cluster topology,
// optionally scrapes detailed file metrics from all volume servers (--files),
// and prints the resulting status report to writer.
func (c *commandClusterStatus) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
	fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
	humanizeFlag := fs.Bool("humanize", true, "human-readable output")
	filesFlag := fs.Bool("files", false, "include detailed file metrics, from all volume servers")
	parallelFlag := fs.Int("maxParallelization", DefaultMaxParallelization, "run up to X tasks in parallel, whenever possible")
	if err = fs.Parse(args); err != nil {
		return err
	}

	names, err := ListCollectionNames(commandEnv, true, true)
	if err != nil {
		return err
	}
	topo, sizeLimitMb, err := collectTopologyInfo(commandEnv, 0)
	if err != nil {
		return err
	}

	printer := &ClusterStatusPrinter{
		writer:             writer,
		humanize:           *humanizeFlag,
		maxParallelization: *parallelFlag,
		locked:             commandEnv.isLocked(),
		collections:        names,
		topology:           topo,
		volumeSizeLimitMb:  sizeLimitMb,
	}

	// File stats require a round-trip to every volume server, so they are
	// collected only on explicit request.
	if *filesFlag {
		if err := printer.loadFileStats(commandEnv); err != nil {
			return err
		}
	}

	printer.Print()
	return nil
}
// uint64 renders n as a decimal string, adding thousands separators when
// humanized output is enabled.
func (sp *ClusterStatusPrinter) uint64(n uint64) string {
	if sp.humanize {
		return humanize.Comma(int64(n))
	}
	return fmt.Sprintf("%d", n)
}
// int renders n like uint64 does; convenience wrapper for int values.
func (sp *ClusterStatusPrinter) int(n int) string {
	return sp.uint64(uint64(n))
}
// uint64Plural returns str pluralized according to n: a generic "str(s)" form
// when humanized output is disabled, otherwise the proper English plural via
// english.PluralWord (e.g. "file" vs "files").
func (sp *ClusterStatusPrinter) uint64Plural(n uint64, str string) string {
	if !sp.humanize {
		return fmt.Sprintf("%s(s)", str)
	}
	// english.PluralWord takes an int; clamp n so huge uint64 counts don't
	// overflow the conversion. (The original code also applied a redundant
	// int(...) conversion to an already-int value; removed.)
	clamped := math.MaxInt
	if n < math.MaxInt {
		clamped = int(n)
	}
	return english.PluralWord(clamped, str, "")
}
// plural is the int convenience wrapper around uint64Plural.
func (sp *ClusterStatusPrinter) plural(n int, str string) string {
	return sp.uint64Plural(uint64(n), str)
}
// bytes renders a byte count: an exact "<n> byte(s)" form when humanized
// output is disabled, otherwise a scaled form such as "14 MB".
func (sp *ClusterStatusPrinter) bytes(b uint64) string {
	if !sp.humanize {
		return fmt.Sprintf("%d %s", b, sp.plural(int(b), "byte"))
	}
	// humanize.Bytes already returns a string; the previous
	// fmt.Sprintf("%s", ...) wrapper was a no-op (staticcheck S1025).
	return humanize.Bytes(b)
}
// uint64Ratio renders a/b with up to two decimal digits; b == 0 yields 0.
func (sp *ClusterStatusPrinter) uint64Ratio(a, b uint64) string {
	var p float64
	if b != 0 {
		if a%b == 0 {
			// Avoid float precision issues on integer ratios.
			p = float64(a / b)
		} else {
			p = float64(a) / float64(b)
		}
	}
	if !sp.humanize {
		return fmt.Sprintf("%.02f", p)
	}
	// humanize.FtoaWithDigits already returns a string; the previous
	// fmt.Sprintf("%s", ...) wrapper was a no-op (staticcheck S1025).
	return humanize.FtoaWithDigits(p, 2)
}
// intRatio is the int convenience wrapper around uint64Ratio.
func (sp *ClusterStatusPrinter) intRatio(a, b int) string {
	return sp.uint64Ratio(uint64(a), uint64(b))
}
// uint64Pct renders a as a percentage of b ("NN.NN%"); b == 0 yields 0%.
func (sp *ClusterStatusPrinter) uint64Pct(a, b uint64) string {
	var pct float64
	switch {
	case b == 0:
		// leave pct at zero
	case a%b == 0:
		// avoid float rounding errors on exact ratios
		pct = float64(a / b * 100)
	default:
		pct = 100 * float64(a) / float64(b)
	}

	if sp.humanize {
		return fmt.Sprintf("%s%%", humanize.FtoaWithDigits(pct, 2))
	}
	return fmt.Sprintf("%.02f%%", pct)
}
// intPct is the int convenience wrapper around uint64Pct.
func (sp *ClusterStatusPrinter) intPct(a, b int) string {
	return sp.uint64Pct(uint64(a), uint64(b))
}
// write formats a line to the output writer, holding writerMu so concurrent
// writers do not interleave. Trailing spaces are trimmed and a newline is
// appended unless the format already ends in "\n" or "\r" (the latter is used
// by in-place progress lines).
func (sp *ClusterStatusPrinter) write(format string, a ...any) {
	sp.writerMu.Lock()
	defer sp.writerMu.Unlock()

	format = strings.TrimRight(format, " ")
	if format == "" {
		format = "\n"
	}
	fmt.Fprintf(sp.writer, format, a...)

	switch format[len(format)-1] {
	case '\n', '\r':
		// already line-terminated
	default:
		fmt.Fprint(sp.writer, "\n")
	}
}
// Print writes the full status report: cluster, volume, storage and — when
// file stats were loaded — file sections.
func (sp *ClusterStatusPrinter) Print() {
	sp.write("")
	sp.printClusterInfo()
	sp.printVolumeInfo()
	sp.printStorageInfo()
	sp.printFilesInfo()
}
// loadFileStats scrapes per-volume file metrics (file counts, deleted counts,
// sizes) from every volume server in the topology, running up to
// sp.maxParallelization requests in parallel. Results land in
// sp.regularVolumesStats (one entry per replica) and sp.ecVolumesStats (one
// entry per EC volume). A "\r"-based progress indicator is updated as tasks
// finish. Returns the first error from any request, if one occurred.
func (sp *ClusterStatusPrinter) loadFileStats(commandEnv *CommandEnv) error {
	sp.regularVolumesStats = RegularVolumesStats{}
	sp.ecVolumesStats = EcVolumesStats{}

	// statsMu guards the two stats maps and progressTotal; writerMu
	// serializes progress-line updates.
	var statsMu, writerMu sync.Mutex
	var progressTotal, progressDone uint64
	ewg := NewErrorWaitGroup(sp.maxParallelization)

	// updateProgress is deferred by every task, so the indicator advances
	// (and can reach 100%) even when a task returns an error.
	// NOTE(review): progressTotal is incremented under statsMu in the loop
	// below but read here under writerMu — confirm tasks cannot start running
	// before all totals are accumulated, otherwise this read may race.
	updateProgress := func() {
		writerMu.Lock()
		defer writerMu.Unlock()
		progressDone++
		sp.write("collecting file stats: %s \r", sp.uint64Pct(progressDone, progressTotal))
	}

	for _, dci := range sp.topology.DataCenterInfos {
		for _, ri := range dci.RackInfos {
			for _, dni := range ri.DataNodeInfos {
				for _, d := range dni.DiskInfos {
					// Count all pending work for this disk up front so the
					// percentage denominator covers every scheduled task.
					statsMu.Lock()
					progressTotal += uint64(len(d.VolumeInfos))
					progressTotal += uint64(len(d.EcShardInfos))
					statsMu.Unlock()

					for _, v := range d.VolumeInfos {
						ewg.Add(func() error {
							defer updateProgress()
							// Collect regular volume stats
							err := operation.WithVolumeServerClient(false, pb.NewServerAddressWithGrpcPort(dni.Id, int(dni.GrpcPort)), commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
								resp, reqErr := volumeServerClient.VolumeStatus(context.Background(), &volume_server_pb.VolumeStatusRequest{
									VolumeId: uint32(v.Id),
								})
								if reqErr != nil {
									return reqErr
								}
								statsMu.Lock()
								defer statsMu.Unlock()
								if resp != nil {
									// Append this replica's stats to its volume entry.
									if _, ok := sp.regularVolumesStats[v.Id]; !ok {
										sp.regularVolumesStats[v.Id] = []*VolumeReplicaStats{}
									}
									sp.regularVolumesStats[v.Id] = append(sp.regularVolumesStats[v.Id], &VolumeReplicaStats{
										Id:           dni.Id,
										VolumeId:     v.Id,
										Files:        resp.FileCount,
										FilesDeleted: resp.FileDeletedCount,
										TotalSize:    resp.VolumeSize,
									})
								}
								return nil
							})
							return err
						})
					}

					for _, eci := range d.EcShardInfos {
						ewg.Add(func() error {
							defer updateProgress()
							// Collect EC shard stats
							statsMu.Lock()
							_, ok := sp.ecVolumesStats[eci.Id]
							statsMu.Unlock()
							if ok {
								// this EC volume has been already processed, likely on a different node
								return nil
							}
							err := operation.WithVolumeServerClient(false, pb.NewServerAddressWithGrpcPort(dni.Id, int(dni.GrpcPort)), commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
								resp, reqErr := volumeServerClient.VolumeEcShardsInfo(context.Background(), &volume_server_pb.VolumeEcShardsInfoRequest{
									VolumeId: uint32(eci.Id),
								})
								if reqErr != nil {
									return reqErr
								}
								statsMu.Lock()
								defer statsMu.Unlock()
								if resp != nil {
									sp.ecVolumesStats[eci.Id] = &EcVolumeStats{
										VolumeId:     eci.Id,
										Files:        resp.FileCount,
										FilesDeleted: resp.FileDeletedCount,
										TotalSize:    resp.VolumeSize,
									}
								}
								return nil
							})
							return err
						})
					}
				}
			}
		}
	}

	err := ewg.Wait()
	// Terminate the "\r" progress line before normal output resumes.
	sp.write("")
	return err
}
// printClusterInfo writes the "cluster:" section: cluster ID, lock status,
// node count, and the topology breakdown (DCs, disks, racks).
func (sp *ClusterStatusPrinter) printClusterInfo() {
	var racks, nodes, disks int
	dcs := len(sp.topology.DataCenterInfos)
	for _, dc := range sp.topology.DataCenterInfos {
		racks += len(dc.RackInfos)
		for _, rack := range dc.RackInfos {
			nodes += len(rack.DataNodeInfos)
			for _, node := range rack.DataNodeInfos {
				disks += len(node.DiskInfos)
			}
		}
	}

	status := "unlocked"
	if sp.locked {
		status = "LOCKED"
	}

	sp.write("cluster:")
	sp.write("\tid: %s", sp.topology.Id)
	sp.write("\tstatus: %s", status)
	sp.write("\tnodes: %s", sp.int(nodes))
	sp.write("\ttopology: %s %s, %s %s on %s %s",
		sp.int(dcs), sp.plural(dcs, "DC"),
		sp.int(disks), sp.plural(disks, "disk"),
		sp.int(racks), sp.plural(racks, "rack"))
	sp.write("")
}
// printVolumeInfo writes the "volumes:" section: total volume/collection
// counts, the configured volume size limit, regular-volume replica counts
// split by writability, and EC volume/shard counts.
func (sp *ClusterStatusPrinter) printVolumeInfo() {
	collections := len(sp.collections)

	var maxVolumes uint64
	// Distinct volume IDs; the same volume appears once per replica/shard node.
	volumeIds := map[needle.VolumeId]bool{}
	ecVolumeIds := map[needle.VolumeId]bool{}
	var replicas, roReplicas, rwReplicas, ecShards int
	for _, dci := range sp.topology.DataCenterInfos {
		for _, ri := range dci.RackInfos {
			for _, dni := range ri.DataNodeInfos {
				for _, di := range dni.DiskInfos {
					maxVolumes += uint64(di.MaxVolumeCount)
					for _, vi := range di.VolumeInfos {
						vid := needle.VolumeId(vi.Id)
						volumeIds[vid] = true
						replicas++
						if vi.ReadOnly {
							roReplicas++
						} else {
							rwReplicas++
						}
					}
					for _, eci := range di.EcShardInfos {
						vid := needle.VolumeId(eci.Id)
						ecVolumeIds[vid] = true
						ecShards += erasure_coding.GetShardCount(eci)
					}
				}
			}
		}
	}
	volumes := len(volumeIds)
	ecVolumes := len(ecVolumeIds)
	totalVolumes := volumes + ecVolumes

	sp.write("volumes:")
	sp.write("\ttotal: %s %s, %s %s",
		sp.int(totalVolumes), sp.plural(totalVolumes, "volume"),
		sp.int(collections), sp.plural(collections, "collection"))
	// volumeSizeLimitMb is in MB; convert to bytes for display.
	sp.write("\tmax size: %s", sp.bytes(sp.volumeSizeLimitMb*1024*1024))
	sp.write("\tregular: %s/%s %s on %s %s, %s writable (%s), %s read-only (%s)",
		sp.int(volumes), sp.uint64(maxVolumes), sp.plural(volumes, "volume"),
		sp.int(replicas), sp.plural(replicas, "replica"),
		sp.int(rwReplicas), sp.intPct(rwReplicas, replicas),
		sp.int(roReplicas), sp.intPct(roReplicas, replicas))
	sp.write("\tEC: %s EC %s on %s %s (%s shards/volume)",
		sp.int(ecVolumes), sp.plural(ecVolumes, "volume"),
		sp.int(ecShards), sp.plural(ecShards, "shard"),
		sp.intRatio(ecShards, ecVolumes))
	sp.write("")
}
// printStorageInfo writes the "storage:" section, contrasting logical volume
// sizes (each volume counted once) against raw on-disk usage (every replica
// and shard counted), for regular and EC volumes.
func (sp *ClusterStatusPrinter) printStorageInfo() {
	// Logical size per volume ID (deduplicated across replicas/shard nodes).
	perVolumeSize := map[needle.VolumeId]uint64{}
	perEcVolumeSize := map[needle.VolumeId]uint64{}
	// Raw size sums every replica/shard, i.e. actual disk consumption.
	var rawVolumeSize, rawEcVolumeSize uint64
	for _, dci := range sp.topology.DataCenterInfos {
		for _, ri := range dci.RackInfos {
			for _, dni := range ri.DataNodeInfos {
				for _, di := range dni.DiskInfos {
					for _, vi := range di.VolumeInfos {
						vid := needle.VolumeId(vi.Id)
						// Replicas share a size; the last write per ID wins.
						perVolumeSize[vid] = vi.Size
						rawVolumeSize += vi.Size
					}
					for _, eci := range di.EcShardInfos {
						vid := needle.VolumeId(eci.Id)
						var size uint64
						for _, ss := range eci.ShardSizes {
							size += uint64(ss)
						}
						// EC shards are spread over nodes; accumulate per ID.
						perEcVolumeSize[vid] += size
						rawEcVolumeSize += size
					}
				}
			}
		}
	}
	// normalize EC logical volume sizes given shard settings
	for vid := range perEcVolumeSize {
		perEcVolumeSize[vid] = perEcVolumeSize[vid] * erasure_coding.DataShardsCount / erasure_coding.TotalShardsCount
	}

	var volumeSize, ecVolumeSize uint64
	for _, s := range perVolumeSize {
		volumeSize += s
	}
	for _, s := range perEcVolumeSize {
		ecVolumeSize += s
	}
	totalSize := volumeSize + ecVolumeSize
	totalRawSize := rawVolumeSize + rawEcVolumeSize

	sp.write("storage:")
	sp.write("\ttotal: %s (%s raw, %s)", sp.bytes(totalSize), sp.bytes(totalRawSize), sp.uint64Pct(totalRawSize, totalSize))
	sp.write("\tregular volumes: %s (%s raw, %s)", sp.bytes(volumeSize), sp.bytes(rawVolumeSize), sp.uint64Pct(rawVolumeSize, volumeSize))
	sp.write("\tEC volumes: %s (%s raw, %s)", sp.bytes(ecVolumeSize), sp.bytes(rawEcVolumeSize), sp.uint64Pct(rawEcVolumeSize, ecVolumeSize))
	sp.write("")
}
// printFilesInfo writes the "files:" section from the stats gathered by
// loadFileStats. It is a no-op when no file stats were collected (i.e. the
// --files flag was not given).
func (sp *ClusterStatusPrinter) printFilesInfo() {
	if len(sp.regularVolumesStats) == 0 && len(sp.ecVolumesStats) == 0 {
		return
	}

	var regularFilesTotal, regularFilesDeleted, regularFilesSize uint64
	for _, replicaStats := range sp.regularVolumesStats {
		// Replicas of a volume hold (nominally) the same files; average the
		// per-replica totals so each volume is counted once.
		rc := uint64(len(replicaStats))
		var volumeFilesTotal, volumeFilesSize, volumeFilesDeleted uint64
		for _, rs := range replicaStats {
			volumeFilesTotal += rs.Files
			volumeFilesSize += rs.TotalSize
			volumeFilesDeleted += rs.FilesDeleted
		}
		regularFilesTotal += (volumeFilesTotal / rc)
		regularFilesSize += (volumeFilesSize / rc)
		regularFilesDeleted += (volumeFilesDeleted / rc)
	}
	// "Readable" files = total entries minus deleted entries.
	regularFiles := regularFilesTotal - regularFilesDeleted
	var avgRegularFileSize uint64
	if regularFilesTotal != 0 {
		avgRegularFileSize = regularFilesSize / regularFilesTotal
	}

	var ecFilesTotal, ecFilesDeleted, ecFilesSize uint64
	for _, ecStats := range sp.ecVolumesStats {
		ecFilesTotal += ecStats.Files
		ecFilesSize += ecStats.TotalSize
		ecFilesDeleted += ecStats.FilesDeleted
	}
	ecFiles := ecFilesTotal - ecFilesDeleted
	var avgEcFileSize uint64
	if ecFilesTotal != 0 {
		avgEcFileSize = ecFilesSize / ecFilesTotal
	}

	files := regularFiles + ecFiles
	filesDeleted := regularFilesDeleted + ecFilesDeleted
	filesTotal := regularFilesTotal + ecFilesTotal
	filesSize := regularFilesSize + ecFilesSize
	var avgFileSize uint64
	if filesTotal != 0 {
		avgFileSize = filesSize / filesTotal
	}

	sp.write("files:")
	sp.write("\ttotal: %s %s, %s readable (%s), %s deleted (%s), avg %s per file",
		sp.uint64(filesTotal), sp.uint64Plural(filesTotal, "file"),
		sp.uint64(files), sp.uint64Pct(files, filesTotal),
		sp.uint64(filesDeleted), sp.uint64Pct(filesDeleted, filesTotal),
		sp.bytes(avgFileSize))
	sp.write("\tregular: %s %s, %s readable (%s), %s deleted (%s), avg %s per file",
		sp.uint64(regularFilesTotal), sp.uint64Plural(regularFilesTotal, "file"),
		sp.uint64(regularFiles), sp.uint64Pct(regularFiles, regularFilesTotal),
		sp.uint64(regularFilesDeleted), sp.uint64Pct(regularFilesDeleted, regularFilesTotal),
		sp.bytes(avgRegularFileSize))
	sp.write("\tEC: %s %s, %s readable (%s), %s deleted (%s), avg %s per file",
		sp.uint64(ecFilesTotal), sp.uint64Plural(ecFilesTotal, "file"),
		sp.uint64(ecFiles), sp.uint64Pct(ecFiles, ecFilesTotal),
		sp.uint64(ecFilesDeleted), sp.uint64Pct(ecFilesDeleted, ecFilesTotal),
		sp.bytes(avgEcFileSize))
	sp.write("")
}