Give cluster.status detailed file metrics for regular volumes (#7791)
* Implement a `weed shell` command to return a status overview of the cluster. Detailed file information will be implemented in a follow-up MR. Note also that masters are currently not reporting back EC shard sizes correctly, via `master_pb.VolumeEcShardInformationMessage.shard_sizes`. F.ex:

```
> status
cluster:
	id: topo
	status: LOCKED
	nodes: 10
	topology: 1 DC(s)s, 1 disk(s) on 1 rack(s)

volumes:
	total: 3 volumes on 1 collections
	max size: 31457280000 bytes
	regular: 2/80 volumes on 6 replicas, 6 writable (100.00%), 0 read-only (0.00%)
	EC: 1 EC volumes on 14 shards (14.00 shards/volume)

storage:
	total: 186024424 bytes
	regular volumes: 186024424 bytes
	EC volumes: 0 bytes
	raw: 558073152 bytes on volume replicas, 0 bytes on EC shard files
```

* Humanize output for `weed shell` by default. Makes things more readable :)

```
> cluster.status
cluster:
	id: topo
	status: LOCKED
	nodes: 10
	topology: 1 DC, 10 disks on 1 rack

volumes:
	total: 3 volumes, 1 collection
	max size: 32 GB
	regular: 2/80 volumes on 6 replicas, 6 writable (100%), 0 read-only (0%)
	EC: 1 EC volume on 14 shards (14 shards/volume)

storage:
	total: 172 MB
	regular volumes: 172 MB
	EC volumes: 0 B
	raw: 516 MB on volume replicas, 0 B on EC shards
```

```
> cluster.status --humanize=false
cluster:
	id: topo
	status: LOCKED
	nodes: 10
	topology: 1 DC(s), 10 disk(s) on 1 rack(s)

volumes:
	total: 3 volume(s), 1 collection(s)
	max size: 31457280000 byte(s)
	regular: 2/80 volume(s) on 6 replica(s), 5 writable (83.33%), 1 read-only (16.67%)
	EC: 1 EC volume(s) on 14 shard(s) (14.00 shards/volume)

storage:
	total: 172128072 byte(s)
	regular volumes: 172128072 byte(s)
	EC volumes: 0 byte(s)
	raw: 516384216 byte(s) on volume replicas, 0 byte(s) on EC shards
```

Also adds unit tests, and reshuffles test files handling for clarity.

* `cluster.status`: Add detailed file metrics for regular volumes.
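The detailed metrics land in a new `files:` section, gated behind a `--files` flag (collection requires a round-trip to every volume server, so it is off by default). A hypothetical session for illustration; the `files:` lines are taken verbatim from the non-humanized expectation in the unit tests at the end of this diff:

```
> cluster.status --files --humanize=false
...
files:
	regular: 500 file(s), 471 readable (94.20%), 29 deleted (5.80%), avg 530390 byte(s) per file
	regular raw: 1500 file(s), 1413 readable (94.20%), 87 deleted (5.80%), 795585624 byte(s) total
	EC: [no data]
	EC raw: [no data]
```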
```diff
@@ -1,13 +1,19 @@
 package shell
 
 import (
+	"context"
 	"flag"
 	"fmt"
+	"math"
 	"strings"
+	"sync"
 
 	"github.com/dustin/go-humanize"
 	"github.com/dustin/go-humanize/english"
+	"github.com/seaweedfs/seaweedfs/weed/operation"
+	"github.com/seaweedfs/seaweedfs/weed/pb"
 	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
+	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
 	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
 	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
```
```diff
@@ -18,15 +24,29 @@ func init() {
 	Commands = append(Commands, &commandClusterStatus{})
 }
 
+// Map of volume_id -> [volume replicas] with stat details.
+type VolumeReplicaStats struct {
+	Id       string
+	VolumeId uint32
+
+	Files        uint64
+	FilesDeleted uint64
+	TotalSize    uint64
+}
+type RegularVolumeStats map[uint32][]*VolumeReplicaStats
+
 type commandClusterStatus struct{}
 type ClusterStatusPrinter struct {
-	writer   io.Writer
-	humanize bool
+	writer             io.Writer
+	writerMu           sync.Mutex
+	humanize           bool
+	maxParallelization int
 
-	locked            bool
-	collections       []string
-	topology          *master_pb.TopologyInfo
-	volumeSizeLimitMb uint64
+	locked             bool
+	collections        []string
+	topology           *master_pb.TopologyInfo
+	volumeSizeLimitMb  uint64
+	regularVolumeStats RegularVolumeStats
 }
 
 func (c *commandClusterStatus) Name() string {
```
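`RegularVolumeStats` keys per-replica stat entries by volume id, which is what lets `printFilesInfo` (further down) distinguish logical from raw counts: replicas of a volume hold copies of the same files, so logical counts average across a volume's replicas while raw counts sum every replica. A minimal, self-contained sketch of that idea (types restated with a field subset; the fixture values are illustrative):

```go
package main

import "fmt"

// Illustrative restatement of the diff's types (field subset only).
type VolumeReplicaStats struct {
	Files, FilesDeleted, TotalSize uint64
}
type RegularVolumeStats map[uint32][]*VolumeReplicaStats

func main() {
	// Volume 1 held on two replicas; values are made up for the example.
	stats := RegularVolumeStats{
		1: {
			{Files: 159, FilesDeleted: 8, TotalSize: 89762704},
			{Files: 159, FilesDeleted: 8, TotalSize: 89762704},
		},
	}

	var logical, raw uint64
	for _, replicas := range stats {
		rc := uint64(len(replicas))
		var perVolume uint64
		for _, rs := range replicas {
			raw += rs.Files       // raw: every replica's copy counts
			perVolume += rs.Files // summed here, averaged below
		}
		logical += perVolume / rc // replicas store copies of the same files
	}
	fmt.Println(logical, raw) // 159 318
}
```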
```diff
@@ -44,6 +64,8 @@ func (c *commandClusterStatus) HasTag(CommandTag) bool {
 func (c *commandClusterStatus) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
 	flags := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
 	humanize := flags.Bool("humanize", true, "human-readable output")
+	includeFiles := flags.Bool("files", false, "include detailed file metrics, from all volume servers")
+	maxParallelization := flags.Int("maxParallelization", DefaultMaxParallelization, "run up to X tasks in parallel, whenever possible")
 
 	if err = flags.Parse(args); err != nil {
 		return err
@@ -59,14 +81,21 @@ func (c *commandClusterStatus) Do(args []string, commandEnv *CommandEnv, writer
 	}
 
 	sp := &ClusterStatusPrinter{
-		writer:   writer,
-		humanize: *humanize,
+		writer:             writer,
+		humanize:           *humanize,
+		maxParallelization: *maxParallelization,
 
 		locked:            commandEnv.isLocked(),
 		collections:       collections,
 		topology:          topology,
 		volumeSizeLimitMb: volumeSizeLimitMb,
 	}
+	if *includeFiles {
+		if err := sp.loadFileStats(commandEnv); err != nil {
+			return err
+		}
+	}
 
 	sp.Print()
 
 	return nil
```
```diff
@@ -83,11 +112,19 @@ func (sp *ClusterStatusPrinter) int(n int) string {
 	return sp.uint64(uint64(n))
 }
 
-func (sp *ClusterStatusPrinter) plural(n int, str string) string {
+func (sp *ClusterStatusPrinter) uint64Plural(n uint64, str string) string {
 	if !sp.humanize {
 		return fmt.Sprintf("%s(s)", str)
 	}
-	return english.PluralWord(n, str, "")
+	uin := math.MaxInt
+	if n < math.MaxInt {
+		uin = int(n)
+	}
+	return english.PluralWord(int(uin), str, "")
+}
+
+func (sp *ClusterStatusPrinter) plural(n int, str string) string {
+	return sp.uint64Plural(uint64(n), str)
 }
 
 func (sp *ClusterStatusPrinter) bytes(b uint64) string {
```
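The `uin` detour exists because `english.PluralWord` takes an `int` while the counters here are `uint64`; converting a value above `math.MaxInt` directly would overflow, so the helper clamps first. A standalone sketch of the same guard (the `english.PluralWord` call is the same one the diff uses; `clampedPlural` is a hypothetical name):

```go
package main

import (
	"fmt"
	"math"

	"github.com/dustin/go-humanize/english"
)

// clampedPlural mirrors uint64Plural from the diff: clamp before converting,
// since converting a huge uint64 straight to int would overflow.
func clampedPlural(n uint64, word string) string {
	uin := math.MaxInt
	if n < math.MaxInt {
		uin = int(n)
	}
	return english.PluralWord(uin, word, "") // "" lets the library pluralize
}

func main() {
	fmt.Println(clampedPlural(1, "volume"))              // volume
	fmt.Println(clampedPlural(3, "volume"))              // volumes
	fmt.Println(clampedPlural(math.MaxUint64, "volume")) // volumes (clamped, no overflow)
}
```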
```diff
@@ -128,16 +165,90 @@ func (sp *ClusterStatusPrinter) intPct(a, b int) string {
 }
 
 func (sp *ClusterStatusPrinter) write(format string, a ...any) {
-	fmt.Fprintf(sp.writer, strings.TrimRight(format, "\r\n "), a...)
-	fmt.Fprint(sp.writer, "\n")
+	sp.writerMu.Lock()
+	defer sp.writerMu.Unlock()
+
+	format = strings.TrimRight(format, " ")
+	if len(format) == 0 {
+		format = "\n"
+	}
+	fmt.Fprintf(sp.writer, format, a...)
+
+	last := format[len(format)-1:]
+	if last != "\n" && last != "\r" {
+		fmt.Fprint(sp.writer, "\n")
+	}
 }
 
-// TODO: add option to collect detailed file stats
 func (sp *ClusterStatusPrinter) Print() {
 	sp.write("")
 	sp.printClusterInfo()
 	sp.printVolumeInfo()
 	sp.printStorageInfo()
+	sp.printFilesInfo()
 }
 
+// TODO: collect stats for EC volumes as well
+func (sp *ClusterStatusPrinter) loadFileStats(commandEnv *CommandEnv) error {
+	sp.regularVolumeStats = RegularVolumeStats{}
+
+	var mu sync.Mutex
+	var progressTotal, progressDone uint64
+	ewg := NewErrorWaitGroup(sp.maxParallelization)
+
+	for _, dci := range sp.topology.DataCenterInfos {
+		for _, ri := range dci.RackInfos {
+			for _, dni := range ri.DataNodeInfos {
+				for _, d := range dni.DiskInfos {
+					mu.Lock()
+					progressTotal += uint64(len(d.VolumeInfos))
+					mu.Unlock()
+					for _, v := range d.VolumeInfos {
+						ewg.Add(func() error {
+							// Collect regular volume stats
+							err := operation.WithVolumeServerClient(false, pb.NewServerAddressWithGrpcPort(dni.Id, int(dni.GrpcPort)), commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
+								resp, reqErr := volumeServerClient.VolumeStatus(context.Background(), &volume_server_pb.VolumeStatusRequest{
+									VolumeId: uint32(v.Id),
+								})
+								if reqErr != nil {
+									return reqErr
+								}
+
+								mu.Lock()
+								defer mu.Unlock()
+								if resp != nil {
+									if _, ok := sp.regularVolumeStats[v.Id]; !ok {
+										sp.regularVolumeStats[v.Id] = []*VolumeReplicaStats{}
+									}
+									sp.regularVolumeStats[v.Id] = append(sp.regularVolumeStats[v.Id], &VolumeReplicaStats{
+										Id:           dni.Id,
+										VolumeId:     v.Id,
+										Files:        resp.FileCount,
+										FilesDeleted: resp.FileDeletedCount,
+										TotalSize:    resp.VolumeSize,
+									})
+								}
+								progressDone++
+								return nil
+							})
+							if err != nil {
+								return err
+							}
+
+							mu.Lock()
+							sp.write("collecting file stats: %s \r", sp.uint64Pct(progressDone, progressTotal))
+							mu.Unlock()
+							return nil
+						})
+					}
+				}
+			}
+		}
+	}
+
+	err := ewg.Wait()
+	sp.write("")
+	return err
+}
 
 func (sp *ClusterStatusPrinter) printClusterInfo() {
```
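`NewErrorWaitGroup` is a helper from the surrounding `shell` package; as used here, it runs the queued closures with at most `maxParallelization` in flight and surfaces an error from `Wait`. For readers outside the codebase, the same pattern can be sketched with `golang.org/x/sync/errgroup` (a stand-in, not the actual helper):

```go
package main

import (
	"fmt"

	"golang.org/x/sync/errgroup"
)

func main() {
	// Bounded fan-out with error collection, the same shape as
	// NewErrorWaitGroup(sp.maxParallelization) in the diff above.
	var g errgroup.Group
	g.SetLimit(4) // at most 4 tasks in flight, like -maxParallelization

	for i := 0; i < 10; i++ {
		i := i // capture loop variable (pre-Go 1.22 semantics)
		g.Go(func() error {
			// Stand-in for the per-volume VolumeStatus RPC.
			fmt.Println("querying volume", i)
			return nil
		})
	}
	// Wait blocks until all tasks finish and returns the first non-nil error.
	if err := g.Wait(); err != nil {
		fmt.Println("failed:", err)
	}
}
```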
```diff
@@ -277,3 +388,52 @@ func (sp *ClusterStatusPrinter) printStorageInfo() {
 	sp.write("\traw: %s on volume replicas, %s on EC shards", sp.bytes(rawVolumeSize), sp.bytes(rawEcVolumeSize))
 	sp.write("")
 }
+
+func (sp *ClusterStatusPrinter) printFilesInfo() {
+	if len(sp.regularVolumeStats) == 0 {
+		return
+	}
+
+	var regularFilesTotal, regularFilesDeleted, regularFilesSize uint64
+	var regularFilesTotalRaw, regularFilesDeletedRaw, regularFilesSizeRaw uint64
+
+	for _, replicaStats := range sp.regularVolumeStats {
+		rc := uint64(len(replicaStats))
+
+		var volumeFilesTotal, volumeFilesSize, volumeFilesDeleted uint64
+		for _, rs := range replicaStats {
+			regularFilesTotalRaw += rs.Files
+			regularFilesSizeRaw += rs.TotalSize
+			regularFilesDeletedRaw += rs.FilesDeleted
+
+			volumeFilesTotal += rs.Files
+			volumeFilesSize += rs.TotalSize
+			volumeFilesDeleted += rs.FilesDeleted
+		}
+		regularFilesTotal += (volumeFilesTotal / rc)
+		regularFilesSize += (volumeFilesSize / rc)
+		regularFilesDeleted += (volumeFilesDeleted / rc)
+	}
+
+	regularFiles := regularFilesTotal - regularFilesDeleted
+	regularFilesRaw := regularFilesTotalRaw - regularFilesDeletedRaw
+	var avgFileSize uint64
+	if regularFilesTotal != 0 {
+		avgFileSize = regularFilesSize / regularFilesTotal
+	}
+
+	sp.write("files:")
+	sp.write("\tregular: %s %s, %s readable (%s), %s deleted (%s), avg %s per file",
+		sp.uint64(regularFilesTotal), sp.uint64Plural(regularFilesTotal, "file"),
+		sp.uint64(regularFiles), sp.uint64Pct(regularFiles, regularFilesTotal),
+		sp.uint64(regularFilesDeleted), sp.uint64Pct(regularFilesDeleted, regularFilesTotal),
+		sp.bytes(avgFileSize))
+	sp.write("\tregular raw: %s %s, %s readable (%s), %s deleted (%s), %s total",
+		sp.uint64(regularFilesTotalRaw), sp.uint64Plural(regularFilesTotalRaw, "file"),
+		sp.uint64(regularFilesRaw), sp.uint64Pct(regularFilesRaw, regularFilesTotalRaw),
+		sp.uint64(regularFilesDeletedRaw), sp.uint64Pct(regularFilesDeletedRaw, regularFilesTotalRaw),
+		sp.bytes(regularFilesSizeRaw))
+	sp.write("\tEC: [no data]")
+	sp.write("\tEC raw: [no data]")
+	sp.write("")
+}
```
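As a sanity check on the averaging above: the first (non-humanized) test case in the unit-test hunk below has three volumes with three identical replicas each, so the expected numbers work out as:

```
logical files:   159 + 192 + 149 = 500;   deleted: 8 + 21 + 0 = 29;   readable: 500 - 29 = 471 (94.20%)
logical size:    89762704 + 93788632 + 81643872 = 265195208 bytes
avg per file:    265195208 / 500 = 530390 bytes (integer division)
raw (x3 each):   3 x 500 = 1500 files,   3 x 29 = 87 deleted,   3 x 265195208 = 795585624 bytes
```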
```diff
@@ -138,3 +138,78 @@ func TestPrintStorageInfo(t *testing.T) {
 		}
 	}
 }
+
+func TestPrintFilesInfo(t *testing.T) {
+	testCases := []struct {
+		regularVolumeStats RegularVolumeStats
+		humanize           bool
+		want               string
+	}{
+		{
+			regularVolumeStats: RegularVolumeStats{
+				1: []*VolumeReplicaStats{
+					&VolumeReplicaStats{Id: "10.200.17.13:9001", VolumeId: 1, Files: 159, FilesDeleted: 8, TotalSize: 89762704},
+					&VolumeReplicaStats{Id: "10.200.17.13:9002", VolumeId: 1, Files: 159, FilesDeleted: 8, TotalSize: 89762704},
+					&VolumeReplicaStats{Id: "10.200.17.13:9008", VolumeId: 1, Files: 159, FilesDeleted: 8, TotalSize: 89762704},
+				},
+				2: []*VolumeReplicaStats{
+					&VolumeReplicaStats{Id: "10.200.17.13:9003", VolumeId: 2, Files: 192, FilesDeleted: 21, TotalSize: 93788632},
+					&VolumeReplicaStats{Id: "10.200.17.13:9004", VolumeId: 2, Files: 192, FilesDeleted: 21, TotalSize: 93788632},
+					&VolumeReplicaStats{Id: "10.200.17.13:9005", VolumeId: 2, Files: 192, FilesDeleted: 21, TotalSize: 93788632},
+				},
+				3: []*VolumeReplicaStats{
+					&VolumeReplicaStats{Id: "10.200.17.13:9001", VolumeId: 3, Files: 149, FilesDeleted: 0, TotalSize: 81643872},
+					&VolumeReplicaStats{Id: "10.200.17.13:9006", VolumeId: 3, Files: 149, FilesDeleted: 0, TotalSize: 81643872},
+					&VolumeReplicaStats{Id: "10.200.17.13:9009", VolumeId: 3, Files: 149, FilesDeleted: 0, TotalSize: 81643872},
+				},
+			},
+			humanize: false,
+			want: `files:
+	regular: 500 file(s), 471 readable (94.20%), 29 deleted (5.80%), avg 530390 byte(s) per file
+	regular raw: 1500 file(s), 1413 readable (94.20%), 87 deleted (5.80%), 795585624 byte(s) total
+	EC: [no data]
+	EC raw: [no data]
+
+`,
+		},
+		{
+			regularVolumeStats: RegularVolumeStats{
+				1: []*VolumeReplicaStats{
+					&VolumeReplicaStats{Id: "10.200.17.13:9001", VolumeId: 1, Files: 184, FilesDeleted: 33, TotalSize: 79187475},
+					&VolumeReplicaStats{Id: "10.200.17.13:9008", VolumeId: 1, Files: 184, FilesDeleted: 33, TotalSize: 79187475},
+				},
+				2: []*VolumeReplicaStats{
+					&VolumeReplicaStats{Id: "10.200.17.13:9004", VolumeId: 2, Files: 245, FilesDeleted: 4, TotalSize: 89501070},
+					&VolumeReplicaStats{Id: "10.200.17.13:9005", VolumeId: 2, Files: 245, FilesDeleted: 4, TotalSize: 89501070},
+				},
+				3: []*VolumeReplicaStats{
+					&VolumeReplicaStats{Id: "10.200.17.13:9006", VolumeId: 3, Files: 171, FilesDeleted: 12, TotalSize: 124049530},
+					&VolumeReplicaStats{Id: "10.200.17.13:9009", VolumeId: 3, Files: 171, FilesDeleted: 12, TotalSize: 124049530},
+				},
+			},
+			humanize: true,
+			want: `files:
+	regular: 600 files, 551 readable (91.83%), 49 deleted (8.16%), avg 488 kB per file
+	regular raw: 1,200 files, 1,102 readable (91.83%), 98 deleted (8.16%), 586 MB total
+	EC: [no data]
+	EC raw: [no data]
+
+`,
+		},
+	}
+
+	for i, tc := range testCases {
+		var buf bytes.Buffer
+		sp := &ClusterStatusPrinter{
+			writer:             &buf,
+			humanize:           tc.humanize,
+			regularVolumeStats: tc.regularVolumeStats,
+		}
+		sp.printFilesInfo()
+		got := buf.String()
+
+		if got != tc.want {
+			t.Errorf("#%d: got %v, want %v", i, got, tc.want)
+		}
+	}
+}
```