Admin: misc improvements on admin server and workers. EC now works. (#7055)

* initial design

* added simulation as tests

* reorganized the codebase to move the simulation framework and tests into their own dedicated package

* integration test. ec worker task

* remove "enhanced" reference

* start master, volume servers, filer

Current Status
 Master: Healthy and running (port 9333)
 Filer: Healthy and running (port 8888)
 Volume Servers: All 6 servers running (ports 8080-8085)
🔄 Admin/Workers: Will start when dependencies are ready

* generate write load

* tasks are assigned

* admin start wtih grpc port. worker has its own working directory

* Update .gitignore

* working worker and admin. Task detection is not working yet.

* compiles, detection uses volumeSizeLimitMB from master

* compiles

* worker retries connecting to admin

* build and restart

* rendering pending tasks

* skip task ID column

* sticky worker id

* test canScheduleTaskNow

* worker reconnect to admin

* clean up logs

* worker register itself first

* worker can run ec work and report status

but:
1. one volume should not be repeatedly worked on.
2. ec shards needs to be distributed and source data should be deleted.

* move ec task logic

* listing ec shards

* local copy, ec. Need to distribute.

* ec is mostly working now

* distribution of ec shards needs improvement
* need configuration to enable ec

* show ec volumes

* interval field UI component

* rename

* integration test with vauuming

* garbage percentage threshold

* fix warning

* display ec shard sizes

* fix ec volumes list

* Update ui.go

* show default values

* ensure correct default value

* MaintenanceConfig use ConfigField

* use schema defined defaults

* config

* reduce duplication

* refactor to use BaseUIProvider

* each task register its schema

* checkECEncodingCandidate use ecDetector

* use vacuumDetector

* use volumeSizeLimitMB

* remove

remove

* remove unused

* refactor

* use new framework

* remove v2 reference

* refactor

* left menu can scroll now

* The maintenance manager was not being initialized when no data directory was configured for persistent storage.

* saving config

* Update task_config_schema_templ.go

* enable/disable tasks

* protobuf encoded task configurations

* fix system settings

* use ui component

* remove logs

* interface{} Reduction

* reduce interface{}

* reduce interface{}

* avoid from/to map

* reduce interface{}

* refactor

* keep it DRY

* added logging

* debug messages

* debug level

* debug

* show the log caller line

* use configured task policy

* log level

* handle admin heartbeat response

* Update worker.go

* fix EC rack and dc count

* Report task status to admin server

* fix task logging, simplify interface checking, use erasure_coding constants

* factor in empty volume server during task planning

* volume.list adds disk id

* track disk id also

* fix locking scheduled and manual scanning

* add active topology

* simplify task detector

* ec task completed, but shards are not showing up

* implement ec in ec_typed.go

* adjust log level

* dedup

* implementing ec copying shards and only ecx files

* use disk id when distributing ec shards

🎯 Planning: ActiveTopology creates DestinationPlan with specific TargetDisk
📦 Task Creation: maintenance_integration.go creates ECDestination with DiskId
🚀 Task Execution: EC task passes DiskId in VolumeEcShardsCopyRequest
💾 Volume Server: Receives disk_id and stores shards on specific disk (vs.store.Locations[req.DiskId])
📂 File System: EC shards and metadata land in the exact disk directory planned

* Delete original volume from all locations

* clean up existing shard locations

* local encoding and distributing

* Update docker/admin_integration/EC-TESTING-README.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* check volume id range

* simplify

* fix tests

* fix types

* clean up logs and tests

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Chris Lu
2025-07-30 12:38:03 -07:00
committed by GitHub
parent 64198dad83
commit 891a2fb6eb
130 changed files with 27737 additions and 4429 deletions

View File

@@ -109,6 +109,7 @@ message VolumeInformationMessage {
string remote_storage_name = 13;
string remote_storage_key = 14;
string disk_type = 15;
uint32 disk_id = 16;
}
message VolumeShortInformationMessage {
@@ -118,6 +119,7 @@ message VolumeShortInformationMessage {
uint32 version = 9;
uint32 ttl = 10;
string disk_type = 15;
uint32 disk_id = 16;
}
message VolumeEcShardInformationMessage {
@@ -126,6 +128,7 @@ message VolumeEcShardInformationMessage {
uint32 ec_index_bits = 3;
string disk_type = 4;
uint64 expire_at_sec = 5; // used to record the destruction time of ec volume
uint32 disk_id = 6;
}
message StorageBackend {
@@ -279,6 +282,7 @@ message DiskInfo {
repeated VolumeInformationMessage volume_infos = 6;
repeated VolumeEcShardInformationMessage ec_shard_infos = 7;
int64 remote_volume_count = 8;
uint32 disk_id = 9;
}
message DataNodeInfo {
string id = 1;

View File

@@ -313,6 +313,7 @@ type VolumeInformationMessage struct {
RemoteStorageName string `protobuf:"bytes,13,opt,name=remote_storage_name,json=remoteStorageName,proto3" json:"remote_storage_name,omitempty"`
RemoteStorageKey string `protobuf:"bytes,14,opt,name=remote_storage_key,json=remoteStorageKey,proto3" json:"remote_storage_key,omitempty"`
DiskType string `protobuf:"bytes,15,opt,name=disk_type,json=diskType,proto3" json:"disk_type,omitempty"`
DiskId uint32 `protobuf:"varint,16,opt,name=disk_id,json=diskId,proto3" json:"disk_id,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
@@ -452,6 +453,13 @@ func (x *VolumeInformationMessage) GetDiskType() string {
return ""
}
func (x *VolumeInformationMessage) GetDiskId() uint32 {
if x != nil {
return x.DiskId
}
return 0
}
type VolumeShortInformationMessage struct {
state protoimpl.MessageState `protogen:"open.v1"`
Id uint32 `protobuf:"varint,1,opt,name=id,proto3" json:"id,omitempty"`
@@ -460,6 +468,7 @@ type VolumeShortInformationMessage struct {
Version uint32 `protobuf:"varint,9,opt,name=version,proto3" json:"version,omitempty"`
Ttl uint32 `protobuf:"varint,10,opt,name=ttl,proto3" json:"ttl,omitempty"`
DiskType string `protobuf:"bytes,15,opt,name=disk_type,json=diskType,proto3" json:"disk_type,omitempty"`
DiskId uint32 `protobuf:"varint,16,opt,name=disk_id,json=diskId,proto3" json:"disk_id,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
@@ -536,6 +545,13 @@ func (x *VolumeShortInformationMessage) GetDiskType() string {
return ""
}
func (x *VolumeShortInformationMessage) GetDiskId() uint32 {
if x != nil {
return x.DiskId
}
return 0
}
type VolumeEcShardInformationMessage struct {
state protoimpl.MessageState `protogen:"open.v1"`
Id uint32 `protobuf:"varint,1,opt,name=id,proto3" json:"id,omitempty"`
@@ -543,6 +559,7 @@ type VolumeEcShardInformationMessage struct {
EcIndexBits uint32 `protobuf:"varint,3,opt,name=ec_index_bits,json=ecIndexBits,proto3" json:"ec_index_bits,omitempty"`
DiskType string `protobuf:"bytes,4,opt,name=disk_type,json=diskType,proto3" json:"disk_type,omitempty"`
ExpireAtSec uint64 `protobuf:"varint,5,opt,name=expire_at_sec,json=expireAtSec,proto3" json:"expire_at_sec,omitempty"` // used to record the destruction time of ec volume
DiskId uint32 `protobuf:"varint,6,opt,name=disk_id,json=diskId,proto3" json:"disk_id,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
@@ -612,6 +629,13 @@ func (x *VolumeEcShardInformationMessage) GetExpireAtSec() uint64 {
return 0
}
func (x *VolumeEcShardInformationMessage) GetDiskId() uint32 {
if x != nil {
return x.DiskId
}
return 0
}
type StorageBackend struct {
state protoimpl.MessageState `protogen:"open.v1"`
Type string `protobuf:"bytes,1,opt,name=type,proto3" json:"type,omitempty"`
@@ -1904,6 +1928,7 @@ type DiskInfo struct {
VolumeInfos []*VolumeInformationMessage `protobuf:"bytes,6,rep,name=volume_infos,json=volumeInfos,proto3" json:"volume_infos,omitempty"`
EcShardInfos []*VolumeEcShardInformationMessage `protobuf:"bytes,7,rep,name=ec_shard_infos,json=ecShardInfos,proto3" json:"ec_shard_infos,omitempty"`
RemoteVolumeCount int64 `protobuf:"varint,8,opt,name=remote_volume_count,json=remoteVolumeCount,proto3" json:"remote_volume_count,omitempty"`
DiskId uint32 `protobuf:"varint,9,opt,name=disk_id,json=diskId,proto3" json:"disk_id,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
@@ -1994,6 +2019,13 @@ func (x *DiskInfo) GetRemoteVolumeCount() int64 {
return 0
}
func (x *DiskInfo) GetDiskId() uint32 {
if x != nil {
return x.DiskId
}
return 0
}
type DataNodeInfo struct {
state protoimpl.MessageState `protogen:"open.v1"`
Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"`
@@ -4034,7 +4066,7 @@ const file_master_proto_rawDesc = "" +
"\x18metrics_interval_seconds\x18\x04 \x01(\rR\x16metricsIntervalSeconds\x12D\n" +
"\x10storage_backends\x18\x05 \x03(\v2\x19.master_pb.StorageBackendR\x0fstorageBackends\x12)\n" +
"\x10duplicated_uuids\x18\x06 \x03(\tR\x0fduplicatedUuids\x12 \n" +
"\vpreallocate\x18\a \x01(\bR\vpreallocate\"\x98\x04\n" +
"\vpreallocate\x18\a \x01(\bR\vpreallocate\"\xb1\x04\n" +
"\x18VolumeInformationMessage\x12\x0e\n" +
"\x02id\x18\x01 \x01(\rR\x02id\x12\x12\n" +
"\x04size\x18\x02 \x01(\x04R\x04size\x12\x1e\n" +
@@ -4054,7 +4086,8 @@ const file_master_proto_rawDesc = "" +
"\x12modified_at_second\x18\f \x01(\x03R\x10modifiedAtSecond\x12.\n" +
"\x13remote_storage_name\x18\r \x01(\tR\x11remoteStorageName\x12,\n" +
"\x12remote_storage_key\x18\x0e \x01(\tR\x10remoteStorageKey\x12\x1b\n" +
"\tdisk_type\x18\x0f \x01(\tR\bdiskType\"\xc5\x01\n" +
"\tdisk_type\x18\x0f \x01(\tR\bdiskType\x12\x17\n" +
"\adisk_id\x18\x10 \x01(\rR\x06diskId\"\xde\x01\n" +
"\x1dVolumeShortInformationMessage\x12\x0e\n" +
"\x02id\x18\x01 \x01(\rR\x02id\x12\x1e\n" +
"\n" +
@@ -4064,7 +4097,8 @@ const file_master_proto_rawDesc = "" +
"\aversion\x18\t \x01(\rR\aversion\x12\x10\n" +
"\x03ttl\x18\n" +
" \x01(\rR\x03ttl\x12\x1b\n" +
"\tdisk_type\x18\x0f \x01(\tR\bdiskType\"\xb6\x01\n" +
"\tdisk_type\x18\x0f \x01(\tR\bdiskType\x12\x17\n" +
"\adisk_id\x18\x10 \x01(\rR\x06diskId\"\xcf\x01\n" +
"\x1fVolumeEcShardInformationMessage\x12\x0e\n" +
"\x02id\x18\x01 \x01(\rR\x02id\x12\x1e\n" +
"\n" +
@@ -4072,7 +4106,8 @@ const file_master_proto_rawDesc = "" +
"collection\x12\"\n" +
"\rec_index_bits\x18\x03 \x01(\rR\vecIndexBits\x12\x1b\n" +
"\tdisk_type\x18\x04 \x01(\tR\bdiskType\x12\"\n" +
"\rexpire_at_sec\x18\x05 \x01(\x04R\vexpireAtSec\"\xbe\x01\n" +
"\rexpire_at_sec\x18\x05 \x01(\x04R\vexpireAtSec\x12\x17\n" +
"\adisk_id\x18\x06 \x01(\rR\x06diskId\"\xbe\x01\n" +
"\x0eStorageBackend\x12\x12\n" +
"\x04type\x18\x01 \x01(\tR\x04type\x12\x0e\n" +
"\x02id\x18\x02 \x01(\tR\x02id\x12I\n" +
@@ -4199,7 +4234,7 @@ const file_master_proto_rawDesc = "" +
"\vcollections\x18\x01 \x03(\v2\x15.master_pb.CollectionR\vcollections\"-\n" +
"\x17CollectionDeleteRequest\x12\x12\n" +
"\x04name\x18\x01 \x01(\tR\x04name\"\x1a\n" +
"\x18CollectionDeleteResponse\"\x91\x03\n" +
"\x18CollectionDeleteResponse\"\xaa\x03\n" +
"\bDiskInfo\x12\x12\n" +
"\x04type\x18\x01 \x01(\tR\x04type\x12!\n" +
"\fvolume_count\x18\x02 \x01(\x03R\vvolumeCount\x12(\n" +
@@ -4208,7 +4243,8 @@ const file_master_proto_rawDesc = "" +
"\x13active_volume_count\x18\x05 \x01(\x03R\x11activeVolumeCount\x12F\n" +
"\fvolume_infos\x18\x06 \x03(\v2#.master_pb.VolumeInformationMessageR\vvolumeInfos\x12P\n" +
"\x0eec_shard_infos\x18\a \x03(\v2*.master_pb.VolumeEcShardInformationMessageR\fecShardInfos\x12.\n" +
"\x13remote_volume_count\x18\b \x01(\x03R\x11remoteVolumeCount\"\xd4\x01\n" +
"\x13remote_volume_count\x18\b \x01(\x03R\x11remoteVolumeCount\x12\x17\n" +
"\adisk_id\x18\t \x01(\rR\x06diskId\"\xd4\x01\n" +
"\fDataNodeInfo\x12\x0e\n" +
"\x02id\x18\x01 \x01(\tR\x02id\x12D\n" +
"\tdiskInfos\x18\x02 \x03(\v2&.master_pb.DataNodeInfo.DiskInfosEntryR\tdiskInfos\x12\x1b\n" +

View File

@@ -53,6 +53,8 @@ service VolumeServer {
}
rpc CopyFile (CopyFileRequest) returns (stream CopyFileResponse) {
}
rpc ReceiveFile (stream ReceiveFileRequest) returns (ReceiveFileResponse) {
}
rpc ReadNeedleBlob (ReadNeedleBlobRequest) returns (ReadNeedleBlobResponse) {
}
@@ -87,6 +89,8 @@ service VolumeServer {
}
rpc VolumeEcShardsToVolume (VolumeEcShardsToVolumeRequest) returns (VolumeEcShardsToVolumeResponse) {
}
rpc VolumeEcShardsInfo (VolumeEcShardsInfoRequest) returns (VolumeEcShardsInfoResponse) {
}
// tiered storage
rpc VolumeTierMoveDatToRemote (VolumeTierMoveDatToRemoteRequest) returns (stream VolumeTierMoveDatToRemoteResponse) {
@@ -285,6 +289,27 @@ message CopyFileResponse {
int64 modified_ts_ns = 2;
}
message ReceiveFileRequest {
oneof data {
ReceiveFileInfo info = 1;
bytes file_content = 2;
}
}
message ReceiveFileInfo {
uint32 volume_id = 1;
string ext = 2;
string collection = 3;
bool is_ec_volume = 4;
uint32 shard_id = 5;
uint64 file_size = 6;
}
message ReceiveFileResponse {
uint64 bytes_written = 1;
string error = 2;
}
message ReadNeedleBlobRequest {
uint32 volume_id = 1;
int64 offset = 3; // actual offset
@@ -376,6 +401,7 @@ message VolumeEcShardsCopyRequest {
string source_data_node = 5;
bool copy_ecj_file = 6;
bool copy_vif_file = 7;
uint32 disk_id = 8; // Target disk ID for storing EC shards
}
message VolumeEcShardsCopyResponse {
}
@@ -431,6 +457,19 @@ message VolumeEcShardsToVolumeRequest {
message VolumeEcShardsToVolumeResponse {
}
message VolumeEcShardsInfoRequest {
uint32 volume_id = 1;
}
message VolumeEcShardsInfoResponse {
repeated EcShardInfo ec_shard_infos = 1;
}
message EcShardInfo {
uint32 shard_id = 1;
int64 size = 2;
string collection = 3;
}
message ReadVolumeFileStatusRequest {
uint32 volume_id = 1;
}

File diff suppressed because it is too large Load Diff

View File

@@ -38,6 +38,7 @@ const (
VolumeServer_VolumeCopy_FullMethodName = "/volume_server_pb.VolumeServer/VolumeCopy"
VolumeServer_ReadVolumeFileStatus_FullMethodName = "/volume_server_pb.VolumeServer/ReadVolumeFileStatus"
VolumeServer_CopyFile_FullMethodName = "/volume_server_pb.VolumeServer/CopyFile"
VolumeServer_ReceiveFile_FullMethodName = "/volume_server_pb.VolumeServer/ReceiveFile"
VolumeServer_ReadNeedleBlob_FullMethodName = "/volume_server_pb.VolumeServer/ReadNeedleBlob"
VolumeServer_ReadNeedleMeta_FullMethodName = "/volume_server_pb.VolumeServer/ReadNeedleMeta"
VolumeServer_WriteNeedleBlob_FullMethodName = "/volume_server_pb.VolumeServer/WriteNeedleBlob"
@@ -53,6 +54,7 @@ const (
VolumeServer_VolumeEcShardRead_FullMethodName = "/volume_server_pb.VolumeServer/VolumeEcShardRead"
VolumeServer_VolumeEcBlobDelete_FullMethodName = "/volume_server_pb.VolumeServer/VolumeEcBlobDelete"
VolumeServer_VolumeEcShardsToVolume_FullMethodName = "/volume_server_pb.VolumeServer/VolumeEcShardsToVolume"
VolumeServer_VolumeEcShardsInfo_FullMethodName = "/volume_server_pb.VolumeServer/VolumeEcShardsInfo"
VolumeServer_VolumeTierMoveDatToRemote_FullMethodName = "/volume_server_pb.VolumeServer/VolumeTierMoveDatToRemote"
VolumeServer_VolumeTierMoveDatFromRemote_FullMethodName = "/volume_server_pb.VolumeServer/VolumeTierMoveDatFromRemote"
VolumeServer_VolumeServerStatus_FullMethodName = "/volume_server_pb.VolumeServer/VolumeServerStatus"
@@ -88,6 +90,7 @@ type VolumeServerClient interface {
VolumeCopy(ctx context.Context, in *VolumeCopyRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[VolumeCopyResponse], error)
ReadVolumeFileStatus(ctx context.Context, in *ReadVolumeFileStatusRequest, opts ...grpc.CallOption) (*ReadVolumeFileStatusResponse, error)
CopyFile(ctx context.Context, in *CopyFileRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[CopyFileResponse], error)
ReceiveFile(ctx context.Context, opts ...grpc.CallOption) (grpc.ClientStreamingClient[ReceiveFileRequest, ReceiveFileResponse], error)
ReadNeedleBlob(ctx context.Context, in *ReadNeedleBlobRequest, opts ...grpc.CallOption) (*ReadNeedleBlobResponse, error)
ReadNeedleMeta(ctx context.Context, in *ReadNeedleMetaRequest, opts ...grpc.CallOption) (*ReadNeedleMetaResponse, error)
WriteNeedleBlob(ctx context.Context, in *WriteNeedleBlobRequest, opts ...grpc.CallOption) (*WriteNeedleBlobResponse, error)
@@ -104,6 +107,7 @@ type VolumeServerClient interface {
VolumeEcShardRead(ctx context.Context, in *VolumeEcShardReadRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[VolumeEcShardReadResponse], error)
VolumeEcBlobDelete(ctx context.Context, in *VolumeEcBlobDeleteRequest, opts ...grpc.CallOption) (*VolumeEcBlobDeleteResponse, error)
VolumeEcShardsToVolume(ctx context.Context, in *VolumeEcShardsToVolumeRequest, opts ...grpc.CallOption) (*VolumeEcShardsToVolumeResponse, error)
VolumeEcShardsInfo(ctx context.Context, in *VolumeEcShardsInfoRequest, opts ...grpc.CallOption) (*VolumeEcShardsInfoResponse, error)
// tiered storage
VolumeTierMoveDatToRemote(ctx context.Context, in *VolumeTierMoveDatToRemoteRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[VolumeTierMoveDatToRemoteResponse], error)
VolumeTierMoveDatFromRemote(ctx context.Context, in *VolumeTierMoveDatFromRemoteRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[VolumeTierMoveDatFromRemoteResponse], error)
@@ -351,6 +355,19 @@ func (c *volumeServerClient) CopyFile(ctx context.Context, in *CopyFileRequest,
// This type alias is provided for backwards compatibility with existing code that references the prior non-generic stream type by name.
type VolumeServer_CopyFileClient = grpc.ServerStreamingClient[CopyFileResponse]
func (c *volumeServerClient) ReceiveFile(ctx context.Context, opts ...grpc.CallOption) (grpc.ClientStreamingClient[ReceiveFileRequest, ReceiveFileResponse], error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[4], VolumeServer_ReceiveFile_FullMethodName, cOpts...)
if err != nil {
return nil, err
}
x := &grpc.GenericClientStream[ReceiveFileRequest, ReceiveFileResponse]{ClientStream: stream}
return x, nil
}
// This type alias is provided for backwards compatibility with existing code that references the prior non-generic stream type by name.
type VolumeServer_ReceiveFileClient = grpc.ClientStreamingClient[ReceiveFileRequest, ReceiveFileResponse]
func (c *volumeServerClient) ReadNeedleBlob(ctx context.Context, in *ReadNeedleBlobRequest, opts ...grpc.CallOption) (*ReadNeedleBlobResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(ReadNeedleBlobResponse)
@@ -383,7 +400,7 @@ func (c *volumeServerClient) WriteNeedleBlob(ctx context.Context, in *WriteNeedl
func (c *volumeServerClient) ReadAllNeedles(ctx context.Context, in *ReadAllNeedlesRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[ReadAllNeedlesResponse], error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[4], VolumeServer_ReadAllNeedles_FullMethodName, cOpts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[5], VolumeServer_ReadAllNeedles_FullMethodName, cOpts...)
if err != nil {
return nil, err
}
@@ -402,7 +419,7 @@ type VolumeServer_ReadAllNeedlesClient = grpc.ServerStreamingClient[ReadAllNeedl
func (c *volumeServerClient) VolumeTailSender(ctx context.Context, in *VolumeTailSenderRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[VolumeTailSenderResponse], error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[5], VolumeServer_VolumeTailSender_FullMethodName, cOpts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[6], VolumeServer_VolumeTailSender_FullMethodName, cOpts...)
if err != nil {
return nil, err
}
@@ -491,7 +508,7 @@ func (c *volumeServerClient) VolumeEcShardsUnmount(ctx context.Context, in *Volu
func (c *volumeServerClient) VolumeEcShardRead(ctx context.Context, in *VolumeEcShardReadRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[VolumeEcShardReadResponse], error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[6], VolumeServer_VolumeEcShardRead_FullMethodName, cOpts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[7], VolumeServer_VolumeEcShardRead_FullMethodName, cOpts...)
if err != nil {
return nil, err
}
@@ -528,9 +545,19 @@ func (c *volumeServerClient) VolumeEcShardsToVolume(ctx context.Context, in *Vol
return out, nil
}
func (c *volumeServerClient) VolumeEcShardsInfo(ctx context.Context, in *VolumeEcShardsInfoRequest, opts ...grpc.CallOption) (*VolumeEcShardsInfoResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(VolumeEcShardsInfoResponse)
err := c.cc.Invoke(ctx, VolumeServer_VolumeEcShardsInfo_FullMethodName, in, out, cOpts...)
if err != nil {
return nil, err
}
return out, nil
}
func (c *volumeServerClient) VolumeTierMoveDatToRemote(ctx context.Context, in *VolumeTierMoveDatToRemoteRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[VolumeTierMoveDatToRemoteResponse], error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[7], VolumeServer_VolumeTierMoveDatToRemote_FullMethodName, cOpts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[8], VolumeServer_VolumeTierMoveDatToRemote_FullMethodName, cOpts...)
if err != nil {
return nil, err
}
@@ -549,7 +576,7 @@ type VolumeServer_VolumeTierMoveDatToRemoteClient = grpc.ServerStreamingClient[V
func (c *volumeServerClient) VolumeTierMoveDatFromRemote(ctx context.Context, in *VolumeTierMoveDatFromRemoteRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[VolumeTierMoveDatFromRemoteResponse], error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[8], VolumeServer_VolumeTierMoveDatFromRemote_FullMethodName, cOpts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[9], VolumeServer_VolumeTierMoveDatFromRemote_FullMethodName, cOpts...)
if err != nil {
return nil, err
}
@@ -598,7 +625,7 @@ func (c *volumeServerClient) FetchAndWriteNeedle(ctx context.Context, in *FetchA
func (c *volumeServerClient) Query(ctx context.Context, in *QueryRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[QueriedStripe], error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[9], VolumeServer_Query_FullMethodName, cOpts...)
stream, err := c.cc.NewStream(ctx, &VolumeServer_ServiceDesc.Streams[10], VolumeServer_Query_FullMethodName, cOpts...)
if err != nil {
return nil, err
}
@@ -660,6 +687,7 @@ type VolumeServerServer interface {
VolumeCopy(*VolumeCopyRequest, grpc.ServerStreamingServer[VolumeCopyResponse]) error
ReadVolumeFileStatus(context.Context, *ReadVolumeFileStatusRequest) (*ReadVolumeFileStatusResponse, error)
CopyFile(*CopyFileRequest, grpc.ServerStreamingServer[CopyFileResponse]) error
ReceiveFile(grpc.ClientStreamingServer[ReceiveFileRequest, ReceiveFileResponse]) error
ReadNeedleBlob(context.Context, *ReadNeedleBlobRequest) (*ReadNeedleBlobResponse, error)
ReadNeedleMeta(context.Context, *ReadNeedleMetaRequest) (*ReadNeedleMetaResponse, error)
WriteNeedleBlob(context.Context, *WriteNeedleBlobRequest) (*WriteNeedleBlobResponse, error)
@@ -676,6 +704,7 @@ type VolumeServerServer interface {
VolumeEcShardRead(*VolumeEcShardReadRequest, grpc.ServerStreamingServer[VolumeEcShardReadResponse]) error
VolumeEcBlobDelete(context.Context, *VolumeEcBlobDeleteRequest) (*VolumeEcBlobDeleteResponse, error)
VolumeEcShardsToVolume(context.Context, *VolumeEcShardsToVolumeRequest) (*VolumeEcShardsToVolumeResponse, error)
VolumeEcShardsInfo(context.Context, *VolumeEcShardsInfoRequest) (*VolumeEcShardsInfoResponse, error)
// tiered storage
VolumeTierMoveDatToRemote(*VolumeTierMoveDatToRemoteRequest, grpc.ServerStreamingServer[VolumeTierMoveDatToRemoteResponse]) error
VolumeTierMoveDatFromRemote(*VolumeTierMoveDatFromRemoteRequest, grpc.ServerStreamingServer[VolumeTierMoveDatFromRemoteResponse]) error
@@ -754,6 +783,9 @@ func (UnimplementedVolumeServerServer) ReadVolumeFileStatus(context.Context, *Re
func (UnimplementedVolumeServerServer) CopyFile(*CopyFileRequest, grpc.ServerStreamingServer[CopyFileResponse]) error {
return status.Errorf(codes.Unimplemented, "method CopyFile not implemented")
}
func (UnimplementedVolumeServerServer) ReceiveFile(grpc.ClientStreamingServer[ReceiveFileRequest, ReceiveFileResponse]) error {
return status.Errorf(codes.Unimplemented, "method ReceiveFile not implemented")
}
func (UnimplementedVolumeServerServer) ReadNeedleBlob(context.Context, *ReadNeedleBlobRequest) (*ReadNeedleBlobResponse, error) {
return nil, status.Errorf(codes.Unimplemented, "method ReadNeedleBlob not implemented")
}
@@ -799,6 +831,9 @@ func (UnimplementedVolumeServerServer) VolumeEcBlobDelete(context.Context, *Volu
func (UnimplementedVolumeServerServer) VolumeEcShardsToVolume(context.Context, *VolumeEcShardsToVolumeRequest) (*VolumeEcShardsToVolumeResponse, error) {
return nil, status.Errorf(codes.Unimplemented, "method VolumeEcShardsToVolume not implemented")
}
func (UnimplementedVolumeServerServer) VolumeEcShardsInfo(context.Context, *VolumeEcShardsInfoRequest) (*VolumeEcShardsInfoResponse, error) {
return nil, status.Errorf(codes.Unimplemented, "method VolumeEcShardsInfo not implemented")
}
func (UnimplementedVolumeServerServer) VolumeTierMoveDatToRemote(*VolumeTierMoveDatToRemoteRequest, grpc.ServerStreamingServer[VolumeTierMoveDatToRemoteResponse]) error {
return status.Errorf(codes.Unimplemented, "method VolumeTierMoveDatToRemote not implemented")
}
@@ -1158,6 +1193,13 @@ func _VolumeServer_CopyFile_Handler(srv interface{}, stream grpc.ServerStream) e
// This type alias is provided for backwards compatibility with existing code that references the prior non-generic stream type by name.
type VolumeServer_CopyFileServer = grpc.ServerStreamingServer[CopyFileResponse]
func _VolumeServer_ReceiveFile_Handler(srv interface{}, stream grpc.ServerStream) error {
return srv.(VolumeServerServer).ReceiveFile(&grpc.GenericServerStream[ReceiveFileRequest, ReceiveFileResponse]{ServerStream: stream})
}
// This type alias is provided for backwards compatibility with existing code that references the prior non-generic stream type by name.
type VolumeServer_ReceiveFileServer = grpc.ClientStreamingServer[ReceiveFileRequest, ReceiveFileResponse]
func _VolumeServer_ReadNeedleBlob_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(ReadNeedleBlobRequest)
if err := dec(in); err != nil {
@@ -1407,6 +1449,24 @@ func _VolumeServer_VolumeEcShardsToVolume_Handler(srv interface{}, ctx context.C
return interceptor(ctx, in, info, handler)
}
func _VolumeServer_VolumeEcShardsInfo_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(VolumeEcShardsInfoRequest)
if err := dec(in); err != nil {
return nil, err
}
if interceptor == nil {
return srv.(VolumeServerServer).VolumeEcShardsInfo(ctx, in)
}
info := &grpc.UnaryServerInfo{
Server: srv,
FullMethod: VolumeServer_VolumeEcShardsInfo_FullMethodName,
}
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
return srv.(VolumeServerServer).VolumeEcShardsInfo(ctx, req.(*VolumeEcShardsInfoRequest))
}
return interceptor(ctx, in, info, handler)
}
func _VolumeServer_VolumeTierMoveDatToRemote_Handler(srv interface{}, stream grpc.ServerStream) error {
m := new(VolumeTierMoveDatToRemoteRequest)
if err := stream.RecvMsg(m); err != nil {
@@ -1645,6 +1705,10 @@ var VolumeServer_ServiceDesc = grpc.ServiceDesc{
MethodName: "VolumeEcShardsToVolume",
Handler: _VolumeServer_VolumeEcShardsToVolume_Handler,
},
{
MethodName: "VolumeEcShardsInfo",
Handler: _VolumeServer_VolumeEcShardsInfo_Handler,
},
{
MethodName: "VolumeServerStatus",
Handler: _VolumeServer_VolumeServerStatus_Handler,
@@ -1687,6 +1751,11 @@ var VolumeServer_ServiceDesc = grpc.ServiceDesc{
Handler: _VolumeServer_CopyFile_Handler,
ServerStreams: true,
},
{
StreamName: "ReceiveFile",
Handler: _VolumeServer_ReceiveFile_Handler,
ClientStreams: true,
},
{
StreamName: "ReadAllNeedles",
Handler: _VolumeServer_ReadAllNeedles_Handler,

View File

@@ -22,6 +22,7 @@ message WorkerMessage {
TaskUpdate task_update = 6;
TaskComplete task_complete = 7;
WorkerShutdown shutdown = 8;
TaskLogResponse task_log_response = 9;
}
}
@@ -36,6 +37,7 @@ message AdminMessage {
TaskAssignment task_assignment = 5;
TaskCancellation task_cancellation = 6;
AdminShutdown admin_shutdown = 7;
TaskLogRequest task_log_request = 8;
}
}
@@ -90,7 +92,7 @@ message TaskAssignment {
map<string, string> metadata = 6;
}
// TaskParams contains task-specific parameters
// TaskParams contains task-specific parameters with typed variants
message TaskParams {
uint32 volume_id = 1;
string server = 2;
@@ -98,7 +100,75 @@ message TaskParams {
string data_center = 4;
string rack = 5;
repeated string replicas = 6;
map<string, string> parameters = 7;
// Typed task parameters
oneof task_params {
VacuumTaskParams vacuum_params = 7;
ErasureCodingTaskParams erasure_coding_params = 8;
BalanceTaskParams balance_params = 9;
ReplicationTaskParams replication_params = 10;
}
}
// VacuumTaskParams for vacuum operations
message VacuumTaskParams {
double garbage_threshold = 1; // Minimum garbage ratio to trigger vacuum
bool force_vacuum = 2; // Force vacuum even if below threshold
int32 batch_size = 3; // Number of files to process per batch
string working_dir = 4; // Working directory for temporary files
bool verify_checksum = 5; // Verify file checksums during vacuum
}
// ErasureCodingTaskParams for EC encoding operations
message ErasureCodingTaskParams {
uint64 estimated_shard_size = 3; // Estimated size per shard
int32 data_shards = 4; // Number of data shards (default: 10)
int32 parity_shards = 5; // Number of parity shards (default: 4)
string working_dir = 6; // Working directory for EC processing
string master_client = 7; // Master server address
bool cleanup_source = 8; // Whether to cleanup source volume after EC
repeated string placement_conflicts = 9; // Any placement rule conflicts
repeated ECDestination destinations = 10; // Planned destinations with disk information
repeated ExistingECShardLocation existing_shard_locations = 11; // Existing EC shards to cleanup
}
// ECDestination represents a planned destination for EC shards with disk information
message ECDestination {
string node = 1; // Target server address
uint32 disk_id = 2; // Target disk ID
string rack = 3; // Target rack for placement tracking
string data_center = 4; // Target data center for placement tracking
double placement_score = 5; // Quality score of the placement
}
// ExistingECShardLocation represents existing EC shards that need cleanup
message ExistingECShardLocation {
string node = 1; // Server address with existing shards
repeated uint32 shard_ids = 2; // List of shard IDs on this server
}
// BalanceTaskParams for volume balancing operations
message BalanceTaskParams {
string dest_node = 1; // Planned destination node
uint64 estimated_size = 2; // Estimated volume size
string dest_rack = 3; // Destination rack for placement rules
string dest_dc = 4; // Destination data center
double placement_score = 5; // Quality score of the planned placement
repeated string placement_conflicts = 6; // Any placement rule conflicts
bool force_move = 7; // Force move even with conflicts
int32 timeout_seconds = 8; // Operation timeout
}
// ReplicationTaskParams for adding replicas
message ReplicationTaskParams {
string dest_node = 1; // Planned destination node for new replica
uint64 estimated_size = 2; // Estimated replica size
string dest_rack = 3; // Destination rack for placement rules
string dest_dc = 4; // Destination data center
double placement_score = 5; // Quality score of the planned placement
repeated string placement_conflicts = 6; // Any placement rule conflicts
int32 replica_count = 7; // Target replica count
bool verify_consistency = 8; // Verify replica consistency after creation
}
// TaskUpdate reports task progress
@@ -139,4 +209,122 @@ message WorkerShutdown {
message AdminShutdown {
string reason = 1;
int32 graceful_shutdown_seconds = 2;
}
// ========== Task Log Messages ==========
// TaskLogRequest requests logs for a specific task
message TaskLogRequest {
string task_id = 1;
string worker_id = 2;
bool include_metadata = 3; // Include task metadata
int32 max_entries = 4; // Maximum number of log entries (0 = all)
string log_level = 5; // Filter by log level (INFO, WARNING, ERROR, DEBUG)
int64 start_time = 6; // Unix timestamp for start time filter
int64 end_time = 7; // Unix timestamp for end time filter
}
// TaskLogResponse returns task logs and metadata
message TaskLogResponse {
string task_id = 1;
string worker_id = 2;
bool success = 3;
string error_message = 4;
TaskLogMetadata metadata = 5;
repeated TaskLogEntry log_entries = 6;
}
// TaskLogMetadata contains metadata about task execution
message TaskLogMetadata {
string task_id = 1;
string task_type = 2;
string worker_id = 3;
int64 start_time = 4;
int64 end_time = 5;
int64 duration_ms = 6;
string status = 7;
float progress = 8;
uint32 volume_id = 9;
string server = 10;
string collection = 11;
string log_file_path = 12;
int64 created_at = 13;
map<string, string> custom_data = 14;
}
// TaskLogEntry represents a single log entry
message TaskLogEntry {
int64 timestamp = 1;
string level = 2;
string message = 3;
map<string, string> fields = 4;
float progress = 5;
string status = 6;
}
// ========== Maintenance Configuration Messages ==========
// MaintenanceConfig holds configuration for the maintenance system
message MaintenanceConfig {
bool enabled = 1;
int32 scan_interval_seconds = 2; // How often to scan for maintenance needs
int32 worker_timeout_seconds = 3; // Worker heartbeat timeout
int32 task_timeout_seconds = 4; // Individual task timeout
int32 retry_delay_seconds = 5; // Delay between retries
int32 max_retries = 6; // Default max retries for tasks
int32 cleanup_interval_seconds = 7; // How often to clean up old tasks
int32 task_retention_seconds = 8; // How long to keep completed/failed tasks
MaintenancePolicy policy = 9;
}
// MaintenancePolicy defines policies for maintenance operations
message MaintenancePolicy {
map<string, TaskPolicy> task_policies = 1; // Task type -> policy mapping
int32 global_max_concurrent = 2; // Overall limit across all task types
int32 default_repeat_interval_seconds = 3; // Default seconds if task doesn't specify
int32 default_check_interval_seconds = 4; // Default seconds for periodic checks
}
// TaskPolicy represents configuration for a specific task type
message TaskPolicy {
bool enabled = 1;
int32 max_concurrent = 2;
int32 repeat_interval_seconds = 3; // Seconds to wait before repeating
int32 check_interval_seconds = 4; // Seconds between checks
// Typed task-specific configuration (replaces generic map)
oneof task_config {
VacuumTaskConfig vacuum_config = 5;
ErasureCodingTaskConfig erasure_coding_config = 6;
BalanceTaskConfig balance_config = 7;
ReplicationTaskConfig replication_config = 8;
}
}
// Task-specific configuration messages
// VacuumTaskConfig contains vacuum-specific configuration
message VacuumTaskConfig {
double garbage_threshold = 1; // Minimum garbage ratio to trigger vacuum (0.0-1.0)
int32 min_volume_age_hours = 2; // Minimum age before vacuum is considered
int32 min_interval_seconds = 3; // Minimum time between vacuum operations on the same volume
}
// ErasureCodingTaskConfig contains EC-specific configuration
message ErasureCodingTaskConfig {
double fullness_ratio = 1; // Minimum fullness ratio to trigger EC (0.0-1.0)
int32 quiet_for_seconds = 2; // Minimum quiet time before EC
int32 min_volume_size_mb = 3; // Minimum volume size for EC
string collection_filter = 4; // Only process volumes from specific collections
}
// BalanceTaskConfig contains balance-specific configuration
message BalanceTaskConfig {
double imbalance_threshold = 1; // Threshold for triggering rebalancing (0.0-1.0)
int32 min_server_count = 2; // Minimum number of servers required for balancing
}
// ReplicationTaskConfig contains replication-specific configuration
message ReplicationTaskConfig {
int32 target_replica_count = 1; // Target number of replicas
}

File diff suppressed because it is too large Load Diff