Admin: misc improvements on admin server and workers. EC now works. (#7055)
* initial design * added simulation as tests * reorganized the codebase to move the simulation framework and tests into their own dedicated package * integration test. ec worker task * remove "enhanced" reference * start master, volume servers, filer Current Status ✅ Master: Healthy and running (port 9333) ✅ Filer: Healthy and running (port 8888) ✅ Volume Servers: All 6 servers running (ports 8080-8085) 🔄 Admin/Workers: Will start when dependencies are ready * generate write load * tasks are assigned * admin start wtih grpc port. worker has its own working directory * Update .gitignore * working worker and admin. Task detection is not working yet. * compiles, detection uses volumeSizeLimitMB from master * compiles * worker retries connecting to admin * build and restart * rendering pending tasks * skip task ID column * sticky worker id * test canScheduleTaskNow * worker reconnect to admin * clean up logs * worker register itself first * worker can run ec work and report status but: 1. one volume should not be repeatedly worked on. 2. ec shards needs to be distributed and source data should be deleted. * move ec task logic * listing ec shards * local copy, ec. Need to distribute. 
* ec is mostly working now * distribution of ec shards needs improvement * need configuration to enable ec * show ec volumes * interval field UI component * rename * integration test with vauuming * garbage percentage threshold * fix warning * display ec shard sizes * fix ec volumes list * Update ui.go * show default values * ensure correct default value * MaintenanceConfig use ConfigField * use schema defined defaults * config * reduce duplication * refactor to use BaseUIProvider * each task register its schema * checkECEncodingCandidate use ecDetector * use vacuumDetector * use volumeSizeLimitMB * remove remove * remove unused * refactor * use new framework * remove v2 reference * refactor * left menu can scroll now * The maintenance manager was not being initialized when no data directory was configured for persistent storage. * saving config * Update task_config_schema_templ.go * enable/disable tasks * protobuf encoded task configurations * fix system settings * use ui component * remove logs * interface{} Reduction * reduce interface{} * reduce interface{} * avoid from/to map * reduce interface{} * refactor * keep it DRY * added logging * debug messages * debug level * debug * show the log caller line * use configured task policy * log level * handle admin heartbeat response * Update worker.go * fix EC rack and dc count * Report task status to admin server * fix task logging, simplify interface checking, use erasure_coding constants * factor in empty volume server during task planning * volume.list adds disk id * track disk id also * fix locking scheduled and manual scanning * add active topology * simplify task detector * ec task completed, but shards are not showing up * implement ec in ec_typed.go * adjust log level * dedup * implementing ec copying shards and only ecx files * use disk id when distributing ec shards 🎯 Planning: ActiveTopology creates DestinationPlan with specific TargetDisk 📦 Task Creation: maintenance_integration.go creates ECDestination 
with DiskId 🚀 Task Execution: EC task passes DiskId in VolumeEcShardsCopyRequest 💾 Volume Server: Receives disk_id and stores shards on specific disk (vs.store.Locations[req.DiskId]) 📂 File System: EC shards and metadata land in the exact disk directory planned * Delete original volume from all locations * clean up existing shard locations * local encoding and distributing * Update docker/admin_integration/EC-TESTING-README.md Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> * check volume id range * simplify * fix tests * fix types * clean up logs and tests --------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -22,6 +22,7 @@ message WorkerMessage {
|
||||
TaskUpdate task_update = 6;
|
||||
TaskComplete task_complete = 7;
|
||||
WorkerShutdown shutdown = 8;
|
||||
TaskLogResponse task_log_response = 9;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,6 +37,7 @@ message AdminMessage {
|
||||
TaskAssignment task_assignment = 5;
|
||||
TaskCancellation task_cancellation = 6;
|
||||
AdminShutdown admin_shutdown = 7;
|
||||
TaskLogRequest task_log_request = 8;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,7 +92,7 @@ message TaskAssignment {
|
||||
map<string, string> metadata = 6;
|
||||
}
|
||||
|
||||
// TaskParams contains task-specific parameters
|
||||
// TaskParams contains task-specific parameters with typed variants
|
||||
message TaskParams {
|
||||
uint32 volume_id = 1;
|
||||
string server = 2;
|
||||
@@ -98,7 +100,75 @@ message TaskParams {
|
||||
string data_center = 4;
|
||||
string rack = 5;
|
||||
repeated string replicas = 6;
|
||||
map<string, string> parameters = 7;
|
||||
|
||||
// Typed task parameters
|
||||
oneof task_params {
|
||||
VacuumTaskParams vacuum_params = 7;
|
||||
ErasureCodingTaskParams erasure_coding_params = 8;
|
||||
BalanceTaskParams balance_params = 9;
|
||||
ReplicationTaskParams replication_params = 10;
|
||||
}
|
||||
}
|
||||
|
||||
// VacuumTaskParams describes a vacuum (garbage-reclamation) task.
message VacuumTaskParams {
  // Minimum garbage ratio that triggers a vacuum.
  double garbage_threshold = 1;
  // When true, vacuum even if the garbage ratio is below the threshold.
  bool force_vacuum = 2;
  // Number of files to process per batch.
  int32 batch_size = 3;
  // Directory for temporary files created during the vacuum.
  string working_dir = 4;
  // When true, verify file checksums during the vacuum.
  bool verify_checksum = 5;
}
|
||||
|
||||
// ErasureCodingTaskParams describes an erasure-coding (EC) encoding task.
message ErasureCodingTaskParams {
  // Tags 1-2 are unused in this revision of the message; reserve them so
  // they can never be reused later with a different meaning on the wire.
  reserved 1, 2;

  // Estimated size per shard (presumably bytes — confirm with producer).
  uint64 estimated_shard_size = 3;
  // Number of data shards (default: 10).
  int32 data_shards = 4;
  // Number of parity shards (default: 4).
  int32 parity_shards = 5;
  // Working directory used during EC processing.
  string working_dir = 6;
  // Master server address.
  string master_client = 7;
  // Whether to clean up the source volume after successful encoding.
  bool cleanup_source = 8;
  // Placement-rule conflicts detected during planning, if any.
  repeated string placement_conflicts = 9;
  // Planned shard destinations, including target disk information.
  repeated ECDestination destinations = 10;
  // Existing EC shards that need to be cleaned up.
  repeated ExistingECShardLocation existing_shard_locations = 11;
}
|
||||
|
||||
// ECDestination is one planned placement target for EC shards,
// including the specific disk the shards should land on.
message ECDestination {
  // Target server address.
  string node = 1;
  // Target disk ID on that server.
  uint32 disk_id = 2;
  // Rack of the target, recorded for placement tracking.
  string rack = 3;
  // Data center of the target, recorded for placement tracking.
  string data_center = 4;
  // Quality score of this placement choice.
  double placement_score = 5;
}
|
||||
|
||||
// ExistingECShardLocation lists EC shards already present on a server
// that need to be cleaned up.
message ExistingECShardLocation {
  // Server address that currently holds the shards.
  string node = 1;
  // IDs of the shards found on that server.
  repeated uint32 shard_ids = 2;
}
|
||||
|
||||
// BalanceTaskParams describes a volume-balancing (move) task.
message BalanceTaskParams {
  // Planned destination node for the volume.
  string dest_node = 1;
  // Estimated volume size (presumably bytes — confirm with producer).
  uint64 estimated_size = 2;
  // Destination rack, used for placement rules.
  string dest_rack = 3;
  // Destination data center.
  string dest_dc = 4;
  // Quality score of the planned placement.
  double placement_score = 5;
  // Placement-rule conflicts detected during planning, if any.
  repeated string placement_conflicts = 6;
  // When true, move the volume even if conflicts exist.
  bool force_move = 7;
  // Operation timeout in seconds.
  int32 timeout_seconds = 8;
}
|
||||
|
||||
// ReplicationTaskParams describes a task that adds volume replicas.
message ReplicationTaskParams {
  // Planned destination node for the new replica.
  string dest_node = 1;
  // Estimated replica size (presumably bytes — confirm with producer).
  uint64 estimated_size = 2;
  // Destination rack, used for placement rules.
  string dest_rack = 3;
  // Destination data center.
  string dest_dc = 4;
  // Quality score of the planned placement.
  double placement_score = 5;
  // Placement-rule conflicts detected during planning, if any.
  repeated string placement_conflicts = 6;
  // Target number of replicas.
  int32 replica_count = 7;
  // When true, verify replica consistency after creation.
  bool verify_consistency = 8;
}
|
||||
|
||||
// TaskUpdate reports task progress
|
||||
@@ -139,4 +209,122 @@ message WorkerShutdown {
|
||||
// AdminShutdown notifies workers that the admin server is shutting down.
message AdminShutdown {
  // Human-readable reason for the shutdown.
  string reason = 1;
  // Seconds workers are given to wind down gracefully.
  int32 graceful_shutdown_seconds = 2;
}
|
||||
|
||||
// ========== Task Log Messages ==========
|
||||
|
||||
// TaskLogRequest asks for the logs of a specific task.
message TaskLogRequest {
  // ID of the task whose logs are requested.
  string task_id = 1;
  // Worker that executed the task.
  string worker_id = 2;
  // When true, include task metadata in the response.
  bool include_metadata = 3;
  // Maximum number of log entries to return (0 = all).
  int32 max_entries = 4;
  // Optional level filter: INFO, WARNING, ERROR, or DEBUG.
  string log_level = 5;
  // Unix timestamp for the start-time filter.
  int64 start_time = 6;
  // Unix timestamp for the end-time filter.
  int64 end_time = 7;
}
|
||||
|
||||
// TaskLogResponse carries task logs and optional metadata.
message TaskLogResponse {
  // ID of the task the logs belong to.
  string task_id = 1;
  // Worker that produced the logs.
  string worker_id = 2;
  // True when log retrieval succeeded.
  bool success = 3;
  // Error description when success is false.
  string error_message = 4;
  // Execution metadata; presumably populated when
  // TaskLogRequest.include_metadata is set — confirm with server code.
  TaskLogMetadata metadata = 5;
  // The log entries themselves.
  repeated TaskLogEntry log_entries = 6;
}
|
||||
|
||||
// TaskLogMetadata summarizes one task execution for log reporting.
message TaskLogMetadata {
  // ID of the task this metadata describes.
  string task_id = 1;
  // Type of the task (e.g. vacuum, erasure coding).
  string task_type = 2;
  // Worker that ran the task.
  string worker_id = 3;
  // Task start time (Unix timestamp; units assumed seconds — confirm).
  int64 start_time = 4;
  // Task end time (Unix timestamp; units assumed seconds — confirm).
  int64 end_time = 5;
  // Total execution time in milliseconds.
  int64 duration_ms = 6;
  // Final task status.
  string status = 7;
  // Progress reported by the task (scale unspecified here — confirm).
  float progress = 8;
  // Volume the task operated on.
  uint32 volume_id = 9;
  // Volume server the task ran against.
  string server = 10;
  // Collection the volume belongs to.
  string collection = 11;
  // Path of the on-disk log file for this task.
  string log_file_path = 12;
  // Creation time of the log record (Unix timestamp — confirm units).
  int64 created_at = 13;
  // Arbitrary task-specific key/value data.
  map<string, string> custom_data = 14;
}
|
||||
|
||||
// TaskLogEntry is a single log record emitted by a task.
message TaskLogEntry {
  // Time the entry was written (Unix timestamp; units assumed — confirm).
  int64 timestamp = 1;
  // Log level, e.g. INFO, WARNING, ERROR, DEBUG.
  string level = 2;
  // Log message text.
  string message = 3;
  // Structured key/value fields attached to the entry.
  map<string, string> fields = 4;
  // Task progress at the time of the entry (scale unspecified — confirm).
  float progress = 5;
  // Task status at the time of the entry.
  string status = 6;
}
|
||||
|
||||
// ========== Maintenance Configuration Messages ==========
|
||||
|
||||
// MaintenanceConfig holds top-level settings for the maintenance system.
message MaintenanceConfig {
  // Master switch for the maintenance subsystem.
  bool enabled = 1;
  // Seconds between scans for maintenance candidates.
  int32 scan_interval_seconds = 2;
  // Seconds without a heartbeat before a worker is considered gone.
  int32 worker_timeout_seconds = 3;
  // Seconds an individual task may run before timing out.
  int32 task_timeout_seconds = 4;
  // Seconds to wait between retries of a failed task.
  int32 retry_delay_seconds = 5;
  // Default maximum retry count for tasks.
  int32 max_retries = 6;
  // Seconds between cleanup passes over old task records.
  int32 cleanup_interval_seconds = 7;
  // Seconds to retain completed/failed tasks before removal.
  int32 task_retention_seconds = 8;
  // Policies governing individual task types and global limits.
  MaintenancePolicy policy = 9;
}
|
||||
|
||||
// MaintenancePolicy groups per-task-type policies with global limits.
message MaintenancePolicy {
  // Policy for each task type, keyed by task type name.
  map<string, TaskPolicy> task_policies = 1;
  // Upper bound on concurrent tasks across all task types.
  int32 global_max_concurrent = 2;
  // Fallback repeat interval, in seconds, when a task does not specify one.
  int32 default_repeat_interval_seconds = 3;
  // Fallback interval, in seconds, for periodic checks.
  int32 default_check_interval_seconds = 4;
}
|
||||
|
||||
// TaskPolicy configures scheduling behavior for one task type.
message TaskPolicy {
  // Whether tasks of this type may run at all.
  bool enabled = 1;
  // Maximum number of tasks of this type running concurrently.
  int32 max_concurrent = 2;
  // Seconds to wait before the same task may repeat.
  int32 repeat_interval_seconds = 3;
  // Seconds between checks.
  int32 check_interval_seconds = 4;

  // Typed, task-specific configuration (replaces a generic map).
  oneof task_config {
    VacuumTaskConfig vacuum_config = 5;
    ErasureCodingTaskConfig erasure_coding_config = 6;
    BalanceTaskConfig balance_config = 7;
    ReplicationTaskConfig replication_config = 8;
  }
}
|
||||
|
||||
// Task-specific configuration messages
|
||||
|
||||
// VacuumTaskConfig holds vacuum-specific policy settings.
message VacuumTaskConfig {
  // Minimum garbage ratio (0.0-1.0) that makes a volume a vacuum candidate.
  double garbage_threshold = 1;
  // Minimum volume age, in hours, before a vacuum is considered.
  int32 min_volume_age_hours = 2;
  // Minimum seconds between vacuum runs on the same volume.
  int32 min_interval_seconds = 3;
}
|
||||
|
||||
// ErasureCodingTaskConfig holds EC-specific policy settings.
message ErasureCodingTaskConfig {
  // Minimum fullness ratio (0.0-1.0) before a volume is EC-encoded.
  double fullness_ratio = 1;
  // Minimum quiet time, in seconds, before EC is applied.
  int32 quiet_for_seconds = 2;
  // Minimum volume size, in megabytes, to qualify for EC.
  int32 min_volume_size_mb = 3;
  // Restrict processing to volumes from specific collections.
  string collection_filter = 4;
}
|
||||
|
||||
// BalanceTaskConfig holds balance-specific policy settings.
message BalanceTaskConfig {
  // Imbalance ratio (0.0-1.0) that triggers rebalancing.
  double imbalance_threshold = 1;
  // Minimum number of servers required before balancing runs.
  int32 min_server_count = 2;
}
|
||||
|
||||
// ReplicationTaskConfig holds replication-specific policy settings.
message ReplicationTaskConfig {
  // Desired number of replicas per volume.
  int32 target_replica_count = 1;
}
|
||||
Reference in New Issue
Block a user