Files
seaweedFS/weed/admin/dash/config_persistence.go
Chris Lu 13dcf445a4 Fix maintenance worker panic and add EC integration tests (#8068)
* Fix nil pointer panic in maintenance worker when receiving empty task assignment

When a worker requests a task and none are available, the admin server
sends an empty TaskAssignment message. The worker was attempting to log
the task details without checking if the TaskId was empty, causing a
nil pointer dereference when accessing taskAssign.Params.VolumeId.

This fix adds a check for empty TaskId before processing the assignment,
preventing worker crashes and improving stability in production environments.

* Add EC integration test for admin-worker maintenance system

Adds comprehensive integration test that verifies the end-to-end flow
of erasure coding maintenance tasks:
- Admin server detects volumes needing EC encoding
- Workers register and receive task assignments
- EC encoding is executed and verified in master topology
- File read-back validation confirms data integrity

The test uses unique absolute working directories for each worker to
prevent ID conflicts and ensure stable worker registration. Includes
proper cleanup and process management for reliable test execution.

* Improve maintenance system stability and task deduplication

- Add cross-type task deduplication to prevent concurrent maintenance
  operations on the same volume (EC, balance, vacuum)
- Implement HasAnyTask check in ActiveTopology for better coordination
- Increase RequestTask timeout from 5s to 30s to prevent unnecessary
  worker reconnections
- Add TaskTypeNone sentinel for generic task checks
- Update all task detectors to use HasAnyTask for conflict prevention
- Improve config persistence and schema handling

* Add GitHub Actions workflow for EC integration tests

Adds CI workflow that runs EC integration tests on push and pull requests
to master branch. The workflow:
- Triggers on changes to admin, worker, or test files
- Builds the weed binary
- Runs the EC integration test suite
- Uploads test logs as artifacts on failure for debugging

This ensures the maintenance system remains stable and worker-admin
integration is validated in CI.

* go version 1.24

* address comments

* Update maintenance_integration.go

* support seconds

* ec prioritize over balancing in tests
2026-01-20 15:07:43 -08:00

1256 lines
41 KiB
Go

package dash
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/seaweedfs/seaweedfs/weed/admin/maintenance"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/worker_pb"
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/balance"
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/worker/tasks/vacuum"
"google.golang.org/protobuf/encoding/protojson"
"google.golang.org/protobuf/proto"
)
const (
// Configuration subdirectory
ConfigSubdir = "conf"
// Configuration file names (protobuf binary)
MaintenanceConfigFile = "maintenance.pb"
VacuumTaskConfigFile = "task_vacuum.pb"
ECTaskConfigFile = "task_erasure_coding.pb"
BalanceTaskConfigFile = "task_balance.pb"
ReplicationTaskConfigFile = "task_replication.pb"
// JSON reference files
MaintenanceConfigJSONFile = "maintenance.json"
VacuumTaskConfigJSONFile = "task_vacuum.json"
ECTaskConfigJSONFile = "task_erasure_coding.json"
BalanceTaskConfigJSONFile = "task_balance.json"
ReplicationTaskConfigJSONFile = "task_replication.json"
// Task persistence subdirectories and settings
TasksSubdir = "tasks"
TaskDetailsSubdir = "task_details"
TaskLogsSubdir = "task_logs"
MaxCompletedTasks = 10 // Only keep last 10 completed tasks
ConfigDirPermissions = 0755
ConfigFilePermissions = 0644
)
// Task configuration types
type (
VacuumTaskConfig = worker_pb.VacuumTaskConfig
ErasureCodingTaskConfig = worker_pb.ErasureCodingTaskConfig
BalanceTaskConfig = worker_pb.BalanceTaskConfig
ReplicationTaskConfig = worker_pb.ReplicationTaskConfig
)
// isValidTaskID validates that a task ID is safe for use in file paths
// This prevents path traversal attacks by ensuring the task ID doesn't contain
// path separators or parent directory references
func isValidTaskID(taskID string) bool {
if taskID == "" {
return false
}
// Reject task IDs with leading or trailing whitespace
if strings.TrimSpace(taskID) != taskID {
return false
}
// Check for path traversal patterns
if strings.Contains(taskID, "/") ||
strings.Contains(taskID, "\\") ||
strings.Contains(taskID, "..") ||
strings.Contains(taskID, ":") {
return false
}
// Additional safety: ensure it's not just dots or empty after trim
if taskID == "." || taskID == ".." {
return false
}
return true
}
// ConfigPersistence handles saving and loading configuration files
type ConfigPersistence struct {
dataDir string
}
// NewConfigPersistence creates a new configuration persistence manager
func NewConfigPersistence(dataDir string) *ConfigPersistence {
return &ConfigPersistence{
dataDir: dataDir,
}
}
// SaveMaintenanceConfig saves maintenance configuration to protobuf file and JSON reference
func (cp *ConfigPersistence) SaveMaintenanceConfig(config *MaintenanceConfig) error {
if cp.dataDir == "" {
return fmt.Errorf("no data directory specified, cannot save configuration")
}
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
if err := os.MkdirAll(confDir, ConfigDirPermissions); err != nil {
return fmt.Errorf("failed to create config directory: %w", err)
}
// Save as protobuf (primary format)
pbConfigPath := filepath.Join(confDir, MaintenanceConfigFile)
pbData, err := proto.Marshal(config)
if err != nil {
return fmt.Errorf("failed to marshal maintenance config to protobuf: %w", err)
}
if err := os.WriteFile(pbConfigPath, pbData, ConfigFilePermissions); err != nil {
return fmt.Errorf("failed to write protobuf config file: %w", err)
}
// Save JSON reference copy for debugging
jsonConfigPath := filepath.Join(confDir, MaintenanceConfigJSONFile)
jsonData, err := protojson.MarshalOptions{
Multiline: true,
Indent: " ",
EmitUnpopulated: true,
}.Marshal(config)
if err != nil {
return fmt.Errorf("failed to marshal maintenance config to JSON: %w", err)
}
if err := os.WriteFile(jsonConfigPath, jsonData, ConfigFilePermissions); err != nil {
return fmt.Errorf("failed to write JSON reference file: %w", err)
}
return nil
}
// LoadMaintenanceConfig loads maintenance configuration from protobuf file
func (cp *ConfigPersistence) LoadMaintenanceConfig() (*MaintenanceConfig, error) {
if cp.dataDir == "" {
return DefaultMaintenanceConfig(), nil
}
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
configPath := filepath.Join(confDir, MaintenanceConfigFile)
// Try to load from protobuf file
if configData, err := os.ReadFile(configPath); err == nil {
var config MaintenanceConfig
if err := proto.Unmarshal(configData, &config); err == nil {
// Always populate policy from separate task configuration files
config.Policy = buildPolicyFromTaskConfigs()
return &config, nil
}
}
// File doesn't exist or failed to load, use defaults
return DefaultMaintenanceConfig(), nil
}
// GetConfigPath returns the path to a configuration file
func (cp *ConfigPersistence) GetConfigPath(filename string) string {
if cp.dataDir == "" {
return ""
}
// All configs go in conf subdirectory
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
return filepath.Join(confDir, filename)
}
// ListConfigFiles returns all configuration files in the conf subdirectory
func (cp *ConfigPersistence) ListConfigFiles() ([]string, error) {
if cp.dataDir == "" {
return nil, fmt.Errorf("no data directory specified")
}
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
files, err := os.ReadDir(confDir)
if err != nil {
// If conf directory doesn't exist, return empty list
if os.IsNotExist(err) {
return []string{}, nil
}
return nil, fmt.Errorf("failed to read config directory: %w", err)
}
var configFiles []string
for _, file := range files {
if !file.IsDir() {
ext := filepath.Ext(file.Name())
if ext == ".json" || ext == ".pb" {
configFiles = append(configFiles, file.Name())
}
}
}
return configFiles, nil
}
// BackupConfig creates a backup of a configuration file
func (cp *ConfigPersistence) BackupConfig(filename string) error {
if cp.dataDir == "" {
return fmt.Errorf("no data directory specified")
}
configPath := cp.GetConfigPath(filename)
if _, err := os.Stat(configPath); os.IsNotExist(err) {
return fmt.Errorf("config file does not exist: %s", filename)
}
// Create backup filename with timestamp
timestamp := time.Now().Format("2006-01-02_15-04-05")
backupName := fmt.Sprintf("%s.backup_%s", filename, timestamp)
// Determine backup directory (conf subdirectory)
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
backupPath := filepath.Join(confDir, backupName)
// Copy file
configData, err := os.ReadFile(configPath)
if err != nil {
return fmt.Errorf("failed to read config file: %w", err)
}
if err := os.WriteFile(backupPath, configData, ConfigFilePermissions); err != nil {
return fmt.Errorf("failed to create backup: %w", err)
}
glog.V(1).Infof("Created backup of %s as %s", filename, backupName)
return nil
}
// RestoreConfig restores a configuration file from a backup
func (cp *ConfigPersistence) RestoreConfig(filename, backupName string) error {
if cp.dataDir == "" {
return fmt.Errorf("no data directory specified")
}
// Determine backup path (conf subdirectory)
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
backupPath := filepath.Join(confDir, backupName)
if _, err := os.Stat(backupPath); os.IsNotExist(err) {
return fmt.Errorf("backup file does not exist: %s", backupName)
}
// Read backup file
backupData, err := os.ReadFile(backupPath)
if err != nil {
return fmt.Errorf("failed to read backup file: %w", err)
}
// Write to config file
configPath := cp.GetConfigPath(filename)
if err := os.WriteFile(configPath, backupData, ConfigFilePermissions); err != nil {
return fmt.Errorf("failed to restore config: %w", err)
}
glog.V(1).Infof("Restored %s from backup %s", filename, backupName)
return nil
}
// SaveVacuumTaskConfig saves vacuum task configuration to protobuf file
func (cp *ConfigPersistence) SaveVacuumTaskConfig(config *VacuumTaskConfig) error {
return cp.saveTaskConfig(VacuumTaskConfigFile, config)
}
// SaveVacuumTaskPolicy saves complete vacuum task policy to protobuf file
func (cp *ConfigPersistence) SaveVacuumTaskPolicy(policy *worker_pb.TaskPolicy) error {
return cp.saveTaskConfig(VacuumTaskConfigFile, policy)
}
// LoadVacuumTaskConfig loads vacuum task configuration from protobuf file
func (cp *ConfigPersistence) LoadVacuumTaskConfig() (*VacuumTaskConfig, error) {
// Load as TaskPolicy and extract vacuum config
if taskPolicy, err := cp.LoadVacuumTaskPolicy(); err == nil && taskPolicy != nil {
if vacuumConfig := taskPolicy.GetVacuumConfig(); vacuumConfig != nil {
return vacuumConfig, nil
}
}
// Return default config if no valid config found
return &VacuumTaskConfig{
GarbageThreshold: 0.3,
MinVolumeAgeHours: 24,
MinIntervalSeconds: 7 * 24 * 60 * 60, // 7 days
}, nil
}
// LoadVacuumTaskPolicy loads complete vacuum task policy from protobuf file
func (cp *ConfigPersistence) LoadVacuumTaskPolicy() (*worker_pb.TaskPolicy, error) {
if cp.dataDir == "" {
// Return default policy if no data directory
return &worker_pb.TaskPolicy{
Enabled: true,
MaxConcurrent: 2,
RepeatIntervalSeconds: 24 * 3600, // 24 hours in seconds
CheckIntervalSeconds: 6 * 3600, // 6 hours in seconds
TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{
VacuumConfig: &worker_pb.VacuumTaskConfig{
GarbageThreshold: 0.3,
MinVolumeAgeHours: 24,
MinIntervalSeconds: 7 * 24 * 60 * 60, // 7 days
},
},
}, nil
}
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
configPath := filepath.Join(confDir, VacuumTaskConfigFile)
// Check if file exists
if _, err := os.Stat(configPath); os.IsNotExist(err) {
// Return default policy if file doesn't exist
return &worker_pb.TaskPolicy{
Enabled: true,
MaxConcurrent: 2,
RepeatIntervalSeconds: 24 * 3600, // 24 hours in seconds
CheckIntervalSeconds: 6 * 3600, // 6 hours in seconds
TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{
VacuumConfig: &worker_pb.VacuumTaskConfig{
GarbageThreshold: 0.3,
MinVolumeAgeHours: 24,
MinIntervalSeconds: 7 * 24 * 60 * 60, // 7 days
},
},
}, nil
}
// Read file
configData, err := os.ReadFile(configPath)
if err != nil {
return nil, fmt.Errorf("failed to read vacuum task config file: %w", err)
}
// Try to unmarshal as TaskPolicy
var policy worker_pb.TaskPolicy
if err := proto.Unmarshal(configData, &policy); err == nil {
// Validate that it's actually a TaskPolicy with vacuum config
if policy.GetVacuumConfig() != nil {
glog.V(1).Infof("Loaded vacuum task policy from %s", configPath)
return &policy, nil
}
}
return nil, fmt.Errorf("failed to unmarshal vacuum task configuration")
}
// SaveErasureCodingTaskConfig saves EC task configuration to protobuf file
func (cp *ConfigPersistence) SaveErasureCodingTaskConfig(config *ErasureCodingTaskConfig) error {
return cp.saveTaskConfig(ECTaskConfigFile, config)
}
// SaveErasureCodingTaskPolicy saves complete EC task policy to protobuf file
func (cp *ConfigPersistence) SaveErasureCodingTaskPolicy(policy *worker_pb.TaskPolicy) error {
return cp.saveTaskConfig(ECTaskConfigFile, policy)
}
// LoadErasureCodingTaskConfig loads EC task configuration from protobuf file
func (cp *ConfigPersistence) LoadErasureCodingTaskConfig() (*ErasureCodingTaskConfig, error) {
// Load as TaskPolicy and extract EC config
if taskPolicy, err := cp.LoadErasureCodingTaskPolicy(); err == nil && taskPolicy != nil {
if ecConfig := taskPolicy.GetErasureCodingConfig(); ecConfig != nil {
return ecConfig, nil
}
}
// Return default config if no valid config found
return &ErasureCodingTaskConfig{
FullnessRatio: 0.9,
QuietForSeconds: 3600,
MinVolumeSizeMb: 1024,
CollectionFilter: "",
}, nil
}
// LoadErasureCodingTaskPolicy loads complete EC task policy from protobuf file
func (cp *ConfigPersistence) LoadErasureCodingTaskPolicy() (*worker_pb.TaskPolicy, error) {
if cp.dataDir == "" {
// Return default policy if no data directory
return &worker_pb.TaskPolicy{
Enabled: true,
MaxConcurrent: 1,
RepeatIntervalSeconds: 168 * 3600, // 1 week in seconds
CheckIntervalSeconds: 24 * 3600, // 24 hours in seconds
TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{
ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{
FullnessRatio: 0.9,
QuietForSeconds: 3600,
MinVolumeSizeMb: 1024,
CollectionFilter: "",
},
},
}, nil
}
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
configPath := filepath.Join(confDir, ECTaskConfigFile)
// Check if file exists
if _, err := os.Stat(configPath); os.IsNotExist(err) {
// Return default policy if file doesn't exist
return &worker_pb.TaskPolicy{
Enabled: true,
MaxConcurrent: 1,
RepeatIntervalSeconds: 168 * 3600, // 1 week in seconds
CheckIntervalSeconds: 24 * 3600, // 24 hours in seconds
TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{
ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{
FullnessRatio: 0.9,
QuietForSeconds: 3600,
MinVolumeSizeMb: 1024,
CollectionFilter: "",
},
},
}, nil
}
// Read file
configData, err := os.ReadFile(configPath)
if err != nil {
return nil, fmt.Errorf("failed to read EC task config file: %w", err)
}
// Try to unmarshal as TaskPolicy
var policy worker_pb.TaskPolicy
if err := proto.Unmarshal(configData, &policy); err == nil {
// Validate that it's actually a TaskPolicy with EC config
if policy.GetErasureCodingConfig() != nil {
glog.V(1).Infof("Loaded EC task policy from %s", configPath)
return &policy, nil
}
}
return nil, fmt.Errorf("failed to unmarshal EC task configuration")
}
// SaveBalanceTaskConfig saves balance task configuration to protobuf file
func (cp *ConfigPersistence) SaveBalanceTaskConfig(config *BalanceTaskConfig) error {
return cp.saveTaskConfig(BalanceTaskConfigFile, config)
}
// SaveBalanceTaskPolicy saves complete balance task policy to protobuf file
func (cp *ConfigPersistence) SaveBalanceTaskPolicy(policy *worker_pb.TaskPolicy) error {
return cp.saveTaskConfig(BalanceTaskConfigFile, policy)
}
// LoadBalanceTaskConfig loads balance task configuration from protobuf file
func (cp *ConfigPersistence) LoadBalanceTaskConfig() (*BalanceTaskConfig, error) {
// Load as TaskPolicy and extract balance config
if taskPolicy, err := cp.LoadBalanceTaskPolicy(); err == nil && taskPolicy != nil {
if balanceConfig := taskPolicy.GetBalanceConfig(); balanceConfig != nil {
return balanceConfig, nil
}
}
// Return default config if no valid config found
return &BalanceTaskConfig{
ImbalanceThreshold: 0.1,
MinServerCount: 2,
}, nil
}
// LoadBalanceTaskPolicy loads complete balance task policy from protobuf file
func (cp *ConfigPersistence) LoadBalanceTaskPolicy() (*worker_pb.TaskPolicy, error) {
if cp.dataDir == "" {
// Return default policy if no data directory
return &worker_pb.TaskPolicy{
Enabled: true,
MaxConcurrent: 1,
RepeatIntervalSeconds: 6 * 3600, // 6 hours in seconds
CheckIntervalSeconds: 12 * 3600, // 12 hours in seconds
TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{
BalanceConfig: &worker_pb.BalanceTaskConfig{
ImbalanceThreshold: 0.1,
MinServerCount: 2,
},
},
}, nil
}
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
configPath := filepath.Join(confDir, BalanceTaskConfigFile)
// Check if file exists
if _, err := os.Stat(configPath); os.IsNotExist(err) {
// Return default policy if file doesn't exist
return &worker_pb.TaskPolicy{
Enabled: true,
MaxConcurrent: 1,
RepeatIntervalSeconds: 6 * 3600, // 6 hours in seconds
CheckIntervalSeconds: 12 * 3600, // 12 hours in seconds
TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{
BalanceConfig: &worker_pb.BalanceTaskConfig{
ImbalanceThreshold: 0.1,
MinServerCount: 2,
},
},
}, nil
}
// Read file
configData, err := os.ReadFile(configPath)
if err != nil {
return nil, fmt.Errorf("failed to read balance task config file: %w", err)
}
// Try to unmarshal as TaskPolicy
var policy worker_pb.TaskPolicy
if err := proto.Unmarshal(configData, &policy); err == nil {
// Validate that it's actually a TaskPolicy with balance config
if policy.GetBalanceConfig() != nil {
glog.V(1).Infof("Loaded balance task policy from %s", configPath)
return &policy, nil
}
}
return nil, fmt.Errorf("failed to unmarshal balance task configuration")
}
// SaveReplicationTaskConfig saves replication task configuration to protobuf file
func (cp *ConfigPersistence) SaveReplicationTaskConfig(config *ReplicationTaskConfig) error {
return cp.saveTaskConfig(ReplicationTaskConfigFile, config)
}
// LoadReplicationTaskConfig loads replication task configuration from protobuf file
func (cp *ConfigPersistence) LoadReplicationTaskConfig() (*ReplicationTaskConfig, error) {
var config ReplicationTaskConfig
err := cp.loadTaskConfig(ReplicationTaskConfigFile, &config)
if err != nil {
// Return default config if file doesn't exist
if os.IsNotExist(err) {
return &ReplicationTaskConfig{
TargetReplicaCount: 1,
}, nil
}
return nil, err
}
return &config, nil
}
// saveTaskConfig is a generic helper for saving task configurations with both protobuf and JSON reference
func (cp *ConfigPersistence) saveTaskConfig(filename string, config proto.Message) error {
if cp.dataDir == "" {
return fmt.Errorf("no data directory specified, cannot save task configuration")
}
// Create conf subdirectory path
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
configPath := filepath.Join(confDir, filename)
// Generate JSON reference filename
jsonFilename := filename[:len(filename)-3] + ".json" // Replace .pb with .json
jsonPath := filepath.Join(confDir, jsonFilename)
// Create conf directory if it doesn't exist
if err := os.MkdirAll(confDir, ConfigDirPermissions); err != nil {
return fmt.Errorf("failed to create config directory: %w", err)
}
// Marshal configuration to protobuf binary format
configData, err := proto.Marshal(config)
if err != nil {
return fmt.Errorf("failed to marshal task config: %w", err)
}
// Write protobuf file
if err := os.WriteFile(configPath, configData, ConfigFilePermissions); err != nil {
return fmt.Errorf("failed to write task config file: %w", err)
}
// Marshal configuration to JSON for reference
marshaler := protojson.MarshalOptions{
Multiline: true,
Indent: " ",
EmitUnpopulated: true,
}
jsonData, err := marshaler.Marshal(config)
if err != nil {
glog.Warningf("Failed to marshal task config to JSON reference: %v", err)
} else {
// Write JSON reference file
if err := os.WriteFile(jsonPath, jsonData, ConfigFilePermissions); err != nil {
glog.Warningf("Failed to write task config JSON reference: %v", err)
}
}
glog.V(1).Infof("Saved task configuration to %s (with JSON reference)", configPath)
return nil
}
// loadTaskConfig is a generic helper for loading task configurations from conf subdirectory
func (cp *ConfigPersistence) loadTaskConfig(filename string, config proto.Message) error {
if cp.dataDir == "" {
return os.ErrNotExist // Will trigger default config return
}
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
configPath := filepath.Join(confDir, filename)
// Check if file exists
if _, err := os.Stat(configPath); os.IsNotExist(err) {
return err // Will trigger default config return
}
// Read file
configData, err := os.ReadFile(configPath)
if err != nil {
return fmt.Errorf("failed to read task config file: %w", err)
}
// Unmarshal protobuf binary data
if err := proto.Unmarshal(configData, config); err != nil {
return fmt.Errorf("failed to unmarshal task config: %w", err)
}
glog.V(1).Infof("Loaded task configuration from %s", configPath)
return nil
}
// SaveTaskPolicy generic dispatcher for task persistence
func (cp *ConfigPersistence) SaveTaskPolicy(taskType string, policy *worker_pb.TaskPolicy) error {
switch taskType {
case "vacuum":
return cp.SaveVacuumTaskPolicy(policy)
case "erasure_coding":
return cp.SaveErasureCodingTaskPolicy(policy)
case "balance":
return cp.SaveBalanceTaskPolicy(policy)
case "replication":
return cp.SaveReplicationTaskPolicy(policy)
}
return fmt.Errorf("unknown task type: %s", taskType)
}
// SaveReplicationTaskPolicy saves complete replication task policy to protobuf file
func (cp *ConfigPersistence) SaveReplicationTaskPolicy(policy *worker_pb.TaskPolicy) error {
return cp.saveTaskConfig(ReplicationTaskConfigFile, policy)
}
// GetDataDir returns the data directory path
func (cp *ConfigPersistence) GetDataDir() string {
return cp.dataDir
}
// IsConfigured returns true if a data directory is configured
func (cp *ConfigPersistence) IsConfigured() bool {
return cp.dataDir != ""
}
// GetConfigInfo returns information about the configuration storage
func (cp *ConfigPersistence) GetConfigInfo() map[string]interface{} {
info := map[string]interface{}{
"data_dir_configured": cp.IsConfigured(),
"data_dir": cp.dataDir,
"config_subdir": ConfigSubdir,
}
if cp.IsConfigured() {
// Check if data directory exists
if _, err := os.Stat(cp.dataDir); err == nil {
info["data_dir_exists"] = true
// Check if conf subdirectory exists
confDir := filepath.Join(cp.dataDir, ConfigSubdir)
if _, err := os.Stat(confDir); err == nil {
info["conf_dir_exists"] = true
// List config files
configFiles, err := cp.ListConfigFiles()
if err == nil {
info["config_files"] = configFiles
}
} else {
info["conf_dir_exists"] = false
}
} else {
info["data_dir_exists"] = false
}
}
return info
}
// buildPolicyFromTaskConfigs loads task configurations from separate files and builds a MaintenancePolicy
func buildPolicyFromTaskConfigs() *worker_pb.MaintenancePolicy {
policy := &worker_pb.MaintenancePolicy{
GlobalMaxConcurrent: 4,
DefaultRepeatIntervalSeconds: 6 * 3600, // 6 hours in seconds
DefaultCheckIntervalSeconds: 12 * 3600, // 12 hours in seconds
TaskPolicies: make(map[string]*worker_pb.TaskPolicy),
}
// Load vacuum task configuration
if vacuumConfig := vacuum.LoadConfigFromPersistence(nil); vacuumConfig != nil {
policy.TaskPolicies["vacuum"] = &worker_pb.TaskPolicy{
Enabled: vacuumConfig.Enabled,
MaxConcurrent: int32(vacuumConfig.MaxConcurrent),
RepeatIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds),
CheckIntervalSeconds: int32(vacuumConfig.ScanIntervalSeconds),
TaskConfig: &worker_pb.TaskPolicy_VacuumConfig{
VacuumConfig: &worker_pb.VacuumTaskConfig{
GarbageThreshold: float64(vacuumConfig.GarbageThreshold),
MinVolumeAgeHours: int32(vacuumConfig.MinVolumeAgeSeconds / 3600), // Convert seconds to hours
MinIntervalSeconds: int32(vacuumConfig.MinIntervalSeconds),
},
},
}
}
// Load erasure coding task configuration
if ecConfig := erasure_coding.LoadConfigFromPersistence(nil); ecConfig != nil {
policy.TaskPolicies["erasure_coding"] = &worker_pb.TaskPolicy{
Enabled: ecConfig.Enabled,
MaxConcurrent: int32(ecConfig.MaxConcurrent),
RepeatIntervalSeconds: int32(ecConfig.ScanIntervalSeconds),
CheckIntervalSeconds: int32(ecConfig.ScanIntervalSeconds),
TaskConfig: &worker_pb.TaskPolicy_ErasureCodingConfig{
ErasureCodingConfig: &worker_pb.ErasureCodingTaskConfig{
FullnessRatio: float64(ecConfig.FullnessRatio),
QuietForSeconds: int32(ecConfig.QuietForSeconds),
MinVolumeSizeMb: int32(ecConfig.MinSizeMB),
CollectionFilter: ecConfig.CollectionFilter,
},
},
}
}
// Load balance task configuration
if balanceConfig := balance.LoadConfigFromPersistence(nil); balanceConfig != nil {
policy.TaskPolicies["balance"] = &worker_pb.TaskPolicy{
Enabled: balanceConfig.Enabled,
MaxConcurrent: int32(balanceConfig.MaxConcurrent),
RepeatIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds),
CheckIntervalSeconds: int32(balanceConfig.ScanIntervalSeconds),
TaskConfig: &worker_pb.TaskPolicy_BalanceConfig{
BalanceConfig: &worker_pb.BalanceTaskConfig{
ImbalanceThreshold: float64(balanceConfig.ImbalanceThreshold),
MinServerCount: int32(balanceConfig.MinServerCount),
},
},
}
}
glog.V(1).Infof("Built maintenance policy from separate task configs - %d task policies loaded", len(policy.TaskPolicies))
return policy
}
// SaveTaskDetail saves detailed task information to disk
func (cp *ConfigPersistence) SaveTaskDetail(taskID string, detail *maintenance.TaskDetailData) error {
if cp.dataDir == "" {
return fmt.Errorf("no data directory specified, cannot save task detail")
}
// Validate task ID to prevent path traversal
if !isValidTaskID(taskID) {
return fmt.Errorf("invalid task ID: %q contains illegal path characters", taskID)
}
taskDetailDir := filepath.Join(cp.dataDir, TaskDetailsSubdir)
if err := os.MkdirAll(taskDetailDir, ConfigDirPermissions); err != nil {
return fmt.Errorf("failed to create task details directory: %w", err)
}
// Save task detail as JSON for easy reading and debugging
taskDetailPath := filepath.Join(taskDetailDir, fmt.Sprintf("%s.json", taskID))
jsonData, err := json.MarshalIndent(detail, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal task detail to JSON: %w", err)
}
if err := os.WriteFile(taskDetailPath, jsonData, ConfigFilePermissions); err != nil {
return fmt.Errorf("failed to write task detail file: %w", err)
}
glog.V(2).Infof("Saved task detail for task %s to %s", taskID, taskDetailPath)
return nil
}
// LoadTaskDetail loads detailed task information from disk
func (cp *ConfigPersistence) LoadTaskDetail(taskID string) (*maintenance.TaskDetailData, error) {
if cp.dataDir == "" {
return nil, fmt.Errorf("no data directory specified, cannot load task detail")
}
// Validate task ID to prevent path traversal
if !isValidTaskID(taskID) {
return nil, fmt.Errorf("invalid task ID: %q contains illegal path characters", taskID)
}
taskDetailPath := filepath.Join(cp.dataDir, TaskDetailsSubdir, fmt.Sprintf("%s.json", taskID))
if _, err := os.Stat(taskDetailPath); os.IsNotExist(err) {
return nil, fmt.Errorf("task detail file not found: %s", taskID)
}
jsonData, err := os.ReadFile(taskDetailPath)
if err != nil {
return nil, fmt.Errorf("failed to read task detail file: %w", err)
}
var detail maintenance.TaskDetailData
if err := json.Unmarshal(jsonData, &detail); err != nil {
return nil, fmt.Errorf("failed to unmarshal task detail JSON: %w", err)
}
glog.V(2).Infof("Loaded task detail for task %s from %s", taskID, taskDetailPath)
return &detail, nil
}
// SaveTaskExecutionLogs saves execution logs for a task
func (cp *ConfigPersistence) SaveTaskExecutionLogs(taskID string, logs []*maintenance.TaskExecutionLog) error {
if cp.dataDir == "" {
return fmt.Errorf("no data directory specified, cannot save task logs")
}
// Validate task ID to prevent path traversal
if !isValidTaskID(taskID) {
return fmt.Errorf("invalid task ID: %q contains illegal path characters", taskID)
}
taskLogsDir := filepath.Join(cp.dataDir, TaskLogsSubdir)
if err := os.MkdirAll(taskLogsDir, ConfigDirPermissions); err != nil {
return fmt.Errorf("failed to create task logs directory: %w", err)
}
// Save logs as JSON for easy reading
taskLogsPath := filepath.Join(taskLogsDir, fmt.Sprintf("%s.json", taskID))
logsData := struct {
TaskID string `json:"task_id"`
Logs []*maintenance.TaskExecutionLog `json:"logs"`
}{
TaskID: taskID,
Logs: logs,
}
jsonData, err := json.MarshalIndent(logsData, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal task logs to JSON: %w", err)
}
if err := os.WriteFile(taskLogsPath, jsonData, ConfigFilePermissions); err != nil {
return fmt.Errorf("failed to write task logs file: %w", err)
}
glog.V(2).Infof("Saved %d execution logs for task %s to %s", len(logs), taskID, taskLogsPath)
return nil
}
// LoadTaskExecutionLogs loads execution logs for a task
func (cp *ConfigPersistence) LoadTaskExecutionLogs(taskID string) ([]*maintenance.TaskExecutionLog, error) {
if cp.dataDir == "" {
return nil, fmt.Errorf("no data directory specified, cannot load task logs")
}
// Validate task ID to prevent path traversal
if !isValidTaskID(taskID) {
return nil, fmt.Errorf("invalid task ID: %q contains illegal path characters", taskID)
}
taskLogsPath := filepath.Join(cp.dataDir, TaskLogsSubdir, fmt.Sprintf("%s.json", taskID))
if _, err := os.Stat(taskLogsPath); os.IsNotExist(err) {
// Return empty slice if logs don't exist yet
return []*maintenance.TaskExecutionLog{}, nil
}
jsonData, err := os.ReadFile(taskLogsPath)
if err != nil {
return nil, fmt.Errorf("failed to read task logs file: %w", err)
}
var logsData struct {
TaskID string `json:"task_id"`
Logs []*maintenance.TaskExecutionLog `json:"logs"`
}
if err := json.Unmarshal(jsonData, &logsData); err != nil {
return nil, fmt.Errorf("failed to unmarshal task logs JSON: %w", err)
}
glog.V(2).Infof("Loaded %d execution logs for task %s from %s", len(logsData.Logs), taskID, taskLogsPath)
return logsData.Logs, nil
}
// DeleteTaskDetail removes task detail and logs from disk
func (cp *ConfigPersistence) DeleteTaskDetail(taskID string) error {
if cp.dataDir == "" {
return fmt.Errorf("no data directory specified, cannot delete task detail")
}
// Validate task ID to prevent path traversal
if !isValidTaskID(taskID) {
return fmt.Errorf("invalid task ID: %q contains illegal path characters", taskID)
}
// Delete task detail file
taskDetailPath := filepath.Join(cp.dataDir, TaskDetailsSubdir, fmt.Sprintf("%s.json", taskID))
if err := os.Remove(taskDetailPath); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("failed to delete task detail file: %w", err)
}
// Delete task logs file
taskLogsPath := filepath.Join(cp.dataDir, TaskLogsSubdir, fmt.Sprintf("%s.json", taskID))
if err := os.Remove(taskLogsPath); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("failed to delete task logs file: %w", err)
}
glog.V(2).Infof("Deleted task detail and logs for task %s", taskID)
return nil
}
// ListTaskDetails returns a list of all task IDs that have stored details
func (cp *ConfigPersistence) ListTaskDetails() ([]string, error) {
if cp.dataDir == "" {
return nil, fmt.Errorf("no data directory specified, cannot list task details")
}
taskDetailDir := filepath.Join(cp.dataDir, TaskDetailsSubdir)
if _, err := os.Stat(taskDetailDir); os.IsNotExist(err) {
return []string{}, nil
}
entries, err := os.ReadDir(taskDetailDir)
if err != nil {
return nil, fmt.Errorf("failed to read task details directory: %w", err)
}
var taskIDs []string
for _, entry := range entries {
if !entry.IsDir() && filepath.Ext(entry.Name()) == ".json" {
taskID := entry.Name()[:len(entry.Name())-5] // Remove .json extension
taskIDs = append(taskIDs, taskID)
}
}
return taskIDs, nil
}
// CleanupCompletedTasks removes old completed tasks beyond the retention limit
func (cp *ConfigPersistence) CleanupCompletedTasks() error {
if cp.dataDir == "" {
return fmt.Errorf("no data directory specified, cannot cleanup completed tasks")
}
tasksDir := filepath.Join(cp.dataDir, TasksSubdir)
if _, err := os.Stat(tasksDir); os.IsNotExist(err) {
return nil // No tasks directory, nothing to cleanup
}
// Load all tasks and find completed/failed ones
allTasks, err := cp.LoadAllTaskStates()
if err != nil {
return fmt.Errorf("failed to load tasks for cleanup: %w", err)
}
// Filter completed and failed tasks, sort by completion time
var completedTasks []*maintenance.MaintenanceTask
for _, task := range allTasks {
if (task.Status == maintenance.TaskStatusCompleted || task.Status == maintenance.TaskStatusFailed) && task.CompletedAt != nil {
completedTasks = append(completedTasks, task)
}
}
// Sort by completion time (most recent first)
sort.Slice(completedTasks, func(i, j int) bool {
return completedTasks[i].CompletedAt.After(*completedTasks[j].CompletedAt)
})
// Keep only the most recent MaxCompletedTasks, delete the rest
if len(completedTasks) > MaxCompletedTasks {
tasksToDelete := completedTasks[MaxCompletedTasks:]
for _, task := range tasksToDelete {
if err := cp.DeleteTaskState(task.ID); err != nil {
glog.Warningf("Failed to delete old completed task %s: %v", task.ID, err)
} else {
glog.V(2).Infof("Cleaned up old completed task %s (completed: %v)", task.ID, task.CompletedAt)
}
}
glog.V(1).Infof("Cleaned up %d old completed tasks (keeping %d most recent)", len(tasksToDelete), MaxCompletedTasks)
}
return nil
}
// SaveTaskState saves a task state to protobuf file
func (cp *ConfigPersistence) SaveTaskState(task *maintenance.MaintenanceTask) error {
if cp.dataDir == "" {
return fmt.Errorf("no data directory specified, cannot save task state")
}
// Validate task ID to prevent path traversal
if !isValidTaskID(task.ID) {
return fmt.Errorf("invalid task ID: %q contains illegal path characters", task.ID)
}
tasksDir := filepath.Join(cp.dataDir, TasksSubdir)
if err := os.MkdirAll(tasksDir, ConfigDirPermissions); err != nil {
return fmt.Errorf("failed to create tasks directory: %w", err)
}
taskFilePath := filepath.Join(tasksDir, fmt.Sprintf("%s.pb", task.ID))
// Convert task to protobuf
pbTask := cp.maintenanceTaskToProtobuf(task)
taskStateFile := &worker_pb.TaskStateFile{
Task: pbTask,
LastUpdated: time.Now().Unix(),
AdminVersion: "unknown", // TODO: add version info
}
pbData, err := proto.Marshal(taskStateFile)
if err != nil {
return fmt.Errorf("failed to marshal task state protobuf: %w", err)
}
if err := os.WriteFile(taskFilePath, pbData, ConfigFilePermissions); err != nil {
return fmt.Errorf("failed to write task state file: %w", err)
}
glog.V(2).Infof("Saved task state for task %s to %s", task.ID, taskFilePath)
return nil
}
// LoadTaskState loads a task state from protobuf file
func (cp *ConfigPersistence) LoadTaskState(taskID string) (*maintenance.MaintenanceTask, error) {
if cp.dataDir == "" {
return nil, fmt.Errorf("no data directory specified, cannot load task state")
}
// Validate task ID to prevent path traversal
if !isValidTaskID(taskID) {
return nil, fmt.Errorf("invalid task ID: %q contains illegal path characters", taskID)
}
taskFilePath := filepath.Join(cp.dataDir, TasksSubdir, fmt.Sprintf("%s.pb", taskID))
if _, err := os.Stat(taskFilePath); os.IsNotExist(err) {
return nil, fmt.Errorf("task state file not found: %s", taskID)
}
pbData, err := os.ReadFile(taskFilePath)
if err != nil {
return nil, fmt.Errorf("failed to read task state file: %w", err)
}
var taskStateFile worker_pb.TaskStateFile
if err := proto.Unmarshal(pbData, &taskStateFile); err != nil {
return nil, fmt.Errorf("failed to unmarshal task state protobuf: %w", err)
}
// Convert protobuf to maintenance task
task := cp.protobufToMaintenanceTask(taskStateFile.Task)
glog.V(2).Infof("Loaded task state for task %s from %s", taskID, taskFilePath)
return task, nil
}
// LoadAllTaskStates loads all task states from disk
func (cp *ConfigPersistence) LoadAllTaskStates() ([]*maintenance.MaintenanceTask, error) {
if cp.dataDir == "" {
return []*maintenance.MaintenanceTask{}, nil
}
tasksDir := filepath.Join(cp.dataDir, TasksSubdir)
if _, err := os.Stat(tasksDir); os.IsNotExist(err) {
return []*maintenance.MaintenanceTask{}, nil
}
entries, err := os.ReadDir(tasksDir)
if err != nil {
return nil, fmt.Errorf("failed to read tasks directory: %w", err)
}
var tasks []*maintenance.MaintenanceTask
for _, entry := range entries {
if !entry.IsDir() && filepath.Ext(entry.Name()) == ".pb" {
taskID := entry.Name()[:len(entry.Name())-3] // Remove .pb extension
task, err := cp.LoadTaskState(taskID)
if err != nil {
glog.Warningf("Failed to load task state for %s: %v", taskID, err)
continue
}
tasks = append(tasks, task)
}
}
glog.V(1).Infof("Loaded %d task states from disk", len(tasks))
return tasks, nil
}
// DeleteTaskState removes a task state file from disk
func (cp *ConfigPersistence) DeleteTaskState(taskID string) error {
if cp.dataDir == "" {
return fmt.Errorf("no data directory specified, cannot delete task state")
}
// Validate task ID to prevent path traversal
if !isValidTaskID(taskID) {
return fmt.Errorf("invalid task ID: %q contains illegal path characters", taskID)
}
taskFilePath := filepath.Join(cp.dataDir, TasksSubdir, fmt.Sprintf("%s.pb", taskID))
if err := os.Remove(taskFilePath); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("failed to delete task state file: %w", err)
}
glog.V(2).Infof("Deleted task state for task %s", taskID)
return nil
}
// maintenanceTaskToProtobuf converts a MaintenanceTask to protobuf format
func (cp *ConfigPersistence) maintenanceTaskToProtobuf(task *maintenance.MaintenanceTask) *worker_pb.MaintenanceTaskData {
pbTask := &worker_pb.MaintenanceTaskData{
Id: task.ID,
Type: string(task.Type),
Priority: cp.priorityToString(task.Priority),
Status: string(task.Status),
VolumeId: task.VolumeID,
Server: task.Server,
Collection: task.Collection,
Reason: task.Reason,
CreatedAt: task.CreatedAt.Unix(),
ScheduledAt: task.ScheduledAt.Unix(),
WorkerId: task.WorkerID,
Error: task.Error,
Progress: task.Progress,
RetryCount: int32(task.RetryCount),
MaxRetries: int32(task.MaxRetries),
CreatedBy: task.CreatedBy,
CreationContext: task.CreationContext,
DetailedReason: task.DetailedReason,
Tags: task.Tags,
}
// Handle optional timestamps
if task.StartedAt != nil {
pbTask.StartedAt = task.StartedAt.Unix()
}
if task.CompletedAt != nil {
pbTask.CompletedAt = task.CompletedAt.Unix()
}
// Convert assignment history
if task.AssignmentHistory != nil {
for _, record := range task.AssignmentHistory {
pbRecord := &worker_pb.TaskAssignmentRecord{
WorkerId: record.WorkerID,
WorkerAddress: record.WorkerAddress,
AssignedAt: record.AssignedAt.Unix(),
Reason: record.Reason,
}
if record.UnassignedAt != nil {
pbRecord.UnassignedAt = record.UnassignedAt.Unix()
}
pbTask.AssignmentHistory = append(pbTask.AssignmentHistory, pbRecord)
}
}
// Convert typed parameters if available
if task.TypedParams != nil {
pbTask.TypedParams = task.TypedParams
}
return pbTask
}
// protobufToMaintenanceTask converts protobuf format to MaintenanceTask
func (cp *ConfigPersistence) protobufToMaintenanceTask(pbTask *worker_pb.MaintenanceTaskData) *maintenance.MaintenanceTask {
task := &maintenance.MaintenanceTask{
ID: pbTask.Id,
Type: maintenance.MaintenanceTaskType(pbTask.Type),
Priority: cp.stringToPriority(pbTask.Priority),
Status: maintenance.MaintenanceTaskStatus(pbTask.Status),
VolumeID: pbTask.VolumeId,
Server: pbTask.Server,
Collection: pbTask.Collection,
Reason: pbTask.Reason,
CreatedAt: time.Unix(pbTask.CreatedAt, 0),
ScheduledAt: time.Unix(pbTask.ScheduledAt, 0),
WorkerID: pbTask.WorkerId,
Error: pbTask.Error,
Progress: pbTask.Progress,
RetryCount: int(pbTask.RetryCount),
MaxRetries: int(pbTask.MaxRetries),
CreatedBy: pbTask.CreatedBy,
CreationContext: pbTask.CreationContext,
DetailedReason: pbTask.DetailedReason,
Tags: pbTask.Tags,
}
// Handle optional timestamps
if pbTask.StartedAt > 0 {
startTime := time.Unix(pbTask.StartedAt, 0)
task.StartedAt = &startTime
}
if pbTask.CompletedAt > 0 {
completedTime := time.Unix(pbTask.CompletedAt, 0)
task.CompletedAt = &completedTime
}
// Convert assignment history
if pbTask.AssignmentHistory != nil {
task.AssignmentHistory = make([]*maintenance.TaskAssignmentRecord, 0, len(pbTask.AssignmentHistory))
for _, pbRecord := range pbTask.AssignmentHistory {
record := &maintenance.TaskAssignmentRecord{
WorkerID: pbRecord.WorkerId,
WorkerAddress: pbRecord.WorkerAddress,
AssignedAt: time.Unix(pbRecord.AssignedAt, 0),
Reason: pbRecord.Reason,
}
if pbRecord.UnassignedAt > 0 {
unassignedTime := time.Unix(pbRecord.UnassignedAt, 0)
record.UnassignedAt = &unassignedTime
}
task.AssignmentHistory = append(task.AssignmentHistory, record)
}
}
// Convert typed parameters if available
if pbTask.TypedParams != nil {
task.TypedParams = pbTask.TypedParams
}
return task
}
// priorityToString converts MaintenanceTaskPriority to string for protobuf storage
func (cp *ConfigPersistence) priorityToString(priority maintenance.MaintenanceTaskPriority) string {
switch priority {
case maintenance.PriorityLow:
return "low"
case maintenance.PriorityNormal:
return "normal"
case maintenance.PriorityHigh:
return "high"
case maintenance.PriorityCritical:
return "critical"
default:
return "normal"
}
}
// stringToPriority converts string from protobuf to MaintenanceTaskPriority
func (cp *ConfigPersistence) stringToPriority(priorityStr string) maintenance.MaintenanceTaskPriority {
switch priorityStr {
case "low":
return maintenance.PriorityLow
case "normal":
return maintenance.PriorityNormal
case "high":
return maintenance.PriorityHigh
case "critical":
return maintenance.PriorityCritical
default:
return maintenance.PriorityNormal
}
}