Admin: refactoring active topology (#7073)

* refactoring

* add ec shard size

* address comments

* passing task id

There seems to be a disconnect between the pending tasks created in ActiveTopology and the TaskDetectionResult returned by this function. A taskID is generated locally and used to create pending tasks via AddPendingECShardTask, but this taskID is not stored in the TaskDetectionResult or passed along in any way.

This makes it impossible for the worker that eventually executes the task to know which pending task in ActiveTopology it corresponds to. Without the correct taskID, the worker cannot call AssignTask or CompleteTask on the master, breaking the entire task lifecycle and capacity management feature.

A potential solution is to add a TaskID field to TaskDetectionResult and worker_pb.TaskParams, ensuring the ID is propagated from detection to execution.
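A hedged sketch of that propagation, using illustrative stand-in types (the real TaskDetectionResult and worker_pb.TaskParams definitions live in the admin and worker packages and carry other fields not shown here):

```go
package main

import "fmt"

// Stand-in for TaskDetectionResult with the proposed TaskID field.
type TaskDetectionResult struct {
	TaskID   string // same ID used to create the pending task via AddPendingECShardTask
	VolumeID uint32
}

// Stand-in for worker_pb.TaskParams with the proposed TaskID field.
type TaskParams struct {
	TaskID   string // lets the worker call AssignTask/CompleteTask with the correct ID
	VolumeID uint32
}

// toTaskParams propagates the ID from detection to execution.
func toTaskParams(r TaskDetectionResult) TaskParams {
	return TaskParams{TaskID: r.TaskID, VolumeID: r.VolumeID}
}

func main() {
	detected := TaskDetectionResult{TaskID: "ec-1001", VolumeID: 1001}
	params := toTaskParams(detected)
	fmt.Println(params.TaskID) // the worker now knows which pending task it owns
}
```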

* 1 source multiple destinations

* task supports multi source and destination

* ec needs to clean up previous shards

* use erasure coding constants

* getPlanningCapacityUnsafe and getEffectiveAvailableCapacityUnsafe should return StorageSlotChange for calculation

* use CanAccommodate to calculate

* remove dead code

* address comments

* fix Mutex Copying in Protobuf Structs

* use constants

* fix estimatedSize

The calculation for estimatedSize only considers source.EstimatedSize and dest.StorageChange, but omits dest.EstimatedSize. The TaskDestination struct has an EstimatedSize field, which seems to be ignored here. This could lead to an incorrect estimation of the total size of data involved in tasks on a disk. The loop should probably also include estimatedSize += dest.EstimatedSize.
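A minimal sketch of the corrected accumulation, with stand-in types for the source and destination structs (only the EstimatedSize fields are shown; the real loop also folds in dest.StorageChange):

```go
// taskSource and taskDestination are illustrative stand-ins for the real structs.
type taskSource struct{ EstimatedSize int64 }
type taskDestination struct{ EstimatedSize int64 }

// estimateTaskSize sums the data a task touches on a disk,
// counting destinations as well as sources.
func estimateTaskSize(sources []taskSource, dests []taskDestination) int64 {
	var estimatedSize int64
	for _, src := range sources {
		estimatedSize += src.EstimatedSize
	}
	for _, dest := range dests {
		estimatedSize += dest.EstimatedSize // previously omitted, undercounting the total
	}
	return estimatedSize
}
```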

* at.assignTaskToDisk(task)

* refactoring

* Update weed/admin/topology/internal.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* fail fast

* fix compilation

* Update weed/worker/tasks/erasure_coding/detection.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* indexes for volume and shard locations

* dedup with ToVolumeSlots

* return an additional boolean to indicate success, or an error

* Update abstract_sql_store.go

* fix

* Update weed/worker/tasks/erasure_coding/detection.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* Update weed/admin/topology/task_management.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* faster findVolumeDisk

* Update weed/worker/tasks/erasure_coding/detection.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update weed/admin/topology/storage_slot_test.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* refactor

* simplify

* remove unused GetDiskStorageImpact function

* refactor

* add comments

* Update weed/admin/topology/storage_impact.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* Update weed/admin/topology/storage_slot_test.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* Update storage_impact.go

* AddPendingTask

The unified AddPendingTask function is now the single entry point for all task creation, consolidating the previously separate task-specific helpers while preserving their behavior.

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Author: Chris Lu
Date: 2025-08-03 01:35:38 -07:00
Committed by: GitHub
Parent: 315fcc70b2
Commit: 0ecb466eda
18 changed files with 2579 additions and 467 deletions

@@ -1,6 +1,7 @@
package topology
import (
"fmt"
"testing"
"time"
@@ -9,6 +10,16 @@ import (
"github.com/stretchr/testify/require"
)
// Helper function to find a disk by ID for testing - reduces code duplication
func findDiskByID(disks []*DiskInfo, diskID uint32) *DiskInfo {
for _, disk := range disks {
if disk.DiskID == diskID {
return disk
}
}
return nil
}
// TestActiveTopologyBasicOperations tests basic topology management
func TestActiveTopologyBasicOperations(t *testing.T) {
topology := NewActiveTopology(10)
@@ -58,8 +69,19 @@ func TestTaskLifecycle(t *testing.T) {
taskID := "balance-001"
// 1. Add pending task
topology.AddPendingTask(taskID, TaskTypeBalance, 1001,
"10.0.0.1:8080", 0, "10.0.0.2:8080", 1)
err := topology.AddPendingTask(TaskSpec{
TaskID: taskID,
TaskType: TaskTypeBalance,
VolumeID: 1001,
VolumeSize: 1024 * 1024 * 1024,
Sources: []TaskSourceSpec{
{ServerID: "10.0.0.1:8080", DiskID: 0},
},
Destinations: []TaskDestinationSpec{
{ServerID: "10.0.0.2:8080", DiskID: 1},
},
})
assert.NoError(t, err, "Should add pending task successfully")
// Verify pending state
assert.Equal(t, 1, len(topology.pendingTasks))
@@ -77,7 +99,7 @@ func TestTaskLifecycle(t *testing.T) {
assert.Equal(t, 1, len(targetDisk.pendingTasks))
// 2. Assign task
err := topology.AssignTask(taskID)
err = topology.AssignTask(taskID)
require.NoError(t, err)
// Verify assigned state
@@ -258,8 +280,7 @@ func TestTargetSelectionScenarios(t *testing.T) {
assert.NotEqual(t, tt.excludeNode, disk.NodeID,
"Available disk should not be on excluded node")
load := tt.topology.GetDiskLoad(disk.NodeID, disk.DiskID)
assert.Less(t, load, 2, "Disk load should be less than 2")
assert.Less(t, disk.LoadCount, 2, "Disk load should be less than 2")
}
})
}
@@ -271,37 +292,65 @@ func TestDiskLoadCalculation(t *testing.T) {
topology.UpdateTopology(createSampleTopology())
// Initially no load
load := topology.GetDiskLoad("10.0.0.1:8080", 0)
assert.Equal(t, 0, load)
disks := topology.GetNodeDisks("10.0.0.1:8080")
targetDisk := findDiskByID(disks, 0)
require.NotNil(t, targetDisk, "Should find disk with ID 0")
assert.Equal(t, 0, targetDisk.LoadCount)
// Add pending task
topology.AddPendingTask("task1", TaskTypeBalance, 1001,
"10.0.0.1:8080", 0, "10.0.0.2:8080", 1)
err := topology.AddPendingTask(TaskSpec{
TaskID: "task1",
TaskType: TaskTypeBalance,
VolumeID: 1001,
VolumeSize: 1024 * 1024 * 1024,
Sources: []TaskSourceSpec{
{ServerID: "10.0.0.1:8080", DiskID: 0},
},
Destinations: []TaskDestinationSpec{
{ServerID: "10.0.0.2:8080", DiskID: 1},
},
})
assert.NoError(t, err, "Should add pending task successfully")
// Check load increased
load = topology.GetDiskLoad("10.0.0.1:8080", 0)
assert.Equal(t, 1, load)
disks = topology.GetNodeDisks("10.0.0.1:8080")
targetDisk = findDiskByID(disks, 0)
assert.Equal(t, 1, targetDisk.LoadCount)
// Add another task to same disk
topology.AddPendingTask("task2", TaskTypeVacuum, 1002,
"10.0.0.1:8080", 0, "", 0)
err = topology.AddPendingTask(TaskSpec{
TaskID: "task2",
TaskType: TaskTypeVacuum,
VolumeID: 1002,
VolumeSize: 0,
Sources: []TaskSourceSpec{
{ServerID: "10.0.0.1:8080", DiskID: 0},
},
Destinations: []TaskDestinationSpec{
{ServerID: "", DiskID: 0}, // Vacuum doesn't have a destination
},
})
assert.NoError(t, err, "Should add vacuum task successfully")
load = topology.GetDiskLoad("10.0.0.1:8080", 0)
assert.Equal(t, 2, load)
disks = topology.GetNodeDisks("10.0.0.1:8080")
targetDisk = findDiskByID(disks, 0)
assert.Equal(t, 2, targetDisk.LoadCount)
// Move one task to assigned
topology.AssignTask("task1")
// Load should still be 2 (1 pending + 1 assigned)
load = topology.GetDiskLoad("10.0.0.1:8080", 0)
assert.Equal(t, 2, load)
disks = topology.GetNodeDisks("10.0.0.1:8080")
targetDisk = findDiskByID(disks, 0)
assert.Equal(t, 2, targetDisk.LoadCount)
// Complete one task
topology.CompleteTask("task1")
// Load should decrease to 1
load = topology.GetDiskLoad("10.0.0.1:8080", 0)
assert.Equal(t, 1, load)
disks = topology.GetNodeDisks("10.0.0.1:8080")
targetDisk = findDiskByID(disks, 0)
assert.Equal(t, 1, targetDisk.LoadCount)
}
// TestTaskConflictDetection tests task conflict detection
@@ -310,8 +359,19 @@ func TestTaskConflictDetection(t *testing.T) {
topology.UpdateTopology(createSampleTopology())
// Add a balance task
topology.AddPendingTask("balance1", TaskTypeBalance, 1001,
"10.0.0.1:8080", 0, "10.0.0.2:8080", 1)
err := topology.AddPendingTask(TaskSpec{
TaskID: "balance1",
TaskType: TaskTypeBalance,
VolumeID: 1001,
VolumeSize: 1024 * 1024 * 1024,
Sources: []TaskSourceSpec{
{ServerID: "10.0.0.1:8080", DiskID: 0},
},
Destinations: []TaskDestinationSpec{
{ServerID: "10.0.0.2:8080", DiskID: 1},
},
})
assert.NoError(t, err, "Should add balance task successfully")
topology.AssignTask("balance1")
// Try to get available disks for vacuum (conflicts with balance)
@@ -448,8 +508,22 @@ func createTopologyWithLoad() *ActiveTopology {
topology.UpdateTopology(createSampleTopology())
// Add some existing tasks to create load
topology.AddPendingTask("existing1", TaskTypeVacuum, 2001,
"10.0.0.1:8080", 0, "", 0)
err := topology.AddPendingTask(TaskSpec{
TaskID: "existing1",
TaskType: TaskTypeVacuum,
VolumeID: 2001,
VolumeSize: 0,
Sources: []TaskSourceSpec{
{ServerID: "10.0.0.1:8080", DiskID: 0},
},
Destinations: []TaskDestinationSpec{
{ServerID: "", DiskID: 0}, // Vacuum doesn't have a destination
},
})
if err != nil {
// In test helper function, just log error instead of failing
fmt.Printf("Warning: Failed to add existing task: %v\n", err)
}
topology.AssignTask("existing1")
return topology
@@ -466,12 +540,38 @@ func createTopologyWithConflicts() *ActiveTopology {
topology.UpdateTopology(createSampleTopology())
// Add conflicting tasks
topology.AddPendingTask("balance1", TaskTypeBalance, 3001,
"10.0.0.1:8080", 0, "10.0.0.2:8080", 0)
err := topology.AddPendingTask(TaskSpec{
TaskID: "balance1",
TaskType: TaskTypeBalance,
VolumeID: 3001,
VolumeSize: 1024 * 1024 * 1024,
Sources: []TaskSourceSpec{
{ServerID: "10.0.0.1:8080", DiskID: 0},
},
Destinations: []TaskDestinationSpec{
{ServerID: "10.0.0.2:8080", DiskID: 0},
},
})
if err != nil {
fmt.Printf("Warning: Failed to add balance task: %v\n", err)
}
topology.AssignTask("balance1")
topology.AddPendingTask("ec1", TaskTypeErasureCoding, 3002,
"10.0.0.1:8080", 1, "", 0)
err = topology.AddPendingTask(TaskSpec{
TaskID: "ec1",
TaskType: TaskTypeErasureCoding,
VolumeID: 3002,
VolumeSize: 1024 * 1024 * 1024,
Sources: []TaskSourceSpec{
{ServerID: "10.0.0.1:8080", DiskID: 1},
},
Destinations: []TaskDestinationSpec{
{ServerID: "", DiskID: 0}, // EC doesn't have single destination
},
})
if err != nil {
fmt.Printf("Warning: Failed to add EC task: %v\n", err)
}
topology.AssignTask("ec1")
return topology