* fix: EC rebalance fails with replica placement 000

  This PR fixes several issues with EC shard distribution:

  1. Pre-flight check before EC encoding
     - Verify the target disk type has capacity before encoding starts
     - Prevents encoding shards only to fail during rebalance
     - Shows a helpful error when the wrong diskType is specified (e.g., ssd when volumes are on hdd)
  2. Fix EC rebalance with replica placement 000 (see the first sketch after this list)
     - When DiffRackCount=0, shards should be distributed freely across racks
     - The '000' placement means 'no volume replication needed' because EC provides redundancy
     - Previously all racks were skipped with the error 'shards X > replica placement limit (0)'
  3. Add unit tests for EC rebalance slot calculation
     - TestECRebalanceWithLimitedSlots: documents the limited-slots scenario
     - TestECRebalanceZeroFreeSlots: reproduces the 0-free-slots error
  4. Add Makefile for manual EC testing
     - make setup: start cluster and populate data
     - make shell: open weed shell for EC commands
     - make clean: stop cluster and clean up

* fix: default -rebalance to true for ec.encode

  The -rebalance flag was defaulting to false, which meant ec.encode would only print shard moves but not actually execute them. This is a poor default, since the whole point of EC encoding is to distribute shards across servers for fault tolerance. Now -rebalance defaults to true, so shards are actually distributed after encoding. Users can pass -rebalance=false if they only want to see what would happen without making changes.

* test/erasure_coding: improve Makefile safety and docs

  - Narrow the pkill pattern for volume servers to use TEST_DIR instead of a port pattern, avoiding accidental kills of unrelated SeaweedFS processes
  - Document external dependencies (curl, jq) in header comments

* shell: refactor buildRackWithEcShards to reuse buildEcShards

  Extract the common shard bit construction logic to avoid duplication between the buildEcShards and buildRackWithEcShards helper functions.

* shell: update test for EC replication 000 behavior

  When DiffRackCount=0 (replication "000"), EC shards should be distributed freely across racks since erasure coding provides its own redundancy. Update the test expectation to reflect this behavior.

* erasure_coding: add distribution package for proportional EC shard placement

  Add a new reusable package for EC shard distribution that:
  - Supports configurable EC ratios (not hard-coded 10+4)
  - Distributes shards proportionally based on the replication policy
  - Provides fault tolerance analysis
  - Prefers moving parity shards to keep data shards spread out

  Key components:
  - ECConfig: configurable data/parity shard counts
  - ReplicationConfig: parsed XYZ replication policy
  - ECDistribution: target shard counts per DC/rack/node
  - Rebalancer: plans shard moves with a parity-first strategy

  This enables seaweed-enterprise custom EC ratios and weed worker integration while maintaining a clean, testable architecture.

* shell: integrate distribution package for EC rebalancing

  Add shell wrappers around the distribution package:
  - ProportionalECRebalancer: plans moves using distribution.Rebalancer
  - NewProportionalECRebalancerWithConfig: supports custom EC configs
  - GetDistributionSummary/GetFaultToleranceAnalysis: helper functions

  The shell layer converts between EcNode types and the generic TopologyNode types used by the distribution package.

* test setup

* ec: improve data and parity shard distribution across racks

  - Add a shardsByTypePerRack helper to track data vs parity shards
  - Rewrite doBalanceEcShardsAcrossRacks for two-pass balancing (see the second sketch below):
    1. Balance data shards (0-9) evenly, max ceil(10/6)=2 per rack
    2. Balance parity shards (10-13) evenly, max ceil(4/6)=1 per rack
  - Add balanceShardTypeAcrossRacks for generic shard type balancing
  - Add pickRackForShardType to select a destination with room for the shard type
  - Add unit tests for even data/parity distribution verification

  This ensures an even read load during normal operation by spreading both data and parity shards across all available racks.

* ec: make data/parity shard counts configurable in ecBalancer

  - Add dataShardCount and parityShardCount fields to the ecBalancer struct
  - Add getDataShardCount() and getParityShardCount() methods with defaults (see the third sketch below)
  - Replace direct constant usage with the configurable methods
  - Fix the unused variable warning for parityPerRack

  This allows seaweed-enterprise to use custom EC ratios while defaulting to the standard 10+4 scheme.

* Address PR 7812 review comments

  Makefile improvements:
  - Save PIDs for each volume server for precise termination
  - Use PID-based killing in the stop target with a pkill fallback
  - Use more specific pkill patterns with TEST_DIR paths

  Documentation:
  - Document the jq dependency in README.md

  Rebalancer fix:
  - Fix duplicate shard count updates in applyMovesToAnalysis
  - All planners (DC/rack/node) update counts inline during planning
  - Remove the duplicate updates from applyMovesToAnalysis to avoid double-counting

* test/erasure_coding: use mktemp for test file template

  Use mktemp instead of the hardcoded /tmp/testfile_template.bin path to provide better isolation for concurrent test runs.
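For reference, a minimal, hypothetical sketch of the replica-placement rule the "replica placement 000" fix relies on. The struct, function, and field names here are illustrative stand-ins, not the SeaweedFS ReplicaPlacement API; the point is only that a middle digit of 0 (DiffRackCount=0) must not be treated as a per-rack shard cap of 0.

```go
// Hypothetical sketch only: parses an "XYZ" replication string the way the
// commit description uses it (X = other data centers, Y = other racks,
// Z = other servers in the same rack). Names are illustrative.
package main

import "fmt"

type placementSketch struct {
    diffDataCenterCount int
    diffRackCount       int
    sameRackCount       int
}

func parsePlacementSketch(s string) (placementSketch, error) {
    if len(s) != 3 {
        return placementSketch{}, fmt.Errorf("placement must be 3 digits, got %q", s)
    }
    return placementSketch{
        diffDataCenterCount: int(s[0] - '0'),
        diffRackCount:       int(s[1] - '0'),
        sameRackCount:       int(s[2] - '0'),
    }, nil
}

func main() {
    rp, _ := parsePlacementSketch("000")
    if rp.diffRackCount == 0 {
        // "000" asks for no volume replication; EC parity already provides
        // redundancy, so shards may spread freely across racks instead of
        // every rack being rejected with
        // "shards X > replica placement limit (0)".
        fmt.Println("no per-rack limit: distribute shards freely")
    }
}
```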
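Next, a minimal sketch of the two-pass cap arithmetic described in the "improve data and parity shard distribution across racks" commit, assuming the standard 10+4 layout and six racks. ceilDivide mirrors the helper the tests below rely on; everything else is illustrative.

```go
// Illustrative only: derives the per-rack caps used by the two-pass rack
// balancing described above (data shards first, then parity shards).
package main

import "fmt"

// ceilDivide mirrors the helper used by the tests below.
func ceilDivide(a, b int) int {
    return (a + b - 1) / b
}

func main() {
    const dataShards, parityShards, racks = 10, 4, 6

    // Pass 1: data shards (ids 0-9), at most ceil(10/6) = 2 per rack.
    maxDataPerRack := ceilDivide(dataShards, racks)
    // Pass 2: parity shards (ids 10-13), at most ceil(4/6) = 1 per rack.
    maxParityPerRack := ceilDivide(parityShards, racks)

    // Shard ids below dataShards are data shards; the rest are parity.
    isParity := func(shardId int) bool { return shardId >= dataShards }

    fmt.Printf("max data/rack=%d, max parity/rack=%d, shard 12 parity=%v\n",
        maxDataPerRack, maxParityPerRack, isParity(12))
}
```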
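Finally, a sketch of the "configurable with defaults" accessors named in the ecBalancer commit. The method names come from the commit description, but the struct here is a stand-in for the real ecBalancer, and the 12+3 ratio is only an example of a custom scheme.

```go
// Stand-in struct and accessors illustrating the defaulting behaviour
// described above; the real ecBalancer has many more fields.
package main

import "fmt"

const (
    defaultDataShards   = 10 // erasure_coding.DataShardsCount
    defaultParityShards = 4  // erasure_coding.ParityShardsCount
)

type balancerSketch struct {
    dataShardCount   int // 0 means "use the default"
    parityShardCount int
}

func (b *balancerSketch) getDataShardCount() int {
    if b.dataShardCount > 0 {
        return b.dataShardCount
    }
    return defaultDataShards
}

func (b *balancerSketch) getParityShardCount() int {
    if b.parityShardCount > 0 {
        return b.parityShardCount
    }
    return defaultParityShards
}

func main() {
    standard := &balancerSketch{}                                      // standard 10+4 scheme
    custom := &balancerSketch{dataShardCount: 12, parityShardCount: 3} // e.g. a custom 12+3 scheme
    fmt.Println(standard.getDataShardCount(), standard.getParityShardCount())
    fmt.Println(custom.getDataShardCount(), custom.getParityShardCount())
}
```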
288 lines · 12 KiB · Go
package shell

import (
    "testing"

    "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
    "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
    "github.com/seaweedfs/seaweedfs/weed/storage/needle"
    "github.com/seaweedfs/seaweedfs/weed/storage/types"
)

func TestCommandEcBalanceSmall(t *testing.T) {
    ecb := &ecBalancer{
        ecNodes: []*EcNode{
            newEcNode("dc1", "rack1", "dn1", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}),
            newEcNode("dc1", "rack2", "dn2", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}),
        },
        applyBalancing: false,
        diskType:       types.HardDriveType,
    }

    ecb.balanceEcVolumes("c1")
}

func TestCommandEcBalanceNothingToMove(t *testing.T) {
    ecb := &ecBalancer{
        ecNodes: []*EcNode{
            newEcNode("dc1", "rack1", "dn1", 100).
                addEcVolumeAndShardsForTest(1, "c1", []uint32{0, 1, 2, 3, 4, 5, 6}).
                addEcVolumeAndShardsForTest(2, "c1", []uint32{7, 8, 9, 10, 11, 12, 13}),
            newEcNode("dc1", "rack1", "dn2", 100).
                addEcVolumeAndShardsForTest(1, "c1", []uint32{7, 8, 9, 10, 11, 12, 13}).
                addEcVolumeAndShardsForTest(2, "c1", []uint32{0, 1, 2, 3, 4, 5, 6}),
        },
        applyBalancing: false,
        diskType:       types.HardDriveType,
    }

    ecb.balanceEcVolumes("c1")
}

func TestCommandEcBalanceAddNewServers(t *testing.T) {
    ecb := &ecBalancer{
        ecNodes: []*EcNode{
            newEcNode("dc1", "rack1", "dn1", 100).
                addEcVolumeAndShardsForTest(1, "c1", []uint32{0, 1, 2, 3, 4, 5, 6}).
                addEcVolumeAndShardsForTest(2, "c1", []uint32{7, 8, 9, 10, 11, 12, 13}),
            newEcNode("dc1", "rack1", "dn2", 100).
                addEcVolumeAndShardsForTest(1, "c1", []uint32{7, 8, 9, 10, 11, 12, 13}).
                addEcVolumeAndShardsForTest(2, "c1", []uint32{0, 1, 2, 3, 4, 5, 6}),
            newEcNode("dc1", "rack1", "dn3", 100),
            newEcNode("dc1", "rack1", "dn4", 100),
        },
        applyBalancing: false,
        diskType:       types.HardDriveType,
    }

    ecb.balanceEcVolumes("c1")
}

func TestCommandEcBalanceAddNewRacks(t *testing.T) {
    ecb := &ecBalancer{
        ecNodes: []*EcNode{
            newEcNode("dc1", "rack1", "dn1", 100).
                addEcVolumeAndShardsForTest(1, "c1", []uint32{0, 1, 2, 3, 4, 5, 6}).
                addEcVolumeAndShardsForTest(2, "c1", []uint32{7, 8, 9, 10, 11, 12, 13}),
            newEcNode("dc1", "rack1", "dn2", 100).
                addEcVolumeAndShardsForTest(1, "c1", []uint32{7, 8, 9, 10, 11, 12, 13}).
                addEcVolumeAndShardsForTest(2, "c1", []uint32{0, 1, 2, 3, 4, 5, 6}),
            newEcNode("dc1", "rack2", "dn3", 100),
            newEcNode("dc1", "rack2", "dn4", 100),
        },
        applyBalancing: false,
        diskType:       types.HardDriveType,
    }

    ecb.balanceEcVolumes("c1")
}

func TestCommandEcBalanceVolumeEvenButRackUneven(t *testing.T) {
    ecb := ecBalancer{
        ecNodes: []*EcNode{
            newEcNode("dc1", "rack1", "dn_shared", 100).
                addEcVolumeAndShardsForTest(1, "c1", []uint32{0}).
                addEcVolumeAndShardsForTest(2, "c1", []uint32{0}),

            newEcNode("dc1", "rack1", "dn_a1", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{1}),
            newEcNode("dc1", "rack1", "dn_a2", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{2}),
            newEcNode("dc1", "rack1", "dn_a3", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{3}),
            newEcNode("dc1", "rack1", "dn_a4", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{4}),
            newEcNode("dc1", "rack1", "dn_a5", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{5}),
            newEcNode("dc1", "rack1", "dn_a6", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{6}),
            newEcNode("dc1", "rack1", "dn_a7", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{7}),
            newEcNode("dc1", "rack1", "dn_a8", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{8}),
            newEcNode("dc1", "rack1", "dn_a9", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{9}),
            newEcNode("dc1", "rack1", "dn_a10", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{10}),
            newEcNode("dc1", "rack1", "dn_a11", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{11}),
            newEcNode("dc1", "rack1", "dn_a12", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{12}),
            newEcNode("dc1", "rack1", "dn_a13", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{13}),

            newEcNode("dc1", "rack1", "dn_b1", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{1}),
            newEcNode("dc1", "rack1", "dn_b2", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{2}),
            newEcNode("dc1", "rack1", "dn_b3", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{3}),
            newEcNode("dc1", "rack1", "dn_b4", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{4}),
            newEcNode("dc1", "rack1", "dn_b5", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{5}),
            newEcNode("dc1", "rack1", "dn_b6", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{6}),
            newEcNode("dc1", "rack1", "dn_b7", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{7}),
            newEcNode("dc1", "rack1", "dn_b8", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{8}),
            newEcNode("dc1", "rack1", "dn_b9", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{9}),
            newEcNode("dc1", "rack1", "dn_b10", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{10}),
            newEcNode("dc1", "rack1", "dn_b11", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{11}),
            newEcNode("dc1", "rack1", "dn_b12", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{12}),
            newEcNode("dc1", "rack1", "dn_b13", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{13}),

            newEcNode("dc1", "rack1", "dn3", 100),
        },
        applyBalancing: false,
        diskType:       types.HardDriveType,
    }

    ecb.balanceEcVolumes("c1")
    ecb.balanceEcRacks()
}

func newEcNode(dc string, rack string, dataNodeId string, freeEcSlot int) *EcNode {
    return &EcNode{
        info: &master_pb.DataNodeInfo{
            Id:        dataNodeId,
            DiskInfos: make(map[string]*master_pb.DiskInfo),
        },
        dc:         DataCenterId(dc),
        rack:       RackId(rack),
        freeEcSlot: freeEcSlot,
    }
}

func (ecNode *EcNode) addEcVolumeAndShardsForTest(vid uint32, collection string, shardIds []uint32) *EcNode {
    return ecNode.addEcVolumeShards(needle.VolumeId(vid), collection, shardIds, types.HardDriveType)
}

// TestCommandEcBalanceEvenDataAndParityDistribution verifies that after balancing:
// 1. Data shards (0-9) are evenly distributed across racks (max 2 per rack for 6 racks)
// 2. Parity shards (10-13) are evenly distributed across racks (max 1 per rack for 6 racks)
func TestCommandEcBalanceEvenDataAndParityDistribution(t *testing.T) {
    // Setup: All 14 shards start on rack1 (simulating a fresh EC encode)
    ecb := &ecBalancer{
        ecNodes: []*EcNode{
            // All shards initially on rack1/dn1
            newEcNode("dc1", "rack1", "dn1", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}),
            // Empty nodes on other racks
            newEcNode("dc1", "rack2", "dn2", 100),
            newEcNode("dc1", "rack3", "dn3", 100),
            newEcNode("dc1", "rack4", "dn4", 100),
            newEcNode("dc1", "rack5", "dn5", 100),
            newEcNode("dc1", "rack6", "dn6", 100),
        },
        applyBalancing: false, // Dry-run mode (simulates moves by updating internal state)
        diskType:       types.HardDriveType,
    }

    ecb.balanceEcVolumes("c1")

    // After balancing (dry-run), verify the PLANNED distribution by checking what moves were proposed.
    // The ecb.ecNodes state is updated during dry-run to track planned moves.
    vid := needle.VolumeId(1)
    dataShardCount := erasure_coding.DataShardsCount     // 10
    parityShardCount := erasure_coding.ParityShardsCount // 4

    // Count data and parity shards per rack based on the current (updated) state
    dataPerRack, parityPerRack := countDataAndParityShardsPerRack(ecb.ecNodes, vid, dataShardCount)

    // With 6 racks:
    // - Data shards (10): max 2 per rack (ceil(10/6) = 2)
    // - Parity shards (4): max 1 per rack (ceil(4/6) = 1)
    maxDataPerRack := ceilDivide(dataShardCount, 6)     // 2
    maxParityPerRack := ceilDivide(parityShardCount, 6) // 1

    // Verify no rack has more than the max data shards
    for rackId, count := range dataPerRack {
        if count > maxDataPerRack {
            t.Errorf("rack %s has %d data shards, expected max %d", rackId, count, maxDataPerRack)
        }
    }

    // Verify no rack has more than the max parity shards
    for rackId, count := range parityPerRack {
        if count > maxParityPerRack {
            t.Errorf("rack %s has %d parity shards, expected max %d", rackId, count, maxParityPerRack)
        }
    }

    // Verify all shards are distributed (total counts)
    totalData := 0
    totalParity := 0
    for _, count := range dataPerRack {
        totalData += count
    }
    for _, count := range parityPerRack {
        totalParity += count
    }
    if totalData != dataShardCount {
        t.Errorf("total data shards = %d, expected %d", totalData, dataShardCount)
    }
    if totalParity != parityShardCount {
        t.Errorf("total parity shards = %d, expected %d", totalParity, parityShardCount)
    }

    // Verify data shards are spread across at least 5 racks (10 shards / 2 max per rack)
    racksWithData := len(dataPerRack)
    minRacksForData := dataShardCount / maxDataPerRack // at least 5 racks needed for 10 data shards
    if racksWithData < minRacksForData {
        t.Errorf("data shards spread across only %d racks, expected at least %d", racksWithData, minRacksForData)
    }

    // Verify parity shards are spread across at least 4 racks (4 shards / 1 max per rack)
    racksWithParity := len(parityPerRack)
    if racksWithParity < parityShardCount {
        t.Errorf("parity shards spread across only %d racks, expected at least %d", racksWithParity, parityShardCount)
    }

    t.Logf("Distribution after balancing:")
    t.Logf("  Data shards per rack: %v (max allowed: %d)", dataPerRack, maxDataPerRack)
    t.Logf("  Parity shards per rack: %v (max allowed: %d)", parityPerRack, maxParityPerRack)
}

// countDataAndParityShardsPerRack counts data and parity shards per rack
func countDataAndParityShardsPerRack(ecNodes []*EcNode, vid needle.VolumeId, dataShardCount int) (dataPerRack, parityPerRack map[string]int) {
    dataPerRack = make(map[string]int)
    parityPerRack = make(map[string]int)

    for _, ecNode := range ecNodes {
        shardBits := findEcVolumeShards(ecNode, vid, types.HardDriveType)
        for _, shardId := range shardBits.ShardIds() {
            rackId := string(ecNode.rack)
            if int(shardId) < dataShardCount {
                dataPerRack[rackId]++
            } else {
                parityPerRack[rackId]++
            }
        }
    }
    return
}

// TestCommandEcBalanceMultipleVolumesEvenDistribution tests that multiple volumes
// each get their data and parity shards evenly distributed
func TestCommandEcBalanceMultipleVolumesEvenDistribution(t *testing.T) {
    // Setup: Two volumes, each with all 14 shards on different starting racks
    ecb := &ecBalancer{
        ecNodes: []*EcNode{
            // Volume 1: all shards on rack1
            newEcNode("dc1", "rack1", "dn1", 100).addEcVolumeAndShardsForTest(1, "c1", []uint32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}),
            // Volume 2: all shards on rack2
            newEcNode("dc1", "rack2", "dn2", 100).addEcVolumeAndShardsForTest(2, "c1", []uint32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}),
            // Empty nodes on other racks
            newEcNode("dc1", "rack3", "dn3", 100),
            newEcNode("dc1", "rack4", "dn4", 100),
            newEcNode("dc1", "rack5", "dn5", 100),
            newEcNode("dc1", "rack6", "dn6", 100),
        },
        applyBalancing: false, // Dry-run mode
        diskType:       types.HardDriveType,
    }

    ecb.balanceEcVolumes("c1")

    // Check both volumes
    for _, vid := range []needle.VolumeId{1, 2} {
        dataPerRack, parityPerRack := countDataAndParityShardsPerRack(ecb.ecNodes, vid, erasure_coding.DataShardsCount)

        maxDataPerRack := ceilDivide(erasure_coding.DataShardsCount, 6)
        maxParityPerRack := ceilDivide(erasure_coding.ParityShardsCount, 6)

        for rackId, count := range dataPerRack {
            if count > maxDataPerRack {
                t.Errorf("volume %d: rack %s has %d data shards, expected max %d", vid, rackId, count, maxDataPerRack)
            }
        }
        for rackId, count := range parityPerRack {
            if count > maxParityPerRack {
                t.Errorf("volume %d: rack %s has %d parity shards, expected max %d", vid, rackId, count, maxParityPerRack)
            }
        }

        t.Logf("Volume %d - Data: %v, Parity: %v", vid, dataPerRack, parityPerRack)
    }
}