Fix EC shard recovery with improved diagnostics (#8091)

* storage: fix EC shard recovery with improved diagnostics and logging

- Fix buffer size mismatch in ReconstructData call
- Add detailed logging of available and missing shards
- Improve error messages when recovery is impossible
- Add unit tests for EC recovery shard counting logic

* test: refine EC recovery unit tests

- Remove redundant tests that only validate setup
- Use standard strings.Contains instead of custom recursive helper

* adjust tests and make minor improvements
This commit is contained in:
Chris Lu
2026-01-22 20:34:19 -08:00
committed by GitHub
parent bc1113208d
commit e717a63665
2 changed files with 448 additions and 3 deletions

View File

@@ -390,9 +390,32 @@ func (s *Store) recoverOneRemoteEcShardInterval(needleId types.NeedleId, ecVolum
wg.Wait()
if err = enc.ReconstructData(bufs); err != nil {
glog.V(3).Infof("recovered ec shard %d.%d failed: %v", ecVolume.VolumeId, shardIdToRecover, err)
return 0, false, err
// Count and log available shards for diagnostics
availableShards := make([]erasure_coding.ShardId, 0, erasure_coding.TotalShardsCount)
missingShards := make([]erasure_coding.ShardId, 0, erasure_coding.ParityShardsCount+1)
for shardId := 0; shardId < erasure_coding.TotalShardsCount; shardId++ {
if bufs[shardId] != nil {
availableShards = append(availableShards, erasure_coding.ShardId(shardId))
} else {
missingShards = append(missingShards, erasure_coding.ShardId(shardId))
}
}
glog.V(3).Infof("recover ec shard %d.%d: %d shards available %v, %d missing %v",
ecVolume.VolumeId, shardIdToRecover,
len(availableShards), availableShards,
len(missingShards), missingShards)
if len(availableShards) < erasure_coding.DataShardsCount {
return 0, false, fmt.Errorf("cannot recover shard %d.%d: only %d shards available %v, need at least %d (missing: %v)",
ecVolume.VolumeId, shardIdToRecover,
len(availableShards), availableShards,
erasure_coding.DataShardsCount, missingShards)
}
if err = enc.ReconstructData(bufs[:erasure_coding.TotalShardsCount]); err != nil {
return 0, false, fmt.Errorf("failed to reconstruct data for shard %d.%d with %d available shards %v: %w",
ecVolume.VolumeId, shardIdToRecover, len(availableShards), availableShards, err)
}
glog.V(4).Infof("recovered ec shard %d.%d from other locations", ecVolume.VolumeId, shardIdToRecover)