Fix EC shard recovery with improved diagnostics (#8091)
* storage: fix EC shard recovery with improved diagnostics and logging
  - Fix buffer size mismatch in ReconstructData call
  - Add detailed logging of available and missing shards
  - Improve error messages when recovery is impossible
  - Add unit tests for EC recovery shard counting logic
* test: refine EC recovery unit tests
  - Remove redundant tests that only validate setup
  - Use standard strings.Contains instead of custom recursive helper
* adjust tests and make minor improvements
This commit is contained in:
@@ -390,9 +390,32 @@ func (s *Store) recoverOneRemoteEcShardInterval(needleId types.NeedleId, ecVolum
|
||||
|
||||
wg.Wait()
|
||||
|
||||
if err = enc.ReconstructData(bufs); err != nil {
|
||||
glog.V(3).Infof("recovered ec shard %d.%d failed: %v", ecVolume.VolumeId, shardIdToRecover, err)
|
||||
return 0, false, err
|
||||
// Count and log available shards for diagnostics
|
||||
availableShards := make([]erasure_coding.ShardId, 0, erasure_coding.TotalShardsCount)
|
||||
missingShards := make([]erasure_coding.ShardId, 0, erasure_coding.ParityShardsCount+1)
|
||||
for shardId := 0; shardId < erasure_coding.TotalShardsCount; shardId++ {
|
||||
if bufs[shardId] != nil {
|
||||
availableShards = append(availableShards, erasure_coding.ShardId(shardId))
|
||||
} else {
|
||||
missingShards = append(missingShards, erasure_coding.ShardId(shardId))
|
||||
}
|
||||
}
|
||||
|
||||
glog.V(3).Infof("recover ec shard %d.%d: %d shards available %v, %d missing %v",
|
||||
ecVolume.VolumeId, shardIdToRecover,
|
||||
len(availableShards), availableShards,
|
||||
len(missingShards), missingShards)
|
||||
|
||||
if len(availableShards) < erasure_coding.DataShardsCount {
|
||||
return 0, false, fmt.Errorf("cannot recover shard %d.%d: only %d shards available %v, need at least %d (missing: %v)",
|
||||
ecVolume.VolumeId, shardIdToRecover,
|
||||
len(availableShards), availableShards,
|
||||
erasure_coding.DataShardsCount, missingShards)
|
||||
}
|
||||
|
||||
if err = enc.ReconstructData(bufs[:erasure_coding.TotalShardsCount]); err != nil {
|
||||
return 0, false, fmt.Errorf("failed to reconstruct data for shard %d.%d with %d available shards %v: %w",
|
||||
ecVolume.VolumeId, shardIdToRecover, len(availableShards), availableShards, err)
|
||||
}
|
||||
glog.V(4).Infof("recovered ec shard %d.%d from other locations", ecVolume.VolumeId, shardIdToRecover)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user