Volume Server: handle incomplete ec encoding (#7384)
* handle incomplete ec encoding
* unit tests
* simplify, and better logs
* Update disk_location_ec.go
  When loadEcShards() fails partway through, some EC shards may already be loaded into the l.ecVolumes map in memory. The previous code only cleaned up the files on disk but left orphaned in-memory state, which could cause memory leaks and inconsistent state.
* address comments
* Performance: Avoid Double os.Stat() Call
* Platform Compatibility: Use filepath.Join
* in memory cleanup
* Update disk_location_ec.go
* refactor
* Added Shard Size Validation
* check ec shard sizes
* validate shard size
* calculate expected shard size
* refactoring
* minor
* fix shard directory
* 10GB sparse files can be slow or fail on non-sparse FS. Use 10MB to hit SmallBlockSize math (1MB shards) deterministically.
* grouping logic should be updated to use both collection and volumeId to ensure correctness
* unexpected error
* handle exceptions in tests; use constants
* The check for orphaned shards should be performed for the previous volume before resetting sameVolumeShards for the new volume.
* address comments
* Eliminated Redundant Parsing in checkOrphanedShards
* minor
* Avoid misclassifying local EC as distributed when .dat stat errors occur; also standardize unload-before-remove.
* fmt
* refactor
* refactor
* adjust to warning
This commit is contained in:
@@ -144,10 +144,26 @@ func (l *DiskLocation) loadExistingVolume(dirEntry os.DirEntry, needleMapKind Ne
|
||||
return false
|
||||
}
|
||||
|
||||
// skip if ec volumes exists
|
||||
// parse out collection, volume id (moved up to use in EC validation)
|
||||
vid, collection, err := volumeIdFromFileName(basename)
|
||||
if err != nil {
|
||||
glog.Warningf("get volume id failed, %s, err : %s", volumeName, err)
|
||||
return false
|
||||
}
|
||||
|
||||
// skip if ec volumes exists, but validate EC files first
|
||||
if skipIfEcVolumesExists {
|
||||
if util.FileExists(l.IdxDirectory + "/" + volumeName + ".ecx") {
|
||||
return false
|
||||
ecxFilePath := filepath.Join(l.IdxDirectory, volumeName+".ecx")
|
||||
if util.FileExists(ecxFilePath) {
|
||||
// Check if EC volume is valid by verifying shard count
|
||||
if !l.validateEcVolume(collection, vid) {
|
||||
glog.Warningf("EC volume %d validation failed, removing incomplete EC files to allow .dat file loading", vid)
|
||||
l.removeEcVolumeFiles(collection, vid)
|
||||
// Continue to load .dat file
|
||||
} else {
|
||||
// Valid EC volume exists, skip .dat file
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -161,13 +177,6 @@ func (l *DiskLocation) loadExistingVolume(dirEntry os.DirEntry, needleMapKind Ne
|
||||
return false
|
||||
}
|
||||
|
||||
// parse out collection, volume id
|
||||
vid, collection, err := volumeIdFromFileName(basename)
|
||||
if err != nil {
|
||||
glog.Warningf("get volume id failed, %s, err : %s", volumeName, err)
|
||||
return false
|
||||
}
|
||||
|
||||
// avoid loading one volume more than once
|
||||
l.volumesLock.RLock()
|
||||
_, found := l.volumes[vid]
|
||||
|
||||
Reference in New Issue
Block a user