Volume Server: handle incomplete ec encoding (#7384)
* handle incomplete ec encoding
* unit tests
* simplify, and better logs
* Update disk_location_ec.go: when loadEcShards() fails partway through, some EC shards may already be loaded into the l.ecVolumes map in memory. The previous code only cleaned up filesystem files but left orphaned in-memory state, which could cause memory leaks and inconsistent state.
* address comments
* Performance: Avoid Double os.Stat() Call
* Platform Compatibility: Use filepath.Join
* in memory cleanup
* Update disk_location_ec.go
* refactor
* Added Shard Size Validation
* check ec shard sizes
* validate shard size
* calculate expected shard size
* refactoring
* minor
* fix shard directory
* 10GB sparse files can be slow or fail on non-sparse FS. Use 10MB to hit SmallBlockSize math (1MB shards) deterministically (see the worked example after this list).
* grouping logic should be updated to use both collection and volumeId to ensure correctness
* unexpected error
* handle exceptions in tests; use constants
* The check for orphaned shards should be performed for the previous volume before resetting sameVolumeShards for the new volume.
* address comments
* Eliminated Redundant Parsing in checkOrphanedShards
* minor
* Avoid misclassifying local EC as distributed when .dat stat errors occur; also standardize unload-before-remove.
* fmt
* refactor
* refactor
* adjust to warning
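For reference, the shard-size math mentioned above ("Use 10MB to hit SmallBlockSize math (1MB shards)") works out as in the following standalone sketch. It mirrors the new calculateExpectedShardSize function shown later in this diff, with the erasure_coding constants written out as assumed literal values (10 data shards, 1GB large blocks, 1MB small blocks); it is an illustration, not part of the commit.

package main

import "fmt"

// Assumed values mirroring erasure_coding.DataShardsCount, ErasureCodingLargeBlockSize,
// and ErasureCodingSmallBlockSize: 10 data shards, 1GB large blocks, 1MB small blocks.
const (
	dataShards     = int64(10)
	largeBlockSize = int64(1024 * 1024 * 1024) // 1GB
	smallBlockSize = int64(1024 * 1024)        // 1MB
)

// expectedShardSize reproduces the batching logic: whole large batches (1GB x 10 shards)
// first, then the remainder rounded up to whole small batches (1MB x 10 shards).
func expectedShardSize(datFileSize int64) int64 {
	largeBatch := largeBlockSize * dataShards
	shardSize := (datFileSize / largeBatch) * largeBlockSize
	if remaining := datFileSize % largeBatch; remaining > 0 {
		smallBatch := smallBlockSize * dataShards
		shardSize += ((remaining + smallBatch - 1) / smallBatch) * smallBlockSize
	}
	return shardSize
}

func main() {
	fmt.Println(expectedShardSize(10 * 1024 * 1024))        // 10MB .dat -> 1048576 (1MB per shard)
	fmt.Println(expectedShardSize(10 * 1024 * 1024 * 1024)) // 10GB .dat -> 1073741824 (1GB per shard)
}

Running the sketch prints 1048576 and 1073741824: a 10MB .dat yields 1MB shards (one small block each), and a 10GB .dat yields 1GB shards (one large block each).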
@@ -10,6 +10,7 @@ import (
	"slices"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
)
@@ -40,6 +41,23 @@ func (l *DiskLocation) DestroyEcVolume(vid needle.VolumeId) {
	}
}

// unloadEcVolume removes an EC volume from memory without deleting its files on disk.
// This is useful for distributed EC volumes where shards may be on other servers.
func (l *DiskLocation) unloadEcVolume(vid needle.VolumeId) {
	var toClose *erasure_coding.EcVolume
	l.ecVolumesLock.Lock()
	if ecVolume, found := l.ecVolumes[vid]; found {
		toClose = ecVolume
		delete(l.ecVolumes, vid)
	}
	l.ecVolumesLock.Unlock()

	// Close outside the lock to avoid holding write lock during I/O
	if toClose != nil {
		toClose.Close()
	}
}

func (l *DiskLocation) CollectEcShards(vid needle.VolumeId, shardFileNames []string) (ecVolume *erasure_coding.EcVolume, found bool) {
	l.ecVolumesLock.RLock()
	defer l.ecVolumesLock.RUnlock()
@@ -154,8 +172,18 @@ func (l *DiskLocation) loadAllEcShards() (err error) {
	slices.SortFunc(dirEntries, func(a, b os.DirEntry) int {
		return strings.Compare(a.Name(), b.Name())
	})

	var sameVolumeShards []string
	var prevVolumeId needle.VolumeId
	var prevCollection string

	// Helper to reset state between volume processing
	reset := func() {
		sameVolumeShards = nil
		prevVolumeId = 0
		prevCollection = ""
	}

	for _, fileInfo := range dirEntries {
		if fileInfo.IsDir() {
			continue
@@ -178,24 +206,31 @@ func (l *DiskLocation) loadAllEcShards() (err error) {
 		// 0 byte files should be only appearing erroneously for ec data files
 		// so we ignore them
 		if re.MatchString(ext) && info.Size() > 0 {
-			if prevVolumeId == 0 || volumeId == prevVolumeId {
+			// Group shards by both collection and volumeId to avoid mixing collections
+			if prevVolumeId == 0 || (volumeId == prevVolumeId && collection == prevCollection) {
 				sameVolumeShards = append(sameVolumeShards, fileInfo.Name())
 			} else {
+				// Before starting a new group, check if previous group had orphaned shards
+				l.checkOrphanedShards(sameVolumeShards, prevCollection, prevVolumeId)
 				sameVolumeShards = []string{fileInfo.Name()}
 			}
 			prevVolumeId = volumeId
+			prevCollection = collection
 			continue
 		}

-		if ext == ".ecx" && volumeId == prevVolumeId {
-			if err = l.loadEcShards(sameVolumeShards, collection, volumeId); err != nil {
-				return fmt.Errorf("loadEcShards collection:%v volumeId:%d : %v", collection, volumeId, err)
-			}
-			prevVolumeId = volumeId
+		if ext == ".ecx" && volumeId == prevVolumeId && collection == prevCollection {
+			l.handleFoundEcxFile(sameVolumeShards, collection, volumeId)
+			reset()
 			continue
 		}

 	}

+	// Check for orphaned EC shards without .ecx file at the end of the directory scan
+	// This handles the last group of shards in the directory
+	l.checkOrphanedShards(sameVolumeShards, prevCollection, prevVolumeId)
+
 	return nil
 }
@@ -237,3 +272,208 @@ func (l *DiskLocation) EcShardCount() int {
	}
	return shardCount
}

// handleFoundEcxFile processes a complete group of EC shards when their .ecx file is found.
// This includes validation, loading, and cleanup of incomplete/invalid EC volumes.
func (l *DiskLocation) handleFoundEcxFile(shards []string, collection string, volumeId needle.VolumeId) {
	// Check if this is an incomplete EC encoding (not a distributed EC volume)
	// Key distinction: if .dat file still exists, EC encoding may have failed
	// If .dat file is gone, this is likely a distributed EC volume with shards on multiple servers
	baseFileName := erasure_coding.EcShardFileName(collection, l.Directory, int(volumeId))
	datFileName := baseFileName + ".dat"

	// Determine .dat presence robustly; unexpected errors are treated as "exists"
	datExists := l.checkDatFileExists(datFileName)

	// Validate EC volume if .dat file exists (incomplete EC encoding scenario)
	// This checks shard count, shard size consistency, and expected size vs .dat file
	// If .dat is gone, EC encoding completed and shards are distributed across servers
	if datExists && !l.validateEcVolume(collection, volumeId) {
		glog.Warningf("Incomplete or invalid EC volume %d: .dat exists but validation failed, cleaning up EC files...", volumeId)
		l.removeEcVolumeFiles(collection, volumeId)
		return
	}

	// Attempt to load the EC shards
	if err := l.loadEcShards(shards, collection, volumeId); err != nil {
		// If EC shards failed to load and .dat still exists, clean up EC files to allow .dat file to be used
		// If .dat is gone, log error but don't clean up (may be waiting for shards from other servers)
		if datExists {
			glog.Warningf("Failed to load EC shards for volume %d and .dat exists: %v, cleaning up EC files to use .dat...", volumeId, err)
			// Unload first to release FDs, then remove files
			l.unloadEcVolume(volumeId)
			l.removeEcVolumeFiles(collection, volumeId)
		} else {
			glog.Warningf("Failed to load EC shards for volume %d: %v (this may be normal for distributed EC volumes)", volumeId, err)
			// Clean up any partially loaded in-memory state. This does not delete files.
			l.unloadEcVolume(volumeId)
		}
		return
	}
}

// checkDatFileExists checks if .dat file exists with robust error handling.
// Unexpected errors (permission, I/O) are treated as "exists" to avoid misclassifying
// local EC as distributed EC, which is the safer fallback.
func (l *DiskLocation) checkDatFileExists(datFileName string) bool {
	if _, err := os.Stat(datFileName); err == nil {
		return true
	} else if !os.IsNotExist(err) {
		glog.Warningf("Failed to stat .dat file %s: %v", datFileName, err)
		// Safer to assume local .dat exists to avoid misclassifying as distributed EC
		return true
	}
	return false
}

// checkOrphanedShards checks if the given shards are orphaned (no .ecx file) and cleans them up if needed.
// Returns true if orphaned shards were found and cleaned up.
// This handles the case where EC encoding was interrupted before creating the .ecx file.
func (l *DiskLocation) checkOrphanedShards(shards []string, collection string, volumeId needle.VolumeId) bool {
	if len(shards) == 0 || volumeId == 0 {
		return false
	}

	// Check if .dat file exists (incomplete encoding, not distributed EC)
	baseFileName := erasure_coding.EcShardFileName(collection, l.Directory, int(volumeId))
	datFileName := baseFileName + ".dat"

	if l.checkDatFileExists(datFileName) {
		glog.Warningf("Found %d EC shards without .ecx file for volume %d (incomplete encoding interrupted before .ecx creation), cleaning up...",
			len(shards), volumeId)
		l.removeEcVolumeFiles(collection, volumeId)
		return true
	}
	return false
}

// calculateExpectedShardSize computes the exact expected shard size based on .dat file size
// The EC encoding process is deterministic:
// 1. Data is processed in batches of (LargeBlockSize * DataShardsCount) for large blocks
// 2. Remaining data is processed in batches of (SmallBlockSize * DataShardsCount) for small blocks
// 3. Each shard gets exactly its portion, with zero-padding applied to incomplete blocks
func calculateExpectedShardSize(datFileSize int64) int64 {
	var shardSize int64

	// Process large blocks (1GB * 10 = 10GB batches)
	largeBatchSize := int64(erasure_coding.ErasureCodingLargeBlockSize) * int64(erasure_coding.DataShardsCount)
	numLargeBatches := datFileSize / largeBatchSize
	shardSize = numLargeBatches * int64(erasure_coding.ErasureCodingLargeBlockSize)
	remainingSize := datFileSize - (numLargeBatches * largeBatchSize)

	// Process remaining data in small blocks (1MB * 10 = 10MB batches)
	if remainingSize > 0 {
		smallBatchSize := int64(erasure_coding.ErasureCodingSmallBlockSize) * int64(erasure_coding.DataShardsCount)
		numSmallBatches := (remainingSize + smallBatchSize - 1) / smallBatchSize // Ceiling division
		shardSize += numSmallBatches * int64(erasure_coding.ErasureCodingSmallBlockSize)
	}

	return shardSize
}

// validateEcVolume checks if EC volume has enough shards to be functional
// For distributed EC volumes (where .dat is deleted), any number of shards is valid
// For incomplete EC encoding (where .dat still exists), we need at least DataShardsCount shards
// Also validates that all shards have the same size (required for Reed-Solomon EC)
// If .dat exists, it also validates shards match the expected size based on .dat file size
func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId) bool {
	baseFileName := erasure_coding.EcShardFileName(collection, l.Directory, int(vid))
	datFileName := baseFileName + ".dat"

	var expectedShardSize int64 = -1
	datExists := false

	// If .dat file exists, compute exact expected shard size from it
	if datFileInfo, err := os.Stat(datFileName); err == nil {
		datExists = true
		expectedShardSize = calculateExpectedShardSize(datFileInfo.Size())
	} else if !os.IsNotExist(err) {
		// If stat fails with unexpected error (permission, I/O), fail validation
		// Don't treat this as "distributed EC" - it could be a temporary error
		glog.Warningf("Failed to stat .dat file %s: %v", datFileName, err)
		return false
	}

	shardCount := 0
	var actualShardSize int64 = -1

	// Count shards and validate they all have the same size (required for Reed-Solomon EC)
	// Shard files (.ec00 - .ec13) are always in l.Directory, not l.IdxDirectory
	for i := 0; i < erasure_coding.TotalShardsCount; i++ {
		shardFileName := baseFileName + erasure_coding.ToExt(i)
		fi, err := os.Stat(shardFileName)

		if err == nil {
			// Check if file has non-zero size
			if fi.Size() > 0 {
				// Validate all shards are the same size (required for Reed-Solomon EC)
				if actualShardSize == -1 {
					actualShardSize = fi.Size()
				} else if fi.Size() != actualShardSize {
					glog.Warningf("EC volume %d shard %d has size %d, expected %d (all EC shards must be same size)",
						vid, i, fi.Size(), actualShardSize)
					return false
				}
				shardCount++
			}
		} else if !os.IsNotExist(err) {
			// If stat fails with unexpected error (permission, I/O), fail validation
			// This is consistent with .dat file error handling
			glog.Warningf("Failed to stat shard file %s: %v", shardFileName, err)
			return false
		}
	}

	// If .dat file exists, validate shard size matches expected size
	if datExists && actualShardSize > 0 && expectedShardSize > 0 {
		if actualShardSize != expectedShardSize {
			glog.Warningf("EC volume %d: shard size %d doesn't match expected size %d (based on .dat file size)",
				vid, actualShardSize, expectedShardSize)
			return false
		}
	}

	// If .dat file is gone, this is a distributed EC volume - any shard count is valid
	if !datExists {
		glog.V(1).Infof("EC volume %d: distributed EC (.dat removed) with %d shards", vid, shardCount)
		return true
	}

	// If .dat file exists, we need at least DataShardsCount shards locally
	// Otherwise it's an incomplete EC encoding that should be cleaned up
	if shardCount < erasure_coding.DataShardsCount {
		glog.Warningf("EC volume %d has .dat file but only %d shards (need at least %d for local EC)",
			vid, shardCount, erasure_coding.DataShardsCount)
		return false
	}

	return true
}

// removeEcVolumeFiles removes all EC-related files for a volume
func (l *DiskLocation) removeEcVolumeFiles(collection string, vid needle.VolumeId) {
	baseFileName := erasure_coding.EcShardFileName(collection, l.Directory, int(vid))
	indexBaseFileName := erasure_coding.EcShardFileName(collection, l.IdxDirectory, int(vid))

	// Helper to remove a file with consistent error handling
	removeFile := func(filePath, description string) {
		if err := os.Remove(filePath); err != nil {
			if !os.IsNotExist(err) {
				glog.Warningf("Failed to remove incomplete %s %s: %v", description, filePath, err)
			}
		} else {
			glog.V(2).Infof("Removed incomplete %s: %s", description, filePath)
		}
	}

	// Remove index files first (.ecx, .ecj) before shard files
	// This ensures that if cleanup is interrupted, the .ecx file won't trigger
	// EC loading for incomplete/missing shards on next startup
	removeFile(indexBaseFileName+".ecx", "EC index file")
	removeFile(indexBaseFileName+".ecj", "EC journal file")

	// Remove all EC shard files (.ec00 ~ .ec13) from data directory
	for i := 0; i < erasure_coding.TotalShardsCount; i++ {
		removeFile(baseFileName+erasure_coding.ToExt(i), "EC shard file")
	}
}
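A minimal sketch of how the new size math could be exercised from a unit test alongside disk_location_ec.go (assumed to live in the same package, taken here to be package storage). The actual unit tests added by this commit may differ; the test name and cases below are illustrative only.

package storage

import (
	"testing"

	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
)

func TestCalculateExpectedShardSizeSketch(t *testing.T) {
	smallBlock := int64(erasure_coding.ErasureCodingSmallBlockSize)
	largeBlock := int64(erasure_coding.ErasureCodingLargeBlockSize)
	dataShards := int64(erasure_coding.DataShardsCount)

	cases := []struct {
		datSize, want int64
	}{
		{0, 0},                                // empty .dat file -> zero-size shards
		{1, smallBlock},                       // any tail rounds up to one small block per shard
		{smallBlock * dataShards, smallBlock}, // one full small batch (e.g. 10MB) -> one 1MB block per shard
		{largeBlock * dataShards, largeBlock}, // one full large batch (e.g. 10GB) -> one 1GB block per shard
	}
	for _, c := range cases {
		if got := calculateExpectedShardSize(c.datSize); got != c.want {
			t.Errorf("calculateExpectedShardSize(%d) = %d, want %d", c.datSize, got, c.want)
		}
	}
}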