* Process .ecj deletions during EC decode and vacuum decoded volume (#8798)

  When decoding EC volumes back to normal volumes, deletions recorded in
  the .ecj journal were not being applied before computing the .dat file
  size or checking for live needles. This caused the decoded volume to
  include data for deleted files and could produce false positives in the
  all-deleted check.

  - Call RebuildEcxFile before HasLiveNeedles/FindDatFileSize in
    VolumeEcShardsToVolume so .ecj deletions are merged into .ecx first
    (ordering sketched below)
  - Vacuum the decoded volume after mounting in ec.decode to compact out
    deleted needle data from the .dat file
  - Add integration tests for decoding with non-empty .ecj files

* storage: add offline volume compaction helper

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* ec: compact decoded volumes before deleting shards

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* ec: address PR review comments

  - Fall back to the data directory for .ecx when the idx directory lacks it
  - Make compaction failure non-fatal during EC decode
  - Remove the misleading "buffer: 10%" from the space-check error message

* ec: collect .ecj from all shard locations during decode

  Each server's .ecj contains only deletions for needles whose data
  resides in shards held by that server. Previously, sources with no new
  data shards to contribute were skipped entirely, losing their .ecj
  deletion entries. Now .ecj is always appended from every shard location
  so RebuildEcxFile sees the full set of deletions.

* ec: add integration tests for .ecj collection during decode

  TestEcDecodePreservesDeletedNeedles: verifies that needles deleted via
  VolumeEcBlobDelete are excluded from the decoded volume.

  TestEcDecodeCollectsEcjFromPeer: regression test for the fix in
  collectEcShards. Deletes a needle only on a peer server that holds no
  new data shards, then verifies the deletion survives decode via .ecj
  collection.

* ec: address review nits in decode and tests

  - Remove double error wrapping in mountDecodedVolume
  - Check the VolumeUnmount error in the peer .ecj test
  - Assert 404 specifically for deleted needles; fail on 5xx

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
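As context for the first commit, here is a minimal sketch of the corrected ordering in VolumeEcShardsToVolume. Only the names RebuildEcxFile, HasLiveNeedles, and FindDatFileSize come from the commit message; the stub helpers, their signatures, and the decodeEcVolume wrapper are illustrative assumptions, not the real API.

    // Sketch only: stand-ins for the real helpers; actual signatures may differ.
    func rebuildEcxFile(base string) error           { return nil } // erasure_coding.RebuildEcxFile
    func hasLiveNeedles(base string) (bool, error)   { return true, nil }
    func findDatFileSize(base string) (int64, error) { return 0, nil }

    func decodeEcVolume(base string) error {
        // 1. Merge .ecj journal deletions into the .ecx index first, so every
        //    later liveness/size computation already sees them.
        if err := rebuildEcxFile(base); err != nil {
            return err
        }
        // 2. Only now is the all-deleted check reliable.
        live, err := hasLiveNeedles(base)
        if err != nil {
            return err
        }
        if !live {
            return nil // every needle is deleted; nothing to decode
        }
        // 3. The computed .dat size likewise reflects the merged deletions.
        if _, err := findDatFileSize(base); err != nil {
            return err
        }
        // 4. (Not shown) write the .dat, mount the volume, then vacuum it to
        //    compact out needle data the journal marked deleted.
        return nil
    }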
257 lines · 9.2 KiB · Go
package storage

import (
	"fmt"
	"os"

	"github.com/syndtr/goleveldb/leveldb/opt"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/stats"
	"github.com/seaweedfs/seaweedfs/weed/storage/backend"
	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
	"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
	"github.com/seaweedfs/seaweedfs/weed/storage/types"
	"github.com/seaweedfs/seaweedfs/weed/util"
)

func loadVolumeWithoutIndex(dirname string, collection string, id needle.VolumeId, needleMapKind NeedleMapKind, ver needle.Version) (v *Volume, err error) {
	v = &Volume{dir: dirname, Collection: collection, Id: id}
	v.SuperBlock = super_block.SuperBlock{}
	v.needleMapKind = needleMapKind
	err = v.load(false, false, needleMapKind, 0, ver)
	return
}

func loadVolumeWithoutWorker(dirname string, dirIdx string, collection string, id needle.VolumeId, needleMapKind NeedleMapKind, ldbTimeout int64) (v *Volume, err error) {
	v = &Volume{
		dir:           dirname,
		dirIdx:        dirIdx,
		Collection:    collection,
		Id:            id,
		needleMapKind: needleMapKind,
		ldbTimeout:    ldbTimeout,
	}
	v.SuperBlock = super_block.SuperBlock{}
	err = v.load(true, false, needleMapKind, 0, needle.GetCurrentVersion())
	return
}

func (v *Volume) load(alsoLoadIndex bool, createDatIfMissing bool, needleMapKind NeedleMapKind, preallocate int64, ver needle.Version) (err error) {
	alreadyHasSuperBlock := false

	hasLoadedVolume := false
	defer func() {
		if !hasLoadedVolume {
			if v.nm != nil {
				v.nm.Close()
				v.nm = nil
			}
			if v.DataBackend != nil {
				v.DataBackend.Close()
				v.DataBackend = nil
			}
		}
	}()

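	// load the optional .vif metadata file; it records the volume version,
	// read-only state, and any remote tier files backing this volume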
	hasVolumeInfoFile := v.maybeLoadVolumeInfo()

	if v.volumeInfo.ReadOnly && !v.HasRemoteFile() {
		// this covers the case where the volume is marked as read-only and has no remote file
		v.noWriteOrDelete = true
	}

	if v.HasRemoteFile() {
		v.noWriteCanDelete = true
		v.noWriteOrDelete = false
		glog.V(0).Infof("loading volume %d from remote %v", v.Id, v.volumeInfo)
		if err := v.LoadRemoteFile(); err != nil {
			return fmt.Errorf("load remote file %v: %w", v.volumeInfo, err)
		}
		// set lastModifiedTsSeconds from the remote file to prevent premature expiry on startup
		if len(v.volumeInfo.GetFiles()) > 0 {
			remoteFileModifiedTime := v.volumeInfo.GetFiles()[0].GetModifiedTime()
			if remoteFileModifiedTime > 0 {
				v.lastModifiedTsSeconds = remoteFileModifiedTime
			} else {
				// fallback: use the .vif file's modification time
				if exists, _, _, modifiedTime, _ := util.CheckFile(v.FileName(".vif")); exists {
					v.lastModifiedTsSeconds = uint64(modifiedTime.Unix())
				}
			}
			glog.V(1).Infof("volume %d remote file lastModifiedTsSeconds set to %d", v.Id, v.lastModifiedTsSeconds)
		}
		alreadyHasSuperBlock = true
	} else if exists, canRead, canWrite, modifiedTime, fileSize := util.CheckFile(v.FileName(".dat")); exists {
		// open the .dat data file
		if !canRead {
			return fmt.Errorf("cannot read Volume Data file %s", v.FileName(".dat"))
		}
		var dataFile *os.File
		if canWrite {
			dataFile, err = os.OpenFile(v.FileName(".dat"), os.O_RDWR|os.O_CREATE, 0644)
		} else {
			glog.V(0).Infof("opening %s in READONLY mode", v.FileName(".dat"))
			dataFile, err = os.Open(v.FileName(".dat"))
			v.noWriteOrDelete = true
		}
		v.lastModifiedTsSeconds = uint64(modifiedTime.Unix())
		if fileSize >= super_block.SuperBlockSize {
			alreadyHasSuperBlock = true
		}
		v.DataBackend = backend.NewDiskFile(dataFile)
	} else {
		if createDatIfMissing {
			v.DataBackend, err = backend.CreateVolumeFile(v.FileName(".dat"), preallocate, v.MemoryMapMaxSizeMb)
		} else {
			return fmt.Errorf("volume data file %s does not exist", v.FileName(".dat"))
		}
	}

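	// abort early if the data file could not be opened or created above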
	if err != nil {
		if os.IsPermission(err) {
			return fmt.Errorf("load data file %s: %w", v.FileName(".dat"), err)
		}
		return fmt.Errorf("cannot load volume data %s: %w", v.FileName(".dat"), err)
	}

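	// read the existing superblock, or write a fresh one for a newly created .dat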
	if alreadyHasSuperBlock {
		err = v.readSuperBlock()
		if err == nil {
			if !needle.IsSupportedVersion(v.SuperBlock.Version) {
				glog.Fatalf("Unsupported volume %d version %v", v.Id, v.SuperBlock.Version)
			}
			v.volumeInfo.Version = uint32(v.SuperBlock.Version)
		}
		glog.V(2).Infof("readSuperBlock volume %d version %v", v.Id, v.SuperBlock.Version)
		if err != nil && v.HasRemoteFile() {
			// tolerate superblock read errors for remote volumes: maybe a temporary network problem
			glog.Errorf("readSuperBlock remote volume %d: %v", v.Id, err)
			err = nil
		}
	} else {
		if !v.SuperBlock.Initialized() {
			return fmt.Errorf("volume %s not initialized", v.FileName(".dat"))
		}
		err = v.maybeWriteSuperBlock(ver)
	}
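	// load the .idx needle index and build the needle map, if requested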
	if err == nil && alsoLoadIndex {
		// adjust for existing volumes that keep the .idx next to the .dat file
		if v.dirIdx != v.dir {
			if util.FileExists(v.DataFileName() + ".idx") {
				v.dirIdx = v.dir
			}
		}
		// check the volume idx files
		if err := v.checkIdxFile(); err != nil {
			glog.Fatalf("check volume idx file %s: %v", v.FileName(".idx"), err)
		}
		var indexFile *os.File
		if v.noWriteOrDelete {
			glog.V(0).Infoln("open to read file", v.FileName(".idx"))
			if indexFile, err = os.OpenFile(v.FileName(".idx"), os.O_RDONLY, 0644); err != nil {
				return fmt.Errorf("cannot read Volume Index %s: %v", v.FileName(".idx"), err)
			}
		} else {
			glog.V(1).Infoln("open to write file", v.FileName(".idx"))
			if indexFile, err = os.OpenFile(v.FileName(".idx"), os.O_RDWR|os.O_CREATE, 0644); err != nil {
				return fmt.Errorf("cannot write Volume Index %s: %v", v.FileName(".idx"), err)
			}
		}
		// No need to check data integrity for remote volumes: the remote
		// storage tier may have larger capacity than local disk, and reading
		// the volume data would trigger ReadAt() to download from the remote
		// tier to local storage, which could overload local capacity.
		if !v.HasRemoteFile() {
			glog.V(2).Infof("checking volume data integrity for volume %d", v.Id)
			if v.lastAppendAtNs, err = CheckVolumeDataIntegrity(v, indexFile); err != nil {
				v.noWriteOrDelete = true
				glog.V(0).Infof("volumeDataIntegrityChecking failed %v", err)
			}
		}

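		// select the needle map implementation: read-only volumes use the
		// pre-sorted .sdx index, writable volumes use the configured kind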
		if v.noWriteOrDelete || v.noWriteCanDelete {
			if v.nm, err = NewSortedFileNeedleMap(v.IndexFileName(), indexFile); err != nil {
				glog.V(0).Infof("loading sorted db %s error: %v", v.FileName(".sdx"), err)
			}
		} else {
			switch needleMapKind {
			case NeedleMapInMemory:
				if v.tmpNm != nil {
					glog.V(2).Infof("updating memory compact index %s", v.FileName(".idx"))
					err = v.tmpNm.UpdateNeedleMap(v, indexFile, nil, 0)
				} else {
					glog.V(2).Infoln("loading memory index", v.FileName(".idx"), "to memory")
					if v.nm, err = LoadCompactNeedleMap(indexFile); err != nil {
						glog.V(0).Infof("loading index %s to memory error: %v", v.FileName(".idx"), err)
					}
				}
			case NeedleMapLevelDb:
				opts := &opt.Options{
					BlockCacheCapacity:            2 * 1024 * 1024, // default value is 8MiB
					WriteBuffer:                   1 * 1024 * 1024, // default value is 4MiB
					CompactionTableSizeMultiplier: 10,              // default value is 1
				}
				if v.tmpNm != nil {
					glog.V(0).Infoln("updating leveldb index", v.FileName(".ldb"))
					err = v.tmpNm.UpdateNeedleMap(v, indexFile, opts, v.ldbTimeout)
				} else {
					glog.V(0).Infoln("loading leveldb index", v.FileName(".ldb"))
					if v.nm, err = NewLevelDbNeedleMap(v.FileName(".ldb"), indexFile, opts, v.ldbTimeout); err != nil {
						glog.V(0).Infof("loading leveldb %s error: %v", v.FileName(".ldb"), err)
					}
				}
			case NeedleMapLevelDbMedium:
				opts := &opt.Options{
					BlockCacheCapacity:            4 * 1024 * 1024, // default value is 8MiB
					WriteBuffer:                   2 * 1024 * 1024, // default value is 4MiB
					CompactionTableSizeMultiplier: 10,              // default value is 1
				}
				if v.tmpNm != nil {
					glog.V(0).Infoln("updating leveldb medium index", v.FileName(".ldb"))
					err = v.tmpNm.UpdateNeedleMap(v, indexFile, opts, v.ldbTimeout)
				} else {
					glog.V(0).Infoln("loading leveldb medium index", v.FileName(".ldb"))
					if v.nm, err = NewLevelDbNeedleMap(v.FileName(".ldb"), indexFile, opts, v.ldbTimeout); err != nil {
						glog.V(0).Infof("loading leveldb %s error: %v", v.FileName(".ldb"), err)
					}
				}
			case NeedleMapLevelDbLarge:
				opts := &opt.Options{
					BlockCacheCapacity:            8 * 1024 * 1024, // default value is 8MiB
					WriteBuffer:                   4 * 1024 * 1024, // default value is 4MiB
					CompactionTableSizeMultiplier: 10,              // default value is 1
				}
				if v.tmpNm != nil {
					glog.V(0).Infoln("updating leveldb large index", v.FileName(".ldb"))
					err = v.tmpNm.UpdateNeedleMap(v, indexFile, opts, v.ldbTimeout)
				} else {
					glog.V(0).Infoln("loading leveldb large index", v.FileName(".ldb"))
					if v.nm, err = NewLevelDbNeedleMap(v.FileName(".ldb"), indexFile, opts, v.ldbTimeout); err != nil {
						glog.V(0).Infof("loading leveldb %s error: %v", v.FileName(".ldb"), err)
					}
				}
			}
		}
	}

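	// volumes loaded without a .vif get one written now, recording version and offset size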
	if !hasVolumeInfoFile {
		v.volumeInfo.Version = uint32(v.SuperBlock.Version)
		v.volumeInfo.BytesOffset = uint32(types.OffsetSize)
		if err := v.SaveVolumeInfo(); err != nil {
			glog.Warningf("volume %d failed to save file info: %v", v.Id, err)
		}
	}

	stats.VolumeServerVolumeGauge.WithLabelValues(v.Collection, "volume").Inc()

	if err == nil {
		hasLoadedVolume = true
	}

	return err
}
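For orientation, a hypothetical in-package caller of the loader above. Only the signatures visible in this file are relied on; the helper name, path, collection, and volume id are made-up examples.

    // loadForInspection is a hypothetical helper in package storage showing
    // how loadVolumeWithoutIndex might be driven, e.g. from a test.
    func loadForInspection() (*Volume, error) {
        // no index load, current needle version; path and id are examples only
        return loadVolumeWithoutIndex("/data/volumes", "example_collection",
            needle.VolumeId(7), NeedleMapInMemory, needle.GetCurrentVersion())
    }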