* check for nil needle map before compaction sync When CommitCompact runs concurrently, it sets v.nm = nil under dataFileAccessLock. CompactByIndex does not hold that lock, so v.nm.Sync() can hit a nil pointer. Add an early nil check to return an error instead of crashing. Fixes #8591 * guard copyDataBasedOnIndexFile size check against nil needle map The post-compaction size validation at line 538 accesses v.nm.ContentSize() and v.nm.DeletedSize(). If CommitCompact has concurrently set v.nm to nil, this causes a SIGSEGV. Skip the validation when v.nm is nil since the actual data copy uses local needle maps (oldNm/newNm) and is unaffected. Fixes #8591 * use atomic.Bool for compaction flags to prevent concurrent vacuum races The isCompacting and isCommitCompacting flags were plain bools read and written from multiple goroutines without synchronization. This allowed concurrent vacuums on the same volume to pass the guard checks and run simultaneously, leading to the nil pointer crash. Using atomic.Bool with CompareAndSwap ensures only one compaction or commit can run per volume at a time. Fixes #8591 * use go-version-file in CI workflows instead of hardcoded versions Use go-version-file: 'go.mod' so CI automatically picks up the Go version from go.mod, avoiding future version drift. Reordered checkout before setup-go in go.yml and e2e.yml so go.mod is available. Removed the now-unused GO_VERSION env vars. * capture v.nm locally in CompactByIndex to close TOCTOU race A bare nil check on v.nm followed by v.nm.Sync() has a race window where CommitCompact can set v.nm = nil between the two. Snapshot the pointer into a local variable so the nil check and Sync operate on the same reference. * add dynamic timeouts to plugin worker vacuum gRPC calls All vacuum gRPC calls used context.Background() with no deadline, so the plugin scheduler's execution timeout could kill a job while a large volume compact was still in progress. 
Use volume-size-scaled timeouts matching the topology vacuum approach: 3 min/GB for compact, 1 min/GB for check, commit, and cleanup. Fixes #8591 * Revert "add dynamic timeouts to plugin worker vacuum gRPC calls" This reverts commit 80951934c37416bc4f6c1472a5d3f8d204a637d9. * unify compaction lifecycle into single atomic flag Replace separate isCompacting and isCommitCompacting flags with a single isCompactionInProgress atomic.Bool. This ensures CompactBy*, CommitCompact, Close, and Destroy are mutually exclusive — only one can run at a time per volume. Key changes: - All entry points use CompareAndSwap(false, true) to claim exclusive access. CompactByVolumeData and CompactByIndex now also guard v.nm and v.DataBackend with local captures. - Close() waits for the flag outside dataFileAccessLock to avoid deadlocking with CommitCompact (which holds the flag while waiting for the lock). It claims the flag before acquiring the lock so no new compaction can start. - Destroy() uses CAS instead of a racy Load check, preventing concurrent compaction from racing with volume teardown. - unmountVolumeByCollection no longer deletes from the map; DeleteCollectionFromDiskLocation removes entries only after successful Destroy, preventing orphaned volumes on failure. Fixes #8591
376 lines
10 KiB
Go
376 lines
10 KiB
Go
package storage
|
|
|
|
import (
|
|
"fmt"
|
|
"path"
|
|
"strconv"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/stats"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/backend"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/types"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
)
|
|
|
|
type Volume struct {
|
|
Id needle.VolumeId
|
|
dir string
|
|
dirIdx string
|
|
Collection string
|
|
DataBackend backend.BackendStorageFile
|
|
nm NeedleMapper
|
|
tmpNm TempNeedleMapper
|
|
needleMapKind NeedleMapKind
|
|
noWriteOrDelete bool // if readonly, either noWriteOrDelete or noWriteCanDelete
|
|
noWriteCanDelete bool // if readonly, either noWriteOrDelete or noWriteCanDelete
|
|
noWriteLock sync.RWMutex
|
|
hasRemoteFile bool // if the volume has a remote file
|
|
MemoryMapMaxSizeMb uint32
|
|
|
|
super_block.SuperBlock
|
|
|
|
dataFileAccessLock sync.RWMutex
|
|
superBlockAccessLock sync.Mutex
|
|
asyncRequestsChan chan *needle.AsyncRequest
|
|
lastModifiedTsSeconds uint64 // unix time in seconds
|
|
lastAppendAtNs uint64 // unix time in nanoseconds
|
|
|
|
lastCompactIndexOffset uint64
|
|
lastCompactRevision uint16
|
|
ldbTimeout int64
|
|
|
|
isCompactionInProgress atomic.Bool
|
|
|
|
volumeInfoRWLock sync.RWMutex
|
|
volumeInfo *volume_server_pb.VolumeInfo
|
|
location *DiskLocation
|
|
diskId uint32 // ID of this volume's disk in Store.Locations array
|
|
|
|
lastIoError error
|
|
}
|
|
|
|
func NewVolume(dirname string, dirIdx string, collection string, id needle.VolumeId, needleMapKind NeedleMapKind, replicaPlacement *super_block.ReplicaPlacement, ttl *needle.TTL, preallocate int64, ver needle.Version, memoryMapMaxSizeMb uint32, ldbTimeout int64) (v *Volume, e error) {
|
|
// if replicaPlacement is nil, the superblock will be loaded from disk
|
|
v = &Volume{dir: dirname, dirIdx: dirIdx, Collection: collection, Id: id, MemoryMapMaxSizeMb: memoryMapMaxSizeMb,
|
|
asyncRequestsChan: make(chan *needle.AsyncRequest, 128)}
|
|
v.SuperBlock = super_block.SuperBlock{ReplicaPlacement: replicaPlacement, Ttl: ttl}
|
|
v.needleMapKind = needleMapKind
|
|
v.ldbTimeout = ldbTimeout
|
|
e = v.load(true, true, needleMapKind, preallocate, ver)
|
|
v.startWorker()
|
|
return
|
|
}
|
|
|
|
func (v *Volume) String() string {
|
|
v.noWriteLock.RLock()
|
|
defer v.noWriteLock.RUnlock()
|
|
return fmt.Sprintf("Id:%v dir:%s dirIdx:%s Collection:%s dataFile:%v nm:%v noWrite:%v canDelete:%v", v.Id, v.dir, v.dirIdx, v.Collection, v.DataBackend, v.nm, v.noWriteOrDelete || v.noWriteCanDelete, v.noWriteCanDelete)
|
|
}
|
|
|
|
// VolumeFileName returns the extension-less path of a volume's files inside
// dir. The base name is the decimal id, prefixed with "<collection>_" when
// collection is non-empty.
func VolumeFileName(dir string, collection string, id int) (fileName string) {
	base := strconv.Itoa(id)
	if collection != "" {
		base = collection + "_" + base
	}
	return path.Join(dir, base)
}
|
|
|
|
func (v *Volume) DataFileName() (fileName string) {
|
|
return VolumeFileName(v.dir, v.Collection, int(v.Id))
|
|
}
|
|
|
|
func (v *Volume) IndexFileName() (fileName string) {
|
|
return VolumeFileName(v.dirIdx, v.Collection, int(v.Id))
|
|
}
|
|
|
|
func (v *Volume) FileName(ext string) (fileName string) {
|
|
switch ext {
|
|
case ".idx", ".cpx", ".ldb", ".cpldb":
|
|
return VolumeFileName(v.dirIdx, v.Collection, int(v.Id)) + ext
|
|
}
|
|
// .dat, .cpd, .vif
|
|
return VolumeFileName(v.dir, v.Collection, int(v.Id)) + ext
|
|
}
|
|
|
|
func (v *Volume) Version() needle.Version {
|
|
v.superBlockAccessLock.Lock()
|
|
defer v.superBlockAccessLock.Unlock()
|
|
if v.volumeInfo.Version != 0 {
|
|
v.SuperBlock.Version = needle.Version(v.volumeInfo.Version)
|
|
}
|
|
return v.SuperBlock.Version
|
|
}
|
|
|
|
func (v *Volume) FileStat() (datSize uint64, idxSize uint64, modTime time.Time) {
|
|
v.dataFileAccessLock.RLock()
|
|
defer v.dataFileAccessLock.RUnlock()
|
|
|
|
if v.DataBackend == nil {
|
|
return
|
|
}
|
|
|
|
datFileSize, modTime, e := v.DataBackend.GetStat()
|
|
if e == nil {
|
|
return uint64(datFileSize), v.nm.IndexFileSize(), modTime
|
|
}
|
|
glog.V(0).Infof("Failed to read file size %s %v", v.DataBackend.Name(), e)
|
|
return // -1 causes integer overflow and the volume to become unwritable.
|
|
}
|
|
|
|
func (v *Volume) ContentSize() uint64 {
|
|
v.dataFileAccessLock.RLock()
|
|
defer v.dataFileAccessLock.RUnlock()
|
|
if v.nm == nil {
|
|
return 0
|
|
}
|
|
return v.nm.ContentSize()
|
|
}
|
|
|
|
func (v *Volume) doIsEmpty() (bool, error) {
|
|
// check v.DataBackend.GetStat()
|
|
if v.DataBackend == nil {
|
|
return false, fmt.Errorf("v.DataBackend is nil")
|
|
} else {
|
|
datFileSize, _, e := v.DataBackend.GetStat()
|
|
if e != nil {
|
|
glog.V(0).Infof("Failed to read file size %s %v", v.DataBackend.Name(), e)
|
|
return false, fmt.Errorf("v.DataBackend.GetStat(): %v", e)
|
|
}
|
|
if datFileSize > super_block.SuperBlockSize {
|
|
return false, nil
|
|
}
|
|
}
|
|
// check v.nm.ContentSize()
|
|
if v.nm != nil {
|
|
if v.nm.ContentSize() > 0 {
|
|
return false, nil
|
|
}
|
|
}
|
|
return true, nil
|
|
}
|
|
|
|
func (v *Volume) DeletedSize() uint64 {
|
|
v.dataFileAccessLock.RLock()
|
|
defer v.dataFileAccessLock.RUnlock()
|
|
if v.nm == nil {
|
|
return 0
|
|
}
|
|
return v.nm.DeletedSize()
|
|
}
|
|
|
|
func (v *Volume) FileCount() uint64 {
|
|
v.dataFileAccessLock.RLock()
|
|
defer v.dataFileAccessLock.RUnlock()
|
|
if v.nm == nil {
|
|
return 0
|
|
}
|
|
return uint64(v.nm.FileCount())
|
|
}
|
|
|
|
func (v *Volume) DeletedCount() uint64 {
|
|
v.dataFileAccessLock.RLock()
|
|
defer v.dataFileAccessLock.RUnlock()
|
|
if v.nm == nil {
|
|
return 0
|
|
}
|
|
return uint64(v.nm.DeletedCount())
|
|
}
|
|
|
|
func (v *Volume) MaxFileKey() types.NeedleId {
|
|
v.dataFileAccessLock.RLock()
|
|
defer v.dataFileAccessLock.RUnlock()
|
|
if v.nm == nil {
|
|
return 0
|
|
}
|
|
return v.nm.MaxFileKey()
|
|
}
|
|
|
|
func (v *Volume) IndexFileSize() uint64 {
|
|
v.dataFileAccessLock.RLock()
|
|
defer v.dataFileAccessLock.RUnlock()
|
|
if v.nm == nil {
|
|
return 0
|
|
}
|
|
return v.nm.IndexFileSize()
|
|
}
|
|
|
|
func (v *Volume) DiskType() types.DiskType {
|
|
return v.location.DiskType
|
|
}
|
|
|
|
func (v *Volume) SyncToDisk() {
|
|
v.dataFileAccessLock.Lock()
|
|
defer v.dataFileAccessLock.Unlock()
|
|
if v.nm != nil {
|
|
if err := v.nm.Sync(); err != nil {
|
|
glog.Warningf("Volume Close fail to sync volume idx %d", v.Id)
|
|
}
|
|
}
|
|
if v.DataBackend != nil {
|
|
if err := v.DataBackend.Sync(); err != nil {
|
|
glog.Warningf("Volume Close fail to sync volume %d", v.Id)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Close cleanly shuts down this volume
|
|
func (v *Volume) Close() {
|
|
// Wait for any in-progress compaction to finish and claim the flag so no
|
|
// new compaction can start. This must happen BEFORE acquiring
|
|
// dataFileAccessLock to avoid deadlocking with CommitCompact which holds
|
|
// the flag while waiting for the lock.
|
|
for !v.isCompactionInProgress.CompareAndSwap(false, true) {
|
|
time.Sleep(521 * time.Millisecond)
|
|
glog.Warningf("Volume Close wait for compaction %d", v.Id)
|
|
}
|
|
defer v.isCompactionInProgress.Store(false)
|
|
|
|
v.dataFileAccessLock.Lock()
|
|
defer v.dataFileAccessLock.Unlock()
|
|
|
|
v.doClose()
|
|
}
|
|
|
|
func (v *Volume) doClose() {
|
|
if v.nm != nil {
|
|
if err := v.nm.Sync(); err != nil {
|
|
glog.Warningf("Volume Close fail to sync volume idx %d", v.Id)
|
|
}
|
|
v.nm.Close()
|
|
v.nm = nil
|
|
}
|
|
if v.DataBackend != nil {
|
|
if err := v.DataBackend.Close(); err != nil {
|
|
glog.Warningf("Volume Close fail to sync volume %d", v.Id)
|
|
}
|
|
v.DataBackend = nil
|
|
stats.VolumeServerVolumeGauge.WithLabelValues(v.Collection, "volume").Dec()
|
|
}
|
|
}
|
|
|
|
func (v *Volume) NeedToReplicate() bool {
|
|
return v.ReplicaPlacement.GetCopyCount() > 1
|
|
}
|
|
|
|
// volume is expired if modified time + volume ttl < now
|
|
// except when volume is empty
|
|
// or when the volume does not have a ttl
|
|
// or when volumeSizeLimit is 0 when server just starts
|
|
func (v *Volume) expired(contentSize uint64, volumeSizeLimit uint64) bool {
|
|
if volumeSizeLimit == 0 {
|
|
// skip if we don't know size limit
|
|
return false
|
|
}
|
|
if contentSize <= super_block.SuperBlockSize {
|
|
return false
|
|
}
|
|
if v.Ttl == nil || v.Ttl.Minutes() == 0 {
|
|
return false
|
|
}
|
|
glog.V(2).Infof("volume %d now:%v lastModified:%v", v.Id, time.Now().Unix(), v.lastModifiedTsSeconds)
|
|
livedMinutes := (time.Now().Unix() - int64(v.lastModifiedTsSeconds)) / 60
|
|
glog.V(2).Infof("volume %d ttl:%v lived:%v", v.Id, v.Ttl, livedMinutes)
|
|
if int64(v.Ttl.Minutes()) < livedMinutes {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// wait either maxDelayMinutes or 10% of ttl minutes
|
|
func (v *Volume) expiredLongEnough(maxDelayMinutes uint32) bool {
|
|
if v.Ttl == nil || v.Ttl.Minutes() == 0 {
|
|
return false
|
|
}
|
|
removalDelay := v.Ttl.Minutes() / 10
|
|
if removalDelay > maxDelayMinutes {
|
|
removalDelay = maxDelayMinutes
|
|
}
|
|
|
|
if uint64(v.Ttl.Minutes()+removalDelay)*60+v.lastModifiedTsSeconds < uint64(time.Now().Unix()) {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (v *Volume) collectStatus() (maxFileKey types.NeedleId, datFileSize int64, modTime time.Time, fileCount, deletedCount, deletedSize uint64, ok bool) {
|
|
v.dataFileAccessLock.RLock()
|
|
defer v.dataFileAccessLock.RUnlock()
|
|
glog.V(4).Infof("collectStatus volume %d", v.Id)
|
|
|
|
if v.nm == nil || v.DataBackend == nil {
|
|
return
|
|
}
|
|
|
|
ok = true
|
|
|
|
maxFileKey = v.nm.MaxFileKey()
|
|
datFileSize, modTime, _ = v.DataBackend.GetStat()
|
|
fileCount = uint64(v.nm.FileCount())
|
|
deletedCount = uint64(v.nm.DeletedCount())
|
|
deletedSize = v.nm.DeletedSize()
|
|
|
|
return
|
|
}
|
|
|
|
func (v *Volume) ToVolumeInformationMessage() (types.NeedleId, *master_pb.VolumeInformationMessage) {
|
|
|
|
maxFileKey, volumeSize, modTime, fileCount, deletedCount, deletedSize, ok := v.collectStatus()
|
|
|
|
if !ok {
|
|
return 0, nil
|
|
}
|
|
|
|
volumeInfo := &master_pb.VolumeInformationMessage{
|
|
Id: uint32(v.Id),
|
|
Size: uint64(volumeSize),
|
|
Collection: v.Collection,
|
|
FileCount: fileCount,
|
|
DeleteCount: deletedCount,
|
|
DeletedByteCount: deletedSize,
|
|
ReadOnly: v.IsReadOnly(),
|
|
ReplicaPlacement: uint32(v.ReplicaPlacement.Byte()),
|
|
Version: uint32(v.Version()),
|
|
Ttl: v.Ttl.ToUint32(),
|
|
CompactRevision: uint32(v.SuperBlock.CompactionRevision),
|
|
ModifiedAtSecond: modTime.Unix(),
|
|
DiskType: string(v.location.DiskType),
|
|
DiskId: v.diskId,
|
|
}
|
|
|
|
volumeInfo.RemoteStorageName, volumeInfo.RemoteStorageKey = v.RemoteStorageNameKey()
|
|
|
|
return maxFileKey, volumeInfo
|
|
}
|
|
|
|
func (v *Volume) RemoteStorageNameKey() (storageName, storageKey string) {
|
|
if v.volumeInfo == nil {
|
|
return
|
|
}
|
|
if len(v.volumeInfo.GetFiles()) == 0 {
|
|
return
|
|
}
|
|
return v.volumeInfo.GetFiles()[0].BackendName(), v.volumeInfo.GetFiles()[0].GetKey()
|
|
}
|
|
|
|
func (v *Volume) IsReadOnly() bool {
|
|
v.noWriteLock.RLock()
|
|
defer v.noWriteLock.RUnlock()
|
|
return v.noWriteOrDelete || v.noWriteCanDelete || v.location.isDiskSpaceLow
|
|
}
|
|
|
|
func (v *Volume) PersistReadOnly(readOnly bool) {
|
|
v.volumeInfoRWLock.RLock()
|
|
defer v.volumeInfoRWLock.RUnlock()
|
|
v.volumeInfo.ReadOnly = readOnly
|
|
v.SaveVolumeInfo()
|
|
}
|