seaweedFS/weed/mount/posix_file_lock.go

package mount

import (
	"math"
	"sort"
	"sync"
	"syscall"

	"github.com/seaweedfs/go-fuse/v2/fuse"
)

// lockRange represents a single held POSIX byte-range lock.
type lockRange struct {
	Start uint64 // inclusive byte offset
	End   uint64 // inclusive; math.MaxUint64 means "to EOF"
	Typ   uint32 // syscall.F_RDLCK or syscall.F_WRLCK
	Owner uint64 // FUSE lock owner (from LkIn.Owner)
	Pid   uint32 // PID of lock holder (for GetLk reporting)
	// flock and fcntl locks have different ownership and close semantics.
	// Keep them in separate namespaces inside the table.
	IsFlock bool
}

// inodeLocks holds all locks for one inode plus a waiter queue for SetLkw.
type inodeLocks struct {
	mu       sync.Mutex
	locks    []lockRange   // currently held locks, sorted by Start
	waiters  []*lockWaiter // blocked SetLkw callers
	wakeRefs int           // woken waiters still retrying on this inodeLocks
}

// lockWaiter represents a blocked SetLkw caller.
type lockWaiter struct {
	requested   lockRange     // the lock this waiter is trying to acquire
	ch          chan struct{} // closed when the waiter should re-check
	wakeRefHeld bool
}

// PosixLockTable is the per-mount POSIX lock manager.
type PosixLockTable struct {
	mu     sync.Mutex
	inodes map[uint64]*inodeLocks
}

func NewPosixLockTable() *PosixLockTable {
	return &PosixLockTable{
		inodes: make(map[uint64]*inodeLocks),
	}
}

// getOrCreateInodeLocks returns the lock state for an inode, creating it if needed.
func (plt *PosixLockTable) getOrCreateInodeLocks(inode uint64) *inodeLocks {
	plt.mu.Lock()
	defer plt.mu.Unlock()
	il, ok := plt.inodes[inode]
	if !ok {
		il = &inodeLocks{}
		plt.inodes[inode] = il
	}
	return il
}

// getInodeLocks returns the lock state for an inode, or nil if none exists.
func (plt *PosixLockTable) getInodeLocks(inode uint64) *inodeLocks {
	plt.mu.Lock()
	defer plt.mu.Unlock()
	return plt.inodes[inode]
}

// maybeCleanupInode removes the inodeLocks entry if it has no locks, no waiters,
// and no woken waiters still retrying against this inodeLocks.
func (plt *PosixLockTable) maybeCleanupInode(inode uint64, il *inodeLocks) {
	// Caller must NOT hold il.mu. We acquire both locks in the correct order.
	plt.mu.Lock()
	defer plt.mu.Unlock()
	il.mu.Lock()
	defer il.mu.Unlock()
	if len(il.locks) == 0 && len(il.waiters) == 0 && il.wakeRefs == 0 {
		delete(plt.inodes, inode)
	}
}

// rangesOverlap returns true if two inclusive ranges overlap.
func rangesOverlap(aStart, aEnd, bStart, bEnd uint64) bool {
	return aStart <= bEnd && bStart <= aEnd
}

// findConflict returns the first lock that conflicts with the proposed lock.
// A conflict exists when ranges overlap, at least one is a write lock, and the owners differ.
func findConflict(locks []lockRange, proposed lockRange) (lockRange, bool) {
	for _, h := range locks {
		if h.IsFlock != proposed.IsFlock {
			continue
		}
		if h.Owner == proposed.Owner {
			continue
		}
		if !rangesOverlap(h.Start, h.End, proposed.Start, proposed.End) {
			continue
		}
		if h.Typ == syscall.F_RDLCK && proposed.Typ == syscall.F_RDLCK {
			continue
		}
		return h, true
	}
	return lockRange{}, false
}

// insertAndCoalesce inserts a lock for the given owner, replacing/splitting any
// existing same-owner locks that overlap. Adjacent same-type locks are merged.
// Caller must hold il.mu.
func insertAndCoalesce(il *inodeLocks, lk lockRange) {
	owner := lk.Owner

	var kept []lockRange
	for _, h := range il.locks {
		if h.Owner != owner || h.IsFlock != lk.IsFlock {
			kept = append(kept, h)
			continue
		}
		if !rangesOverlap(h.Start, h.End, lk.Start, lk.End) {
			// Check for adjacency with same type for merging.
			if h.Typ == lk.Typ && ((h.End < ^uint64(0) && h.End+1 == lk.Start) || (lk.End < ^uint64(0) && lk.End+1 == h.Start)) {
				// Merge adjacent same-type lock into lk.
				if h.Start < lk.Start {
					lk.Start = h.Start
				}
				if h.End > lk.End {
					lk.End = h.End
				}
				continue
			}
			kept = append(kept, h)
			continue
		}
		// Overlapping same-owner lock.
		if h.Typ == lk.Typ {
			// Same type: absorb into lk (expand range).
			if h.Start < lk.Start {
				lk.Start = h.Start
			}
			if h.End > lk.End {
				lk.End = h.End
			}
			continue
		}
		// Different type: truncate or split the existing lock.
		if h.Start < lk.Start {
			// Left portion survives.
			left := h
			left.End = lk.Start - 1
			kept = append(kept, left)
		}
		if h.End > lk.End {
			// Right portion survives.
			right := h
			right.Start = lk.End + 1
			kept = append(kept, right)
		}
	}

	kept = append(kept, lk)
	sort.Slice(kept, func(i, j int) bool {
		return kept[i].Start < kept[j].Start
	})
	il.locks = kept
}

// removeLocks removes or splits matching locks in the given range.
// Caller must hold il.mu.
func removeLocks(il *inodeLocks, matches func(lockRange) bool, start, end uint64) {
	var kept []lockRange
	for _, h := range il.locks {
		if !matches(h) || !rangesOverlap(h.Start, h.End, start, end) {
			kept = append(kept, h)
			continue
		}
		// h overlaps the unlock range.
		if h.Start < start {
			// Left portion survives.
			left := h
			left.End = start - 1
			kept = append(kept, left)
		}
		if h.End > end {
			// Right portion survives.
			right := h
			right.Start = end + 1
			kept = append(kept, right)
		}
		// If fully contained, it's simply dropped.
	}
	il.locks = kept
}

func (plt *PosixLockTable) releaseMatching(inode uint64, matches func(lockRange) bool) {
	il := plt.getInodeLocks(inode)
	if il == nil {
		return
	}
	il.mu.Lock()
	removeLocks(il, matches, 0, math.MaxUint64)
	wakeEligibleWaiters(il)
	il.mu.Unlock()
	plt.maybeCleanupInode(inode, il)
}

// releaseWakeRef drops the temporary reference that keeps inodeLocks live while
// a woken waiter retries its SetLkw acquisition.
func releaseWakeRef(il *inodeLocks, waiter *lockWaiter) {
	if waiter == nil || !waiter.wakeRefHeld {
		return
	}
	waiter.wakeRefHeld = false
	il.wakeRefs--
}

// wakeEligibleWaiters selectively wakes blocked SetLkw callers that can now
// succeed given the current lock state. Waiters whose requests still conflict
// with held locks remain in the queue, avoiding a thundering herd.
// Caller must hold il.mu.
func wakeEligibleWaiters(il *inodeLocks) {
	remaining := il.waiters[:0]
	for _, w := range il.waiters {
		if _, conflicted := findConflict(il.locks, w.requested); !conflicted {
			w.wakeRefHeld = true
			il.wakeRefs++
			close(w.ch)
		} else {
			remaining = append(remaining, w)
		}
	}
	il.waiters = remaining
}

// removeWaiter removes a specific waiter from the list.
// Caller must hold il.mu.
func removeWaiter(il *inodeLocks, w *lockWaiter) {
	for i, existing := range il.waiters {
		if existing == w {
			il.waiters = append(il.waiters[:i], il.waiters[i+1:]...)
			return
		}
	}
}

// GetLk checks for a conflicting lock. If found, it populates out with the
// conflict details. If no conflict, out.Typ is set to F_UNLCK.
func (plt *PosixLockTable) GetLk(inode uint64, proposed lockRange, out *fuse.LkOut) {
	il := plt.getInodeLocks(inode)
	if il == nil {
		out.Lk.Typ = syscall.F_UNLCK
		return
	}
	il.mu.Lock()
	conflict, found := findConflict(il.locks, proposed)
	il.mu.Unlock()

	if found {
		out.Lk.Start = conflict.Start
		out.Lk.End = conflict.End
		out.Lk.Typ = conflict.Typ
		out.Lk.Pid = conflict.Pid
	} else {
		out.Lk.Typ = syscall.F_UNLCK
	}
}

// SetLk attempts a non-blocking lock or unlock.
// For unlock (F_UNLCK): removes locks in the given range for the owner.
// For lock: returns fuse.EAGAIN if a conflict exists, fuse.OK on success.
func (plt *PosixLockTable) SetLk(inode uint64, lk lockRange) fuse.Status {
	if lk.Typ == syscall.F_UNLCK {
		il := plt.getInodeLocks(inode)
		if il == nil {
			return fuse.OK
		}
		il.mu.Lock()
		removeLocks(il, func(existing lockRange) bool {
			return existing.Owner == lk.Owner && existing.IsFlock == lk.IsFlock
		}, lk.Start, lk.End)
		wakeEligibleWaiters(il)
		il.mu.Unlock()
		plt.maybeCleanupInode(inode, il)
		return fuse.OK
	}

	il := plt.getOrCreateInodeLocks(inode)
	il.mu.Lock()
	if _, found := findConflict(il.locks, lk); found {
		il.mu.Unlock()
		return fuse.EAGAIN
	}
	insertAndCoalesce(il, lk)
	il.mu.Unlock()
	return fuse.OK
}

// SetLkw attempts a blocking lock. It waits until the lock can be acquired
// or the cancel channel is closed.
func (plt *PosixLockTable) SetLkw(inode uint64, lk lockRange, cancel <-chan struct{}) fuse.Status {
	if lk.Typ == syscall.F_UNLCK {
		return plt.SetLk(inode, lk)
	}

	il := plt.getOrCreateInodeLocks(inode)
	var waiter *lockWaiter
	for {
		il.mu.Lock()
		releaseWakeRef(il, waiter)
		if _, found := findConflict(il.locks, lk); !found {
			insertAndCoalesce(il, lk)
			il.mu.Unlock()
			return fuse.OK
		}
		// Register waiter with the requested lock details for selective waking.
		waiter = &lockWaiter{requested: lk, ch: make(chan struct{})}
		il.waiters = append(il.waiters, waiter)
		il.mu.Unlock()

		// Block until woken or cancelled.
		select {
		case <-waiter.ch:
			// Woken — retry.
			continue
		case <-cancel:
			// Request cancelled.
			il.mu.Lock()
			releaseWakeRef(il, waiter)
			removeWaiter(il, waiter)
			il.mu.Unlock()
			plt.maybeCleanupInode(inode, il)
			return fuse.EINTR
		}
	}
}

// ReleaseOwner removes all locks held by the given owner on the given inode.
// Used for same-owner cleanup in tests and lock-table operations.
func (plt *PosixLockTable) ReleaseOwner(inode uint64, owner uint64) {
	plt.releaseMatching(inode, func(lk lockRange) bool {
		return lk.Owner == owner
	})
}

// ReleaseFlockOwner removes flock locks for a released file description.
// FUSE only provides LockOwner on RELEASE when FUSE_RELEASE_FLOCK_UNLOCK is set.
func (plt *PosixLockTable) ReleaseFlockOwner(inode uint64, owner uint64) {
	plt.releaseMatching(inode, func(lk lockRange) bool {
		return lk.IsFlock && lk.Owner == owner
	})
}

// ReleasePosixOwner removes POSIX fcntl locks for a closing lock owner.
// FUSE passes the closing fi->owner on FLUSH, which is the correct close-time
// identity for POSIX byte-range lock cleanup.
func (plt *PosixLockTable) ReleasePosixOwner(inode uint64, owner uint64) {
	plt.releaseMatching(inode, func(lk lockRange) bool {
		return !lk.IsFlock && lk.Owner == owner
	})
}