Add FUSE integration tests for POSIX file locking (#8752)

* Add FUSE integration tests for POSIX file locking

Test flock() and fcntl() advisory locks over the FUSE mount:
- Exclusive and shared flock with conflict detection
- flock upgrade (shared to exclusive) and release on close
- fcntl F_SETLK write lock conflicts and shared read locks
- fcntl F_GETLK conflict reporting on overlapping byte ranges
- Non-overlapping byte-range locks held independently
- F_SETLKW blocking until conflicting lock is released
- Lock release on file descriptor close
- Concurrent lock contention with multiple workers

* Fix review feedback in POSIX lock integration tests

- Assert specific EAGAIN error on fcntl lock conflicts instead of generic Error
- Use O_APPEND in concurrent contention test so workers append rather than overwrite
- Verify exact line count (numWorkers * writesPerWorker) after concurrent test
- Check unlock error in F_SETLKW blocking test goroutine

* Refactor fcntl tests to use subprocesses for inter-process semantics

POSIX fcntl locks use the process's files_struct as lock owner, so all
fds in the same process share the same owner and never conflict. This
caused the fcntl tests to silently pass without exercising lock conflicts.

Changes:
- Add TestFcntlLockHelper subprocess entry point with hold/try/getlk actions
- Add lockHolder with channel-based coordination (no scanner race)
- Rewrite all fcntl tests to run contenders in separate subprocesses
- Fix F_UNLCK int16 cast in GetLk assertion for type-safe comparison
- Fix concurrent test: use non-blocking flock with retry to avoid
  exhausting go-fuse server reader goroutines (blocking FUSE SETLKW
  can starve unlock request processing, causing deadlock)

flock tests remain same-process since flock uses per-struct-file owners.

* Fix misleading comment and error handling in lock test subprocess

- Fix comment: tryLockInSubprocess tests a subprocess, not the test process
- Distinguish EAGAIN/EACCES from unexpected errors in subprocess try mode
  so real failures aren't silently masked as lock conflicts

* Fix CI race in FcntlReleaseOnClose and increase flock retry budget

- FcntlReleaseOnClose: retry lock acquisition after subprocess exits
  since the FUSE server may not process Release immediately
- ConcurrentLockContention: increase retry limit from 500 to 3000
  (5s → 30s budget) to handle CI load

* Separate flock and fcntl locks in the in-memory lock table, and clean each up through the correct release path: lock-owner-at-FLUSH for POSIX (fcntl) locks, lock owner at RELEASE for flock

* ReleasePosixOwner

* weed/mount: flush before releasing posix close owner

* weed/mount: keep woken lock waiters from losing inode state

* test/fuse: make blocking fcntl helper state explicit

* test/fuse: assert flock contention never overlaps

* test/fuse: stabilize concurrent lock contention check

* test/fuse: make concurrent contention writes deterministic

* weed/mount: retry synchronous metadata flushes
This commit is contained in:
Chris Lu
2026-03-24 11:43:25 -07:00
committed by GitHub
parent 9cc26d09e8
commit 805625d06e
9 changed files with 1090 additions and 62 deletions

View File

@@ -0,0 +1,28 @@
package mount
import "time"
// metadataFlushRetries is how many extra attempts are made after the first
// metadata flush fails.
const metadataFlushRetries = 3

// metadataFlushSleep is a seam so tests can intercept backoff sleeps.
var metadataFlushSleep = time.Sleep

// retryMetadataFlush invokes flush until it succeeds or the retry budget
// (metadataFlushRetries additional attempts) is exhausted, sleeping with
// exponential backoff (1s, 2s, 4s, ...) between attempts. When onRetry is
// non-nil it is called before each sleep with the upcoming attempt number,
// the total attempt budget, the backoff duration, and the error that
// triggered the retry. Returns nil on success, otherwise the error from the
// final attempt.
func retryMetadataFlush(flush func() error, onRetry func(nextAttempt, totalAttempts int, backoff time.Duration, err error)) error {
	totalAttempts := metadataFlushRetries + 1
	for attempt := 1; ; attempt++ {
		lastErr := flush()
		if lastErr == nil {
			return nil
		}
		if attempt >= totalAttempts {
			return lastErr
		}
		// 1s after the first failure, doubling on each subsequent failure.
		backoff := time.Duration(1<<uint(attempt-1)) * time.Second
		if onRetry != nil {
			onRetry(attempt+1, totalAttempts, backoff, lastErr)
		}
		metadataFlushSleep(backoff)
	}
}

View File

@@ -0,0 +1,81 @@
package mount
import (
"errors"
"testing"
"time"
)
// TestRetryMetadataFlushEventuallySucceeds verifies that a flush failing
// twice and then succeeding returns nil, and that exactly the expected
// exponential backoffs (1s, 2s) were requested between attempts.
func TestRetryMetadataFlushEventuallySucceeds(t *testing.T) {
	// Restore the real sleep function when the test finishes.
	savedSleep := metadataFlushSleep
	defer func() { metadataFlushSleep = savedSleep }()

	// Record backoff durations instead of actually sleeping.
	var sleeps []time.Duration
	metadataFlushSleep = func(d time.Duration) { sleeps = append(sleeps, d) }

	// Fail on the first two calls, succeed on the third.
	attempts := 0
	flush := func() error {
		attempts++
		if attempts >= 3 {
			return nil
		}
		return errors.New("temporary failure")
	}

	if err := retryMetadataFlush(flush, nil); err != nil {
		t.Fatalf("retryMetadataFlush returned error: %v", err)
	}
	if attempts != 3 {
		t.Fatalf("attempts = %d, want 3", attempts)
	}

	wantSleeps := []time.Duration{time.Second, 2 * time.Second}
	if len(sleeps) != len(wantSleeps) {
		t.Fatalf("sleep count = %d, want %d", len(sleeps), len(wantSleeps))
	}
	for i, want := range wantSleeps {
		if sleeps[i] != want {
			t.Fatalf("sleep[%d] = %v, want %v", i, sleeps[i], want)
		}
	}
}
// TestRetryMetadataFlushReturnsLastError verifies that a permanently failing
// flush consumes the whole attempt budget, returns the underlying error, and
// sleeps the full backoff schedule (1s, 2s, 4s) between attempts.
func TestRetryMetadataFlushReturnsLastError(t *testing.T) {
	// Restore the real sleep function when the test finishes.
	savedSleep := metadataFlushSleep
	defer func() { metadataFlushSleep = savedSleep }()

	// Record backoff durations instead of actually sleeping.
	var sleeps []time.Duration
	metadataFlushSleep = func(d time.Duration) { sleeps = append(sleeps, d) }

	expectedErr := errors.New("permanent failure")
	attempts := 0
	err := retryMetadataFlush(func() error {
		attempts++
		return expectedErr
	}, nil)

	if !errors.Is(err, expectedErr) {
		t.Fatalf("retryMetadataFlush error = %v, want %v", err, expectedErr)
	}
	if attempts != metadataFlushRetries+1 {
		t.Fatalf("attempts = %d, want %d", attempts, metadataFlushRetries+1)
	}

	wantSleeps := []time.Duration{time.Second, 2 * time.Second, 4 * time.Second}
	if got, want := len(sleeps), len(wantSleeps); got != want {
		t.Fatalf("sleep count = %d, want %d", got, want)
	}
	for i, want := range wantSleeps {
		if got := sleeps[i]; got != want {
			t.Fatalf("sleep[%d] = %v, want %v", i, got, want)
		}
	}
}

View File

@@ -16,19 +16,24 @@ type lockRange struct {
Typ uint32 // syscall.F_RDLCK or syscall.F_WRLCK
Owner uint64 // FUSE lock owner (from LkIn.Owner)
Pid uint32 // PID of lock holder (for GetLk reporting)
// flock and fcntl locks have different ownership and close semantics.
// Keep them in separate namespaces inside the table.
IsFlock bool
}
// inodeLocks holds all locks for one inode plus a waiter queue for SetLkw.
type inodeLocks struct {
mu sync.Mutex
locks []lockRange // currently held locks, sorted by Start
waiters []*lockWaiter // blocked SetLkw callers
mu sync.Mutex
locks []lockRange // currently held locks, sorted by Start
waiters []*lockWaiter // blocked SetLkw callers
wakeRefs int // woken waiters still retrying on this inodeLocks
}
// lockWaiter represents a blocked SetLkw caller.
type lockWaiter struct {
requested lockRange // the lock this waiter is trying to acquire
ch chan struct{} // closed when the waiter should re-check
requested lockRange // the lock this waiter is trying to acquire
ch chan struct{} // closed when the waiter should re-check
wakeRefHeld bool
}
// PosixLockTable is the per-mount POSIX lock manager.
@@ -62,14 +67,15 @@ func (plt *PosixLockTable) getInodeLocks(inode uint64) *inodeLocks {
return plt.inodes[inode]
}
// maybeCleanupInode removes the inodeLocks entry if it has no locks and no waiters.
// maybeCleanupInode removes the inodeLocks entry if it has no locks, no waiters,
// and no woken waiters still retrying against this inodeLocks.
func (plt *PosixLockTable) maybeCleanupInode(inode uint64, il *inodeLocks) {
// Caller must NOT hold il.mu. We acquire both locks in the correct order.
plt.mu.Lock()
defer plt.mu.Unlock()
il.mu.Lock()
defer il.mu.Unlock()
if len(il.locks) == 0 && len(il.waiters) == 0 {
if len(il.locks) == 0 && len(il.waiters) == 0 && il.wakeRefs == 0 {
delete(plt.inodes, inode)
}
}
@@ -83,6 +89,9 @@ func rangesOverlap(aStart, aEnd, bStart, bEnd uint64) bool {
// A conflict exists when ranges overlap, at least one is a write lock, and the owners differ.
func findConflict(locks []lockRange, proposed lockRange) (lockRange, bool) {
for _, h := range locks {
if h.IsFlock != proposed.IsFlock {
continue
}
if h.Owner == proposed.Owner {
continue
}
@@ -105,7 +114,7 @@ func insertAndCoalesce(il *inodeLocks, lk lockRange) {
var kept []lockRange
for _, h := range il.locks {
if h.Owner != owner {
if h.Owner != owner || h.IsFlock != lk.IsFlock {
kept = append(kept, h)
continue
}
@@ -157,12 +166,12 @@ func insertAndCoalesce(il *inodeLocks, lk lockRange) {
il.locks = kept
}
// removeLocks removes or splits locks owned by the given owner in the given range.
// removeLocks removes or splits matching locks in the given range.
// Caller must hold il.mu.
func removeLocks(il *inodeLocks, owner uint64, start, end uint64) {
func removeLocks(il *inodeLocks, matches func(lockRange) bool, start, end uint64) {
var kept []lockRange
for _, h := range il.locks {
if h.Owner != owner || !rangesOverlap(h.Start, h.End, start, end) {
if !matches(h) || !rangesOverlap(h.Start, h.End, start, end) {
kept = append(kept, h)
continue
}
@@ -184,6 +193,28 @@ func removeLocks(il *inodeLocks, owner uint64, start, end uint64) {
il.locks = kept
}
// releaseMatching removes every lock on inode for which matches returns true,
// wakes any blocked SetLkw waiters whose requests no longer conflict, and
// drops the inodeLocks entry if nothing references it anymore.
func (plt *PosixLockTable) releaseMatching(inode uint64, matches func(lockRange) bool) {
il := plt.getInodeLocks(inode)
if il == nil {
// No lock state recorded for this inode; nothing to release.
return
}
il.mu.Lock()
// Remove matching locks across the entire byte range.
removeLocks(il, matches, 0, math.MaxUint64)
wakeEligibleWaiters(il)
il.mu.Unlock()
// Cleanup is attempted without il.mu held; it re-acquires both locks itself.
plt.maybeCleanupInode(inode, il)
}
// releaseWakeRef drops the temporary reference that keeps inodeLocks live while
// a woken waiter retries its SetLkw acquisition.
// Caller must hold il.mu (it mutates waiter.wakeRefHeld and il.wakeRefs).
// A nil waiter, or one that was never woken, is a no-op.
func releaseWakeRef(il *inodeLocks, waiter *lockWaiter) {
if waiter == nil || !waiter.wakeRefHeld {
return
}
waiter.wakeRefHeld = false
il.wakeRefs--
}
// wakeEligibleWaiters selectively wakes blocked SetLkw callers that can now
// succeed given the current lock state. Waiters whose requests still conflict
// with held locks remain in the queue, avoiding a thundering herd.
@@ -192,6 +223,8 @@ func wakeEligibleWaiters(il *inodeLocks) {
remaining := il.waiters[:0]
for _, w := range il.waiters {
if _, conflicted := findConflict(il.locks, w.requested); !conflicted {
w.wakeRefHeld = true
il.wakeRefs++
close(w.ch)
} else {
remaining = append(remaining, w)
@@ -243,7 +276,9 @@ func (plt *PosixLockTable) SetLk(inode uint64, lk lockRange) fuse.Status {
return fuse.OK
}
il.mu.Lock()
removeLocks(il, lk.Owner, lk.Start, lk.End)
removeLocks(il, func(existing lockRange) bool {
return existing.Owner == lk.Owner && existing.IsFlock == lk.IsFlock
}, lk.Start, lk.End)
wakeEligibleWaiters(il)
il.mu.Unlock()
plt.maybeCleanupInode(inode, il)
@@ -269,15 +304,17 @@ func (plt *PosixLockTable) SetLkw(inode uint64, lk lockRange, cancel <-chan stru
}
il := plt.getOrCreateInodeLocks(inode)
var waiter *lockWaiter
for {
il.mu.Lock()
releaseWakeRef(il, waiter)
if _, found := findConflict(il.locks, lk); !found {
insertAndCoalesce(il, lk)
il.mu.Unlock()
return fuse.OK
}
// Register waiter with the requested lock details for selective waking.
waiter := &lockWaiter{requested: lk, ch: make(chan struct{})}
waiter = &lockWaiter{requested: lk, ch: make(chan struct{})}
il.waiters = append(il.waiters, waiter)
il.mu.Unlock()
@@ -289,23 +326,36 @@ func (plt *PosixLockTable) SetLkw(inode uint64, lk lockRange, cancel <-chan stru
case <-cancel:
// Request cancelled.
il.mu.Lock()
releaseWakeRef(il, waiter)
removeWaiter(il, waiter)
il.mu.Unlock()
plt.maybeCleanupInode(inode, il)
return fuse.EINTR
}
}
}
// ReleaseOwner removes all locks held by the given owner on the given inode.
// Called from FUSE Release to clean up when a file description is closed.
// Used for same-owner cleanup in tests and lock-table operations.
func (plt *PosixLockTable) ReleaseOwner(inode uint64, owner uint64) {
il := plt.getInodeLocks(inode)
if il == nil {
return
}
il.mu.Lock()
removeLocks(il, owner, 0, math.MaxUint64)
wakeEligibleWaiters(il)
il.mu.Unlock()
plt.maybeCleanupInode(inode, il)
plt.releaseMatching(inode, func(lk lockRange) bool {
return lk.Owner == owner
})
}
// ReleaseFlockOwner removes flock locks for a released file description.
// FUSE only provides LockOwner on RELEASE when FUSE_RELEASE_FLOCK_UNLOCK is set.
// Matching is restricted to the flock namespace, so POSIX (fcntl) locks held
// by the same owner are left untouched.
func (plt *PosixLockTable) ReleaseFlockOwner(inode uint64, owner uint64) {
plt.releaseMatching(inode, func(lk lockRange) bool {
return lk.IsFlock && lk.Owner == owner
})
}
// ReleasePosixOwner removes POSIX fcntl locks for a closing lock owner.
// FUSE passes the closing fi->owner on FLUSH, which is the correct close-time
// identity for POSIX byte-range lock cleanup.
// Matching excludes the flock namespace, so flock locks held by the same
// owner are left untouched.
func (plt *PosixLockTable) ReleasePosixOwner(inode uint64, owner uint64) {
plt.releaseMatching(inode, func(lk lockRange) bool {
return !lk.IsFlock && lk.Owner == owner
})
}

View File

@@ -217,6 +217,123 @@ func TestReleaseOwner(t *testing.T) {
}
}
// TestDifferentLockKindsDoNotConflict verifies that flock and POSIX (fcntl)
// locks live in separate namespaces: an exclusive flock may be taken even
// while a different owner holds an overlapping POSIX write lock.
func TestDifferentLockKindsDoNotConflict(t *testing.T) {
	plt := NewPosixLockTable()
	inode := uint64(1)

	// Owner 1 holds a POSIX write lock on bytes [0, 99].
	posixStatus := plt.SetLk(inode, lockRange{Start: 0, End: 99, Typ: syscall.F_WRLCK, Owner: 1, Pid: 10})
	if posixStatus != fuse.OK {
		t.Fatalf("expected POSIX lock OK, got %v", posixStatus)
	}

	// A whole-file flock write lock from a different owner must still succeed.
	flockStatus := plt.SetLk(inode, lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 2, Pid: 20, IsFlock: true})
	if flockStatus != fuse.OK {
		t.Fatalf("expected flock lock OK in separate namespace, got %v", flockStatus)
	}
}
// TestReleasePosixOwnerReleasesPosixLocksAndWakesWaiters verifies that
// ReleasePosixOwner drops the owner's POSIX locks and that a SetLkw caller
// blocked on one of those locks is woken and acquires its lock.
func TestReleasePosixOwnerReleasesPosixLocksAndWakesWaiters(t *testing.T) {
	plt := NewPosixLockTable()
	inode := uint64(1)
	// Hold a POSIX write lock so the second owner's SetLkw must block.
	// Checking the status here keeps a setup failure from surfacing later
	// as a confusing SetLkw timeout.
	if s := plt.SetLk(inode, lockRange{Start: 0, End: 99, Typ: syscall.F_WRLCK, Owner: 1, Pid: 10}); s != fuse.OK {
		t.Fatalf("setup SetLk failed: %v", s)
	}
	done := make(chan fuse.Status, 1)
	go func() {
		cancel := make(chan struct{})
		done <- plt.SetLkw(inode, lockRange{Start: 0, End: 99, Typ: syscall.F_WRLCK, Owner: 2, Pid: 20}, cancel)
	}()
	// Give the goroutine time to register as a blocked waiter before releasing.
	time.Sleep(50 * time.Millisecond)
	plt.ReleasePosixOwner(inode, 1)
	select {
	case s := <-done:
		if s != fuse.OK {
			t.Fatalf("expected OK after ReleasePosixOwner, got %v", s)
		}
	case <-time.After(2 * time.Second):
		t.Fatal("SetLkw did not unblock after ReleasePosixOwner")
	}
}
// TestReleasePosixOwnerDoesNotReleaseFlockLocks verifies that releasing the
// POSIX (fcntl) namespace for an owner leaves that owner's flock locks alone.
func TestReleasePosixOwnerDoesNotReleaseFlockLocks(t *testing.T) {
	plt := NewPosixLockTable()
	inode := uint64(1)
	// Hold a whole-file write lock in the flock namespace. Check the setup
	// status so a failed acquisition doesn't masquerade as a passing test.
	if s := plt.SetLk(inode, lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 1, Pid: 10, IsFlock: true}); s != fuse.OK {
		t.Fatalf("setup SetLk failed: %v", s)
	}
	// Releasing the POSIX namespace for the same owner must not touch it.
	plt.ReleasePosixOwner(inode, 1)
	var out fuse.LkOut
	// GetLk from a different owner should still report the flock conflict.
	plt.GetLk(inode, lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 2, Pid: 20, IsFlock: true}, &out)
	if out.Lk.Typ != syscall.F_WRLCK {
		t.Fatalf("expected flock lock to remain after ReleasePosixOwner, got type %d", out.Lk.Typ)
	}
}
// TestWakeEligibleWaitersKeepsInodeUntilWakeRefReleased verifies the wake-ref
// lifecycle: an inodeLocks entry must survive cleanup while a woken waiter is
// still retrying, and must be removed once its final wake ref is dropped.
func TestWakeEligibleWaitersKeepsInodeUntilWakeRefReleased(t *testing.T) {
plt := NewPosixLockTable()
inode := uint64(1)
il := plt.getOrCreateInodeLocks(inode)
// Register a waiter directly so the test controls the wake/ref lifecycle by hand.
waiter := &lockWaiter{
requested: lockRange{Start: 0, End: 99, Typ: syscall.F_WRLCK, Owner: 2, Pid: 20},
ch: make(chan struct{}),
}
il.mu.Lock()
il.waiters = append(il.waiters, waiter)
il.mu.Unlock()
// releaseMatching removes nothing (matches is always false) but still runs
// wakeEligibleWaiters; with no held locks the waiter has no conflict, so it
// is woken and handed a wake ref.
plt.releaseMatching(inode, func(lockRange) bool { return false })
select {
case <-waiter.ch:
// Expected.
default:
t.Fatal("expected waiter to be woken")
}
// While the wake ref is outstanding, the inode entry must not be cleaned up.
plt.mu.Lock()
_, exists := plt.inodes[inode]
plt.mu.Unlock()
if !exists {
t.Fatal("inodeLocks should remain while a woken waiter still holds a wake ref")
}
// Drop the wake ref (releaseWakeRef requires il.mu) and retry cleanup.
il.mu.Lock()
releaseWakeRef(il, waiter)
il.mu.Unlock()
plt.maybeCleanupInode(inode, il)
// With no locks, waiters, or wake refs left, the entry must be gone.
plt.mu.Lock()
_, exists = plt.inodes[inode]
plt.mu.Unlock()
if exists {
t.Fatal("inodeLocks should be cleaned up after the final wake ref is released")
}
}
// TestReleaseFlockOwnerDoesNotReleasePosixLocks verifies that releasing the
// flock namespace removes only flock locks: a coexisting POSIX (fcntl) lock
// on the same inode must survive.
func TestReleaseFlockOwnerDoesNotReleasePosixLocks(t *testing.T) {
	plt := NewPosixLockTable()
	inode := uint64(1)
	// Hold one lock in each namespace; check setup statuses so a failed
	// acquisition can't silently weaken the assertions below.
	if s := plt.SetLk(inode, lockRange{Start: 0, End: 99, Typ: syscall.F_WRLCK, Owner: 1, Pid: 10}); s != fuse.OK {
		t.Fatalf("setup POSIX SetLk failed: %v", s)
	}
	if s := plt.SetLk(inode, lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 2, Pid: 10, IsFlock: true}); s != fuse.OK {
		t.Fatalf("setup flock SetLk failed: %v", s)
	}
	plt.ReleaseFlockOwner(inode, 2)
	var out fuse.LkOut
	// The POSIX lock must still be reported as a conflict to a third owner.
	plt.GetLk(inode, lockRange{Start: 0, End: 99, Typ: syscall.F_WRLCK, Owner: 3, Pid: 30}, &out)
	if out.Lk.Typ != syscall.F_WRLCK {
		t.Fatalf("expected POSIX lock to remain after ReleaseFlockOwner, got type %d", out.Lk.Typ)
	}
	// The flock lock must be gone: no conflict reported in the flock namespace.
	plt.GetLk(inode, lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 4, Pid: 40, IsFlock: true}, &out)
	if out.Lk.Typ != syscall.F_UNLCK {
		t.Fatalf("expected flock lock to be removed after ReleaseFlockOwner, got type %d", out.Lk.Typ)
	}
}
func TestReleaseOwnerWakesWaiters(t *testing.T) {
plt := NewPosixLockTable()
inode := uint64(1)

View File

@@ -8,8 +8,6 @@ import (
"github.com/seaweedfs/seaweedfs/weed/util"
)
const asyncFlushMetadataRetries = 3
// completeAsyncFlush is called in a background goroutine when a file handle
// with pending async flush work is released. It performs the deferred data
// upload and metadata flush that was skipped in doFlush() for writebackCache mode.
@@ -68,24 +66,17 @@ func (wfs *WFS) completeAsyncFlush(fh *FileHandle) {
// with exponential backoff on transient errors. The chunk data is already on the
// volume servers at this point; only the filer metadata reference needs persisting.
func (wfs *WFS) flushMetadataWithRetry(fh *FileHandle, dir, name string, fileFullPath util.FullPath) {
for attempt := 0; attempt <= asyncFlushMetadataRetries; attempt++ {
if attempt > 0 {
backoff := time.Duration(1<<uint(attempt-1)) * time.Second
glog.Warningf("completeAsyncFlush %s: retrying metadata flush (attempt %d/%d) after %v",
fileFullPath, attempt+1, asyncFlushMetadataRetries+1, backoff)
time.Sleep(backoff)
}
if err := wfs.flushMetadataToFiler(fh, dir, name, fh.asyncFlushUid, fh.asyncFlushGid); err != nil {
if attempt == asyncFlushMetadataRetries {
glog.Errorf("completeAsyncFlush %s: metadata flush failed after %d attempts: %v — "+
"chunks are uploaded but NOT referenced in filer metadata; "+
"they will appear as orphans in volume.fsck",
fileFullPath, asyncFlushMetadataRetries+1, err)
}
continue
}
return // success
err := retryMetadataFlush(func() error {
return wfs.flushMetadataToFiler(fh, dir, name, fh.asyncFlushUid, fh.asyncFlushGid)
}, func(nextAttempt, totalAttempts int, backoff time.Duration, err error) {
glog.Warningf("completeAsyncFlush %s: retrying metadata flush (attempt %d/%d) after %v: %v",
fileFullPath, nextAttempt, totalAttempts, backoff, err)
})
if err != nil {
glog.Errorf("completeAsyncFlush %s: metadata flush failed after %d attempts: %v - "+
"chunks are uploaded but NOT referenced in filer metadata; "+
"they will appear as orphans in volume.fsck",
fileFullPath, metadataFlushRetries+1, err)
}
}

View File

@@ -106,6 +106,8 @@ func (wfs *WFS) Open(cancel <-chan struct{}, in *fuse.OpenIn, out *fuse.OpenOut)
* @param fi file information
*/
func (wfs *WFS) Release(cancel <-chan struct{}, in *fuse.ReleaseIn) {
wfs.posixLocks.ReleaseOwner(in.NodeId, in.LockOwner)
if in.ReleaseFlags&fuse.FUSE_RELEASE_FLOCK_UNLOCK != 0 {
wfs.posixLocks.ReleaseFlockOwner(in.NodeId, in.LockOwner)
}
wfs.ReleaseHandle(FileHandleId(in.Fh))
}

View File

@@ -11,11 +11,12 @@ import (
// If no conflict, out.Lk.Typ is set to F_UNLCK.
func (wfs *WFS) GetLk(cancel <-chan struct{}, in *fuse.LkIn, out *fuse.LkOut) fuse.Status {
proposed := lockRange{
Start: in.Lk.Start,
End: in.Lk.End,
Typ: in.Lk.Typ,
Owner: in.Owner,
Pid: in.Lk.Pid,
Start: in.Lk.Start,
End: in.Lk.End,
Typ: in.Lk.Typ,
Owner: in.Owner,
Pid: in.Lk.Pid,
IsFlock: in.LkFlags&fuse.FUSE_LK_FLOCK != 0,
}
wfs.posixLocks.GetLk(in.NodeId, proposed, out)
return fuse.OK
@@ -25,11 +26,12 @@ func (wfs *WFS) GetLk(cancel <-chan struct{}, in *fuse.LkIn, out *fuse.LkOut) fu
// Returns EAGAIN if the lock conflicts with an existing lock from another owner.
func (wfs *WFS) SetLk(cancel <-chan struct{}, in *fuse.LkIn) fuse.Status {
lk := lockRange{
Start: in.Lk.Start,
End: in.Lk.End,
Typ: in.Lk.Typ,
Owner: in.Owner,
Pid: in.Lk.Pid,
Start: in.Lk.Start,
End: in.Lk.End,
Typ: in.Lk.Typ,
Owner: in.Owner,
Pid: in.Lk.Pid,
IsFlock: in.LkFlags&fuse.FUSE_LK_FLOCK != 0,
}
return wfs.posixLocks.SetLk(in.NodeId, lk)
}
@@ -38,11 +40,12 @@ func (wfs *WFS) SetLk(cancel <-chan struct{}, in *fuse.LkIn) fuse.Status {
// Waits until the lock can be acquired or the request is cancelled.
func (wfs *WFS) SetLkw(cancel <-chan struct{}, in *fuse.LkIn) fuse.Status {
lk := lockRange{
Start: in.Lk.Start,
End: in.Lk.End,
Typ: in.Lk.Typ,
Owner: in.Owner,
Pid: in.Lk.Pid,
Start: in.Lk.Start,
End: in.Lk.End,
Typ: in.Lk.Typ,
Owner: in.Owner,
Pid: in.Lk.Pid,
IsFlock: in.LkFlags&fuse.FUSE_LK_FLOCK != 0,
}
if lk.Typ == syscall.F_UNLCK {
return wfs.posixLocks.SetLk(in.NodeId, lk)

View File

@@ -56,10 +56,20 @@ func (wfs *WFS) Flush(cancel <-chan struct{}, in *fuse.FlushIn) fuse.Status {
if fh == nil {
// If handle is not found, it might have been already released
// This is not an error condition for FLUSH
if in.LockOwner != 0 {
wfs.posixLocks.ReleasePosixOwner(in.NodeId, in.LockOwner)
}
return fuse.OK
}
return wfs.doFlush(fh, in.Uid, in.Gid, true)
// When a closing lock owner is present, flush synchronously before waking any
// blocked POSIX lock waiters so write-serialized callers cannot overtake each other.
allowAsync := in.LockOwner == 0
status := wfs.doFlush(fh, in.Uid, in.Gid, allowAsync)
if in.LockOwner != 0 {
wfs.posixLocks.ReleasePosixOwner(in.NodeId, in.LockOwner)
}
return status
}
/**
@@ -137,9 +147,14 @@ func (wfs *WFS) doFlush(fh *FileHandle, uid, gid uint32, allowAsync bool) fuse.S
return fuse.Status(syscall.ENOSPC)
}
if err := wfs.flushMetadataToFiler(fh, dir, name, uid, gid); err != nil {
if err := retryMetadataFlush(func() error {
return wfs.flushMetadataToFiler(fh, dir, name, uid, gid)
}, func(nextAttempt, totalAttempts int, backoff time.Duration, err error) {
glog.Warningf("%v fh %d flush: retrying metadata flush (attempt %d/%d) after %v: %v",
fileFullPath, fh.fh, nextAttempt, totalAttempts, backoff, err)
}); err != nil {
glog.Errorf("%v fh %d flush: %v", fileFullPath, fh.fh, err)
return fuse.EIO
return grpcErrorToFuseStatus(err)
}
if IsDebugFileReadWrite {