Files
seaweedFS/weed/mount/weedfs_file_sync.go
Chris Lu 805625d06e Add FUSE integration tests for POSIX file locking (#8752)
* Add FUSE integration tests for POSIX file locking

Test flock() and fcntl() advisory locks over the FUSE mount:
- Exclusive and shared flock with conflict detection
- flock upgrade (shared to exclusive) and release on close
- fcntl F_SETLK write lock conflicts and shared read locks
- fcntl F_GETLK conflict reporting on overlapping byte ranges
- Non-overlapping byte-range locks held independently
- F_SETLKW blocking until conflicting lock is released
- Lock release on file descriptor close
- Concurrent lock contention with multiple workers

* Fix review feedback in POSIX lock integration tests

- Assert specific EAGAIN error on fcntl lock conflicts instead of generic Error
- Use O_APPEND in concurrent contention test so workers append rather than overwrite
- Verify exact line count (numWorkers * writesPerWorker) after concurrent test
- Check unlock error in F_SETLKW blocking test goroutine

* Refactor fcntl tests to use subprocesses for inter-process semantics

POSIX fcntl locks use the process's files_struct as lock owner, so all
fds in the same process share the same owner and never conflict. This
caused the fcntl tests to silently pass without exercising lock conflicts.

Changes:
- Add TestFcntlLockHelper subprocess entry point with hold/try/getlk actions
- Add lockHolder with channel-based coordination (no scanner race)
- Rewrite all fcntl tests to run contenders in separate subprocesses
- Fix F_UNLCK int16 cast in GetLk assertion for type-safe comparison
- Fix concurrent test: use non-blocking flock with retry to avoid
  exhausting go-fuse server reader goroutines (blocking FUSE SETLKW
  can starve unlock request processing, causing deadlock)

flock tests remain same-process since flock uses per-struct-file owners.

* Fix misleading comment and error handling in lock test subprocess

- Fix comment: tryLockInSubprocess tests a subprocess, not the test process
- Distinguish EAGAIN/EACCES from unexpected errors in subprocess try mode
  so real failures aren't silently masked as lock conflicts

* Fix CI race in FcntlReleaseOnClose and increase flock retry budget

- FcntlReleaseOnClose: retry lock acquisition after subprocess exits
  since the FUSE server may not process Release immediately
- ConcurrentLockContention: increase retry limit from 500 to 3000
  (5s → 30s budget) to handle CI load

* separating flock and fcntl in the in-memory lock table and cleaning them up through the right release path: PID for POSIX locks, lock owner for flock

* ReleasePosixOwner

* weed/mount: flush before releasing posix close owner

* weed/mount: keep woken lock waiters from losing inode state

* test/fuse: make blocking fcntl helper state explicit

* test/fuse: assert flock contention never overlaps

* test/fuse: stabilize concurrent lock contention check

* test/fuse: make concurrent contention writes deterministic

* weed/mount: retry synchronous metadata flushes
2026-03-24 11:43:25 -07:00

237 lines
7.4 KiB
Go

package mount
import (
"context"
"fmt"
"syscall"
"time"
"github.com/seaweedfs/go-fuse/v2/fuse"
"github.com/seaweedfs/seaweedfs/weed/filer"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
"github.com/seaweedfs/seaweedfs/weed/util"
)
/**
* Flush method
*
* This is called on each close() of the opened file.
*
* Since file descriptors can be duplicated (dup, dup2, fork), for
* one open call there may be many flush calls.
*
* Filesystems shouldn't assume that flush will always be called
* after some writes, or that it will be called at all.
*
* fi->fh will contain the value set by the open method, or will
* be undefined if the open method didn't set any value.
*
* NOTE: the name of the method is misleading, since (unlike
* fsync) the filesystem is not forced to flush pending writes.
* One reason to flush data is if the filesystem wants to return
* write errors during close. However, such use is non-portable
* because POSIX does not require [close] to wait for delayed I/O to
* complete.
*
* If the filesystem supports file locking operations (setlk,
* getlk) it should remove all locks belonging to 'fi->owner'.
*
* If this request is answered with an error code of ENOSYS,
* this is treated as success and future calls to flush() will
* succeed automatically without being sent to the filesystem
* process.
*
* Valid replies:
* fuse_reply_err
*
* @param req request handle
* @param ino the inode number
* @param fi file information
*
* [close]: http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html
*/
// Flush is invoked on every close() of an open file descriptor. It pushes
// any dirty state for the handle toward the filer and then drops all POSIX
// (fcntl) locks owned by the closing lock owner, mirroring the FUSE
// requirement that flush remove locks belonging to fi->owner.
func (wfs *WFS) Flush(cancel <-chan struct{}, in *fuse.FlushIn) fuse.Status {
	// Dropping the closer's POSIX locks must happen whether or not the
	// handle still exists, so centralize it here.
	dropOwnerLocks := func() {
		if in.LockOwner != 0 {
			wfs.posixLocks.ReleasePosixOwner(in.NodeId, in.LockOwner)
		}
	}

	fh := wfs.GetHandle(FileHandleId(in.Fh))
	if fh == nil {
		// The handle may already have been released; FLUSH treats that as
		// success, but the owner's locks still need to be cleaned up.
		dropOwnerLocks()
		return fuse.OK
	}

	// When a closing lock owner is present, flush synchronously before waking
	// any blocked POSIX lock waiters so write-serialized callers cannot
	// overtake each other. Only a lock-owner-less flush may go async.
	status := wfs.doFlush(fh, in.Uid, in.Gid, in.LockOwner == 0)
	dropOwnerLocks()
	return status
}
/**
* Synchronize file contents
*
* If the datasync parameter is non-zero, then only the user data
* should be flushed, not the meta data.
*
* If this request is answered with an error code of ENOSYS,
* this is treated as success and future calls to fsync() will
* succeed automatically without being sent to the filesystem
* process.
*
* Valid replies:
* fuse_reply_err
*
* @param req request handle
* @param ino the inode number
* @param datasync flag indicating if only data should be flushed
* @param fi file information
*/
// Fsync handles an explicit sync request from the kernel for the given
// file handle. Unlike close()-triggered Flush, fsync must never take the
// asynchronous writeback path, so the flush is always synchronous.
func (wfs *WFS) Fsync(cancel <-chan struct{}, in *fuse.FsyncIn) (code fuse.Status) {
	if fh := wfs.GetHandle(FileHandleId(in.Fh)); fh != nil {
		// allowAsync=false: an explicit sync always waits for completion.
		return wfs.doFlush(fh, in.Uid, in.Gid, false)
	}
	// Unknown handle — nothing to sync.
	return fuse.ENOENT
}
// doFlush pushes the handle's dirty pages and (if needed) its metadata to
// the filer. When allowAsync is true and the mount runs with writeback
// cache enabled, a dirty handle is instead marked for deferred background
// flushing and the call returns immediately; fsync callers pass
// allowAsync=false to force the synchronous path.
//
// Returns fuse.OK on success, ENOSPC when over quota, EIO on data-flush
// failure, or a gRPC-derived status on metadata-flush failure.
func (wfs *WFS) doFlush(fh *FileHandle, uid, gid uint32, allowAsync bool) fuse.Status {
	// flush works at fh level
	fileFullPath := fh.FullPath()
	fh.RememberPath(fileFullPath)
	dir, name := fileFullPath.DirAndName()

	// send the data to the OS
	glog.V(4).Infof("doFlush %s fh %d", fileFullPath, fh.fh)

	// When writebackCache is enabled and this is a close()-triggered Flush (not fsync),
	// defer the expensive data upload + metadata flush to a background goroutine.
	// This allows the calling process (e.g., rsync) to proceed to the next file immediately.
	// POSIX does not require close() to wait for delayed I/O to complete.
	if allowAsync && wfs.option.WritebackCache && fh.dirtyMetadata {
		// Enforce quota even on the async path so a deferred upload cannot
		// silently exceed it.
		if wfs.IsOverQuotaWithUncommitted() {
			return fuse.Status(syscall.ENOSPC)
		}
		// Record the pending flush and the credentials to perform it with;
		// the background completion path consumes these fields.
		fh.asyncFlushPending = true
		fh.asyncFlushUid = uid
		fh.asyncFlushGid = gid
		glog.V(3).Infof("doFlush async deferred %s fh %d", fileFullPath, fh.fh)
		return fuse.OK
	}

	// Synchronous flush path (normal mode, fsync, or no dirty data).
	// Clear any previously deferred flush — this synchronous pass supersedes it.
	fh.asyncFlushPending = false

	// Check quota including uncommitted writes for real-time enforcement
	isOverQuota := wfs.IsOverQuotaWithUncommitted()

	if !isOverQuota {
		if err := fh.dirtyPages.FlushData(); err != nil {
			glog.Errorf("%v doFlush: %v", fileFullPath, err)
			return fuse.EIO
		}
	}

	if !fh.dirtyMetadata {
		// Data (if any) is flushed and no metadata changed: done.
		return fuse.OK
	}

	// Over quota with dirty metadata: reject the flush. Note that the data
	// flush above was skipped in this case as well.
	if isOverQuota {
		return fuse.Status(syscall.ENOSPC)
	}

	// Metadata flush is retried with backoff; each retry is logged so
	// transient filer unavailability is visible but non-fatal.
	if err := retryMetadataFlush(func() error {
		return wfs.flushMetadataToFiler(fh, dir, name, uid, gid)
	}, func(nextAttempt, totalAttempts int, backoff time.Duration, err error) {
		glog.Warningf("%v fh %d flush: retrying metadata flush (attempt %d/%d) after %v: %v",
			fileFullPath, fh.fh, nextAttempt, totalAttempts, backoff, err)
	}); err != nil {
		glog.Errorf("%v fh %d flush: %v", fileFullPath, fh.fh, err)
		return grpcErrorToFuseStatus(err)
	}

	if IsDebugFileReadWrite {
		// Best effort: sync the local mirror copy used for read/write debugging.
		fh.mirrorFile.Sync()
	}

	return fuse.OK
}
// flushMetadataToFiler sends the file's chunk references and attributes to the filer.
// This is shared between the synchronous doFlush path and the async flush completion.
// On success it clears fh.dirtyMetadata; on failure the dirty flag is left set so a
// later flush (or retry) will attempt again.
func (wfs *WFS) flushMetadataToFiler(fh *FileHandle, dir, name string, uid, gid uint32) error {
	fileFullPath := fh.FullPath()

	// Hold the handle's exclusive lock for the whole RPC so concurrent
	// writers/flushers cannot mutate the entry mid-flight.
	fhActiveLock := fh.wfs.fhLockTable.AcquireLock("doFlush", fh.fh, util.ExclusiveLock)
	defer fh.wfs.fhLockTable.ReleaseLock(fh.fh, fhActiveLock)

	err := wfs.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
		entry := fh.GetEntry()
		entry.Name = name // this flush may be just after a rename operation

		if entry.Attributes != nil {
			entry.Attributes.Mime = fh.contentType
			// Only fill in uid/gid when unset, preserving existing ownership.
			if entry.Attributes.Uid == 0 {
				entry.Attributes.Uid = uid
			}
			if entry.Attributes.Gid == 0 {
				entry.Attributes.Gid = gid
			}
			entry.Attributes.Mtime = time.Now().Unix()
		}

		request := &filer_pb.CreateEntryRequest{
			Directory:                string(dir),
			Entry:                    entry.GetEntry(),
			Signatures:               []int32{wfs.signature},
			SkipCheckParentDirectory: true,
		}

		glog.V(4).Infof("%s set chunks: %v", fileFullPath, len(entry.GetChunks()))

		// Compact non-manifest chunks and re-manifestize if the chunk list
		// grew large; manifest chunks pass through untouched.
		manifestChunks, nonManifestChunks := filer.SeparateManifestChunks(entry.GetChunks())

		chunks, _ := filer.CompactFileChunks(context.Background(), wfs.LookupFn(), nonManifestChunks)
		chunks, manifestErr := filer.MaybeManifestize(wfs.saveDataAsChunk(fileFullPath), chunks)
		if manifestErr != nil {
			// not good, but should be ok
			glog.V(0).Infof("MaybeManifestize: %v", manifestErr)
		}
		// NOTE(review): this assignment happens after request is built and is
		// presumably visible through request.Entry because entry.GetEntry()
		// returns the same underlying pb entry — confirm against FileHandle.
		entry.Chunks = append(chunks, manifestChunks...)

		// Translate volume ids local->filer for the RPC, and back afterwards.
		wfs.mapPbIdFromLocalToFiler(request.Entry)
		defer wfs.mapPbIdFromFilerToLocal(request.Entry)

		resp, err := filer_pb.CreateEntryWithResponse(context.Background(), client, request)
		if err != nil {
			glog.Errorf("fh flush create %s: %v", fileFullPath, err)
			return fmt.Errorf("fh flush create %s: %v", fileFullPath, err)
		}

		// Prefer the filer-provided metadata event; synthesize one locally if
		// the filer did not return it.
		event := resp.GetMetadataEvent()
		if event == nil {
			event = metadataUpdateEvent(string(dir), request.Entry)
		}
		if applyErr := wfs.applyLocalMetadataEvent(context.Background(), event); applyErr != nil {
			// Best effort: fall back to invalidating the cached directory
			// children so stale state is re-fetched.
			glog.Warningf("flush %s: best-effort metadata apply failed: %v", fileFullPath, applyErr)
			wfs.inodeToPath.InvalidateChildrenCache(util.FullPath(dir))
		}
		return nil
	})

	if err == nil {
		fh.dirtyMetadata = false
	}

	return err
}