* mount: defer file creation gRPC to flush time for faster small file writes

  When creating a file via FUSE Create(), skip the synchronous gRPC CreateEntry call to the filer. Instead, allocate the inode and build the entry locally, deferring the filer create to the Flush/Release path where flushMetadataToFiler already sends a CreateEntry with chunk data.

  This eliminates one synchronous gRPC round-trip per file during creation. For workloads with many small files (e.g. 30K files), this reduces the per-file overhead from ~2 gRPC calls to ~1.

  Mknod retains synchronous filer creation since it has no file handle and thus no flush path.

* mount: use bounded worker pool for async flush operations

  Replace unbounded goroutine spawning in writebackCache async flush with a fixed-size worker pool backed by a channel. When many files are closed rapidly (e.g., cp -r of 30K files), the previous approach spawned one goroutine per file, leading to resource contention on gRPC/HTTP connections and high goroutine overhead.

  The worker pool size matches ConcurrentWriters (default 128), which provides good parallelism while bounding resource usage. Work items are queued into a buffered channel and processed by persistent worker goroutines.

* mount: fix deferred create cache visibility and async flush race

  Three fixes for the deferred create and async flush changes:

  1. Insert a local placeholder entry into the metadata cache during deferred file creation so that maybeLoadEntry() can find the file for duplicate-create checks, stat, and readdir. Uses InsertEntry directly (not applyLocalMetadataEvent) to avoid triggering the directory hot-threshold eviction that would wipe the entry.

  2. Fix race in ReleaseHandle where asyncFlushWg.Add(1) and the channel send happened after pendingAsyncFlushMu was unlocked. A concurrent WaitForAsyncFlush could observe a zero counter, close the channel, and cause a send-on-closed panic. Move Add(1) before the unlock; keep the send after unlock to avoid deadlock with workers that acquire the same mutex during cleanup.

  3. Update TestCreateCreatesAndOpensFile to flush the file handle before verifying the CreateEntry gRPC call, since file creation is now deferred to flush time.
335 lines
11 KiB
Go
335 lines
11 KiB
Go
package mount
|
|
|
|
import (
|
|
"context"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/seaweedfs/go-fuse/v2/fuse"
|
|
"github.com/seaweedfs/seaweedfs/weed/filer"
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/util"
|
|
)
|
|
|
|
/**
|
|
* Create and open a file
|
|
*
|
|
* If the file does not exist, first create it with the specified
|
|
* mode, and then open it.
|
|
*
|
|
* If this method is not implemented or under Linux kernel
|
|
* versions earlier than 2.6.15, the mknod() and open() methods
|
|
* will be called instead.
|
|
*/
|
|
func (wfs *WFS) Create(cancel <-chan struct{}, in *fuse.CreateIn, name string, out *fuse.CreateOut) (code fuse.Status) {
|
|
if s := checkName(name); s != fuse.OK {
|
|
return s
|
|
}
|
|
|
|
dirFullPath, code := wfs.inodeToPath.GetPath(in.NodeId)
|
|
if code != fuse.OK {
|
|
return code
|
|
}
|
|
|
|
entryFullPath := dirFullPath.Child(name)
|
|
var inode uint64
|
|
|
|
newEntry, code := wfs.maybeLoadEntry(entryFullPath)
|
|
if code == fuse.OK {
|
|
if newEntry == nil || newEntry.Attributes == nil {
|
|
return fuse.EIO
|
|
}
|
|
if in.Flags&syscall.O_EXCL != 0 {
|
|
return fuse.Status(syscall.EEXIST)
|
|
}
|
|
inode = wfs.inodeToPath.Lookup(entryFullPath, newEntry.Attributes.Crtime, false, len(newEntry.HardLinkId) > 0, newEntry.Attributes.Inode, true)
|
|
fileHandle, status := wfs.AcquireHandle(inode, in.Flags, in.Uid, in.Gid)
|
|
if status != fuse.OK {
|
|
return status
|
|
}
|
|
if in.Flags&syscall.O_TRUNC != 0 && in.Flags&fuse.O_ANYWRITE != 0 {
|
|
if code = wfs.truncateEntry(entryFullPath, newEntry); code != fuse.OK {
|
|
wfs.ReleaseHandle(fileHandle.fh)
|
|
return code
|
|
}
|
|
newEntry = fileHandle.GetEntry().GetEntry()
|
|
}
|
|
|
|
wfs.outputPbEntry(&out.EntryOut, inode, newEntry)
|
|
out.Fh = uint64(fileHandle.fh)
|
|
out.OpenFlags = 0
|
|
return fuse.OK
|
|
}
|
|
if code != fuse.ENOENT {
|
|
return code
|
|
}
|
|
|
|
inode, newEntry, code = wfs.createRegularFile(dirFullPath, name, in.Mode, in.Uid, in.Gid, 0, true)
|
|
if code == fuse.Status(syscall.EEXIST) && in.Flags&syscall.O_EXCL == 0 {
|
|
// Race: another process created the file between our check and create.
|
|
// Reopen the winner's entry.
|
|
newEntry, code = wfs.maybeLoadEntry(entryFullPath)
|
|
if code != fuse.OK {
|
|
return code
|
|
}
|
|
if newEntry == nil || newEntry.Attributes == nil {
|
|
return fuse.EIO
|
|
}
|
|
inode = wfs.inodeToPath.Lookup(entryFullPath, newEntry.Attributes.Crtime, false, len(newEntry.HardLinkId) > 0, newEntry.Attributes.Inode, true)
|
|
fileHandle, status := wfs.AcquireHandle(inode, in.Flags, in.Uid, in.Gid)
|
|
if status != fuse.OK {
|
|
return status
|
|
}
|
|
if in.Flags&syscall.O_TRUNC != 0 && in.Flags&fuse.O_ANYWRITE != 0 {
|
|
if code = wfs.truncateEntry(entryFullPath, newEntry); code != fuse.OK {
|
|
wfs.ReleaseHandle(fileHandle.fh)
|
|
return code
|
|
}
|
|
newEntry = fileHandle.GetEntry().GetEntry()
|
|
}
|
|
wfs.outputPbEntry(&out.EntryOut, inode, newEntry)
|
|
out.Fh = uint64(fileHandle.fh)
|
|
out.OpenFlags = 0
|
|
return fuse.OK
|
|
} else if code != fuse.OK {
|
|
return code
|
|
} else {
|
|
inode = wfs.inodeToPath.Lookup(entryFullPath, newEntry.Attributes.Crtime, false, false, inode, true)
|
|
}
|
|
|
|
wfs.outputPbEntry(&out.EntryOut, inode, newEntry)
|
|
|
|
// For deferred creates, bypass AcquireHandle (which calls maybeReadEntry
|
|
// and would fail since the entry is not yet on the filer or in the meta cache).
|
|
// We already have the entry from createRegularFile, so create the handle directly.
|
|
fileHandle := wfs.fhMap.AcquireFileHandle(wfs, inode, newEntry)
|
|
fileHandle.RememberPath(entryFullPath)
|
|
// Mark dirty so the deferred filer create happens on Flush,
|
|
// even if the file is closed without any writes.
|
|
fileHandle.dirtyMetadata = true
|
|
out.Fh = uint64(fileHandle.fh)
|
|
out.OpenFlags = 0
|
|
|
|
return fuse.OK
|
|
}
|
|
|
|
/** Create a file node
|
|
*
|
|
* This is called for creation of all non-directory, non-symlink
|
|
* nodes. If the filesystem defines a create() method, then for
|
|
* regular files that will be called instead.
|
|
*/
|
|
func (wfs *WFS) Mknod(cancel <-chan struct{}, in *fuse.MknodIn, name string, out *fuse.EntryOut) (code fuse.Status) {
|
|
|
|
if s := checkName(name); s != fuse.OK {
|
|
return s
|
|
}
|
|
|
|
dirFullPath, code := wfs.inodeToPath.GetPath(in.NodeId)
|
|
if code != fuse.OK {
|
|
return
|
|
}
|
|
|
|
inode, newEntry, code := wfs.createRegularFile(dirFullPath, name, in.Mode, in.Uid, in.Gid, in.Rdev, false)
|
|
if code != fuse.OK {
|
|
return code
|
|
}
|
|
|
|
// this is to increase nlookup counter
|
|
entryFullPath := dirFullPath.Child(name)
|
|
inode = wfs.inodeToPath.Lookup(entryFullPath, newEntry.Attributes.Crtime, false, false, inode, true)
|
|
|
|
wfs.outputPbEntry(out, inode, newEntry)
|
|
|
|
return fuse.OK
|
|
|
|
}
|
|
|
|
/** Remove a file */
|
|
func (wfs *WFS) Unlink(cancel <-chan struct{}, header *fuse.InHeader, name string) (code fuse.Status) {
|
|
|
|
dirFullPath, code := wfs.inodeToPath.GetPath(header.NodeId)
|
|
if code != fuse.OK {
|
|
if code == fuse.ENOENT {
|
|
return fuse.OK
|
|
}
|
|
return code
|
|
}
|
|
entryFullPath := dirFullPath.Child(name)
|
|
|
|
entry, code := wfs.maybeLoadEntry(entryFullPath)
|
|
if code != fuse.OK {
|
|
if code == fuse.ENOENT {
|
|
return fuse.OK
|
|
}
|
|
return code
|
|
}
|
|
|
|
if wormEnforced, _ := wfs.wormEnforcedForEntry(entryFullPath, entry); wormEnforced {
|
|
return fuse.EPERM
|
|
}
|
|
|
|
// first, ensure the filer store can correctly delete
|
|
glog.V(3).Infof("remove file: %v", entryFullPath)
|
|
// Always let the filer decide whether to delete chunks based on its authoritative data.
|
|
// The filer has the correct hard link count and will only delete chunks when appropriate.
|
|
resp, err := filer_pb.RemoveWithResponse(context.Background(), wfs, string(dirFullPath), name, true, false, false, false, []int32{wfs.signature})
|
|
if err != nil {
|
|
glog.V(0).Infof("remove %s: %v", entryFullPath, err)
|
|
return fuse.OK
|
|
}
|
|
|
|
var event *filer_pb.SubscribeMetadataResponse
|
|
if resp != nil && resp.MetadataEvent != nil {
|
|
event = resp.MetadataEvent
|
|
} else {
|
|
event = metadataDeleteEvent(string(dirFullPath), name, false)
|
|
}
|
|
if applyErr := wfs.applyLocalMetadataEvent(context.Background(), event); applyErr != nil {
|
|
glog.Warningf("unlink %s: best-effort metadata apply failed: %v", entryFullPath, applyErr)
|
|
wfs.inodeToPath.InvalidateChildrenCache(dirFullPath)
|
|
}
|
|
wfs.inodeToPath.TouchDirectory(dirFullPath)
|
|
|
|
// If there is an async-draining handle for this file, mark it as deleted
|
|
// so the background flush skips the metadata write instead of recreating
|
|
// the just-unlinked entry. The handle is still in fhMap during drain.
|
|
if inode, found := wfs.inodeToPath.GetInode(entryFullPath); found {
|
|
if fh, fhFound := wfs.fhMap.FindFileHandle(inode); fhFound {
|
|
fh.isDeleted = true
|
|
}
|
|
}
|
|
|
|
wfs.inodeToPath.RemovePath(entryFullPath)
|
|
|
|
return fuse.OK
|
|
|
|
}
|
|
|
|
func (wfs *WFS) createRegularFile(dirFullPath util.FullPath, name string, mode uint32, uid, gid, rdev uint32, deferFilerCreate bool) (inode uint64, newEntry *filer_pb.Entry, code fuse.Status) {
|
|
if wfs.IsOverQuotaWithUncommitted() {
|
|
return 0, nil, fuse.Status(syscall.ENOSPC)
|
|
}
|
|
|
|
// Verify write+search permission on the parent directory.
|
|
parentEntry, parentStatus := wfs.maybeLoadEntry(dirFullPath)
|
|
if parentStatus != fuse.OK {
|
|
return 0, nil, parentStatus
|
|
}
|
|
if parentEntry == nil || parentEntry.Attributes == nil {
|
|
return 0, nil, fuse.EIO
|
|
}
|
|
if !hasAccess(uid, gid, parentEntry.Attributes.Uid, parentEntry.Attributes.Gid, parentEntry.Attributes.FileMode, fuse.W_OK|fuse.X_OK) {
|
|
return 0, nil, fuse.Status(syscall.EACCES)
|
|
}
|
|
|
|
entryFullPath := dirFullPath.Child(name)
|
|
if _, status := wfs.maybeLoadEntry(entryFullPath); status == fuse.OK {
|
|
return 0, nil, fuse.Status(syscall.EEXIST)
|
|
} else if status != fuse.ENOENT {
|
|
return 0, nil, status
|
|
}
|
|
fileMode := toOsFileMode(mode)
|
|
now := time.Now().Unix()
|
|
inode = wfs.inodeToPath.AllocateInode(entryFullPath, now)
|
|
|
|
newEntry = &filer_pb.Entry{
|
|
Name: name,
|
|
IsDirectory: false,
|
|
Attributes: &filer_pb.FuseAttributes{
|
|
Mtime: now,
|
|
Crtime: now,
|
|
FileMode: uint32(fileMode),
|
|
Uid: uid,
|
|
Gid: gid,
|
|
TtlSec: wfs.option.TtlSec,
|
|
Rdev: rdev,
|
|
Inode: inode,
|
|
},
|
|
}
|
|
|
|
if deferFilerCreate {
|
|
// Defer the filer gRPC call to flush time. The caller (Create) will
|
|
// build a file handle directly from newEntry, bypassing AcquireHandle.
|
|
// Insert a local placeholder into the metadata cache so that
|
|
// maybeLoadEntry() can find the file (e.g., duplicate-create checks,
|
|
// stat, readdir). The actual filer entry is created by flushMetadataToFiler.
|
|
// We use InsertEntry directly instead of applyLocalMetadataEvent to avoid
|
|
// triggering directory hot-threshold eviction that would wipe the entry.
|
|
if insertErr := wfs.metaCache.InsertEntry(context.Background(), filer.FromPbEntry(string(dirFullPath), newEntry)); insertErr != nil {
|
|
glog.Warningf("createFile %s: insert local entry: %v", entryFullPath, insertErr)
|
|
}
|
|
glog.V(3).Infof("createFile %s: deferred to flush", entryFullPath)
|
|
return inode, newEntry, fuse.OK
|
|
}
|
|
|
|
err := wfs.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
|
|
wfs.mapPbIdFromLocalToFiler(newEntry)
|
|
defer wfs.mapPbIdFromFilerToLocal(newEntry)
|
|
|
|
request := &filer_pb.CreateEntryRequest{
|
|
Directory: string(dirFullPath),
|
|
Entry: newEntry,
|
|
Signatures: []int32{wfs.signature},
|
|
SkipCheckParentDirectory: true,
|
|
}
|
|
|
|
glog.V(1).Infof("createFile: %v", request)
|
|
resp, err := filer_pb.CreateEntryWithResponse(context.Background(), client, request)
|
|
if err != nil {
|
|
glog.V(0).Infof("createFile %s: %v", entryFullPath, err)
|
|
return err
|
|
}
|
|
|
|
event := resp.GetMetadataEvent()
|
|
if event == nil {
|
|
event = metadataCreateEvent(string(dirFullPath), newEntry)
|
|
}
|
|
if applyErr := wfs.applyLocalMetadataEvent(context.Background(), event); applyErr != nil {
|
|
glog.Warningf("createFile %s: best-effort metadata apply failed: %v", entryFullPath, applyErr)
|
|
wfs.inodeToPath.InvalidateChildrenCache(dirFullPath)
|
|
}
|
|
wfs.inodeToPath.TouchDirectory(dirFullPath)
|
|
|
|
return nil
|
|
})
|
|
|
|
glog.V(3).Infof("createFile %s: %v", entryFullPath, err)
|
|
|
|
if err != nil {
|
|
return 0, nil, grpcErrorToFuseStatus(err)
|
|
}
|
|
|
|
return inode, newEntry, fuse.OK
|
|
}
|
|
|
|
func (wfs *WFS) truncateEntry(entryFullPath util.FullPath, entry *filer_pb.Entry) fuse.Status {
|
|
if entry == nil {
|
|
return fuse.EIO
|
|
}
|
|
if entry.Attributes == nil {
|
|
entry.Attributes = &filer_pb.FuseAttributes{}
|
|
}
|
|
|
|
entry.Content = nil
|
|
entry.Chunks = nil
|
|
entry.Attributes.FileSize = 0
|
|
entry.Attributes.Mtime = time.Now().Unix()
|
|
|
|
if code := wfs.saveEntry(entryFullPath, entry); code != fuse.OK {
|
|
return code
|
|
}
|
|
|
|
if inode, found := wfs.inodeToPath.GetInode(entryFullPath); found {
|
|
if fh, fhFound := wfs.fhMap.FindFileHandle(inode); fhFound {
|
|
fhActiveLock := fh.wfs.fhLockTable.AcquireLock("truncateEntry", fh.fh, util.ExclusiveLock)
|
|
fh.ResetDirtyPages()
|
|
fh.SetEntry(entry)
|
|
fh.wfs.fhLockTable.ReleaseLock(fh.fh, fhActiveLock)
|
|
}
|
|
}
|
|
|
|
return fuse.OK
|
|
}
|