* mount: improve read throughput with parallel chunk fetching This addresses issue #7504 where a single weed mount FUSE instance does not fully utilize node network bandwidth when reading large files. Changes: - Add -concurrentReaders mount option (default: 16) to control the maximum number of parallel chunk fetches during read operations - Implement parallel section reading in ChunkGroup.ReadDataAt() using errgroup for better throughput when reading across multiple sections - Enhance ReaderCache with MaybeCacheMany() to prefetch multiple chunks ahead in parallel during sequential reads (now prefetches 4 chunks) - Increase ReaderCache limit dynamically based on concurrentReaders to support higher read parallelism The bottleneck was that chunks were being read sequentially even when they reside on different volume servers. By introducing parallel chunk fetching, a single mount instance can now better saturate available network bandwidth. Fixes: #7504 * fmt * Address review comments: make prefetch configurable, improve error handling Changes: 1. Add DefaultPrefetchCount constant (4) to reader_at.go 2. Add GetPrefetchCount() method to ChunkGroup that derives prefetch count from concurrentReaders (1/4 ratio, min 1, max 8) 3. Pass prefetch count through NewChunkReaderAtFromClient 4. Fix error handling in readDataAtParallel to prioritize errgroup error 5. Update all callers to use DefaultPrefetchCount constant For mount operations, prefetch scales with -concurrentReaders: - concurrentReaders=16 (default) -> prefetch=4 - concurrentReaders=32 -> prefetch=8 (capped) - concurrentReaders=4 -> prefetch=1 For non-mount paths (WebDAV, query engine, MQ), uses DefaultPrefetchCount. * fmt * Refactor: use variadic parameter instead of new function name Use NewChunkGroup with optional concurrentReaders parameter instead of creating a separate NewChunkGroupWithConcurrency function. 
This maintains backward compatibility - existing callers without the parameter get the default of 16 concurrent readers. * Use explicit concurrentReaders parameter instead of variadic * Refactor: use MaybeCache with count parameter instead of new MaybeCacheMany function * Address nitpick review comments - Add upper bound (128) on concurrentReaders to prevent excessive goroutine fan-out - Cap readerCacheLimit at 256 accordingly - Fix SetChunks: use Lock() instead of RLock() since we are writing to group.sections
282 lines
9.7 KiB
Go
282 lines
9.7 KiB
Go
package filer
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"math/rand"
|
|
"sync"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/util"
|
|
"github.com/seaweedfs/seaweedfs/weed/wdclient"
|
|
)
|
|
|
|
// DefaultPrefetchCount is the default number of chunks to prefetch ahead during
// sequential reads. This value is used when the prefetch count is not explicitly
// configured (e.g., WebDAV, query engine, message queue). For mount operations,
// the prefetch count is derived from the -concurrentReaders option instead.
const DefaultPrefetchCount = 4
|
|
|
|
// ChunkReadAt provides random-access reads (io.ReaderAt) over a file stored as
// a list of chunk views, serving data through a shared ReaderCache that also
// drives sequential-read prefetching.
type ChunkReadAt struct {
	// masterClient is not referenced anywhere in this file; presumably kept
	// for legacy callers — TODO(review): confirm before relying on it.
	masterClient *wdclient.MasterClient
	// chunkViews maps file offsets to chunk views; its embedded Lock guards
	// traversal during reads.
	chunkViews *IntervalList[*ChunkView]
	// fileSize is the total logical size of the file in bytes.
	fileSize int64
	// readerCache caches downloaded chunk data and performs prefetching.
	readerCache *ReaderCache
	// readerPattern tracks access offsets to distinguish sequential from
	// random reads (see readChunkSliceAt).
	readerPattern *ReaderPattern
	// lastChunkFid remembers the previously read chunk's file id so its
	// cache entry can be released once the read moves to a new chunk.
	lastChunkFid  string
	prefetchCount int             // Number of chunks to prefetch ahead during sequential reads
	ctx           context.Context // Context used for cancellation during chunk read operations
}
|
|
|
|
var _ = io.ReaderAt(&ChunkReadAt{})
|
|
var _ = io.Closer(&ChunkReadAt{})
|
|
|
|
// LookupFn creates a basic volume location lookup function with simple caching.
//
// Deprecated: Use wdclient.FilerClient instead. This function has several limitations compared to wdclient.FilerClient:
//   - Simple bounded cache (10k entries, no eviction policy or TTL for stale entries)
//   - No singleflight deduplication (concurrent requests for same volume will duplicate work)
//   - No cache history for volume moves (no fallback chain when volumes migrate)
//   - No high availability (single filer address, no automatic failover)
//
// For NEW code, especially mount operations, use wdclient.FilerClient instead:
//
//	filerClient := wdclient.NewFilerClient(filerAddresses, grpcDialOption, dataCenter, opts)
//	lookupFn := filerClient.GetLookupFileIdFunction()
//
// This provides:
//   - Bounded cache with configurable size
//   - Singleflight deduplication of concurrent lookups
//   - Cache history when volumes move
//   - Battle-tested vidMap with cache chain
//
// This function is kept for backward compatibility with existing code paths
// (shell commands, streaming, etc.) but should be avoided in long-running processes
// or multi-tenant deployments where unbounded memory growth is a concern.
//
// Maximum recommended cache entries: ~10,000 volumes per process.
// Beyond this, consider migrating to wdclient.FilerClient.
func LookupFn(filerClient filer_pb.FilerClient) wdclient.LookupFileIdFunctionType {

	// Cache state shared by all invocations of the returned closure.
	vidCache := make(map[string]*filer_pb.Locations)
	var vidCacheLock sync.RWMutex
	cacheSize := 0
	const maxCacheSize = 10000 // Simple bound to prevent unbounded growth

	return func(ctx context.Context, fileId string) (targetUrls []string, err error) {
		vid := VolumeId(fileId)
		vidCacheLock.RLock()
		locations, found := vidCache[vid]
		vidCacheLock.RUnlock()

		if !found {
			// err is the closure's named return; the retry body assigns it so
			// the final result of the last attempt is visible below.
			util.Retry("lookup volume "+vid, func() error {
				err = filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
					resp, err := client.LookupVolume(ctx, &filer_pb.LookupVolumeRequest{
						VolumeIds: []string{vid},
					})
					if err != nil {
						return err
					}

					locations = resp.LocationsMap[vid]
					if locations == nil || len(locations.Locations) == 0 {
						glog.V(0).InfofCtx(ctx, "failed to locate %s", fileId)
						return fmt.Errorf("failed to locate %s", fileId)
					}
					vidCacheLock.Lock()
					// Simple size limit to prevent unbounded growth
					// For proper cache management, use wdclient.FilerClient instead
					if cacheSize < maxCacheSize {
						vidCache[vid] = locations
						cacheSize++
					} else if cacheSize == maxCacheSize {
						glog.Warningf("filer.LookupFn cache reached limit of %d volumes, not caching new entries. Consider migrating to wdclient.FilerClient for bounded cache management.", maxCacheSize)
						cacheSize++ // Only log once
					}
					vidCacheLock.Unlock()

					return nil
				})
				return err
			})
		}

		if err != nil {
			return nil, err
		}

		// Partition URLs by data center so same-DC servers are tried first.
		fcDataCenter := filerClient.GetDataCenter()
		var sameDcTargetUrls, otherTargetUrls []string
		for _, loc := range locations.Locations {
			volumeServerAddress := filerClient.AdjustedUrl(loc)
			targetUrl := fmt.Sprintf("http://%s/%s", volumeServerAddress, fileId)
			if fcDataCenter == "" || fcDataCenter != loc.DataCenter {
				otherTargetUrls = append(otherTargetUrls, targetUrl)
			} else {
				sameDcTargetUrls = append(sameDcTargetUrls, targetUrl)
			}
		}
		// Shuffle within each group to spread load across replicas.
		rand.Shuffle(len(sameDcTargetUrls), func(i, j int) {
			sameDcTargetUrls[i], sameDcTargetUrls[j] = sameDcTargetUrls[j], sameDcTargetUrls[i]
		})
		rand.Shuffle(len(otherTargetUrls), func(i, j int) {
			otherTargetUrls[i], otherTargetUrls[j] = otherTargetUrls[j], otherTargetUrls[i]
		})
		// Prefer same data center
		targetUrls = append(sameDcTargetUrls, otherTargetUrls...)
		return
	}
}
|
|
|
|
func NewChunkReaderAtFromClient(ctx context.Context, readerCache *ReaderCache, chunkViews *IntervalList[*ChunkView], fileSize int64, prefetchCount int) *ChunkReadAt {
|
|
|
|
return &ChunkReadAt{
|
|
chunkViews: chunkViews,
|
|
fileSize: fileSize,
|
|
readerCache: readerCache,
|
|
readerPattern: NewReaderPattern(),
|
|
prefetchCount: prefetchCount,
|
|
ctx: ctx,
|
|
}
|
|
}
|
|
|
|
// Size returns the total logical file size in bytes, as recorded when the
// reader was created.
func (c *ChunkReadAt) Size() int64 {
	return c.fileSize
}
|
|
|
|
// Close implements io.Closer by destroying the underlying reader cache.
// It always returns nil.
func (c *ChunkReadAt) Close() error {
	c.readerCache.destroy()
	return nil
}
|
|
|
|
// ReadAt implements io.ReaderAt. It records the access in the read-pattern
// tracker (which decides between sequential and random read strategies), then
// reads len(p) bytes starting at offset under a read lock on the chunk view
// list. The chunk modification timestamp from doReadAt is discarded here;
// use ReadAtWithTime when it is needed. The stored c.ctx governs cancellation.
func (c *ChunkReadAt) ReadAt(p []byte, offset int64) (n int, err error) {

	c.readerPattern.MonitorReadAt(offset, len(p))

	c.chunkViews.Lock.RLock()
	defer c.chunkViews.Lock.RUnlock()

	// glog.V(4).Infof("ReadAt [%d,%d) of total file size %d bytes %d chunk views", offset, offset+int64(len(p)), c.fileSize, len(c.chunkViews))
	n, _, err = c.doReadAt(c.ctx, p, offset)
	return
}
|
|
|
|
// ReadAtWithTime behaves like ReadAt but uses the caller-supplied context and
// additionally returns the modification timestamp (ns) of the last chunk read,
// as reported by doReadAt.
func (c *ChunkReadAt) ReadAtWithTime(ctx context.Context, p []byte, offset int64) (n int, ts int64, err error) {

	c.readerPattern.MonitorReadAt(offset, len(p))

	c.chunkViews.Lock.RLock()
	defer c.chunkViews.Lock.RUnlock()

	// glog.V(4).Infof("ReadAt [%d,%d) of total file size %d bytes %d chunk views", offset, offset+int64(len(p)), c.fileSize, len(c.chunkViews))
	return c.doReadAt(ctx, p, offset)
}
|
|
|
|
// doReadAt copies the file range [offset, offset+len(p)) into p by walking the
// chunk view list in order. Gaps between chunks (sparse regions) are filled
// with zeros. It returns the number of bytes written to p, the modification
// timestamp (ns) of the last chunk touched, and io.EOF when the read reaches
// or passes the end of the file.
//
// Callers must hold at least a read lock on c.chunkViews.Lock.
func (c *ChunkReadAt) doReadAt(ctx context.Context, p []byte, offset int64) (n int, ts int64, err error) {

	startOffset, remaining := offset, int64(len(p))
	var nextChunks *Interval[*ChunkView]
	for x := c.chunkViews.Front(); x != nil; x = x.Next {
		chunk := x.Value
		if remaining <= 0 {
			break
		}
		// Track the upcoming chunk so readChunkSliceAt can prefetch ahead.
		if x.Next != nil {
			nextChunks = x.Next
		}
		if startOffset < chunk.ViewOffset {
			// Sparse gap before this chunk: zero-fill it in the output buffer.
			gap := chunk.ViewOffset - startOffset
			glog.V(4).Infof("zero [%d,%d)", startOffset, chunk.ViewOffset)
			n += zero(p, startOffset-offset, gap)
			startOffset, remaining = chunk.ViewOffset, remaining-gap
			if remaining <= 0 {
				break
			}
		}
		// fmt.Printf(">>> doReadAt [%d,%d), chunk[%d,%d)\n", offset, offset+int64(len(p)), chunk.ViewOffset, chunk.ViewOffset+int64(chunk.ViewSize))
		// Intersect the requested range with this chunk's view; skip chunks
		// that end before the current position.
		chunkStart, chunkStop := max(chunk.ViewOffset, startOffset), min(chunk.ViewOffset+int64(chunk.ViewSize), startOffset+remaining)
		if chunkStart >= chunkStop {
			continue
		}
		// glog.V(4).Infof("read [%d,%d), %d/%d chunk %s [%d,%d)", chunkStart, chunkStop, i, len(c.chunkViews), chunk.FileId, chunk.ViewOffset-chunk.Offset, chunk.ViewOffset-chunk.Offset+int64(chunk.ViewSize))
		// bufferOffset is the read position inside the chunk itself.
		bufferOffset := chunkStart - chunk.ViewOffset + chunk.OffsetInChunk
		ts = chunk.ModifiedTsNs
		copied, err := c.readChunkSliceAt(ctx, p[startOffset-offset:chunkStop-chunkStart+startOffset-offset], chunk, nextChunks, uint64(bufferOffset))
		if err != nil {
			glog.Errorf("fetching chunk %+v: %v\n", chunk, err)
			return copied, ts, err
		}

		n += copied
		startOffset, remaining = startOffset+int64(copied), remaining-int64(copied)
	}

	// glog.V(4).Infof("doReadAt [%d,%d), n:%v, err:%v", offset, offset+int64(len(p)), n, err)

	// zero the remaining bytes if a gap exists at the end of the last chunk (or a fully sparse file)
	if err == nil && remaining > 0 {
		var delta int64
		if c.fileSize >= startOffset {
			delta = min(remaining, c.fileSize-startOffset)
			// Convert startOffset from a file offset to a buffer index for zero().
			startOffset -= offset
		}
		if delta > 0 {
			glog.V(4).Infof("zero2 [%d,%d) of file size %d bytes", startOffset, startOffset+delta, c.fileSize)
			n += zero(p, startOffset, delta)
		}
	}

	if err == nil && offset+int64(len(p)) >= c.fileSize {
		err = io.EOF
	}
	// fmt.Printf("~~~ filled %d, err: %v\n\n", n, err)

	return

}
|
|
|
|
// readChunkSliceAt fills buffer with data from chunkView, starting at offset
// bytes into the chunk. In random-read mode it serves from the local chunk
// cache when possible and otherwise fetches only the requested byte range.
// In sequential mode it reads through the ReaderCache, evicts the previously
// consumed chunk, and triggers prefetching of upcoming chunks.
func (c *ChunkReadAt) readChunkSliceAt(ctx context.Context, buffer []byte, chunkView *ChunkView, nextChunkViews *Interval[*ChunkView], offset uint64) (n int, err error) {

	if c.readerPattern.IsRandomMode() {
		// Random reads: try the chunk cache first; any partial hit (n > 0) is
		// returned as-is, otherwise fetch just the needed range remotely.
		n, err := c.readerCache.chunkCache.ReadChunkAt(buffer, chunkView.FileId, offset)
		if n > 0 {
			return n, err
		}
		return fetchChunkRange(ctx, buffer, c.readerCache.lookupFileIdFn, chunkView.FileId, chunkView.CipherKey, chunkView.IsGzipped, int64(offset))
	}

	// Cache the chunk only if its end position in the file falls within the
	// cache's maximum cacheable file-part size.
	shouldCache := (uint64(chunkView.ViewOffset) + chunkView.ChunkSize) <= c.readerCache.chunkCache.GetMaxFilePartSizeInCache()
	n, err = c.readerCache.ReadChunkAt(buffer, chunkView.FileId, chunkView.CipherKey, chunkView.IsGzipped, int64(offset), int(chunkView.ChunkSize), shouldCache)
	if c.lastChunkFid != chunkView.FileId {
		if chunkView.OffsetInChunk == 0 { // start of a new chunk
			if c.lastChunkFid != "" {
				// The previous chunk has been fully consumed; release its cache slot.
				c.readerCache.UnCache(c.lastChunkFid)
			}
			if nextChunkViews != nil && c.prefetchCount > 0 {
				// Prefetch multiple chunks ahead for better sequential read throughput
				// This keeps the network pipeline full with parallel chunk fetches
				c.readerCache.MaybeCache(nextChunkViews, c.prefetchCount)
			}
		}
	}
	c.lastChunkFid = chunkView.FileId
	return
}
|
|
|
|
// zero fills buffer[start:start+length) with zero bytes, clamping the range to
// the bounds of the buffer. It returns the number of bytes actually zeroed,
// which may be less than length, and 0 when length is not positive or the
// requested range lies entirely outside the buffer. doReadAt uses it to
// materialize sparse gaps between chunks.
func zero(buffer []byte, start, length int64) int {
	if length <= 0 {
		return 0
	}
	// Intersect [start, start+length) with [0, len(buffer)).
	end := min(start+length, int64(len(buffer)))
	start = max(start, 0)
	// Guard against a degenerate range (start beyond the buffer end, or the
	// whole request before index 0). Without this guard the previous version
	// returned a negative count (end-start < 0), corrupting the caller's
	// byte accounting.
	if end <= start {
		return 0
	}
	clear(buffer[start:end]) // lowers to memclr; equivalent to the old byte loop
	return int(end - start)
}
|