* filer: remove lock contention during chunk download

This addresses issue #7504, where a single weed mount FUSE instance does not fully utilize node network bandwidth when reading large files. The SingleChunkCacher was holding a mutex during the entire HTTP download, causing readers to block until the download completed. This serialized chunk reads even when multiple goroutines were downloading in parallel.

Changes:
- Add sync.Cond to SingleChunkCacher for efficient waiting
- Move the HTTP download outside the critical section in startCaching()
- Use the condition variable in readChunkAt() to wait for download completion
- Add an isComplete flag to track download state

Multiple chunk downloads can now proceed truly in parallel, and readers wait efficiently on the condition variable instead of blocking on a mutex held during I/O operations.

Ref: #7504

* filer: parallel chunk fetching within doReadAt

This addresses issue #7504 by enabling parallel chunk downloads within a single read operation. Previously, doReadAt() processed chunks sequentially in a loop, so each chunk had to be fully downloaded before the next one started. This left significant network bandwidth unused when chunks resided on different volume servers.

Changes:
- Collect all chunk read tasks upfront
- Use errgroup to fetch multiple chunks in parallel
- Read each chunk directly into its correct position in the output buffer
- Limit concurrency to prefetchCount (minimum 4) to avoid overwhelming the system
- Handle gaps and zero-filling before the parallel fetch
- Trigger prefetch after the parallel reads complete

For a read spanning N chunks on different volume servers, this can now utilize up to N times the bandwidth of a single connection.

Ref: #7504

* http: direct buffer read to reduce memory copies

This addresses issue #7504 by reducing memory copy overhead during chunk downloads. Previously, RetriedFetchChunkData used ReadUrlAsStream, which:
1. Allocated a 64KB intermediate buffer
2. Read data in 64KB chunks
3. Called a callback to copy each chunk to the destination

For a 16MB chunk, this meant 256 copy operations plus the callback overhead. Profiling showed significant time spent in memmove.

Changes:
- Add readUrlDirectToBuffer(), which reads directly into the destination
- Add retriedFetchChunkDataDirect() for unencrypted, non-gzipped chunks
- Automatically use the direct read path when possible (cipher=nil, gzip=false)
- Use http.NewRequestWithContext for proper cancellation

For unencrypted chunks (the common case), this eliminates the intermediate buffer entirely, reading HTTP response bytes directly into the final destination buffer (see the direct-read sketch below).

Ref: #7504

* address review comments

- Use a channel (done) instead of sync.Cond for download completion signaling; this integrates better with context cancellation patterns
- Remove the redundant groupErr check in reader_at.go (errors are already captured in task.err)
- Remove the buggy URL encoding logic from retriedFetchChunkDataDirect (the existing url.PathEscape on the full URL is a pre-existing bug that should be fixed separately)

* address review comments (round 2)

- Return io.ErrUnexpectedEOF when the HTTP response is truncated; this prevents silent data corruption from incomplete reads
- Simplify errgroup error handling by using the g.Wait() error directly, removing the redundant task.err field and the manual error aggregation loop
- Define a minReadConcurrency constant instead of the magic number 4, improving readability and maintainability

Note: Context propagation to startCaching() is intentionally NOT changed. The downloaded chunk is a shared resource that may be used by multiple readers. Using context.Background() ensures the download completes even if one reader cancels, preventing data loss for other waiting readers.
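For reference, a minimal sketch of the direct-read idea follows. It is not the actual implementation: the package name, readIntoBufferSketch, and the use of http.DefaultClient are assumptions for illustration, and the real readUrlDirectToBuffer() additionally injects a request ID and sits behind the existing retry logic. What it shows is reading the response body straight into the caller's buffer with io.ReadFull and reporting a truncated response as io.ErrUnexpectedEOF:

	package fetchsketch

	import (
		"context"
		"fmt"
		"io"
		"net/http"
	)

	// readIntoBufferSketch reads the body for fileUrl directly into buf,
	// with no intermediate 64KB staging buffer and no per-chunk copy callback.
	func readIntoBufferSketch(ctx context.Context, fileUrl string, buf []byte) (int, error) {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, fileUrl, nil)
		if err != nil {
			return 0, err
		}
		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			return 0, err
		}
		defer resp.Body.Close()
		if resp.StatusCode >= 400 {
			return 0, fmt.Errorf("read %s: unexpected status %s", fileUrl, resp.Status)
		}
		// io.ReadFull copies response bytes straight into buf; a short response
		// surfaces as io.ErrUnexpectedEOF instead of a silent partial read.
		n, err := io.ReadFull(resp.Body, buf)
		if err == io.EOF {
			err = io.ErrUnexpectedEOF // no bytes at all for a non-empty buffer
		}
		return n, err
	}

Callers size buf to the chunk being fetched (for example, make([]byte, chunkSize)), so a successful io.ReadFull fills it exactly.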
* http: inject request ID for observability in direct read path

Add a request_id.InjectToRequest() call to readUrlDirectToBuffer() for consistency with the ReadUrlAsStream path. This ensures full-chunk reads carry the same tracing/correlation headers for server logs and metrics.

* filer: consistent timestamp handling in sequential read path

Use max(ts, task.chunk.ModifiedTsNs) in the sequential path to match the parallel path's behavior. Also update ts before the error check so that, on failure, the returned timestamp reflects the max of all chunks processed so far.

* filer: document why context.Background() is used in startCaching

Add a comment explaining the intentional design decision: the downloaded chunk is a shared resource that may be used by multiple concurrent readers. Using context.Background() ensures the download completes even if one reader cancels, preventing errors for other waiting readers.

* filer: propagate context for reader cancellation

Address review comment: pass the context through the ReadChunkAt call chain so that a reader can cancel its wait for a download. The key distinction is:
- Download uses context.Background() - shared resource, always completes
- Reader wait uses the request context - can be cancelled individually

If a reader cancels, it stops waiting and returns ctx.Err(), but the download continues to completion for other readers waiting on the same chunk. This properly handles the shared-resource semantics while still allowing individual reader cancellation.

* filer: use defer for close(done) to guarantee signal on panic

Move close(s.done) to a defer statement at the start of startCaching() to ensure the completion signal is always sent, even if an unexpected panic occurs. This prevents readers from blocking indefinitely.

* filer: remove unnecessary code

- Remove close(s.cacheStartedCh) in destroy() - the channel is only used for one-time synchronization, so closing it provides no benefit
- Remove the task := task loop variable capture - Go 1.22+ fixed loop variable semantics, so this capture is no longer necessary (go.mod specifies Go 1.24.0)

* filer: restore fallback to chunkCache when cacher returns no data

Fix a critical issue where ReadChunkAt would return 0, nil immediately if the SingleChunkCacher couldn't provide data for the requested offset, without trying the chunkCache fallback. Now, if cacher.readChunkAt returns n=0 and err=nil, we fall through and try the chunkCache.

* filer: add comprehensive tests for ReaderCache

Tests cover:
- Context cancellation while waiting for a download
- Fallback to chunkCache when the cacher returns n=0, err=nil
- Multiple concurrent readers waiting for the same chunk
- Partial reads at different offsets
- Downloader cleanup when exceeding the cache limit
- Done channel signaling (no hangs on completion)

* filer: prioritize done channel over context cancellation

If the data is already available (done channel closed), return it even if the reader's context is also cancelled. This avoids unnecessary errors when the download has already completed (see the wait/cancel sketch below).

* filer: add lookup error test and document test limitations

Add TestSingleChunkCacherLookupError to test error handling when the lookup fails. Document that full HTTP integration tests for SingleChunkCacher require global HTTP client initialization, which is complex in unit tests. The download path is tested via FUSE integration tests.
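The wait/cancel behaviour described above can be summarized with a small sketch. Only the done channel and the startCaching()/readChunkAt() names come from the change; the type name, package, and the buf/err fields are placeholders, since the real SingleChunkCacher lives in reader_cache.go rather than in this file:

	package cachersketch

	import (
		"context"
		"sync"
	)

	type cacherSketch struct {
		sync.Mutex
		buf  []byte        // downloaded chunk bytes (illustrative field)
		err  error         // download error, if any (illustrative field)
		done chan struct{} // closed exactly once when the download attempt finishes
	}

	func (s *cacherSketch) startCaching() {
		defer close(s.done) // guarantee the completion signal, even on panic

		// The download itself runs with context.Background(): the chunk is a shared
		// resource, so it completes even if the reader that triggered it cancels.
		// ... fetch bytes into s.buf, record any failure in s.err ...
	}

	func (s *cacherSketch) readChunkAt(ctx context.Context, p []byte, offset uint64) (int, error) {
		select {
		case <-s.done:
			// Data (or a download error) is already available; prefer it even if
			// the reader's context has also been cancelled.
		default:
			select {
			case <-s.done:
			case <-ctx.Done():
				return 0, ctx.Err() // this reader stops waiting; others keep waiting
			}
		}

		s.Lock()
		defer s.Unlock()
		if s.err != nil {
			return 0, s.err
		}
		if offset >= uint64(len(s.buf)) {
			return 0, nil // the caller falls back to the shared chunkCache
		}
		return copy(p, s.buf[offset:]), nil
	}

The nested select is what "prioritize done channel over context cancellation" means in practice: a completed download wins over a cancelled reader, and a cancelled reader never aborts the shared download.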
* filer: add tests that exercise SingleChunkCacher concurrency logic

Add tests that use a blocking lookupFileIdFn to exercise the actual SingleChunkCacher wait/cancellation logic:
- TestSingleChunkCacherContextCancellationDuringLookup: tests reader cancellation while the lookup is blocked
- TestSingleChunkCacherMultipleReadersWaitForDownload: tests multiple readers waiting on the same download
- TestSingleChunkCacherOneReaderCancelsOthersContinue: tests that when one reader cancels, other readers continue waiting

These tests properly exercise the done channel wait/cancel logic without requiring HTTP calls - the blocking lookup simulates a slow download (see the sketch below).
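A rough sketch of the blocking-lookup technique these tests rely on is shown below. Only the lookup signature (matching wdclient.LookupFileIdFunctionType) is taken from the code; the test name, package, and the ReaderCache wiring indicated in comments are illustrative:

	package cachersketch

	import (
		"context"
		"errors"
		"testing"
		"time"
	)

	func TestBlockingLookupSketch(t *testing.T) {
		release := make(chan struct{})

		// Stands in for a slow volume lookup / chunk download.
		blockingLookup := func(ctx context.Context, fileId string) ([]string, error) {
			select {
			case <-release: // the test decides when the "download" may proceed
				return nil, errors.New("no locations for " + fileId)
			case <-ctx.Done():
				return nil, ctx.Err()
			}
		}
		_ = blockingLookup // would be passed as lookupFileIdFn to the ReaderCache under test

		// A reader with a short deadline stops waiting with ctx.Err() while the
		// downloader goroutine is still blocked inside the lookup.
		ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
		defer cancel()
		<-ctx.Done() // in the real tests, this is ReadChunkAt(ctx, ...) returning ctx.Err()

		close(release) // let the shared download attempt finish for any remaining readers
	}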
368 lines · 12 KiB · Go
package filer

import (
	"context"
	"fmt"
	"io"
	"math/rand"
	"sync"

	"golang.org/x/sync/errgroup"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
	"github.com/seaweedfs/seaweedfs/weed/util"
	"github.com/seaweedfs/seaweedfs/weed/wdclient"
)

// DefaultPrefetchCount is the default number of chunks to prefetch ahead during
// sequential reads. This value is used when prefetch count is not explicitly
// configured (e.g., WebDAV, query engine, message queue). For mount operations,
// the prefetch count is derived from the -concurrentReaders option.
const DefaultPrefetchCount = 4

// minReadConcurrency is the minimum number of parallel chunk fetches.
// This ensures at least some parallelism even when prefetchCount is low,
// improving throughput for reads spanning multiple chunks.
const minReadConcurrency = 4

type ChunkReadAt struct {
	masterClient  *wdclient.MasterClient
	chunkViews    *IntervalList[*ChunkView]
	fileSize      int64
	readerCache   *ReaderCache
	readerPattern *ReaderPattern
	lastChunkFid  string
	prefetchCount int             // Number of chunks to prefetch ahead during sequential reads
	ctx           context.Context // Context used for cancellation during chunk read operations
}

var _ = io.ReaderAt(&ChunkReadAt{})
var _ = io.Closer(&ChunkReadAt{})

// LookupFn creates a basic volume location lookup function with simple caching.
//
// Deprecated: Use wdclient.FilerClient instead. This function has several limitations compared to wdclient.FilerClient:
//   - Simple bounded cache (10k entries, no eviction policy or TTL for stale entries)
//   - No singleflight deduplication (concurrent requests for same volume will duplicate work)
//   - No cache history for volume moves (no fallback chain when volumes migrate)
//   - No high availability (single filer address, no automatic failover)
//
// For NEW code, especially mount operations, use wdclient.FilerClient instead:
//
//	filerClient := wdclient.NewFilerClient(filerAddresses, grpcDialOption, dataCenter, opts)
//	lookupFn := filerClient.GetLookupFileIdFunction()
//
// This provides:
//   - Bounded cache with configurable size
//   - Singleflight deduplication of concurrent lookups
//   - Cache history when volumes move
//   - Battle-tested vidMap with cache chain
//
// This function is kept for backward compatibility with existing code paths
// (shell commands, streaming, etc.) but should be avoided in long-running processes
// or multi-tenant deployments where unbounded memory growth is a concern.
//
// Maximum recommended cache entries: ~10,000 volumes per process.
// Beyond this, consider migrating to wdclient.FilerClient.
func LookupFn(filerClient filer_pb.FilerClient) wdclient.LookupFileIdFunctionType {

	vidCache := make(map[string]*filer_pb.Locations)
	var vidCacheLock sync.RWMutex
	cacheSize := 0
	const maxCacheSize = 10000 // Simple bound to prevent unbounded growth

	return func(ctx context.Context, fileId string) (targetUrls []string, err error) {
		vid := VolumeId(fileId)
		vidCacheLock.RLock()
		locations, found := vidCache[vid]
		vidCacheLock.RUnlock()

		if !found {
			util.Retry("lookup volume "+vid, func() error {
				err = filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
					resp, err := client.LookupVolume(ctx, &filer_pb.LookupVolumeRequest{
						VolumeIds: []string{vid},
					})
					if err != nil {
						return err
					}

					locations = resp.LocationsMap[vid]
					if locations == nil || len(locations.Locations) == 0 {
						glog.V(0).InfofCtx(ctx, "failed to locate %s", fileId)
						return fmt.Errorf("failed to locate %s", fileId)
					}
					vidCacheLock.Lock()
					// Simple size limit to prevent unbounded growth
					// For proper cache management, use wdclient.FilerClient instead
					if cacheSize < maxCacheSize {
						vidCache[vid] = locations
						cacheSize++
					} else if cacheSize == maxCacheSize {
						glog.Warningf("filer.LookupFn cache reached limit of %d volumes, not caching new entries. Consider migrating to wdclient.FilerClient for bounded cache management.", maxCacheSize)
						cacheSize++ // Only log once
					}
					vidCacheLock.Unlock()

					return nil
				})
				return err
			})
		}

		if err != nil {
			return nil, err
		}

		fcDataCenter := filerClient.GetDataCenter()
		var sameDcTargetUrls, otherTargetUrls []string
		for _, loc := range locations.Locations {
			volumeServerAddress := filerClient.AdjustedUrl(loc)
			targetUrl := fmt.Sprintf("http://%s/%s", volumeServerAddress, fileId)
			if fcDataCenter == "" || fcDataCenter != loc.DataCenter {
				otherTargetUrls = append(otherTargetUrls, targetUrl)
			} else {
				sameDcTargetUrls = append(sameDcTargetUrls, targetUrl)
			}
		}
		rand.Shuffle(len(sameDcTargetUrls), func(i, j int) {
			sameDcTargetUrls[i], sameDcTargetUrls[j] = sameDcTargetUrls[j], sameDcTargetUrls[i]
		})
		rand.Shuffle(len(otherTargetUrls), func(i, j int) {
			otherTargetUrls[i], otherTargetUrls[j] = otherTargetUrls[j], otherTargetUrls[i]
		})
		// Prefer same data center
		targetUrls = append(sameDcTargetUrls, otherTargetUrls...)
		return
	}
}

func NewChunkReaderAtFromClient(ctx context.Context, readerCache *ReaderCache, chunkViews *IntervalList[*ChunkView], fileSize int64, prefetchCount int) *ChunkReadAt {

	return &ChunkReadAt{
		chunkViews:    chunkViews,
		fileSize:      fileSize,
		readerCache:   readerCache,
		readerPattern: NewReaderPattern(),
		prefetchCount: prefetchCount,
		ctx:           ctx,
	}
}

func (c *ChunkReadAt) Size() int64 {
	return c.fileSize
}

func (c *ChunkReadAt) Close() error {
	c.readerCache.destroy()
	return nil
}

func (c *ChunkReadAt) ReadAt(p []byte, offset int64) (n int, err error) {

	c.readerPattern.MonitorReadAt(offset, len(p))

	c.chunkViews.Lock.RLock()
	defer c.chunkViews.Lock.RUnlock()

	// glog.V(4).Infof("ReadAt [%d,%d) of total file size %d bytes %d chunk views", offset, offset+int64(len(p)), c.fileSize, len(c.chunkViews))
	n, _, err = c.doReadAt(c.ctx, p, offset)
	return
}

func (c *ChunkReadAt) ReadAtWithTime(ctx context.Context, p []byte, offset int64) (n int, ts int64, err error) {

	c.readerPattern.MonitorReadAt(offset, len(p))

	c.chunkViews.Lock.RLock()
	defer c.chunkViews.Lock.RUnlock()

	// glog.V(4).Infof("ReadAt [%d,%d) of total file size %d bytes %d chunk views", offset, offset+int64(len(p)), c.fileSize, len(c.chunkViews))
	return c.doReadAt(ctx, p, offset)
}

// chunkReadTask represents a single chunk read operation for parallel processing
type chunkReadTask struct {
	chunk        *ChunkView
	bufferStart  int64  // start position in the output buffer
	bufferEnd    int64  // end position in the output buffer
	chunkOffset  uint64 // offset within the chunk to read from
	bytesRead    int
	modifiedTsNs int64
}

func (c *ChunkReadAt) doReadAt(ctx context.Context, p []byte, offset int64) (n int, ts int64, err error) {

	// Collect all chunk read tasks
	var tasks []*chunkReadTask
	var gaps []struct{ start, length int64 } // gaps that need zero-filling

	startOffset, remaining := offset, int64(len(p))
	var lastChunk *Interval[*ChunkView]

	for x := c.chunkViews.Front(); x != nil; x = x.Next {
		chunk := x.Value
		if remaining <= 0 {
			break
		}
		lastChunk = x

		// Handle gap before this chunk
		if startOffset < chunk.ViewOffset {
			gap := chunk.ViewOffset - startOffset
			gaps = append(gaps, struct{ start, length int64 }{startOffset - offset, gap})
			startOffset, remaining = chunk.ViewOffset, remaining-gap
			if remaining <= 0 {
				break
			}
		}

		chunkStart, chunkStop := max(chunk.ViewOffset, startOffset), min(chunk.ViewOffset+int64(chunk.ViewSize), startOffset+remaining)
		if chunkStart >= chunkStop {
			continue
		}

		bufferOffset := chunkStart - chunk.ViewOffset + chunk.OffsetInChunk
		tasks = append(tasks, &chunkReadTask{
			chunk:       chunk,
			bufferStart: startOffset - offset,
			bufferEnd:   chunkStop - chunkStart + startOffset - offset,
			chunkOffset: uint64(bufferOffset),
		})

		startOffset, remaining = chunkStop, remaining-(chunkStop-chunkStart)
	}

	// Zero-fill gaps
	for _, gap := range gaps {
		glog.V(4).Infof("zero [%d,%d)", offset+gap.start, offset+gap.start+gap.length)
		n += zero(p, gap.start, gap.length)
	}

	// If only one chunk or random access mode, use sequential reading
	if len(tasks) <= 1 || c.readerPattern.IsRandomMode() {
		for _, task := range tasks {
			copied, readErr := c.readChunkSliceAt(ctx, p[task.bufferStart:task.bufferEnd], task.chunk, nil, task.chunkOffset)
			ts = max(ts, task.chunk.ModifiedTsNs)
			if readErr != nil {
				glog.Errorf("fetching chunk %+v: %v\n", task.chunk, readErr)
				return n + copied, ts, readErr
			}
			n += copied
		}
	} else {
		// Parallel chunk fetching for multiple chunks
		// This significantly improves throughput when chunks are on different volume servers
		g, gCtx := errgroup.WithContext(ctx)

		// Limit concurrency to avoid overwhelming the system
		concurrency := c.prefetchCount
		if concurrency < minReadConcurrency {
			concurrency = minReadConcurrency
		}
		if concurrency > len(tasks) {
			concurrency = len(tasks)
		}
		g.SetLimit(concurrency)

		for _, task := range tasks {
			g.Go(func() error {
				// Read directly into the correct position in the output buffer
				copied, readErr := c.readChunkSliceAtForParallel(gCtx, p[task.bufferStart:task.bufferEnd], task.chunk, task.chunkOffset)
				task.bytesRead = copied
				task.modifiedTsNs = task.chunk.ModifiedTsNs
				return readErr
			})
		}

		// Wait for all chunk reads to complete
		if waitErr := g.Wait(); waitErr != nil {
			err = waitErr
		}

		// Aggregate results (order is preserved since we read directly into buffer positions)
		for _, task := range tasks {
			n += task.bytesRead
			ts = max(ts, task.modifiedTsNs)
		}

		if err != nil {
			return n, ts, err
		}
	}

	// Trigger prefetch for sequential reads
	if lastChunk != nil && lastChunk.Next != nil && c.prefetchCount > 0 && !c.readerPattern.IsRandomMode() {
		c.readerCache.MaybeCache(lastChunk.Next, c.prefetchCount)
	}

	// Zero the remaining bytes if a gap exists at the end
	if remaining > 0 {
		var delta int64
		if c.fileSize >= startOffset {
			delta = min(remaining, c.fileSize-startOffset)
			bufStart := startOffset - offset
			if delta > 0 {
				glog.V(4).Infof("zero2 [%d,%d) of file size %d bytes", startOffset, startOffset+delta, c.fileSize)
				n += zero(p, bufStart, delta)
			}
		}
	}

	if err == nil && offset+int64(len(p)) >= c.fileSize {
		err = io.EOF
	}

	return
}

func (c *ChunkReadAt) readChunkSliceAt(ctx context.Context, buffer []byte, chunkView *ChunkView, nextChunkViews *Interval[*ChunkView], offset uint64) (n int, err error) {

	if c.readerPattern.IsRandomMode() {
		n, err := c.readerCache.chunkCache.ReadChunkAt(buffer, chunkView.FileId, offset)
		if n > 0 {
			return n, err
		}
		return fetchChunkRange(ctx, buffer, c.readerCache.lookupFileIdFn, chunkView.FileId, chunkView.CipherKey, chunkView.IsGzipped, int64(offset))
	}

	shouldCache := (uint64(chunkView.ViewOffset) + chunkView.ChunkSize) <= c.readerCache.chunkCache.GetMaxFilePartSizeInCache()
	n, err = c.readerCache.ReadChunkAt(ctx, buffer, chunkView.FileId, chunkView.CipherKey, chunkView.IsGzipped, int64(offset), int(chunkView.ChunkSize), shouldCache)
	if c.lastChunkFid != chunkView.FileId {
		if chunkView.OffsetInChunk == 0 { // start of a new chunk
			if c.lastChunkFid != "" {
				c.readerCache.UnCache(c.lastChunkFid)
			}
			if nextChunkViews != nil && c.prefetchCount > 0 {
				// Prefetch multiple chunks ahead for better sequential read throughput
				// This keeps the network pipeline full with parallel chunk fetches
				c.readerCache.MaybeCache(nextChunkViews, c.prefetchCount)
			}
		}
	}
	c.lastChunkFid = chunkView.FileId
	return
}

// readChunkSliceAtForParallel is a simplified version for parallel chunk fetching
// It doesn't update lastChunkFid or trigger prefetch (handled by the caller)
func (c *ChunkReadAt) readChunkSliceAtForParallel(ctx context.Context, buffer []byte, chunkView *ChunkView, offset uint64) (n int, err error) {
	shouldCache := (uint64(chunkView.ViewOffset) + chunkView.ChunkSize) <= c.readerCache.chunkCache.GetMaxFilePartSizeInCache()
	return c.readerCache.ReadChunkAt(ctx, buffer, chunkView.FileId, chunkView.CipherKey, chunkView.IsGzipped, int64(offset), int(chunkView.ChunkSize), shouldCache)
}

func zero(buffer []byte, start, length int64) int {
	if length <= 0 {
		return 0
	}
	end := min(start+length, int64(len(buffer)))
	start = max(start, 0)

	// zero the bytes
	for o := start; o < end; o++ {
		buffer[o] = 0
	}
	return int(end - start)
}