Files
seaweedFS/weed/filer/stream.go
Chris Lu 0798b274dd feat(s3): add concurrent chunk prefetch for large file downloads (#8917)
* feat(s3): add concurrent chunk prefetch for large file downloads

Add a pipe-based prefetch pipeline that overlaps chunk fetching with
response writing during S3 GetObject, SSE downloads, and filer proxy.

While chunk N streams to the HTTP response, fetch goroutines for the
next K chunks establish HTTP connections to volume servers ahead of
time, eliminating the RTT gap between sequential chunk fetches.

Uses io.Pipe for minimal memory overhead (~1MB per download regardless
of chunk size, vs buffering entire chunks). Also increases the
streaming read buffer from 64KB to 256KB to reduce syscall overhead.

Benchmark results (64KB chunks, prefetch=4):
- 0ms latency:  1058 → 2362 MB/s (2.2× faster)
- 5ms latency:  11.0 → 41.7 MB/s (3.8× faster)
- 10ms latency: 5.9  → 23.3 MB/s (4.0× faster)
- 20ms latency: 3.1  → 12.1 MB/s (3.9× faster)

* fix: address review feedback for prefetch pipeline

- Fix data race: use *chunkPipeResult (pointer) on channel to avoid
  copying struct while fetch goroutines write to it. Confirmed clean
  with -race detector.
- Remove concurrent map write: retryWithCacheInvalidation no longer
  updates fileId2Url map. Producer only reads it; consumer never writes.
- Use mem.Allocate/mem.Free for copy buffer to reduce GC pressure.
- Add local cancellable context so consumer errors (client disconnect)
  immediately stop the producer and all in-flight fetch goroutines.

* fix(test): remove dead code and add Range header support in test server

- Remove unused allData variable in makeChunksAndServer
- Add Range header handling to createTestServer for partial chunk
  read coverage (206 Partial Content, 416 Range Not Satisfiable)

* fix: correct retry condition and goroutine leak in prefetch pipeline

- Fix retry condition: use result.fetchErr/result.written instead of
  copied to decide cache-invalidation retry. The old condition wrongly
  triggered retry when the fetch succeeded but the response writer
  failed on the first write (copied==0 despite fetcher having data).
  Now matches the sequential path (stream.go:197) which checks whether
  the fetcher itself wrote zero bytes.

- Fix goroutine leak: when the producer's send to the results channel
  is interrupted by context cancellation, the fetch goroutine was
  already launched but the result was never sent to the channel. The
  drain loop couldn't handle it. Now waits on result.done before
  returning so every fetch goroutine is properly awaited.
2026-04-03 19:57:30 -07:00

531 lines
16 KiB
Go

package filer
import (
"bytes"
"context"
"fmt"
"io"
"math"
"strings"
"sync"
"time"
"slices"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
"github.com/seaweedfs/seaweedfs/weed/security"
"github.com/seaweedfs/seaweedfs/weed/stats"
"github.com/seaweedfs/seaweedfs/weed/util"
util_http "github.com/seaweedfs/seaweedfs/weed/util/http"
"github.com/seaweedfs/seaweedfs/weed/wdclient"
)
var getLookupFileIdBackoffSchedule = []time.Duration{
150 * time.Millisecond,
600 * time.Millisecond,
1800 * time.Millisecond,
}
var (
jwtSigningReadKey security.SigningKey
jwtSigningReadKeyExpires int
loadJwtConfigOnce sync.Once
)
func loadJwtConfig() {
v := util.GetViper()
jwtSigningReadKey = security.SigningKey(v.GetString("jwt.signing.read.key"))
jwtSigningReadKeyExpires = v.GetInt("jwt.signing.read.expires_after_seconds")
if jwtSigningReadKeyExpires == 0 {
jwtSigningReadKeyExpires = 60
}
}
// JwtForVolumeServer generates a JWT token for volume server read operations if jwt.signing.read is configured
func JwtForVolumeServer(fileId string) string {
loadJwtConfigOnce.Do(loadJwtConfig)
if len(jwtSigningReadKey) == 0 {
return ""
}
return string(security.GenJwtForVolumeServer(jwtSigningReadKey, jwtSigningReadKeyExpires, fileId))
}
func HasData(entry *filer_pb.Entry) bool {
if len(entry.Content) > 0 {
return true
}
return len(entry.GetChunks()) > 0
}
func IsSameData(a, b *filer_pb.Entry) bool {
if len(a.Content) > 0 || len(b.Content) > 0 {
return bytes.Equal(a.Content, b.Content)
}
return isSameChunks(a.Chunks, b.Chunks)
}
func isSameChunks(a, b []*filer_pb.FileChunk) bool {
if len(a) != len(b) {
return false
}
slices.SortFunc(a, func(i, j *filer_pb.FileChunk) int {
return strings.Compare(i.ETag, j.ETag)
})
slices.SortFunc(b, func(i, j *filer_pb.FileChunk) int {
return strings.Compare(i.ETag, j.ETag)
})
for i := 0; i < len(a); i++ {
if a[i].ETag != b[i].ETag {
return false
}
}
return true
}
func NewFileReader(filerClient filer_pb.FilerClient, entry *filer_pb.Entry) io.Reader {
if len(entry.Content) > 0 {
return bytes.NewReader(entry.Content)
}
return NewChunkStreamReader(filerClient, entry.GetChunks())
}
type DoStreamContent func(writer io.Writer) error
func PrepareStreamContent(masterClient wdclient.HasLookupFileIdFunction, jwtFunc VolumeServerJwtFunction, chunks []*filer_pb.FileChunk, offset int64, size int64) (DoStreamContent, error) {
return PrepareStreamContentWithThrottler(context.Background(), masterClient, jwtFunc, chunks, offset, size, 0)
}
type VolumeServerJwtFunction func(fileId string) string
type CacheInvalidator interface {
InvalidateCache(fileId string)
}
// urlSlicesEqual checks if two URL slices contain the same URLs (order-independent)
func urlSlicesEqual(a, b []string) bool {
if len(a) != len(b) {
return false
}
// Create a map to count occurrences in first slice
counts := make(map[string]int)
for _, url := range a {
counts[url]++
}
// Verify all URLs in second slice match
for _, url := range b {
if counts[url] == 0 {
return false
}
counts[url]--
}
return true
}
func PrepareStreamContentWithThrottler(ctx context.Context, masterClient wdclient.HasLookupFileIdFunction, jwtFunc VolumeServerJwtFunction, chunks []*filer_pb.FileChunk, offset int64, size int64, downloadMaxBytesPs int64) (DoStreamContent, error) {
glog.V(4).InfofCtx(ctx, "prepare to stream content for chunks: %d", len(chunks))
chunkViews := ViewFromChunks(ctx, masterClient.GetLookupFileIdFunction(), chunks, offset, size)
fileId2Url := make(map[string][]string)
for x := chunkViews.Front(); x != nil; x = x.Next {
chunkView := x.Value
var urlStrings []string
var err error
for _, backoff := range getLookupFileIdBackoffSchedule {
if err := ctx.Err(); err != nil {
return nil, err
}
urlStrings, err = masterClient.GetLookupFileIdFunction()(ctx, chunkView.FileId)
if err == nil && len(urlStrings) > 0 {
break
}
if err := ctx.Err(); err != nil {
return nil, err
}
glog.V(4).InfofCtx(ctx, "waiting for chunk: %s", chunkView.FileId)
timer := time.NewTimer(backoff)
select {
case <-ctx.Done():
if !timer.Stop() {
<-timer.C
}
return nil, ctx.Err()
case <-timer.C:
}
}
if err != nil {
glog.V(1).InfofCtx(ctx, "operation LookupFileId %s failed, err: %v", chunkView.FileId, err)
return nil, err
} else if len(urlStrings) == 0 {
errUrlNotFound := fmt.Errorf("operation LookupFileId %s failed, err: urls not found", chunkView.FileId)
glog.ErrorCtx(ctx, errUrlNotFound)
return nil, errUrlNotFound
}
fileId2Url[chunkView.FileId] = urlStrings
}
return func(writer io.Writer) error {
downloadThrottler := util.NewWriteThrottler(downloadMaxBytesPs)
remaining := size
for x := chunkViews.Front(); x != nil; x = x.Next {
chunkView := x.Value
if offset < chunkView.ViewOffset {
gap := chunkView.ViewOffset - offset
remaining -= gap
glog.V(4).InfofCtx(ctx, "zero [%d,%d)", offset, chunkView.ViewOffset)
err := writeZero(writer, gap)
if err != nil {
return fmt.Errorf("write zero [%d,%d)", offset, chunkView.ViewOffset)
}
offset = chunkView.ViewOffset
}
urlStrings := fileId2Url[chunkView.FileId]
start := time.Now()
jwt := jwtFunc(chunkView.FileId)
written, err := retriedStreamFetchChunkData(ctx, writer, urlStrings, jwt, chunkView.CipherKey, chunkView.IsGzipped, chunkView.IsFullChunk(), chunkView.OffsetInChunk, int(chunkView.ViewSize))
if err != nil && ctx.Err() != nil {
return ctx.Err()
}
// If read failed, try to invalidate cache and re-lookup
if err != nil && written == 0 {
if invalidator, ok := masterClient.(CacheInvalidator); ok {
glog.V(0).InfofCtx(ctx, "read chunk %s failed, invalidating cache and retrying", chunkView.FileId)
invalidator.InvalidateCache(chunkView.FileId)
// Re-lookup
newUrlStrings, lookupErr := masterClient.GetLookupFileIdFunction()(ctx, chunkView.FileId)
if lookupErr == nil && len(newUrlStrings) > 0 {
// Check if new URLs are different from old ones to avoid infinite retry
if !urlSlicesEqual(urlStrings, newUrlStrings) {
glog.V(0).InfofCtx(ctx, "retrying read chunk %s with new locations: %v", chunkView.FileId, newUrlStrings)
_, err = retriedStreamFetchChunkData(ctx, writer, newUrlStrings, jwt, chunkView.CipherKey, chunkView.IsGzipped, chunkView.IsFullChunk(), chunkView.OffsetInChunk, int(chunkView.ViewSize))
// Update the map so subsequent references use fresh URLs
if err == nil {
fileId2Url[chunkView.FileId] = newUrlStrings
}
} else {
glog.V(0).InfofCtx(ctx, "re-lookup returned same locations for chunk %s, skipping retry", chunkView.FileId)
}
} else {
if lookupErr != nil {
glog.WarningfCtx(ctx, "failed to re-lookup chunk %s after cache invalidation: %v", chunkView.FileId, lookupErr)
} else {
glog.WarningfCtx(ctx, "re-lookup for chunk %s returned no locations, skipping retry", chunkView.FileId)
}
}
}
}
offset += int64(chunkView.ViewSize)
remaining -= int64(chunkView.ViewSize)
stats.FilerRequestHistogram.WithLabelValues("chunkDownload").Observe(time.Since(start).Seconds())
if err != nil {
stats.FilerHandlerCounter.WithLabelValues("chunkDownloadError").Inc()
return fmt.Errorf("read chunk: %w", err)
}
stats.FilerHandlerCounter.WithLabelValues("chunkDownload").Inc()
downloadThrottler.MaybeSlowdown(int64(chunkView.ViewSize))
}
if remaining > 0 {
glog.V(4).InfofCtx(ctx, "zero [%d,%d)", offset, offset+remaining)
err := writeZero(writer, remaining)
if err != nil {
return fmt.Errorf("write zero [%d,%d)", offset, offset+remaining)
}
}
return nil
}, nil
}
// PrepareStreamContentWithPrefetch is like PrepareStreamContentWithThrottler but uses
// concurrent chunk prefetching to overlap network I/O. When prefetchAhead > 1, fetch
// goroutines establish HTTP connections to volume servers ahead of time, streaming data
// through io.Pipe with minimal memory overhead.
//
// prefetchAhead controls the number of chunks fetched concurrently:
// - 0 or 1: falls back to sequential fetching (same as PrepareStreamContentWithThrottler)
// - 2+: uses pipe-based prefetch pipeline with that many concurrent fetches
func PrepareStreamContentWithPrefetch(ctx context.Context, masterClient wdclient.HasLookupFileIdFunction, jwtFunc VolumeServerJwtFunction, chunks []*filer_pb.FileChunk, offset int64, size int64, downloadMaxBytesPs int64, prefetchAhead int) (DoStreamContent, error) {
if prefetchAhead <= 1 {
return PrepareStreamContentWithThrottler(ctx, masterClient, jwtFunc, chunks, offset, size, downloadMaxBytesPs)
}
glog.V(4).InfofCtx(ctx, "prepare to stream content with prefetch=%d for chunks: %d", prefetchAhead, len(chunks))
chunkViews := ViewFromChunks(ctx, masterClient.GetLookupFileIdFunction(), chunks, offset, size)
fileId2Url := make(map[string][]string)
for x := chunkViews.Front(); x != nil; x = x.Next {
chunkView := x.Value
var urlStrings []string
var err error
for _, backoff := range getLookupFileIdBackoffSchedule {
if err := ctx.Err(); err != nil {
return nil, err
}
urlStrings, err = masterClient.GetLookupFileIdFunction()(ctx, chunkView.FileId)
if err == nil && len(urlStrings) > 0 {
break
}
if err := ctx.Err(); err != nil {
return nil, err
}
glog.V(4).InfofCtx(ctx, "waiting for chunk: %s", chunkView.FileId)
timer := time.NewTimer(backoff)
select {
case <-ctx.Done():
if !timer.Stop() {
<-timer.C
}
return nil, ctx.Err()
case <-timer.C:
}
}
if err != nil {
glog.V(1).InfofCtx(ctx, "operation LookupFileId %s failed, err: %v", chunkView.FileId, err)
return nil, err
} else if len(urlStrings) == 0 {
errUrlNotFound := fmt.Errorf("operation LookupFileId %s failed, err: urls not found", chunkView.FileId)
glog.ErrorCtx(ctx, errUrlNotFound)
return nil, errUrlNotFound
}
fileId2Url[chunkView.FileId] = urlStrings
}
return func(writer io.Writer) error {
return streamChunksPrefetched(ctx, writer, chunkViews, fileId2Url, jwtFunc, masterClient, offset, size, downloadMaxBytesPs, prefetchAhead)
}, nil
}
func StreamContent(masterClient wdclient.HasLookupFileIdFunction, writer io.Writer, chunks []*filer_pb.FileChunk, offset int64, size int64) error {
streamFn, err := PrepareStreamContent(masterClient, JwtForVolumeServer, chunks, offset, size)
if err != nil {
return err
}
return streamFn(writer)
}
// ---------------- ReadAllReader ----------------------------------
func writeZero(w io.Writer, size int64) (err error) {
zeroPadding := make([]byte, 1024)
var written int
for size > 0 {
if size > 1024 {
written, err = w.Write(zeroPadding)
} else {
written, err = w.Write(zeroPadding[:size])
}
size -= int64(written)
if err != nil {
return
}
}
return
}
// ---------------- ChunkStreamReader ----------------------------------
type ChunkStreamReader struct {
head *Interval[*ChunkView]
chunkView *Interval[*ChunkView]
totalSize int64
logicOffset int64
buffer []byte
bufferOffset int64
bufferLock sync.Mutex
chunk string
lookupFileId wdclient.LookupFileIdFunctionType
}
var _ = io.ReadSeeker(&ChunkStreamReader{})
var _ = io.ReaderAt(&ChunkStreamReader{})
var _ = io.Closer(&ChunkStreamReader{})
func doNewChunkStreamReader(ctx context.Context, lookupFileIdFn wdclient.LookupFileIdFunctionType, chunks []*filer_pb.FileChunk) *ChunkStreamReader {
chunkViews := ViewFromChunks(ctx, lookupFileIdFn, chunks, 0, math.MaxInt64)
var totalSize int64
for x := chunkViews.Front(); x != nil; x = x.Next {
chunk := x.Value
totalSize += int64(chunk.ViewSize)
}
return &ChunkStreamReader{
head: chunkViews.Front(),
chunkView: chunkViews.Front(),
lookupFileId: lookupFileIdFn,
totalSize: totalSize,
}
}
func NewChunkStreamReaderFromFiler(ctx context.Context, masterClient *wdclient.MasterClient, chunks []*filer_pb.FileChunk) *ChunkStreamReader {
lookupFileIdFn := func(ctx context.Context, fileId string) (targetUrl []string, err error) {
return masterClient.LookupFileId(ctx, fileId)
}
return doNewChunkStreamReader(ctx, lookupFileIdFn, chunks)
}
// NewChunkStreamReaderFromLookup creates a ChunkStreamReader from a lookup function.
// Used by clients that already have a LookupFileIdFunctionType (e.g., from FilerSource).
func NewChunkStreamReaderFromLookup(ctx context.Context, lookupFn wdclient.LookupFileIdFunctionType, chunks []*filer_pb.FileChunk) *ChunkStreamReader {
return doNewChunkStreamReader(ctx, lookupFn, chunks)
}
func NewChunkStreamReader(filerClient filer_pb.FilerClient, chunks []*filer_pb.FileChunk) *ChunkStreamReader {
lookupFileIdFn := LookupFn(filerClient)
return doNewChunkStreamReader(context.Background(), lookupFileIdFn, chunks)
}
func (c *ChunkStreamReader) ReadAt(p []byte, off int64) (n int, err error) {
c.bufferLock.Lock()
defer c.bufferLock.Unlock()
if err = c.prepareBufferFor(off); err != nil {
return
}
c.logicOffset = off
return c.doRead(p)
}
func (c *ChunkStreamReader) Read(p []byte) (n int, err error) {
c.bufferLock.Lock()
defer c.bufferLock.Unlock()
return c.doRead(p)
}
func (c *ChunkStreamReader) doRead(p []byte) (n int, err error) {
// fmt.Printf("do read [%d,%d) at %s[%d,%d)\n", c.logicOffset, c.logicOffset+int64(len(p)), c.chunk, c.bufferOffset, c.bufferOffset+int64(len(c.buffer)))
for n < len(p) {
// println("read", c.logicOffset)
if err = c.prepareBufferFor(c.logicOffset); err != nil {
return
}
t := copy(p[n:], c.buffer[c.logicOffset-c.bufferOffset:])
n += t
c.logicOffset += int64(t)
}
return
}
func (c *ChunkStreamReader) isBufferEmpty() bool {
return len(c.buffer) <= int(c.logicOffset-c.bufferOffset)
}
func (c *ChunkStreamReader) Seek(offset int64, whence int) (int64, error) {
c.bufferLock.Lock()
defer c.bufferLock.Unlock()
var err error
switch whence {
case io.SeekStart:
case io.SeekCurrent:
offset += c.logicOffset
case io.SeekEnd:
offset = c.totalSize + offset
}
if offset > c.totalSize {
err = io.ErrUnexpectedEOF
} else {
c.logicOffset = offset
}
return offset, err
}
func insideChunk(offset int64, chunk *ChunkView) bool {
return chunk.ViewOffset <= offset && offset < chunk.ViewOffset+int64(chunk.ViewSize)
}
func (c *ChunkStreamReader) prepareBufferFor(offset int64) (err error) {
// stay in the same chunk
if c.bufferOffset <= offset && offset < c.bufferOffset+int64(len(c.buffer)) {
return nil
}
// glog.V(2).Infof("c.chunkView: %v buffer:[%d,%d) offset:%d totalSize:%d", c.chunkView, c.bufferOffset, c.bufferOffset+int64(len(c.buffer)), offset, c.totalSize)
// find a possible chunk view
p := c.chunkView
for p != nil {
chunk := p.Value
// glog.V(2).Infof("prepareBufferFor check chunk:[%d,%d)", chunk.ViewOffset, chunk.ViewOffset+int64(chunk.ViewSize))
if insideChunk(offset, chunk) {
if c.isBufferEmpty() || c.bufferOffset != chunk.ViewOffset {
c.chunkView = p
return c.fetchChunkToBuffer(chunk)
}
}
if offset < c.bufferOffset {
p = p.Prev
} else {
p = p.Next
}
}
return io.EOF
}
func (c *ChunkStreamReader) fetchChunkToBuffer(chunkView *ChunkView) error {
urlStrings, err := c.lookupFileId(context.Background(), chunkView.FileId)
if err != nil {
glog.V(1).Infof("operation LookupFileId %s failed, err: %v", chunkView.FileId, err)
return err
}
var buffer bytes.Buffer
var shouldRetry bool
jwt := JwtForVolumeServer(chunkView.FileId)
for _, urlString := range urlStrings {
shouldRetry, err = util_http.ReadUrlAsStream(context.Background(), urlString+"?readDeleted=true", jwt, chunkView.CipherKey, chunkView.IsGzipped, chunkView.IsFullChunk(), chunkView.OffsetInChunk, int(chunkView.ViewSize), func(data []byte) {
buffer.Write(data)
})
if !shouldRetry {
break
}
if err != nil {
glog.V(1).Infof("read %s failed, err: %v", chunkView.FileId, err)
buffer.Reset()
} else {
break
}
}
if err != nil {
return err
}
c.buffer = buffer.Bytes()
c.bufferOffset = chunkView.ViewOffset
c.chunk = chunkView.FileId
// glog.V(0).Infof("fetched %s [%d,%d)", chunkView.FileId, chunkView.ViewOffset, chunkView.ViewOffset+int64(chunkView.ViewSize))
return nil
}
func (c *ChunkStreamReader) Close() error {
c.bufferLock.Lock()
defer c.bufferLock.Unlock()
c.buffer = nil
c.head = nil
c.chunkView = nil
return nil
}
func VolumeId(fileId string) string {
lastCommaIndex := strings.LastIndex(fileId, ",")
if lastCommaIndex > 0 {
return fileId[:lastCommaIndex]
}
return fileId
}