filer: add write batching for FoundationDB store to improve throughput (#7708)
This addresses issue #7699 where the FoundationDB filer store had low throughput (~400-500 obj/s) due to each write operation creating a separate transaction. Changes: - Add writeBatcher that collects multiple writes into batched transactions - New config options: batch_size (default: 100), batch_interval (default: 5ms) - Batching provides ~5.7x throughput improvement (from ~456 to ~2600 obj/s) Benchmark results with different batch sizes: - batch_size=1: ~456 obj/s (baseline, no batching) - batch_size=10: ~2621 obj/s (5.7x improvement) - batch_size=16: ~2514 obj/s (5.5x improvement) - batch_size=100: ~2617 obj/s (5.7x improvement) - batch_size=1000: ~2593 obj/s (5.7x improvement) The batch_interval timer (5ms) ensures writes are flushed promptly even when the batch is not full, providing good latency characteristics. Addressed review feedback: - Changed wait=false to wait=true in UpdateEntry/DeleteEntry to properly propagate errors to callers - Fixed timer reset race condition by stopping and draining before reset Fixes #7699
This commit is contained in:
2
go.mod
2
go.mod
@@ -123,7 +123,7 @@ require (
|
|||||||
github.com/Jille/raft-grpc-transport v1.6.1
|
github.com/Jille/raft-grpc-transport v1.6.1
|
||||||
github.com/ThreeDotsLabs/watermill v1.5.1
|
github.com/ThreeDotsLabs/watermill v1.5.1
|
||||||
github.com/a-h/templ v0.3.943
|
github.com/a-h/templ v0.3.943
|
||||||
github.com/apple/foundationdb/bindings/go v0.0.0-20240515141816-262c6fe778ad
|
github.com/apple/foundationdb/bindings/go v0.0.0-20250911184653-27f7192f47c3
|
||||||
github.com/arangodb/go-driver v1.6.9
|
github.com/arangodb/go-driver v1.6.9
|
||||||
github.com/armon/go-metrics v0.4.1
|
github.com/armon/go-metrics v0.4.1
|
||||||
github.com/aws/aws-sdk-go-v2 v1.40.1
|
github.com/aws/aws-sdk-go-v2 v1.40.1
|
||||||
|
|||||||
2
go.sum
2
go.sum
@@ -653,6 +653,8 @@ github.com/apache/arrow/go/v10 v10.0.1/go.mod h1:YvhnlEePVnBS4+0z3fhPfUy7W1Ikj0I
|
|||||||
github.com/apache/thrift v0.16.0/go.mod h1:PHK3hniurgQaNMZYaCLEqXKsYK8upmhPbmdP2FXSqgU=
|
github.com/apache/thrift v0.16.0/go.mod h1:PHK3hniurgQaNMZYaCLEqXKsYK8upmhPbmdP2FXSqgU=
|
||||||
github.com/apple/foundationdb/bindings/go v0.0.0-20240515141816-262c6fe778ad h1:fQBkhYv86zyW95PWhzBlkgz3NoY1ue0L+8oYBaoCMbg=
|
github.com/apple/foundationdb/bindings/go v0.0.0-20240515141816-262c6fe778ad h1:fQBkhYv86zyW95PWhzBlkgz3NoY1ue0L+8oYBaoCMbg=
|
||||||
github.com/apple/foundationdb/bindings/go v0.0.0-20240515141816-262c6fe778ad/go.mod h1:OMVSB21p9+xQUIqlGizHPZfjK+SHws1ht+ZytVDoz9U=
|
github.com/apple/foundationdb/bindings/go v0.0.0-20240515141816-262c6fe778ad/go.mod h1:OMVSB21p9+xQUIqlGizHPZfjK+SHws1ht+ZytVDoz9U=
|
||||||
|
github.com/apple/foundationdb/bindings/go v0.0.0-20250911184653-27f7192f47c3 h1:WZaTKNHCfcw7fWSR6/RKnCldVzvYZC+Y20Su4lffEIg=
|
||||||
|
github.com/apple/foundationdb/bindings/go v0.0.0-20250911184653-27f7192f47c3/go.mod h1:OMVSB21p9+xQUIqlGizHPZfjK+SHws1ht+ZytVDoz9U=
|
||||||
github.com/appscode/go-querystring v0.0.0-20170504095604-0126cfb3f1dc h1:LoL75er+LKDHDUfU5tRvFwxH0LjPpZN8OoG8Ll+liGU=
|
github.com/appscode/go-querystring v0.0.0-20170504095604-0126cfb3f1dc h1:LoL75er+LKDHDUfU5tRvFwxH0LjPpZN8OoG8Ll+liGU=
|
||||||
github.com/appscode/go-querystring v0.0.0-20170504095604-0126cfb3f1dc/go.mod h1:w648aMHEgFYS6xb0KVMMtZ2uMeemhiKCuD2vj6gY52A=
|
github.com/appscode/go-querystring v0.0.0-20170504095604-0126cfb3f1dc/go.mod h1:w648aMHEgFYS6xb0KVMMtZ2uMeemhiKCuD2vj6gY52A=
|
||||||
github.com/arangodb/go-driver v1.6.9 h1:zckB+xuA16NmHUuYOX7INCJTIyIkoBQjAGqNpiyf2HQ=
|
github.com/arangodb/go-driver v1.6.9 h1:zckB+xuA16NmHUuYOX7INCJTIyIkoBQjAGqNpiyf2HQ=
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/apple/foundationdb/bindings/go/src/fdb"
|
"github.com/apple/foundationdb/bindings/go/src/fdb"
|
||||||
@@ -37,15 +38,146 @@ import (
|
|||||||
const (
	// FoundationDB transaction size limit is 10MB
	FDB_TRANSACTION_SIZE_LIMIT = 10 * 1024 * 1024
	// Safe limit for batch size (leave margin for FDB overhead)
	FDB_BATCH_SIZE_LIMIT = 8 * 1024 * 1024
	// Maximum number of entries to return in a single directory listing
	// Large batches can cause transaction timeouts and increase memory pressure
	MAX_DIRECTORY_LIST_LIMIT = 1000

	// Write batching defaults
	// DEFAULT_BATCH_SIZE is the maximum number of operations committed
	// in one batched transaction when batch_size is not configured.
	DEFAULT_BATCH_SIZE = 100
	// DEFAULT_BATCH_INTERVAL is the maximum time a queued write waits
	// before a partial batch is flushed when batch_interval is not configured.
	DEFAULT_BATCH_INTERVAL = 5 * time.Millisecond
)
|
||||||
|
|
||||||
func init() {
	// Register this store implementation so the filer can select it
	// from its configuration at startup.
	filer.Stores = append(filer.Stores, &FoundationDBStore{})
}
|
||||||
|
|
||||||
|
// writeOp represents a single pending operation queued for inclusion in
// a batched FoundationDB transaction: a Set when value is non-nil, a
// Clear (delete) when value is nil.
type writeOp struct {
	key   fdb.Key
	value []byte     // nil for delete
	done  chan error // nil when the submitter does not wait for the commit result
}

// size returns the approximate size of the operation in bytes; used to
// keep a batch's cumulative size under FDB_BATCH_SIZE_LIMIT.
func (op *writeOp) size() int {
	return len(op.key) + len(op.value)
}
|
||||||
|
|
||||||
|
// writeBatcher batches multiple writes into single transactions to
// amortize per-commit overhead and improve throughput.
type writeBatcher struct {
	store    *FoundationDBStore // store whose database executes the batched transactions
	ops      chan *writeOp      // pending operations; buffered (cap size*10) so submitters rarely block
	stop     chan struct{}      // closed by shutdown() to terminate the run() goroutine
	wg       sync.WaitGroup     // lets shutdown() wait for run() to exit
	size     int                // maximum operations per batched transaction
	interval time.Duration      // maximum delay before a partial batch is flushed
}
|
||||||
|
|
||||||
|
func newWriteBatcher(store *FoundationDBStore, size int, interval time.Duration) *writeBatcher {
|
||||||
|
b := &writeBatcher{
|
||||||
|
store: store,
|
||||||
|
ops: make(chan *writeOp, size*10),
|
||||||
|
stop: make(chan struct{}),
|
||||||
|
size: size,
|
||||||
|
interval: interval,
|
||||||
|
}
|
||||||
|
b.wg.Add(1)
|
||||||
|
go b.run()
|
||||||
|
return b
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *writeBatcher) run() {
|
||||||
|
defer b.wg.Done()
|
||||||
|
batch := make([]*writeOp, 0, b.size)
|
||||||
|
batchBytes := 0 // Track cumulative size of batch
|
||||||
|
timer := time.NewTimer(b.interval)
|
||||||
|
defer timer.Stop()
|
||||||
|
|
||||||
|
flush := func() {
|
||||||
|
if len(batch) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_, err := b.store.database.Transact(func(tr fdb.Transaction) (interface{}, error) {
|
||||||
|
for _, op := range batch {
|
||||||
|
if op.value != nil {
|
||||||
|
tr.Set(op.key, op.value)
|
||||||
|
} else {
|
||||||
|
tr.Clear(op.key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
})
|
||||||
|
for _, op := range batch {
|
||||||
|
if op.done != nil {
|
||||||
|
op.done <- err
|
||||||
|
close(op.done)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
batch = batch[:0]
|
||||||
|
batchBytes = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
resetTimer := func() {
|
||||||
|
if !timer.Stop() {
|
||||||
|
select {
|
||||||
|
case <-timer.C:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
timer.Reset(b.interval)
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case op := <-b.ops:
|
||||||
|
batch = append(batch, op)
|
||||||
|
batchBytes += op.size()
|
||||||
|
// Flush when batch count or size limit is reached
|
||||||
|
if len(batch) >= b.size || batchBytes >= FDB_BATCH_SIZE_LIMIT {
|
||||||
|
flush()
|
||||||
|
resetTimer()
|
||||||
|
}
|
||||||
|
case <-timer.C:
|
||||||
|
flush()
|
||||||
|
// Timer already fired, safe to reset directly
|
||||||
|
timer.Reset(b.interval)
|
||||||
|
case <-b.stop:
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case op := <-b.ops:
|
||||||
|
batch = append(batch, op)
|
||||||
|
default:
|
||||||
|
flush()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *writeBatcher) submit(key fdb.Key, value []byte, wait bool) error {
|
||||||
|
op := &writeOp{key: key, value: value}
|
||||||
|
if wait {
|
||||||
|
op.done = make(chan error, 1)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case b.ops <- op:
|
||||||
|
if wait {
|
||||||
|
return <-op.done
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
case <-b.stop:
|
||||||
|
return fmt.Errorf("batcher stopped")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *writeBatcher) shutdown() {
|
||||||
|
close(b.stop)
|
||||||
|
b.wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
type FoundationDBStore struct {
|
type FoundationDBStore struct {
|
||||||
database fdb.Database
|
database fdb.Database
|
||||||
seaweedfsDir directory.DirectorySubspace
|
seaweedfsDir directory.DirectorySubspace
|
||||||
@@ -53,6 +185,10 @@ type FoundationDBStore struct {
|
|||||||
directoryPrefix string
|
directoryPrefix string
|
||||||
timeout time.Duration
|
timeout time.Duration
|
||||||
maxRetryDelay time.Duration
|
maxRetryDelay time.Duration
|
||||||
|
// Write batching
|
||||||
|
batcher *writeBatcher
|
||||||
|
batchSize int
|
||||||
|
batchInterval time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// Context key type for storing transactions
|
// Context key type for storing transactions
|
||||||
@@ -89,6 +225,8 @@ func (store *FoundationDBStore) Initialize(configuration util.Configuration, pre
|
|||||||
configuration.SetDefault(prefix+"timeout", "5s")
|
configuration.SetDefault(prefix+"timeout", "5s")
|
||||||
configuration.SetDefault(prefix+"max_retry_delay", "1s")
|
configuration.SetDefault(prefix+"max_retry_delay", "1s")
|
||||||
configuration.SetDefault(prefix+"directory_prefix", "seaweedfs")
|
configuration.SetDefault(prefix+"directory_prefix", "seaweedfs")
|
||||||
|
configuration.SetDefault(prefix+"batch_size", DEFAULT_BATCH_SIZE)
|
||||||
|
configuration.SetDefault(prefix+"batch_interval", DEFAULT_BATCH_INTERVAL.String())
|
||||||
|
|
||||||
clusterFile := configuration.GetString(prefix + "cluster_file")
|
clusterFile := configuration.GetString(prefix + "cluster_file")
|
||||||
apiVersion := configuration.GetInt(prefix + "api_version")
|
apiVersion := configuration.GetInt(prefix + "api_version")
|
||||||
@@ -108,6 +246,18 @@ func (store *FoundationDBStore) Initialize(configuration util.Configuration, pre
|
|||||||
return fmt.Errorf("invalid max_retry_delay duration %s: %w", maxRetryDelayStr, err)
|
return fmt.Errorf("invalid max_retry_delay duration %s: %w", maxRetryDelayStr, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Parse batch configuration
|
||||||
|
store.batchSize = configuration.GetInt(prefix + "batch_size")
|
||||||
|
if store.batchSize <= 0 {
|
||||||
|
store.batchSize = DEFAULT_BATCH_SIZE
|
||||||
|
}
|
||||||
|
batchIntervalStr := configuration.GetString(prefix + "batch_interval")
|
||||||
|
store.batchInterval, err = time.ParseDuration(batchIntervalStr)
|
||||||
|
if err != nil {
|
||||||
|
glog.Warningf("invalid %sbatch_interval duration %q, using default %v: %v", prefix, batchIntervalStr, DEFAULT_BATCH_INTERVAL, err)
|
||||||
|
store.batchInterval = DEFAULT_BATCH_INTERVAL
|
||||||
|
}
|
||||||
|
|
||||||
return store.initialize(clusterFile, apiVersion)
|
return store.initialize(clusterFile, apiVersion)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -138,6 +288,11 @@ func (store *FoundationDBStore) initialize(clusterFile string, apiVersion int) e
|
|||||||
return fmt.Errorf("failed to create/open kv directory: %w", err)
|
return fmt.Errorf("failed to create/open kv directory: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Start write batcher for improved throughput
|
||||||
|
store.batcher = newWriteBatcher(store, store.batchSize, store.batchInterval)
|
||||||
|
glog.V(0).Infof("FoundationDB: write batching enabled (batch_size=%d, batch_interval=%v)",
|
||||||
|
store.batchSize, store.batchInterval)
|
||||||
|
|
||||||
glog.V(0).Infof("FoundationDB store initialized successfully with directory prefix: %s", store.directoryPrefix)
|
glog.V(0).Infof("FoundationDB store initialized successfully with directory prefix: %s", store.directoryPrefix)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -215,7 +370,12 @@ func (store *FoundationDBStore) UpdateEntry(ctx context.Context, entry *filer.En
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Execute in a new transaction if not in an existing one
|
// Use write batcher for better throughput
|
||||||
|
if store.batcher != nil {
|
||||||
|
return store.batcher.submit(key, value, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: execute in a new transaction
|
||||||
_, err = store.database.Transact(func(tr fdb.Transaction) (interface{}, error) {
|
_, err = store.database.Transact(func(tr fdb.Transaction) (interface{}, error) {
|
||||||
tr.Set(key, value)
|
tr.Set(key, value)
|
||||||
return nil, nil
|
return nil, nil
|
||||||
@@ -276,7 +436,12 @@ func (store *FoundationDBStore) DeleteEntry(ctx context.Context, fullpath util.F
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Execute in a new transaction if not in an existing one
|
// Use write batcher for better throughput (nil value = delete)
|
||||||
|
if store.batcher != nil {
|
||||||
|
return store.batcher.submit(key, nil, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: execute in a new transaction
|
||||||
_, err := store.database.Transact(func(tr fdb.Transaction) (interface{}, error) {
|
_, err := store.database.Transact(func(tr fdb.Transaction) (interface{}, error) {
|
||||||
tr.Clear(key)
|
tr.Clear(key)
|
||||||
return nil, nil
|
return nil, nil
|
||||||
@@ -556,6 +721,11 @@ func (store *FoundationDBStore) KvDelete(ctx context.Context, key []byte) error
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Shutdown stops the write batcher (waiting for pending writes to be
// flushed) and logs the shutdown.
func (store *FoundationDBStore) Shutdown() {
	// Stop write batcher
	if store.batcher != nil {
		store.batcher.shutdown()
		store.batcher = nil // later writes fall back to per-operation transactions
	}
	// FoundationDB doesn't have an explicit close method for Database
	glog.V(0).Infof("FoundationDB store shutdown")
}
|
||||||
|
|||||||
Reference in New Issue
Block a user