Files
seaweedFS/weed/query/engine/parquet_scanner.go
Chris Lu d48e1e1659 mount: improve read throughput with parallel chunk fetching (#7569)
* mount: improve read throughput with parallel chunk fetching

This addresses issue #7504 where a single weed mount FUSE instance
does not fully utilize node network bandwidth when reading large files.

Changes:
- Add -concurrentReaders mount option (default: 16) to control the
  maximum number of parallel chunk fetches during read operations
- Implement parallel section reading in ChunkGroup.ReadDataAt() using
  errgroup for better throughput when reading across multiple sections
- Enhance ReaderCache with MaybeCacheMany() to prefetch multiple chunks
  ahead in parallel during sequential reads (now prefetches 4 chunks)
- Increase ReaderCache limit dynamically based on concurrentReaders
  to support higher read parallelism

The bottleneck was that chunks were being read sequentially even when
they reside on different volume servers. By introducing parallel chunk
fetching, a single mount instance can now better saturate available
network bandwidth.

Fixes: #7504

* fmt

* Address review comments: make prefetch configurable, improve error handling

Changes:
1. Add DefaultPrefetchCount constant (4) to reader_at.go
2. Add GetPrefetchCount() method to ChunkGroup that derives prefetch count
   from concurrentReaders (1/4 ratio, min 1, max 8)
3. Pass prefetch count through NewChunkReaderAtFromClient
4. Fix error handling in readDataAtParallel to prioritize errgroup error
5. Update all callers to use DefaultPrefetchCount constant

For mount operations, prefetch scales with -concurrentReaders:
- concurrentReaders=16 (default) -> prefetch=4
- concurrentReaders=32 -> prefetch=8 (capped)
- concurrentReaders=4 -> prefetch=1

For non-mount paths (WebDAV, query engine, MQ), uses DefaultPrefetchCount.

* fmt

* Refactor: use variadic parameter instead of new function name

Use NewChunkGroup with optional concurrentReaders parameter instead of
creating a separate NewChunkGroupWithConcurrency function.

This maintains backward compatibility - existing callers without the
parameter get the default of 16 concurrent readers.

* Use explicit concurrentReaders parameter instead of variadic

* Refactor: use MaybeCache with count parameter instead of new MaybeCacheMany function

* Address nitpick review comments

- Add upper bound (128) on concurrentReaders to prevent excessive goroutine fan-out
- Cap readerCacheLimit at 256 accordingly
- Fix SetChunks: use Lock() instead of RLock() since we are writing to group.sections
2025-11-29 10:06:11 -08:00

450 lines
14 KiB
Go

package engine
import (
"context"
"fmt"
"math/big"
"time"
"github.com/parquet-go/parquet-go"
"github.com/seaweedfs/seaweedfs/weed/filer"
"github.com/seaweedfs/seaweedfs/weed/mq/schema"
"github.com/seaweedfs/seaweedfs/weed/mq/topic"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/mq_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
"github.com/seaweedfs/seaweedfs/weed/query/sqltypes"
"github.com/seaweedfs/seaweedfs/weed/util/chunk_cache"
)
// ParquetScanner scans MQ topic Parquet files for SELECT queries
// Assumptions:
// 1. All MQ messages are stored in Parquet format in topic partitions
// 2. Each partition directory contains dated Parquet files
// 3. System columns (_ts_ns, _key) are added to user schema
// 4. Predicate pushdown is used for efficient scanning
type ParquetScanner struct {
filerClient filer_pb.FilerClient
chunkCache chunk_cache.ChunkCache
topic topic.Topic
recordSchema *schema_pb.RecordType
parquetLevels *schema.ParquetLevels
}
// NewParquetScanner creates a scanner for a specific MQ topic
// Assumption: Topic exists and has Parquet files in partition directories
func NewParquetScanner(filerClient filer_pb.FilerClient, namespace, topicName string) (*ParquetScanner, error) {
// Check if filerClient is available
if filerClient == nil {
return nil, fmt.Errorf("filerClient is required but not available")
}
// Create topic reference
t := topic.Topic{
Namespace: namespace,
Name: topicName,
}
// Read topic configuration to get schema
var topicConf *mq_pb.ConfigureTopicResponse
var err error
if err := filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
topicConf, err = t.ReadConfFile(client)
return err
}); err != nil {
return nil, fmt.Errorf("failed to read topic config: %v", err)
}
// Build complete schema with system columns - prefer flat schema if available
var recordType *schema_pb.RecordType
if topicConf.GetMessageRecordType() != nil {
// New flat schema format - use directly
recordType = topicConf.GetMessageRecordType()
}
if recordType == nil || len(recordType.Fields) == 0 {
// For topics without schema, create a minimal schema with system fields and _value
recordType = schema.RecordTypeBegin().
WithField(SW_COLUMN_NAME_TIMESTAMP, schema.TypeInt64).
WithField(SW_COLUMN_NAME_KEY, schema.TypeBytes).
WithField(SW_COLUMN_NAME_VALUE, schema.TypeBytes). // Raw message value
RecordTypeEnd()
} else {
// Add system columns that MQ adds to all records
recordType = schema.NewRecordTypeBuilder(recordType).
WithField(SW_COLUMN_NAME_TIMESTAMP, schema.TypeInt64).
WithField(SW_COLUMN_NAME_KEY, schema.TypeBytes).
RecordTypeEnd()
}
// Convert to Parquet levels for efficient reading
parquetLevels, err := schema.ToParquetLevels(recordType)
if err != nil {
return nil, fmt.Errorf("failed to create Parquet levels: %v", err)
}
return &ParquetScanner{
filerClient: filerClient,
chunkCache: chunk_cache.NewChunkCacheInMemory(256), // Same as MQ logstore
topic: t,
recordSchema: recordType,
parquetLevels: parquetLevels,
}, nil
}
// ScanOptions configure how the scanner reads data
type ScanOptions struct {
// Time range filtering (Unix nanoseconds)
StartTimeNs int64
StopTimeNs int64
// Column projection - if empty, select all columns
Columns []string
// Row limit - 0 means no limit
Limit int
// Predicate for WHERE clause filtering
Predicate func(*schema_pb.RecordValue) bool
}
// ScanResult represents a single scanned record
type ScanResult struct {
Values map[string]*schema_pb.Value // Column name -> value
Timestamp int64 // Message timestamp (_ts_ns)
Key []byte // Message key (_key)
}
// Scan reads records from the topic's Parquet files
// Assumptions:
// 1. Scans all partitions of the topic
// 2. Applies time filtering at Parquet level for efficiency
// 3. Applies predicates and projections after reading
func (ps *ParquetScanner) Scan(ctx context.Context, options ScanOptions) ([]ScanResult, error) {
var results []ScanResult
// Get all partitions for this topic
// TODO: Implement proper partition discovery
// For now, assume partition 0 exists
partitions := []topic.Partition{{RangeStart: 0, RangeStop: 1000}}
for _, partition := range partitions {
partitionResults, err := ps.scanPartition(ctx, partition, options)
if err != nil {
return nil, fmt.Errorf("failed to scan partition %v: %v", partition, err)
}
results = append(results, partitionResults...)
// Apply global limit across all partitions
if options.Limit > 0 && len(results) >= options.Limit {
results = results[:options.Limit]
break
}
}
return results, nil
}
// scanPartition scans a specific topic partition
func (ps *ParquetScanner) scanPartition(ctx context.Context, partition topic.Partition, options ScanOptions) ([]ScanResult, error) {
// partitionDir := topic.PartitionDir(ps.topic, partition) // TODO: Use for actual file listing
var results []ScanResult
// List Parquet files in partition directory
// TODO: Implement proper file listing with date range filtering
// For now, this is a placeholder that would list actual Parquet files
// Simulate file processing - in real implementation, this would:
// 1. List files in partitionDir via filerClient
// 2. Filter files by date range if time filtering is enabled
// 3. Process each Parquet file in chronological order
// Placeholder: Create sample data for testing
if len(results) == 0 {
// Generate sample data for demonstration
sampleData := ps.generateSampleData(options)
results = append(results, sampleData...)
}
return results, nil
}
// scanParquetFile scans a single Parquet file (real implementation)
func (ps *ParquetScanner) scanParquetFile(ctx context.Context, entry *filer_pb.Entry, options ScanOptions) ([]ScanResult, error) {
var results []ScanResult
// Create reader for the Parquet file (same pattern as logstore)
lookupFileIdFn := filer.LookupFn(ps.filerClient)
fileSize := filer.FileSize(entry)
visibleIntervals, _ := filer.NonOverlappingVisibleIntervals(ctx, lookupFileIdFn, entry.Chunks, 0, int64(fileSize))
chunkViews := filer.ViewFromVisibleIntervals(visibleIntervals, 0, int64(fileSize))
readerCache := filer.NewReaderCache(32, ps.chunkCache, lookupFileIdFn)
readerAt := filer.NewChunkReaderAtFromClient(ctx, readerCache, chunkViews, int64(fileSize), filer.DefaultPrefetchCount)
// Create Parquet reader
parquetReader := parquet.NewReader(readerAt)
defer parquetReader.Close()
rows := make([]parquet.Row, 128) // Read in batches like logstore
for {
rowCount, readErr := parquetReader.ReadRows(rows)
// Process rows even if EOF
for i := 0; i < rowCount; i++ {
// Convert Parquet row to schema value
recordValue, err := schema.ToRecordValue(ps.recordSchema, ps.parquetLevels, rows[i])
if err != nil {
return nil, fmt.Errorf("failed to convert row: %v", err)
}
// Extract system columns
timestamp := recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value()
key := recordValue.Fields[SW_COLUMN_NAME_KEY].GetBytesValue()
// Apply time filtering
if options.StartTimeNs > 0 && timestamp < options.StartTimeNs {
continue
}
if options.StopTimeNs > 0 && timestamp >= options.StopTimeNs {
break // Assume data is time-ordered
}
// Apply predicate filtering (WHERE clause)
if options.Predicate != nil && !options.Predicate(recordValue) {
continue
}
// Apply column projection
values := make(map[string]*schema_pb.Value)
if len(options.Columns) == 0 {
// Select all columns (excluding system columns from user view)
for name, value := range recordValue.Fields {
if name != SW_COLUMN_NAME_TIMESTAMP && name != SW_COLUMN_NAME_KEY {
values[name] = value
}
}
} else {
// Select specified columns only
for _, columnName := range options.Columns {
if value, exists := recordValue.Fields[columnName]; exists {
values[columnName] = value
}
}
}
results = append(results, ScanResult{
Values: values,
Timestamp: timestamp,
Key: key,
})
// Apply row limit
if options.Limit > 0 && len(results) >= options.Limit {
return results, nil
}
}
if readErr != nil {
break // EOF or error
}
}
return results, nil
}
// generateSampleData creates sample data for testing when no real Parquet files exist
func (ps *ParquetScanner) generateSampleData(options ScanOptions) []ScanResult {
now := time.Now().UnixNano()
sampleData := []ScanResult{
{
Values: map[string]*schema_pb.Value{
"user_id": {Kind: &schema_pb.Value_Int32Value{Int32Value: 1001}},
"event_type": {Kind: &schema_pb.Value_StringValue{StringValue: "login"}},
"data": {Kind: &schema_pb.Value_StringValue{StringValue: `{"ip": "192.168.1.1"}`}},
},
Timestamp: now - 3600000000000, // 1 hour ago
Key: []byte("user-1001"),
},
{
Values: map[string]*schema_pb.Value{
"user_id": {Kind: &schema_pb.Value_Int32Value{Int32Value: 1002}},
"event_type": {Kind: &schema_pb.Value_StringValue{StringValue: "page_view"}},
"data": {Kind: &schema_pb.Value_StringValue{StringValue: `{"page": "/dashboard"}`}},
},
Timestamp: now - 1800000000000, // 30 minutes ago
Key: []byte("user-1002"),
},
{
Values: map[string]*schema_pb.Value{
"user_id": {Kind: &schema_pb.Value_Int32Value{Int32Value: 1001}},
"event_type": {Kind: &schema_pb.Value_StringValue{StringValue: "logout"}},
"data": {Kind: &schema_pb.Value_StringValue{StringValue: `{"session_duration": 3600}`}},
},
Timestamp: now - 900000000000, // 15 minutes ago
Key: []byte("user-1001"),
},
}
// Apply predicate filtering if specified
if options.Predicate != nil {
var filtered []ScanResult
for _, result := range sampleData {
// Convert to RecordValue for predicate testing
recordValue := &schema_pb.RecordValue{Fields: make(map[string]*schema_pb.Value)}
for k, v := range result.Values {
recordValue.Fields[k] = v
}
recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP] = &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: result.Timestamp}}
recordValue.Fields[SW_COLUMN_NAME_KEY] = &schema_pb.Value{Kind: &schema_pb.Value_BytesValue{BytesValue: result.Key}}
if options.Predicate(recordValue) {
filtered = append(filtered, result)
}
}
sampleData = filtered
}
// Apply limit
if options.Limit > 0 && len(sampleData) > options.Limit {
sampleData = sampleData[:options.Limit]
}
return sampleData
}
// ConvertToSQLResult converts ScanResults to SQL query results
func (ps *ParquetScanner) ConvertToSQLResult(results []ScanResult, columns []string) *QueryResult {
if len(results) == 0 {
return &QueryResult{
Columns: columns,
Rows: [][]sqltypes.Value{},
}
}
// Determine columns if not specified
if len(columns) == 0 {
columnSet := make(map[string]bool)
for _, result := range results {
for columnName := range result.Values {
columnSet[columnName] = true
}
}
columns = make([]string, 0, len(columnSet))
for columnName := range columnSet {
columns = append(columns, columnName)
}
}
// Convert to SQL rows
rows := make([][]sqltypes.Value, len(results))
for i, result := range results {
row := make([]sqltypes.Value, len(columns))
for j, columnName := range columns {
if value, exists := result.Values[columnName]; exists {
row[j] = convertSchemaValueToSQL(value)
} else {
row[j] = sqltypes.NULL
}
}
rows[i] = row
}
return &QueryResult{
Columns: columns,
Rows: rows,
}
}
// convertSchemaValueToSQL converts schema_pb.Value to sqltypes.Value
func convertSchemaValueToSQL(value *schema_pb.Value) sqltypes.Value {
if value == nil {
return sqltypes.NULL
}
switch v := value.Kind.(type) {
case *schema_pb.Value_BoolValue:
if v.BoolValue {
return sqltypes.NewInt32(1)
}
return sqltypes.NewInt32(0)
case *schema_pb.Value_Int32Value:
return sqltypes.NewInt32(v.Int32Value)
case *schema_pb.Value_Int64Value:
return sqltypes.NewInt64(v.Int64Value)
case *schema_pb.Value_FloatValue:
return sqltypes.NewFloat32(v.FloatValue)
case *schema_pb.Value_DoubleValue:
return sqltypes.NewFloat64(v.DoubleValue)
case *schema_pb.Value_BytesValue:
return sqltypes.NewVarBinary(string(v.BytesValue))
case *schema_pb.Value_StringValue:
return sqltypes.NewVarChar(v.StringValue)
// Parquet logical types
case *schema_pb.Value_TimestampValue:
timestampValue := value.GetTimestampValue()
if timestampValue == nil {
return sqltypes.NULL
}
// Convert microseconds to time.Time and format as datetime string
timestamp := time.UnixMicro(timestampValue.TimestampMicros)
return sqltypes.MakeTrusted(sqltypes.Datetime, []byte(timestamp.Format("2006-01-02 15:04:05")))
case *schema_pb.Value_DateValue:
dateValue := value.GetDateValue()
if dateValue == nil {
return sqltypes.NULL
}
// Convert days since epoch to date string
date := time.Unix(int64(dateValue.DaysSinceEpoch)*86400, 0).UTC()
return sqltypes.MakeTrusted(sqltypes.Date, []byte(date.Format("2006-01-02")))
case *schema_pb.Value_DecimalValue:
decimalValue := value.GetDecimalValue()
if decimalValue == nil {
return sqltypes.NULL
}
// Convert decimal bytes to string representation
decimalStr := decimalToStringHelper(decimalValue)
return sqltypes.MakeTrusted(sqltypes.Decimal, []byte(decimalStr))
case *schema_pb.Value_TimeValue:
timeValue := value.GetTimeValue()
if timeValue == nil {
return sqltypes.NULL
}
// Convert microseconds since midnight to time string
duration := time.Duration(timeValue.TimeMicros) * time.Microsecond
timeOfDay := time.Date(0, 1, 1, 0, 0, 0, 0, time.UTC).Add(duration)
return sqltypes.MakeTrusted(sqltypes.Time, []byte(timeOfDay.Format("15:04:05")))
default:
return sqltypes.NewVarChar(fmt.Sprintf("%v", value))
}
}
// decimalToStringHelper converts a DecimalValue to string representation
// This is a standalone version of the engine's decimalToString method
func decimalToStringHelper(decimalValue *schema_pb.DecimalValue) string {
if decimalValue == nil || decimalValue.Value == nil {
return "0"
}
// Convert bytes back to big.Int
intValue := new(big.Int).SetBytes(decimalValue.Value)
// Convert to string with proper decimal placement
str := intValue.String()
// Handle decimal placement based on scale
scale := int(decimalValue.Scale)
if scale > 0 && len(str) > scale {
// Insert decimal point
decimalPos := len(str) - scale
return str[:decimalPos] + "." + str[decimalPos:]
}
return str
}