* mount: improve read throughput with parallel chunk fetching

This addresses issue #7504 where a single weed mount FUSE instance does not
fully utilize node network bandwidth when reading large files.

Changes:
- Add -concurrentReaders mount option (default: 16) to control the maximum
  number of parallel chunk fetches during read operations
- Implement parallel section reading in ChunkGroup.ReadDataAt() using
  errgroup for better throughput when reading across multiple sections
- Enhance ReaderCache with MaybeCacheMany() to prefetch multiple chunks
  ahead in parallel during sequential reads (now prefetches 4 chunks)
- Increase ReaderCache limit dynamically based on concurrentReaders to
  support higher read parallelism

The bottleneck was that chunks were being read sequentially even when they
reside on different volume servers. By introducing parallel chunk fetching,
a single mount instance can now better saturate available network bandwidth.

Fixes: #7504

* fmt

* Address review comments: make prefetch configurable, improve error handling

Changes:
1. Add DefaultPrefetchCount constant (4) to reader_at.go
2. Add GetPrefetchCount() method to ChunkGroup that derives prefetch count
   from concurrentReaders (1/4 ratio, min 1, max 8)
3. Pass prefetch count through NewChunkReaderAtFromClient
4. Fix error handling in readDataAtParallel to prioritize errgroup error
5. Update all callers to use DefaultPrefetchCount constant

For mount operations, prefetch scales with -concurrentReaders:
- concurrentReaders=16 (default) -> prefetch=4
- concurrentReaders=32 -> prefetch=8 (capped)
- concurrentReaders=4 -> prefetch=1

For non-mount paths (WebDAV, query engine, MQ), uses DefaultPrefetchCount.

* fmt

* Refactor: use variadic parameter instead of new function name

Use NewChunkGroup with optional concurrentReaders parameter instead of
creating a separate NewChunkGroupWithConcurrency function. This maintains
backward compatibility - existing callers without the parameter get the
default of 16 concurrent readers.

* Use explicit concurrentReaders parameter instead of variadic

* Refactor: use MaybeCache with count parameter instead of new MaybeCacheMany function

* Address nitpick review comments

- Add upper bound (128) on concurrentReaders to prevent excessive goroutine fan-out
- Cap readerCacheLimit at 256 accordingly
- Fix SetChunks: use Lock() instead of RLock() since we are writing to group.sections
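
The prefetch scaling described above (a 1/4 ratio of concurrentReaders, clamped
between 1 and 8, with concurrentReaders itself capped at 128) can be sketched as
follows. This is a minimal illustration of the described derivation, not the
exact upstream code; the function and constant names are assumptions.

// Sketch of the prefetch derivation described in the commit message (assumed names).
const (
	maxConcurrentReaders = 128 // upper bound to prevent excessive goroutine fan-out
	maxPrefetchCount     = 8
)

func derivePrefetchCount(concurrentReaders int) int {
	if concurrentReaders > maxConcurrentReaders {
		concurrentReaders = maxConcurrentReaders
	}
	prefetch := concurrentReaders / 4 // 1/4 ratio
	if prefetch < 1 {
		prefetch = 1 // concurrentReaders=4 -> prefetch=1
	}
	if prefetch > maxPrefetchCount {
		prefetch = maxPrefetchCount // concurrentReaders=32 -> prefetch=8 (capped)
	}
	return prefetch
}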

package engine

import (
	"context"
	"fmt"
	"math/big"
	"time"

	"github.com/parquet-go/parquet-go"
	"github.com/seaweedfs/seaweedfs/weed/filer"
	"github.com/seaweedfs/seaweedfs/weed/mq/schema"
	"github.com/seaweedfs/seaweedfs/weed/mq/topic"
	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/mq_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
	"github.com/seaweedfs/seaweedfs/weed/query/sqltypes"
	"github.com/seaweedfs/seaweedfs/weed/util/chunk_cache"
)

// ParquetScanner scans MQ topic Parquet files for SELECT queries
// Assumptions:
// 1. All MQ messages are stored in Parquet format in topic partitions
// 2. Each partition directory contains dated Parquet files
// 3. System columns (_ts_ns, _key) are added to user schema
// 4. Predicate pushdown is used for efficient scanning
type ParquetScanner struct {
	filerClient   filer_pb.FilerClient
	chunkCache    chunk_cache.ChunkCache
	topic         topic.Topic
	recordSchema  *schema_pb.RecordType
	parquetLevels *schema.ParquetLevels
}

// NewParquetScanner creates a scanner for a specific MQ topic
// Assumption: Topic exists and has Parquet files in partition directories
func NewParquetScanner(filerClient filer_pb.FilerClient, namespace, topicName string) (*ParquetScanner, error) {
	// Check if filerClient is available
	if filerClient == nil {
		return nil, fmt.Errorf("filerClient is required but not available")
	}

	// Create topic reference
	t := topic.Topic{
		Namespace: namespace,
		Name:      topicName,
	}

	// Read topic configuration to get schema
	var topicConf *mq_pb.ConfigureTopicResponse
	var err error
	if err := filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
		topicConf, err = t.ReadConfFile(client)
		return err
	}); err != nil {
		return nil, fmt.Errorf("failed to read topic config: %v", err)
	}

	// Build complete schema with system columns - prefer flat schema if available
	var recordType *schema_pb.RecordType

	if topicConf.GetMessageRecordType() != nil {
		// New flat schema format - use directly
		recordType = topicConf.GetMessageRecordType()
	}

	if recordType == nil || len(recordType.Fields) == 0 {
		// For topics without schema, create a minimal schema with system fields and _value
		recordType = schema.RecordTypeBegin().
			WithField(SW_COLUMN_NAME_TIMESTAMP, schema.TypeInt64).
			WithField(SW_COLUMN_NAME_KEY, schema.TypeBytes).
			WithField(SW_COLUMN_NAME_VALUE, schema.TypeBytes). // Raw message value
			RecordTypeEnd()
	} else {
		// Add system columns that MQ adds to all records
		recordType = schema.NewRecordTypeBuilder(recordType).
			WithField(SW_COLUMN_NAME_TIMESTAMP, schema.TypeInt64).
			WithField(SW_COLUMN_NAME_KEY, schema.TypeBytes).
			RecordTypeEnd()
	}

	// Convert to Parquet levels for efficient reading
	parquetLevels, err := schema.ToParquetLevels(recordType)
	if err != nil {
		return nil, fmt.Errorf("failed to create Parquet levels: %v", err)
	}

	return &ParquetScanner{
		filerClient:   filerClient,
		chunkCache:    chunk_cache.NewChunkCacheInMemory(256), // Same as MQ logstore
		topic:         t,
		recordSchema:  recordType,
		parquetLevels: parquetLevels,
	}, nil
}
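
// exampleScanUsage is a hypothetical sketch (not part of the scanner API)
// showing how a caller would construct a scanner and run a time-bounded,
// limited scan; the namespace and topic name are illustrative assumptions.
func exampleScanUsage(ctx context.Context, filerClient filer_pb.FilerClient) error {
	scanner, err := NewParquetScanner(filerClient, "default", "user_events")
	if err != nil {
		return fmt.Errorf("create scanner: %v", err)
	}

	results, err := scanner.Scan(ctx, ScanOptions{
		StartTimeNs: time.Now().Add(-time.Hour).UnixNano(), // only the last hour
		Limit:       100,
	})
	if err != nil {
		return err
	}

	fmt.Printf("scanned %d records\n", len(results))
	return nil
}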

// ScanOptions configure how the scanner reads data
type ScanOptions struct {
	// Time range filtering (Unix nanoseconds)
	StartTimeNs int64
	StopTimeNs  int64

	// Column projection - if empty, select all columns
	Columns []string

	// Row limit - 0 means no limit
	Limit int

	// Predicate for WHERE clause filtering
	Predicate func(*schema_pb.RecordValue) bool
}
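
// newEventTypePredicate is a hypothetical helper showing how a WHERE-style
// Predicate for ScanOptions can be built as a closure; the "event_type"
// column name is an assumption matching the sample data in this file.
func newEventTypePredicate(eventType string) func(*schema_pb.RecordValue) bool {
	return func(record *schema_pb.RecordValue) bool {
		value, exists := record.Fields["event_type"]
		if !exists {
			return false
		}
		return value.GetStringValue() == eventType
	}
}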

// ScanResult represents a single scanned record
type ScanResult struct {
	Values    map[string]*schema_pb.Value // Column name -> value
	Timestamp int64                       // Message timestamp (_ts_ns)
	Key       []byte                      // Message key (_key)
}

// Scan reads records from the topic's Parquet files
// Assumptions:
// 1. Scans all partitions of the topic
// 2. Applies time filtering at Parquet level for efficiency
// 3. Applies predicates and projections after reading
func (ps *ParquetScanner) Scan(ctx context.Context, options ScanOptions) ([]ScanResult, error) {
	var results []ScanResult

	// Get all partitions for this topic
	// TODO: Implement proper partition discovery
	// For now, assume partition 0 exists
	partitions := []topic.Partition{{RangeStart: 0, RangeStop: 1000}}

	for _, partition := range partitions {
		partitionResults, err := ps.scanPartition(ctx, partition, options)
		if err != nil {
			return nil, fmt.Errorf("failed to scan partition %v: %v", partition, err)
		}

		results = append(results, partitionResults...)

		// Apply global limit across all partitions
		if options.Limit > 0 && len(results) >= options.Limit {
			results = results[:options.Limit]
			break
		}
	}

	return results, nil
}

// scanPartition scans a specific topic partition
func (ps *ParquetScanner) scanPartition(ctx context.Context, partition topic.Partition, options ScanOptions) ([]ScanResult, error) {
	// partitionDir := topic.PartitionDir(ps.topic, partition) // TODO: Use for actual file listing

	var results []ScanResult

	// List Parquet files in partition directory
	// TODO: Implement proper file listing with date range filtering
	// For now, this is a placeholder that would list actual Parquet files

	// Simulate file processing - in real implementation, this would:
	// 1. List files in partitionDir via filerClient
	// 2. Filter files by date range if time filtering is enabled
	// 3. Process each Parquet file in chronological order

	// Placeholder: Create sample data for testing
	if len(results) == 0 {
		// Generate sample data for demonstration
		sampleData := ps.generateSampleData(options)
		results = append(results, sampleData...)
	}

	return results, nil
}
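
// listPartitionParquetFiles is a hypothetical sketch of the file listing step
// that scanPartition leaves as a TODO: it streams directory entries from the
// filer and keeps files with a .parquet suffix. ListEntries is the standard
// filer gRPC call; the directory layout and file naming are assumptions.
func (ps *ParquetScanner) listPartitionParquetFiles(ctx context.Context, partitionDir string) ([]*filer_pb.Entry, error) {
	var entries []*filer_pb.Entry
	err := ps.filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
		stream, listErr := client.ListEntries(ctx, &filer_pb.ListEntriesRequest{
			Directory: partitionDir,
			Limit:     1 << 20, // assumed large enough; a real listing would paginate
		})
		if listErr != nil {
			return listErr
		}
		for {
			resp, recvErr := stream.Recv()
			if recvErr != nil {
				// For brevity this sketch treats any receive error as end of
				// stream; a full implementation would distinguish io.EOF.
				return nil
			}
			entry := resp.Entry
			if entry == nil || entry.IsDirectory {
				continue
			}
			if name := entry.Name; len(name) > len(".parquet") && name[len(name)-len(".parquet"):] == ".parquet" {
				entries = append(entries, entry)
			}
		}
	})
	return entries, err
}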

// scanParquetFile scans a single Parquet file (real implementation)
func (ps *ParquetScanner) scanParquetFile(ctx context.Context, entry *filer_pb.Entry, options ScanOptions) ([]ScanResult, error) {
	var results []ScanResult

	// Create reader for the Parquet file (same pattern as logstore)
	lookupFileIdFn := filer.LookupFn(ps.filerClient)
	fileSize := filer.FileSize(entry)
	visibleIntervals, _ := filer.NonOverlappingVisibleIntervals(ctx, lookupFileIdFn, entry.Chunks, 0, int64(fileSize))
	chunkViews := filer.ViewFromVisibleIntervals(visibleIntervals, 0, int64(fileSize))
	readerCache := filer.NewReaderCache(32, ps.chunkCache, lookupFileIdFn)
	readerAt := filer.NewChunkReaderAtFromClient(ctx, readerCache, chunkViews, int64(fileSize), filer.DefaultPrefetchCount)

	// Create Parquet reader
	parquetReader := parquet.NewReader(readerAt)
	defer parquetReader.Close()

	rows := make([]parquet.Row, 128) // Read in batches like logstore

	for {
		rowCount, readErr := parquetReader.ReadRows(rows)

		// Process rows even if EOF
		for i := 0; i < rowCount; i++ {
			// Convert Parquet row to schema value
			recordValue, err := schema.ToRecordValue(ps.recordSchema, ps.parquetLevels, rows[i])
			if err != nil {
				return nil, fmt.Errorf("failed to convert row: %v", err)
			}

			// Extract system columns
			timestamp := recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value()
			key := recordValue.Fields[SW_COLUMN_NAME_KEY].GetBytesValue()

			// Apply time filtering
			if options.StartTimeNs > 0 && timestamp < options.StartTimeNs {
				continue
			}
			if options.StopTimeNs > 0 && timestamp >= options.StopTimeNs {
				break // Assume data is time-ordered
			}

			// Apply predicate filtering (WHERE clause)
			if options.Predicate != nil && !options.Predicate(recordValue) {
				continue
			}

			// Apply column projection
			values := make(map[string]*schema_pb.Value)
			if len(options.Columns) == 0 {
				// Select all columns (excluding system columns from user view)
				for name, value := range recordValue.Fields {
					if name != SW_COLUMN_NAME_TIMESTAMP && name != SW_COLUMN_NAME_KEY {
						values[name] = value
					}
				}
			} else {
				// Select specified columns only
				for _, columnName := range options.Columns {
					if value, exists := recordValue.Fields[columnName]; exists {
						values[columnName] = value
					}
				}
			}

			results = append(results, ScanResult{
				Values:    values,
				Timestamp: timestamp,
				Key:       key,
			})

			// Apply row limit
			if options.Limit > 0 && len(results) >= options.Limit {
				return results, nil
			}
		}

		if readErr != nil {
			break // EOF or error
		}
	}

	return results, nil
}

// generateSampleData creates sample data for testing when no real Parquet files exist
func (ps *ParquetScanner) generateSampleData(options ScanOptions) []ScanResult {
	now := time.Now().UnixNano()

	sampleData := []ScanResult{
		{
			Values: map[string]*schema_pb.Value{
				"user_id":    {Kind: &schema_pb.Value_Int32Value{Int32Value: 1001}},
				"event_type": {Kind: &schema_pb.Value_StringValue{StringValue: "login"}},
				"data":       {Kind: &schema_pb.Value_StringValue{StringValue: `{"ip": "192.168.1.1"}`}},
			},
			Timestamp: now - 3600000000000, // 1 hour ago
			Key:       []byte("user-1001"),
		},
		{
			Values: map[string]*schema_pb.Value{
				"user_id":    {Kind: &schema_pb.Value_Int32Value{Int32Value: 1002}},
				"event_type": {Kind: &schema_pb.Value_StringValue{StringValue: "page_view"}},
				"data":       {Kind: &schema_pb.Value_StringValue{StringValue: `{"page": "/dashboard"}`}},
			},
			Timestamp: now - 1800000000000, // 30 minutes ago
			Key:       []byte("user-1002"),
		},
		{
			Values: map[string]*schema_pb.Value{
				"user_id":    {Kind: &schema_pb.Value_Int32Value{Int32Value: 1001}},
				"event_type": {Kind: &schema_pb.Value_StringValue{StringValue: "logout"}},
				"data":       {Kind: &schema_pb.Value_StringValue{StringValue: `{"session_duration": 3600}`}},
			},
			Timestamp: now - 900000000000, // 15 minutes ago
			Key:       []byte("user-1001"),
		},
	}

	// Apply predicate filtering if specified
	if options.Predicate != nil {
		var filtered []ScanResult
		for _, result := range sampleData {
			// Convert to RecordValue for predicate testing
			recordValue := &schema_pb.RecordValue{Fields: make(map[string]*schema_pb.Value)}
			for k, v := range result.Values {
				recordValue.Fields[k] = v
			}
			recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP] = &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: result.Timestamp}}
			recordValue.Fields[SW_COLUMN_NAME_KEY] = &schema_pb.Value{Kind: &schema_pb.Value_BytesValue{BytesValue: result.Key}}

			if options.Predicate(recordValue) {
				filtered = append(filtered, result)
			}
		}
		sampleData = filtered
	}

	// Apply limit
	if options.Limit > 0 && len(sampleData) > options.Limit {
		sampleData = sampleData[:options.Limit]
	}

	return sampleData
}

// ConvertToSQLResult converts ScanResults to SQL query results
func (ps *ParquetScanner) ConvertToSQLResult(results []ScanResult, columns []string) *QueryResult {
	if len(results) == 0 {
		return &QueryResult{
			Columns: columns,
			Rows:    [][]sqltypes.Value{},
		}
	}

	// Determine columns if not specified.
	// Note: map iteration order is nondeterministic, so the derived column
	// order can vary between calls; callers that need a stable order should
	// pass columns explicitly.
	if len(columns) == 0 {
		columnSet := make(map[string]bool)
		for _, result := range results {
			for columnName := range result.Values {
				columnSet[columnName] = true
			}
		}

		columns = make([]string, 0, len(columnSet))
		for columnName := range columnSet {
			columns = append(columns, columnName)
		}
	}

	// Convert to SQL rows
	rows := make([][]sqltypes.Value, len(results))
	for i, result := range results {
		row := make([]sqltypes.Value, len(columns))
		for j, columnName := range columns {
			if value, exists := result.Values[columnName]; exists {
				row[j] = convertSchemaValueToSQL(value)
			} else {
				row[j] = sqltypes.NULL
			}
		}
		rows[i] = row
	}

	return &QueryResult{
		Columns: columns,
		Rows:    rows,
	}
}

// convertSchemaValueToSQL converts schema_pb.Value to sqltypes.Value
func convertSchemaValueToSQL(value *schema_pb.Value) sqltypes.Value {
	if value == nil {
		return sqltypes.NULL
	}

	switch v := value.Kind.(type) {
	case *schema_pb.Value_BoolValue:
		if v.BoolValue {
			return sqltypes.NewInt32(1)
		}
		return sqltypes.NewInt32(0)
	case *schema_pb.Value_Int32Value:
		return sqltypes.NewInt32(v.Int32Value)
	case *schema_pb.Value_Int64Value:
		return sqltypes.NewInt64(v.Int64Value)
	case *schema_pb.Value_FloatValue:
		return sqltypes.NewFloat32(v.FloatValue)
	case *schema_pb.Value_DoubleValue:
		return sqltypes.NewFloat64(v.DoubleValue)
	case *schema_pb.Value_BytesValue:
		return sqltypes.NewVarBinary(string(v.BytesValue))
	case *schema_pb.Value_StringValue:
		return sqltypes.NewVarChar(v.StringValue)
	// Parquet logical types
	case *schema_pb.Value_TimestampValue:
		timestampValue := value.GetTimestampValue()
		if timestampValue == nil {
			return sqltypes.NULL
		}
		// Convert microseconds to time.Time and format as datetime string
		timestamp := time.UnixMicro(timestampValue.TimestampMicros)
		return sqltypes.MakeTrusted(sqltypes.Datetime, []byte(timestamp.Format("2006-01-02 15:04:05")))
	case *schema_pb.Value_DateValue:
		dateValue := value.GetDateValue()
		if dateValue == nil {
			return sqltypes.NULL
		}
		// Convert days since epoch to date string
		date := time.Unix(int64(dateValue.DaysSinceEpoch)*86400, 0).UTC()
		return sqltypes.MakeTrusted(sqltypes.Date, []byte(date.Format("2006-01-02")))
	case *schema_pb.Value_DecimalValue:
		decimalValue := value.GetDecimalValue()
		if decimalValue == nil {
			return sqltypes.NULL
		}
		// Convert decimal bytes to string representation
		decimalStr := decimalToStringHelper(decimalValue)
		return sqltypes.MakeTrusted(sqltypes.Decimal, []byte(decimalStr))
	case *schema_pb.Value_TimeValue:
		timeValue := value.GetTimeValue()
		if timeValue == nil {
			return sqltypes.NULL
		}
		// Convert microseconds since midnight to time string
		duration := time.Duration(timeValue.TimeMicros) * time.Microsecond
		timeOfDay := time.Date(0, 1, 1, 0, 0, 0, 0, time.UTC).Add(duration)
		return sqltypes.MakeTrusted(sqltypes.Time, []byte(timeOfDay.Format("15:04:05")))
	default:
		return sqltypes.NewVarChar(fmt.Sprintf("%v", value))
	}
}

// decimalToStringHelper converts a DecimalValue to string representation
// This is a standalone version of the engine's decimalToString method.
// Assumption: DecimalValue.Value holds the unsigned big-endian magnitude;
// big.Int.SetBytes would misread a two's-complement encoding of negatives.
func decimalToStringHelper(decimalValue *schema_pb.DecimalValue) string {
	if decimalValue == nil || decimalValue.Value == nil {
		return "0"
	}

	// Convert bytes back to big.Int
	intValue := new(big.Int).SetBytes(decimalValue.Value)

	// Convert to string with proper decimal placement
	str := intValue.String()

	// Handle decimal placement based on scale
	scale := int(decimalValue.Scale)
	if scale > 0 {
		// Pad with leading zeros when there are fewer digits than the scale,
		// e.g. an unscaled value of 5 with scale 3 must render as "0.005"
		for len(str) <= scale {
			str = "0" + str
		}
		// Insert decimal point
		decimalPos := len(str) - scale
		return str[:decimalPos] + "." + str[decimalPos:]
	}

	return str
}
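
// exampleDecimalToString is a hypothetical illustration of decimalToStringHelper:
// the big-endian bytes {0x30, 0x39} encode the unscaled value 12345, which with
// scale 2 renders as "123.45".
func exampleDecimalToString() {
	d := &schema_pb.DecimalValue{
		Value: []byte{0x30, 0x39}, // 12345
		Scale: 2,
	}
	fmt.Println(decimalToStringHelper(d)) // prints 123.45
}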