chore: remove ~50k lines of unreachable dead code (#8913)
* chore: remove unreachable dead code across the codebase Remove ~50,000 lines of unreachable code identified by static analysis. Major removals: - weed/filer/redis_lua: entire unused Redis Lua filer store implementation - weed/wdclient/net2, resource_pool: unused connection/resource pool packages - weed/plugin/worker/lifecycle: unused lifecycle plugin worker - weed/s3api: unused S3 policy templates, presigned URL IAM, streaming copy, multipart IAM, key rotation, and various SSE helper functions - weed/mq/kafka: unused partition mapping, compression, schema, and protocol functions - weed/mq/offset: unused SQL storage and migration code - weed/worker: unused registry, task, and monitoring functions - weed/query: unused SQL engine, parquet scanner, and type functions - weed/shell: unused EC proportional rebalance functions - weed/storage/erasure_coding/distribution: unused distribution analysis functions - Individual unreachable functions removed from 150+ files across admin, credential, filer, iam, kms, mount, mq, operation, pb, s3api, server, shell, storage, topology, and util packages * fix(s3): reset shared memory store in IAM test to prevent flaky failure TestLoadIAMManagerFromConfig_EmptyConfigWithFallbackKey was flaky because the MemoryStore credential backend is a singleton registered via init(). Earlier tests that create anonymous identities pollute the shared store, causing LookupAnonymous() to unexpectedly return true. Fix by calling Reset() on the memory store before the test runs. * style: run gofmt on changed files * fix: restore KMS functions used by integration tests * fix(plugin): prevent panic on send to closed worker session channel The Plugin.sendToWorker method could panic with "send on closed channel" when a worker disconnected while a message was being sent. The race was between streamSession.close() closing the outgoing channel and sendToWorker writing to it concurrently. 
Add a done channel to streamSession that is closed before the outgoing channel, and check it in sendToWorker's select to safely detect closed sessions without panicking.
This commit is contained in:
@@ -74,11 +74,6 @@ func (opt *FastPathOptimizer) DetermineStrategy(aggregations []AggregationSpec)
|
||||
return strategy
|
||||
}
|
||||
|
||||
// CollectDataSources gathers information about available data sources for a topic
|
||||
func (opt *FastPathOptimizer) CollectDataSources(ctx context.Context, hybridScanner *HybridMessageScanner) (*TopicDataSources, error) {
|
||||
return opt.CollectDataSourcesWithTimeFilter(ctx, hybridScanner, 0, 0)
|
||||
}
|
||||
|
||||
// CollectDataSourcesWithTimeFilter gathers information about available data sources for a topic
|
||||
// with optional time filtering to skip irrelevant parquet files
|
||||
func (opt *FastPathOptimizer) CollectDataSourcesWithTimeFilter(ctx context.Context, hybridScanner *HybridMessageScanner, startTimeNs, stopTimeNs int64) (*TopicDataSources, error) {
|
||||
|
||||
@@ -539,20 +539,6 @@ func NewSQLEngine(masterAddress string) *SQLEngine {
|
||||
}
|
||||
}
|
||||
|
||||
// NewSQLEngineWithCatalog creates a new SQL execution engine with a custom catalog
|
||||
// Used for testing or when you want to provide a pre-configured catalog
|
||||
func NewSQLEngineWithCatalog(catalog *SchemaCatalog) *SQLEngine {
|
||||
// Initialize global HTTP client if not already done
|
||||
// This is needed for reading partition data from the filer
|
||||
if util_http.GetGlobalHttpClient() == nil {
|
||||
util_http.InitGlobalHttpClient()
|
||||
}
|
||||
|
||||
return &SQLEngine{
|
||||
catalog: catalog,
|
||||
}
|
||||
}
|
||||
|
||||
// GetCatalog returns the schema catalog for external access
|
||||
func (e *SQLEngine) GetCatalog() *SchemaCatalog {
|
||||
return e.catalog
|
||||
@@ -3682,11 +3668,6 @@ type ExecutionPlanBuilder struct {
|
||||
engine *SQLEngine
|
||||
}
|
||||
|
||||
// NewExecutionPlanBuilder creates a new execution plan builder
|
||||
func NewExecutionPlanBuilder(engine *SQLEngine) *ExecutionPlanBuilder {
|
||||
return &ExecutionPlanBuilder{engine: engine}
|
||||
}
|
||||
|
||||
// BuildAggregationPlan builds an execution plan for aggregation queries
|
||||
func (builder *ExecutionPlanBuilder) BuildAggregationPlan(
|
||||
stmt *SelectStatement,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -44,7 +44,7 @@ type ParseError struct {
|
||||
|
||||
func (e ParseError) Error() string {
|
||||
if e.Cause != nil {
|
||||
return fmt.Sprintf("SQL parse error: %s (%v)", e.Message, e.Cause)
|
||||
return fmt.Sprintf("SQL parse error: %s (caused by: %v)", e.Message, e.Cause)
|
||||
}
|
||||
return fmt.Sprintf("SQL parse error: %s", e.Message)
|
||||
}
|
||||
|
||||
@@ -1,133 +0,0 @@
|
||||
package engine
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestExecutionPlanFastPathDisplay tests that the execution plan correctly shows
|
||||
// "Parquet Statistics (fast path)" when fast path is used, not "Parquet Files (full scan)"
|
||||
func TestExecutionPlanFastPathDisplay(t *testing.T) {
|
||||
engine := NewMockSQLEngine()
|
||||
|
||||
// Create realistic data sources for fast path scenario
|
||||
dataSources := &TopicDataSources{
|
||||
ParquetFiles: map[string][]*ParquetFileStats{
|
||||
"/topics/test/topic/partition-1": {
|
||||
{
|
||||
RowCount: 500,
|
||||
ColumnStats: map[string]*ParquetColumnStats{
|
||||
"id": {
|
||||
ColumnName: "id",
|
||||
MinValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 1}},
|
||||
MaxValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 500}},
|
||||
NullCount: 0,
|
||||
RowCount: 500,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
ParquetRowCount: 500,
|
||||
LiveLogRowCount: 0, // Pure parquet scenario - ideal for fast path
|
||||
PartitionsCount: 1,
|
||||
}
|
||||
|
||||
t.Run("Fast path execution plan shows correct data sources", func(t *testing.T) {
|
||||
optimizer := NewFastPathOptimizer(engine.SQLEngine)
|
||||
|
||||
aggregations := []AggregationSpec{
|
||||
{Function: FuncCOUNT, Column: "*", Alias: "COUNT(*)"},
|
||||
}
|
||||
|
||||
// Test the strategy determination
|
||||
strategy := optimizer.DetermineStrategy(aggregations)
|
||||
assert.True(t, strategy.CanUseFastPath, "Strategy should allow fast path for COUNT(*)")
|
||||
assert.Equal(t, "all_aggregations_supported", strategy.Reason)
|
||||
|
||||
// Test data source list building
|
||||
builder := &ExecutionPlanBuilder{}
|
||||
dataSources := &TopicDataSources{
|
||||
ParquetFiles: map[string][]*ParquetFileStats{
|
||||
"/topics/test/topic/partition-1": {
|
||||
{RowCount: 500},
|
||||
},
|
||||
},
|
||||
ParquetRowCount: 500,
|
||||
LiveLogRowCount: 0,
|
||||
PartitionsCount: 1,
|
||||
}
|
||||
|
||||
dataSourcesList := builder.buildDataSourcesList(strategy, dataSources)
|
||||
|
||||
// When fast path is used, should show "parquet_stats" not "parquet_files"
|
||||
assert.Contains(t, dataSourcesList, "parquet_stats",
|
||||
"Data sources should contain 'parquet_stats' when fast path is used")
|
||||
assert.NotContains(t, dataSourcesList, "parquet_files",
|
||||
"Data sources should NOT contain 'parquet_files' when fast path is used")
|
||||
|
||||
// Test that the formatting works correctly
|
||||
formattedSource := engine.SQLEngine.formatDataSource("parquet_stats")
|
||||
assert.Equal(t, "Parquet Statistics (fast path)", formattedSource,
|
||||
"parquet_stats should format to 'Parquet Statistics (fast path)'")
|
||||
|
||||
formattedFullScan := engine.SQLEngine.formatDataSource("parquet_files")
|
||||
assert.Equal(t, "Parquet Files (full scan)", formattedFullScan,
|
||||
"parquet_files should format to 'Parquet Files (full scan)'")
|
||||
})
|
||||
|
||||
t.Run("Slow path execution plan shows full scan data sources", func(t *testing.T) {
|
||||
builder := &ExecutionPlanBuilder{}
|
||||
|
||||
// Create strategy that cannot use fast path
|
||||
strategy := AggregationStrategy{
|
||||
CanUseFastPath: false,
|
||||
Reason: "unsupported_aggregation_functions",
|
||||
}
|
||||
|
||||
dataSourcesList := builder.buildDataSourcesList(strategy, dataSources)
|
||||
|
||||
// When slow path is used, should show "parquet_files" and "live_logs"
|
||||
assert.Contains(t, dataSourcesList, "parquet_files",
|
||||
"Slow path should contain 'parquet_files'")
|
||||
assert.Contains(t, dataSourcesList, "live_logs",
|
||||
"Slow path should contain 'live_logs'")
|
||||
assert.NotContains(t, dataSourcesList, "parquet_stats",
|
||||
"Slow path should NOT contain 'parquet_stats'")
|
||||
})
|
||||
|
||||
t.Run("Data source formatting works correctly", func(t *testing.T) {
|
||||
// Test just the data source formatting which is the key fix
|
||||
|
||||
// Test parquet_stats formatting (fast path)
|
||||
fastPathFormatted := engine.SQLEngine.formatDataSource("parquet_stats")
|
||||
assert.Equal(t, "Parquet Statistics (fast path)", fastPathFormatted,
|
||||
"parquet_stats should format to show fast path usage")
|
||||
|
||||
// Test parquet_files formatting (slow path)
|
||||
slowPathFormatted := engine.SQLEngine.formatDataSource("parquet_files")
|
||||
assert.Equal(t, "Parquet Files (full scan)", slowPathFormatted,
|
||||
"parquet_files should format to show full scan")
|
||||
|
||||
// Test that data sources list is built correctly for fast path
|
||||
builder := &ExecutionPlanBuilder{}
|
||||
fastStrategy := AggregationStrategy{CanUseFastPath: true}
|
||||
|
||||
fastSources := builder.buildDataSourcesList(fastStrategy, dataSources)
|
||||
assert.Contains(t, fastSources, "parquet_stats",
|
||||
"Fast path should include parquet_stats")
|
||||
assert.NotContains(t, fastSources, "parquet_files",
|
||||
"Fast path should NOT include parquet_files")
|
||||
|
||||
// Test that data sources list is built correctly for slow path
|
||||
slowStrategy := AggregationStrategy{CanUseFastPath: false}
|
||||
|
||||
slowSources := builder.buildDataSourcesList(slowStrategy, dataSources)
|
||||
assert.Contains(t, slowSources, "parquet_files",
|
||||
"Slow path should include parquet_files")
|
||||
assert.NotContains(t, slowSources, "parquet_stats",
|
||||
"Slow path should NOT include parquet_stats")
|
||||
})
|
||||
}
|
||||
@@ -1,193 +0,0 @@
|
||||
package engine
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestFastPathCountFixRealistic tests the specific scenario mentioned in the bug report:
|
||||
// Fast path returning 0 for COUNT(*) when slow path returns 1803
|
||||
func TestFastPathCountFixRealistic(t *testing.T) {
|
||||
engine := NewMockSQLEngine()
|
||||
|
||||
// Set up debug mode to see our new logging
|
||||
ctx := context.WithValue(context.Background(), "debug", true)
|
||||
|
||||
// Create realistic data sources that mimic a scenario with 1803 rows
|
||||
dataSources := &TopicDataSources{
|
||||
ParquetFiles: map[string][]*ParquetFileStats{
|
||||
"/topics/test/large-topic/0000-1023": {
|
||||
{
|
||||
RowCount: 800,
|
||||
ColumnStats: map[string]*ParquetColumnStats{
|
||||
"id": {
|
||||
ColumnName: "id",
|
||||
MinValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 1}},
|
||||
MaxValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 800}},
|
||||
NullCount: 0,
|
||||
RowCount: 800,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
RowCount: 500,
|
||||
ColumnStats: map[string]*ParquetColumnStats{
|
||||
"id": {
|
||||
ColumnName: "id",
|
||||
MinValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 801}},
|
||||
MaxValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 1300}},
|
||||
NullCount: 0,
|
||||
RowCount: 500,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
"/topics/test/large-topic/1024-2047": {
|
||||
{
|
||||
RowCount: 300,
|
||||
ColumnStats: map[string]*ParquetColumnStats{
|
||||
"id": {
|
||||
ColumnName: "id",
|
||||
MinValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 1301}},
|
||||
MaxValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 1600}},
|
||||
NullCount: 0,
|
||||
RowCount: 300,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
ParquetRowCount: 1600, // 800 + 500 + 300
|
||||
LiveLogRowCount: 203, // Additional live log data
|
||||
PartitionsCount: 2,
|
||||
LiveLogFilesCount: 15,
|
||||
}
|
||||
|
||||
partitions := []string{
|
||||
"/topics/test/large-topic/0000-1023",
|
||||
"/topics/test/large-topic/1024-2047",
|
||||
}
|
||||
|
||||
t.Run("COUNT(*) should return correct total (1803)", func(t *testing.T) {
|
||||
computer := NewAggregationComputer(engine.SQLEngine)
|
||||
|
||||
aggregations := []AggregationSpec{
|
||||
{Function: FuncCOUNT, Column: "*", Alias: "COUNT(*)"},
|
||||
}
|
||||
|
||||
results, err := computer.ComputeFastPathAggregations(ctx, aggregations, dataSources, partitions)
|
||||
|
||||
assert.NoError(t, err, "Fast path aggregation should not error")
|
||||
assert.Len(t, results, 1, "Should return one result")
|
||||
|
||||
// This is the key test - before our fix, this was returning 0
|
||||
expectedCount := int64(1803) // 1600 (parquet) + 203 (live log)
|
||||
actualCount := results[0].Count
|
||||
|
||||
assert.Equal(t, expectedCount, actualCount,
|
||||
"COUNT(*) should return %d (1600 parquet + 203 live log), but got %d",
|
||||
expectedCount, actualCount)
|
||||
})
|
||||
|
||||
t.Run("MIN/MAX should work with multiple partitions", func(t *testing.T) {
|
||||
computer := NewAggregationComputer(engine.SQLEngine)
|
||||
|
||||
aggregations := []AggregationSpec{
|
||||
{Function: FuncMIN, Column: "id", Alias: "MIN(id)"},
|
||||
{Function: FuncMAX, Column: "id", Alias: "MAX(id)"},
|
||||
}
|
||||
|
||||
results, err := computer.ComputeFastPathAggregations(ctx, aggregations, dataSources, partitions)
|
||||
|
||||
assert.NoError(t, err, "Fast path aggregation should not error")
|
||||
assert.Len(t, results, 2, "Should return two results")
|
||||
|
||||
// MIN should be the lowest across all parquet files
|
||||
assert.Equal(t, int64(1), results[0].Min, "MIN should be 1")
|
||||
|
||||
// MAX should be the highest across all parquet files
|
||||
assert.Equal(t, int64(1600), results[1].Max, "MAX should be 1600")
|
||||
})
|
||||
}
|
||||
|
||||
// TestFastPathDataSourceDiscoveryLogging tests that our debug logging works correctly
|
||||
func TestFastPathDataSourceDiscoveryLogging(t *testing.T) {
|
||||
// This test verifies that our enhanced data source collection structure is correct
|
||||
|
||||
t.Run("DataSources structure validation", func(t *testing.T) {
|
||||
// Test the TopicDataSources structure initialization
|
||||
dataSources := &TopicDataSources{
|
||||
ParquetFiles: make(map[string][]*ParquetFileStats),
|
||||
ParquetRowCount: 0,
|
||||
LiveLogRowCount: 0,
|
||||
LiveLogFilesCount: 0,
|
||||
PartitionsCount: 0,
|
||||
}
|
||||
|
||||
assert.NotNil(t, dataSources, "Data sources should not be nil")
|
||||
assert.NotNil(t, dataSources.ParquetFiles, "ParquetFiles map should be initialized")
|
||||
assert.GreaterOrEqual(t, dataSources.PartitionsCount, 0, "PartitionsCount should be non-negative")
|
||||
assert.GreaterOrEqual(t, dataSources.ParquetRowCount, int64(0), "ParquetRowCount should be non-negative")
|
||||
assert.GreaterOrEqual(t, dataSources.LiveLogRowCount, int64(0), "LiveLogRowCount should be non-negative")
|
||||
})
|
||||
}
|
||||
|
||||
// TestFastPathValidationLogic tests the enhanced validation we added
|
||||
func TestFastPathValidationLogic(t *testing.T) {
|
||||
t.Run("Validation catches data source vs computation mismatch", func(t *testing.T) {
|
||||
// Create a scenario where data sources and computation might be inconsistent
|
||||
dataSources := &TopicDataSources{
|
||||
ParquetFiles: make(map[string][]*ParquetFileStats),
|
||||
ParquetRowCount: 1000, // Data sources say 1000 rows
|
||||
LiveLogRowCount: 0,
|
||||
PartitionsCount: 1,
|
||||
}
|
||||
|
||||
// But aggregation result says different count (simulating the original bug)
|
||||
aggResults := []AggregationResult{
|
||||
{Count: 0}, // Bug: returns 0 when data sources show 1000
|
||||
}
|
||||
|
||||
// This simulates the validation logic from tryFastParquetAggregation
|
||||
totalRows := dataSources.ParquetRowCount + dataSources.LiveLogRowCount
|
||||
countResult := aggResults[0].Count
|
||||
|
||||
// Our validation should catch this mismatch
|
||||
assert.NotEqual(t, totalRows, countResult,
|
||||
"This test simulates the bug: data sources show %d but COUNT returns %d",
|
||||
totalRows, countResult)
|
||||
|
||||
// In the real code, this would trigger a fallback to slow path
|
||||
validationPassed := (countResult == totalRows)
|
||||
assert.False(t, validationPassed, "Validation should fail for inconsistent data")
|
||||
})
|
||||
|
||||
t.Run("Validation passes for consistent data", func(t *testing.T) {
|
||||
// Create a scenario where everything is consistent
|
||||
dataSources := &TopicDataSources{
|
||||
ParquetFiles: make(map[string][]*ParquetFileStats),
|
||||
ParquetRowCount: 1000,
|
||||
LiveLogRowCount: 803,
|
||||
PartitionsCount: 1,
|
||||
}
|
||||
|
||||
// Aggregation result matches data sources
|
||||
aggResults := []AggregationResult{
|
||||
{Count: 1803}, // Correct: matches 1000 + 803
|
||||
}
|
||||
|
||||
totalRows := dataSources.ParquetRowCount + dataSources.LiveLogRowCount
|
||||
countResult := aggResults[0].Count
|
||||
|
||||
// Our validation should pass this
|
||||
assert.Equal(t, totalRows, countResult,
|
||||
"Validation should pass when data sources (%d) match COUNT result (%d)",
|
||||
totalRows, countResult)
|
||||
|
||||
validationPassed := (countResult == totalRows)
|
||||
assert.True(t, validationPassed, "Validation should pass for consistent data")
|
||||
})
|
||||
}
|
||||
@@ -1,280 +1,14 @@
|
||||
package engine
|
||||
|
||||
import (
	"context"
	"fmt"
	"math/big"
	"sort"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/mq/schema"
	"github.com/seaweedfs/seaweedfs/weed/mq/topic"
	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/mq_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
	"github.com/seaweedfs/seaweedfs/weed/query/sqltypes"
	"github.com/seaweedfs/seaweedfs/weed/util/chunk_cache"
)
|
||||
|
||||
// ParquetScanner scans MQ topic Parquet files for SELECT queries
|
||||
// Assumptions:
|
||||
// 1. All MQ messages are stored in Parquet format in topic partitions
|
||||
// 2. Each partition directory contains dated Parquet files
|
||||
// 3. System columns (_ts_ns, _key) are added to user schema
|
||||
// 4. Predicate pushdown is used for efficient scanning
|
||||
type ParquetScanner struct {
|
||||
filerClient filer_pb.FilerClient
|
||||
chunkCache chunk_cache.ChunkCache
|
||||
topic topic.Topic
|
||||
recordSchema *schema_pb.RecordType
|
||||
parquetLevels *schema.ParquetLevels
|
||||
}
|
||||
|
||||
// NewParquetScanner creates a scanner for a specific MQ topic
|
||||
// Assumption: Topic exists and has Parquet files in partition directories
|
||||
func NewParquetScanner(filerClient filer_pb.FilerClient, namespace, topicName string) (*ParquetScanner, error) {
|
||||
// Check if filerClient is available
|
||||
if filerClient == nil {
|
||||
return nil, fmt.Errorf("filerClient is required but not available")
|
||||
}
|
||||
|
||||
// Create topic reference
|
||||
t := topic.Topic{
|
||||
Namespace: namespace,
|
||||
Name: topicName,
|
||||
}
|
||||
|
||||
// Read topic configuration to get schema
|
||||
var topicConf *mq_pb.ConfigureTopicResponse
|
||||
var err error
|
||||
if err := filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
|
||||
topicConf, err = t.ReadConfFile(client)
|
||||
return err
|
||||
}); err != nil {
|
||||
return nil, fmt.Errorf("failed to read topic config: %v", err)
|
||||
}
|
||||
|
||||
// Build complete schema with system columns - prefer flat schema if available
|
||||
var recordType *schema_pb.RecordType
|
||||
|
||||
if topicConf.GetMessageRecordType() != nil {
|
||||
// New flat schema format - use directly
|
||||
recordType = topicConf.GetMessageRecordType()
|
||||
}
|
||||
|
||||
if recordType == nil || len(recordType.Fields) == 0 {
|
||||
// For topics without schema, create a minimal schema with system fields and _value
|
||||
recordType = schema.RecordTypeBegin().
|
||||
WithField(SW_COLUMN_NAME_TIMESTAMP, schema.TypeInt64).
|
||||
WithField(SW_COLUMN_NAME_KEY, schema.TypeBytes).
|
||||
WithField(SW_COLUMN_NAME_VALUE, schema.TypeBytes). // Raw message value
|
||||
RecordTypeEnd()
|
||||
} else {
|
||||
// Add system columns that MQ adds to all records
|
||||
recordType = schema.NewRecordTypeBuilder(recordType).
|
||||
WithField(SW_COLUMN_NAME_TIMESTAMP, schema.TypeInt64).
|
||||
WithField(SW_COLUMN_NAME_KEY, schema.TypeBytes).
|
||||
RecordTypeEnd()
|
||||
}
|
||||
|
||||
// Convert to Parquet levels for efficient reading
|
||||
parquetLevels, err := schema.ToParquetLevels(recordType)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create Parquet levels: %v", err)
|
||||
}
|
||||
|
||||
return &ParquetScanner{
|
||||
filerClient: filerClient,
|
||||
chunkCache: chunk_cache.NewChunkCacheInMemory(256), // Same as MQ logstore
|
||||
topic: t,
|
||||
recordSchema: recordType,
|
||||
parquetLevels: parquetLevels,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ScanOptions configure how the scanner reads data
|
||||
type ScanOptions struct {
|
||||
// Time range filtering (Unix nanoseconds)
|
||||
StartTimeNs int64
|
||||
StopTimeNs int64
|
||||
|
||||
// Column projection - if empty, select all columns
|
||||
Columns []string
|
||||
|
||||
// Row limit - 0 means no limit
|
||||
Limit int
|
||||
|
||||
// Predicate for WHERE clause filtering
|
||||
Predicate func(*schema_pb.RecordValue) bool
|
||||
}
|
||||
|
||||
// ScanResult represents a single scanned record
|
||||
type ScanResult struct {
|
||||
Values map[string]*schema_pb.Value // Column name -> value
|
||||
Timestamp int64 // Message timestamp (_ts_ns)
|
||||
Key []byte // Message key (_key)
|
||||
}
|
||||
|
||||
// Scan reads records from the topic's Parquet files
|
||||
// Assumptions:
|
||||
// 1. Scans all partitions of the topic
|
||||
// 2. Applies time filtering at Parquet level for efficiency
|
||||
// 3. Applies predicates and projections after reading
|
||||
func (ps *ParquetScanner) Scan(ctx context.Context, options ScanOptions) ([]ScanResult, error) {
|
||||
var results []ScanResult
|
||||
|
||||
// Get all partitions for this topic
|
||||
// TODO: Implement proper partition discovery
|
||||
// For now, assume partition 0 exists
|
||||
partitions := []topic.Partition{{RangeStart: 0, RangeStop: 1000}}
|
||||
|
||||
for _, partition := range partitions {
|
||||
partitionResults, err := ps.scanPartition(ctx, partition, options)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to scan partition %v: %v", partition, err)
|
||||
}
|
||||
|
||||
results = append(results, partitionResults...)
|
||||
|
||||
// Apply global limit across all partitions
|
||||
if options.Limit > 0 && len(results) >= options.Limit {
|
||||
results = results[:options.Limit]
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// scanPartition scans a specific topic partition
|
||||
func (ps *ParquetScanner) scanPartition(ctx context.Context, partition topic.Partition, options ScanOptions) ([]ScanResult, error) {
|
||||
// partitionDir := topic.PartitionDir(ps.topic, partition) // TODO: Use for actual file listing
|
||||
|
||||
var results []ScanResult
|
||||
|
||||
// List Parquet files in partition directory
|
||||
// TODO: Implement proper file listing with date range filtering
|
||||
// For now, this is a placeholder that would list actual Parquet files
|
||||
|
||||
// Simulate file processing - in real implementation, this would:
|
||||
// 1. List files in partitionDir via filerClient
|
||||
// 2. Filter files by date range if time filtering is enabled
|
||||
// 3. Process each Parquet file in chronological order
|
||||
|
||||
// Placeholder: Create sample data for testing
|
||||
if len(results) == 0 {
|
||||
// Generate sample data for demonstration
|
||||
sampleData := ps.generateSampleData(options)
|
||||
results = append(results, sampleData...)
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// generateSampleData creates sample data for testing when no real Parquet files exist
|
||||
func (ps *ParquetScanner) generateSampleData(options ScanOptions) []ScanResult {
|
||||
now := time.Now().UnixNano()
|
||||
|
||||
sampleData := []ScanResult{
|
||||
{
|
||||
Values: map[string]*schema_pb.Value{
|
||||
"user_id": {Kind: &schema_pb.Value_Int32Value{Int32Value: 1001}},
|
||||
"event_type": {Kind: &schema_pb.Value_StringValue{StringValue: "login"}},
|
||||
"data": {Kind: &schema_pb.Value_StringValue{StringValue: `{"ip": "192.168.1.1"}`}},
|
||||
},
|
||||
Timestamp: now - 3600000000000, // 1 hour ago
|
||||
Key: []byte("user-1001"),
|
||||
},
|
||||
{
|
||||
Values: map[string]*schema_pb.Value{
|
||||
"user_id": {Kind: &schema_pb.Value_Int32Value{Int32Value: 1002}},
|
||||
"event_type": {Kind: &schema_pb.Value_StringValue{StringValue: "page_view"}},
|
||||
"data": {Kind: &schema_pb.Value_StringValue{StringValue: `{"page": "/dashboard"}`}},
|
||||
},
|
||||
Timestamp: now - 1800000000000, // 30 minutes ago
|
||||
Key: []byte("user-1002"),
|
||||
},
|
||||
{
|
||||
Values: map[string]*schema_pb.Value{
|
||||
"user_id": {Kind: &schema_pb.Value_Int32Value{Int32Value: 1001}},
|
||||
"event_type": {Kind: &schema_pb.Value_StringValue{StringValue: "logout"}},
|
||||
"data": {Kind: &schema_pb.Value_StringValue{StringValue: `{"session_duration": 3600}`}},
|
||||
},
|
||||
Timestamp: now - 900000000000, // 15 minutes ago
|
||||
Key: []byte("user-1001"),
|
||||
},
|
||||
}
|
||||
|
||||
// Apply predicate filtering if specified
|
||||
if options.Predicate != nil {
|
||||
var filtered []ScanResult
|
||||
for _, result := range sampleData {
|
||||
// Convert to RecordValue for predicate testing
|
||||
recordValue := &schema_pb.RecordValue{Fields: make(map[string]*schema_pb.Value)}
|
||||
for k, v := range result.Values {
|
||||
recordValue.Fields[k] = v
|
||||
}
|
||||
recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP] = &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: result.Timestamp}}
|
||||
recordValue.Fields[SW_COLUMN_NAME_KEY] = &schema_pb.Value{Kind: &schema_pb.Value_BytesValue{BytesValue: result.Key}}
|
||||
|
||||
if options.Predicate(recordValue) {
|
||||
filtered = append(filtered, result)
|
||||
}
|
||||
}
|
||||
sampleData = filtered
|
||||
}
|
||||
|
||||
// Apply limit
|
||||
if options.Limit > 0 && len(sampleData) > options.Limit {
|
||||
sampleData = sampleData[:options.Limit]
|
||||
}
|
||||
|
||||
return sampleData
|
||||
}
|
||||
|
||||
// ConvertToSQLResult converts ScanResults to SQL query results
|
||||
func (ps *ParquetScanner) ConvertToSQLResult(results []ScanResult, columns []string) *QueryResult {
|
||||
if len(results) == 0 {
|
||||
return &QueryResult{
|
||||
Columns: columns,
|
||||
Rows: [][]sqltypes.Value{},
|
||||
}
|
||||
}
|
||||
|
||||
// Determine columns if not specified
|
||||
if len(columns) == 0 {
|
||||
columnSet := make(map[string]bool)
|
||||
for _, result := range results {
|
||||
for columnName := range result.Values {
|
||||
columnSet[columnName] = true
|
||||
}
|
||||
}
|
||||
|
||||
columns = make([]string, 0, len(columnSet))
|
||||
for columnName := range columnSet {
|
||||
columns = append(columns, columnName)
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to SQL rows
|
||||
rows := make([][]sqltypes.Value, len(results))
|
||||
for i, result := range results {
|
||||
row := make([]sqltypes.Value, len(columns))
|
||||
for j, columnName := range columns {
|
||||
if value, exists := result.Values[columnName]; exists {
|
||||
row[j] = convertSchemaValueToSQL(value)
|
||||
} else {
|
||||
row[j] = sqltypes.NULL
|
||||
}
|
||||
}
|
||||
rows[i] = row
|
||||
}
|
||||
|
||||
return &QueryResult{
|
||||
Columns: columns,
|
||||
Rows: rows,
|
||||
}
|
||||
}
|
||||
|
||||
// convertSchemaValueToSQL converts schema_pb.Value to sqltypes.Value
|
||||
func convertSchemaValueToSQL(value *schema_pb.Value) sqltypes.Value {
|
||||
if value == nil {
|
||||
|
||||
@@ -1,117 +0,0 @@
|
||||
package engine
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestPartitionPathHandling tests that partition paths are handled correctly
|
||||
// whether discoverTopicPartitions returns relative or absolute paths
|
||||
func TestPartitionPathHandling(t *testing.T) {
|
||||
engine := NewMockSQLEngine()
|
||||
|
||||
t.Run("Mock discoverTopicPartitions returns correct paths", func(t *testing.T) {
|
||||
// Test that our mock engine handles absolute paths correctly
|
||||
engine.mockPartitions["test.user_events"] = []string{
|
||||
"/topics/test/user_events/v2025-09-03-15-36-29/0000-2520",
|
||||
"/topics/test/user_events/v2025-09-03-15-36-29/2521-5040",
|
||||
}
|
||||
|
||||
partitions, err := engine.discoverTopicPartitions("test", "user_events")
|
||||
assert.NoError(t, err, "Should discover partitions without error")
|
||||
assert.Equal(t, 2, len(partitions), "Should return 2 partitions")
|
||||
assert.Contains(t, partitions[0], "/topics/test/user_events/", "Should contain absolute path")
|
||||
})
|
||||
|
||||
t.Run("Mock discoverTopicPartitions handles relative paths", func(t *testing.T) {
|
||||
// Test relative paths scenario
|
||||
engine.mockPartitions["test.user_events"] = []string{
|
||||
"v2025-09-03-15-36-29/0000-2520",
|
||||
"v2025-09-03-15-36-29/2521-5040",
|
||||
}
|
||||
|
||||
partitions, err := engine.discoverTopicPartitions("test", "user_events")
|
||||
assert.NoError(t, err, "Should discover partitions without error")
|
||||
assert.Equal(t, 2, len(partitions), "Should return 2 partitions")
|
||||
assert.True(t, !strings.HasPrefix(partitions[0], "/topics/"), "Should be relative path")
|
||||
})
|
||||
|
||||
t.Run("Partition path building logic works correctly", func(t *testing.T) {
|
||||
topicBasePath := "/topics/test/user_events"
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
relativePartition string
|
||||
expectedPath string
|
||||
}{
|
||||
{
|
||||
name: "Absolute path - use as-is",
|
||||
relativePartition: "/topics/test/user_events/v2025-09-03-15-36-29/0000-2520",
|
||||
expectedPath: "/topics/test/user_events/v2025-09-03-15-36-29/0000-2520",
|
||||
},
|
||||
{
|
||||
name: "Relative path - build full path",
|
||||
relativePartition: "v2025-09-03-15-36-29/0000-2520",
|
||||
expectedPath: "/topics/test/user_events/v2025-09-03-15-36-29/0000-2520",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
var partitionPath string
|
||||
|
||||
// This is the same logic from our fixed code
|
||||
if strings.HasPrefix(tc.relativePartition, "/topics/") {
|
||||
// Already a full path - use as-is
|
||||
partitionPath = tc.relativePartition
|
||||
} else {
|
||||
// Relative path - build full path
|
||||
partitionPath = topicBasePath + "/" + tc.relativePartition
|
||||
}
|
||||
|
||||
assert.Equal(t, tc.expectedPath, partitionPath,
|
||||
"Partition path should be built correctly")
|
||||
|
||||
// Ensure no double slashes
|
||||
assert.NotContains(t, partitionPath, "//",
|
||||
"Partition path should not contain double slashes")
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestPartitionPathLogic tests the core logic for handling partition paths
|
||||
func TestPartitionPathLogic(t *testing.T) {
|
||||
t.Run("Building partition paths from discovered partitions", func(t *testing.T) {
|
||||
// Test the specific partition path building that was causing issues
|
||||
|
||||
topicBasePath := "/topics/ecommerce/user_events"
|
||||
|
||||
// This simulates the discoverTopicPartitions returning absolute paths (realistic scenario)
|
||||
relativePartitions := []string{
|
||||
"/topics/ecommerce/user_events/v2025-09-03-15-36-29/0000-2520",
|
||||
}
|
||||
|
||||
// This is the code from our fix - test it directly
|
||||
partitions := make([]string, len(relativePartitions))
|
||||
for i, relPartition := range relativePartitions {
|
||||
// Handle both relative and absolute partition paths from discoverTopicPartitions
|
||||
if strings.HasPrefix(relPartition, "/topics/") {
|
||||
// Already a full path - use as-is
|
||||
partitions[i] = relPartition
|
||||
} else {
|
||||
// Relative path - build full path
|
||||
partitions[i] = topicBasePath + "/" + relPartition
|
||||
}
|
||||
}
|
||||
|
||||
// Verify the path was handled correctly
|
||||
expectedPath := "/topics/ecommerce/user_events/v2025-09-03-15-36-29/0000-2520"
|
||||
assert.Equal(t, expectedPath, partitions[0], "Absolute path should be used as-is")
|
||||
|
||||
// Ensure no double slashes (this was the original bug)
|
||||
assert.NotContains(t, partitions[0], "//", "Path should not contain double slashes")
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user