Fix sql bugs (#7219)

* fix nil when explaining

* add plain details when running full scan

* skip files by timestamp

* skip file by timestamp

* refactor

* handle filter by time

* skip broker memory only if it has unflushed messages

* refactoring

* refactor

* address comments

* address comments

* filter by parquet stats

* simplify

* refactor

* prune old code

* optimize

* Update aggregations.go

* ensure non-time predicates are properly detected

* add stmt to populatePlanFileDetails

This helper function is a great way to centralize logic for populating file details. However, it's missing an optimization that is present in executeSelectStatementWithBrokerStats: pruning Parquet files based on column statistics from the WHERE clause.

Aggregation queries that fall back to the slow path could benefit from this optimization. Consider modifying the function signature to accept the *SelectStatement and adding the column statistics pruning logic here, similar to how it's done in executeSelectStatementWithBrokerStats.

* refactoring to work with *schema_pb.Value directly after the initial conversion
This commit is contained in:
Chris Lu
2025-09-10 11:04:42 -07:00
committed by GitHub
parent 8ed1b104ce
commit 58e0c1b330
5 changed files with 799 additions and 351 deletions

View File

@@ -3,6 +3,7 @@ package engine
import (
"container/heap"
"context"
"encoding/binary"
"encoding/json"
"fmt"
"io"
@@ -145,6 +146,46 @@ type ParquetFileStats struct {
FileName string
RowCount int64
ColumnStats map[string]*ParquetColumnStats
// Optional file-level timestamp range from filer extended attributes
MinTimestampNs int64
MaxTimestampNs int64
}
// getTimestampRangeFromStats returns (minTsNs, maxTsNs, ok) by inspecting common timestamp columns
func (h *HybridMessageScanner) getTimestampRangeFromStats(fileStats *ParquetFileStats) (int64, int64, bool) {
if fileStats == nil {
return 0, 0, false
}
// Prefer column stats for _ts_ns if present
if len(fileStats.ColumnStats) > 0 {
if s, ok := fileStats.ColumnStats[logstore.SW_COLUMN_NAME_TS]; ok && s != nil && s.MinValue != nil && s.MaxValue != nil {
if minNs, okMin := h.schemaValueToNs(s.MinValue); okMin {
if maxNs, okMax := h.schemaValueToNs(s.MaxValue); okMax {
return minNs, maxNs, true
}
}
}
}
// Fallback to file-level range if present in filer extended metadata
if fileStats.MinTimestampNs != 0 || fileStats.MaxTimestampNs != 0 {
return fileStats.MinTimestampNs, fileStats.MaxTimestampNs, true
}
return 0, 0, false
}
// schemaValueToNs converts a schema_pb.Value that represents a timestamp to ns
func (h *HybridMessageScanner) schemaValueToNs(v *schema_pb.Value) (int64, bool) {
if v == nil {
return 0, false
}
switch k := v.Kind.(type) {
case *schema_pb.Value_Int64Value:
return k.Int64Value, true
case *schema_pb.Value_Int32Value:
return int64(k.Int32Value), true
default:
return 0, false
}
}
// StreamingDataSource provides a streaming interface for reading scan results
@@ -1080,6 +1121,15 @@ func (h *HybridMessageScanner) extractParquetFileStats(entry *filer_pb.Entry, lo
RowCount: fileView.NumRows(),
ColumnStats: make(map[string]*ParquetColumnStats),
}
// Populate optional min/max from filer extended attributes (writer stores ns timestamps)
if entry != nil && entry.Extended != nil {
if minBytes, ok := entry.Extended["min"]; ok && len(minBytes) == 8 {
fileStats.MinTimestampNs = int64(binary.BigEndian.Uint64(minBytes))
}
if maxBytes, ok := entry.Extended["max"]; ok && len(maxBytes) == 8 {
fileStats.MaxTimestampNs = int64(binary.BigEndian.Uint64(maxBytes))
}
}
// Get schema information
schema := fileView.Schema()