Diffstat (limited to 'weed/query/engine/engine_test.go')
| -rw-r--r-- | weed/query/engine/engine_test.go | 1392 |
1 file changed, 1392 insertions, 0 deletions
diff --git a/weed/query/engine/engine_test.go b/weed/query/engine/engine_test.go
new file mode 100644
index 000000000..8193afef6
--- /dev/null
+++ b/weed/query/engine/engine_test.go
@@ -0,0 +1,1392 @@
package engine

import (
    "context"
    "encoding/binary"
    "errors"
    "testing"

    "github.com/seaweedfs/seaweedfs/weed/mq/topic"
    "github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
    "github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/mock"
    "google.golang.org/protobuf/proto"
)

// Mock implementations for testing
type MockHybridMessageScanner struct {
    mock.Mock
    topic topic.Topic
}

func (m *MockHybridMessageScanner) ReadParquetStatistics(partitionPath string) ([]*ParquetFileStats, error) {
    args := m.Called(partitionPath)
    return args.Get(0).([]*ParquetFileStats), args.Error(1)
}

type MockSQLEngine struct {
    *SQLEngine
    mockPartitions         map[string][]string
    mockParquetSourceFiles map[string]map[string]bool
    mockLiveLogRowCounts   map[string]int64
    mockColumnStats        map[string]map[string]*ParquetColumnStats
}

func NewMockSQLEngine() *MockSQLEngine {
    return &MockSQLEngine{
        SQLEngine: &SQLEngine{
            catalog: &SchemaCatalog{
                databases:       make(map[string]*DatabaseInfo),
                currentDatabase: "test",
            },
        },
        mockPartitions:         make(map[string][]string),
        mockParquetSourceFiles: make(map[string]map[string]bool),
        mockLiveLogRowCounts:   make(map[string]int64),
        mockColumnStats:        make(map[string]map[string]*ParquetColumnStats),
    }
}

func (m *MockSQLEngine) discoverTopicPartitions(namespace, topicName string) ([]string, error) {
    key := namespace + "." + topicName
    if partitions, exists := m.mockPartitions[key]; exists {
        return partitions, nil
    }
    return []string{"partition-1", "partition-2"}, nil
}

func (m *MockSQLEngine) extractParquetSourceFiles(fileStats []*ParquetFileStats) map[string]bool {
    if len(fileStats) == 0 {
        return make(map[string]bool)
    }
    return map[string]bool{"converted-log-1": true}
}

func (m *MockSQLEngine) countLiveLogRowsExcludingParquetSources(ctx context.Context, partition string, parquetSources map[string]bool) (int64, error) {
    if count, exists := m.mockLiveLogRowCounts[partition]; exists {
        return count, nil
    }
    return 25, nil
}

func (m *MockSQLEngine) computeLiveLogMinMax(partition, column string, parquetSources map[string]bool) (interface{}, interface{}, error) {
    switch column {
    case "id":
        return int64(1), int64(50), nil
    case "value":
        return 10.5, 99.9, nil
    default:
        return nil, nil, nil
    }
}

func (m *MockSQLEngine) getSystemColumnGlobalMin(column string, allFileStats map[string][]*ParquetFileStats) interface{} {
    return int64(1000000000)
}

func (m *MockSQLEngine) getSystemColumnGlobalMax(column string, allFileStats map[string][]*ParquetFileStats) interface{} {
    return int64(2000000000)
}

func createMockColumnStats(column string, minVal, maxVal interface{}) *ParquetColumnStats {
    return &ParquetColumnStats{
        ColumnName: column,
        MinValue:   convertToSchemaValue(minVal),
        MaxValue:   convertToSchemaValue(maxVal),
        NullCount:  0,
    }
}

func convertToSchemaValue(val interface{}) *schema_pb.Value {
    switch v := val.(type) {
    case int64:
        return &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: v}}
    case float64:
        return &schema_pb.Value{Kind: &schema_pb.Value_DoubleValue{DoubleValue: v}}
    case string:
        return &schema_pb.Value{Kind: &schema_pb.Value_StringValue{StringValue: v}}
    }
    return nil
}
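
// Editor's sketch: the reverse of convertToSchemaValue above, unwrapping the
// three Value kinds these tests use back into native Go values.
// extractNativeValueSketch is a hypothetical helper for illustration, not part
// of the engine's API.
func extractNativeValueSketch(v *schema_pb.Value) interface{} {
    if v == nil {
        return nil
    }
    switch k := v.Kind.(type) {
    case *schema_pb.Value_Int64Value:
        return k.Int64Value
    case *schema_pb.Value_DoubleValue:
        return k.DoubleValue
    case *schema_pb.Value_StringValue:
        return k.StringValue
    }
    return nil
}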

// Test FastPathOptimizer
func TestFastPathOptimizer_DetermineStrategy(t *testing.T) {
    engine := NewMockSQLEngine()
    optimizer := NewFastPathOptimizer(engine.SQLEngine)

    tests := []struct {
        name         string
        aggregations []AggregationSpec
        expected     AggregationStrategy
    }{
        {
            name: "Supported aggregations",
            aggregations: []AggregationSpec{
                {Function: FuncCOUNT, Column: "*"},
                {Function: FuncMAX, Column: "id"},
                {Function: FuncMIN, Column: "value"},
            },
            expected: AggregationStrategy{
                CanUseFastPath:   true,
                Reason:           "all_aggregations_supported",
                UnsupportedSpecs: []AggregationSpec{},
            },
        },
        {
            name: "Unsupported aggregation",
            aggregations: []AggregationSpec{
                {Function: FuncCOUNT, Column: "*"},
                {Function: FuncAVG, Column: "value"}, // Not supported
            },
            expected: AggregationStrategy{
                CanUseFastPath: false,
                Reason:         "unsupported_aggregation_functions",
            },
        },
        {
            name:         "Empty aggregations",
            aggregations: []AggregationSpec{},
            expected: AggregationStrategy{
                CanUseFastPath:   true,
                Reason:           "all_aggregations_supported",
                UnsupportedSpecs: []AggregationSpec{},
            },
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            strategy := optimizer.DetermineStrategy(tt.aggregations)

            assert.Equal(t, tt.expected.CanUseFastPath, strategy.CanUseFastPath)
            assert.Equal(t, tt.expected.Reason, strategy.Reason)
            if !tt.expected.CanUseFastPath {
                assert.NotEmpty(t, strategy.UnsupportedSpecs)
            }
        })
    }
}
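
// Editor's sketch of the support check the cases above expect
// DetermineStrategy to perform: COUNT/MIN/MAX qualify for the fast path,
// anything else (e.g. AVG) forces a full scan. canUseFastPathSketch is an
// illustrative name, not the optimizer's actual API.
func canUseFastPathSketch(specs []AggregationSpec) bool {
    for _, s := range specs {
        switch s.Function {
        case FuncCOUNT, FuncMIN, FuncMAX:
            // answerable from parquet statistics plus a live-log scan
        default:
            return false
        }
    }
    return true
}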

// Test AggregationComputer
func TestAggregationComputer_ComputeFastPathAggregations(t *testing.T) {
    engine := NewMockSQLEngine()
    computer := NewAggregationComputer(engine.SQLEngine)

    dataSources := &TopicDataSources{
        ParquetFiles: map[string][]*ParquetFileStats{
            "/topics/test/topic1/partition-1": {
                {
                    RowCount: 30,
                    ColumnStats: map[string]*ParquetColumnStats{
                        "id": createMockColumnStats("id", int64(10), int64(40)),
                    },
                },
            },
        },
        ParquetRowCount: 30,
        LiveLogRowCount: 25,
        PartitionsCount: 1,
    }

    partitions := []string{"/topics/test/topic1/partition-1"}

    tests := []struct {
        name         string
        aggregations []AggregationSpec
        validate     func(t *testing.T, results []AggregationResult)
    }{
        {
            name: "COUNT aggregation",
            aggregations: []AggregationSpec{
                {Function: FuncCOUNT, Column: "*"},
            },
            validate: func(t *testing.T, results []AggregationResult) {
                assert.Len(t, results, 1)
                assert.Equal(t, int64(55), results[0].Count) // 30 + 25
            },
        },
        {
            name: "MAX aggregation",
            aggregations: []AggregationSpec{
                {Function: FuncMAX, Column: "id"},
            },
            validate: func(t *testing.T, results []AggregationResult) {
                assert.Len(t, results, 1)
                // Should be max of parquet stats (40) - mock doesn't combine with live log
                assert.Equal(t, int64(40), results[0].Max)
            },
        },
        {
            name: "MIN aggregation",
            aggregations: []AggregationSpec{
                {Function: FuncMIN, Column: "id"},
            },
            validate: func(t *testing.T, results []AggregationResult) {
                assert.Len(t, results, 1)
                // Should be min of parquet stats (10) - mock doesn't combine with live log
                assert.Equal(t, int64(10), results[0].Min)
            },
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            ctx := context.Background()
            results, err := computer.ComputeFastPathAggregations(ctx, tt.aggregations, dataSources, partitions)

            assert.NoError(t, err)
            tt.validate(t, results)
        })
    }
}

// Test case-insensitive column lookup and null handling for MIN/MAX aggregations
func TestAggregationComputer_MinMaxEdgeCases(t *testing.T) {
    engine := NewMockSQLEngine()
    computer := NewAggregationComputer(engine.SQLEngine)

    tests := []struct {
        name         string
        dataSources  *TopicDataSources
        aggregations []AggregationSpec
        validate     func(t *testing.T, results []AggregationResult, err error)
    }{
        {
            name: "Case insensitive column lookup",
            dataSources: &TopicDataSources{
                ParquetFiles: map[string][]*ParquetFileStats{
                    "/topics/test/partition-1": {
                        {
                            RowCount: 50,
                            ColumnStats: map[string]*ParquetColumnStats{
                                "ID": createMockColumnStats("ID", int64(5), int64(95)), // Uppercase column name
                            },
                        },
                    },
                },
                ParquetRowCount: 50,
                LiveLogRowCount: 0,
                PartitionsCount: 1,
            },
            aggregations: []AggregationSpec{
                {Function: FuncMIN, Column: "id"}, // lowercase column name
                {Function: FuncMAX, Column: "id"},
            },
            validate: func(t *testing.T, results []AggregationResult, err error) {
                assert.NoError(t, err)
                assert.Len(t, results, 2)
                assert.Equal(t, int64(5), results[0].Min, "MIN should work with case-insensitive lookup")
                assert.Equal(t, int64(95), results[1].Max, "MAX should work with case-insensitive lookup")
            },
        },
        {
            name: "Null column stats handling",
            dataSources: &TopicDataSources{
                ParquetFiles: map[string][]*ParquetFileStats{
                    "/topics/test/partition-1": {
                        {
                            RowCount: 50,
                            ColumnStats: map[string]*ParquetColumnStats{
                                "id": {
                                    ColumnName: "id",
                                    MinValue:   nil, // Null min value
                                    MaxValue:   nil, // Null max value
                                    NullCount:  50,
                                    RowCount:   50,
                                },
                            },
                        },
                    },
                },
                ParquetRowCount: 50,
                LiveLogRowCount: 0,
                PartitionsCount: 1,
            },
            aggregations: []AggregationSpec{
                {Function: FuncMIN, Column: "id"},
                {Function: FuncMAX, Column: "id"},
            },
            validate: func(t *testing.T, results []AggregationResult, err error) {
                assert.NoError(t, err)
                assert.Len(t, results, 2)
                // When stats are null, should fall back to system column or return nil
                // This tests that we don't crash on null stats
            },
        },
        {
            name: "Mixed data types - string column",
            dataSources: &TopicDataSources{
                ParquetFiles: map[string][]*ParquetFileStats{
                    "/topics/test/partition-1": {
                        {
                            RowCount: 30,
                            ColumnStats: map[string]*ParquetColumnStats{
                                "name": createMockColumnStats("name", "Alice", "Zoe"),
                            },
                        },
                    },
                },
                ParquetRowCount: 30,
                LiveLogRowCount: 0,
                PartitionsCount: 1,
            },
            aggregations: []AggregationSpec{
                {Function: FuncMIN, Column: "name"},
                {Function: FuncMAX, Column: "name"},
            },
            validate: func(t *testing.T, results []AggregationResult, err error) {
                assert.NoError(t, err)
                assert.Len(t, results, 2)
                assert.Equal(t, "Alice", results[0].Min)
                assert.Equal(t, "Zoe", results[1].Max)
            },
        },
        {
            name: "Mixed data types - float column",
            dataSources: &TopicDataSources{
                ParquetFiles: map[string][]*ParquetFileStats{
                    "/topics/test/partition-1": {
                        {
                            RowCount: 25,
                            ColumnStats: map[string]*ParquetColumnStats{
                                "price": createMockColumnStats("price", float64(19.99), float64(299.50)),
                            },
                        },
                    },
                },
                ParquetRowCount: 25,
                LiveLogRowCount: 0,
                PartitionsCount: 1,
            },
            aggregations: []AggregationSpec{
                {Function: FuncMIN, Column: "price"},
                {Function: FuncMAX, Column: "price"},
            },
            validate: func(t *testing.T, results []AggregationResult, err error) {
                assert.NoError(t, err)
                assert.Len(t, results, 2)
                assert.Equal(t, float64(19.99), results[0].Min)
                assert.Equal(t, float64(299.50), results[1].Max)
            },
        },
        {
            name: "Column not found in parquet stats",
            dataSources: &TopicDataSources{
                ParquetFiles: map[string][]*ParquetFileStats{
                    "/topics/test/partition-1": {
                        {
                            RowCount: 20,
                            ColumnStats: map[string]*ParquetColumnStats{
                                "id": createMockColumnStats("id", int64(1), int64(100)),
                                // Note: "nonexistent_column" is not in stats
                            },
                        },
                    },
                },
                ParquetRowCount: 20,
                LiveLogRowCount: 10, // Has live logs to fall back to
                PartitionsCount: 1,
            },
            aggregations: []AggregationSpec{
                {Function: FuncMIN, Column: "nonexistent_column"},
                {Function: FuncMAX, Column: "nonexistent_column"},
            },
            validate: func(t *testing.T, results []AggregationResult, err error) {
                assert.NoError(t, err)
                assert.Len(t, results, 2)
                // Should fall back to live log processing or return nil
                // The key is that it shouldn't crash
            },
        },
        {
            name: "Multiple parquet files with different ranges",
            dataSources: &TopicDataSources{
                ParquetFiles: map[string][]*ParquetFileStats{
                    "/topics/test/partition-1": {
                        {
                            RowCount: 30,
                            ColumnStats: map[string]*ParquetColumnStats{
                                "score": createMockColumnStats("score", int64(10), int64(50)),
                            },
                        },
                        {
                            RowCount: 40,
                            ColumnStats: map[string]*ParquetColumnStats{
                                "score": createMockColumnStats("score", int64(5), int64(75)), // Lower min, higher max
                            },
                        },
                    },
                },
                ParquetRowCount: 70,
                LiveLogRowCount: 0,
                PartitionsCount: 1,
            },
            aggregations: []AggregationSpec{
                {Function: FuncMIN, Column: "score"},
                {Function: FuncMAX, Column: "score"},
            },
            validate: func(t *testing.T, results []AggregationResult, err error) {
                assert.NoError(t, err)
                assert.Len(t, results, 2)
                assert.Equal(t, int64(5), results[0].Min, "Should find global minimum across all files")
                assert.Equal(t, int64(75), results[1].Max, "Should find global maximum across all files")
            },
        },
    }

    partitions := []string{"/topics/test/partition-1"}

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            ctx := context.Background()
            results, err := computer.ComputeFastPathAggregations(ctx, tt.aggregations, tt.dataSources, partitions)
            tt.validate(t, results, err)
        })
    }
}
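
// Editor's sketch (hypothetical helper, int64 stats only) of the cross-file
// fold the "Multiple parquet files" case above expects: the global MIN is the
// smallest per-file minimum and the global MAX the largest per-file maximum,
// skipping files whose stats are missing or null.
func foldInt64MinMaxSketch(files []*ParquetFileStats, column string) (minV, maxV int64, found bool) {
    for _, f := range files {
        stats, ok := f.ColumnStats[column]
        if !ok || stats.MinValue == nil || stats.MaxValue == nil {
            continue // column absent or stats are null for this file
        }
        lo, hi := stats.MinValue.GetInt64Value(), stats.MaxValue.GetInt64Value()
        if !found || lo < minV {
            minV = lo
        }
        if !found || hi > maxV {
            maxV = hi
        }
        found = true
    }
    return minV, maxV, found
}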

// Test the specific bug where MIN/MAX was returning empty values
func TestAggregationComputer_MinMaxEmptyValuesBugFix(t *testing.T) {
    engine := NewMockSQLEngine()
    computer := NewAggregationComputer(engine.SQLEngine)

    // This test specifically addresses the bug where MIN/MAX returned empty
    // due to improper null checking and extraction logic
    dataSources := &TopicDataSources{
        ParquetFiles: map[string][]*ParquetFileStats{
            "/topics/test/test-topic/partition1": {
                {
                    RowCount: 100,
                    ColumnStats: map[string]*ParquetColumnStats{
                        "id": {
                            ColumnName: "id",
                            MinValue:   &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: 0}},  // Min should be 0
                            MaxValue:   &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: 99}}, // Max should be 99
                            NullCount:  0,
                            RowCount:   100,
                        },
                    },
                },
            },
        },
        ParquetRowCount: 100,
        LiveLogRowCount: 0, // No live logs, pure parquet stats
        PartitionsCount: 1,
    }

    partitions := []string{"/topics/test/test-topic/partition1"}

    tests := []struct {
        name       string
        aggregSpec AggregationSpec
        expected   interface{}
    }{
        {
            name:       "MIN should return 0 not empty",
            aggregSpec: AggregationSpec{Function: FuncMIN, Column: "id"},
            expected:   int32(0), // Should extract the actual minimum value
        },
        {
            name:       "MAX should return 99 not empty",
            aggregSpec: AggregationSpec{Function: FuncMAX, Column: "id"},
            expected:   int32(99), // Should extract the actual maximum value
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            ctx := context.Background()
            results, err := computer.ComputeFastPathAggregations(ctx, []AggregationSpec{tt.aggregSpec}, dataSources, partitions)

            assert.NoError(t, err)
            assert.Len(t, results, 1)

            // Verify the result is not nil/empty
            if tt.aggregSpec.Function == FuncMIN {
                assert.NotNil(t, results[0].Min, "MIN result should not be nil")
                assert.Equal(t, tt.expected, results[0].Min)
            } else if tt.aggregSpec.Function == FuncMAX {
                assert.NotNil(t, results[0].Max, "MAX result should not be nil")
                assert.Equal(t, tt.expected, results[0].Max)
            }
        })
    }
}

// Test the formatAggregationResult function with MIN/MAX edge cases
func TestSQLEngine_FormatAggregationResult_MinMax(t *testing.T) {
    engine := NewTestSQLEngine()

    tests := []struct {
        name     string
        spec     AggregationSpec
        result   AggregationResult
        expected string
    }{
        {
            name:     "MIN with zero value should not be empty",
            spec:     AggregationSpec{Function: FuncMIN, Column: "id"},
            result:   AggregationResult{Min: int32(0)},
            expected: "0",
        },
        {
            name:     "MAX with large value",
            spec:     AggregationSpec{Function: FuncMAX, Column: "id"},
            result:   AggregationResult{Max: int32(99)},
            expected: "99",
        },
        {
            name:     "MIN with negative value",
            spec:     AggregationSpec{Function: FuncMIN, Column: "score"},
            result:   AggregationResult{Min: int64(-50)},
            expected: "-50",
        },
        {
            name:     "MAX with float value",
            spec:     AggregationSpec{Function: FuncMAX, Column: "price"},
            result:   AggregationResult{Max: float64(299.99)},
            expected: "299.99",
        },
        {
            name:     "MIN with string value",
            spec:     AggregationSpec{Function: FuncMIN, Column: "name"},
            result:   AggregationResult{Min: "Alice"},
            expected: "Alice",
        },
        {
            name:     "MIN with nil should return NULL",
            spec:     AggregationSpec{Function: FuncMIN, Column: "missing"},
            result:   AggregationResult{Min: nil},
            expected: "", // NULL values display as empty
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            sqlValue := engine.formatAggregationResult(tt.spec, tt.result)
            assert.Equal(t, tt.expected, sqlValue.String())
        })
    }
}

// Test the direct formatAggregationResult scenario that was originally broken
func TestSQLEngine_MinMaxBugFixIntegration(t *testing.T) {
    // This test focuses on the core bug fix without the complexity of table discovery
    // It directly tests the scenario where MIN/MAX returned empty due to the bug

    engine := NewTestSQLEngine()

    // Test the direct formatting path that was failing
    tests := []struct {
        name          string
        aggregSpec    AggregationSpec
        aggResult     AggregationResult
        expectedEmpty bool
        expectedValue string
    }{
        {
            name:          "MIN with zero should not be empty (the original bug)",
            aggregSpec:    AggregationSpec{Function: FuncMIN, Column: "id", Alias: "MIN(id)"},
            aggResult:     AggregationResult{Min: int32(0)}, // This was returning empty before fix
            expectedEmpty: false,
            expectedValue: "0",
        },
        {
            name:          "MAX with valid value should not be empty",
            aggregSpec:    AggregationSpec{Function: FuncMAX, Column: "id", Alias: "MAX(id)"},
            aggResult:     AggregationResult{Max: int32(99)},
            expectedEmpty: false,
            expectedValue: "99",
        },
        {
            name:          "MIN with negative value should work",
            aggregSpec:    AggregationSpec{Function: FuncMIN, Column: "score", Alias: "MIN(score)"},
            aggResult:     AggregationResult{Min: int64(-10)},
            expectedEmpty: false,
            expectedValue: "-10",
        },
        {
            name:          "MIN with nil should be empty (expected behavior)",
            aggregSpec:    AggregationSpec{Function: FuncMIN, Column: "missing", Alias: "MIN(missing)"},
            aggResult:     AggregationResult{Min: nil},
            expectedEmpty: true,
            expectedValue: "",
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            // Test the formatAggregationResult function directly
            sqlValue := engine.formatAggregationResult(tt.aggregSpec, tt.aggResult)
            result := sqlValue.String()

            if tt.expectedEmpty {
                assert.Empty(t, result, "Result should be empty for nil values")
            } else {
                assert.NotEmpty(t, result, "Result should not be empty")
                assert.Equal(t, tt.expectedValue, result)
            }
        })
    }
}

// Test the tryFastParquetAggregation method specifically for the bug
func TestSQLEngine_FastParquetAggregationBugFix(t *testing.T) {
    // This test verifies that the fast path aggregation logic works correctly
    // and doesn't return nil/empty values when it should return actual data

    engine := NewMockSQLEngine()
    computer := NewAggregationComputer(engine.SQLEngine)

    // Create realistic data sources that mimic the user's scenario
    dataSources := &TopicDataSources{
        ParquetFiles: map[string][]*ParquetFileStats{
            "/topics/test/test-topic/v2025-09-01-22-54-02/0000-0630": {
                {
                    RowCount: 100,
                    ColumnStats: map[string]*ParquetColumnStats{
                        "id": {
                            ColumnName: "id",
                            MinValue:   &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: 0}},
                            MaxValue:   &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: 99}},
                            NullCount:  0,
                            RowCount:   100,
                        },
                    },
                },
            },
        },
        ParquetRowCount: 100,
        LiveLogRowCount: 0, // Pure parquet scenario
        PartitionsCount: 1,
    }

    partitions := []string{"/topics/test/test-topic/v2025-09-01-22-54-02/0000-0630"}

    tests := []struct {
        name            string
        aggregations    []AggregationSpec
        validateResults func(t *testing.T, results []AggregationResult)
    }{
        {
            name: "Single MIN aggregation should return value not nil",
            aggregations: []AggregationSpec{
                {Function: FuncMIN, Column: "id", Alias: "MIN(id)"},
            },
            validateResults: func(t *testing.T, results []AggregationResult) {
                assert.Len(t, results, 1)
                assert.NotNil(t, results[0].Min, "MIN result should not be nil")
                assert.Equal(t, int32(0), results[0].Min, "MIN should return the correct minimum value")
            },
        },
        {
            name: "Single MAX aggregation should return value not nil",
            aggregations: []AggregationSpec{
                {Function: FuncMAX, Column: "id", Alias: "MAX(id)"},
            },
            validateResults: func(t *testing.T, results []AggregationResult) {
                assert.Len(t, results, 1)
                assert.NotNil(t, results[0].Max, "MAX result should not be nil")
                assert.Equal(t, int32(99), results[0].Max, "MAX should return the correct maximum value")
            },
        },
        {
            name: "Combined MIN/MAX should both return values",
            aggregations: []AggregationSpec{
                {Function: FuncMIN, Column: "id", Alias: "MIN(id)"},
                {Function: FuncMAX, Column: "id", Alias: "MAX(id)"},
            },
            validateResults: func(t *testing.T, results []AggregationResult) {
                assert.Len(t, results, 2)
                assert.NotNil(t, results[0].Min, "MIN result should not be nil")
                assert.NotNil(t, results[1].Max, "MAX result should not be nil")
                assert.Equal(t, int32(0), results[0].Min)
                assert.Equal(t, int32(99), results[1].Max)
            },
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            ctx := context.Background()
            results, err := computer.ComputeFastPathAggregations(ctx, tt.aggregations, dataSources, partitions)

            assert.NoError(t, err, "ComputeFastPathAggregations should not error")
            tt.validateResults(t, results)
        })
    }
}

// Test ExecutionPlanBuilder
func TestExecutionPlanBuilder_BuildAggregationPlan(t *testing.T) {
    engine := NewMockSQLEngine()
    builder := NewExecutionPlanBuilder(engine.SQLEngine)

    // Parse a simple SELECT statement using the native parser
    stmt, err := ParseSQL("SELECT COUNT(*) FROM test_topic")
    assert.NoError(t, err)
    selectStmt := stmt.(*SelectStatement)

    aggregations := []AggregationSpec{
        {Function: FuncCOUNT, Column: "*"},
    }

    strategy := AggregationStrategy{
        CanUseFastPath: true,
        Reason:         "all_aggregations_supported",
    }

    dataSources := &TopicDataSources{
        ParquetRowCount: 100,
        LiveLogRowCount: 50,
        PartitionsCount: 3,
        ParquetFiles: map[string][]*ParquetFileStats{
            "partition-1": {{RowCount: 50}},
            "partition-2": {{RowCount: 50}},
        },
    }

    plan := builder.BuildAggregationPlan(selectStmt, aggregations, strategy, dataSources)

    assert.Equal(t, "SELECT", plan.QueryType)
    assert.Equal(t, "hybrid_fast_path", plan.ExecutionStrategy)
    assert.Contains(t, plan.DataSources, "parquet_stats")
    assert.Contains(t, plan.DataSources, "live_logs")
    assert.Equal(t, 3, plan.PartitionsScanned)
    assert.Equal(t, 2, plan.ParquetFilesScanned)
    assert.Contains(t, plan.OptimizationsUsed, "parquet_statistics")
    assert.Equal(t, []string{"COUNT(*)"}, plan.Aggregations)
    assert.Equal(t, int64(50), plan.TotalRowsProcessed) // Only live logs scanned
}

// Test Error Types
func TestErrorTypes(t *testing.T) {
    t.Run("AggregationError", func(t *testing.T) {
        err := AggregationError{
            Operation: "MAX",
            Column:    "id",
            Cause:     errors.New("column not found"),
        }

        expected := "aggregation error in MAX(id): column not found"
        assert.Equal(t, expected, err.Error())
    })

    t.Run("DataSourceError", func(t *testing.T) {
        err := DataSourceError{
            Source: "partition_discovery:test.topic1",
            Cause:  errors.New("network timeout"),
        }

        expected := "data source error in partition_discovery:test.topic1: network timeout"
        assert.Equal(t, expected, err.Error())
    })

    t.Run("OptimizationError", func(t *testing.T) {
        err := OptimizationError{
            Strategy: "fast_path_aggregation",
            Reason:   "unsupported function: AVG",
        }

        expected := "optimization failed for fast_path_aggregation: unsupported function: AVG"
        assert.Equal(t, expected, err.Error())
    })
}

// Integration Tests
func TestIntegration_FastPathOptimization(t *testing.T) {
    engine := NewMockSQLEngine()

    // Setup components
    optimizer := NewFastPathOptimizer(engine.SQLEngine)
    computer := NewAggregationComputer(engine.SQLEngine)

    // Mock data setup
    aggregations := []AggregationSpec{
        {Function: FuncCOUNT, Column: "*"},
        {Function: FuncMAX, Column: "id"},
    }

    // Step 1: Determine strategy
    strategy := optimizer.DetermineStrategy(aggregations)
    assert.True(t, strategy.CanUseFastPath)

    // Step 2: Mock data sources
    dataSources := &TopicDataSources{
        ParquetFiles: map[string][]*ParquetFileStats{
            "/topics/test/topic1/partition-1": {{
                RowCount: 75,
                ColumnStats: map[string]*ParquetColumnStats{
                    "id": createMockColumnStats("id", int64(1), int64(100)),
                },
            }},
        },
        ParquetRowCount: 75,
        LiveLogRowCount: 25,
        PartitionsCount: 1,
    }

    partitions := []string{"/topics/test/topic1/partition-1"}

    // Step 3: Compute aggregations
    ctx := context.Background()
    results, err := computer.ComputeFastPathAggregations(ctx, aggregations, dataSources, partitions)
    assert.NoError(t, err)
    assert.Len(t, results, 2)
    assert.Equal(t, int64(100), results[0].Count) // 75 + 25
    assert.Equal(t, int64(100), results[1].Max)   // From parquet stats mock
}

func TestIntegration_FallbackToFullScan(t *testing.T) {
    engine := NewMockSQLEngine()
    optimizer := NewFastPathOptimizer(engine.SQLEngine)

    // Unsupported aggregations
    aggregations := []AggregationSpec{
        {Function: FuncAVG, Column: "value"}, // Not supported
    }

    // Step 1: Strategy should reject fast path
    strategy := optimizer.DetermineStrategy(aggregations)
    assert.False(t, strategy.CanUseFastPath)
    assert.Equal(t, "unsupported_aggregation_functions", strategy.Reason)
    assert.NotEmpty(t, strategy.UnsupportedSpecs)
}

// Benchmark Tests
func BenchmarkFastPathOptimizer_DetermineStrategy(b *testing.B) {
    engine := NewMockSQLEngine()
    optimizer := NewFastPathOptimizer(engine.SQLEngine)

    aggregations := []AggregationSpec{
        {Function: FuncCOUNT, Column: "*"},
        {Function: FuncMAX, Column: "id"},
        {Function: FuncMIN, Column: "value"},
    }

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        strategy := optimizer.DetermineStrategy(aggregations)
        _ = strategy.CanUseFastPath
    }
}

func BenchmarkAggregationComputer_ComputeFastPathAggregations(b *testing.B) {
    engine := NewMockSQLEngine()
    computer := NewAggregationComputer(engine.SQLEngine)

    dataSources := &TopicDataSources{
        ParquetFiles: map[string][]*ParquetFileStats{
            "partition-1": {{
                RowCount: 1000,
                ColumnStats: map[string]*ParquetColumnStats{
                    "id": createMockColumnStats("id", int64(1), int64(1000)),
                },
            }},
        },
        ParquetRowCount: 1000,
        LiveLogRowCount: 100,
    }

    aggregations := []AggregationSpec{
        {Function: FuncCOUNT, Column: "*"},
        {Function: FuncMAX, Column: "id"},
    }

    partitions := []string{"partition-1"}
    ctx := context.Background()

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        results, err := computer.ComputeFastPathAggregations(ctx, aggregations, dataSources, partitions)
        if err != nil {
            b.Fatal(err)
        }
        _ = results
    }
}

// Tests for convertLogEntryToRecordValue - Protocol Buffer parsing bug fix
func TestSQLEngine_ConvertLogEntryToRecordValue_ValidProtobuf(t *testing.T) {
    engine := NewTestSQLEngine()

    // Create a valid RecordValue protobuf with user data
    originalRecord := &schema_pb.RecordValue{
        Fields: map[string]*schema_pb.Value{
            "id":    {Kind: &schema_pb.Value_Int32Value{Int32Value: 42}},
            "name":  {Kind: &schema_pb.Value_StringValue{StringValue: "test-user"}},
            "score": {Kind: &schema_pb.Value_DoubleValue{DoubleValue: 95.5}},
        },
    }

    // Serialize the protobuf (this is what MQ actually stores)
    protobufData, err := proto.Marshal(originalRecord)
    assert.NoError(t, err)

    // Create a LogEntry with the serialized data
    logEntry := &filer_pb.LogEntry{
        TsNs:             1609459200000000000, // 2021-01-01 00:00:00 UTC
        PartitionKeyHash: 123,
        Data:             protobufData, // Protocol buffer data (not JSON!)
        Key:              []byte("test-key-001"),
    }

    // Test the conversion
    result, source, err := engine.convertLogEntryToRecordValue(logEntry)

    // Verify no error
    assert.NoError(t, err)
    assert.Equal(t, "live_log", source)
    assert.NotNil(t, result)
    assert.NotNil(t, result.Fields)

    // Verify system columns are added correctly
    assert.Contains(t, result.Fields, SW_COLUMN_NAME_TIMESTAMP)
    assert.Contains(t, result.Fields, SW_COLUMN_NAME_KEY)
    assert.Equal(t, int64(1609459200000000000), result.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value())
    assert.Equal(t, []byte("test-key-001"), result.Fields[SW_COLUMN_NAME_KEY].GetBytesValue())

    // Verify user data is preserved
    assert.Contains(t, result.Fields, "id")
    assert.Contains(t, result.Fields, "name")
    assert.Contains(t, result.Fields, "score")
    assert.Equal(t, int32(42), result.Fields["id"].GetInt32Value())
    assert.Equal(t, "test-user", result.Fields["name"].GetStringValue())
    assert.Equal(t, 95.5, result.Fields["score"].GetDoubleValue())
}
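
// Editor's sketch of the behavior the surrounding tests assert; a hypothetical
// free function, not the engine's method. The LogEntry payload is
// protobuf-encoded RecordValue bytes (not JSON), and the system columns are
// always injected from the LogEntry envelope.
func convertLogEntryToRecordValueSketch(logEntry *filer_pb.LogEntry) (*schema_pb.RecordValue, string, error) {
    record := &schema_pb.RecordValue{}
    if err := proto.Unmarshal(logEntry.Data, record); err != nil {
        return nil, "", errors.New("failed to unmarshal log entry protobuf: " + err.Error())
    }
    if record.Fields == nil {
        record.Fields = make(map[string]*schema_pb.Value) // handle nil Fields map
    }
    // System columns come from the envelope, overriding any same-named user fields
    record.Fields[SW_COLUMN_NAME_TIMESTAMP] = &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: logEntry.TsNs}}
    record.Fields[SW_COLUMN_NAME_KEY] = &schema_pb.Value{Kind: &schema_pb.Value_BytesValue{BytesValue: logEntry.Key}}
    return record, "live_log", nil
}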

func TestSQLEngine_ConvertLogEntryToRecordValue_InvalidProtobuf(t *testing.T) {
    engine := NewTestSQLEngine()

    // Create LogEntry with invalid protobuf data (this would cause the original JSON parsing bug)
    logEntry := &filer_pb.LogEntry{
        TsNs:             1609459200000000000,
        PartitionKeyHash: 123,
        Data:             []byte{0x17, 0x00, 0xFF, 0xFE}, // Invalid protobuf data (starts with \x17 like in the original error)
        Key:              []byte("test-key"),
    }

    // Test the conversion
    result, source, err := engine.convertLogEntryToRecordValue(logEntry)

    // Should return error for invalid protobuf
    assert.Error(t, err)
    assert.Contains(t, err.Error(), "failed to unmarshal log entry protobuf")
    assert.Nil(t, result)
    assert.Empty(t, source)
}

func TestSQLEngine_ConvertLogEntryToRecordValue_EmptyProtobuf(t *testing.T) {
    engine := NewTestSQLEngine()

    // Create a minimal valid RecordValue (empty fields)
    emptyRecord := &schema_pb.RecordValue{
        Fields: map[string]*schema_pb.Value{},
    }
    protobufData, err := proto.Marshal(emptyRecord)
    assert.NoError(t, err)

    logEntry := &filer_pb.LogEntry{
        TsNs:             1609459200000000000,
        PartitionKeyHash: 456,
        Data:             protobufData,
        Key:              []byte("empty-key"),
    }

    // Test the conversion
    result, source, err := engine.convertLogEntryToRecordValue(logEntry)

    // Should succeed and add system columns
    assert.NoError(t, err)
    assert.Equal(t, "live_log", source)
    assert.NotNil(t, result)
    assert.NotNil(t, result.Fields)

    // Should have system columns
    assert.Contains(t, result.Fields, SW_COLUMN_NAME_TIMESTAMP)
    assert.Contains(t, result.Fields, SW_COLUMN_NAME_KEY)
    assert.Equal(t, int64(1609459200000000000), result.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value())
    assert.Equal(t, []byte("empty-key"), result.Fields[SW_COLUMN_NAME_KEY].GetBytesValue())

    // Should have no user fields
    userFieldCount := 0
    for fieldName := range result.Fields {
        if fieldName != SW_COLUMN_NAME_TIMESTAMP && fieldName != SW_COLUMN_NAME_KEY {
            userFieldCount++
        }
    }
    assert.Equal(t, 0, userFieldCount)
}

func TestSQLEngine_ConvertLogEntryToRecordValue_NilFieldsMap(t *testing.T) {
    engine := NewTestSQLEngine()

    // Create RecordValue with nil Fields map (edge case)
    recordWithNilFields := &schema_pb.RecordValue{
        Fields: nil, // This should be handled gracefully
    }
    protobufData, err := proto.Marshal(recordWithNilFields)
    assert.NoError(t, err)

    logEntry := &filer_pb.LogEntry{
        TsNs:             1609459200000000000,
        PartitionKeyHash: 789,
        Data:             protobufData,
        Key:              []byte("nil-fields-key"),
    }

    // Test the conversion
    result, source, err := engine.convertLogEntryToRecordValue(logEntry)

    // Should succeed and create Fields map
    assert.NoError(t, err)
    assert.Equal(t, "live_log", source)
    assert.NotNil(t, result)
    assert.NotNil(t, result.Fields) // Should be created by the function

    // Should have system columns
    assert.Contains(t, result.Fields, SW_COLUMN_NAME_TIMESTAMP)
    assert.Contains(t, result.Fields, SW_COLUMN_NAME_KEY)
    assert.Equal(t, int64(1609459200000000000), result.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value())
    assert.Equal(t, []byte("nil-fields-key"), result.Fields[SW_COLUMN_NAME_KEY].GetBytesValue())
}

func TestSQLEngine_ConvertLogEntryToRecordValue_SystemColumnOverride(t *testing.T) {
    engine := NewTestSQLEngine()

    // Create RecordValue that already has system column names (should be overridden)
    recordWithSystemCols := &schema_pb.RecordValue{
        Fields: map[string]*schema_pb.Value{
            "user_field":             {Kind: &schema_pb.Value_StringValue{StringValue: "user-data"}},
            SW_COLUMN_NAME_TIMESTAMP: {Kind: &schema_pb.Value_Int64Value{Int64Value: 999999999}}, // Should be overridden
            SW_COLUMN_NAME_KEY:       {Kind: &schema_pb.Value_StringValue{StringValue: "old-key"}}, // Should be overridden
        },
    }
    protobufData, err := proto.Marshal(recordWithSystemCols)
    assert.NoError(t, err)

    logEntry := &filer_pb.LogEntry{
        TsNs:             1609459200000000000,
        PartitionKeyHash: 100,
        Data:             protobufData,
        Key:              []byte("actual-key"),
    }

    // Test the conversion
    result, source, err := engine.convertLogEntryToRecordValue(logEntry)

    // Should succeed
    assert.NoError(t, err)
    assert.Equal(t, "live_log", source)
    assert.NotNil(t, result)

    // System columns should use LogEntry values, not protobuf values
    assert.Equal(t, int64(1609459200000000000), result.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value())
    assert.Equal(t, []byte("actual-key"), result.Fields[SW_COLUMN_NAME_KEY].GetBytesValue())

    // User field should be preserved
    assert.Contains(t, result.Fields, "user_field")
    assert.Equal(t, "user-data", result.Fields["user_field"].GetStringValue())
}

func TestSQLEngine_ConvertLogEntryToRecordValue_ComplexDataTypes(t *testing.T) {
    engine := NewTestSQLEngine()

    // Test with various data types
    complexRecord := &schema_pb.RecordValue{
        Fields: map[string]*schema_pb.Value{
            "int32_field":  {Kind: &schema_pb.Value_Int32Value{Int32Value: -42}},
            "int64_field":  {Kind: &schema_pb.Value_Int64Value{Int64Value: 9223372036854775807}},
            "float_field":  {Kind: &schema_pb.Value_FloatValue{FloatValue: 3.14159}},
            "double_field": {Kind: &schema_pb.Value_DoubleValue{DoubleValue: 2.718281828}},
            "bool_field":   {Kind: &schema_pb.Value_BoolValue{BoolValue: true}},
            "string_field": {Kind: &schema_pb.Value_StringValue{StringValue: "test string with unicode 🎉"}},
            "bytes_field":  {Kind: &schema_pb.Value_BytesValue{BytesValue: []byte{0x01, 0x02, 0x03}}},
        },
    }
    protobufData, err := proto.Marshal(complexRecord)
    assert.NoError(t, err)

    logEntry := &filer_pb.LogEntry{
        TsNs:             1609459200000000000,
        PartitionKeyHash: 200,
        Data:             protobufData,
        Key:              []byte("complex-key"),
    }

    // Test the conversion
    result, source, err := engine.convertLogEntryToRecordValue(logEntry)

    // Should succeed
    assert.NoError(t, err)
    assert.Equal(t, "live_log", source)
    assert.NotNil(t, result)

    // Verify all data types are preserved
    assert.Equal(t, int32(-42), result.Fields["int32_field"].GetInt32Value())
    assert.Equal(t, int64(9223372036854775807), result.Fields["int64_field"].GetInt64Value())
    assert.Equal(t, float32(3.14159), result.Fields["float_field"].GetFloatValue())
    assert.Equal(t, 2.718281828, result.Fields["double_field"].GetDoubleValue())
    assert.Equal(t, true, result.Fields["bool_field"].GetBoolValue())
    assert.Equal(t, "test string with unicode 🎉", result.Fields["string_field"].GetStringValue())
    assert.Equal(t, []byte{0x01, 0x02, 0x03}, result.Fields["bytes_field"].GetBytesValue())

    // System columns should still be present
    assert.Contains(t, result.Fields, SW_COLUMN_NAME_TIMESTAMP)
    assert.Contains(t, result.Fields, SW_COLUMN_NAME_KEY)
}

// Tests for log buffer deduplication functionality
func TestSQLEngine_GetLogBufferStartFromFile_BinaryFormat(t *testing.T) {
    engine := NewTestSQLEngine()

    // Create sample buffer start (binary format)
    bufferStartBytes := make([]byte, 8)
    binary.BigEndian.PutUint64(bufferStartBytes, uint64(1609459100000000001))

    // Create file entry with buffer start + some chunks
    entry := &filer_pb.Entry{
        Name: "test-log-file",
        Extended: map[string][]byte{
            "buffer_start": bufferStartBytes,
        },
        Chunks: []*filer_pb.FileChunk{
            {FileId: "chunk1", Offset: 0, Size: 1000},
            {FileId: "chunk2", Offset: 1000, Size: 1000},
            {FileId: "chunk3", Offset: 2000, Size: 1000},
        },
    }

    // Test extraction
    result, err := engine.getLogBufferStartFromFile(entry)
    assert.NoError(t, err)
    assert.NotNil(t, result)
    assert.Equal(t, int64(1609459100000000001), result.StartIndex)

    // Test extraction works correctly with the binary format
}

func TestSQLEngine_GetLogBufferStartFromFile_NoMetadata(t *testing.T) {
    engine := NewTestSQLEngine()

    // Create file entry without buffer start
    entry := &filer_pb.Entry{
        Name:     "test-log-file",
        Extended: nil,
    }

    // Test extraction
    result, err := engine.getLogBufferStartFromFile(entry)
    assert.NoError(t, err)
    assert.Nil(t, result)
}

func TestSQLEngine_GetLogBufferStartFromFile_InvalidData(t *testing.T) {
    engine := NewTestSQLEngine()

    // Create file entry with invalid buffer start (wrong size)
    entry := &filer_pb.Entry{
        Name: "test-log-file",
        Extended: map[string][]byte{
            "buffer_start": []byte("invalid-binary"),
        },
    }

    // Test extraction
    result, err := engine.getLogBufferStartFromFile(entry)
    assert.Error(t, err)
    assert.Contains(t, err.Error(), "invalid buffer_start format: expected 8 bytes")
    assert.Nil(t, result)
}
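
// Editor's sketch of the 8-byte BigEndian buffer_start parsing the three tests
// above exercise (parseBufferStartSketch is an illustrative name, not the
// engine's API): absent metadata is not an error, a wrong-size value is.
func parseBufferStartSketch(entry *filer_pb.Entry) (*LogBufferStart, error) {
    if entry.Extended == nil {
        return nil, nil // no extended metadata at all
    }
    raw, ok := entry.Extended["buffer_start"]
    if !ok {
        return nil, nil // metadata present but no buffer_start key
    }
    if len(raw) != 8 {
        return nil, errors.New("invalid buffer_start format: expected 8 bytes")
    }
    return &LogBufferStart{StartIndex: int64(binary.BigEndian.Uint64(raw))}, nil
}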

func TestSQLEngine_BuildLogBufferDeduplicationMap_NoBrokerClient(t *testing.T) {
    engine := NewTestSQLEngine()
    engine.catalog.brokerClient = nil // Simulate no broker client

    ctx := context.Background()
    result, err := engine.buildLogBufferDeduplicationMap(ctx, "/topics/test/test-topic")

    assert.NoError(t, err)
    assert.NotNil(t, result)
    assert.Empty(t, result)
}

func TestSQLEngine_LogBufferDeduplication_ServerRestartScenario(t *testing.T) {
    // Simulate scenario: Buffer indexes are now initialized with process start time
    // This tests that buffer start indexes are globally unique across server restarts

    // Before server restart: Process 1 buffer start (3 chunks)
    beforeRestartStart := LogBufferStart{
        StartIndex: 1609459100000000000, // Process 1 start time
    }

    // After server restart: Process 2 buffer start (3 chunks)
    afterRestartStart := LogBufferStart{
        StartIndex: 1609459300000000000, // Process 2 start time (DIFFERENT)
    }

    // Simulate 3 chunks for each file
    chunkCount := int64(3)

    // Calculate end indexes for range comparison
    beforeEnd := beforeRestartStart.StartIndex + chunkCount - 1 // [start, start+2]
    afterStart := afterRestartStart.StartIndex                  // [start, start+2]

    // Test range overlap detection (should NOT overlap)
    overlaps := beforeRestartStart.StartIndex <= (afterStart+chunkCount-1) && beforeEnd >= afterStart
    assert.False(t, overlaps, "Buffer ranges after restart should not overlap")

    // Verify the start indexes are globally unique
    assert.NotEqual(t, beforeRestartStart.StartIndex, afterRestartStart.StartIndex, "Start indexes should be different")
    assert.Less(t, beforeEnd, afterStart, "Ranges should be completely separate")

    // Expected values:
    // Before restart: [1609459100000000000, 1609459100000000002]
    // After restart:  [1609459300000000000, 1609459300000000002]
    expectedBeforeEnd := int64(1609459100000000002)
    expectedAfterStart := int64(1609459300000000000)

    assert.Equal(t, expectedBeforeEnd, beforeEnd)
    assert.Equal(t, expectedAfterStart, afterStart)

    // This demonstrates that buffer start indexes initialized with process start time
    // prevent false positive duplicates across server restarts
}
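
// Editor's sketch making the inline interval check above explicit as a
// hypothetical helper: two inclusive index ranges [startA, endA] and
// [startB, endB] overlap iff each starts no later than the other ends.
func rangesOverlapSketch(startA, endA, startB, endB int64) bool {
    return startA <= endB && endA >= startB
}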
"'hello'", + desc: "Simple string should be wrapped in single quotes", + }, + { + name: "string with single quote", + sqlVal: &SQLVal{ + Type: StrVal, + Val: []byte("don't"), + }, + expected: "'don''t'", + desc: "String with single quote should have the quote escaped by doubling it", + }, + { + name: "string with multiple single quotes", + sqlVal: &SQLVal{ + Type: StrVal, + Val: []byte("'malicious'; DROP TABLE users; --"), + }, + expected: "'''malicious''; DROP TABLE users; --'", + desc: "String with SQL injection attempt should have all single quotes properly escaped", + }, + { + name: "empty string", + sqlVal: &SQLVal{ + Type: StrVal, + Val: []byte(""), + }, + expected: "''", + desc: "Empty string should result in empty quoted string", + }, + { + name: "integer value", + sqlVal: &SQLVal{ + Type: IntVal, + Val: []byte("123"), + }, + expected: "123", + desc: "Integer value should not be quoted", + }, + { + name: "float value", + sqlVal: &SQLVal{ + Type: FloatVal, + Val: []byte("123.45"), + }, + expected: "123.45", + desc: "Float value should not be quoted", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := engine.getSQLValAlias(tt.sqlVal) + assert.Equal(t, tt.expected, result, tt.desc) + }) + } +} |
