diff options
Diffstat (limited to 'weed/query/engine/fast_path_fix_test.go')
| -rw-r--r-- | weed/query/engine/fast_path_fix_test.go | 193 |
1 file changed, 193 insertions, 0 deletions
diff --git a/weed/query/engine/fast_path_fix_test.go b/weed/query/engine/fast_path_fix_test.go new file mode 100644 index 000000000..3769e9215 --- /dev/null +++ b/weed/query/engine/fast_path_fix_test.go @@ -0,0 +1,193 @@ +package engine + +import ( + "context" + "testing" + + "github.com/seaweedfs/seaweedfs/weed/pb/schema_pb" + "github.com/stretchr/testify/assert" +) + +// TestFastPathCountFixRealistic tests the specific scenario mentioned in the bug report: +// Fast path returning 0 for COUNT(*) when slow path returns 1803 +func TestFastPathCountFixRealistic(t *testing.T) { + engine := NewMockSQLEngine() + + // Set up debug mode to see our new logging + ctx := context.WithValue(context.Background(), "debug", true) + + // Create realistic data sources that mimic a scenario with 1803 rows + dataSources := &TopicDataSources{ + ParquetFiles: map[string][]*ParquetFileStats{ + "/topics/test/large-topic/0000-1023": { + { + RowCount: 800, + ColumnStats: map[string]*ParquetColumnStats{ + "id": { + ColumnName: "id", + MinValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 1}}, + MaxValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 800}}, + NullCount: 0, + RowCount: 800, + }, + }, + }, + { + RowCount: 500, + ColumnStats: map[string]*ParquetColumnStats{ + "id": { + ColumnName: "id", + MinValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 801}}, + MaxValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 1300}}, + NullCount: 0, + RowCount: 500, + }, + }, + }, + }, + "/topics/test/large-topic/1024-2047": { + { + RowCount: 300, + ColumnStats: map[string]*ParquetColumnStats{ + "id": { + ColumnName: "id", + MinValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 1301}}, + MaxValue: &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: 1600}}, + NullCount: 0, + RowCount: 300, + }, + }, + }, + }, + }, + ParquetRowCount: 1600, // 800 + 500 + 300 + LiveLogRowCount: 203, // Additional live 
log data + PartitionsCount: 2, + LiveLogFilesCount: 15, + } + + partitions := []string{ + "/topics/test/large-topic/0000-1023", + "/topics/test/large-topic/1024-2047", + } + + t.Run("COUNT(*) should return correct total (1803)", func(t *testing.T) { + computer := NewAggregationComputer(engine.SQLEngine) + + aggregations := []AggregationSpec{ + {Function: FuncCOUNT, Column: "*", Alias: "COUNT(*)"}, + } + + results, err := computer.ComputeFastPathAggregations(ctx, aggregations, dataSources, partitions) + + assert.NoError(t, err, "Fast path aggregation should not error") + assert.Len(t, results, 1, "Should return one result") + + // This is the key test - before our fix, this was returning 0 + expectedCount := int64(1803) // 1600 (parquet) + 203 (live log) + actualCount := results[0].Count + + assert.Equal(t, expectedCount, actualCount, + "COUNT(*) should return %d (1600 parquet + 203 live log), but got %d", + expectedCount, actualCount) + }) + + t.Run("MIN/MAX should work with multiple partitions", func(t *testing.T) { + computer := NewAggregationComputer(engine.SQLEngine) + + aggregations := []AggregationSpec{ + {Function: FuncMIN, Column: "id", Alias: "MIN(id)"}, + {Function: FuncMAX, Column: "id", Alias: "MAX(id)"}, + } + + results, err := computer.ComputeFastPathAggregations(ctx, aggregations, dataSources, partitions) + + assert.NoError(t, err, "Fast path aggregation should not error") + assert.Len(t, results, 2, "Should return two results") + + // MIN should be the lowest across all parquet files + assert.Equal(t, int64(1), results[0].Min, "MIN should be 1") + + // MAX should be the highest across all parquet files + assert.Equal(t, int64(1600), results[1].Max, "MAX should be 1600") + }) +} + +// TestFastPathDataSourceDiscoveryLogging tests that our debug logging works correctly +func TestFastPathDataSourceDiscoveryLogging(t *testing.T) { + // This test verifies that our enhanced data source collection structure is correct + + t.Run("DataSources structure 
validation", func(t *testing.T) { + // Test the TopicDataSources structure initialization + dataSources := &TopicDataSources{ + ParquetFiles: make(map[string][]*ParquetFileStats), + ParquetRowCount: 0, + LiveLogRowCount: 0, + LiveLogFilesCount: 0, + PartitionsCount: 0, + } + + assert.NotNil(t, dataSources, "Data sources should not be nil") + assert.NotNil(t, dataSources.ParquetFiles, "ParquetFiles map should be initialized") + assert.GreaterOrEqual(t, dataSources.PartitionsCount, 0, "PartitionsCount should be non-negative") + assert.GreaterOrEqual(t, dataSources.ParquetRowCount, int64(0), "ParquetRowCount should be non-negative") + assert.GreaterOrEqual(t, dataSources.LiveLogRowCount, int64(0), "LiveLogRowCount should be non-negative") + }) +} + +// TestFastPathValidationLogic tests the enhanced validation we added +func TestFastPathValidationLogic(t *testing.T) { + t.Run("Validation catches data source vs computation mismatch", func(t *testing.T) { + // Create a scenario where data sources and computation might be inconsistent + dataSources := &TopicDataSources{ + ParquetFiles: make(map[string][]*ParquetFileStats), + ParquetRowCount: 1000, // Data sources say 1000 rows + LiveLogRowCount: 0, + PartitionsCount: 1, + } + + // But aggregation result says different count (simulating the original bug) + aggResults := []AggregationResult{ + {Count: 0}, // Bug: returns 0 when data sources show 1000 + } + + // This simulates the validation logic from tryFastParquetAggregation + totalRows := dataSources.ParquetRowCount + dataSources.LiveLogRowCount + countResult := aggResults[0].Count + + // Our validation should catch this mismatch + assert.NotEqual(t, totalRows, countResult, + "This test simulates the bug: data sources show %d but COUNT returns %d", + totalRows, countResult) + + // In the real code, this would trigger a fallback to slow path + validationPassed := (countResult == totalRows) + assert.False(t, validationPassed, "Validation should fail for inconsistent data") 
+ }) + + t.Run("Validation passes for consistent data", func(t *testing.T) { + // Create a scenario where everything is consistent + dataSources := &TopicDataSources{ + ParquetFiles: make(map[string][]*ParquetFileStats), + ParquetRowCount: 1000, + LiveLogRowCount: 803, + PartitionsCount: 1, + } + + // Aggregation result matches data sources + aggResults := []AggregationResult{ + {Count: 1803}, // Correct: matches 1000 + 803 + } + + totalRows := dataSources.ParquetRowCount + dataSources.LiveLogRowCount + countResult := aggResults[0].Count + + // Our validation should pass this + assert.Equal(t, totalRows, countResult, + "Validation should pass when data sources (%d) match COUNT result (%d)", + totalRows, countResult) + + validationPassed := (countResult == totalRows) + assert.True(t, validationPassed, "Validation should pass for consistent data") + }) +} |
