diff options
Diffstat (limited to 'weed')
 weed/mq/logstore/read_parquet_to_log.go | 18 +++++++++---------
 weed/mq/schema/write_parquet_test.go    | 41 ++++++++++++++++++++++++-------
 2 files changed, 41 insertions(+), 18 deletions(-)
diff --git a/weed/mq/logstore/read_parquet_to_log.go b/weed/mq/logstore/read_parquet_to_log.go index 3438af61a..1c53129f4 100644 --- a/weed/mq/logstore/read_parquet_to_log.go +++ b/weed/mq/logstore/read_parquet_to_log.go @@ -4,6 +4,10 @@ import ( "context" "encoding/binary" "fmt" + "io" + "math" + "strings" + "github.com/parquet-go/parquet-go" "github.com/seaweedfs/seaweedfs/weed/filer" "github.com/seaweedfs/seaweedfs/weed/mq/schema" @@ -13,9 +17,6 @@ import ( "github.com/seaweedfs/seaweedfs/weed/util/chunk_cache" "github.com/seaweedfs/seaweedfs/weed/util/log_buffer" "google.golang.org/protobuf/proto" - "io" - "math" - "strings" ) var ( @@ -42,10 +43,6 @@ func GenParquetReadFunc(filerClient filer_pb.FilerClient, t topic.Topic, p topic WithField(SW_COLUMN_NAME_KEY, schema.TypeBytes). RecordTypeEnd() - parquetSchema, err := schema.ToParquetSchema(t.Name, recordType) - if err != nil { - return nil - } parquetLevels, err := schema.ToParquetLevels(recordType) if err != nil { return nil @@ -61,11 +58,12 @@ func GenParquetReadFunc(filerClient filer_pb.FilerClient, t topic.Topic, p topic readerAt := filer.NewChunkReaderAtFromClient(readerCache, chunkViews, int64(fileSize)) // create parquet reader - parquetReader := parquet.NewReader(readerAt, parquetSchema) + parquetReader := parquet.NewReader(readerAt) rows := make([]parquet.Row, 128) for { rowCount, readErr := parquetReader.ReadRows(rows) + // Process the rows first, even if EOF is returned for i := 0; i < rowCount; i++ { row := rows[i] // convert parquet row to schema_pb.RecordValue @@ -99,12 +97,16 @@ func GenParquetReadFunc(filerClient filer_pb.FilerClient, t topic.Topic, p topic } } + // Check for end conditions after processing rows if readErr != nil { if readErr == io.EOF { return processedTsNs, nil } return processedTsNs, readErr } + if rowCount == 0 { + return processedTsNs, nil + } } return } diff --git a/weed/mq/schema/write_parquet_test.go b/weed/mq/schema/write_parquet_test.go index f7ab26860..b7ecdcfc7 
100644 --- a/weed/mq/schema/write_parquet_test.go +++ b/weed/mq/schema/write_parquet_test.go @@ -2,12 +2,13 @@ package schema import ( "fmt" - "github.com/parquet-go/parquet-go" - "github.com/parquet-go/parquet-go/compress/zstd" - "github.com/seaweedfs/seaweedfs/weed/pb/schema_pb" "io" "os" "testing" + + "github.com/parquet-go/parquet-go" + "github.com/parquet-go/parquet-go/compress/zstd" + "github.com/seaweedfs/seaweedfs/weed/pb/schema_pb" ) func TestWriteReadParquet(t *testing.T) { @@ -125,16 +126,25 @@ func testReadingParquetFile(t *testing.T, filename string, parquetSchema *parque t.Fatalf("os.Open failed: %v", err) } defer file.Close() - reader := parquet.NewReader(file, parquetSchema) + + // Get file info to determine size + fileInfo, err := file.Stat() + if err != nil { + t.Fatalf("file.Stat failed: %v", err) + } + + // Create a parquet file from the opened file + parquetFile, err := parquet.OpenFile(file, fileInfo.Size()) + if err != nil { + t.Fatalf("parquet.OpenFile failed: %v", err) + } + + reader := parquet.NewReader(parquetFile) rows := make([]parquet.Row, 128) for { rowCount, err := reader.ReadRows(rows) - if err != nil { - if err == io.EOF { - break - } - t.Fatalf("reader.Read failed: %v", err) - } + + // Process the rows first, even if EOF is returned for i := 0; i < rowCount; i++ { row := rows[i] // convert parquet row to schema_pb.RecordValue @@ -147,6 +157,17 @@ func testReadingParquetFile(t *testing.T, filename string, parquetSchema *parque } } total += rowCount + + // Check for end conditions after processing rows + if err != nil { + if err == io.EOF { + break + } + t.Fatalf("reader.Read failed: %v", err) + } + if rowCount == 0 { + break + } } fmt.Printf("total: %v\n", total) return |
