1 files changed, 259 insertions, 0 deletions
diff --git a/weed/mq/kafka/schema/envelope.go b/weed/mq/kafka/schema/envelope.go
new file mode 100644
index 000000000..b20d44006
--- /dev/null
+++ b/weed/mq/kafka/schema/envelope.go
@@ -0,0 +1,259 @@
+package schema
+
+import (
+	"encoding/binary"
+	"fmt"
+
+	"github.com/seaweedfs/seaweedfs/weed/glog"
+)
+
+// Format represents the schema format type
+type Format int
+
+const (
+	FormatUnknown Format = iota
+	FormatAvro
+	FormatProtobuf
+	FormatJSONSchema
+)
+
+func (f Format) String() string {
+	switch f {
+	case FormatAvro:
+		return "AVRO"
+	case FormatProtobuf:
+		return "PROTOBUF"
+	case FormatJSONSchema:
+		return "JSON_SCHEMA"
+	default:
+		return "UNKNOWN"
+	}
+}
+
+// ConfluentEnvelope represents the parsed Confluent Schema Registry envelope
+type ConfluentEnvelope struct {
+	Format        Format
+	SchemaID      uint32
+	Indexes       []int  // For Protobuf nested message resolution
+	Payload       []byte // The actual encoded data
+	OriginalBytes []byte // The complete original envelope bytes
+}
+
+// ParseConfluentEnvelope parses a Confluent Schema Registry framed message
+// Returns the envelope details and whether the message was successfully parsed
+func ParseConfluentEnvelope(data []byte) (*ConfluentEnvelope, bool) {
+	if len(data) < 5 {
+		return nil, false // Too short to contain magic byte + schema ID
+	}
+
+	// Check for Confluent magic byte (0x00)
+	if data[0] != 0x00 {
+		return nil, false // Not a Confluent-framed message
+	}
+
+	// Extract schema ID (big-endian uint32)
+	schemaID := binary.BigEndian.Uint32(data[1:5])
+
+	envelope := &ConfluentEnvelope{
+		Format:        FormatAvro, // Default assumption; will be refined by schema registry lookup
+		SchemaID:      schemaID,
+		Indexes:       nil,
+		Payload:       data[5:], // Default: payload starts after schema ID
+		OriginalBytes: data,     // Store the complete original envelope
+	}
+
+	// Note: Format detection should be done by the schema registry lookup
+	// For now, we'll default to Avro and let the manager determine the actual format
+	// based on the schema registry information
+
+	return envelope, true
+}
+
+// ParseConfluentProtobufEnvelope parses a Confluent Protobuf envelope with indexes
+// This is a specialized version for Protobuf that handles message indexes
+//
+// Note: This function uses heuristics to distinguish between index varints and
+// payload data, which may not be 100% reliable in all cases. For production use,
+// consider using ParseConfluentProtobufEnvelopeWithIndexCount if you know the
+// expected number of indexes.
+func ParseConfluentProtobufEnvelope(data []byte) (*ConfluentEnvelope, bool) {
+	// For now, assume no indexes to avoid parsing issues
+	// This can be enhanced later when we have better schema information
+	return ParseConfluentProtobufEnvelopeWithIndexCount(data, 0)
+}
+
+// ParseConfluentProtobufEnvelopeWithIndexCount parses a Confluent Protobuf envelope
+// when you know the expected number of indexes
+func ParseConfluentProtobufEnvelopeWithIndexCount(data []byte, expectedIndexCount int) (*ConfluentEnvelope, bool) {
+	if len(data) < 5 {
+		return nil, false
+	}
+
+	// Check for Confluent magic byte
+	if data[0] != 0x00 {
+		return nil, false
+	}
+
+	// Extract schema ID (big-endian uint32)
+	schemaID := binary.BigEndian.Uint32(data[1:5])
+
+	envelope := &ConfluentEnvelope{
+		Format:        FormatProtobuf,
+		SchemaID:      schemaID,
+		Indexes:       nil,
+		Payload:       data[5:], // Default: payload starts after schema ID
+		OriginalBytes: data,
+	}
+
+	// Parse the expected number of indexes
+	offset := 5
+	for i := 0; i < expectedIndexCount && offset < len(data); i++ {
+		index, bytesRead := readVarint(data[offset:])
+		if bytesRead == 0 {
+			// Invalid varint, stop parsing
+			break
+		}
+		envelope.Indexes = append(envelope.Indexes, int(index))
+		offset += bytesRead
+	}
+
+	envelope.Payload = data[offset:]
+	return envelope, true
+}
+
+// IsSchematized checks if the given bytes represent a Confluent-framed message
+func IsSchematized(data []byte) bool {
+	_, ok := ParseConfluentEnvelope(data)
+	return ok
+}
+
+// ExtractSchemaID extracts just the schema ID without full parsing (for quick checks)
+func ExtractSchemaID(data []byte) (uint32, bool) {
+	if len(data) < 5 || data[0] != 0x00 {
+		return 0, false
+	}
+	return binary.BigEndian.Uint32(data[1:5]), true
+}
+
+// CreateConfluentEnvelope creates a Confluent-framed message from components
+// This will be useful for reconstructing messages on the Fetch path
+func CreateConfluentEnvelope(format Format, schemaID uint32, indexes []int, payload []byte) []byte {
+	// Start with magic byte + schema ID (5 bytes minimum)
+	// Validate sizes to prevent overflow
+	const maxSize = 1 << 30 // 1 GB limit
+	indexSize := len(indexes) * 4
+	totalCapacity := 5 + len(payload) + indexSize
+	if len(payload) > maxSize || indexSize > maxSize || totalCapacity < 0 || totalCapacity > maxSize {
+		glog.Errorf("Envelope size too large: payload=%d, indexes=%d", len(payload), len(indexes))
+		return nil
+	}
+	result := make([]byte, 5, totalCapacity)
+	result[0] = 0x00 // Magic byte
+	binary.BigEndian.PutUint32(result[1:5], schemaID)
+
+	// For Protobuf, add indexes as varints
+	if format == FormatProtobuf && len(indexes) > 0 {
+		for _, index := range indexes {
+			varintBytes := encodeVarint(uint64(index))
+			result = append(result, varintBytes...)
+		}
+	}
+
+	// Append the actual payload
+	result = append(result, payload...)
+
+	return result
+}
+
+// ValidateEnvelope performs basic validation on a parsed envelope
+func (e *ConfluentEnvelope) Validate() error {
+	if e.SchemaID == 0 {
+		return fmt.Errorf("invalid schema ID: 0")
+	}
+
+	if len(e.Payload) == 0 {
+		return fmt.Errorf("empty payload")
+	}
+
+	// Format-specific validation
+	switch e.Format {
+	case FormatAvro:
+		// Avro payloads should be valid binary data
+		// More specific validation will be done by the Avro decoder
+	case FormatProtobuf:
+		// Protobuf validation will be implemented in Phase 5
+	case FormatJSONSchema:
+		// JSON Schema validation will be implemented in Phase 6
+	default:
+		return fmt.Errorf("unsupported format: %v", e.Format)
+	}
+
+	return nil
+}
+
+// Metadata returns a map of envelope metadata for storage
+func (e *ConfluentEnvelope) Metadata() map[string]string {
+	metadata := map[string]string{
+		"schema_format": e.Format.String(),
+		"schema_id":     fmt.Sprintf("%d", e.SchemaID),
+	}
+
+	if len(e.Indexes) > 0 {
+		// Store indexes for Protobuf reconstruction
+		indexStr := ""
+		for i, idx := range e.Indexes {
+			if i > 0 {
+				indexStr += ","
+			}
+			indexStr += fmt.Sprintf("%d", idx)
+		}
+		metadata["protobuf_indexes"] = indexStr
+	}
+
+	return metadata
+}
+
+// encodeVarint encodes a uint64 as a varint
+func encodeVarint(value uint64) []byte {
+	if value == 0 {
+		return []byte{0}
+	}
+
+	var result []byte
+	for value > 0 {
+		b := byte(value & 0x7F)
+		value >>= 7
+
+		if value > 0 {
+			b |= 0x80 // Set continuation bit
+		}
+
+		result = append(result, b)
+	}
+
+	return result
+}
+
+// readVarint reads a varint from the byte slice and returns the value and bytes consumed
+func readVarint(data []byte) (uint64, int) {
+	var result uint64
+	var shift uint
+
+	for i, b := range data {
+		if i >= 10 { // Prevent overflow (max varint is 10 bytes)
+			return 0, 0
+		}
+
+		result |= uint64(b&0x7F) << shift
+
+		if b&0x80 == 0 {
+			// Last byte (MSB is 0)
+			return result, i + 1
+		}
+
+		shift += 7
+	}
+
+	// Incomplete varint
+	return 0, 0
+}