datazip-inc
diff --git a/‎constants/constants.go‎
Lines changed: 0 additions & 1 deletion b/‎constants/constants.go‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎constants/state_version.go‎
Lines changed: 7 additions & 3 deletions b/‎constants/state_version.go‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎destination/iceberg/arrow-writer/utils.go‎
Lines changed: 11 additions & 20 deletions b/‎destination/iceberg/arrow-writer/utils.go‎
Lines changed: 11 additions & 20 deletions
diff --git a/‎destination/iceberg/arrow-writer/writer.go‎
Lines changed: 12 additions & 12 deletions b/‎destination/iceberg/arrow-writer/writer.go‎
Lines changed: 12 additions & 12 deletions
diff --git a/‎destination/iceberg/iceberg.go‎
Lines changed: 8 additions & 28 deletions b/‎destination/iceberg/iceberg.go‎
Lines changed: 8 additions & 28 deletions
@@ -20,7 +20,6 @@ const (
 	OlakeTimestamp         = "_olake_timestamp"
 	OpType                 = "_op_type"
 	CdcTimestamp           = "_cdc_timestamp"
-	DBName                 = "_db"
 	StringifiedData        = "data"
 	DefaultReadPreference  = "secondaryPreferred"
 	EncryptionKey          = "OLAKE_ENCRYPTION_KEY"
 
@@ -13,14 +13,18 @@ package constants
 //     * When a string cannot be parsed as a timestamp, it returns epoch time (1970-01-01)
 //     * Used for state files created before version 1 was introduced
 //
-//   - Version 1: Current format (introduced stricter validation)
+//   - Version 1: Introduced stricter validation
 //     * Stricter date/timestamp parsing validation
 //     * When a string cannot be parsed as a timestamp, it will be returned as string. Earlier it was returning epoch time (1970-01-01)
 //     * This prevents data corruption by failing fast on invalid date strings
+//
+//   - Version 2: Current version (introduces consistent timezone handling between MySQL Full Refresh and CDC)
+//     * Binlog CDC now uses TimestampStringLocation to align with the connection's timezone configuration.
+//     * This prevents discrepancies where CDC timestamps could differ from Full Refresh data.
 
 const (
-	LatestStateVersion = 1
+	LatestStateVersion = 2
 )
 
 // Used as the current version of the state when the program is running
-var LoadedStateVersion = 1
+var LoadedStateVersion = LatestStateVersion
@@ -7,7 +7,6 @@ import (
 	"io"
 	"sort"
 	"strconv"
-	"time"
 
 	"github.com/apache/arrow-go/v18/arrow"
 	"github.com/apache/arrow-go/v18/arrow/array"
@@ -246,30 +245,22 @@ func createPositionalDeleteArrowRecord(posDeletes []PositionalDelete, allocator
 	return recordBuilder.NewRecord()
 }
 
-func createArrowRecord(records []types.RawRecord, allocator memory.Allocator, schema *arrow.Schema, normalization bool, olakeTimestamp time.Time) (arrow.Record, error) {
+func createArrowRecord(records []types.RawRecord, allocator memory.Allocator, schema *arrow.Schema, normalization bool) (arrow.Record, error) {
 	recordBuilder := array.NewRecordBuilder(allocator, schema)
 	defer recordBuilder.Release()
-
 	for _, record := range records {
 		for idx, field := range schema.Fields() {
 			var val any
-			switch field.Name {
-			case constants.OlakeID:
-				val = record.OlakeID
-			case constants.OlakeTimestamp:
-				val = olakeTimestamp
-			case constants.OpType:
-				val = record.OperationType
-			case constants.CdcTimestamp:
-				if record.CdcTimestamp != nil {
-					val = record.CdcTimestamp
-				}
-			default:
-				if normalization {
-					val = record.Data[field.Name]
-				} else {
-					val = record.Data
-				}
+
+			// Check OlakeColumns first (CDC columns, _olake_id, _olake_timestamp, etc.)
+			if olakeVal, exists := record.OlakeColumns[field.Name]; exists {
+				val = olakeVal
+			} else if normalization {
+				// For normalized tables, read the field's value from record.Data
+				val = record.Data[field.Name]
+			} else if field.Name == constants.StringifiedData {
+				// For non-normalized tables, the "data" column contains the entire record.Data as JSON
+				val = record.Data
 			}
 
 			if val == nil {
 
@@ -150,9 +150,9 @@ func (w *ArrowWriter) getOrCreateWriter(ctx context.Context, pKey string, values
 }
 
 // extract partitions records and tracks deletes for upsert mode.
-func (w *ArrowWriter) extract(ctx context.Context, records []types.RawRecord, olakeTimestamp time.Time) error {
+func (w *ArrowWriter) extract(ctx context.Context, records []types.RawRecord) error {
 	for _, rec := range records {
-		pKey, values, err := w.getRecordPartition(rec, olakeTimestamp)
+		pKey, values, err := w.getRecordPartition(rec, rec.OlakeColumns[constants.OlakeTimestamp].(time.Time))
 		if err != nil {
 			return err
 		}
@@ -163,27 +163,28 @@ func (w *ArrowWriter) extract(ctx context.Context, records []types.RawRecord, ol
 		}
 
 		writer.data = append(writer.data, rec)
-
+		recordOpType := rec.OlakeColumns[constants.OpType].(string)
+		recordOlakeID := rec.OlakeColumns[constants.OlakeID].(string)
 		// Track deletes for upsert operations (d, u, c all need delete handling)
-		if w.upsertMode && (rec.OperationType == "d" || rec.OperationType == "u" || rec.OperationType == "c") {
+		if w.upsertMode && (recordOpType == "d" || recordOpType == "u" || recordOpType == "c") {
 			filePosition := writer.dataWriter.currentRowCount + int64(len(writer.data)-1)
 
-			if _, exists := writer.olakeIDPosition[rec.OlakeID]; !exists {
+			if _, exists := writer.olakeIDPosition[recordOlakeID]; !exists {
 				// first time, add to equality deletes and track position
-				writer.equalityDeletes = append(writer.equalityDeletes, rec.OlakeID)
-				writer.olakeIDPosition[rec.OlakeID] = PositionalDelete{
+				writer.equalityDeletes = append(writer.equalityDeletes, recordOlakeID)
+				writer.olakeIDPosition[recordOlakeID] = PositionalDelete{
 					FilePath: writer.dataWriter.filePath,
 					Position: filePosition,
 				}
 			} else {
 				// duplicates, add prev position to positional deletes (n-1 logic)
 				// the latest (nth) occurrence is kept in the map but not added to deletes
-				prev := writer.olakeIDPosition[rec.OlakeID]
+				prev := writer.olakeIDPosition[recordOlakeID]
 				writer.positionalDeletes = append(writer.positionalDeletes, PositionalDelete{
 					FilePath: prev.FilePath,
 					Position: prev.Position,
 				})
-				writer.olakeIDPosition[rec.OlakeID] = PositionalDelete{
+				writer.olakeIDPosition[recordOlakeID] = PositionalDelete{
 					FilePath: writer.dataWriter.filePath,
 					Position: filePosition,
 				}
@@ -195,10 +196,9 @@ func (w *ArrowWriter) extract(ctx context.Context, records []types.RawRecord, ol
 }
 
 func (w *ArrowWriter) Write(ctx context.Context, records []types.RawRecord) error {
-	olakeTimestamp := time.Now().UTC() // for olake timestamp, set current timestamp
 	var err error
 
-	if err := w.extract(ctx, records, olakeTimestamp); err != nil {
+	if err := w.extract(ctx, records); err != nil {
 		return fmt.Errorf("failed to partition data: %s", err)
 	}
 
@@ -237,7 +237,7 @@ func (w *ArrowWriter) Write(ctx context.Context, records []types.RawRecord) erro
 			}
 		}
 
-		record, err := createArrowRecord(writer.data, w.allocator, w.arrowSchema[fileTypeData], w.stream.NormalizationEnabled(), olakeTimestamp)
+		record, err := createArrowRecord(writer.data, w.allocator, w.arrowSchema[fileTypeData], w.stream.NormalizationEnabled())
 		if err != nil {
 			return fmt.Errorf("failed to create arrow record: %s", err)
 		}
 
@@ -3,6 +3,7 @@ package iceberg
 import (
 	"context"
 	"fmt"
+	"maps"
 	"regexp"
 	"runtime"
 	"strings"
@@ -101,7 +102,7 @@ func (i *Iceberg) Setup(ctx context.Context, stream types.StreamInterface, globa
 		logger.Infof("Creating destination table [%s] in Iceberg database [%s] for stream [%s]", i.stream.GetDestinationTable(), i.stream.GetDestinationDatabase(&i.config.IcebergDatabase), i.stream.Name())
 
 		var requestPayload proto.IcebergPayload
-		iceSchema := utils.Ternary(stream.NormalizationEnabled(), stream.Schema().ToIceberg(), icebergRawSchema()).([]*proto.IcebergPayload_SchemaField)
+		iceSchema := stream.Schema().ToIceberg(!stream.NormalizationEnabled())
 		requestPayload = proto.IcebergPayload{
 			Type: proto.IcebergPayload_GET_OR_CREATE_TABLE,
 			Metadata: &proto.IcebergPayload_Metadata{
@@ -204,7 +205,7 @@ func (i *Iceberg) Check(ctx context.Context) error {
 		Metadata: &proto.IcebergPayload_Metadata{
 			ThreadId:      server.serverID,
 			DestTableName: destinationDB,
-			Schema:        icebergRawSchema(),
+			Schema:        types.GetIcebergRawSchema(),
 		},
 	}
 
@@ -218,8 +219,8 @@ func (i *Iceberg) Check(ctx context.Context) error {
 
 	// try writing record in dest table
 	currentTime := time.Now().UTC()
-	protoSchema := icebergRawSchema()
-	record := types.CreateRawRecord(destinationDB, map[string]any{"name": "olake"}, "r", &currentTime)
+	protoSchema := types.GetIcebergRawSchema()
+	record := types.CreateRawRecord(map[string]any{"name": "olake"}, map[string]any{constants.OlakeID: "olake", constants.OpType: "r", constants.CdcTimestamp: &currentTime})
 	protoColumns, err := legacywriter.RawDataColumnBuffer(record, protoSchema)
 	if err != nil {
 		return fmt.Errorf("failed to create raw data column buffer: %s", err)
@@ -307,20 +308,12 @@ func (i *Iceberg) FlattenAndCleanData(ctx context.Context, records []types.RawRe
 		// parallel flatten data and detect schema difference
 		diffThreadSchema := atomic.Bool{}
 		err := utils.Concurrent(ctx, records, runtime.GOMAXPROCS(0)*16, func(_ context.Context, record types.RawRecord, idx int) error {
-			// set pre configured fields
-			records[idx].Data[constants.OlakeID] = record.OlakeID
-			records[idx].Data[constants.OlakeTimestamp] = time.Now().UTC()
-			records[idx].Data[constants.OpType] = record.OperationType
-			if record.CdcTimestamp != nil {
-				records[idx].Data[constants.CdcTimestamp] = *record.CdcTimestamp
-			}
-
-			flattenedRecord, err := typeutils.NewFlattener().Flatten(record.Data)
+			flattenRecord, err := typeutils.NewFlattener().Flatten(record.Data)
 			if err != nil {
 				return fmt.Errorf("failed to flatten record, iceberg writer: %s", err)
 			}
-			records[idx].Data = flattenedRecord
-
+			records[idx].Data = flattenRecord
+			maps.Copy(records[idx].Data, record.OlakeColumns)
 			// if schema difference is not detected, detect schema difference
 			if !diffThreadSchema.Load() {
 				// when detectChange is true, the function does not modify schema parameter
@@ -368,7 +361,6 @@ func (i *Iceberg) EvolveSchema(ctx context.Context, globalSchema, recordsRawSche
 	if !i.stream.NormalizationEnabled() {
 		return i.schema, nil
 	}
-
 	// cases as local thread schema has detected changes w.r.t. batch records schema
 	//  	i.  iceberg table already have changes (i.e. no difference with global schema), in this case
 	//		    only refresh table in iceberg for this thread.
@@ -582,18 +574,6 @@ func parseSchema(schemaStr string) (map[string]string, error) {
 	return fields, nil
 }
 
-// returns raw schema in iceberg format
-func icebergRawSchema() []*proto.IcebergPayload_SchemaField {
-	var icebergFields []*proto.IcebergPayload_SchemaField
-	for key, typ := range types.RawSchema {
-		icebergFields = append(icebergFields, &proto.IcebergPayload_SchemaField{
-			IceType: typ.ToIceberg(),
-			Key:     key,
-		})
-	}
-	return icebergFields
-}
-
 func getCommonAncestorType(d1, d2 string) string {
 	// check for cases:
 	// d1: string d2: int  -> return string