datazip-inc
diff --git a/‎.github/workflows/release-approval.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/release-approval.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎CONTRIBUTING.md‎
Lines changed: 12 additions & 9 deletions b/‎CONTRIBUTING.md‎
Lines changed: 12 additions & 9 deletions
diff --git a/‎constants/constants.go‎
Lines changed: 1 addition & 0 deletions b/‎constants/constants.go‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎constants/state_version.go‎
Lines changed: 26 additions & 0 deletions b/‎constants/state_version.go‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎destination/iceberg/arrow-writer/utils.go‎
Lines changed: 1 addition & 1 deletion b/‎destination/iceberg/arrow-writer/utils.go‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎destination/iceberg/legacy-writer/writer.go‎
Lines changed: 1 addition & 1 deletion b/‎destination/iceberg/legacy-writer/writer.go‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎drivers/kafka/internal/cdc.go‎
Lines changed: 10 additions & 2 deletions b/‎drivers/kafka/internal/cdc.go‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎drivers/mongodb/internal/backfill.go‎
Lines changed: 1 addition & 1 deletion b/‎drivers/mongodb/internal/backfill.go‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎drivers/mysql/internal/datatype_conversion.go‎
Lines changed: 3 additions & 3 deletions b/‎drivers/mysql/internal/datatype_conversion.go‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎drivers/s3/README.md‎
Lines changed: 136 additions & 0 deletions b/‎drivers/s3/README.md‎
Lines changed: 136 additions & 0 deletions
@@ -19,7 +19,7 @@ jobs:
 
     strategy:
       matrix:
-        driver: [mongodb, mysql, postgres, oracle, kafka] # Add new drivers here as they become available
+        driver: [mongodb, mysql, postgres, oracle, kafka, s3] # Add new drivers here as they become available
 
     uses: ./.github/workflows/build-and-release-driver.yml
     with:
 
@@ -9,21 +9,24 @@ To ensure consistency, we follow a structured contribution process. All guidelin
 
 ---
 
-## 🎃 Hacktoberfest 2025 @ OLake
+## ❄️ Social Winter of Code (SWoC 2026) @ OLake
 
-OLake is officially open for **Hacktoberfest contributions**! 🚀  
+OLake is officially open for **Social Winter of Code 2026 contributions**! 🚀
 
-If you’re participating in Hacktoberfest, look out for any issues labeled:
-- **`hacktoberfest`**
-- **`good first issue`**
+If you’re participating in SWoC 2026, look out for issues labeled:
 
-These are designed to help new contributors get started quickly.  
-We welcome everything — bug fixes, documentation updates, tests, or feature enhancements.  
+* **`SWoC26`**
+* **`beginner`**
+* **`intermediate`**
+* **`advanced`**
 
-👉 [Check our open issues here](../../issues)
+These labels help you choose issues based on your experience level.
+We welcome all kinds of contributions — bug fixes, documentation improvements, tests, and feature enhancements.
 
-Let’s hack, learn, and grow together this Hacktoberfest. Happy contributing & happy engineering! ⚡
+👉 [Check our open issues here](../../issues)
 
+---
+
+Let’s hack, learn, and grow together this Social Winter of Code. Happy contributing & happy engineering! ⚡
+
 ---
 
 ## Getting Help
 
@@ -41,6 +41,7 @@ const (
 	Postgres DriverType = "postgres"
 	MySQL    DriverType = "mysql"
 	Oracle   DriverType = "oracle"
+	S3       DriverType = "s3"
 	Kafka    DriverType = "kafka"
 )
 
 
@@ -0,0 +1,26 @@
+package constants
+
+// State version constants for backward compatibility
+// State files can have different versions to support migration and backward compatibility
+// when the state file format or behavior changes.
+
+// LatestStateVersion is the current version of the state file format.
+// This version is used when creating new state files.
+//
+// Version History:
+//   - Version 0: Legacy format (backward compatibility)
+//     * More lenient date/timestamp parsing behavior
+//     * When a string cannot be parsed as a timestamp, it returns epoch time (1970-01-01)
+//     * Used for state files created before version 1 was introduced
+//
+//   - Version 1: Current format (introduced stricter validation)
+//     * Stricter date/timestamp parsing validation
+//     * When a string cannot be parsed as a timestamp, it is returned as a string; earlier versions returned epoch time (1970-01-01)
+//     * This prevents data corruption by failing fast on invalid date strings
+
+const (
+	LatestStateVersion = 1 // state file format version used when creating new state files
+)
+
+// LoadedStateVersion is used as the current version of the state while the program is running.
+var LoadedStateVersion = 1
@@ -216,7 +216,7 @@ func appendValueToBuilder(builder array.Builder, val interface{}) error {
 			return err
 		}
 	case *array.TimestampBuilder:
-		if timeVal, err := typeutils.ReformatDate(val); err == nil {
+		if timeVal, err := typeutils.ReformatDate(val, true); err == nil {
 			ts := arrow.Timestamp(timeVal.UnixMicro())
 			builder.Append(ts)
 		} else {
 
@@ -92,7 +92,7 @@ func (w *LegacyWriter) Write(ctx context.Context, records []types.RawRecord) err
 					}
 					protoColumnsValue = append(protoColumnsValue, &proto.IcebergPayload_IceRecord_FieldValue{Value: &proto.IcebergPayload_IceRecord_FieldValue_DoubleValue{DoubleValue: doubleValue}})
 				case "timestamptz":
-					timeValue, err := typeutils.ReformatDate(val)
+					timeValue, err := typeutils.ReformatDate(val, true)
 					if err != nil {
 						return fmt.Errorf("failed to reformat rawValue[%v] of type[%T] as time value: %s", val, val, err)
 					}
 
@@ -1,6 +1,7 @@
 package driver
 
 import (
+	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
@@ -181,14 +182,21 @@ func (k *Kafka) processKafkaMessages(ctx context.Context, reader *kafka.Reader,
 
 		var data map[string]interface{}
 		if message.Value != nil {
-			if err := json.Unmarshal(message.Value, &data); err != nil {
+			// decode message value
+			decoder := json.NewDecoder(bytes.NewReader(message.Value))
+			// to avoid automatic conversion of numbers to float64
+			decoder.UseNumber()
+			if err := decoder.Decode(&data); err != nil {
 				logger.Warnf("failed to unmarshal message value: %s", err)
 				continue
 			}
 			data[Partition] = message.Partition
 			data[Offset] = message.Offset
 			data[Key] = string(message.Key)
-			data[KafkaTimestamp], _ = typeutils.ReformatDate(message.Time)
+			data[KafkaTimestamp], err = typeutils.ReformatDate(message.Time, true)
+			if err != nil {
+				return fmt.Errorf("failed to reformat date: %s", err)
+			}
 		}
 
 		stopProcessing, err := stopProcessFn(types.KafkaRecord{Data: data, Message: message})
 
@@ -415,7 +415,7 @@ func buildMongoCondition(cond types.Condition) bson.D {
 		if strings.ToLower(val) == "true" || strings.ToLower(val) == "false" {
 			return strings.ToLower(val) == "true"
 		}
-		if timeVal, err := typeutils.ReformatDate(val); err == nil {
+		if timeVal, err := typeutils.ReformatDate(val, false); err == nil {
 			return timeVal
 		}
 		if intVal, err := typeutils.ReformatInt64(val); err == nil {
 
@@ -18,9 +18,9 @@ var mysqlTypeToDataTypes = map[string]types.DataType{
 	"bigint":             types.Int64,
 
 	// Floating point types
-	"float":   types.Float32,
-	"real":    types.Float32,
-	"double":  types.Float64,
+	"float":  types.Float32,
+	"real":   types.Float32,
+	"double": types.Float64,
 
 	// Can handle up to 15 significant digits accurately (e.g., DECIMAL(15,2) or DECIMAL(15,7))
 	// Values with 16 digits may have minor rounding. Beyond 16 (from 17) digits will have precision loss.
 
@@ -0,0 +1,136 @@
+# OLake S3 Source Driver
+Production-ready S3 source connector for OLake that ingests data directly from AWS S3 or S3-compatible storage (MinIO, LocalStack, etc.).
+
+## Highlights
+- **Multi-format**: CSV (plain or `.gz`), JSON (JSONL/array/object), and Parquet with schema inference.
+- **Incremental sync**: Tracks `_last_modified_time` per stream and processes only newer files.
+- **Parallel processing**: Chunked downloads and configurable `max_threads` keep throughput high.
+- **Stateful**: Stream-level state keeps cursor information so you can resume syncs reliably.
+
+## Configuration
+### Required fields
+| Field | Type | Description |
+| --- | --- | --- |
+| `bucket_name` | string | Target S3 bucket name |
+| `region` | string | AWS region (e.g., `us-east-1`) |
+| `file_format` | string | `csv`, `json`, or `parquet` |
+| `path_prefix` | string | Prefix used to group files into streams |
+
+### Optional fields
+| Field | Type | Default | Description |
+| --- | --- | --- | --- |
+| `access_key_id` | string | — | Static AWS access key (pair with `secret_access_key`) |
+| `secret_access_key` | string | — | Static AWS secret key (pair with `access_key_id`) |
+| `endpoint` | string | AWS S3 | Override S3 endpoint (MinIO/LocalStack) |
+| `max_threads` | integer | 10 | Concurrent file download/parsing workers |
+| `retry_count` | integer | 3 | Retries for transient failures |
+| `compression` | string | — | Override auto-detected compression (`gzip` or `none`) |
+
+**Authentication note**: Omitting credentials lets the driver fall back to the AWS default credential chain (environment variables, IAM roles, instance profiles, etc.). If you provide one static credential, include the other as well.
+
+### CSV-specific tuning
+Use the nested `csv` block to customize parsing.
+| Field | Type | Default | Description |
+| --- | --- | --- | --- |
+| `delimiter` | string | `","` | Field delimiter |
+| `has_header` | boolean | `true` | Whether the first row is a header |
+| `skip_rows` | integer | 0 | Rows to skip before parsing |
+| `quote_character` | string | `"\""` | Quote character for fields |
+
+### JSON-specific tuning
+Use the `json` block to override parsing of JSON files.
+| Field | Type | Default | Description |
+| --- | --- | --- | --- |
+| `line_delimited` | boolean | `true` | When true, treat each line as a separate record; set to false for JSON arrays or single objects |
+
+## Commands
+Run the driver binaries through the repository root `build.sh` helper:
+```
+./build.sh driver-s3 discover --config /path/to/source.json
+./build.sh driver-s3 sync --config /path/to/source.json --catalog /path/to/catalog.json --destination /path/to/destination.json --state /path/to/state.json
+```
+- `discover` generates a `streams.json` catalog describing each folder and inferred columns.
+- `sync` processes files in ~2 GB chunks, injects `_last_modified_time` as the cursor, and pushes records to the destination.
+- The `state.json` file records `_last_modified_time` per stream so subsequent runs only process changed files.
+
+## Example configuration snippets
+Use these as a starting point; substitute your bucket, path, and authentication values.
+### JSON stream example
+```json
+{
+  "bucket_name": "your-bucket",
+  "region": "us-east-1",
+  "path_prefix": "data/json/",
+  "file_format": "json",
+  "json": { "line_delimited": true },
+  "compression": "gzip",
+  "max_threads": 5,
+  "retry_count": 3
+}
+```
+### CSV stream example
+```json
+{
+  "bucket_name": "your-bucket",
+  "region": "us-east-1",
+  "path_prefix": "data/csv/",
+  "file_format": "csv",
+  "csv": { "has_header": true, "delimiter": "," },
+  "max_threads": 5,
+  "retry_count": 3
+}
+```
+### Parquet stream example
+```json
+{
+  "bucket_name": "your-bucket",
+  "region": "us-east-1",
+  "path_prefix": "data/parquet/",
+  "file_format": "parquet",
+  "compression": "none",
+  "max_threads": 5,
+  "retry_count": 3
+}
+```
+
+## Catalog guidance (streams)
+- `selected_streams` selects which folders (streams) to sync; each entry maps to a stream name under your prefix (e.g., `users`).
+- Each `streams[]` entry includes the inferred schema, available sync modes (`full_refresh`, `incremental`), and the cursor field (`_last_modified_time`).
+- `_last_modified_time` is added to every stream so you can configure incremental syncs per folder.
+
+Example catalog structure:
+```json
+{
+  "selected_streams": {
+    "data": [
+      { "stream_name": "users", "partition_regex": "" },
+      { "stream_name": "orders", "partition_regex": "" }
+    ]
+  },
+  "streams": [
+    {
+      "stream": {
+        "name": "users",
+        "namespace": "data",
+        "supported_sync_modes": ["full_refresh", "incremental"],
+        "cursor_field": "_last_modified_time",
+        "sync_mode": "incremental"
+      }
+    }
+  ]
+}
+```
+
+## State guidance
+The `state.json` structure mirrors the catalog streams and records the latest `_last_modified_time` per stream.
+```json
+{
+  "type": "STREAM",
+  "streams": [
+    { "stream": "users", "state": { "_last_modified_time": "2025-01-01T00:00:00Z", "chunks": [] } }
+  ]
+}
+```
+Use this file when re-running syncs to resume from the last `_last_modified_time` per stream.
+
+Find more at [S3 Docs](https://olake.io/docs/category/s3)
Original file line number	Diff line number	Diff line change
`@@ -41,6 +41,7 @@ const (`
`41`	`41`	`Postgres DriverType = "postgres"`
`42`	`42`	`MySQL DriverType = "mysql"`
`43`	`43`	`Oracle DriverType = "oracle"`
	`44`	`+ S3 DriverType = "s3"`
`44`	`45`	`Kafka DriverType = "kafka"`
`45`	`46`	`)`
`46`	`47`
Original file line number	Diff line number	Diff line change
`@@ -216,7 +216,7 @@ func appendValueToBuilder(builder array.Builder, val interface{}) error {`
`216`	`216`	`return err`
`217`	`217`	`}`
`218`	`218`	`case *array.TimestampBuilder:`
`219`		`- if timeVal, err := typeutils.ReformatDate(val); err == nil {`
	`219`	`+ if timeVal, err := typeutils.ReformatDate(val, true); err == nil {`
`220`	`220`	`ts := arrow.Timestamp(timeVal.UnixMicro())`
`221`	`221`	`builder.Append(ts)`
`222`	`222`	`} else {`
Original file line number	Diff line number	Diff line change
`@@ -92,7 +92,7 @@ func (w *LegacyWriter) Write(ctx context.Context, records []types.RawRecord) err`
`92`	`92`	`}`
`93`	`93`	`protoColumnsValue = append(protoColumnsValue, &proto.IcebergPayload_IceRecord_FieldValue{Value: &proto.IcebergPayload_IceRecord_FieldValue_DoubleValue{DoubleValue: doubleValue}})`
`94`	`94`	`case "timestamptz":`
`95`		`- timeValue, err := typeutils.ReformatDate(val)`
	`95`	`+ timeValue, err := typeutils.ReformatDate(val, true)`
`96`	`96`	`if err != nil {`
`97`	`97`	`return fmt.Errorf("failed to reformat rawValue[%v] of type[%T] as time value: %s", val, val, err)`
`98`	`98`	`}`
Original file line number	Diff line number	Diff line change
`@@ -415,7 +415,7 @@ func buildMongoCondition(cond types.Condition) bson.D {`
`415`	`415`	`if strings.ToLower(val) == "true" \|\| strings.ToLower(val) == "false" {`
`416`	`416`	`return strings.ToLower(val) == "true"`
`417`	`417`	`}`
`418`		`- if timeVal, err := typeutils.ReformatDate(val); err == nil {`
	`418`	`+ if timeVal, err := typeutils.ReformatDate(val, false); err == nil {`
`419`	`419`	`return timeVal`
`420`	`420`	`}`
`421`	`421`	`if intVal, err := typeutils.ReformatInt64(val); err == nil {`