Commit 849e429

feature(spill): Add resource spilling support
1 parent bcce9b6 commit 849e429

7 files changed: +1544 −15 lines


Cargo.toml

Lines changed: 2 additions & 1 deletion
@@ -7,7 +7,7 @@ description = "A batch processing clone of Apache Beam in Rust."
 repository = "https://github.com/nhubbard/ironbeam"
 
 [features]
-default = ["io-jsonl", "io-csv", "io-parquet", "parallel-io", "compression-gzip", "compression-zstd", "compression-bzip2", "compression-xz", "metrics", "checkpointing"]
+default = ["io-jsonl", "io-csv", "io-parquet", "parallel-io", "compression-gzip", "compression-zstd", "compression-bzip2", "compression-xz", "metrics", "checkpointing", "spilling"]
 
 # IO backends
 io-jsonl = []
@@ -24,6 +24,7 @@ compression-xz = ["dep:xz2"]
 parallel-io = []
 metrics = []
 checkpointing = ["dep:bincode", "dep:sha2"]
+spilling = ["dep:bincode"]
 
 [dependencies]
 anyhow = "1"
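Editor's note: because this commit adds `spilling` to ironbeam's *default* feature set, downstream crates get it automatically. A consumer that wants to opt out would disable default features and re-list only what it needs — a hypothetical downstream manifest (version and feature choices are placeholders):

```toml
# Hypothetical consumer Cargo.toml fragment, not part of this commit.
# Disabling default features drops "spilling" (and everything else in the
# default set), so the features actually wanted must be listed explicitly.
[dependencies]
ironbeam = { version = "*", default-features = false, features = ["io-jsonl", "metrics"] }
```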

README.md

Lines changed: 158 additions & 14 deletions
@@ -4,17 +4,21 @@ A data processing framework for Rust inspired by Apache Beam and Google Cloud Da
 
 ## Features
 
-- Declarative pipeline API with fluent interface
-- Stateless operations: `map`, `filter`, `flat_map`, `map_batches`
-- Stateful operations: `group_by_key`, `combine_values`, keyed aggregations
-- Built-in combiners: Sum, Min, Max, Average, DistinctCount, TopK
-- Join support: inner, left, right, and full outer joins
-- Side inputs for enriching streams with auxiliary data
-- Sequential and parallel execution modes
-- Type-safe with compile-time correctness
-- Optional I/O backends: JSON Lines, CSV, Parquet
-- Optional compression: gzip, zstd, bzip2, xz
-- Metrics collection and checkpointing support
+- **Declarative pipeline API** with fluent interface
+- **Stateless operations**: `map`, `filter`, `flat_map`, `map_batches`
+- **Stateful operations**: `group_by_key`, `combine_values`, keyed aggregations
+- **Built-in combiners**: Sum, Min, Max, Average, DistinctCount, TopK
+- **Join support**: inner, left, right, and full outer joins
+- **Side inputs** for enriching streams with auxiliary data
+- **Sequential and parallel execution** modes
+- **Type-safe** with compile-time correctness
+- **Optional I/O backends**: JSON Lines, CSV, Parquet
+- **Optional compression**: gzip, zstd, bzip2, xz
+- **Metrics collection** and **checkpointing** for fault tolerance
+- **Automatic memory spilling** to disk for memory-constrained environments
+- **Cloud I/O abstractions** for provider-agnostic cloud integrations
+- **Data validation utilities** for production-grade error handling
+- **Comprehensive testing framework** with fixtures and assertions
 
 ## Installation
 
@@ -44,6 +48,7 @@ Available feature flags:
 - `parallel-io` - parallel I/O operations
 - `metrics` - pipeline metrics collection
 - `checkpointing` - checkpoint and recovery support
+- `spilling` - automatic memory spilling to disk
 
 ## Quick Start
 
@@ -224,7 +229,7 @@ let windowed = data
 
 ### Checkpointing
 
-Save and restore the pipeline's state:
+Save and restore the pipeline's state for fault tolerance:
 
 ```rust
 data.checkpoint("checkpoints/step1")?;
@@ -243,6 +248,87 @@ let metrics = p.get_metrics();
 println!("Elements processed: {}", metrics.elements_processed);
 ```
 
+### Automatic Memory Spilling
+
+For memory-constrained environments, Ironbeam can automatically spill data to disk when memory limits are exceeded:
+
+```rust
+use ironbeam::spill::SpillConfig;
+use ironbeam::spill_integration::init_spilling;
+
+// Configure automatic spilling with a 100MB memory limit
+init_spilling(SpillConfig::new()
+    .with_memory_limit(100 * 1024 * 1024)
+    .with_spill_directory("/tmp/ironbeam-spill")
+    .with_compression(true));
+
+// Now run your pipeline - spilling happens automatically
+let results = large_dataset
+    .map(|x| heavy_computation(x))
+    .collect_seq()?;
+```
+
+Data is transparently spilled to disk when memory pressure is detected and automatically restored when needed. This allows processing datasets larger than available RAM without manual intervention.
+
+[Learn more about spilling →](https://github.com/nhubbard/ironbeam/blob/main/src/spill.rs)
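Editor's note: the spill-to-disk pattern this section describes can be sketched library-agnostically — a bounded in-memory buffer that serializes overflow to a backing file and streams it back on read. Everything below (`SpillBuffer`, its methods, the `u64` record type) is illustrative only, not ironbeam's actual implementation:

```rust
use std::fs::{File, OpenOptions};
use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
use std::path::Path;

/// Illustrative spill buffer: keeps at most `limit` records in memory and
/// writes the overflow to a backing file. Not ironbeam's real API.
struct SpillBuffer {
    limit: usize,
    in_memory: Vec<u64>,
    spill: BufWriter<File>,
    spilled: usize,
}

impl SpillBuffer {
    fn new(limit: usize, path: &Path) -> std::io::Result<Self> {
        // Open read+write so the spilled records can be streamed back later.
        let file = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(path)?;
        Ok(Self { limit, in_memory: Vec::new(), spill: BufWriter::new(file), spilled: 0 })
    }

    /// Push a record; once the in-memory limit is hit, overflow goes to disk.
    fn push(&mut self, value: u64) -> std::io::Result<()> {
        if self.in_memory.len() < self.limit {
            self.in_memory.push(value);
        } else {
            self.spill.write_all(&value.to_le_bytes())?;
            self.spilled += 1;
        }
        Ok(())
    }

    /// Drain everything back: in-memory records first, then the spilled ones.
    fn into_vec(self) -> std::io::Result<Vec<u64>> {
        // into_inner flushes the BufWriter and hands back the File.
        let mut file = self.spill.into_inner().map_err(|e| e.into_error())?;
        file.seek(SeekFrom::Start(0))?;
        let mut out = self.in_memory;
        let mut reader = BufReader::new(file);
        let mut buf = [0u8; 8];
        for _ in 0..self.spilled {
            reader.read_exact(&mut buf)?;
            out.push(u64::from_le_bytes(buf));
        }
        Ok(out)
    }
}
```

A real implementation would additionally track global memory pressure and restore records lazily rather than all at once, but the buffer-then-stream-back shape is the core of the technique.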
+
+### Cloud I/O Abstractions
+
+Write provider-agnostic code that works with any cloud service:
+
+```rust
+use ironbeam::io::cloud::*;
+
+// Use fake implementations for testing
+let storage = FakeObjectIO::new();
+storage.put_object("bucket", "key", b"data")?;
+
+// In production, swap in real implementations (AWS S3, GCS, Azure Blob, etc.)
+// without changing your business logic
+```
+
+Supported cloud service categories:
+- **Storage**: Object storage (S3/GCS/Azure), warehouses (BigQuery/Redshift), databases
+- **Messaging**: Pub/sub (Kafka/Kinesis), queues (SQS), notifications (SNS)
+- **Compute**: Serverless functions (Lambda/Cloud Functions)
+- **ML**: Model inference (SageMaker/Vertex AI)
+- **Observability**: Metrics, logging, configuration, caching
+
+All traits are synchronous by design and include fake implementations for unit testing.
+
+[Learn more about cloud I/O →](https://github.com/nhubbard/ironbeam/blob/main/src/io/cloud/mod.rs)
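Editor's note: the trait-plus-fake pattern this section describes can be sketched in a few lines. The trait and struct shapes below are inferred from the snippet and are assumptions, not the crate's actual definitions:

```rust
use std::collections::HashMap;
use std::sync::Mutex;

/// Provider-agnostic object-storage trait (assumed shape; synchronous by design).
trait ObjectIO {
    fn put_object(&self, bucket: &str, key: &str, data: &[u8]) -> Result<(), String>;
    fn get_object(&self, bucket: &str, key: &str) -> Result<Vec<u8>, String>;
}

/// In-memory fake for unit tests; a real backend (S3, GCS, Azure Blob)
/// would implement the same trait against its SDK.
struct FakeObjectIO {
    objects: Mutex<HashMap<(String, String), Vec<u8>>>,
}

impl FakeObjectIO {
    fn new() -> Self {
        Self { objects: Mutex::new(HashMap::new()) }
    }
}

impl ObjectIO for FakeObjectIO {
    fn put_object(&self, bucket: &str, key: &str, data: &[u8]) -> Result<(), String> {
        self.objects
            .lock()
            .unwrap()
            .insert((bucket.to_string(), key.to_string()), data.to_vec());
        Ok(())
    }

    fn get_object(&self, bucket: &str, key: &str) -> Result<Vec<u8>, String> {
        self.objects
            .lock()
            .unwrap()
            .get(&(bucket.to_string(), key.to_string()))
            .cloned()
            .ok_or_else(|| format!("no such object: {bucket}/{key}"))
    }
}

/// Business logic depends only on the trait, so backends are swappable.
fn archive(io: &dyn ObjectIO, payload: &[u8]) -> Result<(), String> {
    io.put_object("archive", "latest", payload)
}
```

Because `archive` takes `&dyn ObjectIO`, swapping the fake for a production implementation requires no change to the business logic — which is the point of the abstraction.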
+
+### Data Validation
+
+Handle bad data gracefully in production pipelines:
+
+```rust
+use ironbeam::validation::*;
+
+impl Validate for MyRecord {
+    fn validate(&self) -> ValidationResult {
+        let mut errors = Vec::new();
+
+        if self.email.is_empty() {
+            errors.push(ValidationError::field("email", "Email required"));
+        }
+
+        if self.age < 0 || self.age > 150 {
+            errors.push(ValidationError::field("age", "Invalid age"));
+        }
+
+        if errors.is_empty() { Ok(()) } else { Err(errors) }
+    }
+}
+
+// Apply validation with configurable error handling
+let validated = data
+    .try_map(|record: &MyRecord| record.validate())
+    .collect_fail_fast()?; // Or use .collect_skip_errors()
+```
+
+[Learn more about validation →](https://github.com/nhubbard/ironbeam/blob/main/src/validation.rs)
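Editor's note: the fail-fast versus skip-errors distinction the snippet mentions can be sketched without the ironbeam API. The two free functions below are illustrative stand-ins for the collection modes, not the library's actual signatures:

```rust
/// Illustrative validation error; the real type lives in ironbeam::validation.
#[derive(Debug, PartialEq)]
struct ValidationError {
    field: &'static str,
    message: &'static str,
}

/// Fail-fast: stop at the first invalid record and surface its errors.
fn collect_fail_fast<T>(
    records: Vec<T>,
    validate: impl Fn(&T) -> Result<(), Vec<ValidationError>>,
) -> Result<Vec<T>, Vec<ValidationError>> {
    for r in &records {
        validate(r)?;
    }
    Ok(records)
}

/// Skip-errors: silently drop invalid records and keep the rest.
fn collect_skip_errors<T>(
    records: Vec<T>,
    validate: impl Fn(&T) -> Result<(), Vec<ValidationError>>,
) -> Vec<T> {
    records.into_iter().filter(|r| validate(r).is_ok()).collect()
}
```

Fail-fast suits batch jobs where bad input should abort the run; skip-errors suits best-effort pipelines where partial output is acceptable.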
+
 ## Examples
 
 The `examples/` directory contains complete demonstrations:
@@ -254,22 +340,80 @@ The `examples/` directory contains complete demonstrations:
 - `compressed_io.rs` - Working with compressed files
 - `checkpointing_demo.rs` - Checkpoint and recovery
 - `metrics_example.rs` - Collecting metrics
+- `cloud_io_demo.rs` - Cloud service integrations
+- `data_quality_validation.rs` - Production data validation
 - `testing_pipeline.rs` - Testing patterns
 
 Run examples with:
 
 ```bash
 cargo run --example etl_pipeline --features io-jsonl,io-csv
+cargo run --example cloud_io_demo
+```
+
+## Testing Your Pipelines
+
+Ironbeam includes a comprehensive testing framework with utilities specifically designed for pipeline testing:
+
+### Test Utilities
+
+```rust
+use ironbeam::testing::*;
+
+#[test]
+fn test_my_pipeline() -> anyhow::Result<()> {
+    let p = TestPipeline::new();
+
+    let result = from_vec(&p, vec![1, 2, 3])
+        .map(|x: &i32| x * 2)
+        .collect_seq()?;
+
+    // Specialized assertions for collections
+    assert_collections_equal(result, vec![2, 4, 6]);
+    assert_all(&result, |x| x % 2 == 0);
+
+    Ok(())
+}
 ```
 
-## Testing
+### Pre-built Test Fixtures
 
-Run tests with:
+```rust
+use ironbeam::testing::fixtures::*;
+
+// Ready-to-use test datasets
+let logs = sample_log_entries();         // Web server logs
+let users = user_product_interactions(); // Relational data
+let series = time_series_data();         // Sensor readings
+```
+
+### Debug Utilities
+
+```rust
+// Inspect data during test execution
+let result = data
+    .debug_inspect("after filter")
+    .filter(|x: &i32| x > 10)
+    .debug_count("filtered count")
+    .collect_seq()?;
+```
+
+[Learn more about testing →](https://github.com/nhubbard/ironbeam/blob/main/src/testing.rs)
+
+### Running Tests
+
+Run the full test suite:
 
 ```bash
 cargo test
 ```
 
+Run tests with specific features:
+
+```bash
+cargo test --features spilling
+```
+
 For coverage:
 
 ```bash

src/lib.rs

Lines changed: 6 additions & 0 deletions
@@ -188,6 +188,7 @@
 //! - `parallel-io` - Enable parallel I/O operations (`write_*_par` methods)
 //! - `metrics` - Enable metrics collection and reporting (enabled by default)
 //! - `checkpointing` - Enable automatic checkpointing for fault tolerance (enabled by default)
+//! - `spilling` - Enable automatic memory spilling to disk (enabled by default)
 //!
 //! ## Examples
 //!
@@ -534,6 +535,11 @@ pub mod metrics;
 #[cfg(feature = "checkpointing")]
 pub mod checkpoint;
 
+#[cfg(feature = "spilling")]
+pub mod spill;
+
+pub mod spill_integration;
+
 // General re-exports
 pub use collection::{CombineFn, Count, PCollection, RFBound};
 pub use combiners::{AverageF64, DistinctCount, Max, Min, Sum, TopK};
