Commit 58a5b58
Merge branch 'keras-team:main' into simplify-hw-names
2 parents: 437c1bf + 8042207

File tree: 21 files changed, +2191 −89 lines

.github/workflows/e2e-tests.yaml

Lines changed: 3 additions & 1 deletion

```diff
@@ -44,6 +44,8 @@ jobs:
 
       - name: Set up gcloud
         uses: google-github-actions/setup-gcloud@v2
+        with:
+          install_components: "gke-gcloud-auth-plugin"
 
       - name: Get GKE credentials
         uses: google-github-actions/get-gke-credentials@v2
@@ -61,4 +63,4 @@ jobs:
           KERAS_REMOTE_PROJECT: ${{ secrets.GCP_PROJECT }}
           KERAS_REMOTE_ZONE: ${{ secrets.GKE_ZONE }}
           KERAS_REMOTE_CLUSTER: ${{ secrets.GKE_CLUSTER }}
-        run: python -m unittest discover -s tests/e2e -p "*_test.py" -v
+        run: python -m pytest tests/e2e/ -v -n auto
```
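The new test command parallelizes across workers with `-n auto`, an option provided by the pytest-xdist plugin rather than pytest itself; a minimal local equivalent of the CI step (package names are the standard PyPI ones, assumed here) would be:

```shell
# -n auto (one worker per CPU core) comes from the pytest-xdist plugin,
# so the environment needs both packages installed.
pip install pytest pytest-xdist
python -m pytest tests/e2e/ -v -n auto
```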

AGENTS.md

Lines changed: 51 additions & 4 deletions

````diff
@@ -10,6 +10,7 @@ Keras Remote lets users execute Keras/JAX workloads on cloud TPUs and GPUs via a
 keras_remote/
 ├── core/    # @run decorator, accelerator registry & parser
 ├── backend/ # Job execution backends (GKE, Pathways)
+├── data/    # Data class for declaring data dependencies
 ├── infra/   # Docker container building & caching
 ├── runner/  # Remote worker entrypoint (runs inside container)
 ├── utils/   # Serialization (packager) and Cloud Storage helpers
@@ -26,7 +27,7 @@ keras_remote/
 @keras_remote.run() called
   → JobContext.from_params()  # Resolve config from args/env vars
   → ensure_credentials()      # Verify/auto-configure gcloud, ADC, kubeconfig
-  → _prepare_artifacts()      # Serialize function (cloudpickle), zip working dir
+  → _prepare_artifacts()      # Upload Data, serialize function, zip working dir
   → _build_container()        # Build or retrieve cached Docker image
   → _upload_artifacts()       # Upload payload.pkl, context.zip to GCS
   → backend.submit_job()      # Create K8s Job (GKE) or LeaderWorkerSet (Pathways)
@@ -46,9 +47,10 @@ keras_remote/
 | `backend/gke_client.py` | K8s Job creation, status polling, pod log retrieval |
 | `backend/pathways_client.py` | LeaderWorkerSet creation for multi-host TPUs |
 | `infra/container_builder.py` | Content-hashed Docker image building via Cloud Build |
-| `utils/packager.py` | `save_payload()` (cloudpickle), `zip_working_dir()` |
-| `utils/storage.py` | GCS upload/download/cleanup for job artifacts |
-| `runner/remote_runner.py` | Runs inside container: deserialize, execute, upload result |
+| `data/data.py` | `Data` class, content hashing, data ref serialization |
+| `utils/packager.py` | `save_payload()` (cloudpickle), `zip_working_dir()`, Data ref extraction |
+| `utils/storage.py` | GCS upload/download/cleanup for job artifacts and Data cache |
+| `runner/remote_runner.py` | Runs inside container: resolve Data refs/volumes, execute, upload result |
 | `cli/commands/pool.py` | Node pool add/remove/list commands |
 | `cli/infra/post_deploy.py` | kubectl, LWS CRD, GPU driver setup after stack.up() |
 | `cli/constants.py` | CLI defaults, paths, API list |
@@ -59,8 +61,53 @@ keras_remote/
 - **`JobContext`** (`backend/execution.py`): Mutable dataclass carrying all job state through the pipeline — inputs, generated IDs, artifact paths, image URI.
 - **`BaseK8sBackend`** (`backend/execution.py`): Base class with `submit_job`, `wait_for_job`, `cleanup_job`. Subclassed by `GKEBackend` and `PathwaysBackend`.
 - **`GpuConfig` / `TpuConfig`** (`core/accelerators.py`): Frozen dataclasses for accelerator metadata. Single source of truth used by runtime, container builder, and CLI.
+- **`Data`** (`data/data.py`): Wraps a local path or GCS URI. Passed as a function argument or via the `volumes` decorator parameter. Resolved to a plain filesystem path on the remote pod. Content-hashed for upload caching.
 - **`InfraConfig` / `NodePoolConfig`** (`cli/config.py`): CLI provisioning configuration. `InfraConfig` holds project, zone, cluster name, and a list of `NodePoolConfig` entries. `NodePoolConfig` pairs a unique pool name (e.g., `gpu-l4-a3f2`) with a `GpuConfig` or `TpuConfig`.
 
+## Data API
+
+The `Data` class (`keras_remote.Data`) declares data dependencies for remote functions. It accepts local file/directory paths or GCS URIs (`gs://...`).
+
+### Two usage patterns
+
+**Function arguments** — `Data` objects passed as args/kwargs are uploaded to GCS, serialized as data ref dicts in the payload, and resolved to local paths on the pod:
+
+```python
+@keras_remote.run(accelerator="v3-8")
+def train(data_dir, config_path):
+    ...  # data_dir and config_path are plain strings
+
+train(Data("./dataset/"), Data("./config.json"))
+```
+
+**Volumes** — `Data` objects in the `volumes=` decorator parameter are downloaded to fixed mount paths before execution:
+
+```python
+@keras_remote.run(accelerator="v3-8", volumes={"/data": Data("./dataset/")})
+def train():
+    files = os.listdir("/data")  # available at mount path
+```
+
+Both patterns can be combined. `Data` objects can also be nested inside lists, dicts, and other containers — they are recursively discovered and resolved.
+
+### Content-addressed caching
+
+Local `Data` objects are content-hashed (SHA-256 over sorted file contents). Uploads go to `gs://{bucket}/{namespace}/data-cache/{hash}/`. A `.cache_marker` sentinel enables O(1) cache-hit checks. Identical data is uploaded only once.
+
+### Pipeline integration
+
+During `_prepare_artifacts()`:
+
+1. Upload `Data` from `volumes` and function args via `storage.upload_data()` (content-addressed)
+2. Replace `Data` objects in args/kwargs with serializable `__data_ref__` dicts
+3. Local `Data` paths inside the caller directory are auto-excluded from `context.zip`
+
+On the remote pod (`remote_runner.py`):
+
+1. `resolve_volumes()` — download volume data to mount paths
+2. `resolve_data_refs()` — recursively resolve `__data_ref__` dicts in args/kwargs to local paths
+3. Single-file `Data` resolves to the file path; directory `Data` resolves to the directory path
+
 ## Conventions
 
 ### Code Style
````
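The content-addressed caching scheme documented above (SHA-256 over sorted file contents, uploads keyed by hash under `data-cache/`) can be sketched as follows. This is an illustrative standalone sketch, not the actual `data/data.py` implementation — the helper names `content_hash` and `cache_prefix` and the use of `os.walk` are assumptions:

```python
import hashlib
import os


def content_hash(path: str) -> str:
    """SHA-256 over sorted file contents (illustrative sketch)."""
    h = hashlib.sha256()
    if os.path.isfile(path):
        with open(path, "rb") as f:
            h.update(f.read())
    else:
        # Walk in sorted order and mix in relative paths so the
        # digest is deterministic and sensitive to file renames.
        for root, _dirs, files in sorted(os.walk(path)):
            for name in sorted(files):
                full = os.path.join(root, name)
                h.update(os.path.relpath(full, path).encode())
                with open(full, "rb") as f:
                    h.update(f.read())
    return h.hexdigest()


def cache_prefix(bucket: str, namespace: str, path: str) -> str:
    """GCS layout stated in the doc: gs://{bucket}/{namespace}/data-cache/{hash}/"""
    return f"gs://{bucket}/{namespace}/data-cache/{content_hash(path)}/"
```

Because the prefix depends only on content, re-uploading identical data resolves to the same GCS location, which is what makes the `.cache_marker` O(1) hit check possible.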

examples/example_data_api.py

Lines changed: 109 additions & 0 deletions (new file)

```python
import json
import os
import tempfile

import keras_remote
from keras_remote import Data

# Setup: create temporary dummy data
tmp_dir = tempfile.mkdtemp(prefix="kr-data-example-")
dataset_dir = os.path.join(tmp_dir, "dataset")
os.makedirs(dataset_dir, exist_ok=True)

# A small CSV file used by several tests below.
train_csv = os.path.join(dataset_dir, "train.csv")
with open(train_csv, "w") as f:
    f.write("feature,label\n1,100\n2,200\n3,300\n")

# A JSON config file used by the single-file and mixed tests.
config_json = os.path.join(tmp_dir, "config.json")
with open(config_json, "w") as f:
    json.dump({"lr": 0.01, "epochs": 10}, f)

print(f"Created temp data in {tmp_dir}\n")


# Data as function arg (local directory)
@keras_remote.run(accelerator="cpu")
def test_data_arg(data_dir):
    files = sorted(os.listdir(data_dir))
    with open(f"{data_dir}/train.csv") as f:
        content = f.read()
    return {"files": files, "content": content}


result = test_data_arg(Data(dataset_dir))
print(f"Test 1 (dir arg): {result}")
assert result["files"] == ["train.csv"]
assert "1,100" in result["content"]


# Data as function arg (single file)
@keras_remote.run(accelerator="cpu")
def test_file_arg(config_path):
    with open(config_path) as f:
        return json.load(f)


result = test_file_arg(Data(config_json))
print(f"Test 2 (file arg): {result}")
assert result["lr"] == 0.01

# Cache hit (re-run same data, check logs for "cache hit")
result = test_file_arg(Data(config_json))
print(f"Test 3 (cache hit): {result}")
assert result["lr"] == 0.01


# volumes (fixed-path mount)
@keras_remote.run(
    accelerator="cpu",
    volumes={"/data": Data(dataset_dir)},
)
def test_volumes():
    files = sorted(os.listdir("/data"))
    with open("/data/train.csv") as f:
        content = f.read()
    return {"files": files, "content": content}


result = test_volumes()
print(f"Test 4 (volumes): {result}")
assert result["files"] == ["train.csv"]


# Mixed — volumes + Data arg + plain arg
@keras_remote.run(
    accelerator="cpu",
    volumes={"/weights": Data(dataset_dir)},
)
def test_mixed(config_path, lr=0.001):
    with open(config_path) as f:
        cfg = json.load(f)
    has_weights = os.path.isdir("/weights")
    return {"config": cfg, "lr": lr, "has_weights": has_weights}


result = test_mixed(Data(config_json), lr=0.01)
print(f"Test 5 (mixed): {result}")
assert result["config"]["lr"] == 0.01
assert result["lr"] == 0.01
assert result["has_weights"] is True


# Data in nested structure
@keras_remote.run(accelerator="cpu")
def test_nested(datasets):
    return [sorted(os.listdir(d)) for d in datasets]


result = test_nested(
    datasets=[
        Data(dataset_dir),
        Data(dataset_dir),
    ]
)
print(f"Test 6 (nested): {result}")
assert len(result) == 2

print("\nAll E2E tests passed!")
```

keras_remote/__init__.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -6,3 +6,4 @@
 os.environ.setdefault("GRPC_ENABLE_FORK_SUPPORT", "0")
 
 from keras_remote.core.core import run as run
+from keras_remote.data import Data as Data
```

keras_remote/backend/execution.py

Lines changed: 61 additions & 9 deletions

```diff
@@ -18,7 +18,9 @@
 from keras_remote.backend import gke_client, pathways_client
 from keras_remote.constants import get_default_zone, zone_to_region
 from keras_remote.credentials import ensure_credentials
+from keras_remote.data import _make_data_ref
 from keras_remote.infra import container_builder
+from keras_remote.infra.infra import get_default_project
 from keras_remote.utils import packager, storage
 
 
@@ -46,6 +48,9 @@ class JobContext:
     region: str = field(init=False)
     display_name: str = field(init=False)
 
+    # Data volumes {mount_path: Data}
+    volumes: Optional[dict] = None
+
     # Artifact paths (set during prepare phase)
     payload_path: Optional[str] = None
     context_path: Optional[str] = None
@@ -68,14 +73,13 @@ def from_params(
         zone: Optional[str],
         project: Optional[str],
         env_vars: dict,
+        volumes: Optional[dict] = None,
     ) -> "JobContext":
         """Factory method with default resolution for zone/project."""
         if not zone:
             zone = get_default_zone()
         if not project:
-            project = os.environ.get("KERAS_REMOTE_PROJECT") or os.environ.get(
-                "GOOGLE_CLOUD_PROJECT"
-            )
+            project = get_default_project()
         if not project:
             raise ValueError(
                 "project must be specified or set KERAS_REMOTE_PROJECT"
@@ -91,13 +95,14 @@ def from_params(
             container_image=container_image,
             zone=zone,
             project=project,
+            volumes=volumes,
         )
 
 
 class BaseK8sBackend:
     """Base class for Kubernetes-based backends."""
 
-    def __init__(self, cluster: Optional[str] = None, namespace: str = "default"):
+    def __init__(self, cluster: str, namespace: str = "default"):
         self.cluster = cluster
         self.namespace = namespace
 
@@ -203,6 +208,14 @@ def _find_requirements(start_dir: str) -> Optional[str]:
     return None
 
 
+def _maybe_exclude(data_path, caller_path, exclude_paths):
+    """Add data_path to exclude_paths if it's inside the caller directory."""
+    data_abs = os.path.normpath(data_path)
+    caller_abs = os.path.normpath(caller_path)
+    if data_abs.startswith(caller_abs + os.sep) or data_abs == caller_abs:
+        exclude_paths.add(data_abs)
+
+
 def _prepare_artifacts(
     ctx: JobContext, tmpdir: str, caller_frame_depth: int = 3
 ) -> None:
@@ -212,21 +225,58 @@ def _prepare_artifacts(
     # Get caller directory
     frame = inspect.stack()[caller_frame_depth]
     module = inspect.getmodule(frame[0])
-    if module:
+    caller_path: str
+    if module and module.__file__:
         caller_path = os.path.dirname(os.path.abspath(module.__file__))
     else:
         caller_path = os.getcwd()
 
-    # Serialize function + args
+    # Process Data objects
+    exclude_paths: set[str] = set()
+    ref_map = {}  # id(Data) -> ref dict (for arg replacement)
+    volume_refs = []  # list of ref dicts (for volumes)
+
+    # Process volumes
+    if ctx.volumes:
+        for mount_path, data_obj in ctx.volumes.items():
+            gcs_uri = storage.upload_data(ctx.bucket_name, data_obj, ctx.project)
+            volume_refs.append(
+                _make_data_ref(gcs_uri, data_obj.is_dir, mount_path=mount_path)
+            )
+            if not data_obj.is_gcs:
+                _maybe_exclude(data_obj.path, caller_path, exclude_paths)
+
+    # Process Data in function args
+    data_refs = packager.extract_data_refs(ctx.args, ctx.kwargs)
+    for data_obj, _position in data_refs:
+        gcs_uri = storage.upload_data(ctx.bucket_name, data_obj, ctx.project)
+        ref_map[id(data_obj)] = _make_data_ref(gcs_uri, data_obj.is_dir)
+        if not data_obj.is_gcs:
+            _maybe_exclude(data_obj.path, caller_path, exclude_paths)
+
+    # Replace Data with refs in args/kwargs
+    if ref_map:
+        ctx.args, ctx.kwargs = packager.replace_data_with_refs(
+            ctx.args, ctx.kwargs, ref_map
+        )
+
+    # Serialize function + args (with volume refs)
     ctx.payload_path = os.path.join(tmpdir, "payload.pkl")
     packager.save_payload(
-        ctx.func, ctx.args, ctx.kwargs, ctx.env_vars, ctx.payload_path
+        ctx.func,
+        ctx.args,
+        ctx.kwargs,
+        ctx.env_vars,
+        ctx.payload_path,
+        volumes=volume_refs or None,
     )
     logging.info("Payload serialized to %s", ctx.payload_path)
 
-    # Zip working directory
+    # Zip working directory (excluding Data paths)
     ctx.context_path = os.path.join(tmpdir, "context.zip")
-    packager.zip_working_dir(caller_path, ctx.context_path)
+    packager.zip_working_dir(
+        caller_path, ctx.context_path, exclude_paths=exclude_paths
+    )
    logging.info("Context packaged to %s", ctx.context_path)
 
     # Find requirements.txt
@@ -258,6 +308,8 @@ def _build_container(ctx: JobContext) -> None:
 
 def _upload_artifacts(ctx: JobContext) -> None:
     """Phase 3: Upload artifacts to Cloud Storage."""
+    if ctx.payload_path is None or ctx.context_path is None:
+        raise ValueError("payload_path and context_path must be set before upload")
     logging.info("Uploading artifacts to Cloud Storage (job: %s)...", ctx.job_id)
     storage.upload_artifacts(
         bucket_name=ctx.bucket_name,
```
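The diff replaces `Data` objects in args/kwargs with `__data_ref__` dicts before pickling, and the runner resolves them back to local paths on the pod. The recursive walk can be sketched as below; the exact dict shape and the injected `download` callback are assumptions for illustration — the real `remote_runner.py` resolver also performs the GCS download itself:

```python
def resolve_data_refs(obj, download):
    """Recursively replace __data_ref__ dicts with local paths.

    `download` maps a GCS URI to a local filesystem path; injecting it
    lets the traversal be shown without any cloud dependency.
    """
    if isinstance(obj, dict):
        if "__data_ref__" in obj:
            # Single-file Data resolves to the file path; directory
            # Data resolves to the directory path (both via download).
            return download(obj["__data_ref__"])
        return {k: resolve_data_refs(v, download) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(resolve_data_refs(v, download) for v in obj)
    return obj
```

This mirrors why nested containers "just work" in the example file's `test_nested`: resolution is a plain structural recursion, so any mix of lists, tuples, and dicts is handled uniformly.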
