Skip to content

Commit a41579a

Browse files
Add Data API examples, e2e tests, and CI updates (#67)
1 parent a4569fe commit a41579a

File tree

5 files changed

+501
-2
lines changed

5 files changed

+501
-2
lines changed

.github/workflows/e2e-tests.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ jobs:
4444

4545
- name: Set up gcloud
4646
uses: google-github-actions/setup-gcloud@v2
47+
with:
48+
install_components: "gke-gcloud-auth-plugin"
4749

4850
- name: Get GKE credentials
4951
uses: google-github-actions/get-gke-credentials@v2
@@ -61,4 +63,4 @@ jobs:
6163
KERAS_REMOTE_PROJECT: ${{ secrets.GCP_PROJECT }}
6264
KERAS_REMOTE_ZONE: ${{ secrets.GKE_ZONE }}
6365
KERAS_REMOTE_CLUSTER: ${{ secrets.GKE_CLUSTER }}
64-
run: python -m unittest discover -s tests/e2e -p "*_test.py" -v
66+
run: python -m pytest tests/e2e/ -v -n auto

examples/example_data_api.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
"""End-to-end examples for the keras_remote Data API.

Exercises the main ways data can be shipped to a remote function:
as a ``Data`` argument (directory or single file), via fixed-path
``volumes`` mounts, mixed with plain arguments, and nested inside
containers.  Each section prints and asserts on the returned values.
"""

import atexit
import json
import os
import shutil
import tempfile

import keras_remote
from keras_remote import Data

# Setup: create temporary dummy data.  Register cleanup so repeated
# runs don't accumulate "kr-data-example-*" directories in the temp
# root (mkdtemp alone never removes what it creates).
tmp_dir = tempfile.mkdtemp(prefix="kr-data-example-")
atexit.register(shutil.rmtree, tmp_dir, ignore_errors=True)

dataset_dir = os.path.join(tmp_dir, "dataset")
os.makedirs(dataset_dir, exist_ok=True)

# A small CSV file used by several tests below.
train_csv = os.path.join(dataset_dir, "train.csv")
with open(train_csv, "w") as f:
    f.write("feature,label\n1,100\n2,200\n3,300\n")

# A JSON config file used by the single-file and mixed tests.
config_json = os.path.join(tmp_dir, "config.json")
with open(config_json, "w") as f:
    json.dump({"lr": 0.01, "epochs": 10}, f)

print(f"Created temp data in {tmp_dir}\n")


# Data as function arg (local directory)
@keras_remote.run(accelerator="cpu")
def test_data_arg(data_dir):
    files = sorted(os.listdir(data_dir))
    with open(f"{data_dir}/train.csv") as f:
        content = f.read()
    return {"files": files, "content": content}


result = test_data_arg(Data(dataset_dir))
print(f"Test 1 (dir arg): {result}")
assert result["files"] == ["train.csv"]
assert "1,100" in result["content"]


# Data as function arg (single file)
@keras_remote.run(accelerator="cpu")
def test_file_arg(config_path):
    with open(config_path) as f:
        return json.load(f)


result = test_file_arg(Data(config_json))
print(f"Test 2 (file arg): {result}")
assert result["lr"] == 0.01

# Cache hit (re-run same data, check logs for "cache hit")
result = test_file_arg(Data(config_json))
print(f"Test 3 (cache hit): {result}")
assert result["lr"] == 0.01


# volumes (fixed-path mount)
@keras_remote.run(
    accelerator="cpu",
    volumes={"/data": Data(dataset_dir)},
)
def test_volumes():
    files = sorted(os.listdir("/data"))
    with open("/data/train.csv") as f:
        content = f.read()
    return {"files": files, "content": content}


result = test_volumes()
print(f"Test 4 (volumes): {result}")
assert result["files"] == ["train.csv"]


# Mixed — volumes + Data arg + plain arg
@keras_remote.run(
    accelerator="cpu",
    volumes={"/weights": Data(dataset_dir)},
)
def test_mixed(config_path, lr=0.001):
    with open(config_path) as f:
        cfg = json.load(f)
    has_weights = os.path.isdir("/weights")
    return {"config": cfg, "lr": lr, "has_weights": has_weights}


result = test_mixed(Data(config_json), lr=0.01)
print(f"Test 5 (mixed): {result}")
assert result["config"]["lr"] == 0.01
assert result["lr"] == 0.01
assert result["has_weights"] is True


# Data in nested structure
@keras_remote.run(accelerator="cpu")
def test_nested(datasets):
    return [sorted(os.listdir(d)) for d in datasets]


result = test_nested(
    datasets=[
        Data(dataset_dir),
        Data(dataset_dir),
    ]
)
print(f"Test 6 (nested): {result}")
assert len(result) == 2

print("\nAll E2E tests passed!")

keras_remote/infra/container_builder.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import tarfile
88
import tempfile
99
import time
10+
import uuid
1011

1112
from absl import logging
1213
from google.api_core import exceptions as google_exceptions
@@ -310,7 +311,7 @@ def _upload_build_source(tarball_path, bucket_name, project):
310311
bucket = client.bucket(bucket_name)
311312

312313
# Upload tarball
313-
blob_name = f"source-{int(time.time())}.tar.gz"
314+
blob_name = f"source-{int(time.time())}-{uuid.uuid4().hex[:8]}.tar.gz"
314315
blob = bucket.blob(blob_name)
315316
blob.upload_from_filename(tarball_path)
316317

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ cli = [
3434
]
3535
test = [
3636
"coverage>=7.0",
37+
"pytest-xdist>=3.0",
3738
]
3839
dev = [
3940
"pre-commit",

0 commit comments

Comments
 (0)