ENH: Add files for C3PO-PCLR in YODA

daniellepace · daniellepace · commit d55ad43543f7 · 2025-05-07T12:09:35.000-04:00
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/__init__.py b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/__init__.py
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/c3po_pclr_model_schema.json b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/c3po_pclr_model_schema.json
@@ -0,0 +1,16 @@
+{
+  "inputs": [ 
+    {
+      "name": "ecg",
+      "shape": [2500, 12],
+      "dtype": "FP32"
+    }
+  ],
+  "outputs": [
+    {
+      "name": "output_0",
+      "shape": [320],
+      "dtype": "FP32"
+    }
+  ]
+}
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/Dockerfile b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.9-slim
+WORKDIR /app
+# Install dependencies before copying the scripts so that editing a script
+# does not invalidate the cached dependency layer.
+COPY requirements.txt /app/
+# --no-cache-dir keeps pip's download cache out of the image layer.
+RUN pip install --no-cache-dir -r /app/requirements.txt
+COPY prepare.py /app/
+COPY finalize.py /app/
+# The script to run (e.g. prepare.py) is expected as the first container argument.
+ENTRYPOINT ["python"]
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/finalize.py b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/finalize.py
@@ -0,0 +1,51 @@
+import argparse
+import json
+import pandas as pd
+
+# Number of embedding dimensions; one `pclr_<i>` column is written for each.
+latent_dimensions = 320
+
+
+def finalize(input_csv, predictions_json, output_csv):
+    """Append the predicted embedding columns to the input rows and save a CSV.
+
+    Args:
+        input_csv: Path to the input CSV; `file_id` is read as a string.
+        predictions_json: Path to a JSON file whose "output_0" entry holds
+            one embedding per input row.
+        output_csv: Path the combined CSV is written to (without the index).
+
+    Raises:
+        ValueError: If the number of embeddings differs from the number of
+            rows in the input CSV.
+    """
+    with open(predictions_json, "r") as predictions_file:
+        predictions = json.load(predictions_file)
+
+    table = pd.read_csv(input_csv, dtype={"file_id": str})
+    vectors = predictions["output_0"]
+
+    # Rows are matched to embeddings by position, so the counts must agree.
+    if len(vectors) != len(table):
+        raise ValueError(f"Mismatch: {len(vectors)} predictions but {len(table)} rows in input CSV!")
+
+    column_names = [f'pclr_{i}' for i in range(latent_dimensions)]
+    embedding_frame = pd.DataFrame(vectors, columns=column_names)
+    combined = pd.concat([table, embedding_frame], axis=1)
+
+    combined.to_csv(output_csv, index=False)
+    print(f"✅ Predictions written to {output_csv} ({len(combined)} rows).")
+
+
+if __name__ == "__main__":
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--input", required=True, help="Path to input CSV")
+    cli.add_argument("--output", required=True, help="Path to final CSV with predictions")
+    cli.add_argument("--predictions", required=True, help="Path to predictions JSON")
+    options = cli.parse_args()
+
+    finalize(
+        input_csv=options.input,
+        predictions_json=options.predictions,
+        output_csv=options.output,
+    )
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/prepare.py b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/prepare.py
@@ -0,0 +1,79 @@
+import argparse
+
+import h5py
+import numpy as np
+import pandas as pd
+import smart_open
+
+# Lead names in the column order used for the output tensor.
+leads = [
+    'I', 'II', 'III', 'aVR', 'aVL', 'aVF',
+    'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
+]
+
+ECG_LENGTH = 2500
+ECG_SHAPE = (ECG_LENGTH, 12)
+ECG_HD5_PATH = 'ukb_ecg_rest'
+
+
+def ecg_as_tensor(ecg_file):
+    """Load one ECG HDF5 file as a float32 array of shape ECG_SHAPE.
+
+    Each lead is read from `ECG_HD5_PATH/strip_<lead>/instance_0`, linearly
+    resampled to ECG_LENGTH samples, and divided by 1000.
+
+    Args:
+        ecg_file: Path or URL of the HDF5 file; opened through smart_open.
+
+    Returns:
+        A (ECG_LENGTH, 12) float32 array with one column per entry of `leads`.
+    """
+    # The resampling grid is the same for every lead, so build it once.
+    target_grid = np.linspace(0, 1, ECG_LENGTH)
+    ecg = np.zeros(ECG_SHAPE, dtype=np.float32)
+    with smart_open.open(ecg_file, 'rb') as f:
+        with h5py.File(f, 'r') as hd5:
+            for column, lead_name in enumerate(leads):
+                lead = np.array(hd5[f'{ECG_HD5_PATH}/strip_{lead_name}/instance_0'])
+                source_grid = np.linspace(0, 1, lead.shape[0])
+                # NOTE(review): the /1000 is presumably a unit conversion
+                # (e.g. microvolts to millivolts) -- confirm against the model.
+                ecg[:, column] = np.interp(target_grid, source_grid, lead) / 1000
+
+    return ecg
+
+
+def prepare(input_csv, output_h5):
+    """Convert the ECG files listed in a CSV into one HDF5 file of tensors.
+
+    Rows without a `file` value are skipped. Every remaining row is loaded
+    with ecg_as_tensor() and stored as dataset `tensors/<file_id>`.
+
+    Args:
+        input_csv: Path to a CSV with `file_id` and `file` columns; `file`
+            may be any path or URL that smart_open can open.
+        output_h5: Path of the HDF5 file to create (overwritten if present).
+    """
+    # Read file_id as text so identifiers such as "007" keep their leading
+    # zeros and match the string file_id that finalize.py reads.
+    df = pd.read_csv(input_csv, dtype={"file_id": str, "file": str})
+    df = df.dropna(subset=["file"])
+
+    # The context manager closes the output file even if a row fails.
+    with h5py.File(output_h5, "w") as h5_file:
+        tensors_group = h5_file.create_group("tensors")
+        for sample_id, file_path in zip(df["file_id"], df["file"]):
+            print(f"Processing: sample_id={sample_id}, file_path={file_path}, type={type(file_path)}")
+            tensor = ecg_as_tensor(file_path)
+            tensors_group.create_dataset(str(sample_id), data=tensor)
+
+    print(f"Processed ECG tensors saved to {output_h5}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", required=True, help="Path to input CSV")
+    parser.add_argument("--output", required=True, help="Path to output HDF5 file")
+    args = parser.parse_args()
+
+    prepare(args.input, args.output)
diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/requirements.txt b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/requirements.txt
@@ -0,0 +1,4 @@
+pandas
+numpy
+h5py
+smart-open[gcs]

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +pandas
 +numpy
 +h5py
 +smart-open[gcs]