Open-Athena
diff --git a/experiments/defaults.py
Lines changed: 2 additions & 1 deletion b/experiments/defaults.py
Lines changed: 2 additions & 1 deletion
diff --git a/experiments/plantcad/README.md
Lines changed: 39 additions & 80 deletions b/experiments/plantcad/README.md
Lines changed: 39 additions & 80 deletions
@@ -260,7 +260,8 @@ def default_train(
         tags: Any additional tags to add to the Wandb tracker.
         use_default_validation: Whether to use the default validation sets (currently Paloma).
         eval_harness_tasks: List of evaluation harness tasks. Defaults to the CORE set of tasks. Use () or [] to disable
-        shuffle: Whether to shuffle the training data. True=full shuffle, False=no shuffle, int=era shuffle with that length.
+        shuffle: Whether to shuffle the training data. True=full shuffle, False=no shuffle,
+                 int=era shuffle with that length.
     """
 
     pretraining_data = _prepare_data_config(tokenized, use_default_validation, shuffle=shuffle)
 
@@ -2,10 +2,10 @@
 
 PlantCAD1 reproduction experiments.
 
-## Setup
-
 Original tutorial: https://gist.github.com/eric-czech/31e5b79689d322f7becb94a109ce0b75
 
+## Setup
+
 ### Local
 
 ```bash
@@ -15,7 +15,7 @@ uv venv --python 3.11
 uv sync
 ```
 
-### SkyPilot
+### Remote (SkyPilot)
 
 ```bash
 sky api stop; [ -d ~/.sky ] && rm -rf ~/.sky
@@ -43,10 +43,10 @@ EOF
 uv pip install "skypilot[lambda]==0.10.3"
 sky check lambda
 sky launch \
-  --cluster marin --infra lambda --num-nodes 1 --gpus "A100:8" --disk-size 100 \
+  --cluster marin --infra lambda --num-nodes 1 --gpus "A10:1" --disk-size 100 \
   --env HUGGING_FACE_HUB_TOKEN --env WANDB_API_KEY \
   output/cluster.sky.yaml --retry-until-up --yes
-rsync -rPz ./ marin:/home/ubuntu/sky_workdir --exclude '.venv' --exclude '.git' --exclude src/marin/markdown
+REMOTE_USER=ubuntu
 ```
 
 #### GCP
@@ -59,7 +59,7 @@ sky launch \
   --instance-type a2-highgpu-1g --region us-east1 \
   --env HUGGING_FACE_HUB_TOKEN --env WANDB_API_KEY \
   output/cluster.sky.yaml
-rsync -rPz ./ marin:/home/gcpuser/sky_workdir --exclude '.venv' --exclude '.git' --exclude src/marin/markdown
+REMOTE_USER=gcpuser
 ```
 
 #### CoreWeave
@@ -72,7 +72,7 @@ sky launch \
   --cpus 124 --memory 2008 \
   --env HUGGING_FACE_HUB_TOKEN --env WANDB_API_KEY \
   output/cluster.sky.yaml
-rsync -rPz ./ marin:/home/sky/sky_workdir --exclude '.venv' --exclude '.git' --exclude src/marin/markdown --exclude '__pycache__'
+REMOTE_USER=sky
 
 # For transformer-engine-jax:
 sudo apt update
@@ -81,39 +81,56 @@ sudo apt install build-essential g++ cmake ninja-build
 # hint: This error likely indicates that you need to install a library that provides "cuda_runtime_api.h" for `transformer-engine-jax@2.6.0.post1`
 ```
 
-#### Run
+## Execution
 
 ```bash
 ssh marin
 cd sky_workdir && conda deactivate && source .venv/bin/activate
 export RAY_DEBUG=legacy
 
-python -m experiments.plantcad.exp_pc1_tutorial --prefix local_store --force_run_failed true
-python -m experiments.plantcad.exp_pc1_batch_tune --prefix local_store --force_run_failed true
+# Code sync (run from your local machine, in the shell where REMOTE_USER was set; not inside the ssh session)
+rsync -rPz ./ marin:/home/$REMOTE_USER/sky_workdir \
+  --exclude '.venv' --exclude '.git' \
+  --exclude src/marin/markdown --exclude '__pycache__'
 
-python -m experiments.plantcad.exp_pc1_lr_tune --prefix local_store --force_run_failed true
+# Experiments and tuning
+python -m experiments.plantcad.scripts.exp_pc1_tutorial --prefix local_store --force_run_failed true
+python -m experiments.plantcad.scripts.exp_pc1_batch_tune --prefix local_store --force_run_failed true
+python -m experiments.plantcad.scripts.exp_pc1_lr_tune --prefix local_store --force_run_failed true
 find local_store | grep -E 'step-668$' | xargs -I {} echo "hf upload plantcad/_dev_marin_plantcad1_v1_lr_tune {} {} --repo-type model"
 
+# Training
 mkdir -p logs
 screen -S train
-python -m experiments.plantcad.exp_pc1_train \
+python -m experiments.plantcad.scripts.exp_pc1_train \
   --prefix local_store --force_run_failed true 2>&1 | tee logs/exp_pc1_train.log
-# https://wandb.ai/eric-czech/marin/runs/plantcad-train-300m-r01-2aa671
-# 
-
-python -m experiments.plantcad.exp_pc1_eval --prefix local_store --force_run_failed true
 
-sky exec -c marin output/task.sky.yaml
+# Evaluation (first removes any existing local_store/evaluation/dna-conservation* outputs)
+rm -rf local_store/evaluation/dna-conservation*; python -m experiments.plantcad.scripts.exp_pc1_eval --prefix local_store --force_run_failed true
 ```
 
-## EDA 
+```bash
+> python -m experiments.plantcad.misc.agg_eval_results
+roc_auc  step                                                                                                  checkpoint_path
+0.535217  1673  hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-1673
+0.546725  3346  hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-3346
+0.549917  5019  hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-5019
+0.558042  6692  hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-6692
+0.560290  8365  hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-8365
+0.565785 10038 hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-10038
+0.570048 11711 hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-11711
+0.576358 13384 hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-13384
+0.583593 15057 hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-15057
+0.585834 16730 hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-16730
+0.589215 18403 hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-18403
+0.588738 20076 hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-20076
+0.593178 21749 hf://plantcad/_dev_marin_plantcad1_v1_train/local_store/checkpoints/plantcad-train-300m-r02-432442/hf/step-21749
+```
 
-### Tokenizer stats
+## EDA
 
-From https://huggingface.co/kuleshov-group/PlantCaduceus_l20, e.g.:
-PlantCaduceus vocab size: 7
+Stats on kuleshov-group/Angiosperm_16_genomes:
 
-### Dataset stats
 ```
 > python count_dataset.py
 Number of examples: 5,485,282
@@ -137,61 +154,3 @@ Most common tokens:
 Total unique tokens: 5
 Token ID range: 2 - 6
 ```
-
-This means 2,808,464,384 / 20 ==> ~140.4M params is Chinchilla optimal for text.
-
-## TODO
-
-- Look for prefetch config
-- Debug: "Your setup doesn't support bf16/gpu." in eval with `bf16_full_eval`
-
-```
-# cat /tmp/ray/session_2025-09-20_04-11-22_232072_15326/runtime_resources/pip/f20b7e798eeb2fc9320b1a708aaeee4e0130ee14/virtualenv/lib/python3.11/site-packages/transformers/training_args.py | grep -i "doesn't support" -C 100
-if self.bf16 or self.bf16_full_eval:
-    if self.use_cpu and not is_torch_available() and not is_torch_xla_available():
-        # cpu
-        raise ValueError("Your setup doesn't support bf16/(cpu, tpu, neuroncore). You need torch>=1.10")
-    elif not self.use_cpu:
-        if not is_torch_bf16_gpu_available() and not is_torch_xla_available():  # added for tpu support
-            error_message = "Your setup doesn't support bf16/gpu."
-            if is_torch_cuda_available():
-                error_message += " You need Ampere+ GPU with cuda>=11.0"
-            # gpu
-            raise ValueError(error_message)
-```
-
-- Discuss: `levanter.data.loader - loader.py:258 - INFO :: Prefetch wasn't fast enough: 33.836.`
-- Discuss this:
-
-```
-# TODO: discuss https://github.com/jax-ml/jax/issues/24909
-# (train_lm_task pid=31054) /tmp/ray/session_2025-09-16_11-16-05_933535_22116/runtime_resources/pip/96e8d2e31c1b75b4d19a0ea2c755a672438fdca3/virtualenv/lib/python3.11/site-packages/levanter/layers/attention.py:428: UserWarning: transformer_engine is not installed. Please install it to use NVIDIA's optimized fused attention.. Falling back to the reference implementation.
-# (train_lm_task pid=31054)   warnings.warn(f"{msg}. Falling back to the reference implementation.")
-# (train_lm_task pid=31054) E0916 11:23:11.594742   31054 buffer_comparator.cc:150] Difference at 10780: 16.375, expected 14.5
-# (train_lm_task pid=31054) E0916 11:23:11.594787   31054 buffer_comparator.cc:150] Difference at 10942: 17.25, expected 15.25
-# (train_lm_task pid=31054) E0916 11:23:11.594791   31054 buffer_comparator.cc:150] Difference at 11042: 17, expected 15.1875
-# (train_lm_task pid=31054) E0916 11:23:11.594795   31054 buffer_comparator.cc:150] Difference at 11132: 16.875, expected 14.8125
-# (train_lm_task pid=31054) E0916 11:23:11.594801   31054 buffer_comparator.cc:150] Difference at 12211: 15, expected 16.875
-# (train_lm_task pid=31054) E0916 11:23:11.594804   31054 buffer_comparator.cc:150] Difference at 12212: 14.625, expected 16.625
-# (train_lm_task pid=31054) E0916 11:23:11.594807   31054 buffer_comparator.cc:150] Difference at 12235: 14.75, expected 16.625
-# (train_lm_task pid=31054) E0916 11:23:11.594809   31054 buffer_comparator.cc:150] Difference at 12276: 15.0625, expected 16.875
-# (train_lm_task pid=31054) E0916 11:23:11.594812   31054 buffer_comparator.cc:150] Difference at 12327: 14.5, expected 16.25
-# (train_lm_task pid=31054) E0916 11:23:11.594815   31054 buffer_comparator.cc:150] Difference at 12336: 15.5625, expected 17.5
-# (train_lm_task pid=31054) 2025-09-16 11:23:11.594824: E external/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc:1070] Results do not match the reference. This is likely a bug/unexpected loss of precision.
-```
-
-- Discuss these  constant errors in deleting checkpoints:
-
-```
-train_lm_task pid=295292) 2025-09-26T19:19:35 - 0 - levanter.checkpoint - checkpoint.py:383 - INFO :: Saved checkpoint to local_store/checkpoints/plantcad-train-300m-r02-432442/checkpoints/step-20076 for step 20076
-(train_lm_task pid=295292) 2025-09-26T19:19:35 - 0 - levanter.checkpoint - checkpoint.py:230 - INFO :: Deleting old temporary checkpoint local_store/checkpoints/plantcad-train-300m-r02-432442/checkpoints/step-20061 after saving new checkpoint.
-(train_lm_task pid=295292) 2025-09-26T19:19:35 - 0 - levanter.checkpoint - checkpoint.py:262 - INFO :: Removing checkpoint local_store/checkpoints/plantcad-train-300m-r02-432442/checkpoints/step-20061
-(train_lm_task pid=295292) 2025-09-26T19:19:35 - 0 - levanter.checkpoint - checkpoint.py:270 - INFO :: Deleting old checkpoint local_store/checkpoints/plantcad-train-300m-r02-432442/checkpoints/step-20061 from /home/sky/sky_workdir/local_store/checkpoints/plantcad-train-300m-r02-432442/checkpoints/local_store/checkpoints/plantcad-train-300m-r02-432442/checkpoints/step-20061
-(train_lm_task pid=295292) 2025-09-26T19:19:35 - 0 - levanter.checkpoint - checkpoint.py:276 - ERROR :: Failed to delete checkpoint local_store/checkpoints/plantcad-train-300m-r02-432442/checkpoints/step-20061
-(train_lm_task pid=295292) Traceback (most recent call last):
-(train_lm_task pid=295292)   File "/tmp/ray/session_2025-09-26_10-17-29_045489_276828/runtime_resources/pip/96e8d2e31c1b75b4d19a0ea2c755a672438fdca3/virtualenv/lib/python3.11/site-packages/levanter/checkpoint.py", line 272, in _do_rm_checkpoint
-(train_lm_task pid=295292)     fs.rm(cp_path, recursive=True)
-(train_lm_task pid=295292)   File "/tmp/ray/session_2025-09-26_10-17-29_045489_276828/runtime_resources/pip/96e8d2e31c1b75b4d19a0ea2c755a672438fdca3/virtualenv/lib/python3.11/site-packages/fsspec/implementations/local.py", line 191, in rm
-(train_lm_task pid=295292)     os.remove(p)
-(train_lm_task pid=295292) FileNotFoundError: [Errno 2] No such file or directory: '/home/sky/sky_workdir/local_store/checkpoints/plantcad-train-300m-r02-432442/checkpoints/local_store/checkpoints/plantcad-train-300m-r02-432442/checkpoints/step-20061'
-```