From c07d6c3ca54dd78373d35493f1ea35dedbd03faa Mon Sep 17 00:00:00 2001
From: Gonzalo Benegas
Date: Thu, 18 Dec 2025 21:31:04 +0000
Subject: [PATCH] Add experiment config

---
 README.md                                     |  1 +
 .../clm_transformer_base_new_dataset.yaml     | 33 +++++++++++++++++++
 ...nsformer_base_new_dataset_vertebrates.yaml | 33 +++++++++++++++++++
 3 files changed, 67 insertions(+)
 create mode 100644 configs/experiment/clm_transformer_base_new_dataset.yaml
 create mode 100644 configs/experiment/clm_transformer_base_new_dataset_vertebrates.yaml

diff --git a/README.md b/README.md
index b3ef77a..3c933de 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,7 @@ make test-full
 ```bash
 uv run hf download songlab/gpn-animal-promoter-dataset --repo-type dataset --local-dir data/gpn-animal-promoter-dataset
 uv run hf download gonzalobenegas/Angiosperm_16_genomes_sharded --repo-type dataset --local-dir data/gonzalobenegas/Angiosperm_16_genomes_sharded
+uv run hf download gonzalobenegas/genomes-v2-genome_set-animals-intervals-v1_512_256 --repo-type dataset --local-dir data/gonzalobenegas/genomes-v2-genome_set-animals-intervals-v1_512_256
 ```
 
 ## How to run
diff --git a/configs/experiment/clm_transformer_base_new_dataset.yaml b/configs/experiment/clm_transformer_base_new_dataset.yaml
new file mode 100644
index 0000000..5067ccd
--- /dev/null
+++ b/configs/experiment/clm_transformer_base_new_dataset.yaml
@@ -0,0 +1,33 @@
+# @package _global_
+
+# to execute this experiment run:
+# python glm_experiments/train.py experiment=clm_transformer_base_new_dataset
+
+defaults:
+  - override /data: gpn_animal_promoter
+  - override /model: clm_transformer_base
+  - override /trainer: gpn_animal_promoter
+
+logger:
+  wandb:
+    name: experiment-clm-transformer-base-new-dataset
+    tags: ["experiment", "clm", "transformer", "base", "new-dataset"]
+
+data:
+  _target_: glm_experiments.data.lm_datamodule.CLMDataModule
+  dataset_name: data/gonzalobenegas/genomes-v2-genome_set-animals-intervals-v1_512_256  # local dir written by the README `hf download` command
+  per_device_batch_size: 256
+  data_augmentation: false
+
+model:
+  scheduler:
+    _target_: transformers.get_cosine_with_min_lr_schedule_with_warmup
+    _partial_: true
+    num_warmup_steps: 2000
+    num_training_steps: ${trainer.max_steps}  # interpolated from trainer.max_steps below
+    min_lr_rate: 0.1  # Decay to 10% of max lr
+
+trainer:
+  max_steps: 20000
+  log_every_n_steps: 1000
+  val_check_interval: 1000
diff --git a/configs/experiment/clm_transformer_base_new_dataset_vertebrates.yaml b/configs/experiment/clm_transformer_base_new_dataset_vertebrates.yaml
new file mode 100644
index 0000000..43c2a96
--- /dev/null
+++ b/configs/experiment/clm_transformer_base_new_dataset_vertebrates.yaml
@@ -0,0 +1,33 @@
+# @package _global_
+
+# to execute this experiment run:
+# python glm_experiments/train.py experiment=clm_transformer_base_new_dataset_vertebrates
+
+defaults:
+  - override /data: gpn_animal_promoter
+  - override /model: clm_transformer_base
+  - override /trainer: gpn_animal_promoter
+
+logger:
+  wandb:
+    name: experiment-clm-transformer-base-new-dataset-vertebrates
+    tags: ["experiment", "clm", "transformer", "base", "new-dataset", "vertebrates"]
+
+data:
+  _target_: glm_experiments.data.lm_datamodule.CLMDataModule
+  dataset_name: data/gonzalobenegas/genomes-v2-genome_set-vertebrates-intervals-v1_512_256  # NOTE(review): this patch's README change adds a download command only for the animals dataset; confirm this path is populated
+  per_device_batch_size: 256
+  data_augmentation: false
+
+model:
+  scheduler:
+    _target_: transformers.get_cosine_with_min_lr_schedule_with_warmup
+    _partial_: true
+    num_warmup_steps: 2000
+    num_training_steps: ${trainer.max_steps}  # interpolated from trainer.max_steps below
+    min_lr_rate: 0.1  # Decay to 10% of max lr
+
+trainer:
+  max_steps: 20000
+  log_every_n_steps: 1000
+  val_check_interval: 1000