Skip to content

Commit 331add1

Browse files
Add plants training dataset and eval (#10)
* Refactor evals
* Refactor label column
* Update batch size
* Refactor evals metrics
* Add plants config
* Update configs
* Fix eval dataset ordering
1 parent 754d029 commit 331add1

19 files changed

+1094
-332
lines changed

configs/data/default.yaml

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,18 @@ max_val_lm_samples: null # Maximum number of samples for LM validation (null = u
2323
seed: 42
2424

2525
# Evaluation datasets (optional)
26-
# Set evals: null to disable all evals, or configure specific evals below
27-
evals:
28-
traitgym_mendelian_promoter:
29-
dataset_name: songlab/TraitGym
30-
dataset_config: mendelian_traits
31-
genome_url: https://ftp.ensembl.org/pub/release-115/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.toplevel.fa.gz
32-
genome_path: data/Homo_sapiens.GRCh38.dna_sm.toplevel.fa.gz
33-
window_size: 512
34-
batch_size: 128
26+
# Set evals: null to disable all evals, or configure specific evals in dataset-specific configs
27+
# Example structure:
28+
# evals:
29+
# - name: eval_name
30+
# dataset_name: songlab/TraitGym
31+
# dataset_config: mendelian_traits
32+
# split: test # Dataset split to load (default: "test")
33+
# genome_url: https://ftp.ensembl.org/...
34+
# filter_name: traitgym_promoter # Filter from EVAL_FILTERS registry (default: "none")
35+
# window_size: 512
36+
# batch_size: 128
37+
# label_column: label # Column to preserve as labels (default: "label")
38+
# transform: minus # Transform to apply to raw LLR: minus, identity, abs (default: identity)
39+
# metrics: [auprc] # Metrics to compute: auprc, auroc, spearman, pearson (default: [auprc])
40+
evals: null

configs/data/gpn_animal_promoter.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,17 @@ defaults:
44
batch_size: 2048 # Total effective batch size
55
per_device_batch_size: 128 # Batch size per device (adjust based on GPU memory)
66
num_workers: 8
7+
8+
# Evaluation datasets
9+
evals:
10+
- name: traitgym_mendelian_promoter
11+
dataset_name: songlab/TraitGym
12+
dataset_config: mendelian_traits
13+
split: test # Dataset split to load (default: "test")
14+
genome_url: https://ftp.ensembl.org/pub/release-115/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.toplevel.fa.gz
15+
filter_name: traitgym_promoter
16+
window_size: 512
17+
batch_size: 128
18+
label_column: label # Column to preserve as labels (default: "label")
19+
transform: minus # Transform to apply to raw LLR: minus, identity, abs (default: identity)
20+
metrics: [auprc] # Metrics to compute: auprc, auroc, spearman, pearson (default: [auprc])

configs/data/plants.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
defaults:
2+
- default
3+
4+
# Training dataset: Angiosperm 16 genomes
5+
dataset_name: kuleshov-group/Angiosperm_16_genomes
6+
7+
# Batch size configuration
8+
batch_size: 2048 # Total effective batch size
9+
per_device_batch_size: 128 # Batch size per device (adjust based on GPU memory)
10+
num_workers: 8
11+
12+
# Evaluation datasets
13+
evals:
14+
- name: maize_af
15+
dataset_name: plantcad/maize-allele-frequency
16+
dataset_config: null
17+
split: validation
18+
genome_url: https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-62/fasta/zea_mays/dna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna_sm.toplevel.fa.gz
19+
filter_name: none
20+
window_size: 512
21+
batch_size: 128
22+
label_column: AF # Allele frequency column
23+
transform: identity # No transform for regression (default: identity)
24+
metrics: [pearson, spearman] # Correlation metrics for regression task

configs/experiment/clm_transformer_small.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# Short training run with small Transformer encoder for quick testing
44

55
defaults:
6+
- override /data: plants
67
- override /model: clm_transformer_small
78

89
logger:

configs/model/bert_bytenet_small.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
_target_: glm_experiments.models.lm_lit_module.MLMLitModule
22

33
soft_masked_weight: ${data.soft_masked_weight}
4+
evals: ${data.evals}
45

56
net:
67
_target_: glm_experiments.models.components.lm.MLM

configs/model/clm_transformer_base.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
_target_: glm_experiments.models.lm_lit_module.CLMLitModule
22

33
soft_masked_weight: ${data.soft_masked_weight}
4+
evals: ${data.evals}
45

56
net:
67
_target_: glm_experiments.models.components.lm.CLM

configs/model/clm_transformer_small.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
_target_: glm_experiments.models.lm_lit_module.CLMLitModule
22

33
soft_masked_weight: ${data.soft_masked_weight}
4+
evals: ${data.evals}
45

56
net:
67
_target_: glm_experiments.models.components.lm.CLM

configs/model/gpn_animal_promoter.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
_target_: glm_experiments.models.lm_lit_module.MLMLitModule
22

33
soft_masked_weight: ${data.soft_masked_weight}
4+
evals: ${data.evals}
45

56
net:
67
_target_: glm_experiments.models.components.lm.MLM

configs/model/mlm_transformer_base.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
_target_: glm_experiments.models.lm_lit_module.MLMLitModule
22

33
soft_masked_weight: ${data.soft_masked_weight}
4+
evals: ${data.evals}
45

56
net:
67
_target_: glm_experiments.models.components.lm.MLM

configs/model/mlm_transformer_small.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
_target_: glm_experiments.models.lm_lit_module.MLMLitModule
22

33
soft_masked_weight: ${data.soft_masked_weight}
4+
evals: ${data.evals}
45

56
net:
67
_target_: glm_experiments.models.components.lm.MLM

0 commit comments

Comments
 (0)