genecad/pipelines/prediction at main · plantcad/genecad · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# ---- Caller-supplied inputs ------------------------------------------------
# These default to empty; the validation section below rejects empty values.
INPUT_FILE ?=
OUTPUT_DIR ?=
SPECIES_ID ?=
CHR_ID ?=

# ---- Model selection -------------------------------------------------------
# BASE_MODEL_PATH is passed as --tokenizer-path (extract step) and as
# --model-path (prediction step).
BASE_MODEL_PATH ?= kuleshov-group/PlantCAD2-Small-l24-d0768
# HEAD_MODEL_PATH is passed as --model-checkpoint (prediction step).
HEAD_MODEL_PATH ?= plantcad/GeneCAD-l8-d768-PC2-Small

# ---- Prediction step knobs -------------------------------------------------
# LAUNCHER is the command that runs scripts/predict.py create_predictions.
LAUNCHER ?= python
# Forwarded as --batch-size and --dtype respectively.
PRED_BATCH_SIZE ?= 32
PRED_DTYPE ?= bfloat16

# ---- Post-processing knobs -------------------------------------------------
# Forwarded as --intergenic-bias to detect_intervals.
INTERGENIC_BIAS ?= 0.0
# Forwarded as --require-utrs to filter_to_valid_genes.
REQUIRE_UTRS ?= yes

# Validate required variables.
#
# The inputs are declared with `?=` and an empty default, and `ifndef` treats
# an empty value as undefined, so each check fires when no value was supplied.
#
# Validation is scoped by the requested goals so that:
#   * `make help` works without any variables set, and
#   * `make clean` only needs OUTPUT_DIR (the only variable its recipe uses).
# When no goal is given on the command line, the default goal `all` is assumed.
requested_goals := $(or $(MAKECMDGOALS),all)
# Goals that touch OUTPUT_DIR: everything except `help`.
goals_needing_output := $(filter-out help,$(requested_goals))
# Goals that run pipeline steps: everything except `help` and `clean`.
goals_needing_inputs := $(filter-out help clean,$(requested_goals))

ifdef goals_needing_inputs
ifndef INPUT_FILE
  $(error INPUT_FILE is required. Set it via: INPUT_FILE=/path/to/input.fasta)
endif
endif
ifdef goals_needing_output
ifndef OUTPUT_DIR
  $(error OUTPUT_DIR is required. Set it via: OUTPUT_DIR=/path/to/output)
endif
endif
ifdef goals_needing_inputs
ifndef SPECIES_ID
  $(error SPECIES_ID is required. Set it via: SPECIES_ID=species_id)
endif
ifndef CHR_ID
  $(error CHR_ID is required. Set it via: CHR_ID=chromosome_id)
endif
endif

# Pipeline directory structure: intermediate artefacts are written under
# $(OUTPUT_DIR)/pipeline; the final GFF is written directly to $(OUTPUT_DIR).
PIPELINE_DIR = $(OUTPUT_DIR)/pipeline

# If a recipe exits with an error, have Make delete the target file it was
# building, so a half-written GFF is not mistaken for an up-to-date result on
# the next run.
.DELETE_ON_ERROR:

# Goals that are commands or aliases rather than files on disk.
.PHONY: all clean help sequences predictions annotations

# Default goal: run the whole pipeline through to the final GFF.
all: $(OUTPUT_DIR)/predictions.gff

# Short goal names for running the pipeline up to a given stage.
# sequences: requires CPU only.
sequences: $(PIPELINE_DIR)/sequences.zarr
# predictions: requires CPU + GPU.
predictions: $(PIPELINE_DIR)/predictions.zarr
# annotations: requires CPU only.
annotations: $(OUTPUT_DIR)/predictions.gff

# Help target: print the available goals and the variables that control them.
# Variables that have a default in this Makefile are listed as optional;
# only the four that are validated as non-empty are listed as required.
help:
	@echo "Available targets:"
	@echo "  all - Run complete prediction pipeline"
	@echo "  sequences - Extract sequences only"
	@echo "  predictions - Generate predictions only"
	@echo "  annotations - Run complete pipeline (alias for all)"
	@echo "  clean - Remove all generated files"
	@echo ""
	@echo "Required variables:"
	@echo "  INPUT_FILE - Path to input FASTA file"
	@echo "  SPECIES_ID - Species identifier"
	@echo "  CHR_ID - Chromosome identifier"
	@echo "  OUTPUT_DIR - Output directory for results"
	@echo ""
	@echo "Optional variables:"
	@echo "  HEAD_MODEL_PATH - Path to GeneCAD model checkpoint (default: plantcad/GeneCAD-l8-d768-PC2-Small)"
	@echo "  BASE_MODEL_PATH - Path to PlantCAD model checkpoint (default: kuleshov-group/PlantCAD2-Small-l24-d0768)"
	@echo "  REQUIRE_UTRS - Require UTRs in final output (default: yes)"
	@echo "  INTERGENIC_BIAS - Intergenic bias used for interval detection (default: 0.0)"
	@echo "  LAUNCHER - Command launcher for GPU jobs (default: python)"
	@echo "  PRED_BATCH_SIZE - Batch size for prediction (default: 32)"
	@echo "  PRED_DTYPE - Dtype used for prediction (default: bfloat16)"

# Stage 1: build sequences.zarr from the input FASTA with extract.py.
# $< is the FASTA file, $@ the Zarr output, $(@D) the pipeline directory,
# which is created on demand before the script runs.
$(PIPELINE_DIR)/sequences.zarr: $(INPUT_FILE)
	@mkdir -p $(@D)
	python scripts/extract.py extract_fasta_file \
	  --species-id $(SPECIES_ID) \
	  --fasta-file $< \
	  --chrom-map "$(CHR_ID):$(CHR_ID)" \
	  --tokenizer-path $(BASE_MODEL_PATH) \
	  --output $@

# Stage 2: run the pre-trained model over the extracted sequences.
# The command is started through $(LAUNCHER) rather than a fixed interpreter;
# $< is sequences.zarr and $@ is the predictions.zarr output location.
$(PIPELINE_DIR)/predictions.zarr: $(PIPELINE_DIR)/sequences.zarr
	$(LAUNCHER) scripts/predict.py create_predictions \
	  --input $< \
	  --output-dir $@ \
	  --model-path $(BASE_MODEL_PATH) \
	  --model-checkpoint $(HEAD_MODEL_PATH) \
	  --species-id $(SPECIES_ID) \
	  --chromosome-id $(CHR_ID) \
	  --batch-size $(PRED_BATCH_SIZE) \
	  --dtype $(PRED_DTYPE)

# Stage 3: turn token-level predictions into feature intervals.
# Both the "direct" and "viterbi" decoding methods are requested here;
# $< is predictions.zarr and $@ is intervals.zarr.
$(PIPELINE_DIR)/intervals.zarr: $(PIPELINE_DIR)/predictions.zarr
	python scripts/predict.py detect_intervals \
	  --input-dir $< \
	  --output $@ \
	  --decoding-methods "direct,viterbi" \
	  --remove-incomplete-features yes \
	  --intergenic-bias $(INTERGENIC_BIAS)

# Stage 4: export the viterbi-decoded intervals ($<) as an unfiltered GFF ($@).
$(PIPELINE_DIR)/predictions__raw.gff: $(PIPELINE_DIR)/intervals.zarr
	python scripts/predict.py export_gff \
	  --input $< \
	  --output $@ \
	  --decoding-method viterbi \
	  --min-transcript-length 3 \
	  --strip-introns yes

# Stage 5: drop implausibly small (1 bp) UTR/CDS features by enforcing a
# minimum feature length of 2, and reconstruct gene/mRNA boundaries.
# $< is the raw GFF, $@ the filtered GFF.
$(PIPELINE_DIR)/predictions__raw__feat_len_2.gff: $(PIPELINE_DIR)/predictions__raw.gff
	python scripts/gff.py filter_to_min_feature_length \
	  --input $< \
	  --output $@ \
	  --feature-types "five_prime_UTR,three_prime_UTR,CDS" \
	  --min-length 2

# Stage 6: discard genes/transcripts shorter than 30 bp.
# $< is the feature-length-filtered GFF, $@ the gene-length-filtered GFF.
$(PIPELINE_DIR)/predictions__raw__feat_len_2__gene_len_30.gff: $(PIPELINE_DIR)/predictions__raw__feat_len_2.gff
	python scripts/gff.py filter_to_min_gene_length \
	  --input $< \
	  --output $@ \
	  --min-length 30

# Stage 7: keep only genes with at least one CDS exon and, when REQUIRE_UTRS
# asks for it, both a 5' and a 3' UTR.
# $< is the gene-length-filtered GFF, $@ the validated GFF.
$(PIPELINE_DIR)/predictions__raw__feat_len_2__gene_len_30__has_req_feats.gff: $(PIPELINE_DIR)/predictions__raw__feat_len_2__gene_len_30.gff
	python scripts/gff.py filter_to_valid_genes \
	  --input $< \
	  --output $@ \
	  --require-utrs $(REQUIRE_UTRS)

# Final stage: publish the fully filtered GFF ($<) as the pipeline's
# user-facing result ($@) in the output directory.
$(OUTPUT_DIR)/predictions.gff: $(PIPELINE_DIR)/predictions__raw__feat_len_2__gene_len_30__has_req_feats.gff
	cp $< $@

# Clean target: remove only what this Makefile generates, i.e. the pipeline
# directory of intermediate artefacts and the final predictions.gff.
# OUTPUT_DIR is chosen by the caller and may hold unrelated files, so it is
# not deleted wholesale; it is removed afterwards only if it ended up empty.
clean:
	$(if $(strip $(OUTPUT_DIR)),,$(error OUTPUT_DIR is empty; refusing to run clean))
	rm -rf $(PIPELINE_DIR)
	rm -f $(OUTPUT_DIR)/predictions.gff
	@# rmdir fails on a non-empty or missing directory; both are fine here.
	@rmdir $(OUTPUT_DIR) 2>/dev/null || true