Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ and `Semantic versioning 2.0.0 <http://semver.org/>`\_, with the exceptions that
- versions above `1.0.0` will be numbered as `major.minor.patch`, as is
typical

# Unreleased

- Add new query type `RNADiffusionMaskedQuery`

# 0.6.0 (2024-12-17)

- More constraints on Boltz
Expand Down
11 changes: 5 additions & 6 deletions ginkgo_ai_client/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,21 +494,20 @@ class RNADiffusionMaskedQuery(QueryBase):
def to_request_params(self) -> Dict:

data = {
"three_utr": self.three_utr.replace(
"<mask>", "[MASK]"
), # UTR tokenizers require [MASK] but api client accepts <mask> for consistence across models
"five_utr": self.five_utr.replace("<mask>", "[MASK]"),
# Many people in the field use [MASK] but our API client uses <mask> for all models
"three_utr": self.three_utr.replace("[MASK]", "<mask>"),
"five_utr": self.five_utr.replace("[MASK]", "<mask>"),
"sequence_aa": self.protein_sequence,
"species": self.species,
"temperature": self.temperature,
"decoding_order_strategy": self.decoding_order_strategy,
"num_to_decode_per_step": self.unmaskings_per_step,
"unmaskings_per_step": self.unmaskings_per_step,
"num_samples": self.num_samples,
}
return {
"model": self.model,
"text": json.dumps(data),
"transforms": [{"type": "GENERATE"}],
"transforms": [{"type": "MRNA_DIFFUSION_GENERATE"}],
}

def parse_response(self, results: Dict) -> MultimodalDiffusionMaskedResponse:
Expand Down
20 changes: 11 additions & 9 deletions test/test_mrna_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,19 @@
GinkgoAIClient,
)


def test_get_mrna_species():
    """Check that the species table exposed by the query class has 324 entries."""
    species_table = RNADiffusionMaskedQuery.get_species_dataframe()
    expected_count = 324
    assert len(species_table) == expected_count


def test_mrna_diffusion():
client = GinkgoAIClient()
three_utr="AAA<mask>TTTGGGCC<mask><mask>"
five_utr="AAA<mask>TTTGGGCC<mask><mask>"
protein_sequence="MAKS-" # '-' for end of sequence
species="HOMO_SAPIENS"
three_utr = "AAA<mask>TTTGGGCC<mask><mask>"
five_utr = "AAA<mask>TTTGGGCC<mask><mask>"
protein_sequence = "MAKS-" # '-' for end of sequence
species = "HOMO_SAPIENS"
num_samples = 3
query = RNADiffusionMaskedQuery(
three_utr=three_utr,
Expand All @@ -25,7 +27,7 @@ def test_mrna_diffusion():
temperature=1.0,
decoding_order_strategy="entropy",
unmaskings_per_step=10,
num_samples=num_samples
num_samples=num_samples,
)

response = client.send_request(query)
Expand All @@ -39,10 +41,10 @@ def test_mrna_diffusion():

# check codon sequence length: 3 nucleotides per residue; the trailing '-' in protein_sequence accounts for the stop codon
assert len(sample["codon_sequence"]) == len(protein_sequence) * 3
assert sample["codon_sequence"].startswith("ATG") # Start codon
assert sample["codon_sequence"][-3:] in ["TAA","TAG","TGA"] # stop codon
assert sample["codon_sequence"].startswith("ATG") # Start codon
assert sample["codon_sequence"][-3:] in ["TAA", "TAG", "TGA"] # stop codon

# should translate
translated = str(Seq(sample["codon_sequence"]).translate())
print(translated, protein_sequence)
assert translated.replace("*","-") == protein_sequence
print(translated, protein_sequence)
assert translated.replace("*", "-") == protein_sequence