Fix for dasm2 issue 61 (remove division normalization) (#177)

willdumm · web-flow · commit 0b76ee0986f6 · 2025-12-03T10:08:12.000-08:00
diff --git a/netam/molevol.py b/netam/molevol.py
@@ -13,6 +13,7 @@
 
 from netam.codon_table import (
     CODON_AA_INDICATOR_MATRIX,
+    STOP_CODON_INDICATOR,
     STOP_CODON_ZAPPER,
     aa_idxs_of_codon_idxs,
 )
@@ -155,10 +156,16 @@ def aaprobs_of_codon_probs(codon_probs: Tensor) -> Tensor:
     """Compute the probability of each amino acid from the probability of each codon,
     for each parent codon along the sequence.
 
+    Note: The output amino acid probabilities will sum to (1 - P(stop codons)).
+    If inputs are valid probability distributions with small stop codon probability,
+    outputs will be approximately normalized.
+
     Args:
     codon_probs (torch.Tensor): A 4D tensor representing the probability of mutating
                                 to each codon for each parent codon along the sequence.
                                 Shape should be (codon_count, 4, 4, 4).
+                                Must be a valid probability distribution (non-negative,
+                                sums to 1 for each site).
 
     Returns:
     torch.Tensor: A 2D tensor with shape (codon_count, 20) where the ij-th entry is the probability
@@ -170,13 +177,9 @@ def aaprobs_of_codon_probs(codon_probs: Tensor) -> Tensor:
     # the `codon_count` dimension intact. This prepares the tensor for matrix multiplication.
     reshaped_probs = codon_probs.reshape(codon_count, -1)
 
-    # Perform matrix multiplication to get unnormalized amino acid probabilities.
+    # Perform matrix multiplication to get amino acid probabilities.
     aaprobs = torch.matmul(reshaped_probs, CODON_AA_INDICATOR_MATRIX)
 
-    # Normalize probabilities along the amino acid dimension.
-    row_sums = aaprobs.sum(dim=1, keepdim=True)
-    aaprobs /= row_sums
-
     return aaprobs
 
 
@@ -190,7 +193,8 @@ def aaprob_of_mut_and_sub(
     This function actually isn't used anymore, but there is a good test for it, which
     tests other functions, so we keep it.
 
-    Stop codons don't appear as part of this calculation.
+    Stop codons don't appear as part of this calculation. Stop codon probabilities are
+    zeroed and the remaining probabilities are renormalized before computing AA probs.
 
     Args:
     parent_codon_idxs (torch.Tensor): A 2D tensor where each row contains indices representing
@@ -210,6 +214,14 @@ def aaprob_of_mut_and_sub(
         parent_codon_idxs, codon_mut_probs, codon_csps
     )
     codon_probs = codon_probs_of_mutation_matrices(mut_matrices)
+    # Zero out stop codon probabilities and renormalize
+    flat_codon_probs = flatten_codons(codon_probs)
+    flat_codon_probs = zero_stop_codon_probs(flat_codon_probs)
+    # Need to convert these parent codon indices to the 64-based kind
+    flat_codon_probs = set_parent_codon_prob(
+        flat_codon_probs, sequences.flatten_codon_idxs(parent_codon_idxs)
+    )
+    codon_probs = unflatten_codons(flat_codon_probs)
     return aaprobs_of_codon_probs(codon_probs)
 
 
@@ -279,6 +291,9 @@ def aaprobs_of_parent_scaled_rates_and_csps(
     """Calculate per-site amino acid probabilities from per-site nucleotide rates and
     substitution probabilities.
 
+    Stop codon probabilities are zeroed and the parent codon probability is adjusted
+    to make the distribution sum to 1 before computing AA probs.
+
     Args:
         parent_idxs (torch.Tensor): Parent nucleotide indices. Shape should be (site_count,).
         scaled_nt_rates (torch.Tensor): Poisson rates of mutation per site, scaled by branch length.
@@ -290,11 +305,17 @@ def aaprobs_of_parent_scaled_rates_and_csps(
         torch.Tensor: A 2D tensor with rows corresponding to sites and columns
                       corresponding to amino acids.
     """
-    return aaprobs_of_codon_probs(
-        codon_probs_of_parent_scaled_nt_rates_and_csps(
-            parent_idxs, scaled_nt_rates, nt_csps
-        )
+    codon_probs = codon_probs_of_parent_scaled_nt_rates_and_csps(
+        parent_idxs, scaled_nt_rates, nt_csps
     )
+    # Zero out stop codon probabilities and set parent codon prob to make sum = 1
+    flat_codon_probs = flatten_codons(codon_probs)
+    flat_codon_probs = zero_stop_codon_probs(flat_codon_probs)
+    parent_codon_idxs = parent_idxs.reshape(-1, 3)
+    flat_parent_codon_idxs = sequences.flatten_codon_idxs(parent_codon_idxs)
+    flat_codon_probs = set_parent_codon_prob(flat_codon_probs, flat_parent_codon_idxs)
+    codon_probs = unflatten_codons(flat_codon_probs)
+    return aaprobs_of_codon_probs(codon_probs)
 
 
 def build_codon_mutsel(
diff --git a/tests/test_molevol.py b/tests/test_molevol.py
@@ -151,16 +151,20 @@ def iterative_aaprob_of_mut_and_sub(parent_codon, mut_probs, csps):
 
         aa_probs[aa] += child_prob
 
-    # need renormalization factor so that amino acid probabilities sum to 1,
-    # since probabilities to STOP codon are dropped
+    # Instead of renormalizing, add the "missing" probability (from stop codons)
+    # to the parent AA probability. This matches the behavior of set_parent_codon_prob.
     psum = sum(aa_probs.values())
+    parent_aa = translate_sequence(parent_codon)
+    aa_probs[parent_aa] += 1.0 - psum
 
-    return torch.tensor([aa_probs[aa] / psum for aa in AA_STR_SORTED])
+    return torch.tensor([aa_probs[aa] for aa in AA_STR_SORTED])
 
 
 def test_aaprob_of_mut_and_sub():
     crepe = pretrained.load("ThriftyHumV0.2-45")
-    [rates], [subs] = crepe([parent_nt_seq])
+    [rates], [subs_logits] = crepe([parent_nt_seq])
+    # Apply softmax to convert logits to valid CSPs (probability distributions)
+    subs = torch.softmax(subs_logits, dim=-1)
     mut_probs = 1.0 - torch.exp(-rates.squeeze().clone().detach())
     parent_codon = parent_nt_seq[0:3]
     parent_codon_idxs = nt_idx_tensor_of_str(parent_codon)