uw-ipd
diff --git a/‎README.md‎
Lines changed: 18 additions & 7 deletions b/‎README.md‎
Lines changed: 18 additions & 7 deletions
diff --git a/‎RF2na-linux.yml‎
Lines changed: 12 additions & 8 deletions b/‎RF2na-linux.yml‎
Lines changed: 12 additions & 8 deletions
diff --git a/‎SE3Transformer/se3_transformer/model/layers/attention.py‎
Lines changed: 23 additions & 6 deletions b/‎SE3Transformer/se3_transformer/model/layers/attention.py‎
Lines changed: 23 additions & 6 deletions
diff --git a/‎SE3Transformer/se3_transformer/model/layers/convolution.py‎
Lines changed: 10 additions & 0 deletions b/‎SE3Transformer/se3_transformer/model/layers/convolution.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎SE3Transformer/se3_transformer/model/layers/norm.py‎
Lines changed: 2 additions & 0 deletions b/‎SE3Transformer/se3_transformer/model/layers/norm.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎SE3Transformer/se3_transformer/model/transformer.py‎
Lines changed: 11 additions & 2 deletions b/‎SE3Transformer/se3_transformer/model/transformer.py‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎example/dna_binding_protein.fa‎
Lines changed: 2 additions & 0 deletions b/‎example/dna_binding_protein.fa‎
Lines changed: 2 additions & 0 deletions
diff --git a/example/protein.fa b/example/rna_binding_protein.fa
rename from example/protein.fa
rename to example/rna_binding_protein.fa
@@ -1,6 +1,11 @@
 # RF2NA
 GitHub repo for RoseTTAFold2 with nucleic acids
 
+**New: April 13, 2023 v0.2**
+* Updated weights (https://files.ipd.uw.edu/dimaio/RF2NA_apr23.tgz) for better prediction of homodimer:DNA interactions and better DNA-specific sequence recognition
+* Bugfixes in MSA generation pipeline
+* Support for paired protein/RNA MSAs
+
 ## Installation
 
 1. Clone the package
@@ -25,9 +30,9 @@ python setup.py install
 3. Download pre-trained weights under network directory
 ```
 cd network
-wget https://files.ipd.uw.edu/dimaio/RF2NA_sep22.tgz
-tar xvfz RF2NA_sep22.tgz
-ls weights/ # it should contain a 800mb weights file
+wget https://files.ipd.uw.edu/dimaio/RF2NA_apr23.tgz
+tar xvfz RF2NA_apr23.tgz
+ls weights/ # it should contain a 1.1GB weights file
 cd ..
 ```
 
@@ -62,7 +67,7 @@ wget ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/rfam/rfam_anno
 wget ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/id_mapping/id_mapping.tsv.gz
 wget ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/sequences/rnacentral_species_specific_ids.fasta.gz
 ../input_prep/reprocess_rnac.pl id_mapping.tsv.gz rfam_annotations.tsv.gz   # ~8 minutes
-gunzip -c rnacentral_species_specific_ids.fasta.gz | makeblastdb -in - -dbtype nucl  -out rnacentral.fasta -title "RNACentral"
+gunzip -c rnacentral_species_specific_ids.fasta.gz | makeblastdb -in - -dbtype nucl  -parse_seqids -out rnacentral.fasta -title "RNACentral"
 
 # nt [151G]
 update_blastdb.pl --decompress nt
@@ -73,9 +78,15 @@ cd ..
 ```
 conda activate RF2NA
 cd example
-../run_RF2NA.sh t000_ protein.fa R:RNA.fa
+# run Protein/RNA prediction
+../run_RF2NA.sh rna_pred rna_binding_protein.fa R:RNA.fa
+# run Protein/dsDNA prediction
+../run_RF2NA.sh dna_pred dna_binding_protein.fa D:DNA.fa
+
 ```
-The first argument to the script is the output folder; remaining arguments are fasta files for individual chains in the structure.  Use the tags `P:xxx.fa` `R:xxx.fa` `D:xxx.fa` to specify protein, RNA, DNA respectively (default is protein).  Each chain is a separate file (e.g., for double-stranded DNA, both strands need to be provided as separate fasta files).  Outputs are written to the folder `t000_`.
+The first argument to the script is the output folder; the remaining arguments are fasta files for the individual chains in the structure.  Use the tags `P:xxx.fa`, `R:xxx.fa`, `D:xxx.fa`, `S:xxx.fa`, and `PR:xxx.fa` to specify protein, RNA, dsDNA, ssDNA, and paired protein/RNA, respectively (default is protein).
+
+Each chain is a separate file; the `D` tag automatically generates the complementary DNA strand for the input strand.  Outputs of the two example commands are written to the folders `rna_pred` and `dna_pred`, respectively.
 
 ## Expected outputs
-You will get a prediction with estimated per-residue LDDT in the B-factor column (model_00.pdb)
+You will get a prediction with estimated per-residue LDDT in the B-factor column (`models/model_00.pdb`)
@@ -1,20 +1,24 @@
 name: RF2NA
 channels:
+  - pytorch
+  - nvidia
   - defaults
+  - conda-forge
 dependencies:
-  - python=3.8
-  - pytorch::pytorch
+  - python=3.10
+  - pip
+  - pytorch
   - requests
-  - conda-forge::psutil 
-  - conda-forge::cudatoolkit=11.3
-  - conda-forge::tqdm
-  - dglteam::dgl-cuda11.3
+  - pytorch-cuda=11.7
+  - dglteam/label/cu117::dgl
+  - pyg::pyg
   - bioconda::mafft
   - bioconda::hhsuite
   - bioconda::blast
   - bioconda::hmmer>=3.3
   - bioconda::infernal
   - bioconda::cd-hit
   - bioconda::csblast
-  - biocore::psipred=4.01
-  - biocore::blast-legacy=2.2.26
+  - pip:
+    - psutil
+    - tqdm
@@ -78,11 +78,10 @@ def forward(
 
             with nvtx_range('attention dot product + softmax'):
                 # Compute attention weights (softmax of inner product between key and query)
-                with torch.cuda.amp.autocast(False):
-                    edge_weights = dgl.ops.e_dot_v(graph, key, query).squeeze(-1)
-                    edge_weights /= np.sqrt(self.key_fiber.num_features)
-                    edge_weights = edge_softmax(graph, edge_weights)
-                    edge_weights = edge_weights[..., None, None]
+                edge_weights = dgl.ops.e_dot_v(graph, key, query).squeeze(-1)
+                edge_weights /= np.sqrt(self.key_fiber.num_features)
+                edge_weights = edge_softmax(graph, edge_weights)
+                edge_weights = edge_weights[..., None, None]
 
             with nvtx_range('weighted sum'):
                 if isinstance(value, Tensor):
@@ -158,6 +157,11 @@ def forward(
             basis: Dict[str, Tensor]
     ):
         with nvtx_range('AttentionBlockSE3'):
+            #print ('AttentionBlockSE3 node_features',[torch.sum(torch.isnan(v)) for v in node_features.values()])
+            #print ('AttentionBlockSE3 edge_features',[torch.sum(torch.isnan(v)) for v in edge_features.values()])
+            #print ('AttentionBlockSE3 node_features',[torch.max(torch.abs(v)) for v in node_features.values()])
+            #print ('AttentionBlockSE3 edge_features',[torch.max(torch.abs(v)) for v in edge_features.values()])
+
             with nvtx_range('keys / values'):
                 fused_key_value = self.to_key_value(node_features, edge_features, graph, basis)
                 key, value = self._get_key_value_from_fused(fused_key_value)
@@ -166,9 +170,22 @@ def forward(
                 with torch.cuda.amp.autocast(False):
                     query = self.to_query(node_features)
 
+            #if (type(value) is dict):
+            #    print ('AttentionBlockSE3 value',[torch.sum(torch.isnan(v)) for v in value.values()])
+            #else:
+            #    print ('AttentionBlockSE3 value',[torch.sum(torch.isnan(value))])
+            #if (type(key) is dict):
+            #    print ('AttentionBlockSE3 key',[torch.sum(torch.isnan(k)) for k in key.values()])
+            #else:
+            #    print ('AttentionBlockSE3 key',[torch.sum(torch.isnan(key))])
+            #print ('AttentionBlockSE3 query',[torch.sum(torch.isnan(q)) for q in query.values()])
             z = self.attention(value, key, query, graph)
+            #print ('AttentionBlockSE3 b',[torch.sum(torch.isnan(zi)) for zi in z.values()])
             z_concat = aggregate_residual(node_features, z, 'cat')
-            return self.project(z_concat)
+            #print ('AttentionBlockSE3 c',[torch.sum(torch.isnan(zi)) for zi in z_concat.values()] )
+            output = self.project(z_concat)
+            #print ('AttentionBlockSE3 d',[torch.sum(torch.isnan(o)) for o in output.values()] )
+            return output
 
     def _get_key_value_from_fused(self, fused_key_value):
         # Extract keys and queries features from fused features
 
@@ -320,6 +320,9 @@ def forward(
             out = {}
             in_features = []
 
+            #print ('ConvSE3 node_feats',[torch.sum(torch.isnan(v)) for v in node_feats.values()])
+            #print ('ConvSE3 edge_feats',[torch.sum(torch.isnan(v)) for v in edge_feats.values()])
+
             # Fetch all input features from edge and node features
             for degree_in in self.fiber_in.degrees:
                 src_node_features = node_feats[str(degree_in)][src]
@@ -358,6 +361,11 @@ def forward(
                                                                         basis.get(dict_key, None))
                     out[str(degree_out)] = out_feature
 
+            #if (type(out) is dict):
+            #    print ('ConvSE3 out',[torch.sum(torch.isnan(v)) for v in out.values()])
+            #else:
+            #    print ('ConvSE3 out',[torch.sum(torch.isnan(out))])
+
             for degree_out in self.fiber_out.degrees:
                 if self.self_interaction and str(degree_out) in self.to_kernel_self:
                     with nvtx_range(f'self interaction'):
@@ -369,7 +377,9 @@ def forward(
                     if self.sum_over_edge:
                         with nvtx_range(f'pooling'):
                             if isinstance(out, dict):
+                                #print ('ConvSE3 pre-pool',degree_out,torch.sum(torch.isnan(out[str(degree_out)])), out[str(degree_out)].dtype )
                                 out[str(degree_out)] = dgl.ops.copy_e_sum(graph, out[str(degree_out)])
+                                #print ('ConvSE3 post-pool',degree_out,torch.sum(torch.isnan(out[str(degree_out)])), out[str(degree_out)].dtype )
                             else:
                                 out = dgl.ops.copy_e_sum(graph, out)
                     else:
 
@@ -61,6 +61,7 @@ def __init__(self, fiber: Fiber, nonlinearity: nn.Module = nn.ReLU()):
     def forward(self, features: Dict[str, Tensor], *args, **kwargs) -> Dict[str, Tensor]:
         with nvtx_range('NormSE3'):
             output = {}
+            #print ('NormSE3 features',[torch.sum(torch.isnan(v)) for v in features.values()])
             if hasattr(self, 'group_norm'):
                 # Compute per-degree norms of features
                 norms = [features[str(d)].norm(dim=-1, keepdim=True).clamp(min=self.NORM_CLAMP)
@@ -79,5 +80,6 @@ def forward(self, features: Dict[str, Tensor], *args, **kwargs) -> Dict[str, Ten
                     norm = feat.norm(dim=-1, keepdim=True).clamp(min=self.NORM_CLAMP)
                     new_norm = self.nonlinearity(self.layer_norms[degree](norm.squeeze(-1)).unsqueeze(-1))
                     output[degree] = new_norm * feat / norm
+            #print ('NormSE3 output',[torch.sum(torch.isnan(v)) for v in output.values()])
 
             return output
@@ -76,7 +76,7 @@ def __init__(self,
                  use_layer_norm: bool = True,
                  tensor_cores: bool = False,
                  low_memory: bool = False,
-                 populate_edge: bool = True,
+                 populate_edge: Optional[Literal['lin', 'arcsin', 'log', 'zero']] = 'lin',
                  sum_over_edge: bool = True,
                  **kwargs):
         """
@@ -168,8 +168,17 @@ def forward(self, graph: DGLGraph, node_feats: Dict[str, Tensor],
         basis = update_basis_with_fused(basis, self.max_degree, use_pad_trick=self.tensor_cores and not self.low_memory,
                                         fully_fused=self.tensor_cores and not self.low_memory)
 
-        if self.populate_edge:
+        if self.populate_edge=='lin':
             edge_feats = get_populated_edge_features(graph.edata['rel_pos'], edge_feats)
+        elif self.populate_edge=='arcsin':
+            r = graph.edata['rel_pos'].norm(dim=-1, keepdim=True)
+            r = torch.maximum(r, torch.zeros_like(r) + 4.0) - 4.0
+            r = torch.arcsinh(r)/3.0
+            edge_feats['0'] = torch.cat([edge_feats['0'], r[..., None]], dim=1)
+        elif self.populate_edge=='log':
+            # fd - replace with log(1+x)
+            r = torch.log( 1 + graph.edata['rel_pos'].norm(dim=-1, keepdim=True) )
+            edge_feats['0'] = torch.cat([edge_feats['0'], r[..., None]], dim=1)
         else:
             edge_feats['0'] = torch.cat((edge_feats['0'], torch.zeros_like(edge_feats['0'][:,:1,:])), dim=1)
 
 
@@ -0,0 +1,2 @@
+> ANTENNAPEDIA HOMEODOMAIN|Drosophila melanogaster (7227)
+MERKRGRQTYTRYQTLELEKEFHFNRYLTRRRRIEIAHALSLTERQIKIWFQNRRMKWKKEN
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+> ANTENNAPEDIA HOMEODOMAIN\|Drosophila melanogaster (7227)`
	`2`	`+MERKRGRQTYTRYQTLELEKEFHFNRYLTRRRRIEIAHALSLTERQIKIWFQNRRMKWKKEN`