
Commit 9bc70bb

Merge pull request #216 from macrocosm-os/staging
Staging
2 parents 667b959 + 1c1cc1f commit 9bc70bb

6 files changed: +70 -39 lines

folding/__init__.py (+1 -1)

@@ -1,7 +1,7 @@
 from .protocol import JobSubmissionSynapse
 from .validators.protein import Protein
 
-__version__ = "1.0.0"
+__version__ = "1.0.1"
 version_split = __version__.split(".")
 __spec_version__ = (
     (10000 * int(version_split[0]))
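Note: the hunk is truncated after the major-version term. For orientation, a minimal sketch of the usual spec-version arithmetic follows; the minor/patch weights are an assumption, not visible in this diff.

# Sketch of the spec-version computation (assumed weights; the diff above
# only shows the 10000 * major term).
__version__ = "1.0.1"
version_split = __version__.split(".")
__spec_version__ = (
    (10000 * int(version_split[0]))  # major
    + (100 * int(version_split[1]))  # minor
    + (1 * int(version_split[2]))    # patch
)
assert __spec_version__ == 10001  # "1.0.1" -> 10001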

folding/store.py (+1 -5)

@@ -153,7 +153,7 @@ class Job:
     gro_hash: str = None
     update_interval: pd.Timedelta = pd.Timedelta(minutes=10)
     updated_count: int = 0
-    max_time_no_improvement: pd.Timedelta = pd.Timedelta(minutes=20)
+    max_time_no_improvement: pd.Timedelta = pd.Timedelta(minutes=25)
     min_updates: int = 10
     epsilon: float = 0.05  # percentage.
     event: dict = None
@@ -173,8 +173,6 @@ def update(
         self,
         loss: float,
         hotkey: str,
-        commit_hash: str,
-        gro_hash: str,
         hotkeys: List[str] = None,
     ):
         """Updates the status of a job in the database. If the loss improves, the best loss, hotkey and hashes are updated."""
@@ -198,8 +196,6 @@ def update(
             self.best_loss = loss
             self.best_loss_at = pd.Timestamp.now().floor("s")
             self.best_hotkey = hotkey
-            self.commit_hash = commit_hash
-            self.gro_hash = gro_hash
         elif (  # if loss has been improved but not recently enough, trigger early stopping
             pd.Timestamp.now().floor("s") - self.best_loss_at
             > self.max_time_no_improvement
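The bump from 20 to 25 minutes widens the no-improvement window tested in the elif above. A runnable sketch of that check with dummy values:

import pandas as pd

# Dummy stand-ins for the Job fields shown in the hunks above.
max_time_no_improvement = pd.Timedelta(minutes=25)
best_loss_at = pd.Timestamp.now().floor("s") - pd.Timedelta(minutes=30)

# Mirrors Job.update's early-stopping condition: the last improvement is
# older than the allowed window, so the job would be stopped early.
should_stop = pd.Timestamp.now().floor("s") - best_loss_at > max_time_no_improvement
print(should_stop)  # True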

folding/utils/ops.py (+8)

@@ -24,6 +24,14 @@ def __init__(self, message="Version error occurred"):
         super().__init__(self.message)
 
 
+class ValidationError(Exception):
+    """Exception raised for errors in validating miner submissions."""
+
+    def __init__(self, message="Validation error occurred"):
+        self.message = message
+        super().__init__(self.message)
+
+
 def delete_directory(directory: str):
     """We create a lot of files in the process of tracking pdb files.
     Therefore, we want to delete the directory after we are done with the tests.
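As the protein.py hunks below show, ValidationError is raised inside process_md_output and caught at the end of its try block to skip the offending miner. A self-contained sketch of that raise/catch pattern (process_log is a hypothetical stand-in for the real method):

class ValidationError(Exception):
    """Local stand-in for the class added above."""

MIN_LOGGING_ENTRIES = 500  # same threshold introduced in protein.py below

def process_log(hotkey_alias: str, n_entries: int) -> bool:
    try:
        if n_entries < MIN_LOGGING_ENTRIES:
            raise ValidationError(
                f"Miner {hotkey_alias} did not run enough steps in the simulation... Skipping!"
            )
        return True
    except ValidationError as E:
        print(f"{E}")  # the validator uses bt.logging.warning(f"{E}") here
        return False

print(process_log("abcd1234", 120))  # False: too few log entries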

folding/validators/forward.py (+9 -1)

@@ -49,12 +49,20 @@ def run_ping_step(self, uids: List[int], timeout: float) -> Dict:
 
 def run_step(
     self,
-    protein: Protein,
+    protein: Protein | None,
     uids: List[int],
     timeout: float,
     mdrun_args="",  # '-ntomp 64' # limit the number of threads to 64
 ) -> Dict:
     start_time = time.time()
+    if protein is None:
+        event = {
+            "block": self.block,
+            "step_length": time.time() - start_time,
+            "energies": [],
+            "active": False,
+        }
+        return event
 
     # Get the list of uids to query for this step.
     axons = [self.metagraph.axons[uid] for uid in uids]
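Together with the Protein.from_job change below, this gives callers a uniform contract: a None protein yields an inactive, empty-energies event rather than an exception. A self-contained toy version of the new guard (run_step_sketch is hypothetical; the real event also carries self.block, omitted here because it needs validator state):

import time
from typing import Dict, List, Optional

def run_step_sketch(protein: Optional[object], uids: List[int], timeout: float) -> Dict:
    """Toy version of the guard added to run_step above."""
    start_time = time.time()
    if protein is None:
        return {
            "step_length": time.time() - start_time,
            "energies": [],
            "active": False,
        }
    # ... the real method queries miners here ...
    return {"energies": [-1000.0], "active": True}

event = run_step_sketch(protein=None, uids=[1, 2], timeout=10.0)
assert event["active"] is False and event["energies"] == []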

folding/validators/protein.py (+43 -22)

@@ -1,34 +1,33 @@
+import os
 import time
 import glob
-import os
-import pickle
+import base64
 import random
-import re
 import shutil
 from collections import defaultdict
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, List, Literal
-import base64
 
-import plotly.express as px
 import bittensor as bt
-import openmm as mm
-import pandas as pd
 import numpy as np
+import pandas as pd
+import plotly.express as px
 from openmm import app, unit
 from pdbfixer import PDBFixer
+
+from folding.base.simulation import OpenMMSimulation
 from folding.store import Job
 from folding.utils.opemm_simulation_config import SimulationConfig
 from folding.utils.ops import (
+    OpenMMException,
+    ValidationError,
     check_and_download_pdbs,
     check_if_directory_exists,
     load_pdb_ids,
     select_random_pdb_id,
     write_pkl,
 )
-from folding.store import Job
-from folding.base.simulation import OpenMMSimulation
 
 ROOT_DIR = Path(__file__).resolve().parents[2]

@@ -118,8 +117,8 @@ def from_job(job: Job, config: Dict):
             bt.logging.error(
                 f"from_job failed for {protein.pdb_id} with Exception {E}."
             )
-        finally:
-            return protein
+            return None
+        return protein
 
     @staticmethod
     def load_pdb_as_string(pdb_path: str) -> str:
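Why this hunk matters: a return inside finally always wins, even when an exception was just caught (and it silently swallows uncaught ones), so the old from_job handed the caller a partially initialized Protein after a failure. A self-contained demonstration with stand-in names:

def old_from_job_shape():
    protein = None
    try:
        protein = object()          # stands in for constructing the Protein
        raise RuntimeError("boom")  # stands in for a failing setup step
    except RuntimeError:
        pass                        # the error is logged...
    finally:
        return protein              # ...but this STILL returns the partial object

def new_from_job_shape():
    try:
        protein = object()
        raise RuntimeError("boom")
    except RuntimeError:
        return None                 # failure is now visible to the caller
    return protein

print(old_from_job_shape() is None)  # False -> the bug: caller gets a broken object
print(new_from_job_shape() is None)  # True  -> caller can guard on None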
@@ -185,7 +184,7 @@ def read_and_return_files(self, filenames: List) -> Dict:
                         name
                     ] = f.read()  # This would be the pdb file.
 
-            except Exception as E:
+            except Exception:
                 continue
         return files_to_return
@@ -221,6 +220,13 @@ def setup_simulation(self):
 
         self.pdb_complexity = Protein._get_pdb_complexity(self.pdb_location)
         self.init_energy = self.calc_init_energy()
+
+        # Checking if init energy is nan
+        if np.isnan(self.init_energy):
+            raise OpenMMException(
+                f"Failed to calculate initial energy for {self.pdb_id}"
+            )
+
         self._calculate_epsilon()
 
     def __str__(self):
@@ -355,6 +361,9 @@ def get_miner_data_directory(self, hotkey: str):
     def process_md_output(
         self, md_output: dict, seed: int, state: str, hotkey: str
     ) -> bool:
+        MIN_LOGGING_ENTRIES = 500
+        MIN_SIMULATION_STEPS = 5000
+
         required_files_extensions = ["cpt", "log"]
         hotkey_alias = hotkey[:8]
         self.current_state = state
@@ -409,11 +418,16 @@ def process_md_output(
             self.log_file = pd.read_csv(log_file_path)
             self.log_step = self.log_file['#"Step"'].iloc[-1]
 
+            # Checks to see if we have enough steps in the log file to start validation
+            if len(self.log_file) < MIN_LOGGING_ENTRIES:
+                raise ValidationError(
+                    f"Miner {hotkey_alias} did not run enough steps in the simulation... Skipping!"
+                )
+
             # Make sure that we are enough steps ahead in the log file compared to the checkpoint file.
-            # Checks if log_file is 5000 steps ahead of checkpoint AND that the log_file has at least 5000 steps
-            if (
-                self.log_step - self.simulation.currentStep
-            ) < 5000 and len(self.log_file) >= 5000:
+            # Checks if log_file is MIN_SIMULATION_STEPS ahead of the checkpoint
+            if (self.log_step - self.simulation.currentStep) < MIN_SIMULATION_STEPS:
+                # If the miner did not run enough steps, we will load the old checkpoint
                 checkpoint_path = os.path.join(
                     self.miner_data_directory, f"{self.current_state}_old.cpt"
                 )
@@ -422,13 +436,17 @@ def process_md_output(
                         f"Miner {hotkey_alias} did not run enough steps since last checkpoint... Loading old checkpoint"
                     )
                     self.simulation.loadCheckpoint(checkpoint_path)
+                    # Checking to see if the old checkpoint has enough steps to validate
+                    if (
+                        self.log_step - self.simulation.currentStep
+                    ) < MIN_SIMULATION_STEPS:
+                        raise ValidationError(
+                            f"Miner {hotkey_alias} did not run enough steps in the simulation... Skipping!"
+                        )
                 else:
-                    bt.logging.warning(
+                    raise ValidationError(
                         f"Miner {hotkey_alias} did not run enough steps and no old checkpoint found... Skipping!"
                     )
-                    return False
-                else:
-                    self.simulation.loadCheckpoint(checkpoint_path)
 
             self.cpt_step = self.simulation.currentStep
             self.checkpoint_path = checkpoint_path
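The fall-back logic in the two hunks above reduces to: if the log is not far enough ahead of the current checkpoint, load the old checkpoint and re-check; fail the miner if no old checkpoint exists or the gap is still too small. A condensed, self-contained sketch (validate_progress and its arguments are stand-ins for validator state):

MIN_SIMULATION_STEPS = 5000

class ValidationError(Exception):
    pass  # local stand-in for the exception added in folding/utils/ops.py

def validate_progress(log_step: int, current_step: int,
                      old_checkpoint_step: int | None) -> None:
    if (log_step - current_step) < MIN_SIMULATION_STEPS:
        if old_checkpoint_step is None:
            raise ValidationError("not enough steps and no old checkpoint found")
        # Fall back to the older checkpoint, then re-check the step gap.
        if (log_step - old_checkpoint_step) < MIN_SIMULATION_STEPS:
            raise ValidationError("not enough steps in the simulation")

validate_progress(log_step=20000, current_step=16000, old_checkpoint_step=9000)  # passes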
@@ -444,6 +462,10 @@ def process_md_output(
                 write_mode="wb",
             )
 
+        except ValidationError as E:
+            bt.logging.warning(f"{E}")
+            return False
+
         except Exception as e:
             bt.logging.error(f"Failed to recreate simulation: {e}")
             return False
@@ -500,7 +522,6 @@ def is_run_valid(self):
 
         # calculating absolute percent difference per step
         percent_diff = abs(((check_energies - miner_energies) / miner_energies) * 100)
-        min_length = len(percent_diff)
 
         # This is some debugging information for plotting the information from the miner.
         df = pd.DataFrame([check_energies, miner_energies]).T
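The deleted min_length assignment was dead code; the percent-difference expression itself is unchanged. A tiny runnable example of what it computes, with dummy energies:

import numpy as np

# Dummy energies: the validator's re-run vs. the miner's reported values.
check_energies = np.array([-1002.0, -1010.5, -1021.3])
miner_energies = np.array([-1000.0, -1012.0, -1020.0])

# Same expression as in the hunk above: absolute percent difference per step.
percent_diff = abs(((check_energies - miner_energies) / miner_energies) * 100)
print(percent_diff)  # [0.2        0.14822134 0.12745098]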
@@ -559,7 +580,7 @@ def remove_pdb_directory(self):
         """
         shutil.rmtree(self.pdb_directory)
 
-    def calc_init_energy(self):
+    def calc_init_energy(self) -> float:
         """Calculate the potential energy from an edr file using gmx energy.
         Args:
             output_dir (str): directory containing the edr file

neurons/validator.py (+8 -10)

@@ -247,10 +247,6 @@ def update_job(self, job: Job):
         energies = torch.Tensor(job.event["energies"])
         rewards = torch.zeros(len(energies))  # one-hot per update step
 
-        # TODO: we need to get the commit and gro hashes from the best hotkey
-        commit_hash = ""  # For next time
-        gro_hash = ""  # For next time
-
         best_index = np.argmin(energies)
         best_loss = energies[best_index].item()  # item because it's a torch.tensor
         best_hotkey = serving_hotkeys[best_index]
@@ -259,8 +255,6 @@ def update_job(self, job: Job):
             hotkeys=serving_hotkeys,
             loss=best_loss,
             hotkey=best_hotkey,
-            commit_hash=commit_hash,
-            gro_hash=gro_hash,
         )
 
         # If no miners respond appropriately, the energies will be all zeros
@@ -316,10 +310,14 @@ def prepare_event_for_logging(event: Dict):
         # If the job is finished, remove the pdb directory
         pdb_location = None
         protein = Protein.from_job(job=job, config=self.config.protein)
-        if job.active is False:
-            protein.remove_pdb_directory()
-        elif event["updated_count"] == 1:
-            pdb_location = protein.pdb_location
+
+        if protein is not None:
+            if job.active is False:
+                protein.remove_pdb_directory()
+            elif event["updated_count"] == 1:
+                pdb_location = protein.pdb_location
+        else:
+            bt.logging.error(f"Protein.from_job returns NONE for protein {job.pdb}")
 
         log_event(self, event=event, pdb_location=pdb_location)
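For reference, the best-miner selection feeding Job.update in the first hunk reduces to the following worked example (dummy energies; the hotkeys are hypothetical):

import numpy as np
import torch

# Dummy energies for three responding hotkeys; more negative is better.
energies = torch.Tensor([-1200.0, -1350.0, -1100.0])
serving_hotkeys = ["hk_a", "hk_b", "hk_c"]

best_index = np.argmin(energies)          # CPU torch tensors work with np.argmin
best_loss = energies[best_index].item()   # .item() -> plain Python float
best_hotkey = serving_hotkeys[best_index]
print(best_hotkey, best_loss)             # hk_b -1350.0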
