Skip to content

Commit 9b04822

Browse files
authored
Merge pull request #338 from macrocosm-os/staging
Staging
2 parents 04090ba + 63db13b commit 9b04822

14 files changed

+228
-51
lines changed

.env.example

+7
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,10 @@ S3_REGION = "nyc3"
22
S3_ENDPOINT = "https://nyc3.digitaloceanspaces.com"
33
S3_KEY = "s3_key"
44
S3_SECRET = "secret_key"
5+
RQLITE_HTTP_ADDR=0.0.0.0:4001
6+
RQLITE_RAFT_ADDR=0.0.0.0:4002
7+
RQLITE_HTTP_ADV_ADDR=123.456.7.8:4001
8+
RQLITE_RAFT_ADV_ADDR=123.456.7.8:4002
9+
RQLITE_DATA_DIR=db/
10+
JOIN_ADDR=174.138.3.61:4002
11+
HOTKEY=your_hotkey

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -189,3 +189,6 @@ folding/db
189189
/charmm36-jul2022.ff
190190

191191
tests/mock_data
192+
193+
db
194+
local-gjp

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ pm2 start pm2_configs/validator.config.js
190190
```
191191
Keep in mind that you will need to change the default parameters for either the [miner](./scripts/run_miner.sh) or the [validator](./scripts/run_validator.sh).
192192

193+
Miners now have the opportunity to interact with the global job pool (GJP) locally. By creating a read-only node via `start_read_node.sh`, miners sync with the GJP on their local machine in the `db` directory. We have provided a script `scripts/query_rqlite.py` that returns jobs based on their priority in the GJP, or returns a specific job specified by `pdb_id`. With this information, miners can experiment with customizing their job queue. This script can also be helpful for downloading and analyzing checkpoint files from other miners. Please see the updated environment variables in `.env.example` and specify your public IP address in the following fields: `RQLITE_HTTP_ADV_ADDR`, `RQLITE_RAFT_ADV_ADDR`.
193194
## How does the Subnet Work?
194195

195196
In this subnet, validators create protein folding challenges for miners, who in turn run simulations using OpenMM to obtain stable protein configurations. At a high level, each role can be broken down into parts:

folding/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from .protocol import JobSubmissionSynapse
22
from .validators.protein import Protein
33

4-
__version__ = "1.4.5"
4+
__version__ = "1.4.6"
55
version_split = __version__.split(".")
66
__spec_version__ = (10000 * int(version_split[0])) + (100 * int(version_split[1])) + (1 * int(version_split[2]))
77

folding/base/validator.py

+19-32
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,7 @@ def __init__(self, config=None):
6666

6767
# Set up initial scoring weights for validation
6868
logger.info("Building validation weights.")
69-
self.scores = torch.zeros(
70-
self.metagraph.n, dtype=torch.float32, device=self.device
71-
)
69+
self.scores = torch.zeros(self.metagraph.n, dtype=torch.float32, device=self.device)
7270

7371
# Serve axon to enable external connections.
7472
if not self.config.neuron.axon_off:
@@ -115,9 +113,7 @@ def _serve_axon(self):
115113
@retry(
116114
stop=stop_after_attempt(3), # Retry up to 3 times
117115
wait=wait_fixed(1), # Wait 1 second between retries
118-
retry=retry_if_result(
119-
lambda result: result is False
120-
), # Retry if the result is False
116+
retry=retry_if_result(lambda result: result is False), # Retry if the result is False
121117
after=print_on_retry,
122118
)
123119
def set_weights(self):
@@ -133,9 +129,7 @@ def set_weights(self):
133129

134130
# Calculate the average reward for each uid across non-zero values.
135131
# Replace any NaN values with 0.
136-
raw_weights = (
137-
torch.nn.functional.normalize(self.scores, p=1, dim=0).to("cpu").numpy()
138-
)
132+
raw_weights = torch.nn.functional.normalize(self.scores, p=1, dim=0).to("cpu").numpy()
139133

140134
logger.debug("raw_weights", raw_weights)
141135
logger.debug("raw_weight_uids", self.metagraph.uids)
@@ -191,9 +185,7 @@ def resync_metagraph(self):
191185
if previous_metagraph.axons == self.metagraph.axons:
192186
return
193187

194-
logger.info(
195-
"Metagraph updated, re-syncing hotkeys, dendrite pool and moving averages"
196-
)
188+
logger.info("Metagraph updated, re-syncing hotkeys, dendrite pool and moving averages")
197189
# Zero out all hotkeys that have been replaced.
198190
for uid, hotkey in enumerate(self.hotkeys):
199191
if hotkey != self.metagraph.hotkeys[uid]:
@@ -227,17 +219,13 @@ async def update_scores(self, rewards: torch.FloatTensor, uids: List[int]):
227219

228220
# Compute forward pass rewards, assumes uids are mutually exclusive.
229221
# shape: [ metagraph.n ]
230-
scattered_rewards: torch.FloatTensor = self.scores.scatter(
231-
0, uids_tensor, rewards
232-
).to(self.device)
222+
scattered_rewards: torch.FloatTensor = self.scores.scatter(0, uids_tensor, rewards).to(self.device)
233223
logger.debug(f"Scattered rewards: {rewards}")
234224

235225
# Update scores with rewards produced by this step.
236226
# shape: [ metagraph.n ]
237227
alpha: float = self.config.neuron.moving_average_alpha
238-
self.scores: torch.FloatTensor = alpha * scattered_rewards + (
239-
1 - alpha
240-
) * self.scores.to(self.device)
228+
self.scores: torch.FloatTensor = alpha * scattered_rewards + (1 - alpha) * self.scores.to(self.device)
241229

242230
logger.debug(f"Updated moving avg scores: {self.scores}")
243231

@@ -264,24 +252,23 @@ def load_state(self):
264252
self.hotkeys = state["hotkeys"]
265253
logger.info("Loaded previously saved validator state information.")
266254
except:
267-
logger.info(
268-
"Previous validator state not found... Weight copying the average of the network."
269-
)
270-
271-
valid_indices = np.where(self.metagraph.validator_permit)[0]
272-
valid_weights = self.metagraph.weights[valid_indices]
273-
valid_stakes = self.metagraph.S[valid_indices]
274-
normalized_stakes = valid_stakes / np.sum(valid_stakes)
255+
logger.info("Previous validator state not found... Weight copying the average of the network.")
275256

257+
self.scores = self.get_chain_weights()
276258
self.step = 1
277-
self.scores = torch.tensor(np.dot(normalized_stakes, valid_weights)).to(
278-
self.device
279-
)
259+
260+
def get_chain_weights(self) -> torch.Tensor:
261+
"""Obtain the stake weighted average of all validator weights on chain."""
262+
valid_indices = np.where(self.metagraph.validator_permit)[0]
263+
valid_weights = self.metagraph.weights[valid_indices]
264+
valid_stakes = self.metagraph.S[valid_indices]
265+
normalized_stakes = valid_stakes / np.sum(valid_stakes)
266+
267+
weights = torch.tensor(np.dot(normalized_stakes, valid_weights)).to(self.device)
268+
return weights
280269

281270
def load_config_json(self):
282-
config_json_path = os.path.join(
283-
str(ROOT_DIR), "folding/utils/config_input.json"
284-
)
271+
config_json_path = os.path.join(str(ROOT_DIR), "folding/utils/config_input.json")
285272
with open(config_json_path, "r") as file:
286273
config = json.load(file)
287274
return config

folding/miners/folding_miner.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from typing import Dict, List, Tuple
1010
import copy
1111
import traceback
12+
import asyncio
1213

1314
import bittensor as bt
1415
import openmm as mm
@@ -18,7 +19,6 @@
1819
from folding.base.miner import BaseMinerNeuron
1920
from folding.base.simulation import OpenMMSimulation
2021
from folding.protocol import JobSubmissionSynapse
21-
from folding.utils.logging import log_event
2222
from folding.utils.reporters import ExitFileReporter, LastTwoCheckpointsReporter
2323
from folding.utils.ops import (
2424
check_if_directory_exists,
@@ -146,6 +146,7 @@ def __init__(self, config=None, base_data_path: str = None):
146146

147147
self.mock = None
148148
self.generate_random_seed = lambda: random.randint(0, 1000)
149+
self.db_path = "/db/db.sqlite"
149150

150151
# hardcoded for now -- TODO: make this more flexible
151152
self.STATES = ["nvt", "npt", "md_0_1"]
@@ -358,6 +359,7 @@ def forward(self, synapse: JobSubmissionSynapse) -> JobSubmissionSynapse:
358359
elif len(synapse.md_inputs) == 0: # The vali sends nothing to the miner
359360
return check_synapse(self=self, synapse=synapse, event=event)
360361

362+
361363
def submit_simulation(
362364
self,
363365
synapse: JobSubmissionSynapse,

folding/utils/config.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def add_args(cls, parser):
7373
"--neuron.epoch_length",
7474
type=int,
7575
help="The default epoch length (how often we set weights, measured in 12 second blocks).",
76-
default=150,
76+
default=300,
7777
)
7878

7979
parser.add_argument(

folding/validators/protein.py

+9-10
Original file line numberDiff line numberDiff line change
@@ -535,14 +535,15 @@ def check_masses(self) -> bool:
535535
logger.error(f"Masses for atom {i} do not match. Validator: {v_mass}, Miner: {m_mass}")
536536
return False
537537
return True
538-
538+
539539
def compare_state_to_cpt(self, state_energies: list, checkpoint_energies: list) -> bool:
540540
"""
541541
Check if the state file is the same as the checkpoint file by comparing the median of the first few energy values
542542
in the simulation created by the checkpoint and the state file respectively.
543543
"""
544-
544+
545545
WINDOW = 50
546+
ANOMALY_THRESHOLD = 2
546547

547548
state_energies = np.array(state_energies)
548549
checkpoint_energies = np.array(checkpoint_energies)
@@ -552,11 +553,9 @@ def compare_state_to_cpt(self, state_energies: list, checkpoint_energies: list)
552553

553554
percent_diff = abs((state_median - checkpoint_median) / checkpoint_median) * 100
554555

555-
if percent_diff > self.epsilon:
556+
if percent_diff > ANOMALY_THRESHOLD:
556557
return False
557558
return True
558-
559-
560559

561560
def is_run_valid(self):
562561
"""
@@ -585,7 +584,6 @@ def is_run_valid(self):
585584
# Run the simulation at most 3000 steps
586585
steps_to_run = min(3000, self.log_step - self.cpt_step)
587586

588-
589587
# This is where we are going to check the xml files for the state.
590588
logger.info(f"Recreating simulation for {self.pdb_id} for state-based analysis...")
591589
self.simulation, self.system_config = self.create_simulation(
@@ -604,7 +602,6 @@ def is_run_valid(self):
604602
logger.warning(f"hotkey {self.hotkey_alias} failed state-gradient check for {self.pdb_id}, ... Skipping!")
605603
return False, [], [], "state-gradient"
606604

607-
608605
# Reload in the checkpoint file and run the simulation for the same number of steps as the miner.
609606
self.simulation, self.system_config = self.create_simulation(
610607
pdb=self.load_pdb_file(pdb_file=self.pdb_location),
@@ -630,7 +627,7 @@ def is_run_valid(self):
630627
(self.log_file['#"Step"'] > self.cpt_step) & (self.log_file['#"Step"'] <= max_step)
631628
]["Potential Energy (kJ/mole)"].values
632629

633-
self.simulation.step(steps_to_run)
630+
self.simulation.step(steps_to_run)
634631

635632
check_log_file = pd.read_csv(current_state_logfile)
636633
check_energies: np.ndarray = check_log_file["Potential Energy (kJ/mole)"].values
@@ -642,9 +639,11 @@ def is_run_valid(self):
642639
if not self.check_gradient(check_energies=check_energies):
643640
logger.warning(f"hotkey {self.hotkey_alias} failed cpt-gradient check for {self.pdb_id}, ... Skipping!")
644641
return False, [], [], "cpt-gradient"
645-
642+
646643
if not self.compare_state_to_cpt(state_energies=state_energies, checkpoint_energies=check_energies):
647-
logger.warning(f"hotkey {self.hotkey_alias} failed state-checkpoint comparison for {self.pdb_id}, ... Skipping!")
644+
logger.warning(
645+
f"hotkey {self.hotkey_alias} failed state-checkpoint comparison for {self.pdb_id}, ... Skipping!"
646+
)
648647
return False, [], [], "state-checkpoint"
649648

650649
# calculating absolute percent difference per step

install.sh

+1
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,5 @@ poetry install
4141
sudo apt-get update
4242
sudo apt-get install build-essential cmake libfftw3-dev vim npm -y
4343
sudo npm install -g pm2 -y
44+
chmod +x install_rqlite.sh
4445
./install_rqlite.sh

neurons/validator.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ async def update_job(self, job: Job):
289289
for uid, reason in zip(job.event["uids"], job.event["reason"]):
290290
if reason == "state-checkpoint":
291291
logger.warning(f"Setting uid {uid} score to zero, State-checkpoint check failed.")
292-
self.scores[uid] = 0
292+
self.scores[uid] = 0.5 * self.scores[uid]
293293

294294
best_index = np.argmin(energies)
295295
best_loss = energies[best_index].item() # item because it's a torch.tensor

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "folding"
3-
version = "1.4.5"
3+
version = "1.4.6"
44
description = "Macrocosmos Subnet 25: Folding"
55
authors = ["Brian McCrindle <[email protected]>", "Sergio Champoux <[email protected]>", "Szymon Fonau <[email protected]>"]
66

0 commit comments

Comments
 (0)