Skip to content

Commit 9b04822

Browse files
authored
Merge pull request #338 from macrocosm-os/staging
Staging
2 parents 04090ba + 63db13b commit 9b04822

14 files changed

+228
-51
lines changed

.env.example

+7
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,10 @@ S3_REGION = "nyc3"
22
S3_ENDPOINT = "https://nyc3.digitaloceanspaces.com"
33
S3_KEY = "s3_key"
44
S3_SECRET = "secret_key"
5+
RQLITE_HTTP_ADDR=0.0.0.0:4001
6+
RQLITE_RAFT_ADDR=0.0.0.0:4002
7+
RQLITE_HTTP_ADV_ADDR=123.456.7.8:4001
8+
RQLITE_RAFT_ADV_ADDR=123.456.7.8:4002
9+
RQLITE_DATA_DIR=db/
10+
JOIN_ADDR=174.138.3.61:4002
11+
HOTKEY=your_hotkey

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -189,3 +189,6 @@ folding/db
189189
/charmm36-jul2022.ff
190190

191191
tests/mock_data
192+
193+
db
194+
local-gjp

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ pm2 start pm2_configs/validator.config.js
190190
```
191191
Keep in mind that you will need to change the default parameters for either the [miner](./scripts/run_miner.sh) or the [validator](./scripts/run_validator.sh).
192192

193+
Miners now have the opportunity to interact with the global job pool (GJP) locally. By creating a read-only node via `start_read_node.sh`, miners sync with the GJP on their local machine in the `db` directory. We have provided a script `scripts/query_rqlite.py` that returns jobs based on their priority in the GJP, or returns a specific job specified by `pdb_id`. With this information, miners can experiment with customizing their job queue. This script can also be helpful for downloading and analyzing checkpoint files from other miners. Please see the updated environment variables in `.env.example` and specify your public IP address in the following fields: `RQLITE_HTTP_ADV_ADDR`, `RQLITE_RAFT_ADV_ADDR`.
193194
## How does the Subnet Work?
194195

195196
In this subnet, validators create protein folding challenges for miners, who in turn run simulations using OpenMM to obtain stable protein configurations. At a high level, each role can be broken down into parts:

folding/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from .protocol import JobSubmissionSynapse
22
from .validators.protein import Protein
33

4-
__version__ = "1.4.5"
4+
__version__ = "1.4.6"
55
version_split = __version__.split(".")
66
__spec_version__ = (10000 * int(version_split[0])) + (100 * int(version_split[1])) + (1 * int(version_split[2]))
77

folding/base/validator.py

+19-32
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,7 @@ def __init__(self, config=None):
6666

6767
# Set up initial scoring weights for validation
6868
logger.info("Building validation weights.")
69-
self.scores = torch.zeros(
70-
self.metagraph.n, dtype=torch.float32, device=self.device
71-
)
69+
self.scores = torch.zeros(self.metagraph.n, dtype=torch.float32, device=self.device)
7270

7371
# Serve axon to enable external connections.
7472
if not self.config.neuron.axon_off:
@@ -115,9 +113,7 @@ def _serve_axon(self):
115113
@retry(
116114
stop=stop_after_attempt(3), # Retry up to 3 times
117115
wait=wait_fixed(1), # Wait 1 second between retries
118-
retry=retry_if_result(
119-
lambda result: result is False
120-
), # Retry if the result is False
116+
retry=retry_if_result(lambda result: result is False), # Retry if the result is False
121117
after=print_on_retry,
122118
)
123119
def set_weights(self):
@@ -133,9 +129,7 @@ def set_weights(self):
133129

134130
# Calculate the average reward for each uid across non-zero values.
135131
# Replace any NaN values with 0.
136-
raw_weights = (
137-
torch.nn.functional.normalize(self.scores, p=1, dim=0).to("cpu").numpy()
138-
)
132+
raw_weights = torch.nn.functional.normalize(self.scores, p=1, dim=0).to("cpu").numpy()
139133

140134
logger.debug("raw_weights", raw_weights)
141135
logger.debug("raw_weight_uids", self.metagraph.uids)
@@ -191,9 +185,7 @@ def resync_metagraph(self):
191185
if previous_metagraph.axons == self.metagraph.axons:
192186
return
193187

194-
logger.info(
195-
"Metagraph updated, re-syncing hotkeys, dendrite pool and moving averages"
196-
)
188+
logger.info("Metagraph updated, re-syncing hotkeys, dendrite pool and moving averages")
197189
# Zero out all hotkeys that have been replaced.
198190
for uid, hotkey in enumerate(self.hotkeys):
199191
if hotkey != self.metagraph.hotkeys[uid]:
@@ -227,17 +219,13 @@ async def update_scores(self, rewards: torch.FloatTensor, uids: List[int]):
227219

228220
# Compute forward pass rewards, assumes uids are mutually exclusive.
229221
# shape: [ metagraph.n ]
230-
scattered_rewards: torch.FloatTensor = self.scores.scatter(
231-
0, uids_tensor, rewards
232-
).to(self.device)
222+
scattered_rewards: torch.FloatTensor = self.scores.scatter(0, uids_tensor, rewards).to(self.device)
233223
logger.debug(f"Scattered rewards: {rewards}")
234224

235225
# Update scores with rewards produced by this step.
236226
# shape: [ metagraph.n ]
237227
alpha: float = self.config.neuron.moving_average_alpha
238-
self.scores: torch.FloatTensor = alpha * scattered_rewards + (
239-
1 - alpha
240-
) * self.scores.to(self.device)
228+
self.scores: torch.FloatTensor = alpha * scattered_rewards + (1 - alpha) * self.scores.to(self.device)
241229

242230
logger.debug(f"Updated moving avg scores: {self.scores}")
243231

@@ -264,24 +252,23 @@ def load_state(self):
264252
self.hotkeys = state["hotkeys"]
265253
logger.info("Loaded previously saved validator state information.")
266254
except:
267-
logger.info(
268-
"Previous validator state not found... Weight copying the average of the network."
269-
)
270-
271-
valid_indices = np.where(self.metagraph.validator_permit)[0]
272-
valid_weights = self.metagraph.weights[valid_indices]
273-
valid_stakes = self.metagraph.S[valid_indices]
274-
normalized_stakes = valid_stakes / np.sum(valid_stakes)
255+
logger.info("Previous validator state not found... Weight copying the average of the network.")
275256

257+
self.scores = self.get_chain_weights()
276258
self.step = 1
277-
self.scores = torch.tensor(np.dot(normalized_stakes, valid_weights)).to(
278-
self.device
279-
)
259+
260+
def get_chain_weights(self) -> torch.Tensor:
261+
"""Obtain the stake weighted average of all validator weights on chain."""
262+
valid_indices = np.where(self.metagraph.validator_permit)[0]
263+
valid_weights = self.metagraph.weights[valid_indices]
264+
valid_stakes = self.metagraph.S[valid_indices]
265+
normalized_stakes = valid_stakes / np.sum(valid_stakes)
266+
267+
weights = torch.tensor(np.dot(normalized_stakes, valid_weights)).to(self.device)
268+
return weights
280269

281270
def load_config_json(self):
282-
config_json_path = os.path.join(
283-
str(ROOT_DIR), "folding/utils/config_input.json"
284-
)
271+
config_json_path = os.path.join(str(ROOT_DIR), "folding/utils/config_input.json")
285272
with open(config_json_path, "r") as file:
286273
config = json.load(file)
287274
return config

folding/miners/folding_miner.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from typing import Dict, List, Tuple
1010
import copy
1111
import traceback
12+
import asyncio
1213

1314
import bittensor as bt
1415
import openmm as mm
@@ -18,7 +19,6 @@
1819
from folding.base.miner import BaseMinerNeuron
1920
from folding.base.simulation import OpenMMSimulation
2021
from folding.protocol import JobSubmissionSynapse
21-
from folding.utils.logging import log_event
2222
from folding.utils.reporters import ExitFileReporter, LastTwoCheckpointsReporter
2323
from folding.utils.ops import (
2424
check_if_directory_exists,
@@ -146,6 +146,7 @@ def __init__(self, config=None, base_data_path: str = None):
146146

147147
self.mock = None
148148
self.generate_random_seed = lambda: random.randint(0, 1000)
149+
self.db_path = "/db/db.sqlite"
149150

150151
# hardcoded for now -- TODO: make this more flexible
151152
self.STATES = ["nvt", "npt", "md_0_1"]
@@ -358,6 +359,7 @@ def forward(self, synapse: JobSubmissionSynapse) -> JobSubmissionSynapse:
358359
elif len(synapse.md_inputs) == 0: # The vali sends nothing to the miner
359360
return check_synapse(self=self, synapse=synapse, event=event)
360361

362+
361363
def submit_simulation(
362364
self,
363365
synapse: JobSubmissionSynapse,

folding/utils/config.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def add_args(cls, parser):
7373
"--neuron.epoch_length",
7474
type=int,
7575
help="The default epoch length (how often we set weights, measured in 12 second blocks).",
76-
default=150,
76+
default=300,
7777
)
7878

7979
parser.add_argument(

folding/validators/protein.py

+9-10
Original file line numberDiff line numberDiff line change
@@ -535,14 +535,15 @@ def check_masses(self) -> bool:
535535
logger.error(f"Masses for atom {i} do not match. Validator: {v_mass}, Miner: {m_mass}")
536536
return False
537537
return True
538-
538+
539539
def compare_state_to_cpt(self, state_energies: list, checkpoint_energies: list) -> bool:
540540
"""
541541
Check if the state file is the same as the checkpoint file by comparing the median of the first few energy values
542542
in the simulation created by the checkpoint and the state file respectively.
543543
"""
544-
544+
545545
WINDOW = 50
546+
ANOMALY_THRESHOLD = 2
546547

547548
state_energies = np.array(state_energies)
548549
checkpoint_energies = np.array(checkpoint_energies)
@@ -552,11 +553,9 @@ def compare_state_to_cpt(self, state_energies: list, checkpoint_energies: list)
552553

553554
percent_diff = abs((state_median - checkpoint_median) / checkpoint_median) * 100
554555

555-
if percent_diff > self.epsilon:
556+
if percent_diff > ANOMALY_THRESHOLD:
556557
return False
557558
return True
558-
559-
560559

561560
def is_run_valid(self):
562561
"""
@@ -585,7 +584,6 @@ def is_run_valid(self):
585584
# Run the simulation at most 3000 steps
586585
steps_to_run = min(3000, self.log_step - self.cpt_step)
587586

588-
589587
# This is where we are going to check the xml files for the state.
590588
logger.info(f"Recreating simulation for {self.pdb_id} for state-based analysis...")
591589
self.simulation, self.system_config = self.create_simulation(
@@ -604,7 +602,6 @@ def is_run_valid(self):
604602
logger.warning(f"hotkey {self.hotkey_alias} failed state-gradient check for {self.pdb_id}, ... Skipping!")
605603
return False, [], [], "state-gradient"
606604

607-
608605
# Reload in the checkpoint file and run the simulation for the same number of steps as the miner.
609606
self.simulation, self.system_config = self.create_simulation(
610607
pdb=self.load_pdb_file(pdb_file=self.pdb_location),
@@ -630,7 +627,7 @@ def is_run_valid(self):
630627
(self.log_file['#"Step"'] > self.cpt_step) & (self.log_file['#"Step"'] <= max_step)
631628
]["Potential Energy (kJ/mole)"].values
632629

633-
self.simulation.step(steps_to_run)
630+
self.simulation.step(steps_to_run)
634631

635632
check_log_file = pd.read_csv(current_state_logfile)
636633
check_energies: np.ndarray = check_log_file["Potential Energy (kJ/mole)"].values
@@ -642,9 +639,11 @@ def is_run_valid(self):
642639
if not self.check_gradient(check_energies=check_energies):
643640
logger.warning(f"hotkey {self.hotkey_alias} failed cpt-gradient check for {self.pdb_id}, ... Skipping!")
644641
return False, [], [], "cpt-gradient"
645-
642+
646643
if not self.compare_state_to_cpt(state_energies=state_energies, checkpoint_energies=check_energies):
647-
logger.warning(f"hotkey {self.hotkey_alias} failed state-checkpoint comparison for {self.pdb_id}, ... Skipping!")
644+
logger.warning(
645+
f"hotkey {self.hotkey_alias} failed state-checkpoint comparison for {self.pdb_id}, ... Skipping!"
646+
)
648647
return False, [], [], "state-checkpoint"
649648

650649
# calculating absolute percent difference per step

install.sh

+1
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,5 @@ poetry install
4141
sudo apt-get update
4242
sudo apt-get install build-essential cmake libfftw3-dev vim npm -y
4343
sudo npm install -g pm2 -y
44+
chmod +x install_rqlite.sh
4445
./install_rqlite.sh

neurons/validator.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ async def update_job(self, job: Job):
289289
for uid, reason in zip(job.event["uids"], job.event["reason"]):
290290
if reason == "state-checkpoint":
291291
logger.warning(f"Setting uid {uid} score to zero, State-checkpoint check failed.")
292-
self.scores[uid] = 0
292+
self.scores[uid] = 0.5 * self.scores[uid]
293293

294294
best_index = np.argmin(energies)
295295
best_loss = energies[best_index].item() # item because it's a torch.tensor

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "folding"
3-
version = "1.4.5"
3+
version = "1.4.6"
44
description = "Macrocosmos Subnet 25: Folding"
55
authors = ["Brian McCrindle <[email protected]>", "Sergio Champoux <[email protected]>", "Szymon Fonau <[email protected]>"]
66

0 commit comments

Comments
 (0)