
Commit 9bc70bb

Merge pull request #216 from macrocosm-os/staging
Staging
2 parents 667b959 + 1c1cc1f commit 9bc70bb

6 files changed: +70 -39 lines

folding/__init__.py (+1 -1)

@@ -1,7 +1,7 @@
 from .protocol import JobSubmissionSynapse
 from .validators.protein import Protein
 
-__version__ = "1.0.0"
+__version__ = "1.0.1"
 version_split = __version__.split(".")
 __spec_version__ = (
     (10000 * int(version_split[0]))
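Note: the hunk is truncated after the major-version term. For orientation, a minimal sketch of the usual spec-version arithmetic follows; the minor/patch weights are an assumption, not visible in this diff.

# Sketch of the spec-version computation (assumed weights; the diff above
# only shows the 10000 * major term).
__version__ = "1.0.1"
version_split = __version__.split(".")
__spec_version__ = (
    (10000 * int(version_split[0]))  # major
    + (100 * int(version_split[1]))  # minor
    + (1 * int(version_split[2]))    # patch
)
assert __spec_version__ == 10001  # "1.0.1" -> 10001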

folding/store.py (+1 -5)

@@ -153,7 +153,7 @@ class Job:
     gro_hash: str = None
     update_interval: pd.Timedelta = pd.Timedelta(minutes=10)
     updated_count: int = 0
-    max_time_no_improvement: pd.Timedelta = pd.Timedelta(minutes=20)
+    max_time_no_improvement: pd.Timedelta = pd.Timedelta(minutes=25)
     min_updates: int = 10
     epsilon: float = 0.05  # percentage.
     event: dict = None
@@ -173,8 +173,6 @@ def update(
         self,
         loss: float,
         hotkey: str,
-        commit_hash: str,
-        gro_hash: str,
         hotkeys: List[str] = None,
     ):
         """Updates the status of a job in the database. If the loss improves, the best loss, hotkey and hashes are updated."""
@@ -198,8 +196,6 @@ def update(
             self.best_loss = loss
             self.best_loss_at = pd.Timestamp.now().floor("s")
             self.best_hotkey = hotkey
-            self.commit_hash = commit_hash
-            self.gro_hash = gro_hash
         elif (  # if loss has been improved but not recently enough, trigger early stopping
             pd.Timestamp.now().floor("s") - self.best_loss_at
             > self.max_time_no_improvement
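The bump from 20 to 25 minutes widens the no-improvement window tested in the elif above. A runnable sketch of that check with dummy values:

import pandas as pd

# Dummy stand-ins for the Job fields shown in the hunks above.
max_time_no_improvement = pd.Timedelta(minutes=25)
best_loss_at = pd.Timestamp.now().floor("s") - pd.Timedelta(minutes=30)

# Mirrors Job.update's early-stopping condition: the last improvement is
# older than the allowed window, so the job would be stopped early.
should_stop = pd.Timestamp.now().floor("s") - best_loss_at > max_time_no_improvement
print(should_stop)  # True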

folding/utils/ops.py (+8)

@@ -24,6 +24,14 @@ def __init__(self, message="Version error occurred"):
         super().__init__(self.message)
 
 
+class ValidationError(Exception):
+    """Exception raised for errors in validating miner submissions."""
+
+    def __init__(self, message="Validation error occurred"):
+        self.message = message
+        super().__init__(self.message)
+
+
 def delete_directory(directory: str):
     """We create a lot of files in the process of tracking pdb files.
     Therefore, we want to delete the directory after we are done with the tests.
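As the protein.py hunks below show, ValidationError is raised inside process_md_output and caught at the end of its try block to skip the offending miner. A self-contained sketch of that raise/catch pattern (process_log is a hypothetical stand-in for the real method):

class ValidationError(Exception):
    """Local stand-in for the class added above."""

MIN_LOGGING_ENTRIES = 500  # same threshold introduced in protein.py below

def process_log(hotkey_alias: str, n_entries: int) -> bool:
    try:
        if n_entries < MIN_LOGGING_ENTRIES:
            raise ValidationError(
                f"Miner {hotkey_alias} did not run enough steps in the simulation... Skipping!"
            )
        return True
    except ValidationError as E:
        print(f"{E}")  # the validator uses bt.logging.warning(f"{E}") here
        return False

print(process_log("abcd1234", 120))  # False: too few log entries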

folding/validators/forward.py (+9 -1)

@@ -49,12 +49,20 @@ def run_ping_step(self, uids: List[int], timeout: float) -> Dict:
 
 def run_step(
     self,
-    protein: Protein,
+    protein: Protein | None,
     uids: List[int],
     timeout: float,
     mdrun_args="",  # '-ntomp 64' # limit the number of threads to 64
 ) -> Dict:
     start_time = time.time()
+    if protein is None:
+        event = {
+            "block": self.block,
+            "step_length": time.time() - start_time,
+            "energies": [],
+            "active": False,
+        }
+        return event
 
     # Get the list of uids to query for this step.
     axons = [self.metagraph.axons[uid] for uid in uids]
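Together with the Protein.from_job change below, this gives callers a uniform contract: a None protein yields an inactive, empty-energies event rather than an exception. A self-contained toy version of the new guard (run_step_sketch is hypothetical; the real event also carries self.block, omitted here because it needs validator state):

import time
from typing import Dict, List, Optional

def run_step_sketch(protein: Optional[object], uids: List[int], timeout: float) -> Dict:
    """Toy version of the guard added to run_step above."""
    start_time = time.time()
    if protein is None:
        return {
            "step_length": time.time() - start_time,
            "energies": [],
            "active": False,
        }
    # ... the real method queries miners here ...
    return {"energies": [-1000.0], "active": True}

event = run_step_sketch(protein=None, uids=[1, 2], timeout=10.0)
assert event["active"] is False and event["energies"] == []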

folding/validators/protein.py (+43 -22)

@@ -1,34 +1,33 @@
+import os
 import time
 import glob
-import os
-import pickle
+import base64
 import random
-import re
 import shutil
 from collections import defaultdict
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, List, Literal
-import base64
 
-import plotly.express as px
 import bittensor as bt
-import openmm as mm
-import pandas as pd
 import numpy as np
+import pandas as pd
+import plotly.express as px
 from openmm import app, unit
 from pdbfixer import PDBFixer
+
+from folding.base.simulation import OpenMMSimulation
 from folding.store import Job
 from folding.utils.opemm_simulation_config import SimulationConfig
 from folding.utils.ops import (
+    OpenMMException,
+    ValidationError,
     check_and_download_pdbs,
     check_if_directory_exists,
     load_pdb_ids,
     select_random_pdb_id,
     write_pkl,
 )
-from folding.store import Job
-from folding.base.simulation import OpenMMSimulation
 
 ROOT_DIR = Path(__file__).resolve().parents[2]

@@ -118,8 +117,8 @@ def from_job(job: Job, config: Dict):
             bt.logging.error(
                 f"from_job failed for {protein.pdb_id} with Exception {E}."
             )
-        finally:
-            return protein
+            return None
+        return protein
 
     @staticmethod
     def load_pdb_as_string(pdb_path: str) -> str:
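Why this hunk matters: a return inside finally always wins, even when an exception was just caught (and it silently swallows uncaught ones), so the old from_job handed the caller a partially initialized Protein after a failure. A self-contained demonstration with stand-in names:

def old_from_job_shape():
    protein = None
    try:
        protein = object()          # stands in for constructing the Protein
        raise RuntimeError("boom")  # stands in for a failing setup step
    except RuntimeError:
        pass                        # the error is logged...
    finally:
        return protein              # ...but this STILL returns the partial object

def new_from_job_shape():
    try:
        protein = object()
        raise RuntimeError("boom")
    except RuntimeError:
        return None                 # failure is now visible to the caller
    return protein

print(old_from_job_shape() is None)  # False -> the bug: caller gets a broken object
print(new_from_job_shape() is None)  # True  -> caller can guard on None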
@@ -185,7 +184,7 @@ def read_and_return_files(self, filenames: List) -> Dict:
                         name
                     ] = f.read()  # This would be the pdb file.
 
-            except Exception as E:
+            except Exception:
                 continue
         return files_to_return
@@ -221,6 +220,13 @@ def setup_simulation(self):
 
         self.pdb_complexity = Protein._get_pdb_complexity(self.pdb_location)
         self.init_energy = self.calc_init_energy()
+
+        # Checking if init energy is nan
+        if np.isnan(self.init_energy):
+            raise OpenMMException(
+                f"Failed to calculate initial energy for {self.pdb_id}"
+            )
+
         self._calculate_epsilon()
 
     def __str__(self):
@@ -355,6 +361,9 @@ def get_miner_data_directory(self, hotkey: str):
     def process_md_output(
         self, md_output: dict, seed: int, state: str, hotkey: str
     ) -> bool:
+        MIN_LOGGING_ENTRIES = 500
+        MIN_SIMULATION_STEPS = 5000
+
         required_files_extensions = ["cpt", "log"]
         hotkey_alias = hotkey[:8]
         self.current_state = state
@@ -409,11 +418,16 @@ def process_md_output(
             self.log_file = pd.read_csv(log_file_path)
             self.log_step = self.log_file['#"Step"'].iloc[-1]
 
+            # Checks to see if we have enough steps in the log file to start validation
+            if len(self.log_file) < MIN_LOGGING_ENTRIES:
+                raise ValidationError(
+                    f"Miner {hotkey_alias} did not run enough steps in the simulation... Skipping!"
+                )
+
             # Make sure that we are enough steps ahead in the log file compared to the checkpoint file.
-            # Checks if log_file is 5000 steps ahead of checkpoint AND that the log_file has at least 5000 steps
-            if (
-                self.log_step - self.simulation.currentStep
-            ) < 5000 and len(self.log_file) >= 5000:
+            # Checks if log_file is MIN_SIMULATION_STEPS ahead of the checkpoint
+            if (self.log_step - self.simulation.currentStep) < MIN_SIMULATION_STEPS:
+                # If the miner did not run enough steps, we will load the old checkpoint
                 checkpoint_path = os.path.join(
                     self.miner_data_directory, f"{self.current_state}_old.cpt"
                 )
@@ -422,13 +436,17 @@ def process_md_output(
                         f"Miner {hotkey_alias} did not run enough steps since last checkpoint... Loading old checkpoint"
                     )
                     self.simulation.loadCheckpoint(checkpoint_path)
+                    # Checking to see if the old checkpoint has enough steps to validate
+                    if (
+                        self.log_step - self.simulation.currentStep
+                    ) < MIN_SIMULATION_STEPS:
+                        raise ValidationError(
+                            f"Miner {hotkey_alias} did not run enough steps in the simulation... Skipping!"
+                        )
                 else:
-                    bt.logging.warning(
+                    raise ValidationError(
                         f"Miner {hotkey_alias} did not run enough steps and no old checkpoint found... Skipping!"
                     )
-                    return False
-                else:
-                    self.simulation.loadCheckpoint(checkpoint_path)
 
             self.cpt_step = self.simulation.currentStep
             self.checkpoint_path = checkpoint_path
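The fall-back logic in the two hunks above reduces to: if the log is not far enough ahead of the current checkpoint, load the old checkpoint and re-check; fail the miner if no old checkpoint exists or the gap is still too small. A condensed, self-contained sketch (validate_progress and its arguments are stand-ins for validator state):

MIN_SIMULATION_STEPS = 5000

class ValidationError(Exception):
    pass  # local stand-in for the exception added in folding/utils/ops.py

def validate_progress(log_step: int, current_step: int,
                      old_checkpoint_step: int | None) -> None:
    if (log_step - current_step) < MIN_SIMULATION_STEPS:
        if old_checkpoint_step is None:
            raise ValidationError("not enough steps and no old checkpoint found")
        # Fall back to the older checkpoint, then re-check the step gap.
        if (log_step - old_checkpoint_step) < MIN_SIMULATION_STEPS:
            raise ValidationError("not enough steps in the simulation")

validate_progress(log_step=20000, current_step=16000, old_checkpoint_step=9000)  # passes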
@@ -444,6 +462,10 @@ def process_md_output(
                 write_mode="wb",
             )
 
+        except ValidationError as E:
+            bt.logging.warning(f"{E}")
+            return False
+
         except Exception as e:
             bt.logging.error(f"Failed to recreate simulation: {e}")
             return False
@@ -500,7 +522,6 @@ def is_run_valid(self):
 
         # calculating absolute percent difference per step
         percent_diff = abs(((check_energies - miner_energies) / miner_energies) * 100)
-        min_length = len(percent_diff)
 
         # This is some debugging information for plotting the information from the miner.
         df = pd.DataFrame([check_energies, miner_energies]).T
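The deleted min_length assignment was dead code; the percent-difference expression itself is unchanged. A tiny runnable example of what it computes, with dummy energies:

import numpy as np

# Dummy energies: the validator's re-run vs. the miner's reported values.
check_energies = np.array([-1002.0, -1010.5, -1021.3])
miner_energies = np.array([-1000.0, -1012.0, -1020.0])

# Same expression as in the hunk above: absolute percent difference per step.
percent_diff = abs(((check_energies - miner_energies) / miner_energies) * 100)
print(percent_diff)  # [0.2        0.14822134 0.12745098]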
@@ -559,7 +580,7 @@ def remove_pdb_directory(self):
         """
         shutil.rmtree(self.pdb_directory)
 
-    def calc_init_energy(self):
+    def calc_init_energy(self) -> float:
         """Calculate the potential energy from an edr file using gmx energy.
         Args:
             output_dir (str): directory containing the edr file

neurons/validator.py (+8 -10)

@@ -247,10 +247,6 @@ def update_job(self, job: Job):
         energies = torch.Tensor(job.event["energies"])
         rewards = torch.zeros(len(energies))  # one-hot per update step
 
-        # TODO: we need to get the commit and gro hashes from the best hotkey
-        commit_hash = ""  # For next time
-        gro_hash = ""  # For next time
-
         best_index = np.argmin(energies)
         best_loss = energies[best_index].item()  # item because it's a torch.tensor
         best_hotkey = serving_hotkeys[best_index]
@@ -259,8 +255,6 @@ def update_job(self, job: Job):
             hotkeys=serving_hotkeys,
             loss=best_loss,
             hotkey=best_hotkey,
-            commit_hash=commit_hash,
-            gro_hash=gro_hash,
         )
 
         # If no miners respond appropriately, the energies will be all zeros
@@ -316,10 +310,14 @@ def prepare_event_for_logging(event: Dict):
         # If the job is finished, remove the pdb directory
         pdb_location = None
         protein = Protein.from_job(job=job, config=self.config.protein)
-        if job.active is False:
-            protein.remove_pdb_directory()
-        elif event["updated_count"] == 1:
-            pdb_location = protein.pdb_location
+
+        if protein is not None:
+            if job.active is False:
+                protein.remove_pdb_directory()
+            elif event["updated_count"] == 1:
+                pdb_location = protein.pdb_location
+        else:
+            bt.logging.error(f"Protein.from_job returns NONE for protein {job.pdb}")
 
         log_event(self, event=event, pdb_location=pdb_location)
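For reference, the best-miner selection feeding Job.update in the first hunk reduces to the following worked example (dummy energies; the hotkeys are hypothetical):

import numpy as np
import torch

# Dummy energies for three responding hotkeys; more negative is better.
energies = torch.Tensor([-1200.0, -1350.0, -1100.0])
serving_hotkeys = ["hk_a", "hk_b", "hk_c"]

best_index = np.argmin(energies)          # CPU torch tensors work with np.argmin
best_loss = energies[best_index].item()   # .item() -> plain Python float
best_hotkey = serving_hotkeys[best_index]
print(best_hotkey, best_loss)             # hk_b -1350.0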
