Skip to content

Commit 070ff67

Browse files
Merge pull request #258 from macrocosm-os/staging
Staging
2 parents 07a6055 + 2e9fa99 commit 070ff67

File tree

10 files changed

+61
-35
lines changed

10 files changed

+61
-35
lines changed

environment.yml

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ dependencies:
1313
- gzip==1.13
1414
- pip:
1515
- bittensor==6.9.4
16+
- tenacity==9.0.0
1617
- python-dotenv==1.0.1
1718
- wandb==0.17.2
1819
- eth-utils==2.1.1

folding/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from .protocol import JobSubmissionSynapse
22
from .validators.protein import Protein
33

4-
__version__ = "1.1.0"
4+
__version__ = "1.1.1"
55
version_split = __version__.split(".")
66
__spec_version__ = (
77
(10000 * int(version_split[0]))

folding/base/neuron.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import os
2222

2323
import openmm
24+
from tenacity import RetryError
2425

2526
# Sync calls set weights and also resyncs the metagraph.
2627
from folding.utils.config import check_config, add_args, config
@@ -154,7 +155,11 @@ def sync(self):
154155
self.resync_metagraph()
155156

156157
if self.should_set_weights():
157-
self.set_weights()
158+
try:
159+
self.set_weights()
160+
except RetryError as e:
161+
bt.logging.error(f"Failed to set weights after retry attempts. Skipping for {self.config.neuron.epoch_length} blocks.")
162+
158163

159164
# Always save state.
160165
self.save_state()

folding/base/validator.py

+14-10
Original file line numberDiff line numberDiff line change
@@ -19,21 +19,21 @@
1919
import os
2020
import json
2121
import copy
22-
import time
2322
import torch
2423
import asyncio
2524
import argparse
2625
import threading
2726
import bittensor as bt
2827

2928
from typing import List
30-
from traceback import print_exception
29+
from pathlib import Path
3130

3231
from folding.base.neuron import BaseNeuron
3332
from folding.mock import MockDendrite
3433
from folding.utils.config import add_validator_args
34+
from folding.utils.ops import print_on_retry
3535

36-
from pathlib import Path
36+
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result
3737

3838
ROOT_DIR = Path(__file__).resolve().parents[2]
3939

@@ -67,9 +67,6 @@ def __init__(self, config=None):
6767
self.metagraph.n, dtype=torch.float32, device=self.device
6868
)
6969

70-
# Init sync with the network. Updates the metagraph.
71-
self.sync()
72-
7370
# Serve axon to enable external connections.
7471
if not self.config.neuron.axon_off:
7572
self.serve_axon()
@@ -110,6 +107,14 @@ def serve_axon(self):
110107
bt.logging.error(f"Failed to create Axon initialize with exception: {e}")
111108
pass
112109

110+
@retry(
111+
stop=stop_after_attempt(3), # Stop after 3 total attempts (the initial call plus at most 2 retries)
112+
wait=wait_fixed(1), # Wait 1 second between retries
113+
retry=retry_if_result(
114+
lambda result: result is False
115+
), # Retry if the result is False
116+
after=print_on_retry
117+
)
113118
def set_weights(self):
114119
"""
115120
Sets the validator weights to the metagraph hotkeys based on the scores it has received from the miners. The weights determine the trust and incentive level the validator assigns to miner nodes on the network.
@@ -161,10 +166,9 @@ def set_weights(self):
161166
wait_for_inclusion=False,
162167
version_key=self.spec_version,
163168
)
164-
if result is True:
165-
bt.logging.info("set_weights on chain successfully!")
166-
else:
167-
bt.logging.error("set_weights failed")
169+
170+
171+
return result
168172

169173
def resync_metagraph(self):
170174
"""Resyncs the metagraph and updates the hotkeys and moving averages based on the new metagraph."""

folding/utils/config.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def add_args(cls, parser):
8282
"--neuron.epoch_length",
8383
type=int,
8484
help="The default epoch length (how often we set weights, measured in 12 second blocks).",
85-
default=100,
85+
default=250,
8686
)
8787

8888
parser.add_argument(

folding/utils/logging.py

+10-13
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
from typing import List
44
from loguru import logger
55
from dataclasses import asdict, dataclass
6-
import datetime as dt
76
import os
8-
import asyncio
97

108
import folding
119
import bittensor as bt
@@ -36,7 +34,7 @@ def should_reinit_wandb(self):
3634
)
3735

3836

39-
async def init_wandb(self, pdb_id: str, reinit=True, failed=False):
37+
def init_wandb(self, pdb_id: str, reinit=True, failed=False):
4038
"""Starts a new wandb run."""
4139

4240
tags = [
@@ -53,7 +51,6 @@ async def init_wandb(self, pdb_id: str, reinit=True, failed=False):
5351
tags.append("mock")
5452
if self.config.neuron.disable_set_weights:
5553
tags.append("disable_set_weights")
56-
tags.append("async")
5754
wandb_config = {
5855
key: copy.deepcopy(self.config.get(key, None))
5956
for key in ("neuron", "reward", "netuid", "wandb")
@@ -62,7 +59,7 @@ async def init_wandb(self, pdb_id: str, reinit=True, failed=False):
6259

6360
id = None if pdb_id not in self.wandb_ids.keys() else self.wandb_ids[pdb_id]
6461

65-
run = await asyncio.to_thread(wandb.init,
62+
run = wandb.init(
6663
anonymous="allow",
6764
name=pdb_id,
6865
reinit=reinit,
@@ -93,27 +90,27 @@ async def init_wandb(self, pdb_id: str, reinit=True, failed=False):
9390
return run
9491

9592

96-
async def log_protein(run, pdb_id_path: str):
93+
def log_protein(run, pdb_id_path: str):
9794
"""Logs the protein visualization to wandb.
9895
pdb_id_path: str: path to the pdb file on disk.
9996
"""
10097
try:
101-
await asyncio.to_thread(run.log,{"protein_vis": wandb.Molecule(pdb_id_path)})
98+
run.log({"protein_vis": wandb.Molecule(pdb_id_path)})
10299
except:
103100
bt.logging.warning("Failed to log protein visualization")
104101

105102

106-
async def log_folded_protein(run, pdb_id_path: str):
103+
def log_folded_protein(run, pdb_id_path: str):
107104
"""Logs the folded protein visualization to wandb.
108105
pdb_id_path: str: path to the pdb file on disk.
109106
"""
110107
try:
111-
await asyncio.to_thread(run.log,{"folded_protein_vis": wandb.Molecule(pdb_id_path)})
108+
run.log({"folded_protein_vis": wandb.Molecule(pdb_id_path)})
112109
except:
113110
bt.logging.warning("Failed to log folded protein visualization")
114111

115112

116-
async def log_event(
113+
def log_event(
117114
self,
118115
event,
119116
failed=False,
@@ -127,16 +124,16 @@ async def log_event(
127124
return
128125
pdb_id = event["pdb_id"]
129126

130-
run = await init_wandb(self, pdb_id=pdb_id, failed=failed)
127+
run = init_wandb(self, pdb_id=pdb_id, failed=failed)
131128

132129
# Log the event to wandb.
133130
run.log(event)
134131
wandb.save(os.path.join(self.config.neuron.full_path, f"events.log"))
135132

136133
if pdb_location is not None:
137-
await log_protein(run, pdb_id_path=pdb_location)
134+
log_protein(run, pdb_id_path=pdb_location)
138135
if folded_protein_location is not None:
139-
await log_folded_protein(run, pdb_id_path=folded_protein_location)
136+
log_folded_protein(run, pdb_id_path=folded_protein_location)
140137
wandb.save(folded_protein_location)
141138

142139
run.finish()

folding/utils/ops.py

+6
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ def wrapper(*args, **kwargs):
7171

7272
return decorator
7373

74+
def print_on_retry(retry_state):
75+
function_name = retry_state.fn.__name__
76+
max_retries = retry_state.retry_object.stop.max_attempt_number
77+
bt.logging.warning(f"Retrying {function_name}: retry #{retry_state.attempt_number} out of {max_retries}")
78+
79+
7480

7581
def delete_directory(directory: str):
7682
"""We create a lot of files in the process of tracking pdb files.

folding/validators/forward.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ async def create_new_challenge(self, exclude: List) -> Dict:
183183
event["hp_search_time"] = time.time() - forward_start_time
184184

185185
# only log the event if the simulation was not successful
186-
await log_event(self, event, failed=True)
186+
log_event(self, event, failed=True)
187187
bt.logging.debug(
188188
f"❌❌ All hyperparameter combinations failed for pdb_id {pdb_id}.. Skipping! ❌❌"
189189
)

folding/validators/protein.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,7 @@ def is_run_valid(self):
587587
def get_ns_computed(self):
588588
"""Calculate the number of nanoseconds computed by the miner."""
589589

590-
return (self.cpt_step * self.system_config.time_step_size) / 1e6
590+
return (self.cpt_step * self.system_config.time_step_size) / 1e3
591591

592592
def save_pdb(self, output_path: str):
593593
"""Save the pdb file to the output path."""

neurons/validator.py

+20-7
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ def __init__(self, config=None):
5050

5151
self.load_state()
5252

53+
# Init sync with the network. Updates the metagraph.
54+
self.sync()
55+
5356
# TODO: Change the store to SQLiteJobStore if you want to use SQLite
5457
self.store = SQLiteJobStore()
5558
self.mdrun_args = self.parse_mdrun_args()
@@ -322,7 +325,7 @@ async def prepare_event_for_logging(event: Dict):
322325
bt.logging.error(f"Protein.from_job returns NONE for protein {job.pdb}")
323326

324327
# Remove these keys from the log because they pollute the terminal.
325-
await log_event(
328+
log_event(
326329
self,
327330
event=event,
328331
pdb_location=pdb_location,
@@ -366,20 +369,28 @@ async def create_jobs(self):
366369
# Here is where we select, download and preprocess a pdb
367370
# We also assign the pdb to a group of workers (miners), based on their workloads
368371
await self.add_jobs(k=self.config.neuron.queue_size - queue.qsize())
369-
bt.logging.info(
370-
f"Sleeping {self.config.neuron.update_interval} seconds before next job creation loop."
371-
)
372+
bt.logging.info(
373+
f"Sleeping 60 seconds before next job creation loop."
374+
)
375+
else:
376+
bt.logging.info(
377+
"Job queue is full. Sleeping 60 seconds before next job creation loop."
378+
)
379+
372380
except Exception as e:
373381
bt.logging.error(f"Error in create_jobs: {e}")
374-
await asyncio.sleep(self.config.neuron.update_interval)
382+
383+
await asyncio.sleep(60)
375384

376385
async def update_jobs(self):
377386
while True:
378387
try:
379-
bt.logging.info(f"step({self.step}) block({self.block})")
380-
388+
# Wait at the beginning of update_jobs since we want to avoid attempting to update jobs before we get data back.
381389
await asyncio.sleep(self.config.neuron.update_interval)
390+
382391
bt.logging.info("Updating jobs.")
392+
bt.logging.info(f"step({self.step}) block({self.block})")
393+
383394
for job in self.store.get_queue(ready=False).queue:
384395
# Remove any deregistered hotkeys from current job. This will update the store when the job is updated.
385396
if not job.check_for_available_hotkeys(self.metagraph.hotkeys):
@@ -404,7 +415,9 @@ async def update_jobs(self):
404415
await self.update_job(job=job)
405416
except Exception as e:
406417
bt.logging.error(f"Error in update_jobs: {e}")
418+
407419
self.step += 1
420+
408421
bt.logging.info(
409422
f"Sleeping {self.config.neuron.update_interval} seconds before next job update loop."
410423
)

0 commit comments

Comments
 (0)