Skip to content

Commit 07a6055

Browse files
authored
Merge pull request #249 from macrocosm-os/staging
2 parents 21e5576 + 65a4d2e commit 07a6055

16 files changed

+655
-489
lines changed

environment.yml

+3-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ dependencies:
1010
- loguru=0.7.0
1111
- pip=24.2
1212
- pdbfixer=1.9
13+
- gzip==1.13
1314
- pip:
1415
- bittensor==6.9.4
1516
- python-dotenv==1.0.1
@@ -18,4 +19,5 @@ dependencies:
1819
- numpy==1.26.4
1920
- parmed==4.2.2
2021
- plotly==5.22.0
21-
- kaleido==0.2.1
22+
- kaleido==0.2.1
23+
- async-timeout==4.0.3

folding/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from .protocol import JobSubmissionSynapse
22
from .validators.protein import Protein
33

4-
__version__ = "1.0.6"
4+
__version__ = "1.1.0"
55
version_split = __version__.split(".")
66
__spec_version__ = (
77
(10000 * int(version_split[0]))

folding/base/neuron.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -122,17 +122,19 @@ def check_openmm_version(self):
122122
raise e
123123

124124
bt.logging.success(f"Running OpenMM version: {self.openmm_version}")
125-
125+
126126
def setup_wandb_logging(self):
127127
if os.path.isfile(f"{self.config.neuron.full_path}/wandb_ids.pkl"):
128-
self.wandb_ids = load_pkl(f"{self.config.neuron.full_path}/wandb_ids.pkl", "rb")
128+
self.wandb_ids = load_pkl(
129+
f"{self.config.neuron.full_path}/wandb_ids.pkl", "rb"
130+
)
129131
else:
130132
self.wandb_ids = {}
131-
133+
132134
def add_wandb_id(self, pdb_id: str, wandb_id: str):
133135
self.wandb_ids[pdb_id] = wandb_id
134136
write_pkl(self.wandb_ids, f"{self.config.neuron.full_path}/wandb_ids.pkl", "wb")
135-
137+
136138
def remove_wandb_id(self, pdb_id: str):
137139
self.wandb_ids.pop(pdb_id)
138140
write_pkl(self.wandb_ids, f"{self.config.neuron.full_path}/wandb_ids.pkl", "wb")
@@ -141,10 +143,6 @@ def remove_wandb_id(self, pdb_id: str):
141143
async def forward(self, synapse: bt.Synapse) -> bt.Synapse:
142144
...
143145

144-
@abstractmethod
145-
def run(self):
146-
...
147-
148146
def sync(self):
149147
"""
150148
Wrapper for synchronizing the state of the network for the given miner or validator.

folding/base/validator.py

+1-153
Original file line numberDiff line numberDiff line change
@@ -110,158 +110,6 @@ def serve_axon(self):
110110
bt.logging.error(f"Failed to create Axon initialize with exception: {e}")
111111
pass
112112

113-
async def concurrent_forward(self):
114-
coroutines = [
115-
self.forward() for _ in range(self.config.neuron.num_concurrent_forwards)
116-
]
117-
await asyncio.gather(*coroutines)
118-
119-
def run(self):
120-
"""
121-
Initiates and manages the main loop for the miner on the Bittensor network. The main loop handles graceful shutdown on keyboard interrupts and logs unforeseen errors.
122-
123-
This function performs the following primary tasks:
124-
1. Check for registration on the Bittensor network.
125-
2. Continuously forwards queries to the miners on the network, rewarding their responses and updating the scores accordingly.
126-
3. Periodically resynchronizes with the chain; updating the metagraph with the latest network state and setting weights.
127-
128-
The essence of the validator's operations is in the forward function, which is called every step. The forward function is responsible for querying the network and scoring the responses.
129-
130-
Note:
131-
- The function leverages the global configurations set during the initialization of the miner.
132-
- The miner's axon serves as its interface to the Bittensor network, handling incoming and outgoing requests.
133-
134-
Raises:
135-
KeyboardInterrupt: If the miner is stopped by a manual interruption.
136-
Exception: For unforeseen errors during the miner's operation, which are logged for diagnosis.
137-
"""
138-
139-
# Check that validator is registered on the network.
140-
self.sync()
141-
142-
bt.logging.info(f"Validator starting at block: {self.block}")
143-
144-
# This loop maintains the validator's operations until intentionally stopped.
145-
try:
146-
while True:
147-
# Our BaseValidator logic is intentionally as generic as possible so that the Validator neuron can apply problem-specific logic
148-
bt.logging.info(f"step({self.step}) block({self.block})")
149-
150-
# Check if we need to add more jobs to the queue
151-
queue = self.store.get_queue(ready=False)
152-
if queue.qsize() < self.config.neuron.queue_size:
153-
# Potential situation where (sample_size * queue_size) > available uids on the metagraph.
154-
# Therefore, this product must be less than the number of uids on the metagraph.
155-
if (
156-
self.config.neuron.sample_size * self.config.neuron.queue_size
157-
) > self.metagraph.n:
158-
raise ValueError(
159-
f"sample_size * queue_size must be less than the number of uids on the metagraph ({self.metagraph.n})."
160-
)
161-
162-
bt.logging.debug(f"✅ Creating jobs! ✅")
163-
# Here is where we select, download and preprocess a pdb
164-
# We also assign the pdb to a group of workers (miners), based on their workloads
165-
self.add_jobs(k=self.config.neuron.queue_size - queue.qsize())
166-
167-
# TODO: maybe concurrency for the loop below
168-
for job in self.store.get_queue(ready=False).queue:
169-
# Remove any deregistered hotkeys from current job. This will update the store when the job is updated.
170-
if not job.check_for_available_hotkeys(self.metagraph.hotkeys):
171-
self.store.update(job=job)
172-
continue
173-
174-
# Here we straightforwardly query the workers associated with each job and update the jobs accordingly
175-
job_event = self.forward(job=job)
176-
177-
# If we don't have any miners reply to the query, we will make it inactive.
178-
if len(job_event["energies"]) == 0:
179-
job.active = False
180-
self.store.update(job=job)
181-
continue
182-
183-
if isinstance(job.event, str):
184-
job.event = eval(job.event) # if str, convert to dict.
185-
186-
job.event.update(job_event)
187-
# Determine the status of the job based on the current energy and the previous values (early stopping)
188-
# Update the DB with the current status
189-
self.update_job(job)
190-
191-
# Check if we should exit.
192-
if self.should_exit:
193-
break
194-
195-
# Sync metagraph and potentially set weights.
196-
self.sync()
197-
198-
self.step += 1
199-
bt.logging.warning(
200-
f"Sleeping for {self.config.neuron.update_interval} before resampling..."
201-
)
202-
time.sleep(self.config.neuron.update_interval)
203-
204-
# If someone intentionally stops the validator, it'll safely terminate operations.
205-
except KeyboardInterrupt:
206-
self.axon.stop()
207-
bt.logging.debug("Validator killed by keyboard interrupt.")
208-
exit()
209-
210-
# In case of unforeseen errors, the validator will log the error and continue operations.
211-
except Exception as err:
212-
bt.logging.error("Error during validation", str(err))
213-
bt.logging.debug(print_exception(type(err), err, err.__traceback__))
214-
self.should_exit = True
215-
216-
def run_in_background_thread(self):
217-
"""
218-
Starts the validator's operations in a background thread upon entering the context.
219-
This method facilitates the use of the validator in a 'with' statement.
220-
"""
221-
if not self.is_running:
222-
bt.logging.debug("Starting validator in background thread.")
223-
self.should_exit = False
224-
self.thread = threading.Thread(target=self.run, daemon=True)
225-
self.thread.start()
226-
self.is_running = True
227-
bt.logging.debug("Started")
228-
229-
def stop_run_thread(self):
230-
"""
231-
Stops the validator's operations that are running in the background thread.
232-
"""
233-
if self.is_running:
234-
bt.logging.debug("Stopping validator in background thread.")
235-
self.should_exit = True
236-
self.thread.join(5)
237-
self.is_running = False
238-
bt.logging.debug("Stopped")
239-
240-
def __enter__(self):
241-
# self.run_in_background_thread()
242-
self.run()
243-
return self
244-
245-
def __exit__(self, exc_type, exc_value, traceback):
246-
"""
247-
Stops the validator's background operations upon exiting the context.
248-
This method facilitates the use of the validator in a 'with' statement.
249-
250-
Args:
251-
exc_type: The type of the exception that caused the context to be exited.
252-
None if the context was exited without an exception.
253-
exc_value: The instance of the exception that caused the context to be exited.
254-
None if the context was exited without an exception.
255-
traceback: A traceback object encoding the stack trace.
256-
None if the context was exited without an exception.
257-
"""
258-
if self.is_running:
259-
bt.logging.debug("Stopping validator in background thread.")
260-
self.should_exit = True
261-
self.thread.join(5)
262-
self.is_running = False
263-
bt.logging.debug("Stopped")
264-
265113
def set_weights(self):
266114
"""
267115
Sets the validator weights to the metagraph hotkeys based on the scores it has received from the miners. The weights determine the trust and incentive level the validator assigns to miner nodes on the network.
@@ -352,7 +200,7 @@ def resync_metagraph(self):
352200
# Update the hotkeys.
353201
self.hotkeys = copy.deepcopy(self.metagraph.hotkeys)
354202

355-
def update_scores(self, rewards: torch.FloatTensor, uids: List[int]):
203+
async def update_scores(self, rewards: torch.FloatTensor, uids: List[int]):
356204
"""Performs exponential moving average on the scores based on the rewards received from the miners."""
357205

358206
# Check if rewards contains NaN values.

folding/rewards/reward_pipeline.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from folding.rewards.linear_reward import divide_decreasing
55

66

7-
def reward_pipeline(
7+
async def reward_pipeline(
88
energies: torch.Tensor, rewards: torch.Tensor, top_reward: float, job: Job
99
):
1010
"""A reward pipeline that determines how to place rewards onto the miners sampled within the batch.

0 commit comments

Comments
 (0)