From 2d80f4af670e3bb2f40c8967f106ae26b220285e Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Thu, 27 Mar 2025 15:57:52 -0300 Subject: [PATCH 01/20] fix(dependencies): adding missing dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 0be4f66..97419f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "py-ed25519-zebra-bindings==1.1.0", "py-sr25519-bindings==0.2.1", "nats-py==2.6.0", + "scikit-learn", "bittensor", "fiber @ git+https://github.com/5u6r054/fiber.git@fix/remove-bittensor-commit-reveal-dependency" ] From 6f11a5803467f921d633c5362b53d013362485d1 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Tue, 1 Apr 2025 00:15:54 -0300 Subject: [PATCH 02/20] feat(validator): track worker_id --- db/routing_table_database.py | 151 +++++++++++++++++++++++++++++++-- db/telemetry_database.py | 9 +- interfaces/types.py | 1 + miner/routes_manager.py | 27 ++++++ validator/api_routes.py | 109 +++++++++++++++++++++++- validator/node_manager.py | 86 ++++++++++++++++++- validator/routing_table.py | 91 ++++++++++++++++---- validator/scorer.py | 3 +- validator/telemetry_storage.py | 15 +++- 9 files changed, 457 insertions(+), 35 deletions(-) diff --git a/db/routing_table_database.py b/db/routing_table_database.py index fbe7a20..525ae83 100644 --- a/db/routing_table_database.py +++ b/db/routing_table_database.py @@ -7,6 +7,7 @@ def __init__(self, db_path="./miner_tee_addresses.db"): self.db_path = db_path self.lock = Lock() self._create_table() + self._create_worker_registry_table() def _create_table(self): with self.lock, sqlite3.connect(self.db_path) as conn: @@ -17,36 +18,60 @@ def _create_table(self): hotkey TEXT, uid TEXT, address TEXT UNIQUE, + worker_id TEXT, timestamp DATETIME DEFAULT CURRENT_TIMESTAMP ) """ ) conn.commit() - def add_address(self, hotkey, uid, address): + def _create_worker_registry_table(self): with self.lock, sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() cursor.execute( """ - INSERT INTO miner_addresses (hotkey, uid, address) - VALUES (?, ?, ?) - """, - (hotkey, uid, address), + CREATE TABLE IF NOT EXISTS worker_registry ( + worker_id TEXT PRIMARY KEY, + hotkey TEXT NOT NULL, + timestamp DATETIME DEFAULT CURRENT_TIMESTAMP + ) + """ ) conn.commit() - def update_address(self, hotkey, uid, new_address): + def add_address(self, hotkey, uid, address, worker_id=None): with self.lock, sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() cursor.execute( """ - UPDATE miner_addresses SET address = ? - WHERE hotkey = ? AND uid = ? + INSERT INTO miner_addresses (hotkey, uid, address, worker_id) + VALUES (?, ?, ?, ?) """, - (new_address, hotkey, uid), + (hotkey, uid, address, worker_id), ) conn.commit() + def update_address(self, hotkey, uid, new_address, worker_id=None): + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + if worker_id is not None: + cursor.execute( + """ + UPDATE miner_addresses SET address = ?, worker_id = ? + WHERE hotkey = ? AND uid = ? + """, + (new_address, worker_id, hotkey, uid), + ) + else: + cursor.execute( + """ + UPDATE miner_addresses SET address = ? + WHERE hotkey = ? AND uid = ? 
+ """, + (new_address, hotkey, uid), + ) + conn.commit() + def delete_address(self, hotkey, uid): with self.lock, sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() @@ -72,3 +97,111 @@ def clean_old_entries(self): """ ) conn.commit() + + def register_worker(self, worker_id, hotkey): + """ + Register a worker_id with a hotkey in the worker registry. + If the worker_id already exists, it will update the hotkey. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + INSERT OR REPLACE INTO worker_registry (worker_id, hotkey) + VALUES (?, ?) + """, + (worker_id, hotkey), + ) + conn.commit() + + def unregister_worker(self, worker_id): + """ + Remove a worker_id from the worker registry. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + DELETE FROM worker_registry + WHERE worker_id = ? + """, + (worker_id,), + ) + conn.commit() + + def unregister_workers_by_hotkey(self, hotkey): + """ + Remove all worker_ids associated with a hotkey from the registry. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + DELETE FROM worker_registry + WHERE hotkey = ? + """, + (hotkey,), + ) + conn.commit() + + def get_worker_hotkey(self, worker_id): + """ + Get the hotkey associated with a worker_id from the registry. + Returns None if the worker_id is not registered. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + SELECT hotkey FROM worker_registry + WHERE worker_id = ? + """, + (worker_id,), + ) + result = cursor.fetchone() + return result[0] if result else None + + def get_workers_by_hotkey(self, hotkey): + """ + Get all worker_ids associated with a hotkey from the registry. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + SELECT worker_id FROM worker_registry + WHERE hotkey = ? + """, + (hotkey,), + ) + results = cursor.fetchall() + return [row[0] for row in results] + + def get_all_worker_registrations(self): + """ + Get all worker_id and hotkey pairs from the registry. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + SELECT worker_id, hotkey FROM worker_registry + """ + ) + results = cursor.fetchall() + return [(row[0], row[1]) for row in results] + + def clean_old_worker_registrations(self, hours=24): + """ + Remove worker registrations older than the specified number of hours. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + DELETE FROM worker_registry + WHERE timestamp < datetime('now', ?) + """, + (f"-{hours} hours",), + ) + conn.commit() diff --git a/db/telemetry_database.py b/db/telemetry_database.py index 0b5823d..51ee3a7 100644 --- a/db/telemetry_database.py +++ b/db/telemetry_database.py @@ -28,7 +28,8 @@ def _create_table(self): twitter_returned_tweets INT, twitter_scrapes INT, web_errors INT, - web_success INT + web_success INT, + worker_id TEXT ) """ ) @@ -41,8 +42,9 @@ def add_telemetry(self, telemetry_data): """ INSERT INTO telemetry (hotkey, uid, boot_time, last_operation_time, current_time, twitter_auth_errors, twitter_errors, twitter_ratelimit_errors, twitter_returned_other, - twitter_returned_profiles, twitter_returned_tweets, twitter_scrapes, web_errors, web_success) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ twitter_returned_profiles, twitter_returned_tweets, twitter_scrapes, web_errors, web_success, + worker_id) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( telemetry_data.hotkey, @@ -59,6 +61,7 @@ def add_telemetry(self, telemetry_data): telemetry_data.twitter_scrapes, telemetry_data.web_errors, telemetry_data.web_success, + telemetry_data.worker_id, ), ) conn.commit() diff --git a/interfaces/types.py b/interfaces/types.py index b5dc1b5..cbc3d6e 100644 --- a/interfaces/types.py +++ b/interfaces/types.py @@ -20,6 +20,7 @@ class ConnectedNode(JSONSerializable): @dataclass class NodeData(JSONSerializable): hotkey: str + worker_id: str uid: int boot_time: int last_operation_time: int diff --git a/miner/routes_manager.py b/miner/routes_manager.py index 0bbe21e..2cdda8d 100644 --- a/miner/routes_manager.py +++ b/miner/routes_manager.py @@ -75,6 +75,13 @@ def register_routes(self) -> None: tags=["scoring"], ) + self.app.add_api_route( + "/custom-message", + self.custom_message_handler, + methods=["POST"], + tags=["monitor"], + ) + async def score_report_handler(self, request: Request): try: payload = await request.json() @@ -116,6 +123,26 @@ async def score_report_handler(self, request: Request): logger.error(f"\n\033[31mError processing score report: {str(e)}\033[0m") return {"status": "error", "message": str(e)} + async def custom_message_handler(self, request: Request): + try: + payload = await request.json() + message = payload.get("message", "No message provided") + sender = payload.get("sender", "Unknown") + + logger.info( + f"\n\033[36m" + f"====================================\n" + f" CUSTOM MESSAGE RECEIVED \n" + f"====================================\033[0m\n\n" + f" From: {sender}\n" + f" Message: {message}\n" + ) + + return {"status": "success", "received": True} + except Exception as e: + logger.error(f"\n\033[31mError processing custom message: {str(e)}\033[0m") + return {"status": "error", "message": str(e)} + async def healthcheck(self, request: Request): return healthcheck(self.miner) diff --git a/validator/api_routes.py b/validator/api_routes.py index 1162ad6..0c546b7 100644 --- a/validator/api_routes.py +++ b/validator/api_routes.py @@ -1,4 +1,4 @@ -from fastapi import FastAPI, Request +from fastapi import FastAPI def register_routes(app: FastAPI, healthcheck_func): @@ -30,6 +30,113 @@ def register_routes(self) -> None: tags=["healthcheck"], ) + # Add monitoring endpoints + self.app.add_api_route( + "/monitor/worker-registry", + self.monitor_worker_registry, + methods=["GET"], + tags=["monitoring"], + ) + + self.app.add_api_route( + "/monitor/routing-table", + self.monitor_routing_table, + methods=["GET"], + tags=["monitoring"], + ) + + self.app.add_api_route( + "/monitor/telemetry", + self.monitor_telemetry, + methods=["GET"], + tags=["monitoring"], + ) + + self.app.add_api_route( + "/monitor/telemetry/{hotkey}", + self.monitor_telemetry_by_hotkey, + methods=["GET"], + tags=["monitoring"], + ) + async def healthcheck(self): # Implement the healthcheck logic for the validator return self.validator.healthcheck() + + async def monitor_worker_registry(self): + """Return all worker registrations (worker_id to hotkey mappings)""" + try: + registrations = self.validator.routing_table.get_all_worker_registrations() + return { + "count": len(registrations), + "worker_registrations": [ + {"worker_id": worker_id, "hotkey": hotkey} + for worker_id, hotkey in registrations + ], + } + except Exception as e: + return {"error": str(e)} + + async def monitor_routing_table(self): 
+ """Return all miner addresses and their associated hotkeys""" + try: + addresses = self.validator.routing_table.get_all_addresses_with_hotkeys() + return { + "count": len(addresses), + "miner_addresses": [ + { + "hotkey": hotkey, + "address": address, + "worker_id": worker_id if worker_id else None, + } + for hotkey, address, worker_id in addresses + ], + } + except Exception as e: + return {"error": str(e)} + + async def monitor_telemetry(self): + """Return a list of hotkeys that have telemetry data""" + try: + hotkeys = self.validator.telemetry_storage.get_all_hotkeys_with_telemetry() + return {"count": len(hotkeys), "hotkeys": hotkeys} + except Exception as e: + return {"error": str(e)} + + async def monitor_telemetry_by_hotkey(self, hotkey: str): + """Return telemetry data for a specific hotkey""" + try: + telemetry_data = self.validator.telemetry_storage.get_telemetry_by_hotkey( + hotkey + ) + + # Convert NodeData objects to dictionaries + telemetry_dict_list = [] + for data in telemetry_data: + telemetry_dict = { + "hotkey": data.hotkey, + "uid": data.uid, + "timestamp": data.timestamp, + "boot_time": data.boot_time, + "last_operation_time": data.last_operation_time, + "current_time": data.current_time, + "twitter_auth_errors": data.twitter_auth_errors, + "twitter_errors": data.twitter_errors, + "twitter_ratelimit_errors": data.twitter_ratelimit_errors, + "twitter_returned_other": data.twitter_returned_other, + "twitter_returned_profiles": data.twitter_returned_profiles, + "twitter_returned_tweets": data.twitter_returned_tweets, + "twitter_scrapes": data.twitter_scrapes, + "web_errors": data.web_errors, + "web_success": data.web_success, + "worker_id": data.worker_id if hasattr(data, "worker_id") else None, + } + telemetry_dict_list.append(telemetry_dict) + + return { + "hotkey": hotkey, + "count": len(telemetry_dict_list), + "telemetry_data": telemetry_dict_list, + } + except Exception as e: + return {"error": str(e)} diff --git a/validator/node_manager.py b/validator/node_manager.py index dddb005..f31f13e 100644 --- a/validator/node_manager.py +++ b/validator/node_manager.py @@ -7,6 +7,7 @@ import sqlite3 from fiber.logging_utils import get_logger from interfaces.types import NodeData +from validator.telemetry import TEETelemetryClient if TYPE_CHECKING: from neurons.validator import Validator @@ -149,6 +150,47 @@ async def remove_disconnected_nodes(self): self.validator.connected_tee_list = [] await self.update_tee_list() + async def send_custom_message(self, node_hotkey: str, message: str) -> None: + """ + Send a custom message to a specific miner. + + Args: + node_hotkey (str): The miner's hotkey + message (str): The message to send + """ + try: + if node_hotkey not in self.connected_nodes: + logger.warning(f"No connected node found for hotkey {node_hotkey}") + return + + node = self.connected_nodes[node_hotkey] + uid = str( + self.validator.metagraph.nodes[ + self.validator.keypair.ss58_address + ].node_id + ) + payload = { + "message": message, + "sender": f"Validator {uid} ({self.validator.keypair.ss58_address})", + } + + response = await self.validator.http_client_manager.client.post( + f"http://{node.ip}:{node.port}/custom-message", json=payload + ) + + if response.status_code == 200: + logger.debug(f"Successfully sent custom message to miner {node_hotkey}") + else: + logger.warning( + f"Failed to send custom message to miner {node_hotkey}. 
" + f"Status code: {response.status_code}" + ) + + except Exception as e: + logger.error( + f"Error sending custom message to miner {node_hotkey}: {str(e)}" + ) + async def update_tee_list(self): logger.info("Starting TEE list update") routing_table = self.validator.routing_table @@ -190,13 +232,55 @@ async def update_tee_list(self): continue try: + telemetry_client = TEETelemetryClient(tee_address) + + logger.info( + f"Executing telemetry sequence for node {hotkey} at {tee_address}" + ) + + telemetry_result = ( + await telemetry_client.execute_telemetry_sequence() + ) + + worker_id = telemetry_result.get("worker_id", "N/A") + + worker_hotkey = routing_table.get_worker_hotkey( + worker_id=worker_id + ) + + is_worker_already_owned = ( + worker_hotkey is not None + and worker_hotkey is not hotkey + ) + + # This checks that a worker address is only owned by the first node that requests it + # For removing this restriction shoot a message on discord + if is_worker_already_owned: + logger.warning( + f"Worker ID {worker_id} is already registered to another hotkey. " + f"Skipping registration for {hotkey}." + ) + continue + + routing_table.register_worker( + hotkey=hotkey, worker_id=worker_id + ) routing_table.add_miner_address( - hotkey, node.node_id, tee_address + hotkey, node.node_id, tee_address, worker_id ) + logger.debug( f"Added TEE address {tee_address} for " f"hotkey {hotkey}" ) + + if not worker_hotkey: + # Send notification to miner about successful registration + await self.send_custom_message( + hotkey, + f"Your TEE address {tee_address} has been successfully registered with worker_id {worker_id} for hotkey {hotkey}", + ) + except sqlite3.IntegrityError: logger.debug( f"TEE address {tee_address} already exists in " diff --git a/validator/routing_table.py b/validator/routing_table.py index 90eafc0..773cf1f 100644 --- a/validator/routing_table.py +++ b/validator/routing_table.py @@ -9,22 +9,22 @@ class RoutingTable: def __init__(self, db_path="miner_tee_addresses"): self.db = RoutingTableDatabase(db_path=db_path) - def add_miner_address(self, hotkey, uid, address): + def add_miner_address(self, hotkey, uid, address, worker_id=None): """Add a new miner address to the database.""" try: - self.db.add_address(hotkey, uid, address) + self.db.add_address(hotkey, uid, address, worker_id) except sqlite3.Error as e: - print(f"Failed to add address: {e}") + logger.error(f"Failed to add address: {e}") def remove_miner_address(self, hotkey, uid): """Remove a specific miner address from the database.""" try: self.db.delete_address(hotkey, uid) except sqlite3.Error as e: - print(f"Failed to remove address: {e}") + logger.error(f"Failed to remove address: {e}") def clear_miner(self, hotkey): - """Remove all addresses associated with a miner.""" + """Remove all addresses and worker registrations for a miner.""" try: with self.db.lock, sqlite3.connect(self.db.db_path) as conn: cursor = conn.cursor() @@ -35,8 +35,11 @@ def clear_miner(self, hotkey): (hotkey,), ) conn.commit() + + # Also remove all worker registrations for this hotkey + self.unregister_workers_by_hotkey(hotkey) except sqlite3.Error as e: - print(f"Failed to clear miner: {e}") + logger.error(f"Failed to clear miner: {e}") def get_miner_addresses(self, hotkey): """Retrieve all addresses associated with a given miner hotkey.""" @@ -45,14 +48,14 @@ def get_miner_addresses(self, hotkey): cursor = conn.cursor() cursor.execute( """ - SELECT address FROM miner_addresses WHERE hotkey = ? 
+ SELECT address, worker_id FROM miner_addresses WHERE hotkey = ? """, (hotkey,), ) - addresses = cursor.fetchall() - return [address[0] for address in addresses] + results = cursor.fetchall() + return [(address, worker_id) for address, worker_id in results] except sqlite3.Error as e: - print(f"Failed to retrieve addresses: {e}") + logger.error(f"Failed to retrieve addresses: {e}") return [] def get_all_addresses(self): @@ -68,7 +71,7 @@ def get_all_addresses(self): addresses = cursor.fetchall() return [address[0] for address in addresses] except sqlite3.Error as e: - print(f"Failed to retrieve all addresses: {e}") + logger.error(f"Failed to retrieve all addresses: {e}") return [] def get_all_addresses_with_hotkeys(self): @@ -78,18 +81,74 @@ def get_all_addresses_with_hotkeys(self): cursor = conn.cursor() cursor.execute( """ - SELECT hotkey, address FROM miner_addresses + SELECT hotkey, address, worker_id FROM miner_addresses """ ) results = cursor.fetchall() - return [(hotkey, address) for hotkey, address in results] + return [ + (hotkey, address, worker_id) + for hotkey, address, worker_id in results + ] except sqlite3.Error as e: - print(f"Failed to retrieve addresses with hotkeys: {e}") + logger.error(f"Failed to retrieve addresses with hotkeys: {e}") return [] + def register_worker(self, worker_id, hotkey): + """Register a worker_id with a hotkey.""" + try: + self.db.register_worker(worker_id, hotkey) + except sqlite3.Error as e: + logger.error(f"Failed to register worker: {e}") + + def unregister_worker(self, worker_id): + """Remove a worker_id from the registry.""" + try: + self.db.unregister_worker(worker_id) + except sqlite3.Error as e: + logger.error(f"Failed to unregister worker: {e}") + + def unregister_workers_by_hotkey(self, hotkey): + """Remove all worker_ids associated with a hotkey.""" + try: + self.db.unregister_workers_by_hotkey(hotkey) + except sqlite3.Error as e: + logger.error(f"Failed to unregister workers for hotkey {hotkey}: {e}") + + def get_worker_hotkey(self, worker_id): + """Get the hotkey associated with a worker_id.""" + try: + return self.db.get_worker_hotkey(worker_id) + except sqlite3.Error as e: + logger.error(f"Failed to get hotkey for worker {worker_id}: {e}") + return None + + def get_workers_by_hotkey(self, hotkey): + """Get all worker_ids associated with a hotkey.""" + try: + return self.db.get_workers_by_hotkey(hotkey) + except sqlite3.Error as e: + logger.error(f"Failed to get workers for hotkey {hotkey}: {e}") + return [] + + def get_all_worker_registrations(self): + """Get all worker_id and hotkey pairs from the registry.""" + try: + return self.db.get_all_worker_registrations() + except sqlite3.Error as e: + logger.error(f"Failed to get all worker registrations: {e}") + return [] + + def clean_old_worker_registrations(self, hours=24): + """Clean worker registrations older than the specified hours.""" + try: + self.db.clean_old_worker_registrations(hours) + except sqlite3.Error as e: + logger.error(f"Failed to clean old worker registrations: {e}") + def clean_old_entries(self): - """Clean all entries where the timestamp is more than one hour older.""" + """Clean all old entries from both tables.""" try: self.db.clean_old_entries() + self.db.clean_old_worker_registrations() except sqlite3.Error as e: - logger.info(f"Failed to clean old entries: {e}") + logger.error(f"Failed to clean old entries: {e}") diff --git a/validator/scorer.py b/validator/scorer.py index db9aa8c..1bafab4 100644 --- a/validator/scorer.py +++ b/validator/scorer.py @@ -50,13 
+50,14 @@ async def get_node_data(self): telemetry_result = await telemetry_client.execute_telemetry_sequence() if telemetry_result: - logger.debug( + logger.info( f"Node {hotkey} telemetry successful: {telemetry_result}" ) uid = self.validator.metagraph.nodes[hotkey].node_id telemetry_data = NodeData( hotkey=hotkey, uid=uid, + worker_id=telemetry_result.get("worker_id", "N/A"), timestamp="", boot_time=telemetry_result.get("boot_time", 0), last_operation_time=telemetry_result.get( diff --git a/validator/telemetry_storage.py b/validator/telemetry_storage.py index 90db484..30d600f 100644 --- a/validator/telemetry_storage.py +++ b/validator/telemetry_storage.py @@ -18,7 +18,10 @@ def add_telemetry(self, telemetry_data): logger.error(f"Failed to add telemetry: {e}") def clean_old_entries(self, hours): - """Clean all telemetry entries older than the specified number of hours.""" + """ + Clean all telemetry entries older than the specified number + of hours. + """ try: self.db.clean_old_entries(hours) except sqlite3.Error as e: @@ -26,8 +29,8 @@ def clean_old_entries(self, hours): def get_telemetry_by_hotkey(self, hotkey): """ - Retrieve telemetry data for a specific hotkey using the TelemetryDatabase - method. Returns a list of NodeData objects. + Retrieve telemetry data for a specific hotkey using the + TelemetryDatabase method. Returns a list of NodeData objects. """ try: telemetry_data = self.db.get_telemetry_by_hotkey(hotkey) @@ -48,6 +51,7 @@ def get_telemetry_by_hotkey(self, hotkey): web_errors=row[13], web_success=row[14], timestamp=row[2], + worker_id=row[15], ) for row in telemetry_data ] @@ -56,7 +60,10 @@ def get_telemetry_by_hotkey(self, hotkey): return [] def get_all_hotkeys_with_telemetry(self): - """Retrieve all unique hotkeys that have at least one telemetry entry using the TelemetryDatabase method.""" + """ + Retrieve all unique hotkeys that have at least one telemetry entry + using the TelemetryDatabase method. 
+ """ try: hotkeys = self.db.get_all_hotkeys_with_telemetry() return hotkeys From dc82c093fd708bcede17cee8b561706e80b0a17a Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Thu, 3 Apr 2025 10:40:13 -0700 Subject: [PATCH 03/20] feat(validator): new tee linking --- db/errors_database.py | 127 +++++++++++++++++++ db/routing_table_database.py | 82 ++++++++++++- db/telemetry_database.py | 22 ++++ docker-compose.yml | 18 --- miner/nats_client.py | 46 +++++++ tests/test_weights_e2e.py | 41 ++++++- tests/test_weights_unit.py | 48 +++++++- validator/api_routes.py | 91 ++++++++++++++ validator/background_tasks.py | 1 + validator/errors_storage.py | 74 +++++++++++ validator/nats.py | 17 ++- validator/node_manager.py | 225 ++++++++++++++++++++++++++++++++-- validator/routing_table.py | 44 ++++++- validator/scorer.py | 51 +++++++- validator/weights.py | 1 + 15 files changed, 837 insertions(+), 51 deletions(-) create mode 100644 db/errors_database.py create mode 100644 validator/errors_storage.py diff --git a/db/errors_database.py b/db/errors_database.py new file mode 100644 index 0000000..1bd1663 --- /dev/null +++ b/db/errors_database.py @@ -0,0 +1,127 @@ +import sqlite3 +from threading import Lock + + +class ErrorsDatabase: + def __init__(self, db_path="./errors.db"): + self.db_path = db_path + self.lock = Lock() + self._create_table() + + def _create_table(self): + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS errors ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, + hotkey TEXT, + tee_address TEXT, + miner_address TEXT, + message TEXT + ) + """ + ) + conn.commit() + + def add_error(self, hotkey, tee_address, miner_address, message): + """ + Add a new error entry to the database. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + INSERT INTO errors (hotkey, tee_address, miner_address, message) + VALUES (?, ?, ?, ?) + """, + (hotkey, tee_address, miner_address, message), + ) + conn.commit() + + def get_errors_by_hotkey(self, hotkey, limit=100): + """ + Retrieve errors for a specific hotkey. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + SELECT timestamp, tee_address, miner_address, message + FROM errors + WHERE hotkey = ? + ORDER BY timestamp DESC + LIMIT ? + """, + (hotkey, limit), + ) + results = cursor.fetchall() + return [ + { + "timestamp": row[0], + "tee_address": row[1], + "miner_address": row[2], + "message": row[3], + } + for row in results + ] + + def get_all_errors(self, limit=100): + """ + Retrieve all errors, ordered by timestamp descending. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + SELECT timestamp, hotkey, tee_address, miner_address, message + FROM errors + ORDER BY timestamp DESC + LIMIT ? + """, + (limit,), + ) + results = cursor.fetchall() + return [ + { + "timestamp": row[0], + "hotkey": row[1], + "tee_address": row[2], + "miner_address": row[3], + "message": row[4], + } + for row in results + ] + + def clean_old_errors(self, hours=24): + """ + Remove error entries older than the specified number of hours. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + DELETE FROM errors + WHERE timestamp < datetime('now', ?) 
+ """, + (f"-{hours} hours",), + ) + conn.commit() + return cursor.rowcount + + def get_error_count(self, hours=24): + """ + Get the count of errors in the last specified number of hours. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + SELECT COUNT(*) FROM errors + WHERE timestamp > datetime('now', ?) + """, + (f"-{hours} hours",), + ) + result = cursor.fetchone() + return result[0] if result else 0 diff --git a/db/routing_table_database.py b/db/routing_table_database.py index 525ae83..6856566 100644 --- a/db/routing_table_database.py +++ b/db/routing_table_database.py @@ -8,6 +8,7 @@ def __init__(self, db_path="./miner_tee_addresses.db"): self.lock = Lock() self._create_table() self._create_worker_registry_table() + self._create_unregistered_tees_table() def _create_table(self): with self.lock, sqlite3.connect(self.db_path) as conn: @@ -39,6 +40,20 @@ def _create_worker_registry_table(self): ) conn.commit() + def _create_unregistered_tees_table(self): + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS unregistered_tees ( + address TEXT PRIMARY KEY, + hotkey TEXT NOT NULL, + timestamp DATETIME DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + conn.commit() + def add_address(self, hotkey, uid, address, worker_id=None): with self.lock, sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() @@ -151,14 +166,17 @@ def get_worker_hotkey(self, worker_id): """ with self.lock, sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() + # Ensure worker_id is treated as a string for comparison + worker_id_str = str(worker_id) cursor.execute( """ - SELECT hotkey FROM worker_registry - WHERE worker_id = ? + SELECT hotkey FROM worker_registry WHERE worker_id = ?; """, - (worker_id,), + (worker_id_str,), ) + result = cursor.fetchone() + return result[0] if result else None def get_workers_by_hotkey(self, hotkey): @@ -205,3 +223,61 @@ def clean_old_worker_registrations(self, hours=24): (f"-{hours} hours",), ) conn.commit() + + def add_unregistered_tee(self, address, hotkey): + """ + Add a new unregistered TEE to the database. + If the address already exists, it will update the hotkey. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + INSERT OR REPLACE INTO unregistered_tees (address, hotkey) + VALUES (?, ?) + """, + (address, hotkey), + ) + conn.commit() + + def clean_old_unregistered_tees(self): + """ + Remove all unregistered TEEs where the timestamp is more than one hour old. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + DELETE FROM unregistered_tees + WHERE timestamp < datetime('now', '-1 hour') + """ + ) + conn.commit() + + def get_all_unregistered_tees(self): + """ + Get all unregistered TEEs from the database. + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + SELECT address, hotkey FROM unregistered_tees + """ + ) + results = cursor.fetchall() + return [(address, hotkey) for address, hotkey in results] + + def get_all_unregistered_tee_addresses(self): + """ + Get all addresses from the unregistered_tees table. 
+        """
+        with self.lock, sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute(
+                """
+                SELECT address FROM unregistered_tees
+                """
+            )
+            results = cursor.fetchall()
+            return [address[0] for address in results]
diff --git a/db/telemetry_database.py b/db/telemetry_database.py
index 51ee3a7..75a6509 100644
--- a/db/telemetry_database.py
+++ b/db/telemetry_database.py
@@ -7,6 +7,7 @@ def __init__(self, db_path="./telemetry_data.db"):
         self.db_path = db_path
         self.lock = Lock()
         self._create_table()
+        self._ensure_worker_id_column()
 
     def _create_table(self):
         with self.lock, sqlite3.connect(self.db_path) as conn:
@@ -35,6 +36,27 @@ def _create_table(self):
         )
         conn.commit()
 
+    def _ensure_worker_id_column(self):
+        """
+        Ensure the worker_id column exists in the telemetry table.
+        This handles database migrations for existing databases.
+        """
+        with self.lock, sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            # Check if worker_id column exists
+            cursor.execute("PRAGMA table_info(telemetry)")
+            columns = [col[1] for col in cursor.fetchall()]
+
+            if "worker_id" not in columns:
+                # Add the worker_id column if it doesn't exist
+                cursor.execute(
+                    """
+                    ALTER TABLE telemetry
+                    ADD COLUMN worker_id TEXT
+                    """
+                )
+                conn.commit()
+
     def add_telemetry(self, telemetry_data):
         with self.lock, sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
diff --git a/docker-compose.yml b/docker-compose.yml
index ea138e8..fb141f1 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,19 +1,4 @@
 services:
-  nats:
-    profiles:
-      - nats
-      - validator
-    image: nats:latest
-    pull_policy: always
-    ports:
-      - "4222:4222"
-    command: ["--jetstream"]
-    healthcheck:
-      test: ["CMD", "nats-server", "--ping"]
-      interval: 10s
-      timeout: 5s
-      retries: 5
-
   subnet42:
     profiles: ["miner", "validator"]
     image: masaengineering/subnet-42:latest
@@ -61,6 +46,3 @@ services:
       options:
         max-size: "10m"
         max-file: "3"
-
-volumes:
-  nats_data:
\ No newline at end of file
diff --git a/miner/nats_client.py b/miner/nats_client.py
index a32b932..bc29a15 100644
--- a/miner/nats_client.py
+++ b/miner/nats_client.py
@@ -54,3 +54,49 @@ async def send_connected_nodes(self, miners):
         # Ensure the NATS connection is closed
         logger.debug("Closing NATS connection")
         await self.nc.close()
+
+    async def send_unregistered_tees(self, unregistered_tees):
+        # Connect to the NATS server
+        nats_url = os.getenv("NATS_URL", None)
+        logger.debug(f"Connecting to NATS server at {nats_url}")
+
+        if nats_url:
+            try:
+                await self.nc.connect(
+                    nats_url,
+                    error_cb=self.error_callback,
+                )
+            except Exception as e:
+                logger.info(f"An error occurred when connecting to NATS 🚩 {str(e)}")
+                logger.debug(
+                    f"Failed to connect to NATS server ( {nats_url} ) : {str(e)}"
+                )
+                return
+
+        try:
+            nats_message = json.dumps({"Miners": unregistered_tees})
+            channel_name = os.getenv(
+                "UNREGISTERED_TEE_NATS_CHANNEL_NAME", "registration"
+            )
+
+            logger.info(
+                f"Publishing message to channel '{channel_name}' with "
+                f"{len(unregistered_tees)} unregistered TEEs"
+            )
+            logger.debug(f"Message content: {nats_message}")
+
+            await self.nc.publish(channel_name, nats_message.encode())
+            logger.info("Successfully published unregistered TEEs message ✅")
+
+        except Exception as e:
+            logger.info(
+                f"Error publishing unregistered TEEs message to NATS ({nats_url})"
+            )
+            logger.debug(
+                f"Error publishing unregistered TEEs message to NATS: {str(e)}"
+            )
+
+        finally:
+            # Ensure the NATS connection is closed
+            logger.debug("Closing NATS connection")
+            await
self.nc.close() diff --git a/tests/test_weights_e2e.py b/tests/test_weights_e2e.py index ee67224..277f506 100644 --- a/tests/test_weights_e2e.py +++ b/tests/test_weights_e2e.py @@ -3,6 +3,7 @@ from validator.weights import WeightsManager from interfaces.types import NodeData + @pytest.mark.asyncio async def test_weights_e2e(): # Initialize a real Validator instance @@ -11,8 +12,42 @@ async def test_weights_e2e(): # Simulate node data node_data = [ - NodeData(hotkey="node1", posts=10, uptime=100, latency=50), - NodeData(hotkey="node2", posts=20, uptime=200, latency=30), + NodeData( + hotkey="node1", + worker_id="worker1", + uid=1, + boot_time=0, + last_operation_time=0, + current_time=0, + twitter_auth_errors=0, + twitter_errors=0, + twitter_ratelimit_errors=0, + twitter_returned_other=0, + twitter_returned_profiles=0, + twitter_returned_tweets=0, + twitter_scrapes=0, + web_errors=0, + web_success=10, + timestamp=0, + ), + NodeData( + hotkey="node2", + worker_id="worker2", + uid=2, + boot_time=0, + last_operation_time=0, + current_time=0, + twitter_auth_errors=0, + twitter_errors=0, + twitter_ratelimit_errors=0, + twitter_returned_other=0, + twitter_returned_profiles=0, + twitter_returned_tweets=20, + twitter_scrapes=0, + web_errors=0, + web_success=20, + timestamp=0, + ), ] # Calculate weights @@ -21,5 +56,5 @@ async def test_weights_e2e(): # Set weights await weights_manager.set_weights(node_data) - + # Here you would verify the weights were set correctly, possibly by querying the substrate diff --git a/tests/test_weights_unit.py b/tests/test_weights_unit.py index c5a58a0..7df9393 100644 --- a/tests/test_weights_unit.py +++ b/tests/test_weights_unit.py @@ -3,6 +3,7 @@ from validator.weights import WeightsManager from interfaces.types import NodeData + @pytest.fixture def mock_validator(): # Create a mock Validator instance @@ -12,20 +13,57 @@ def mock_validator(): mock_validator.keypair = MagicMock() return mock_validator + @pytest.fixture def weights_manager(mock_validator): # Create a WeightsManager instance with the mock validator return WeightsManager(validator=mock_validator) + def test_calculate_weights(weights_manager): # Test calculate_weights method node_data = [ - NodeData(hotkey="node1", posts=10, uptime=100, latency=50), - NodeData(hotkey="node2", posts=20, uptime=200, latency=30), + NodeData( + hotkey="node1", + worker_id="worker1", + uid=1, + boot_time=0, + last_operation_time=0, + current_time=0, + twitter_auth_errors=0, + twitter_errors=0, + twitter_ratelimit_errors=0, + twitter_returned_other=0, + twitter_returned_profiles=0, + twitter_returned_tweets=0, + twitter_scrapes=0, + web_errors=0, + web_success=10, + timestamp=0, + ), + NodeData( + hotkey="node2", + worker_id="worker2", + uid=2, + boot_time=0, + last_operation_time=0, + current_time=0, + twitter_auth_errors=0, + twitter_errors=0, + twitter_ratelimit_errors=0, + twitter_returned_other=0, + twitter_returned_profiles=0, + twitter_returned_tweets=20, + twitter_scrapes=0, + web_errors=0, + web_success=20, + timestamp=0, + ), ] uids, weights = weights_manager.calculate_weights(node_data) assert len(uids) == len(weights) == 2 - assert weights[0] < weights[1] # Assuming node2 has more posts + assert weights[0] < weights[1] # Assuming node2 has more activity + @pytest.mark.asyncio async def test_set_weights(weights_manager, mock_validator): @@ -35,6 +73,8 @@ async def test_set_weights(weights_manager, mock_validator): "node1": MagicMock(node_id=1), "node2": MagicMock(node_id=2), } - with 
patch('validator.weights.weights.set_node_weights', return_value=True) as mock_set_node_weights: + with patch( + "validator.weights.weights.set_node_weights", return_value=True + ) as mock_set_node_weights: await weights_manager.set_weights([]) mock_set_node_weights.assert_called_once() diff --git a/validator/api_routes.py b/validator/api_routes.py index 0c546b7..b08a4ba 100644 --- a/validator/api_routes.py +++ b/validator/api_routes.py @@ -59,6 +59,35 @@ def register_routes(self) -> None: tags=["monitoring"], ) + self.app.add_api_route( + "/monitor/worker/{worker_id}", + self.monitor_worker_hotkey, + methods=["GET"], + tags=["monitoring"], + ) + + # Add error monitoring endpoints + self.app.add_api_route( + "/monitor/errors", + self.monitor_errors, + methods=["GET"], + tags=["monitoring"], + ) + + self.app.add_api_route( + "/monitor/errors/{hotkey}", + self.monitor_errors_by_hotkey, + methods=["GET"], + tags=["monitoring"], + ) + + self.app.add_api_route( + "/monitor/errors/cleanup", + self.cleanup_old_errors, + methods=["POST"], + tags=["maintenance"], + ) + async def healthcheck(self): # Implement the healthcheck logic for the validator return self.validator.healthcheck() @@ -103,6 +132,22 @@ async def monitor_telemetry(self): except Exception as e: return {"error": str(e)} + async def monitor_worker_hotkey(self, worker_id: str): + """Return the hotkey associated with a worker_id""" + + try: + hotkey = self.validator.routing_table.get_worker_hotkey(worker_id) + if hotkey: + return {"worker_id": worker_id, "hotkey": hotkey} + else: + return { + "worker_id": worker_id, + "hotkey": None, + "message": "Worker ID not found", + } + except Exception as e: + return {"error": str(e)} + async def monitor_telemetry_by_hotkey(self, hotkey: str): """Return telemetry data for a specific hotkey""" try: @@ -140,3 +185,49 @@ async def monitor_telemetry_by_hotkey(self, hotkey: str): } except Exception as e: return {"error": str(e)} + + async def monitor_errors(self, limit: int = 100): + """Return all errors logged in the system""" + try: + # Get errors storage from node manager since that's where it's initialized + errors_storage = self.validator.node_manager.errors_storage + errors = errors_storage.get_all_errors(limit) + + return { + "count": len(errors), + "errors": errors, + "error_count_24h": errors_storage.get_error_count(hours=24), + "error_count_1h": errors_storage.get_error_count(hours=1), + } + except Exception as e: + return {"error": str(e)} + + async def monitor_errors_by_hotkey(self, hotkey: str, limit: int = 100): + """Return errors for a specific hotkey""" + try: + errors_storage = self.validator.node_manager.errors_storage + errors = errors_storage.get_errors_by_hotkey(hotkey, limit) + + return { + "hotkey": hotkey, + "count": len(errors), + "errors": errors, + } + except Exception as e: + return {"error": str(e)} + + async def cleanup_old_errors(self): + """Manually trigger cleanup of error logs based on retention period""" + try: + errors_storage = self.validator.node_manager.errors_storage + retention_days = errors_storage.retention_days + count = errors_storage.clean_errors_based_on_retention() + + return { + "success": True, + "retention_days": retention_days, + "removed_count": count, + "message": f"Removed {count} error logs older than {retention_days} days", + } + except Exception as e: + return {"success": False, "error": str(e)} diff --git a/validator/background_tasks.py b/validator/background_tasks.py index 28f0f0c..1b27c9b 100644 --- a/validator/background_tasks.py +++ 
b/validator/background_tasks.py @@ -39,6 +39,7 @@ async def update_tee(self, cadence_seconds) -> None: while True: try: await self.validator.NATSPublisher.send_connected_nodes() + await self.validator.NATSPublisher.send_unregistered_tees() self.validator.telemetry_storage.clean_old_entries( TELEMETRY_EXPIRATION_HOURS ) diff --git a/validator/errors_storage.py b/validator/errors_storage.py new file mode 100644 index 0000000..817760e --- /dev/null +++ b/validator/errors_storage.py @@ -0,0 +1,74 @@ +from db.errors_database import ErrorsDatabase +import sqlite3 +from fiber.logging_utils import get_logger +import os + +logger = get_logger(__name__) + + +class ErrorsStorage: + def __init__(self, db_path="errors.db"): + self.db = ErrorsDatabase(db_path=db_path) + # Get retention period from environment or use default of 5 days + self.retention_days = int(os.getenv("ERROR_LOGS_RETENTION_DAYS", "5")) + logger.info(f"Error logs retention period set to {self.retention_days} days") + + def add_error(self, hotkey, tee_address, miner_address, message): + """Add a new error entry to the database.""" + try: + logger.debug(f"Recording error for hotkey={hotkey}: {message}") + self.db.add_error(hotkey, tee_address, miner_address, message) + return True + except sqlite3.Error as e: + logger.error(f"Failed to add error: {e}") + return False + + def get_errors_by_hotkey(self, hotkey, limit=100): + """Get errors for a specific hotkey.""" + try: + return self.db.get_errors_by_hotkey(hotkey, limit) + except sqlite3.Error as e: + logger.error(f"Failed to get errors for hotkey {hotkey}: {e}") + return [] + + def get_all_errors(self, limit=100): + """Get all errors.""" + try: + return self.db.get_all_errors(limit) + except sqlite3.Error as e: + logger.error(f"Failed to get all errors: {e}") + return [] + + def clean_old_errors(self, hours=24): + """Clean errors older than the specified hours.""" + try: + count = self.db.clean_old_errors(hours) + logger.info(f"Cleaned {count} errors older than {hours} hours") + return count + except sqlite3.Error as e: + logger.error(f"Failed to clean old errors: {e}") + return 0 + + def clean_errors_based_on_retention(self): + """ + Clean errors based on the configured retention period. + Uses ERROR_LOGS_RETENTION_DAYS environment variable (default: 5 days). 
+ """ + retention_hours = self.retention_days * 24 + try: + count = self.db.clean_old_errors(retention_hours) + logger.info( + f"Retention cleanup: removed {count} errors older than {self.retention_days} days" + ) + return count + except sqlite3.Error as e: + logger.error(f"Failed to clean errors based on retention period: {e}") + return 0 + + def get_error_count(self, hours=24): + """Get count of errors in the last specified hours.""" + try: + return self.db.get_error_count(hours) + except sqlite3.Error as e: + logger.error(f"Failed to get error count: {e}") + return 0 diff --git a/validator/nats.py b/validator/nats.py index 8ede0bb..51bb3e1 100644 --- a/validator/nats.py +++ b/validator/nats.py @@ -24,6 +24,21 @@ async def send_connected_nodes(self): logger.info(f"About to send {len(addresses)} IPs to NATS") - logger.debug(f"Sending IP list: {addresses}") + logger.info(f"Sending IP list: {addresses}") await self.nc.send_connected_nodes(addresses) + + async def send_unregistered_tees(self): + # Get unregistered TEEs from the validator + routing_table = self.validator.routing_table + unregistered_tees = routing_table.get_all_unregistered_tee_addresses() + + if len(unregistered_tees) == 0: + logger.debug("Skipping, no unregistered TEEs found") + return + + logger.info(f"About to send {len(unregistered_tees)} unregistered TEEs to NATS") + + logger.debug(f"Sending unregistered TEEs list: {unregistered_tees}") + + await self.nc.send_unregistered_tees(unregistered_tees) diff --git a/validator/node_manager.py b/validator/node_manager.py index f31f13e..efa01ea 100644 --- a/validator/node_manager.py +++ b/validator/node_manager.py @@ -8,6 +8,8 @@ from fiber.logging_utils import get_logger from interfaces.types import NodeData from validator.telemetry import TEETelemetryClient +from validator.errors_storage import ErrorsStorage +import asyncio if TYPE_CHECKING: from neurons.validator import Validator @@ -24,6 +26,26 @@ def __init__(self, validator: "Validator"): """ self.validator = validator self.connected_nodes: Dict[str, Node] = {} + self.errors_storage = ErrorsStorage() + + # Schedule error logs cleanup based on retention period + asyncio.create_task(self.run_periodic_error_cleanup()) + + async def run_periodic_error_cleanup(self): + """Run periodic cleanup of error logs based on retention period.""" + cleanup_interval_hours = 6 # Run cleanup every 6 hours + while True: + try: + # Wait for first interval + await asyncio.sleep(cleanup_interval_hours * 3600) + + # Perform cleanup based on retention policy + count = self.errors_storage.clean_errors_based_on_retention() + logger.info(f"Scheduled error logs cleanup removed {count} old entries") + + except Exception as e: + logger.error(f"Error during scheduled error logs cleanup: {str(e)}") + await asyncio.sleep(3600) # Wait one hour and try again async def connect_with_miner( self, miner_address: str, miner_hotkey: str, node: Node @@ -48,6 +70,12 @@ async def connect_with_miner( logger.error( f"Failed to establish secure connection with miner {miner_hotkey}" ) + self.errors_storage.add_error( + hotkey=miner_hotkey, + tee_address="", + miner_address=miner_address, + message="Failed to establish secure connection", + ) return False logger.debug( @@ -80,6 +108,12 @@ async def connect_with_miner( logger.debug( f"Failed to connect to miner {miner_address} - {miner_hotkey}: {str(e)}" ) + self.errors_storage.add_error( + hotkey=miner_hotkey, + tee_address="", + miner_address=miner_address, + message=f"Connection error: {str(e)}", + ) return False async 
def get_tee_address(self, node: Node) -> Optional[str]: @@ -88,6 +122,12 @@ async def get_tee_address(self, node: Node) -> Optional[str]: return await self.validator.make_non_streamed_get(node, endpoint) except Exception as e: logger.error(f"Failed to get tee address: {node.hotkey} {str(e)}") + self.errors_storage.add_error( + hotkey=node.hotkey, + tee_address="", + miner_address=f"{node.ip}:{node.port}", + message=f"Failed to get TEE address: {str(e)}", + ) async def connect_new_nodes(self) -> None: """ @@ -114,6 +154,17 @@ async def connect_new_nodes(self) -> None: logger.info(f"Found {len(available_nodes)} miners") for node in available_nodes: + + if node.ip == "0": + logger.warn(f"Skipping node {node.hotkey}: ip is {node.ip}") + self.errors_storage.add_error( + hotkey=node.hotkey, + tee_address="", + miner_address=f"{node.ip}:{node.port}", + message="Skipped: IP is 0", + ) + continue + server_address = vali_client.construct_server_address( node=node, replace_with_docker_localhost=True, @@ -128,7 +179,7 @@ async def connect_new_nodes(self) -> None: f"Connected to miner: {node.hotkey}, IP: {node.ip}, Port: {node.port}" ) else: - logger.info( + logger.debug( f"Failed to connect to miner {node.hotkey} with address {server_address}" ) @@ -142,6 +193,12 @@ async def remove_disconnected_nodes(self): logger.info( f"Hotkey: {hotkey} has been deregistered from the metagraph" ) + self.errors_storage.add_error( + hotkey=hotkey, + tee_address="", + miner_address="", + message="Node deregistered from metagraph", + ) keys_to_delete.append(hotkey) for hotkey in keys_to_delete: @@ -161,6 +218,12 @@ async def send_custom_message(self, node_hotkey: str, message: str) -> None: try: if node_hotkey not in self.connected_nodes: logger.warning(f"No connected node found for hotkey {node_hotkey}") + self.errors_storage.add_error( + hotkey=node_hotkey, + tee_address="", + miner_address="", + message="Failed to send message: Node not connected", + ) return node = self.connected_nodes[node_hotkey] @@ -185,11 +248,23 @@ async def send_custom_message(self, node_hotkey: str, message: str) -> None: f"Failed to send custom message to miner {node_hotkey}. 
" f"Status code: {response.status_code}" ) + self.errors_storage.add_error( + hotkey=node_hotkey, + tee_address="", + miner_address=f"{node.ip}:{node.port}", + message=f"Failed to send message: Status code {response.status_code}", + ) except Exception as e: logger.error( f"Error sending custom message to miner {node_hotkey}: {str(e)}" ) + self.errors_storage.add_error( + hotkey=node_hotkey, + tee_address="", + miner_address="", + message=f"Error sending message: {str(e)}", + ) async def update_tee_list(self): logger.info("Starting TEE list update") @@ -202,6 +277,16 @@ async def update_tee_list(self): logger.debug(f"Processing hotkey: {hotkey}") if hotkey in self.validator.metagraph.nodes: node = self.validator.metagraph.nodes[hotkey] + + if node.ip == "0": + self.errors_storage.add_error( + hotkey=hotkey, + tee_address="", + miner_address=f"{node.ip}:{node.port}", + message="Skipped updating TEE: IP is 0", + ) + continue + logger.debug(f"Found node in metagraph for hotkey: {hotkey}") try: @@ -222,6 +307,12 @@ async def update_tee_list(self): logger.debug( f"Skipping localhost TEE address {tee_address} - {hotkey}" ) + self.errors_storage.add_error( + hotkey=hotkey, + tee_address=tee_address, + miner_address=f"{node.ip}:{node.port}", + message="Skipped: localhost TEE address", + ) continue # Skip if not https @@ -229,37 +320,93 @@ async def update_tee_list(self): logger.debug( f"Skipping non-HTTPS TEE address {tee_address} - {hotkey}" ) + self.errors_storage.add_error( + hotkey=hotkey, + tee_address=tee_address, + miner_address=f"{node.ip}:{node.port}", + message="Skipped: non-HTTPS TEE address", + ) continue try: telemetry_client = TEETelemetryClient(tee_address) logger.info( - f"Executing telemetry sequence for node {hotkey} at {tee_address}" + f"Getting registration telemetry for {hotkey} at {tee_address}" ) telemetry_result = ( await telemetry_client.execute_telemetry_sequence() ) - worker_id = telemetry_result.get("worker_id", "N/A") + logger.info( + f"Telemetry successful for hotkey {hotkey} at {tee_address}: {telemetry_result}" + ) + logger.info( + f"Telemetry successful for hotkey {hotkey} at {tee_address} with worker_id {telemetry_result.get('worker_id', 'N/A')}" + ) - worker_hotkey = routing_table.get_worker_hotkey( - worker_id=worker_id + if not telemetry_result: + logger.warn( + f"Telemetry failed for hotkey {hotkey} - {tee_address} - {_.ip}:{_.port}" + ) + self.errors_storage.add_error( + hotkey=hotkey, + tee_address=tee_address, + miner_address=f"{node.ip}:{node.port}", + message="Telemetry failed to return results", + ) + continue + + worker_id = telemetry_result.get("worker_id", None) + + if worker_id is None: + logger.warning( + f"Skipping registration for {hotkey} at {tee_address} - No worker_id returned" + ) + # Add to unregistered TEEs table for tracking + self.validator.routing_table.add_unregistered_tee( + address=tee_address, hotkey=hotkey + ) + logger.info( + f"Added to unregistered TEEs: {tee_address} for hotkey {hotkey}" + ) + self.errors_storage.add_error( + hotkey=hotkey, + tee_address=tee_address, + miner_address=f"{node.ip}:{node.port}", + message="Skipped: No worker_id returned from telemetry", + ) + continue + + worker_hotkey = ( + self.validator.routing_table.get_worker_hotkey( + worker_id + ) ) + logger.info(f"worker id: {worker_id}") + logger.info(f"worker hotkey: {worker_hotkey}") + logger.info(f"node hotkey: {hotkey}") + is_worker_already_owned = ( worker_hotkey is not None - and worker_hotkey is not hotkey + and worker_hotkey != hotkey ) # This checks 
that a worker address is only owned by the first node that requests it - # For removing this restriction shoot a message on discord + # For removing this restriction please communicate on discord if is_worker_already_owned: logger.warning( - f"Worker ID {worker_id} is already registered to another hotkey. " + f"Worker ID {worker_id} is already registered to another hotkey. ({worker_hotkey})" f"Skipping registration for {hotkey}." ) + self.errors_storage.add_error( + hotkey=hotkey, + tee_address=tee_address, + miner_address=f"{node.ip}:{node.port}", + message=f"Skipped: Worker ID {worker_id} already registered to hotkey {worker_hotkey}", + ) continue routing_table.register_worker( @@ -274,26 +421,64 @@ async def update_tee_list(self): f"hotkey {hotkey}" ) - if not worker_hotkey: - # Send notification to miner about successful registration + # Check if this is a new worker registration (worker_id was not set before) + if worker_hotkey is None: + logger.info( + f"New worker registration: {worker_id} for hotkey {hotkey}" + ) + # Send notification about new worker registration await self.send_custom_message( hotkey, - f"Your TEE address {tee_address} has been successfully registered with worker_id {worker_id} for hotkey {hotkey}", + f"New worker registration: Your worker ID {worker_id} has been registered for the first time with hotkey {hotkey}", ) + # Send notification to miner about successful registration + await self.send_custom_message( + hotkey, + f"Your TEE address {tee_address} has been successfully registered with worker_id {worker_id} for hotkey {hotkey}", + ) + except sqlite3.IntegrityError: logger.debug( f"TEE address {tee_address} already exists in " f"routing table for hotkey {hotkey}" ) + except Exception as e: + logger.error( + f"Error processing TEE address {tee_address} for hotkey {hotkey}: {str(e)}" + ) + self.errors_storage.add_error( + hotkey=hotkey, + tee_address=tee_address, + miner_address=f"{node.ip}:{node.port}", + message=f"Error processing TEE: {str(e)}", + ) else: logger.debug(f"No TEE addresses returned for hotkey {hotkey}") + self.errors_storage.add_error( + hotkey=hotkey, + tee_address="", + miner_address=f"{node.ip}:{node.port}", + message="No TEE addresses returned", + ) except Exception as e: logger.error( f"Error processing TEE addresses for hotkey {hotkey}: {e}" ) + self.errors_storage.add_error( + hotkey=hotkey, + tee_address="", + miner_address=f"{node.ip}:{node.port}", + message=f"Error processing TEE addresses: {str(e)}", + ) else: logger.debug(f"Hotkey {hotkey} not found in metagraph") + self.errors_storage.add_error( + hotkey=hotkey, + tee_address="", + miner_address="", + message="Hotkey not found in metagraph", + ) logger.info("Completed TEE list update ✅") async def send_score_report( @@ -310,6 +495,12 @@ async def send_score_report( try: if node_hotkey not in self.connected_nodes: logger.warning(f"No connected node found for hotkey {node_hotkey}") + self.errors_storage.add_error( + hotkey=node_hotkey, + tee_address="", + miner_address="", + message="Failed to send score report: Node not connected", + ) return node = self.connected_nodes[node_hotkey] @@ -346,6 +537,18 @@ async def send_score_report( f"Failed to send score report to miner {node_hotkey}. 
" f"Status code: {response.status_code}" ) + self.errors_storage.add_error( + hotkey=node_hotkey, + tee_address="", + miner_address=f"{node.ip}:{node.port}", + message=f"Failed to send score report: Status code {response.status_code}", + ) except Exception as e: logger.error(f"Error sending score report to miner {node_hotkey}: {str(e)}") + self.errors_storage.add_error( + hotkey=node_hotkey, + tee_address="", + miner_address="", + message=f"Error sending score report: {str(e)}", + ) diff --git a/validator/routing_table.py b/validator/routing_table.py index 773cf1f..c017bb9 100644 --- a/validator/routing_table.py +++ b/validator/routing_table.py @@ -6,13 +6,17 @@ class RoutingTable: - def __init__(self, db_path="miner_tee_addresses"): + def __init__(self, db_path="miner_tee_addresses.db"): self.db = RoutingTableDatabase(db_path=db_path) def add_miner_address(self, hotkey, uid, address, worker_id=None): """Add a new miner address to the database.""" try: + logger.info( + f"Adding miner to routing table: hotkey={hotkey}, uid={uid}, address={address}, worker_id={worker_id}" + ) self.db.add_address(hotkey, uid, address, worker_id) + logger.debug(f"Successfully added miner address to routing table") except sqlite3.Error as e: logger.error(f"Failed to add address: {e}") @@ -35,9 +39,6 @@ def clear_miner(self, hotkey): (hotkey,), ) conn.commit() - - # Also remove all worker registrations for this hotkey - self.unregister_workers_by_hotkey(hotkey) except sqlite3.Error as e: logger.error(f"Failed to clear miner: {e}") @@ -149,6 +150,39 @@ def clean_old_entries(self): """Clean all old entries from both tables.""" try: self.db.clean_old_entries() - self.db.clean_old_worker_registrations() except sqlite3.Error as e: logger.error(f"Failed to clean old entries: {e}") + + def add_unregistered_tee(self, address, hotkey): + """Add an unregistered TEE to the database.""" + try: + logger.info( + f"Adding unregistered TEE: " f"address={address}, hotkey={hotkey}" + ) + self.db.add_unregistered_tee(address, hotkey) + logger.debug("Successfully added unregistered TEE") + except sqlite3.Error as e: + logger.error(f"Failed to add unregistered TEE: {e}") + + def clean_old_unregistered_tees(self): + """Clean unregistered TEEs older than one hour.""" + try: + self.db.clean_old_unregistered_tees() + except sqlite3.Error as e: + logger.error(f"Failed to clean old unregistered TEEs: {e}") + + def get_all_unregistered_tees(self): + """Get all unregistered TEEs from the database.""" + try: + return self.db.get_all_unregistered_tees() + except sqlite3.Error as e: + logger.error(f"Failed to get all unregistered TEEs: {e}") + return [] + + def get_all_unregistered_tee_addresses(self): + """Get all addresses from unregistered TEEs.""" + try: + return self.db.get_all_unregistered_tee_addresses() + except sqlite3.Error as e: + logger.error(f"Failed to get unregistered TEE addresses: {e}") + return [] diff --git a/validator/scorer.py b/validator/scorer.py index 1bafab4..f232679 100644 --- a/validator/scorer.py +++ b/validator/scorer.py @@ -31,33 +31,49 @@ async def get_node_data(self): :return: A list of NodeData objects containing node information. 
""" - logger.info("Starting telemetry fetching...") + logger.info("Starting telemetry fetching process...") + logger.info("Syncing metagraph to get latest node information") self.validator.metagraph.sync_nodes() + nodes = self.validator.routing_table.get_all_addresses_with_hotkeys() + logger.info(f"Found {len(nodes)} nodes in the routing table") logger.debug(f"Found {len(nodes)} nodes") node_data = [] - for hotkey, ip in nodes: + successful_nodes = 0 + failed_nodes = 0 + + logger.info("Beginning telemetry collection for each node") + for index, (hotkey, ip, worker_id) in enumerate(nodes): + logger.info(f"Processing node {index+1}/{len(nodes)}: {hotkey[:10]}...") logger.debug(f"Processing node {hotkey} at IP {ip}") try: + logger.info(f"Connecting to node {hotkey[:10]}... at {ip}") logger.debug(f"Creating telemetry client for node {hotkey}") # Determine the server address server_address = ip telemetry_client = TEETelemetryClient(server_address) + logger.info(f"Executing telemetry sequence for node {hotkey[:10]}...") logger.debug(f"Executing telemetry sequence for node {hotkey}") telemetry_result = await telemetry_client.execute_telemetry_sequence() if telemetry_result: - logger.info( + successful_nodes += 1 + logger.info(f"Node {hotkey[:10]}... telemetry successful") + logger.debug( f"Node {hotkey} telemetry successful: {telemetry_result}" ) uid = self.validator.metagraph.nodes[hotkey].node_id + logger.info(f"Node {hotkey[:10]}... has UID: {uid}") + + logger.info(f"Node {hotkey[:10]}... worker ID: {worker_id}") + telemetry_data = NodeData( hotkey=hotkey, uid=uid, - worker_id=telemetry_result.get("worker_id", "N/A"), + worker_id=worker_id, timestamp="", boot_time=telemetry_result.get("boot_time", 0), last_operation_time=telemetry_result.get( @@ -92,19 +108,42 @@ async def get_node_data(self): "web_success", 0 ), ) - logger.info("Storing telemetry") - logger.info(f"telemetry for {hotkey}: {telemetry_data}") + logger.info(f"Storing telemetry for node {hotkey[:10]}...") + logger.info( + f"Twitter stats for {hotkey[:10]}: " + f"scrapes={telemetry_data.twitter_scrapes}, " + f"profiles={telemetry_data.twitter_returned_profiles}, " + f"tweets={telemetry_data.twitter_returned_tweets}" + ) + logger.info( + f"Web stats for {hotkey[:10]}: " + f"success={telemetry_data.web_success}, " + f"errors={telemetry_data.web_errors}" + ) + logger.debug(f"telemetry for {hotkey}: {telemetry_data}") + self.validator.telemetry_storage.add_telemetry(telemetry_data) + logger.info(f"Successfully stored telemetry for {hotkey[:10]}...") node_data.append(telemetry_data) + else: + failed_nodes += 1 + logger.info(f"Node {hotkey[:10]}... returned no telemetry data") # Should add empty telemetry if a node isnt replying? 
except Exception as e: + failed_nodes += 1 + logger.info(f"Failed to get telemetry for node {hotkey[:10]}...") logger.error( f"Failed to get telemetry for node {hotkey}: {str(e)}", exc_info=True, ) + logger.info(f"Telemetry collection summary:") + logger.info(f" - Total nodes processed: {len(nodes)}") + logger.info(f" - Successful telemetry collections: {successful_nodes}") + logger.info(f" - Failed telemetry collections: {failed_nodes}") + logger.info(f" - Success rate: {successful_nodes/len(nodes)*100:.2f}%") logger.info(f"Completed telemetry fetching for {len(node_data)} nodes") self.telemetry = node_data diff --git a/validator/weights.py b/validator/weights.py index 3954521..0694de2 100644 --- a/validator/weights.py +++ b/validator/weights.py @@ -125,6 +125,7 @@ def _get_delta_node_data(self) -> List[NodeData]: delta_data = NodeData( hotkey=hotkey, uid=latest.uid, + worker_id=latest.worker_id, timestamp=latest.timestamp, boot_time=latest.boot_time - oldest.boot_time, last_operation_time=( From bb14d747629d82f3e5ee5f0094f75a8104dd06c0 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Thu, 3 Apr 2025 11:11:59 -0700 Subject: [PATCH 04/20] feat(telemetry): fixing edge case --- validator/background_tasks.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/validator/background_tasks.py b/validator/background_tasks.py index 1b27c9b..f47b984 100644 --- a/validator/background_tasks.py +++ b/validator/background_tasks.py @@ -24,18 +24,32 @@ def __init__(self, validator: "Validator"): async def sync_loop(self, cadence_seconds) -> None: """Background task to sync metagraph""" + # Ensure cadence_seconds is never zero to prevent division by zero + if cadence_seconds <= 0: + cadence_seconds = 60 # Default to 1 minute if invalid + logger.warning(f"Invalid sync cadence, using default: 60 seconds") + while True: try: - await self.validator.node_manager.connect_new_nodes() + logger.info("Running sync loop") await self.validator.metagraph_manager.sync_metagraph() await asyncio.sleep(cadence_seconds) except Exception as e: logger.error(f"Error in sync metagraph: {str(e)}") - await asyncio.sleep(cadence_seconds / 2) # Wait before retrying + # Use a minimum retry delay to avoid division by zero + retry_delay = max(30, cadence_seconds / 2) # At least 30 seconds + await asyncio.sleep(retry_delay) # Wait before retrying async def update_tee(self, cadence_seconds) -> None: """Background task to update tee""" + # Ensure cadence_seconds is never zero to prevent division by zero + if cadence_seconds <= 0: + cadence_seconds = 120 # Default to 2 minutes if invalid + logger.warning( + f"Invalid TEE update cadence ({cadence_seconds}), using default: 120 seconds" + ) + while True: try: await self.validator.NATSPublisher.send_connected_nodes() @@ -48,10 +62,17 @@ async def update_tee(self, cadence_seconds) -> None: except Exception as e: logger.error(f"Error updating TEE 🚩: {str(e)}") logger.debug(f"Error in updating tee: {str(e)}") - await asyncio.sleep(cadence_seconds / 2) # Wait before retrying + # Use a minimum retry delay to avoid division by zero + retry_delay = max(30, cadence_seconds / 2) # At least 30 seconds + await asyncio.sleep(retry_delay) # Wait before retrying async def set_weights_loop(self, cadence_seconds) -> None: """Background task to set weights using the weights manager""" + # Ensure cadence_seconds is never zero to prevent division by zero + if cadence_seconds <= 0: + cadence_seconds = 60 # Default to 1 minute if invalid + logger.warning(f"Invalid 
weights cadence, using default: 60 seconds") + while True: try: # TODO: Calculate scores and set weights @@ -59,4 +80,6 @@ async def set_weights_loop(self, cadence_seconds) -> None: await asyncio.sleep(cadence_seconds) except Exception as e: logger.error(f"Error in setting weights: {str(e)}") - await asyncio.sleep(cadence_seconds / 2) # Wait before retrying + # Use a minimum retry delay to avoid division by zero + retry_delay = max(30, cadence_seconds / 2) # At least 30 seconds + await asyncio.sleep(retry_delay) # Wait before retrying From f2c6f5513a3216e45b4d21a177ec52ac711d621c Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Thu, 3 Apr 2025 12:24:09 -0700 Subject: [PATCH 05/20] feat(miner): telemetry fix --- neurons/validator.py | 7 ++ validator/background_tasks.py | 124 +++++++++++++++++++++++++++------- validator/node_manager.py | 11 ++- 3 files changed, 109 insertions(+), 33 deletions(-) diff --git a/neurons/validator.py b/neurons/validator.py index 317f868..da8ecaf 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -102,6 +102,13 @@ async def start(self) -> None: ) ) + # Start telemetry collection in its own task + asyncio.create_task( + self.background_tasks.telemetry_loop( + int(os.getenv("TELEMETRY_COLLECTION_CADENCE_SECONDS", "30")) + ) + ) + except Exception as e: logger.error(f"Failed to start validator: {str(e)}") raise diff --git a/validator/background_tasks.py b/validator/background_tasks.py index f47b984..d32722c 100644 --- a/validator/background_tasks.py +++ b/validator/background_tasks.py @@ -24,62 +24,134 @@ def __init__(self, validator: "Validator"): async def sync_loop(self, cadence_seconds) -> None: """Background task to sync metagraph""" - # Ensure cadence_seconds is never zero to prevent division by zero - if cadence_seconds <= 0: - cadence_seconds = 60 # Default to 1 minute if invalid - logger.warning(f"Invalid sync cadence, using default: 60 seconds") + # Ensure we have a safe cadence value (at least 30 seconds) + safe_cadence = max(30, int(cadence_seconds or 60)) + + if safe_cadence != cadence_seconds: + logger.warning( + f"Adjusted sync cadence from {cadence_seconds} to {safe_cadence} seconds" + ) + + # Calculate a safe retry delay (at least 30 seconds) + retry_delay = max(30, safe_cadence // 2) # Integer division to avoid float + + logger.info( + f"Starting sync loop (cadence: {safe_cadence}s, retry: {retry_delay}s)" + ) while True: try: + # Main tasks logger.info("Running sync loop") await self.validator.metagraph_manager.sync_metagraph() - await asyncio.sleep(cadence_seconds) + # Wait for next cycle + await asyncio.sleep(safe_cadence) except Exception as e: + # Log the error logger.error(f"Error in sync metagraph: {str(e)}") - # Use a minimum retry delay to avoid division by zero - retry_delay = max(30, cadence_seconds / 2) # At least 30 seconds - await asyncio.sleep(retry_delay) # Wait before retrying + + # Wait before retrying (using pre-calculated safe delay) + await asyncio.sleep(retry_delay) async def update_tee(self, cadence_seconds) -> None: """Background task to update tee""" - # Ensure cadence_seconds is never zero to prevent division by zero - if cadence_seconds <= 0: - cadence_seconds = 120 # Default to 2 minutes if invalid + # Ensure we have a safe cadence value (at least 30 seconds) + safe_cadence = max(30, int(cadence_seconds or 120)) + + if safe_cadence != cadence_seconds: logger.warning( - f"Invalid TEE update cadence ({cadence_seconds}), using default: 120 seconds" + f"Adjusted TEE update cadence from {cadence_seconds} to 
{safe_cadence} seconds" ) + # Calculate a safe retry delay (at least 30 seconds) + retry_delay = max(30, safe_cadence // 2) # Integer division to avoid float + + logger.info( + f"Starting TEE update loop (cadence: {safe_cadence}s, retry: {retry_delay}s)" + ) + while True: try: + # Main tasks + await self.validator.node_manager.connect_new_nodes() await self.validator.NATSPublisher.send_connected_nodes() await self.validator.NATSPublisher.send_unregistered_tees() self.validator.telemetry_storage.clean_old_entries( TELEMETRY_EXPIRATION_HOURS ) - await self.scorer.get_node_data() - await asyncio.sleep(cadence_seconds) + # Node data collection moved to its own loop + + # Wait for next cycle + await asyncio.sleep(safe_cadence) except Exception as e: + # Log the error logger.error(f"Error updating TEE 🚩: {str(e)}") logger.debug(f"Error in updating tee: {str(e)}") - # Use a minimum retry delay to avoid division by zero - retry_delay = max(30, cadence_seconds / 2) # At least 30 seconds - await asyncio.sleep(retry_delay) # Wait before retrying + + # Wait before retrying (using pre-calculated safe delay) + await asyncio.sleep(retry_delay) + + async def telemetry_loop(self, cadence_seconds) -> None: + """Background task to collect node telemetry data independently""" + # Ensure we have a safe cadence value (at least 30 seconds) + safe_cadence = max(30, int(cadence_seconds or 180)) # Default: 3 minutes + + if safe_cadence != cadence_seconds: + logger.warning( + f"Adjusted telemetry cadence from {cadence_seconds} to {safe_cadence} seconds" + ) + + # Calculate a safe retry delay (at least 30 seconds) + retry_delay = max(30, safe_cadence // 2) # Integer division to avoid float + + logger.info( + f"Starting telemetry loop (cadence: {safe_cadence}s, retry: {retry_delay}s)" + ) + + while True: + try: + # Collect node telemetry data + logger.info("Collecting node telemetry data") + await self.scorer.get_node_data() + + # Wait for next cycle + await asyncio.sleep(safe_cadence) + except Exception as e: + # Log the error + logger.error(f"Error collecting telemetry data: {str(e)}") + logger.debug(f"Detailed telemetry error: {str(e)}") + + # Wait before retrying (using pre-calculated safe delay) + await asyncio.sleep(retry_delay) async def set_weights_loop(self, cadence_seconds) -> None: """Background task to set weights using the weights manager""" - # Ensure cadence_seconds is never zero to prevent division by zero - if cadence_seconds <= 0: - cadence_seconds = 60 # Default to 1 minute if invalid - logger.warning(f"Invalid weights cadence, using default: 60 seconds") + # Ensure we have a safe cadence value (at least 30 seconds) + safe_cadence = max(30, int(cadence_seconds or 60)) + + if safe_cadence != cadence_seconds: + logger.warning( + f"Adjusted weights cadence from {cadence_seconds} to {safe_cadence} seconds" + ) + + # Calculate a safe retry delay (at least 30 seconds) + retry_delay = max(30, safe_cadence // 2) # Integer division to avoid float + + logger.info( + f"Starting weights loop (cadence: {safe_cadence}s, retry: {retry_delay}s)" + ) while True: try: - # TODO: Calculate scores and set weights + # Main tasks await self.validator.weights_manager.set_weights() - await asyncio.sleep(cadence_seconds) + + # Wait for next cycle + await asyncio.sleep(safe_cadence) except Exception as e: + # Log the error logger.error(f"Error in setting weights: {str(e)}") - # Use a minimum retry delay to avoid division by zero - retry_delay = max(30, cadence_seconds / 2) # At least 30 seconds - await 
asyncio.sleep(retry_delay) # Wait before retrying + + # Wait before retrying (using pre-calculated safe delay) + await asyncio.sleep(retry_delay) diff --git a/validator/node_manager.py b/validator/node_manager.py index efa01ea..9314f0c 100644 --- a/validator/node_manager.py +++ b/validator/node_manager.py @@ -339,13 +339,6 @@ async def update_tee_list(self): await telemetry_client.execute_telemetry_sequence() ) - logger.info( - f"Telemetry successful for hotkey {hotkey} at {tee_address}: {telemetry_result}" - ) - logger.info( - f"Telemetry successful for hotkey {hotkey} at {tee_address} with worker_id {telemetry_result.get('worker_id', 'N/A')}" - ) - if not telemetry_result: logger.warn( f"Telemetry failed for hotkey {hotkey} - {tee_address} - {_.ip}:{_.port}" @@ -358,6 +351,10 @@ async def update_tee_list(self): ) continue + logger.info( + f"Telemetry successful for hotkey {hotkey} at {tee_address} with worker_id {telemetry_result.get('worker_id', 'N/A')}" + ) + worker_id = telemetry_result.get("worker_id", None) if worker_id is None: From 83149e3c17b542d7665d7589ad2e15222b2ddfff Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Fri, 4 Apr 2025 10:12:17 -0700 Subject: [PATCH 06/20] feat(miner): override external ip --- neurons/miner.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/neurons/miner.py b/neurons/miner.py index 3e00a4c..07debe2 100644 --- a/neurons/miner.py +++ b/neurons/miner.py @@ -36,7 +36,6 @@ def __init__(self): self.wallet_name, self.hotkey_name ) - self.httpx_client: Optional[httpx.AsyncClient] = None self.netuid = int(os.getenv("NETUID", "42")) self.subtensor_network = os.getenv("SUBTENSOR_NETWORK") @@ -94,14 +93,17 @@ def post_ip_to_chain(self) -> None: node = self.node() logger.debug(f"Retrieved node from metagraph: {node}") + # Use override_external_ip if provided, else use self.external_ip + external_ip = os.getenv("OVERRIDE_EXTERNAL_IP", self.external_ip) + if node: - if node.ip != self.external_ip or node.port != self.port: + if node.ip != external_ip or node.port != self.port: logger.info( f"IP/Port mismatch detected - Current chain values: " f"IP={node.ip}, Port={node.port}" ) logger.info( - f"Updating chain with new values: IP={self.external_ip}, " + f"Updating chain with new values: IP={external_ip}, " f"Port={self.port}" ) @@ -120,7 +122,7 @@ def post_ip_to_chain(self) -> None: f" substrate: {self.substrate}\n" f" keypair: {self.keypair}\n" f" netuid: {self.netuid}\n" - f" external_ip: {self.external_ip}\n" + f" external_ip: {external_ip}\n" f" external_port: {self.port}\n" f" coldkey_ss58_address: {coldkey_keypair_pub.ss58_address}" ) @@ -128,7 +130,7 @@ def post_ip_to_chain(self) -> None: substrate=self.substrate, keypair=self.keypair, netuid=self.netuid, - external_ip=self.external_ip, + external_ip=external_ip, external_port=self.port, coldkey_ss58_address=coldkey_keypair_pub.ss58_address, ) From 8b07033fd0b5350dd41d7b95b2451a0cceaf4a10 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Fri, 4 Apr 2025 11:48:27 -0700 Subject: [PATCH 07/20] feat(miner): removing rising errors when post ip fails --- neurons/miner.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/neurons/miner.py b/neurons/miner.py index 07debe2..43d0d72 100644 --- a/neurons/miner.py +++ b/neurons/miner.py @@ -140,7 +140,6 @@ def post_ip_to_chain(self) -> None: logger.error( f"Failed to post IP/Port to chain: {str(e)}", exc_info=True ) - raise Exception(f"Failed to post IP/Port to chain {e}") from e else: logger.info( f"IP/Port already up to date on 
chain: IP={node.ip}, " @@ -152,11 +151,9 @@ def post_ip_to_chain(self) -> None: f"Please ensure it is registered." ) logger.error(err_msg) - raise Exception(err_msg) except Exception as e: logger.error(f"Error in post_ip_to_chain: {str(e)}", exc_info=True) - raise def node(self) -> Optional[Node]: try: From 681e3750392a8d8fceaf2a5eb8771f0da16adc85 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Fri, 4 Apr 2025 16:03:58 -0700 Subject: [PATCH 08/20] feat(validator): registration queue fix --- validator/node_manager.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/validator/node_manager.py b/validator/node_manager.py index 9314f0c..61acb05 100644 --- a/validator/node_manager.py +++ b/validator/node_manager.py @@ -343,6 +343,13 @@ async def update_tee_list(self): logger.warn( f"Telemetry failed for hotkey {hotkey} - {tee_address} - {_.ip}:{_.port}" ) + # Add to unregistered TEEs table for tracking + self.validator.routing_table.add_unregistered_tee( + address=tee_address, hotkey=hotkey + ) + logger.info( + f"Added to unregistered TEEs: {tee_address} for hotkey {hotkey}" + ) self.errors_storage.add_error( hotkey=hotkey, tee_address=tee_address, From 20446a58f021970a072f4869390bb4d3926cbe0c Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Mon, 7 Apr 2025 11:01:51 -0700 Subject: [PATCH 09/20] feat: monitor api key --- .env.example | 4 ++ validator/api_routes.py | 80 ++++++++++++++++++++++++++++++++++- validator/background_tasks.py | 2 +- validator/config.py | 1 + 4 files changed, 84 insertions(+), 3 deletions(-) diff --git a/.env.example b/.env.example index 93ddd74..ead39ad 100644 --- a/.env.example +++ b/.env.example @@ -22,6 +22,10 @@ MINER_PORT=8091 # Port for the validator service VALIDATOR_PORT=8092 +# API Configuration (for validator) +# API key to protect /monitor endpoints +API_KEY=your_secure_api_key_here + # NATS Configuration (for validator) # NATS server URL #NATS_URL=nats://nats:4222 diff --git a/validator/api_routes.py b/validator/api_routes.py index b08a4ba..436a809 100644 --- a/validator/api_routes.py +++ b/validator/api_routes.py @@ -1,4 +1,35 @@ -from fastapi import FastAPI +from fastapi import FastAPI, Depends, HTTPException, Header +from typing import Optional, Callable + + +def get_api_key(api_key: Optional[str] = Header(None, alias="X-API-Key")) -> str: + """ + Dependency to check the API key in the header. + + :param api_key: The API key provided in the header. + :return: The API key if valid. + :raises HTTPException: If the API key is invalid or missing. + """ + if not api_key: + raise HTTPException(status_code=401, detail="API Key header missing") + return api_key + + +def require_api_key(api_key: str = Depends(get_api_key), config=None) -> None: + """ + Dependency to validate the API key against the configured value. + + :param api_key: The API key from the request header. + :param config: The configuration object with API_KEY defined. + :raises HTTPException: If the API key doesn't match the configured value or + no API key is configured. 
+ """ + # Check if the API key is valid + if not config or not hasattr(config, "API_KEY") or not config.API_KEY: + return # No API key configured, skip validation + + if api_key != config.API_KEY: + raise HTTPException(status_code=403, detail="Invalid API Key") def register_routes(app: FastAPI, healthcheck_func): @@ -22,6 +53,19 @@ def __init__(self, validator): self.app = FastAPI() self.register_routes() + def get_api_key_dependency(self) -> Callable: + """Get a dependency function that checks the API key against config.""" + config = self.validator.config + + def api_key_validator(api_key: str = Depends(get_api_key)): + if not hasattr(config, "API_KEY") or not config.API_KEY: + return # No API key configured, skip validation + + if api_key != config.API_KEY: + raise HTTPException(status_code=403, detail="Invalid API Key") + + return api_key_validator + def register_routes(self) -> None: self.app.add_api_route( "/healthcheck", @@ -30,12 +74,16 @@ def register_routes(self) -> None: tags=["healthcheck"], ) - # Add monitoring endpoints + # Create API key dependency with config + api_key_dependency = self.get_api_key_dependency() + + # Add monitoring endpoints with API key protection self.app.add_api_route( "/monitor/worker-registry", self.monitor_worker_registry, methods=["GET"], tags=["monitoring"], + dependencies=[Depends(api_key_dependency)], ) self.app.add_api_route( @@ -43,6 +91,7 @@ def register_routes(self) -> None: self.monitor_routing_table, methods=["GET"], tags=["monitoring"], + dependencies=[Depends(api_key_dependency)], ) self.app.add_api_route( @@ -50,6 +99,15 @@ def register_routes(self) -> None: self.monitor_telemetry, methods=["GET"], tags=["monitoring"], + dependencies=[Depends(api_key_dependency)], + ) + + self.app.add_api_route( + "/monitor/unregistered-tee-addresses", + self.monitor_unregistered_tee_addresses, + methods=["GET"], + tags=["monitoring"], + dependencies=[Depends(api_key_dependency)], ) self.app.add_api_route( @@ -57,6 +115,7 @@ def register_routes(self) -> None: self.monitor_telemetry_by_hotkey, methods=["GET"], tags=["monitoring"], + dependencies=[Depends(api_key_dependency)], ) self.app.add_api_route( @@ -64,6 +123,7 @@ def register_routes(self) -> None: self.monitor_worker_hotkey, methods=["GET"], tags=["monitoring"], + dependencies=[Depends(api_key_dependency)], ) # Add error monitoring endpoints @@ -72,6 +132,7 @@ def register_routes(self) -> None: self.monitor_errors, methods=["GET"], tags=["monitoring"], + dependencies=[Depends(api_key_dependency)], ) self.app.add_api_route( @@ -79,6 +140,7 @@ def register_routes(self) -> None: self.monitor_errors_by_hotkey, methods=["GET"], tags=["monitoring"], + dependencies=[Depends(api_key_dependency)], ) self.app.add_api_route( @@ -86,6 +148,7 @@ def register_routes(self) -> None: self.cleanup_old_errors, methods=["POST"], tags=["maintenance"], + dependencies=[Depends(api_key_dependency)], ) async def healthcheck(self): @@ -231,3 +294,16 @@ async def cleanup_old_errors(self): } except Exception as e: return {"success": False, "error": str(e)} + + async def monitor_unregistered_tee_addresses(self): + """Return all unregistered TEE addresses in the system""" + try: + addresses = ( + self.validator.routing_table.get_all_unregistered_tee_addresses() + ) + return { + "count": len(addresses), + "unregistered_tee_addresses": addresses, + } + except Exception as e: + return {"error": str(e)} diff --git a/validator/background_tasks.py b/validator/background_tasks.py index d32722c..350c031 100644 --- 
a/validator/background_tasks.py +++ b/validator/background_tasks.py @@ -75,8 +75,8 @@ async def update_tee(self, cadence_seconds) -> None: try: # Main tasks await self.validator.node_manager.connect_new_nodes() - await self.validator.NATSPublisher.send_connected_nodes() await self.validator.NATSPublisher.send_unregistered_tees() + await self.validator.NATSPublisher.send_connected_nodes() self.validator.telemetry_storage.clean_old_entries( TELEMETRY_EXPIRATION_HOURS ) diff --git a/validator/config.py b/validator/config.py index fe70731..3be7fc7 100644 --- a/validator/config.py +++ b/validator/config.py @@ -14,3 +14,4 @@ class Config: "SUBTENSOR_ADDRESS", "wss://entrypoint-finney.opentensor.ai:443" ) MINER_WHITELIST = os.getenv("MINER_WHITELIST", "").split(",") + API_KEY = os.getenv("API_KEY", None) From bda581955b6f45e4a0320a822d247cc754080f5b Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Tue, 8 Apr 2025 09:31:18 -0700 Subject: [PATCH 10/20] feat(validator): adding monitor dashboard and fixes --- neurons/validator.py | 143 +++++++ scripts/run_validator.py | 6 + static/css/dashboard.css | 363 ++++++++++++++++++ static/error-logs.html | 614 ++++++++++++++++++++++++++++++ static/index.html | 138 +++++++ static/js/dashboard.js | 94 +++++ static/routing-table.html | 527 ++++++++++++++++++++++++++ static/score-simulation.html | 669 +++++++++++++++++++++++++++++++++ static/unregistered-nodes.html | 341 +++++++++++++++++ static/worker-registry.html | 372 ++++++++++++++++++ validator/api_routes.py | 206 +++++++++- 11 files changed, 3465 insertions(+), 8 deletions(-) create mode 100644 static/css/dashboard.css create mode 100644 static/error-logs.html create mode 100644 static/index.html create mode 100644 static/js/dashboard.js create mode 100644 static/routing-table.html create mode 100644 static/score-simulation.html create mode 100644 static/unregistered-nodes.html create mode 100644 static/worker-registry.html diff --git a/neurons/validator.py b/neurons/validator.py index da8ecaf..b9de01d 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -4,6 +4,7 @@ import asyncio import uvicorn from typing import Optional, Any +import datetime from fiber.chain import chain_utils, interface from fiber.chain.metagraph import Metagraph @@ -186,3 +187,145 @@ def healthcheck(self): except Exception as e: logger.error(f"Failed to get validator info: {str(e)}") return None + + def dashboard(self): + """Return a simple HTML dashboard for the validator.""" + try: + # Get basic validator info + info = self.healthcheck() + + # Get worker registry stats + worker_count = len(self.routing_table.get_all_worker_registrations()) + + # Get error stats from the last 24 hours + error_count_24h = self.node_manager.errors_storage.get_error_count(hours=24) + + # Get uptime (approximate) + import time + import math + + start_time = os.getenv("START_TIME", str(int(time.time()))) + uptime_seconds = int(time.time()) - int(start_time) + uptime_days = math.floor(uptime_seconds / (60 * 60 * 24)) + + # Read the HTML template + try: + with open("static/index.html", "r") as f: + template = f.read() + except FileNotFoundError: + logger.error("Dashboard template not found") + return "Dashboard template not found" + + # Replace template variables with actual values + replace_dict = { + "{{ss58_address}}": info.get("ss58_address", "N/A"), + "{{uid}}": info.get("uid", "N/A"), + "{{ip}}": info.get("ip", "N/A"), + "{{port}}": info.get("port", "N/A"), + "{{subtensor_network}}": info.get("subtensor_network", "N/A"), + "{{netuid}}": 
info.get("netuid", "N/A"), + "{{worker_count}}": str(worker_count), + "{{error_count_24h}}": str(error_count_24h), + "{{network}}": info.get("subtensor_network", "N/A").upper(), + "{{uptime_days}}": str(uptime_days), + "{{current_year}}": str(datetime.datetime.now().year), + } + + for key, value in replace_dict.items(): + template = template.replace(key, value) + + return template + + except Exception as e: + logger.error(f"Failed to generate dashboard: {str(e)}") + return f""" + + +

+ [fallback markup garbled in extraction — an error page titled "Dashboard Error"
+  showing "Failed to load dashboard: {str(e)}"]

+ + + """ + + def dashboard_data(self): + """Return a JSON object with dashboard data for API calls.""" + try: + # Get basic validator info + info = self.healthcheck() + + # Get worker registry stats + worker_count = len(self.routing_table.get_all_worker_registrations()) + + # Get error stats from the last 24 hours + error_count_24h = self.node_manager.errors_storage.get_error_count(hours=24) + + # Get uptime (approximate) + import time + import math + + start_time = os.getenv("START_TIME", str(int(time.time()))) + uptime_seconds = int(time.time()) - int(start_time) + uptime_days = math.floor(uptime_seconds / (60 * 60 * 24)) + + # Return JSON data + return { + "ss58_address": info.get("ss58_address", "N/A"), + "uid": info.get("uid", "N/A"), + "ip": info.get("ip", "N/A"), + "port": info.get("port", "N/A"), + "subtensor_network": info.get("subtensor_network", "N/A"), + "netuid": info.get("netuid", "N/A"), + "worker_count": worker_count, + "error_count_24h": error_count_24h, + "uptime_days": uptime_days, + "network": info.get("subtensor_network", "N/A").upper(), + "current_year": datetime.datetime.now().year, + } + + except Exception as e: + logger.error(f"Failed to generate dashboard data: {str(e)}") + return {"error": str(e)} + + async def get_score_simulation_data(self): + """Calculate simulated scores based on recently fetched telemetry data.""" + logger.info("Starting score simulation based on recent telemetry...") + try: + # 1. Fetch the latest telemetry data for reachable nodes + data_to_score = self.weights_manager._get_delta_node_data() + + logger.info(f"Data to score: {data_to_score}") + # 2. Calculate weights (scores) using the WeightsManager + logger.info("Calculating weights using WeightsManager...") + uids, scores = await self.weights_manager.calculate_weights( + data_to_score, simulation=True + ) + + logger.info(f"Weights calculated for {len(uids)} UIDs.") + + if uids is None or scores is None or len(uids) != len(scores): + logger.error("Mismatch or None returned from calculate_weights.") + return {"scores": []} # Return empty if calculation failed + + # 3. Map UIDs back to hotkeys using the metagraph + uid_to_hotkey = { + node.node_id: node.hotkey for node in self.metagraph.nodes.values() + } + + logger.info(f"UID TO HOTKEY: {uid_to_hotkey} {uids}") + + # 4. Format the scores for the API response - directly use raw scores from calculate_weights + formatted_scores = [ + {"hotkey": uid_to_hotkey.get(int(uid)), "score": float(score)} + for uid, score in zip(uids, scores) + if int(uid) in uid_to_hotkey + ] + + logger.info( + f"Score simulation complete. Returning {len(formatted_scores)} scores." 
+ ) + return {"scores": formatted_scores} + + except Exception as e: + logger.error(f"Error during score simulation: {str(e)}", exc_info=True) + # Raise the exception to see the full traceback in logs + raise diff --git a/scripts/run_validator.py b/scripts/run_validator.py index 4d77edc..1bc48a2 100644 --- a/scripts/run_validator.py +++ b/scripts/run_validator.py @@ -1,6 +1,12 @@ +import os import asyncio +import time from neurons.validator import Validator +# Set START_TIME environment variable for uptime tracking +if "START_TIME" not in os.environ: + os.environ["START_TIME"] = str(int(time.time())) + async def main(): # Initialize validator diff --git a/static/css/dashboard.css b/static/css/dashboard.css new file mode 100644 index 0000000..fc64d97 --- /dev/null +++ b/static/css/dashboard.css @@ -0,0 +1,363 @@ +:root { + --primary-gradient: linear-gradient(135deg, #2563eb, #4f46e5, #7c3aed); + --gold-gradient: linear-gradient(135deg, #f59e0b, #d97706, #b45309); + --card-bg: rgba(30, 30, 35, 0.7); + --body-bg: #000000; + --text-primary: #ffffff; + --text-secondary: #9ca3af; + --accent-color: #f59e0b; + --border-color: rgba(255, 255, 255, 0.1); + --success-color: #10b981; + --warning-color: #f59e0b; + --danger-color: #ef4444; +} + +body { + font-family: 'Inter', -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; + line-height: 1.6; + color: var(--text-primary); + margin: 0 auto; + padding: 20px; + background-color: var(--body-bg); + background-image: url('https://masa.ai/images/bg-space.webp'); + background-size: cover; + background-attachment: fixed; + background-position: center; + min-height: 100vh; + max-width: 1400px; + position: relative; + overflow-x: hidden; +} + +body::before { + content: ''; + position: absolute; + top: 0; + left: 0; + right: 0; + bottom: 0; + background: radial-gradient(ellipse at center, rgba(0,0,0,0.4) 0%, rgba(0,0,0,0.8) 70%, rgba(0,0,0,0.95) 100%); + z-index: -1; +} + +.stars { + position: fixed; + top: 0; + left: 0; + width: 100%; + height: 100%; + z-index: -2; + overflow: hidden; +} + +.star { + position: absolute; + background-color: #fff; + border-radius: 50%; + animation: twinkle var(--duration) infinite; + opacity: var(--opacity); +} + +@keyframes twinkle { + 0%, 100% { opacity: var(--opacity); } + 50% { opacity: calc(var(--opacity) * 0.3); } +} + +h1, h2, h3 { + font-weight: 700; + margin-top: 0; + letter-spacing: -0.5px; +} + +h1 { + font-size: 3.5rem; + color: var(--text-primary); + margin-bottom: 0; + padding-bottom: 0; + border: none; + line-height: 1.1; +} + +.subtitle { + color: var(--accent-color); + font-size: 1.2rem; + font-weight: 500; + margin-top: 10px; + margin-bottom: 40px; +} + +.logo { + font-size: 1.5rem; + font-weight: 700; + margin-bottom: 30px; + color: var(--text-primary); + text-decoration: none; + display: inline-block; +} + +.header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 60px; + flex-wrap: wrap; +} + +.nav { + display: flex; + gap: 20px; + align-items: center; +} + +.nav-link { + color: var(--text-secondary); + text-decoration: none; + font-weight: 500; + transition: color 0.2s; + font-size: 0.9rem; + text-transform: uppercase; + letter-spacing: 1px; +} + +.nav-link:hover { + color: var(--text-primary); +} + +.header h1 { + margin: 0; + display: flex; + flex-direction: column; +} + +.network-badge { + display: inline-block; + padding: 6px 12px; + background: var(--gold-gradient); + color: white; + border-radius: 20px; + font-size: 0.9em; + 
font-weight: 500; + letter-spacing: 0.5px; + box-shadow: 0 4px 12px rgba(245, 158, 11, 0.3); +} + +.container { + max-width: 1200px; + margin: 0 auto; + position: relative; + z-index: 1; +} + +.card { + background: var(--card-bg); + border-radius: 16px; + padding: 30px; + margin-bottom: 40px; + box-shadow: 0 10px 30px rgba(0, 0, 0, 0.25); + border: 1px solid var(--border-color); + transition: transform 0.3s ease, box-shadow 0.3s ease; + backdrop-filter: blur(10px); +} + +.card:hover { + transform: translateY(-5px); + box-shadow: 0 15px 40px rgba(0, 0, 0, 0.35); +} + +.card h2 { + font-size: 1.8rem; + margin-bottom: 30px; + color: var(--accent-color); +} + +.stats { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); + gap: 30px; + margin-bottom: 40px; +} + +.stat-box { + background: rgba(30, 30, 35, 0.5); + border-radius: 16px; + padding: 30px; + box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2); + border: 1px solid var(--border-color); + transition: transform 0.3s ease, box-shadow 0.3s ease; + position: relative; + overflow: hidden; + backdrop-filter: blur(10px); +} + +.stat-box::before { + content: ''; + position: absolute; + top: 0; + left: 0; + width: 100%; + height: 4px; + background: var(--primary-gradient); +} + +.stat-box:hover { + transform: translateY(-8px); + box-shadow: 0 15px 40px rgba(0, 0, 0, 0.3); +} + +.stat-title { + font-size: 1rem; + color: var(--text-secondary); + margin-bottom: 15px; + text-transform: uppercase; + letter-spacing: 1px; + font-weight: 600; +} + +.stat-value { + font-size: 3.2rem; + font-weight: 800; + color: var(--text-primary); + margin-bottom: 10px; + line-height: 1; +} + +.info-item { + margin-bottom: 20px; + display: flex; + border-bottom: 1px solid var(--border-color); + padding-bottom: 20px; +} + +.info-item:last-child { + border-bottom: none; + margin-bottom: 0; +} + +.info-label { + font-weight: 500; + min-width: 180px; + color: var(--text-secondary); +} + +.info-value { + font-family: 'Roboto Mono', monospace; + color: var(--text-primary); + font-weight: 500; +} + +footer { + text-align: center; + margin-top: 60px; + color: var(--text-secondary); + font-size: 0.9em; + padding-top: 30px; + border-top: 1px solid var(--border-color); +} + +.quick-links { + list-style-type: none; + padding: 0; + display: grid; + grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); + gap: 20px; +} + +.quick-links li { + margin-bottom: 0; +} + +.quick-links a { + display: flex; + color: white; + text-decoration: none; + padding: 16px 20px; + border-radius: 12px; + background: rgba(50, 50, 60, 0.5); + transition: all 0.3s ease; + align-items: center; + justify-content: center; + font-weight: 500; + border: 1px solid var(--border-color); +} + +.quick-links a:hover { + transform: translateY(-5px); + box-shadow: 0 10px 25px rgba(0, 0, 0, 0.2); + background: rgba(70, 70, 90, 0.7); + border-color: rgba(255, 255, 255, 0.2); +} + +.btn { + display: inline-flex; + align-items: center; + justify-content: center; + background: var(--gold-gradient); + color: white; + border: none; + border-radius: 100px; + padding: 12px 24px; + font-weight: 600; + font-size: 0.9rem; + cursor: pointer; + transition: all 0.3s ease; + text-decoration: none; + margin-right: 15px; + box-shadow: 0 4px 12px rgba(245, 158, 11, 0.3); +} + +.btn:hover { + transform: translateY(-3px); + box-shadow: 0 8px 20px rgba(245, 158, 11, 0.4); +} + +.btn-secondary { + background: rgba(255, 255, 255, 0.1); + border: 1px solid rgba(255, 255, 255, 0.2); + box-shadow: none; +} + 
+.btn-secondary:hover { + background: rgba(255, 255, 255, 0.2); + box-shadow: 0 8px 20px rgba(0, 0, 0, 0.2); +} + +/* Define color variants for stats */ +.stat-box.success::before { background: var(--success-color); } +.stat-box.warning::before { background: var(--warning-color); } +.stat-box.danger::before { background: var(--danger-color); } + +/* Animation for refreshing data */ +@keyframes pulse { + 0% { opacity: 1; } + 50% { opacity: 0.5; } + 100% { opacity: 1; } +} + +.refreshing { + animation: pulse 1s ease-in-out; +} + +/* Responsive adjustments */ +@media (max-width: 768px) { + .stats { + grid-template-columns: 1fr; + } + + .header { + flex-direction: column; + align-items: flex-start; + } + + .network-badge { + margin-top: 10px; + } + + h1 { + font-size: 2.5rem; + } + + .nav { + margin-top: 20px; + gap: 15px; + } +} \ No newline at end of file diff --git a/static/error-logs.html b/static/error-logs.html new file mode 100644 index 0000000..02e2fcd --- /dev/null +++ b/static/error-logs.html @@ -0,0 +1,614 @@ + + + + + + Error Logs - Subnet 42 Validator + + + + + + + +
+ [markup garbled in extraction — recoverable structure of static/error-logs.html:
+  header with "Back to Dashboard" nav link and a {{network}} badge; page title "Error Logs"
+  with subtitle "Monitor and troubleshoot validator errors"; filter controls for Severity,
+  Time Range, Hotkey, and Error Type; summary counters ("Showing: 0 of 0", "Last 24h: 0",
+  "Last 1h: 0"); a results table with Timestamp / Severity / Hotkey / Message columns;
+  pagination controls; closing markup and page scripts likewise garbled]
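The error-logs page above is a thin client over the /monitor/errors endpoint added earlier in this patch, authenticated with the same X-API-Key header the other monitor routes use. A minimal sketch of querying that endpoint from a script, assuming the validator listens on the VALIDATOR_PORT default of 8092 from .env.example and that httpx is available (both assumptions, not part of the patch):

    import httpx

    # Sketch: fetch the raw error log that the error-logs page renders.
    headers = {"X-API-Key": "your_secure_api_key_here"}  # the validator's API_KEY value
    response = httpx.get("http://localhost:8092/monitor/errors", headers=headers)
    response.raise_for_status()  # 401/403 indicate a missing or rejected key
    print(response.json())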
\ No newline at end of file
diff --git a/static/index.html b/static/index.html
new file mode 100644
index 0000000..f0a1d58
--- /dev/null
+++ b/static/index.html
@@ -0,0 +1,138 @@
+ [head markup garbled in extraction — page title "Subnet 42 Validator Dashboard"]
+ [markup garbled in extraction — recoverable structure of static/index.html:
+  header "Subnet 42 Validator" with subtitle "The #1 Real-Time Data Network for Fair AI"
+  and a {{network}} badge; stat boxes for Registered Workers ({{worker_count}}),
+  Errors (24h) ({{error_count_24h}}), and Uptime ({{uptime_days}} days); a "Validator Info"
+  card listing SS58 Address ({{ss58_address}}), UID ({{uid}}), IP ({{ip}}), Port ({{port}}),
+  Network ({{subtensor_network}}), and NetUID ({{netuid}}); closing markup and page
+  scripts likewise garbled]
+ + + + + \ No newline at end of file diff --git a/static/js/dashboard.js b/static/js/dashboard.js new file mode 100644 index 0000000..cbf7908 --- /dev/null +++ b/static/js/dashboard.js @@ -0,0 +1,94 @@ +document.addEventListener('DOMContentLoaded', function() { + // Format timestamps to local time + const timestamps = document.querySelectorAll('.timestamp'); + timestamps.forEach(function(timestamp) { + const date = new Date(Number(timestamp.getAttribute('data-time')) * 1000); + timestamp.textContent = date.toLocaleString(); + }); + + // Add click handlers for collapsible sections + const collapsibles = document.querySelectorAll('.collapsible-header'); + collapsibles.forEach(function(header) { + header.addEventListener('click', function() { + const content = this.nextElementSibling; + if (content.style.maxHeight) { + content.style.maxHeight = null; + this.classList.remove('active'); + } else { + content.style.maxHeight = content.scrollHeight + "px"; + this.classList.add('active'); + } + }); + }); + + // Refresh the data every 5 seconds + function scheduleRefresh() { + setTimeout(function() { + refreshData(); + scheduleRefresh(); + }, 5000); // Refresh every 5 seconds + } + + function refreshData() { + fetch('/dashboard/data', { + headers: { + 'X-API-Key': localStorage.getItem('apiKey') || '', + 'X-Requested-With': 'XMLHttpRequest' + } + }) + .then(response => response.json()) + .then(data => { + updateDashboardData(data); + // Add visual indication that data was refreshed + const stats = document.querySelectorAll('.stat-box'); + stats.forEach(function(stat) { + stat.classList.add('refreshing'); + setTimeout(() => stat.classList.remove('refreshing'), 1000); + }); + }) + .catch(error => console.error('Error refreshing data:', error)); + } + + function updateDashboardData(data) { + // Update worker count + const workerCountElement = document.getElementById('worker-count'); + if (workerCountElement && data.worker_count !== undefined) { + workerCountElement.textContent = data.worker_count; + } + + // Update error count + const errorCountElement = document.getElementById('error-count'); + if (errorCountElement && data.error_count_24h !== undefined) { + errorCountElement.textContent = data.error_count_24h; + } + + // Update uptime + const uptimeElement = document.getElementById('uptime'); + if (uptimeElement && data.uptime_days !== undefined) { + uptimeElement.textContent = data.uptime_days + ' days'; + } + + // Add last refresh time + const lastRefreshElement = document.getElementById('last-refresh'); + if (lastRefreshElement) { + const now = new Date(); + lastRefreshElement.textContent = now.toLocaleTimeString(); + } + } + + // Store API key in local storage for future requests + function storeApiKey() { + const urlParams = new URLSearchParams(window.location.search); + const apiKey = urlParams.get('api_key'); + if (apiKey) { + localStorage.setItem('apiKey', apiKey); + // Remove the API key from the URL + urlParams.delete('api_key'); + const newUrl = window.location.pathname + (urlParams.toString() ? '?' + urlParams.toString() : ''); + window.history.replaceState({}, document.title, newUrl); + } + } + + storeApiKey(); + scheduleRefresh(); +}); \ No newline at end of file diff --git a/static/routing-table.html b/static/routing-table.html new file mode 100644 index 0000000..5827c27 --- /dev/null +++ b/static/routing-table.html @@ -0,0 +1,527 @@ + + + + + + Routing Table - Subnet 42 Validator + + + + + + + +
+ [markup garbled in extraction — recoverable structure of static/routing-table.html:
+  header with "Back to Dashboard" nav link and a {{network}} badge; page title "Routing Table"
+  with subtitle "Network connections and data flow"; stat boxes for Total Routes, Connected,
+  and Disconnected; a "Network Visualization" panel; a table with Status / Hotkey / Address /
+  Worker ID / Actions columns; closing markup and page scripts likewise garbled]
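The Worker ID column rendered by this page traces back to the first-come ownership rule enforced in node_manager.update_tee_list earlier in this series: a worker_id stays with the hotkey that registered it first, and later claimants are skipped. A condensed sketch of that check, using only helpers defined in these patches (get_worker_hotkey, register_worker, errors_storage.add_error); the surrounding loop and logging are trimmed:

    # Condensed from update_tee_list: enforce first-come worker_id ownership.
    worker_hotkey = routing_table.get_worker_hotkey(worker_id)
    is_worker_already_owned = worker_hotkey is not None and worker_hotkey != hotkey

    if is_worker_already_owned:
        # Another hotkey claimed this worker_id first: record the skip and move on.
        errors_storage.add_error(
            hotkey=hotkey,
            tee_address=tee_address,
            miner_address=f"{node.ip}:{node.port}",
            message=f"Skipped: Worker ID {worker_id} already registered to hotkey {worker_hotkey}",
        )
    else:
        routing_table.register_worker(worker_id, hotkey)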
\ No newline at end of file
diff --git a/static/score-simulation.html b/static/score-simulation.html
new file mode 100644
index 0000000..3dabf8f
--- /dev/null
+++ b/static/score-simulation.html
@@ -0,0 +1,669 @@
+ [head markup garbled in extraction — page title "Score Simulation - Subnet 42 Validator"]
+ [markup garbled in extraction — recoverable structure of static/score-simulation.html:
+  header with "Back to Dashboard" nav link and a {{network}} badge; page title "Score Simulation"
+  with subtitle "Estimated scores based on current telemetry"; stat boxes for Nodes Scored,
+  Average Score, and Median Score; a "Scatter Plot Distribution (Sorted by Score - Highest First)"
+  chart panel; a table with Rank / Hotkey / Simulated Score columns; closing markup and page
+  scripts likewise garbled]
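The Nodes Scored, Average Score, and Median Score boxes are filled client-side from /score-simulation/data, whose payload get_score_simulation_data returns as {"scores": [{"hotkey": ..., "score": ...}]}. A server-side equivalent of that reduction, as a sketch (summarize_scores is an illustrative name, not part of the patch):

    import statistics

    def summarize_scores(payload: dict) -> dict:
        # Reduce the simulation payload to the page's three stat boxes.
        scores = [entry["score"] for entry in payload.get("scores", [])]
        if not scores:
            return {"nodes_scored": 0, "average_score": 0.0, "median_score": 0.0}
        return {
            "nodes_scored": len(scores),
            "average_score": sum(scores) / len(scores),
            "median_score": statistics.median(scores),
        }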
\ No newline at end of file
diff --git a/static/unregistered-nodes.html b/static/unregistered-nodes.html
new file mode 100644
index 0000000..c965093
--- /dev/null
+++ b/static/unregistered-nodes.html
@@ -0,0 +1,341 @@
+ [head markup garbled in extraction — page title "Unregistered Nodes - Subnet 42 Validator"]
+ [markup garbled in extraction — recoverable structure of static/unregistered-nodes.html:
+  header with "Back to Dashboard" nav link and a {{network}} badge; page title "Unregistered
+  TEE Addresses" with subtitle "Nodes detected but not yet linked to a hotkey"; a Total
+  Unregistered counter; a table with TEE Address / First Seen (Approx) columns; closing
+  markup and page scripts likewise garbled]
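This page reads the unregistered-TEE table maintained elsewhere in this series: update_tee_list records an address via add_unregistered_tee when its telemetry sequence fails, /monitor/unregistered-tee-addresses returns the stored addresses, and old entries are purged after roughly an hour. A sketch of that lifecycle using only methods these patches define:

    # Producer: called from update_tee_list when telemetry fails for an address.
    routing_table.add_unregistered_tee(address=tee_address, hotkey=hotkey)

    # Consumer: backs the /monitor/unregistered-tee-addresses endpoint and this page.
    addresses = routing_table.get_all_unregistered_tee_addresses()

    # Maintenance: drop unregistered TEEs older than one hour.
    routing_table.clean_old_unregistered_tees()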
\ No newline at end of file
diff --git a/static/worker-registry.html b/static/worker-registry.html
new file mode 100644
index 0000000..27caa50
--- /dev/null
+++ b/static/worker-registry.html
@@ -0,0 +1,372 @@
+ [head markup garbled in extraction — page title "Worker Registry - Subnet 42 Validator"]
+ [markup garbled in extraction — recoverable structure of static/worker-registry.html:
+  header with "Back to Dashboard" nav link and a {{network}} badge; page title "Worker Registry"
+  with subtitle "Manage and monitor data miners"; stat boxes for Total Workers, Active Workers,
+  and Inactive Workers; a table with Status / Worker ID / Hotkey / Last Seen / Actions columns;
+  closing markup and page scripts likewise garbled]
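The Active/Inactive split on this page stops being randomly generated once PATCH 12 lands below: monitor_worker_registry marks a worker active when its hotkey still has at least one address in the routing table, and the page counts those flags. A condensed sketch of that derivation:

    # Condensed from monitor_worker_registry in validator/api_routes.py (PATCH 12).
    registrations = routing_table.get_all_worker_registrations()
    workers = [
        {
            "worker_id": worker_id,
            "hotkey": hotkey,
            "is_in_routing_table": len(routing_table.get_miner_addresses(hotkey)) > 0,
        }
        for worker_id, hotkey in registrations
    ]
    active_count = sum(1 for w in workers if w["is_in_routing_table"])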
+ + + + + \ No newline at end of file diff --git a/validator/api_routes.py b/validator/api_routes.py index 436a809..09a6bb5 100644 --- a/validator/api_routes.py +++ b/validator/api_routes.py @@ -1,5 +1,12 @@ from fastapi import FastAPI, Depends, HTTPException, Header +from fastapi.responses import HTMLResponse +from fastapi.staticfiles import StaticFiles from typing import Optional, Callable +import os +from fiber.logging_utils import get_logger +import datetime + +logger = get_logger(__name__) def get_api_key(api_key: Optional[str] = Header(None, alias="X-API-Key")) -> str: @@ -55,18 +62,19 @@ def __init__(self, validator): def get_api_key_dependency(self) -> Callable: """Get a dependency function that checks the API key against config.""" - config = self.validator.config - - def api_key_validator(api_key: str = Depends(get_api_key)): - if not hasattr(config, "API_KEY") or not config.API_KEY: - return # No API key configured, skip validation - if api_key != config.API_KEY: - raise HTTPException(status_code=403, detail="Invalid API Key") + def check_api_key(): + return require_api_key(config=self.validator.config) - return api_key_validator + return check_api_key def register_routes(self) -> None: + # Mount static files directory + try: + self.app.mount("/static", StaticFiles(directory="static"), name="static") + except Exception as e: + logger.error(f"Failed to mount static files: {str(e)}, cwd: {os.getcwd()}") + self.app.add_api_route( "/healthcheck", self.healthcheck, @@ -151,6 +159,81 @@ def register_routes(self) -> None: dependencies=[Depends(api_key_dependency)], ) + # Add HTML page routes + self.app.add_api_route( + "/errors", + self.serve_error_logs_page, + methods=["GET"], + tags=["pages"], + response_class=HTMLResponse, + dependencies=[Depends(api_key_dependency)], + ) + + self.app.add_api_route( + "/workers", + self.serve_worker_registry_page, + methods=["GET"], + tags=["pages"], + response_class=HTMLResponse, + dependencies=[Depends(api_key_dependency)], + ) + + self.app.add_api_route( + "/routing", + self.serve_routing_table_page, + methods=["GET"], + tags=["pages"], + response_class=HTMLResponse, + dependencies=[Depends(api_key_dependency)], + ) + + self.app.add_api_route( + "/unregistered-nodes", + self.serve_unregistered_nodes_page, + methods=["GET"], + tags=["pages"], + response_class=HTMLResponse, + dependencies=[Depends(api_key_dependency)], + ) + + # Add dashboard endpoint + self.app.add_api_route( + "/dashboard", + self.dashboard, + methods=["GET"], + tags=["dashboard"], + response_class=HTMLResponse, + dependencies=[Depends(api_key_dependency)], + ) + + # Add JSON API endpoint for dashboard data + self.app.add_api_route( + "/dashboard/data", + self.dashboard_data, + methods=["GET"], + tags=["dashboard"], + dependencies=[Depends(api_key_dependency)], + ) + + # Add JSON API endpoint for score simulation data + self.app.add_api_route( + "/score-simulation/data", + self.score_simulation_data, + methods=["GET"], + tags=["simulation"], + dependencies=[Depends(api_key_dependency)], + ) + + # Add Score Simulation HTML Page Route + self.app.add_api_route( + "/score-simulation", + self.serve_score_simulation_page, + methods=["GET"], + tags=["pages"], + response_class=HTMLResponse, + dependencies=[Depends(api_key_dependency)], + ) + async def healthcheck(self): # Implement the healthcheck logic for the validator return self.validator.healthcheck() @@ -307,3 +390,110 @@ async def monitor_unregistered_tee_addresses(self): } except Exception as e: return {"error": str(e)} + + async 
def dashboard(self): + # Implement the dashboard logic for the validator + return self.validator.dashboard() + + async def dashboard_data(self): + # Implement the dashboard data logic for the validator + return self.validator.dashboard_data() + + async def serve_error_logs_page(self): + """Serve the error logs HTML page""" + try: + with open("static/error-logs.html", "r") as f: + content = f.read() + + # Replace placeholders with actual values + network = self.validator.config.SUBTENSOR_NETWORK.upper() + content = content.replace("{{network}}", network) + content = content.replace( + "{{current_year}}", str(datetime.datetime.now().year) + ) + + return HTMLResponse(content=content) + except Exception as e: + logger.error(f"Failed to serve error logs page: {str(e)}") + return HTMLResponse(content=f"Error: {str(e)}") + + async def serve_worker_registry_page(self): + """Serve the worker registry HTML page""" + try: + with open("static/worker-registry.html", "r") as f: + content = f.read() + + # Replace placeholders with actual values + network = self.validator.config.SUBTENSOR_NETWORK.upper() + content = content.replace("{{network}}", network) + content = content.replace( + "{{current_year}}", str(datetime.datetime.now().year) + ) + + return HTMLResponse(content=content) + except Exception as e: + logger.error(f"Failed to serve worker registry page: {str(e)}") + return HTMLResponse(content=f"Error: {str(e)}") + + async def serve_routing_table_page(self): + """Serve the routing table HTML page""" + try: + with open("static/routing-table.html", "r") as f: + content = f.read() + + # Replace placeholders with actual values + network = self.validator.config.SUBTENSOR_NETWORK.upper() + content = content.replace("{{network}}", network) + content = content.replace( + "{{current_year}}", str(datetime.datetime.now().year) + ) + + return HTMLResponse(content=content) + except Exception as e: + logger.error(f"Failed to serve routing table page: {str(e)}") + return HTMLResponse(content=f"Error: {str(e)}") + + async def serve_unregistered_nodes_page(self): + """Serve the unregistered nodes HTML page""" + try: + with open("static/unregistered-nodes.html", "r") as f: + content = f.read() + + # Replace placeholders with actual values + network = self.validator.config.SUBTENSOR_NETWORK.upper() + content = content.replace("{{network}}", network) + content = content.replace( + "{{current_year}}", str(datetime.datetime.now().year) + ) + + return HTMLResponse(content=content) + except Exception as e: + logger.error(f"Failed to serve unregistered nodes page: {str(e)}") + return HTMLResponse(content=f"Error: {str(e)}") + + async def serve_score_simulation_page(self): + """Serve the score simulation HTML page""" + try: + with open("static/score-simulation.html", "r") as f: + content = f.read() + + # Replace placeholders with actual values + network = self.validator.config.SUBTENSOR_NETWORK.upper() + content = content.replace("{{network}}", network) + content = content.replace( + "{{current_year}}", str(datetime.datetime.now().year) + ) + + return HTMLResponse(content=content) + except Exception as e: + logger.error(f"Failed to serve score simulation page: {str(e)}") + return HTMLResponse(content=f"Error: {str(e)}") + + async def score_simulation_data(self): + """Return JSON data for score simulation based on telemetry""" + try: + data = await self.validator.get_score_simulation_data() + return data + except Exception as e: + logger.error(f"Failed to get score simulation data: {str(e)}") + return {"error": str(e)} From 
deb83430ac57e6bc83b152cd6ff5cc1834d96fd6 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Tue, 8 Apr 2025 09:54:14 -0700 Subject: [PATCH 11/20] fix(validator): removing log --- neurons/validator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/neurons/validator.py b/neurons/validator.py index b9de01d..22aa07a 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -311,8 +311,6 @@ async def get_score_simulation_data(self): node.node_id: node.hotkey for node in self.metagraph.nodes.values() } - logger.info(f"UID TO HOTKEY: {uid_to_hotkey} {uids}") - # 4. Format the scores for the API response - directly use raw scores from calculate_weights formatted_scores = [ {"hotkey": uid_to_hotkey.get(int(uid)), "score": float(score)} From 3c0341143a3f6f1cfad9d000a230267dcc29c6a1 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Tue, 8 Apr 2025 11:17:40 -0700 Subject: [PATCH 12/20] feat(valdiator): monitoring --- static/routing-table.html | 28 +++++++++------------ static/score-simulation.html | 6 ++--- static/worker-registry.html | 29 ++++++++++------------ validator/api_routes.py | 25 +++++++++++++++---- validator/node_manager.py | 48 +++++++++++++++++++++++++++++++++--- 5 files changed, 93 insertions(+), 43 deletions(-) diff --git a/static/routing-table.html b/static/routing-table.html index 5827c27..c787fdb 100644 --- a/static/routing-table.html +++ b/static/routing-table.html @@ -330,7 +330,7 @@
Network Visualization
.then(response => response.json()) .then(data => { populateRoutingTable(data.miner_addresses); - updateRoutingStats(data.miner_addresses); + updateRoutingStats(data.miner_addresses, data.count); updateNetworkVisualization(data.miner_addresses); updateLastRefresh(); }) @@ -370,7 +370,7 @@
Network Visualization
const row = document.createElement('tr'); // Randomly determine if route is connected (for demo) - const isConnected = Math.random() > 0.3; + const isConnected = true; const statusClass = isConnected ? 'status-connected' : 'status-disconnected'; const statusText = isConnected ? 'Connected' : 'Disconnected'; @@ -394,13 +394,13 @@
Network Visualization
addCopyListeners(); } - function updateRoutingStats(routes) { - document.getElementById('total-routes').textContent = routes.length; + function updateRoutingStats(routes, count) { + document.getElementById('total-routes').textContent = count; // Randomly determine connected routes for demo purposes - const connectedCount = Math.floor(routes.length * 0.7); + const connectedCount = routes.length; document.getElementById('connected-routes').textContent = connectedCount; - document.getElementById('disconnected-routes').textContent = routes.length - connectedCount; + document.getElementById('disconnected-routes').textContent = count - connectedCount; } function updateNetworkVisualization(routes) { @@ -450,16 +450,12 @@
Network Visualization
line.style.transform = `rotate(${angle2}deg)`; // Random connection status - const isConnected = Math.random() > 0.3; - if (isConnected) { - line.style.background = 'rgba(16, 185, 129, 0.3)'; - minerNode.style.background = 'var(--success-color)'; - minerNode.style.boxShadow = '0 0 10px var(--success-color)'; - } else { - line.style.background = 'rgba(239, 68, 68, 0.3)'; - minerNode.style.background = 'var(--danger-color)'; - minerNode.style.boxShadow = '0 0 10px var(--danger-color)'; - } + // const isConnected = Math.random() > 0.3; + + line.style.background = 'rgba(16, 185, 129, 0.3)'; + minerNode.style.background = 'var(--success-color)'; + minerNode.style.boxShadow = '0 0 10px var(--success-color)'; + container.appendChild(line); } diff --git a/static/score-simulation.html b/static/score-simulation.html index 3dabf8f..7efe3b9 100644 --- a/static/score-simulation.html +++ b/static/score-simulation.html @@ -222,7 +222,7 @@
Score Simulation
-Scatter Plot Distribution (Sorted by Score - Highest First)
+Scatter Plot Distribution (Sorted by Score - Lowest First)
@@ -396,8 +396,8 @@
Scatter Plot Distribution (Sorted by Score - Highest First)
const ctx = document.getElementById('scatterChart'); if (!ctx) return; - // Create dataset for scatter plot with pre-sorted scores - const sortedScores = scores.slice().sort((a, b) => b.score - a.score); + // Create dataset for scatter plot with pre-sorted scores (lowest to highest) + const sortedScores = scores.slice().sort((a, b) => a.score - b.score); const dataPoints = sortedScores.map((item, index) => ({ x: index + 1, // Rank (1-based) y: item.score, // Score for y-axis diff --git a/static/worker-registry.html b/static/worker-registry.html index 27caa50..6b2b677 100644 --- a/static/worker-registry.html +++ b/static/worker-registry.html @@ -196,7 +196,6 @@
Worker Registry
-
@@ -276,7 +275,7 @@

Worker Registry

if (!workers || workers.length === 0) { const row = document.createElement('tr'); - row.innerHTML = ``; + row.innerHTML = ``; tableBody.appendChild(row); return; } @@ -293,29 +292,27 @@

Worker Registry

if (filteredWorkers.length === 0) { const row = document.createElement('tr'); - row.innerHTML = ``; + row.innerHTML = ``; tableBody.appendChild(row); return; } - // Generate random last seen times for demo purposes + // For displaying last seen time const now = new Date(); filteredWorkers.forEach(worker => { const row = document.createElement('tr'); - // Randomly determine if worker is active (for demo) - const isActive = Math.random() > 0.3; - const statusClass = isActive ? 'status-active' : 'status-inactive'; - const statusText = isActive ? 'Active' : 'Inactive'; - - // Generate random last seen time between now and 3 days ago - const randomHours = Math.floor(Math.random() * 72); - const lastSeen = new Date(now.getTime() - randomHours * 60 * 60 * 1000); - const lastSeenStr = lastSeen.toLocaleString(); + // Use the last seen time if provided, or generate a placeholder + let lastSeenStr = worker.last_seen || 'Unknown'; + if (lastSeenStr === 'Unknown') { + // Fallback to a generated time for demo purposes + const randomHours = Math.floor(Math.random() * 72); + const lastSeen = new Date(now.getTime() - randomHours * 60 * 60 * 1000); + lastSeenStr = lastSeen.toLocaleString(); + } row.innerHTML = ` - @@ -331,8 +328,8 @@

Worker Registry

function updateWorkerStats(workers) { document.getElementById('total-workers').textContent = workers.length; - // Randomly determine active workers for demo purposes - const activeCount = Math.floor(workers.length * 0.7); + // Calculate active and inactive counts based on routing table presence + const activeCount = workers.filter(worker => worker.is_in_routing_table).length; document.getElementById('active-workers').textContent = activeCount; document.getElementById('inactive-workers').textContent = workers.length - activeCount; } diff --git a/validator/api_routes.py b/validator/api_routes.py index 09a6bb5..fe36aba 100644 --- a/validator/api_routes.py +++ b/validator/api_routes.py @@ -242,12 +242,26 @@ async def monitor_worker_registry(self): """Return all worker registrations (worker_id to hotkey mappings)""" try: registrations = self.validator.routing_table.get_all_worker_registrations() + worker_registrations = [] + + for worker_id, hotkey in registrations: + # Check if the worker is in the routing table (active) + miner_addresses = self.validator.routing_table.get_miner_addresses( + hotkey + ) + is_in_routing_table = len(miner_addresses) > 0 + + worker_registrations.append( + { + "worker_id": worker_id, + "hotkey": hotkey, + "is_in_routing_table": is_in_routing_table, + } + ) + return { "count": len(registrations), - "worker_registrations": [ - {"worker_id": worker_id, "hotkey": hotkey} - for worker_id, hotkey in registrations - ], + "worker_registrations": worker_registrations, } except Exception as e: return {"error": str(e)} @@ -256,8 +270,9 @@ async def monitor_routing_table(self): """Return all miner addresses and their associated hotkeys""" try: addresses = self.validator.routing_table.get_all_addresses_with_hotkeys() + nodes_count = len(self.validator.metagraph.nodes) return { - "count": len(addresses), + "count": nodes_count, "miner_addresses": [ { "hotkey": hotkey, diff --git a/validator/node_manager.py b/validator/node_manager.py index 61acb05..4804604 100644 --- a/validator/node_manager.py +++ b/validator/node_manager.py @@ -295,9 +295,24 @@ async def update_tee_list(self): f"Retrieved TEE addresses for hotkey {hotkey}: {tee_addresses}" ) - # Cleaning DB from addresses under this hotkey - routing_table.clear_miner(hotkey=node.hotkey) - logger.debug(f"Cleared existing addresses for hotkey {hotkey}") + # Instead of clearing all entries, get current TEEs for this hotkey + current_tees = routing_table.get_miner_addresses(hotkey=node.hotkey) + logger.debug( + f"Retrieved {len(current_tees) if current_tees else 0} current TEEs for {hotkey}" + ) + + # Create a set of current TEE addresses for comparison + current_tee_urls = set() + if current_tees: + for address, worker_id in current_tees: + current_tee_urls.add(address) + + logger.debug( + f"Current TEE addresses for hotkey {hotkey}: {current_tee_urls}" + ) + + # Track successfully verified TEEs in this update + verified_tees = set() if tee_addresses: for tee_address in tee_addresses.split(","): @@ -425,6 +440,9 @@ async def update_tee_list(self): f"hotkey {hotkey}" ) + # Add to verified TEEs + verified_tees.add(tee_address) + # Check if this is a new worker registration (worker_id was not set before) if worker_hotkey is None: logger.info( @@ -465,6 +483,30 @@ async def update_tee_list(self): miner_address=f"{node.ip}:{node.port}", message="No TEE addresses returned", ) + + # Remove only TEEs that were not verified in this update + tees_to_remove = current_tee_urls - verified_tees + if tees_to_remove: + logger.info( + f"Removing 
{len(tees_to_remove)} unresponsive TEEs for hotkey {hotkey}" + ) + for tee_to_remove in tees_to_remove: + logger.debug( + f"Removing unresponsive TEE {tee_to_remove} for hotkey {hotkey}" + ) + # Find the UID for this address + for address, worker_id in current_tees: + if address == tee_to_remove: + # Call remove_miner_address with the correct parameters (hotkey, uid) + routing_table.remove_miner_address( + hotkey=node.hotkey, uid=node.node_id + ) + break + + logger.debug( + f"Kept {len(verified_tees)} verified TEEs for hotkey {hotkey}" + ) + except Exception as e: logger.error( f"Error processing TEE addresses for hotkey {hotkey}: {e}" ) From ea74f5c77a674ef0a83a4e6b2a1fb9b41f212350 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Tue, 8 Apr 2025 11:52:56 -0700 Subject: [PATCH 13/20] feat(docker): fixing dashboard site --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 91570ba..79d028f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,7 +32,7 @@ COPY neurons neurons/ COPY miner miner/ COPY validator validator/ COPY db db/ - +COPY static static/ # Copy entrypoint script and make it executable COPY entrypoint.sh . From 1a1102bb5a6866461f999a86cc1dd860218f3029 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Tue, 8 Apr 2025 12:57:00 -0700 Subject: [PATCH 14/20] feat(validator): improving scoring --- validator/weights.py | 65 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/validator/weights.py b/validator/weights.py index 0694de2..d6cd48a 100644 --- a/validator/weights.py +++ b/validator/weights.py @@ -105,7 +105,15 @@ def _get_delta_node_data(self) -> List[NodeData]: hotkeys_to_score = ( self.validator.telemetry_storage.get_all_hotkeys_with_telemetry() ) - + # Get all hotkeys from metagraph to ensure we include those without telemetry + all_hotkeys = [] + for node_idx, node in enumerate(self.validator.metagraph.nodes): + node_data = self.validator.metagraph.nodes[node] + hotkey = node_data.hotkey + all_hotkeys.append((node_data.node_id, hotkey)) + + # Process hotkeys with telemetry data + processed_hotkeys = set() for hotkey in hotkeys_to_score: node_telemetry = self.validator.telemetry_storage.get_telemetry_by_hotkey( hotkey ) @@ -156,11 +164,59 @@ ) delta_node_data.append(delta_data) + processed_hotkeys.add(hotkey) logger.debug(f"Calculated deltas for {hotkey}: {delta_data}") else: logger.debug( f"Not enough telemetry data for {hotkey} to calculate deltas" ) + # Find UID for this hotkey + uid = next((uid for uid, hk in all_hotkeys if hk == hotkey), 0) + # Add empty telemetry for hotkeys with insufficient data + delta_data = NodeData( + hotkey=hotkey, + uid=uid, + worker_id="", + timestamp=0, + boot_time=0, + last_operation_time=0, + current_time=0, + twitter_auth_errors=0, + twitter_errors=0, + twitter_ratelimit_errors=0, + twitter_returned_other=0, + twitter_returned_profiles=0, + twitter_returned_tweets=0, + twitter_scrapes=0, + web_errors=0, + web_success=0, + ) + delta_node_data.append(delta_data) + processed_hotkeys.add(hotkey) + + # Add empty telemetry for hotkeys without any telemetry data + for uid, hotkey in all_hotkeys: + if hotkey not in processed_hotkeys: + logger.debug(f"Adding empty telemetry for {hotkey} (uid: {uid})") + delta_data = NodeData( + hotkey=hotkey, + uid=uid, + worker_id="", + timestamp=0, + boot_time=0, + last_operation_time=0, + current_time=0, + twitter_auth_errors=0, + twitter_errors=0, + 
twitter_ratelimit_errors=0, + twitter_returned_other=0, + twitter_returned_profiles=0, + twitter_returned_tweets=0, + twitter_scrapes=0, + web_errors=0, + web_success=0, + ) + delta_node_data.append(delta_data) logger.info(f"Calculated deltas for {len(delta_node_data)} nodes") return delta_node_data @@ -243,8 +299,11 @@ async def calculate_weights( logger.error( f"Node with hotkey '{node.hotkey}' not found in metagraph." ) - - uids = sorted(miner_scores.keys()) + # Convert string UIDs to integers for proper sorting, if needed + uids = sorted( + miner_scores.keys(), + key=lambda x: int(x) if isinstance(x, str) and x.isdigit() else x, + ) weights = [float(miner_scores[uid]) for uid in uids] logger.info(f"Completed weight calculation for {len(uids)} nodes") From b7e92399140809a4e02cc8775788715fb340abb4 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Tue, 8 Apr 2025 13:24:14 -0700 Subject: [PATCH 15/20] feat(validator): restarting telemetry when TEE is restarted --- db/telemetry_database.py | 13 ++++ validator/telemetry_storage.py | 13 ++++ validator/weights.py | 132 ++++++++++++++++++++++++--------- 3 files changed, 123 insertions(+), 35 deletions(-) diff --git a/db/telemetry_database.py b/db/telemetry_database.py index 75a6509..de52d0f 100644 --- a/db/telemetry_database.py +++ b/db/telemetry_database.py @@ -127,3 +127,16 @@ def get_all_hotkeys_with_telemetry(self): ) hotkeys = [row[0] for row in cursor.fetchall()] return hotkeys + + def delete_telemetry_by_hotkey(self, hotkey): + """Delete all telemetry entries for a specific hotkey.""" + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + DELETE FROM telemetry WHERE hotkey = ? + """, + (hotkey,), + ) + conn.commit() + return cursor.rowcount # Return the number of rows deleted diff --git a/validator/telemetry_storage.py b/validator/telemetry_storage.py index 30d600f..f72a4a5 100644 --- a/validator/telemetry_storage.py +++ b/validator/telemetry_storage.py @@ -70,3 +70,16 @@ def get_all_hotkeys_with_telemetry(self): except sqlite3.Error as e: logger.error(f"Failed to retrieve hotkeys with telemetry: {e}") return [] + + def delete_telemetry_by_hotkey(self, hotkey): + """ + Delete all telemetry entries for a specific hotkey using the + TelemetryDatabase method.
+ """ + try: + rows_deleted = self.db.delete_telemetry_by_hotkey(hotkey) + logger.info(f"Deleted {rows_deleted} telemetry entries for hotkey {hotkey}") + return rows_deleted + except sqlite3.Error as e: + logger.error(f"Failed to delete telemetry for hotkey {hotkey}: {e}") + return 0 diff --git a/validator/weights.py b/validator/weights.py index d6cd48a..e17c4ab 100644 --- a/validator/weights.py +++ b/validator/weights.py @@ -129,43 +129,105 @@ def _get_delta_node_data(self) -> List[NodeData]: latest = sorted_telemetry[0] oldest = sorted_telemetry[-1] - # Calculate deltas between latest and oldest values - delta_data = NodeData( - hotkey=hotkey, - uid=latest.uid, - worker_id=latest.worker_id, - timestamp=latest.timestamp, - boot_time=latest.boot_time - oldest.boot_time, - last_operation_time=( - latest.last_operation_time - oldest.last_operation_time - ), - current_time=latest.current_time - oldest.current_time, - twitter_auth_errors=( - latest.twitter_auth_errors - oldest.twitter_auth_errors - ), - twitter_errors=(latest.twitter_errors - oldest.twitter_errors), - twitter_ratelimit_errors=( - latest.twitter_ratelimit_errors - - oldest.twitter_ratelimit_errors - ), - twitter_returned_other=( - latest.twitter_returned_other - oldest.twitter_returned_other - ), - twitter_returned_profiles=( - latest.twitter_returned_profiles - - oldest.twitter_returned_profiles - ), - twitter_returned_tweets=( - latest.twitter_returned_tweets - oldest.twitter_returned_tweets - ), - twitter_scrapes=(latest.twitter_scrapes - oldest.twitter_scrapes), - web_errors=latest.web_errors - oldest.web_errors, - web_success=latest.web_success - oldest.web_success, + # Check for negative deltas (TEE restart) + has_negative_delta = False + + # Calculate and check key metrics + boot_time_delta = latest.boot_time - oldest.boot_time + last_operation_time_delta = ( + latest.last_operation_time - oldest.last_operation_time + ) + twitter_auth_errors_delta = ( + latest.twitter_auth_errors - oldest.twitter_auth_errors + ) + twitter_errors_delta = latest.twitter_errors - oldest.twitter_errors + twitter_ratelimit_errors_delta = ( + latest.twitter_ratelimit_errors - oldest.twitter_ratelimit_errors + ) + twitter_returned_other_delta = ( + latest.twitter_returned_other - oldest.twitter_returned_other + ) + twitter_returned_profiles_delta = ( + latest.twitter_returned_profiles - oldest.twitter_returned_profiles ) + twitter_returned_tweets_delta = ( + latest.twitter_returned_tweets - oldest.twitter_returned_tweets + ) + twitter_scrapes_delta = latest.twitter_scrapes - oldest.twitter_scrapes + web_errors_delta = latest.web_errors - oldest.web_errors + web_success_delta = latest.web_success - oldest.web_success + + # Check if any delta is negative, indicating a TEE restart + if ( + boot_time_delta < 0 + or last_operation_time_delta < 0 + or twitter_auth_errors_delta < 0 + or twitter_errors_delta < 0 + or twitter_ratelimit_errors_delta < 0 + or twitter_returned_other_delta < 0 + or twitter_returned_profiles_delta < 0 + or twitter_returned_tweets_delta < 0 + or twitter_scrapes_delta < 0 + or web_errors_delta < 0 + or web_success_delta < 0 + ): + + has_negative_delta = True + logger.warning( + f"Detected negative delta for {hotkey}, indicating TEE restart. Deleting telemetry data." 
+ ) + # Delete all telemetry for this node + self.validator.telemetry_storage.delete_telemetry_by_hotkey(hotkey) + + if not has_negative_delta: + # Calculate deltas between latest and oldest values, ensuring no negatives + delta_data = NodeData( + hotkey=hotkey, + uid=latest.uid, + worker_id=latest.worker_id, + timestamp=latest.timestamp, + boot_time=max(0, boot_time_delta), + last_operation_time=max(0, last_operation_time_delta), + current_time=latest.current_time - oldest.current_time, + twitter_auth_errors=max(0, twitter_auth_errors_delta), + twitter_errors=max(0, twitter_errors_delta), + twitter_ratelimit_errors=max(0, twitter_ratelimit_errors_delta), + twitter_returned_other=max(0, twitter_returned_other_delta), + twitter_returned_profiles=max( + 0, twitter_returned_profiles_delta + ), + twitter_returned_tweets=max(0, twitter_returned_tweets_delta), + twitter_scrapes=max(0, twitter_scrapes_delta), + web_errors=max(0, web_errors_delta), + web_success=max(0, web_success_delta), + ) - delta_node_data.append(delta_data) - processed_hotkeys.add(hotkey) - logger.debug(f"Calculated deltas for {hotkey}: {delta_data}") + delta_node_data.append(delta_data) + processed_hotkeys.add(hotkey) + logger.debug(f"Calculated deltas for {hotkey}: {delta_data}") + else: + # Create empty telemetry data for this node since we deleted its telemetry + uid = next((uid for uid, hk in all_hotkeys if hk == hotkey), 0) + delta_data = NodeData( + hotkey=hotkey, + uid=uid, + worker_id="", + timestamp=0, + boot_time=0, + last_operation_time=0, + current_time=0, + twitter_auth_errors=0, + twitter_errors=0, + twitter_ratelimit_errors=0, + twitter_returned_other=0, + twitter_returned_profiles=0, + twitter_returned_tweets=0, + twitter_scrapes=0, + web_errors=0, + web_success=0, + ) + delta_node_data.append(delta_data) + processed_hotkeys.add(hotkey) else: logger.debug( f"Not enough telemetry data for {hotkey} to calculate deltas" From 88dc7bb01104147adc455cf1270e21628ebb3779 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Wed, 9 Apr 2025 12:05:02 -0700 Subject: [PATCH 16/20] feat(validator): removing registered nodes from unregistered list --- db/routing_table_database.py | 19 +++++++++++++ static/routing-table.html | 53 ++++-------------------------------- validator/api_routes.py | 1 + validator/node_manager.py | 25 +++++++++++++++++ validator/routing_table.py | 14 ++++++++++ 5 files changed, 64 insertions(+), 48 deletions(-) diff --git a/db/routing_table_database.py b/db/routing_table_database.py index 6856566..7c9cda0 100644 --- a/db/routing_table_database.py +++ b/db/routing_table_database.py @@ -281,3 +281,22 @@ def get_all_unregistered_tee_addresses(self): ) results = cursor.fetchall() return [address[0] for address in results] + + def remove_unregistered_tee(self, address): + """ + Remove a specific unregistered TEE by address. + + :param address: The address of the unregistered TEE to remove + :return: True if an entry was removed, False if not found + """ + with self.lock, sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + DELETE FROM unregistered_tees + WHERE address = ? 
+ """, + (address,), + ) + conn.commit() + return cursor.rowcount > 0 diff --git a/static/routing-table.html b/static/routing-table.html index c787fdb..c16d81f 100644 --- a/static/routing-table.html +++ b/static/routing-table.html @@ -42,24 +42,6 @@ max-width: 250px; } - .connection-status { - display: inline-flex; - align-items: center; - font-size: 0.85rem; - padding: 4px 8px; - border-radius: 4px; - } - - .status-connected { - background: rgba(16, 185, 129, 0.2); - color: var(--success-color); - } - - .status-disconnected { - background: rgba(239, 68, 68, 0.2); - color: var(--danger-color); - } - .search-box { display: flex; margin-bottom: 20px; @@ -238,14 +220,6 @@

Routing Table

0
Total Routes
- 0
- Connected
- 0
- Disconnected
@@ -262,7 +236,6 @@

Network Visualization

Status Worker ID Hotkey Last SeenNo workers registeredNo workers registeredNo matching workers foundNo matching workers found${statusText} ${worker.worker_id} ${worker.hotkey} ${lastSeenStr}
- @@ -343,7 +316,7 @@

Network Visualization

if (!routes || routes.length === 0) { const row = document.createElement('tr'); - row.innerHTML = ``; + row.innerHTML = ``; tableBody.appendChild(row); return; } @@ -361,7 +334,7 @@

Network Visualization

if (filteredRoutes.length === 0) { const row = document.createElement('tr'); - row.innerHTML = ``; + row.innerHTML = ``; tableBody.appendChild(row); return; } @@ -369,16 +342,10 @@

Network Visualization

filteredRoutes.forEach(route => { const row = document.createElement('tr'); - // Randomly determine if route is connected (for demo) - const isConnected = true; - const statusClass = isConnected ? 'status-connected' : 'status-disconnected'; - const statusText = isConnected ? 'Connected' : 'Disconnected'; - // Add copy feedback span to copyable cells const copyFeedbackSpan = 'Copied!'; row.innerHTML = ` - @@ -396,11 +363,6 @@

Network Visualization

function updateRoutingStats(routes, count) { document.getElementById('total-routes').textContent = count; - - // Randomly determine connected routes for demo purposes - const connectedCount = routes.length; - document.getElementById('connected-routes').textContent = connectedCount; - document.getElementById('disconnected-routes').textContent = count - connectedCount; } function updateNetworkVisualization(routes) { @@ -432,6 +394,8 @@

Network Visualization

minerNode.className = 'node'; minerNode.style.top = `${y}%`; minerNode.style.left = `${x}%`; + minerNode.style.background = 'var(--accent-color)'; + minerNode.style.boxShadow = '0 0 10px var(--accent-color)'; container.appendChild(minerNode); // Create connection line @@ -448,14 +412,7 @@

Network Visualization

line.style.left = '50%'; line.style.top = '50%'; line.style.transform = `rotate(${angle2}deg)`; - - // Random connection status - // const isConnected = Math.random() > 0.3; - - line.style.background = 'rgba(16, 185, 129, 0.3)'; - minerNode.style.background = 'var(--success-color)'; - minerNode.style.boxShadow = '0 0 10px var(--success-color)'; - + line.style.background = 'rgba(255, 255, 255, 0.2)'; container.appendChild(line); } diff --git a/validator/api_routes.py b/validator/api_routes.py index fe36aba..9314148 100644 --- a/validator/api_routes.py +++ b/validator/api_routes.py @@ -271,6 +271,7 @@ async def monitor_routing_table(self): try: addresses = self.validator.routing_table.get_all_addresses_with_hotkeys() nodes_count = len(self.validator.metagraph.nodes) + return { "count": nodes_count, "miner_addresses": [ diff --git a/validator/node_manager.py b/validator/node_manager.py index 4804604..d45d016 100644 --- a/validator/node_manager.py +++ b/validator/node_manager.py @@ -525,6 +525,31 @@ async def update_tee_list(self): miner_address="", message="Hotkey not found in metagraph", ) + + # Clean up any unregistered TEEs that are now in the routing table + try: + # Get all registered addresses + registered_addrs = routing_table.get_all_addresses() + + # Get current list of unregistered TEE addresses + unregistered_addrs = routing_table.get_all_unregistered_tee_addresses() + + # Check which addresses should be removed from unregistered list + cleaned_count = 0 + + for address in registered_addrs: + if address in unregistered_addrs: + # This address was previously unregistered but is now registered + routing_table.remove_unregistered_tee(address) + cleaned_count += 1 + + if cleaned_count > 0: + logger.info( + f"Cleaned {cleaned_count} addresses from unregistered TEEs that are now registered" + ) + except Exception as e: + logger.error(f"Error cleaning up unregistered TEEs: {str(e)}") + logger.info("Completed TEE list update ✅") async def send_score_report( diff --git a/validator/routing_table.py b/validator/routing_table.py index c017bb9..37cb235 100644 --- a/validator/routing_table.py +++ b/validator/routing_table.py @@ -186,3 +186,17 @@ def get_all_unregistered_tee_addresses(self): except sqlite3.Error as e: logger.error(f"Failed to get unregistered TEE addresses: {e}") return [] + + def remove_unregistered_tee(self, address): + """Remove a specific unregistered TEE by address.""" + try: + logger.info(f"Removing unregistered TEE: address={address}") + result = self.db.remove_unregistered_tee(address) + if result: + logger.debug(f"Successfully removed unregistered TEE: {address}") + else: + logger.debug(f"No unregistered TEE found with address: {address}") + return result + except sqlite3.Error as e: + logger.error(f"Failed to remove unregistered TEE: {e}") + return False From 2bc18c541b98db35ae729599eaa660736d0ae569 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Wed, 9 Apr 2025 13:51:06 -0700 Subject: [PATCH 17/20] feat(validator): using dedicated TEE --- .env.example | 3 +++ validator/node_manager.py | 4 ++- validator/scorer.py | 4 ++- validator/telemetry.py | 54 +++++++++++++++++++++++++++++++-------- 4 files changed, 53 insertions(+), 12 deletions(-) diff --git a/.env.example b/.env.example index ead39ad..e2ea289 100644 --- a/.env.example +++ b/.env.example @@ -40,6 +40,9 @@ MINER_TEE_ADDRESS=https://tee-worker-ip:8080 # Set to 1 for simulation mode OE_SIMULATION=1 +# TEE Configuration (for validator) +TELEMETRY_RESULT_WORKER_ADDRESS=https://alternate-tee-worker:8080 + # System 
Configuration # Enable debug logging DEBUG=false diff --git a/validator/node_manager.py b/validator/node_manager.py index d45d016..ad0198b 100644 --- a/validator/node_manager.py +++ b/validator/node_manager.py @@ -351,7 +351,9 @@ async def update_tee_list(self): ) telemetry_result = ( - await telemetry_client.execute_telemetry_sequence() + await telemetry_client.execute_telemetry_sequence( + routing_table=routing_table + ) ) if not telemetry_result: diff --git a/validator/scorer.py b/validator/scorer.py index f232679..d9882c3 100644 --- a/validator/scorer.py +++ b/validator/scorer.py @@ -57,7 +57,9 @@ async def get_node_data(self): logger.info(f"Executing telemetry sequence for node {hotkey[:10]}...") logger.debug(f"Executing telemetry sequence for node {hotkey}") - telemetry_result = await telemetry_client.execute_telemetry_sequence() + telemetry_result = await telemetry_client.execute_telemetry_sequence( + routing_table=self.validator.routing_table + ) if telemetry_result: successful_nodes += 1 diff --git a/validator/telemetry.py b/validator/telemetry.py index fcf0834..75598c6 100644 --- a/validator/telemetry.py +++ b/validator/telemetry.py @@ -11,6 +11,12 @@ class TEETelemetryClient: def __init__(self, tee_worker_address): self.tee_worker_address = tee_worker_address + # Get alternative TEE worker address for result submission from environment variable + self.result_tee_worker_address = os.getenv( + "TELEMETRY_RESULT_WORKER_ADDRESS", self.tee_worker_address + ) + logger.debug(f"TEE worker address: {self.tee_worker_address}") + logger.debug(f"Result TEE worker address: {self.result_tee_worker_address}") async def generate_telemetry_job(self): async with httpx.AsyncClient(verify=False) as client: @@ -51,7 +57,7 @@ async def check_telemetry_job(self, job_uuid): signature = content.decode("utf-8") return signature - async def return_telemetry_job(self, sig, result_sig): + async def return_telemetry_job(self, sig, result_sig, routing_table=None): # Remove quotes and backslashes from signatures if result_sig.startswith('"') and result_sig.endswith('"'): result_sig = result_sig[1:-1] @@ -61,16 +67,44 @@ async def return_telemetry_job(self, sig, result_sig): sig = sig[1:-1] sig = sig.replace("\\", "") - async with httpx.AsyncClient(verify=False) as client: - response = await client.post( - f"{self.tee_worker_address}/job/result", - headers={"Content-Type": "application/json"}, - json={"encrypted_result": result_sig, "encrypted_request": sig}, + # Use the result TEE worker address instead of the original one + logger.debug(f"Submitting result to: {self.result_tee_worker_address}") + try: + async with httpx.AsyncClient(verify=False) as client: + response = await client.post( + f"{self.result_tee_worker_address}/job/result", + headers={"Content-Type": "application/json"}, + json={"encrypted_result": result_sig, "encrypted_request": sig}, + ) + response.raise_for_status() + return response.json() + except Exception as e: + logger.error( + f"Failed to submit telemetry result to {self.result_tee_worker_address}: {str(e)}" ) - response.raise_for_status() - return response.json() - async def execute_telemetry_sequence(self, max_retries=3, delay=5): + # Add the failed TEE worker to unregistered list if routing_table is provided + if ( + routing_table is not None + and self.result_tee_worker_address != self.tee_worker_address + ): + # Get current unregistered addresses + unregistered_addrs = routing_table.get_all_unregistered_tee_addresses() + + # Only add if not already in the unregistered list + if 
self.result_tee_worker_address not in unregistered_addrs: + logger.warning( + f"Adding failed result TEE worker to unregistered list: {self.result_tee_worker_address}" + ) + routing_table.add_unregistered_tee( + address=self.result_tee_worker_address, + hotkey="validator", # Using "validator" as hotkey since this isn't associated with a specific miner + ) + raise + + async def execute_telemetry_sequence( + self, max_retries=3, delay=5, routing_table=None + ): retries = 0 while retries < max_retries: try: @@ -87,7 +121,7 @@ async def execute_telemetry_sequence(self, max_retries=3, delay=5): logger.debug(f"Job status signature: {status_sig}") logger.debug("Returning telemetry job result...") - result = await self.return_telemetry_job(sig, status_sig) + result = await self.return_telemetry_job(sig, status_sig, routing_table) logger.debug(f"Telemetry job result: {result}") return result From f16b25af19d93017042e69fb0acaff0e2268cc48 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Wed, 9 Apr 2025 15:21:47 -0700 Subject: [PATCH 18/20] feat(validator): logs improvement --- validator/node_manager.py | 7 +++++-- validator/telemetry.py | 7 ++++--- validator/weights.py | 11 +++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/validator/node_manager.py b/validator/node_manager.py index ad0198b..4523390 100644 --- a/validator/node_manager.py +++ b/validator/node_manager.py @@ -156,7 +156,8 @@ async def connect_new_nodes(self) -> None: for node in available_nodes: if node.ip == "0": - logger.warn(f"Skipping node {node.hotkey}: ip is {node.ip}") + if os.getenv("DEBUG", "false").lower() == "true": + logger.warn(f"Skipping node {node.hotkey}: ip is {node.ip}") self.errors_storage.add_error( hotkey=node.hotkey, tee_address="", @@ -217,7 +218,9 @@ async def send_custom_message(self, node_hotkey: str, message: str) -> None: """ try: if node_hotkey not in self.connected_nodes: - logger.warning(f"No connected node found for hotkey {node_hotkey}") + logger.debug( + f"Warning: No connected node found for hotkey {node_hotkey}" + ) self.errors_storage.add_error( hotkey=node_hotkey, tee_address="", diff --git a/validator/telemetry.py b/validator/telemetry.py index 75598c6..5cc69da 100644 --- a/validator/telemetry.py +++ b/validator/telemetry.py @@ -126,9 +126,10 @@ async def execute_telemetry_sequence( return result except Exception as e: - logger.error( - f"Error in telemetry sequence: {self.tee_worker_address} {e}" - ) + if os.getenv("DEBUG", "false").lower() == "true": + logger.warning( + f"Error in telemetry sequence: {self.tee_worker_address} {e}" + ) retries += 1 logger.debug( f"Retrying... 
{self.tee_worker_address} ({retries}/{max_retries})" diff --git a/validator/weights.py b/validator/weights.py index e17c4ab..b45f24e 100644 --- a/validator/weights.py +++ b/validator/weights.py @@ -369,10 +369,8 @@ async def calculate_weights( weights = [float(miner_scores[uid]) for uid in uids] logger.info(f"Completed weight calculation for {len(uids)} nodes") - logger.info(f"UIDs: {uids}") - logger.info(f"weights: {weights}") - - logger.info(f"Weights: {[f'{w:.4f}' for w in weights]}") + logger.debug(f"UIDs: {uids}") + logger.debug(f"weights: {weights}") return uids, weights @@ -434,8 +432,9 @@ async def set_weights(self) -> None: ) if success: - logger.info(f"UIDS: {uids}") - logger.info(f"Scores: {scores}") + + logger.debug(f"UIDS: {uids}") + logger.debug(f"Scores: {scores}") logger.info("✅ Successfully set weights!") return else: From fa7a8ff44cbdfeefa5d0e8aa19221dad3ee0a301 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Thu, 10 Apr 2025 13:27:36 -0700 Subject: [PATCH 19/20] feat(validator): extending loops --- neurons/validator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/neurons/validator.py b/neurons/validator.py index 22aa07a..019b8c3 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -42,7 +42,7 @@ TIME_PER_WEIGHT_SETTING / 2 ) # half of a weight setting period -SYNC_LOOP_CADENCE_SECONDS = 10 +SYNC_LOOP_CADENCE_SECONDS = 120 class Validator: @@ -95,7 +95,9 @@ async def start(self) -> None: asyncio.create_task( self.background_tasks.sync_loop(SYNC_LOOP_CADENCE_SECONDS) ) - asyncio.create_task(self.background_tasks.set_weights_loop(60)) + asyncio.create_task( + self.background_tasks.set_weights_loop(WEIGHTS_LOOP_CADENCE_SECONDS) + ) asyncio.create_task( self.background_tasks.update_tee( @@ -106,7 +108,7 @@ async def start(self) -> None: # Start telemetry collection in its own task asyncio.create_task( self.background_tasks.telemetry_loop( - int(os.getenv("TELEMETRY_COLLECTION_CADENCE_SECONDS", "30")) + int(os.getenv("TELEMETRY_COLLECTION_CADENCE_SECONDS", "120")) ) ) From 0a679e386416b9bc69eb4302fcf5f6431a6968e2 Mon Sep 17 00:00:00 2001 From: hide-on-bush-x Date: Fri, 11 Apr 2025 11:53:02 -0700 Subject: [PATCH 20/20] feat(scoring): added md --- SCORING.md | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 SCORING.md diff --git a/SCORING.md b/SCORING.md new file mode 100644 index 0000000..751fb7d --- /dev/null +++ b/SCORING.md @@ -0,0 +1,116 @@ +# Subnet 42 Scoring System + +## Overview + +The Subnet 42 scoring system evaluates miner performance by analyzing telemetry data collected from their TEE (Trusted Execution Environment) workers. This scoring mechanism is designed to reward miners that successfully process web scraping and Twitter data collection tasks, while penalizing those with errors or failed operations. + +## Telemetry Collection Process + +1. **Telemetry Data Sources**: Each registered TEE worker periodically reports telemetry data that includes: + - Web scraping success and failure counts + - Twitter data collection metrics (tweets, profiles, etc.) + - Error counts by type (auth errors, rate limit errors, etc.) + - Operational timestamps + +2. 
**Delta-based Calculations**: Rather than using absolute values, the system calculates *deltas* (changes) between telemetry snapshots to measure recent performance: + - The system stores multiple telemetry snapshots over time + - Scores are based on changes between oldest and newest snapshots + - This approach rewards recent activity and improvements + +3. **TEE Restart Handling**: When a TEE worker restarts, telemetry counters may reset to zero, causing negative deltas. The system: + - Detects negative delta values in any telemetry metric + - Deletes all telemetry for that node to start fresh + - Ensures all deltas used for scoring are non-negative (using `max(0, delta)`) + +## Scoring Algorithm + +The scoring calculation follows these key steps (a simplified sketch of the delta and weighting logic appears in the appendix at the end of this document): + +1. **Telemetry Data Collection**: For each node: + - Retrieve telemetry data snapshots + - Calculate deltas between oldest and newest records + - Handle any negative deltas (TEE restarts) + - Store normalized delta values for scoring + +2. **Key Performance Metrics**: The score primarily considers: + - `web_success`: Successful web scraping operations + - `twitter_returned_tweets`: Successfully retrieved tweets + - `twitter_returned_profiles`: Successfully retrieved Twitter profiles + +3. **Kurtosis Weighting**: The system applies a custom kurtosis-like function to weight top performers more heavily: + ```python + def apply_kurtosis_custom( + x, + top_percentile=90, + reward_factor=0.4, + steepness=2.0, + center_sensitivity=0.5, + boost_factor=0.2 + ): + ``` + This function: + - Applies higher weights to nodes above the top-percentile cutoff (90th percentile by default) + - Uses configurable parameters to adjust the curve's shape + - Avoids excessively punishing nodes that are performing adequately but not exceptionally + +4. **Metric Normalization**: Each metric is normalized to ensure fair comparison: + - Values are scaled to a 0-1 range using min-max scaling + - Nodes with zero values receive minimal but non-zero scores + - Extreme outliers are handled appropriately + +5. **Score Combination**: The final score combines weighted metrics: + - Web success, tweet retrieval, and profile retrieval each contribute to the final score + - Nodes with balanced performance across all metrics receive higher scores + - Nodes with exceptional performance in one area but poor in others receive moderate scores + +6. **Validation**: Scores undergo validation checks: + - Nodes with extremely low activity receive minimal scores + - Scores are normalized to sum to 1.0 across all nodes (for setting weights) + - Invalid or disconnected nodes receive zero scores + +## Weight Setting + +The calculated scores are used to set weights on the Bittensor blockchain: + +1. **Weight Conversion**: + - Scores are converted to weights suitable for the blockchain + - The weights determine how much TAO (the network token) each miner earns + +2. **Update Frequency**: + - Weights are updated at regular intervals + - A minimum interval between updates prevents excessive blockchain transactions + - Updates are retried up to 3 times if they fail + +3. **Notification**: + - Miners receive score reports with their performance metrics + - These reports provide transparency and help miners optimize their operations + +## Performance Optimization + +To maximize your score as a miner: + +1. **Maintain Uptime**: Keep your TEE worker running continuously to avoid restarts +2. **Minimize Errors**: Reduce authentication errors, rate-limit errors, and other failures +3. 
**Maximize Successful Operations**: Focus on achieving high success rates for Twitter and web scraping +4. **Balance Performance**: Aim for good performance across all metrics rather than excelling in just one +5. **Monitor Telemetry**: Regularly check your telemetry data to identify and address issues + +## Technical Implementation + +The scoring system is implemented across several components: + +1. **WeightsManager**: Handles the overall weight calculation process +2. **NodeManager**: Manages connections to miners and collects telemetry +3. **TelemetryStorage**: Stores and retrieves telemetry data +4. **ScoringFunctions**: Implements various mathematical scoring functions + +The core of the scoring logic is in the `calculate_weights` method of the `WeightsManager` class, which: +1. Processes delta telemetry data +2. Extracts and normalizes metrics +3. Applies kurtosis weighting +4. Calculates final scores +5. Converts scores to weights + +## Conclusion + +The Subnet 42 scoring system is designed to fairly reward miners based on their actual performance in web scraping and Twitter data collection. By using delta-based metrics and kurtosis weighting, the system encourages continuous improvement and rewards both consistency and excellence. \ No newline at end of file
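## Appendix: Illustrative Scoring Sketches

The snippets in this appendix are minimal, self-contained sketches of the mechanisms SCORING.md describes, added for illustration only; they are not the repository's implementation. The `TelemetrySnapshot` type and the helpers `compute_deltas` and `min_max_scale` below are hypothetical stand-ins for the real `NodeData` record and the validator's internal logic.

```python
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class TelemetrySnapshot:
    """Hypothetical, trimmed-down stand-in for the real NodeData record."""

    timestamp: int
    web_success: int
    twitter_returned_tweets: int
    twitter_returned_profiles: int


def compute_deltas(snapshots: List[TelemetrySnapshot]) -> Optional[Dict[str, int]]:
    """Return newest-minus-oldest counter deltas, or None when the node
    cannot be scored from this window."""
    if len(snapshots) < 2:
        return None  # not enough history to measure recent activity
    newest = max(snapshots, key=lambda s: s.timestamp)
    oldest = min(snapshots, key=lambda s: s.timestamp)
    deltas = {
        "web_success": newest.web_success - oldest.web_success,
        "twitter_returned_tweets": newest.twitter_returned_tweets
        - oldest.twitter_returned_tweets,
        "twitter_returned_profiles": newest.twitter_returned_profiles
        - oldest.twitter_returned_profiles,
    }
    # A negative delta means a counter reset, i.e. the TEE restarted;
    # per the rules above, that node's telemetry is wiped and it is
    # scored as if it had no activity in this window.
    if any(v < 0 for v in deltas.values()):
        return None
    return deltas


def min_max_scale(values: List[float]) -> List[float]:
    """Scale one metric column into [0, 1]; a constant column collapses to 0."""
    if not values:
        return []
    lo, hi = min(values), max(values)
    if hi == lo:
        return [0.0 for _ in values]
    return [(v - lo) / (hi - lo) for v in values]
```

Scoring then combines the scaled `web_success`, tweet, and profile columns per node, which is where the kurtosis-style weighting in the next sketch comes in.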
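SCORING.md quotes only the signature of `apply_kurtosis_custom`; its body lives in the repository. The sketch below is an assumed illustration of the described behavior (a smooth boost for nodes above the top-percentile cutoff, followed by normalization so weights sum to 1.0), not the actual function: the name `kurtosis_like_boost` and the sigmoid shape are this editor's invention, and only the parameters `top_percentile`, `steepness`, and `boost_factor` mirror the quoted signature.

```python
import numpy as np


def kurtosis_like_boost(
    scores: np.ndarray,
    top_percentile: float = 90,
    boost_factor: float = 0.2,
    steepness: float = 2.0,
) -> np.ndarray:
    """Smoothly boost scores above the percentile cutoff, then normalize."""
    if scores.size == 0:
        return scores
    cutoff = np.percentile(scores, top_percentile)
    spread = float(scores.std())
    if spread == 0.0:
        spread = 1.0  # constant input: boost is uniform, avoid division by zero
    # The sigmoid ramps from ~0 below the cutoff to ~1 above it, so
    # mid-range nodes are left largely untouched rather than punished.
    boost = 1.0 / (1.0 + np.exp(-steepness * (scores - cutoff) / spread))
    weighted = scores * (1.0 + boost_factor * boost)
    total = weighted.sum()
    # Normalize to sum to 1.0, matching how weights are set on-chain.
    return weighted / total if total > 0 else weighted
```

For example, `kurtosis_like_boost(np.array([0.1, 0.2, 0.9]))` amplifies the 0.9 node relative to the others before the result is converted into on-chain weights.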