diff --git a/Cargo.lock b/Cargo.lock index 610b091a..77e28511 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1928,6 +1928,7 @@ version = "0.0.0" dependencies = [ "clap", "clap-markdown", + "indoc", "rcgen", "rustls", "thiserror 2.0.17", @@ -1961,6 +1962,7 @@ dependencies = [ "hypha-messages", "hypha-network", "hypha-telemetry", + "indoc", "libp2p", "libp2p-stream", "miette", @@ -1984,6 +1986,7 @@ dependencies = [ "hypha-messages", "hypha-network", "hypha-telemetry", + "indoc", "libp2p", "miette", "serde", @@ -2055,6 +2058,7 @@ dependencies = [ "hypha-messages", "hypha-network", "hypha-telemetry", + "indoc", "itertools 0.14.0", "libp2p", "libp2p-stream", @@ -2115,6 +2119,7 @@ dependencies = [ "hypha-messages", "hypha-network", "hypha-telemetry", + "indoc", "libp2p", "libp2p-stream", "miette", @@ -2321,6 +2326,15 @@ dependencies = [ "web-time", ] +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + [[package]] name = "inlinable_string" version = "0.1.15" diff --git a/Cargo.toml b/Cargo.toml index 81fb2c46..f70efdd9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ ciborium = "0.2.2" clap = { version = "4.5.31", features = ["derive"] } documented = "0.9.2" figment = { version = "0.10", features = ["toml", "env"] } +indoc = "2" futures-util = "0.3" http-body-util = "0.1.3" hypha-config = { path = "crates/config" } diff --git a/crates/certutil/Cargo.toml b/crates/certutil/Cargo.toml index a7e9d34f..50af24ff 100644 --- a/crates/certutil/Cargo.toml +++ b/crates/certutil/Cargo.toml @@ -12,11 +12,13 @@ path = "src/main.rs" [dependencies] clap.workspace = true +indoc.workspace = true rcgen.workspace = true rustls.workspace = true thiserror.workspace = true time = "0.3.41" [build-dependencies] +indoc.workspace = true clap.workspace = true clap-markdown = "0.1.5" diff --git 
a/crates/certutil/src/cli.rs b/crates/certutil/src/cli.rs index bd2f2a18..4550e8c1 100644 --- a/crates/certutil/src/cli.rs +++ b/crates/certutil/src/cli.rs @@ -1,12 +1,30 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; +use indoc::indoc; #[derive(Parser)] -#[command(name = "hypha-certutil")] -#[command(about = "Certificate utility for Hypha network", long_about = None)] -#[command(version)] -#[command(after_help = "For more information and examples, see the module documentation")] +#[command( + name = "hypha-certutil", + version, + about = "Hypha Certificate Utility", + long_about = indoc!{" + Certificate generation and management tool for the Hypha network. + + Creates a three-tier certificate hierarchy: + + * Root CA - Central certificate authority (stored securely, used rarely) + * Organization CAs - Intermediate CAs for each tenant/organization + * Node Certificates - End-entity certificates for individual services + + Uses Ed25519 exclusively for compatibility with libp2p and strong security with + small key sizes. All private keys are stored in PKCS#8 format as required by + the Hypha network. + "}, + after_help = indoc!{" + Important! For production use, it is not recommended to use this tool as it is not designed for high security and scalability. Instead, consider using a dedicated PKI management tool or a third-party service provider." + } +)] pub struct Cli { #[command(subcommand)] pub command: Commands, @@ -14,87 +32,248 @@ pub struct Cli { #[derive(Subcommand)] pub enum Commands { - /// Generate a Root CA certificate - /// - /// This creates the root of your PKI hierarchy. In production, this would be - /// stored securely and used rarely. For development, you typically create one - /// root CA and reuse it across your test environment. 
- #[command(after_help = "Example: hypha-certutil root -n 'Test Root CA' -d certs/root")] + #[command( + about = "Generate a Root CA certificate (top of PKI hierarchy)", + long_about = indoc!{" + Generate a Root CA certificate (top of PKI hierarchy) + + Creates the root certificate authority that forms the trust anchor for your entire + PKI. This certificate signs Organization CAs, which in turn sign node certificates. + + + OUTPUT FILES: + * `-root-ca-cert.pem` - Root CA certificate (public, distribute to all nodes) + * `-root-ca-key.pem` - Root CA private key (KEEP SECURE, never share) + + The certificate uses Ed25519 algorithm and includes basic constraints marking it + as a CA certificate with path length constraint. + "}, + after_help = indoc!{" + Example: + + ``` + hypha-certutil root -o 'ACME Corporation' --country US -d /secure/root-ca + ``` + "} + )] Root { - /// Organization name - #[arg(short = 'o', long)] + /// Organization name (appears in certificate subject) + /// + /// This name identifies the entity that operates the PKI. Choose a name that + /// clearly identifies your organization for certificate validation. + /// + /// Examples: "ACME Corporation", "Hypha Dev", "Example University" + #[arg(short = 'o', long, verbatim_doc_comment)] organization: String, - /// Country name (2-letter code) - #[arg(long, default_value = "US")] + /// Country code (ISO 3166-1 alpha-2, 2 letters) + /// + /// Two-letter country code for the certificate subject. Common values: + /// * US - United States + /// * GB - United Kingdom + /// * DE - Germany + /// * CN - China + /// + /// While optional, including country helps with certificate identification. + #[arg(long, default_value = "US", verbatim_doc_comment)] country: String, - /// Common name for the Root CA (defaults to " CA") - #[arg(short = 'n', long)] + /// Common name for the Root CA (defaults to " Root CA") + /// + /// The CN field in the certificate subject. 
If not specified, automatically + /// generated as " Root CA". + /// + /// Best practice: Include "Root CA" in the name for easy identification. + /// + /// Examples: "ACME Production Root CA", "Hypha Test Root CA 2025" + #[arg(short = 'n', long, verbatim_doc_comment)] name: Option, - /// Directory to save the certificate and key files - #[arg(short, long, default_value = ".")] + /// Output directory for certificate and private key files + /// + /// Directory where the Root CA certificate and private key will be saved. + /// The directory will be created if it doesn't exist. + /// + /// SECURITY: Use a secure location with restricted access: + /// * Encrypted filesystem + /// * Access controls (chmod 600 recommended) + #[arg(short, long, default_value = ".", verbatim_doc_comment)] dir: PathBuf, }, - /// Generate an Intermediate Organization CA certificate signed by Root CA - /// - /// Organization CAs represent tenants in the Hypha network. Each tenant gets - /// their own CA certificate that can issue certificates for their nodes. - /// This provides cryptographic isolation between tenants. + #[command( - after_help = "Example: hypha-certutil org --root-cert root-ca-cert.pem --root-key root-ca-key.pem -o acme-corp" + about = "Generate an Organization/Tenant CA (intermediate CA)", + long_about = indoc!{" + Generate an Organization/Tenant CA (intermediate CA) + + Creates an intermediate CA certificate signed by the Root CA. Organization CAs + represent tenants or organizational units in the Hypha network. Each tenant gets + their own CA that can issue node certificates, providing cryptographic isolation. + + OUTPUT FILES: + * `-ca-cert.pem` - Organization CA certificate (public) + * `-ca-key.pem` - Organization CA private key (KEEP SECURE) + * `-ca-trust.pem` - Trust chain (Org CA + Root CA, for node configs) + + The trust chain file bundles the Organization CA and Root CA certificates, + making it easy to configure nodes with the complete trust chain. 
+ "}, + after_help = indoc! {" + Example: Create Organization CA for tenant 'acme-corp' + + ``` + hypha-certutil org --root-cert certs/root/root-ca-cert.pem + --root-key certs/root/root-ca-key.pem + -o acme-corp -d certs/tenants/acme + ``` + "} )] Org { - /// Root CA certificate file path - #[arg(long)] + /// Path to the Root CA certificate file + /// + /// The Root CA certificate that will sign this Organization CA. Must be + /// a valid PEM-encoded X.509 certificate. + /// + /// Typically a `*-cert.pem` generated via 'hypha-certutil root' + #[arg(long, verbatim_doc_comment)] root_cert: PathBuf, - /// Root CA private key file path - #[arg(long)] + /// Path to the Root CA private key file + /// + /// The Root CA's private key used to sign this Organization CA certificate. + /// Must be in PKCS#8 PEM format. + /// + /// SECURITY: This operation requires access to the Root CA private key. + /// + /// Typically a `*-key.pem` generated via 'hypha-certutil root' + #[arg(long, verbatim_doc_comment)] root_key: PathBuf, - /// Organization/tenant name (e.g., acme-corp) - #[arg(short = 'o', long)] + /// Organization/tenant name (e.g., acme-corp, globex) + /// + /// Identifies the tenant or organizational unit. This name appears in the + /// certificate subject and is used in output filenames. + /// + /// Choose a name that: + /// * Clearly identifies the tenant + /// * Is filesystem-safe (no special characters) + /// * Is consistent with your naming conventions + /// + /// Examples: "acme-corp", "engineering-dept", "tenant-001" + #[arg(short = 'o', long, verbatim_doc_comment)] organization: String, /// Common name for the Organization CA (defaults to " CA") - #[arg(short = 'n', long)] + /// + /// The CN field in the certificate subject. If not specified, automatically + /// generated as " CA". 
+ /// + /// Examples: "ACME Corp Intermediate CA", "Engineering Department CA" + #[arg(short = 'n', long, verbatim_doc_comment)] name: Option, - /// Directory to save the certificate and key files - #[arg(short, long, default_value = ".")] + /// Output directory for certificate and key files + /// + /// Directory where the Organization CA certificate, private key, and trust + /// chain will be saved. Created if it doesn't exist. + #[arg(short, long, default_value = ".", verbatim_doc_comment)] dir: PathBuf, }, - /// Generate a certificate signed by a CA (intermediate or root) - /// - /// Node certificates are used by individual services and nodes in the network. - /// They should typically be signed by an Organization CA, not the root CA directly. - /// The certificate will include a trust file (bundle) for easy deployment. + #[command( - after_help = "Example: hypha-certutil node --ca-cert acme-ca-cert.pem --ca-key acme-ca-key.pem -n node1.acme.local -s node1.acme.local,*.acme.local" + about = "Generate a node certificate (end-entity certificate)", + long_about = indoc!{" + Generate a node certificate (end-entity certificate) + + Creates a certificate for an individual node, service, or component in the Hypha + network. Node certificates are signed by an Organization CA. + + These certificates are used by: + + * Gateways - Network entry points + * Schedulers - Job orchestrators + * Workers - Task executors + * Data nodes - Dataset providers + + OUTPUT FILES: + * `-cert.pem` - Node certificate (public) + * `-key.pem` - Node private key (KEEP SECURE, chmod 600) + * `-trust.pem` - Complete trust chain (Org CA + Root CA) + + The trust chain enables the node to validate peer certificates by including + the full CA hierarchy. 
+ "}, + after_help = indoc!{" + Example: simple scheduler certificate + + ``` + hypha-certutil node + -n scheduler + --ca-cert acme-ca-cert.pem + --ca-key acme-ca-key.pem + -d certs/nodes/scheduler-01 + ``` + "} )] Node { - /// CA certificate file path + /// Path to the Organization CA certificate file + /// + /// The CA certificate that will sign this node certificate. Typically an + /// Organization CA, but could be the Root CA for testing. + /// + /// Typically: `-ca-cert.pem` from 'hypha-certutil org' #[arg(long)] ca_cert: PathBuf, - /// CA private key file path - #[arg(long)] + /// Path to the Organization CA private key file + /// + /// The CA's private key used to sign this node certificate. Must be in + /// PKCS#8 PEM format. + /// + /// SECURITY: Protect this key as it can issue certificates for the organization. + /// + /// Typically: `-ca-key.pem` from 'hypha-certutil org' + #[arg(long, verbatim_doc_comment)] ca_key: PathBuf, - /// Common name for the certificate (e.g., node1.acme-corp.hypha.network) - #[arg(short = 'n', long)] + /// Common name for the node certificate + /// + /// Unique identifier for this node. Best practices: + /// * Use FQDN format: .. + /// * Include node type: gateway-01, scheduler-prod, worker-gpu-03 + /// * Keep consistent naming convention across deployment + /// * Avoid special characters (use hyphens, not underscores) + /// + /// The CN is automatically added to SANs if not explicitly included. + #[arg(short = 'n', long, verbatim_doc_comment)] name: String, - /// Subject Alternative Names (SANs) - DNS names, IPs, etc. - /// Format: comma-separated list of DNS names and IP addresses - /// The common name will be automatically added if not present - #[arg(short, long, value_delimiter = ',', default_value = "0.0.0.0")] + /// Subject Alternative Names (SANs) - DNS names and IP addresses + /// + /// Comma-separated list of DNS names and/or IP addresses that this certificate + /// will be valid for. 
Critical for TLS validation and peer connectivity. + /// + /// The common name (-n) is automatically included if not in this list. + /// + /// NOTE: 0.0.0.0 is included by default for local testing. Override with + /// specific addresses for production. + #[arg( + short, + long, + value_delimiter = ',', + default_value = "0.0.0.0", + verbatim_doc_comment + )] san: Vec, - /// Directory to save the certificate and key files - #[arg(short, long, default_value = ".")] + /// Output directory for certificate and key files + /// + /// Directory where the node certificate, private key, and trust chain will + /// be saved. Created if it doesn't exist. + /// + /// SECURITY: Set restrictive permissions on this directory (chmod 700) + /// and especially on `*-key.pem` files (chmod 600). + #[arg(short, long, default_value = ".", verbatim_doc_comment)] dir: PathBuf, }, } diff --git a/crates/data/Cargo.toml b/crates/data/Cargo.toml index 0ab1646a..b5162d6e 100644 --- a/crates/data/Cargo.toml +++ b/crates/data/Cargo.toml @@ -15,6 +15,7 @@ hypha-config.workspace = true hypha-messages.workspace = true hypha-network.workspace = true hypha-telemetry.workspace = true +indoc.workspace = true libp2p.workspace = true libp2p-stream.workspace = true miette.workspace = true @@ -25,6 +26,7 @@ tracing.workspace = true tracing-subscriber.workspace = true [build-dependencies] +indoc.workspace = true clap.workspace = true serde.workspace = true clap-markdown = "0.1.5" diff --git a/crates/data/src/cli.rs b/crates/data/src/cli.rs index c3a2987e..fedee5bb 100644 --- a/crates/data/src/cli.rs +++ b/crates/data/src/cli.rs @@ -1,15 +1,21 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; +use indoc::indoc; use serde::Serialize; #[derive(Debug, Parser, Serialize)] #[command( name = "hypha-data", version, - about = "Hypha Data Node", - long_about = "Runs a Hypha Data Node which provides data.", - after_help = "For more information, see the project documentation." 
+ about = "Hypha Data - Dataset Provider", + long_about = indoc!{" + The Hypha Data serves datasets to workers via the Hypha network. + + Data peers connect to gateways, announce their datasets via the DHT, and respond + to data fetch requests from workers during training. Each dataset is divided into + slices (files) that can be distributed across multiple workers for parallel processing. + "} )] pub struct Cli { #[command(subcommand)] @@ -18,50 +24,123 @@ pub struct Cli { #[derive(Debug, Subcommand, Serialize)] pub enum Commands { + #[command( + about = "Generate a default configuration file", + long_about = indoc!{" + Generate a default configuration file + + Creates a TOML configuration file with sensible defaults for dataset serving. + The generated config includes certificate paths, network addresses, gateway + connections, and dataset path configuration. + + IMPORTANT: If the output file exists, it will be overwritten without warning. + "} + )] Init { /// Path where the configuration file will be written - #[clap(short, long, default_value = "config.toml")] + #[clap(short, long, default_value = "config.toml", verbatim_doc_comment)] output: PathBuf, }, - /// Probe a target multiaddr for readiness and exit 0 if healthy. + + #[command( + about = "Check if a remote peer is healthy and reachable", + long_about = indoc!{" + Check if a remote peer is healthy and reachable + + Connects to the specified multiaddr, sends a health check request, and exits + with code 0 if the peer responds as healthy, or non-zero otherwise. + + Useful for: + * Verifying gateway connectivity before announcing datasets + * Container health checks (Docker, Kubernetes) + * Monitoring data node availability + * Deployment verification and readiness checks + + NOTE: It's not possible to self-probe using the same certificate used to run the data node. + "} + )] #[serde(untagged)] Probe { - /// Path to the configuration file. 
- #[clap(short, long("config"), default_value = "config.toml")] + /// Path to the configuration file + /// + /// Used to load TLS certificates for secure connection to the target peer. + #[arg( + short, + long("config"), + default_value = "config.toml", + verbatim_doc_comment + )] config_file: PathBuf, - /// Path to the certificate pem. - #[clap(long("cert"))] + /// Path to the certificate PEM file (overrides config) + /// + /// Must be a valid X.509 certificate in PEM format. If not provided, uses + /// cert_pem from the configuration file. + #[arg(long("cert"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] cert_pem: Option, - /// Path to the private key pem. - #[clap(long("key"))] + /// Path to the private key PEM file (overrides config) + /// + /// Must correspond to the certificate. If not provided, uses key_pem from + /// the configuration file. + /// + /// SECURITY: Ensure this file has restricted permissions (e.g., chmod 600). + #[arg(long("key"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] key_pem: Option, - /// Path to the trust pem (bundle). - #[clap(long("trust"))] + /// Path to the trust chain PEM file (overrides config) + /// + /// CA bundle containing certificates trusted by this node. If not provided, + /// uses trust_pem from the configuration file. + #[arg(long("trust"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] trust_pem: Option, - /// Path to the certificate revocation list pem. - #[clap(long("crls"))] + /// Path to the certificate revocation list PEM (overrides config) + /// + /// Optional CRL for rejecting compromised certificates. If not provided, + /// uses crls_pem from the configuration file if present. 
+ #[arg(long("crls"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] crls_pem: Option, - /// Timeout in milliseconds - #[clap(long, default_value_t = 2000)] + /// Maximum time to wait for health response (milliseconds) + /// + /// If the peer doesn't respond within this duration, the probe fails. + /// Increase for high-latency networks or overloaded peers. + #[arg(long, default_value_t = 2000, verbatim_doc_comment)] timeout: u64, - /// Target multiaddr to probe (e.g., /ip4/127.0.0.1/tcp/8080) - #[clap(index = 1)] + /// Target peer multiaddr to probe + /// + /// Examples: + /// /ip4/192.168.1.100/tcp/8080/ + /// /dns4/data.example.com/tcp/443/p2p/12D3KooW... + #[arg(index = 1, verbatim_doc_comment)] address: String, }, + + #[command( + about = "Start the data node and begin serving datasets", + long_about = indoc!{" + Start the data node and begin serving datasets + + Loads configuration, connects to gateways, and enters the main serving loop. + The process runs until interrupted (SIGINT/SIGTERM), then performs graceful + shutdown to ensure data transfers are properly terminated. + "} + )] #[serde(untagged)] Run { - /// Path to the configuration file. - #[clap(short, long("config"), default_value = "config.toml")] + /// Path to the configuration file + #[arg( + short, + long("config"), + default_value = "config.toml", + verbatim_doc_comment + )] #[serde(skip)] config_file: PathBuf, }, diff --git a/crates/data/src/config.rs b/crates/data/src/config.rs index 81421707..abd1d30c 100644 --- a/crates/data/src/config.rs +++ b/crates/data/src/config.rs @@ -12,44 +12,172 @@ use libp2p::Multiaddr; use serde::{Deserialize, Serialize}; #[derive(Deserialize, Serialize, Documented, DocumentedFieldsOpt)] -/// Configure network settings, security certificates, and runtime parameters. +/// Data node configuration for dataset serving and distribution. +/// +/// The data node provides training datasets to workers via the P2P network. 
It must have +/// fast storage access and sufficient network bandwidth for concurrent data transfers. pub struct Config { - /// Path to the certificate pem. + /// Path to the TLS certificate PEM file. + /// + /// Must be a valid X.509 certificate in PEM format that establishes this data node's + /// identity in the P2P network. The certificate must match the private key and be + /// trusted by all peers. + /// + /// SECURITY: Use certificates from a recognized CA or internal PKI for production deployments. cert_pem: PathBuf, - /// Path to the private key pem. + + /// Path to the private key PEM file. + /// + /// Must correspond to cert_pem. This is the data node's cryptographic identity. + /// + /// SECURITY: + /// * Restrict file permissions (chmod 600 recommended) + /// * Never commit to version control + /// * Store securely using secrets management systems in production + /// * Keep backups in secure, encrypted storage key_pem: PathBuf, - /// Path to the trust pem (bundle). + + /// Path to the trust chain PEM file (CA bundle). + /// + /// Contains root and intermediate certificates trusted by this data node. Peers presenting + /// certificates signed by these CAs will be accepted for network connections. trust_pem: PathBuf, - /// Path to the certificate revocation list pem. + + /// Path to certificate revocation list PEM (optional). + /// + /// Specifies certificates that should no longer be trusted, even if they're in the trust + /// chain. Used for compromised certificates or decommissioned peers. + /// + /// SECURITY: Keep this updated with your certificate authority's latest CRL to maintain + /// network security. Automate CRL updates in production environments. crls_pem: Option, - /// Addresses of the gateways. + + /// Gateway addresses to connect to (required for network entry). + /// + /// Specifies one or more gateways for network bootstrapping and relay functionality. 
+ /// + /// Multiple gateways provide redundancy; the data node attempts to connect to all + /// and succeeds if any are reachable. + /// + /// Examples: + /// * "/ip4/203.0.113.10/tcp/8080/" + /// * "/dns4/gateway.hypha.example/tcp/443/" gateway_addresses: Vec, - /// Addresses to listen on. + + /// Network addresses to listen on for incoming connections. + /// + /// Supports TCP and QUIC protocols. Use port 0 to let the OS assign available ports. + /// + /// Examples: + /// * "/ip4/0.0.0.0/tcp/0" - TCP on all interfaces, OS-assigned port + /// * "/ip4/0.0.0.0/udp/0/quic-v1" - QUIC on all interfaces, OS-assigned port listen_addresses: Vec, - /// Path or file providing a dataset. + + /// Path to the dataset directory. + /// + /// Directory containing dataset files (slices) to serve to workers. Each file in the + /// directory is treated as an independent dataset slice that can be fetched by workers. + /// + /// REQUIREMENTS: + /// * Must be a valid directory path + /// * Must contain at least one data file + /// * Directory name becomes the dataset identifier in the DHT + /// * Files should be consistently formatted for worker consumption + /// + /// DIRECTORY STRUCTURE: + /// + /// ```ignore + /// dataset-name/ + /// slice-0000.bin + /// slice-0001.bin + /// slice-0002.bin + /// ... + /// ``` + /// + /// The directory is scanned at startup and dataset metadata is announced via DHT. dataset_path: PathBuf, - /// CIDR address filters applied before adding Identify-reported listen addresses to Kademlia. - /// Use standard CIDR notation (e.g., "10.0.0.0/8", "fc00::/7"). + + /// CIDR address filters for DHT routing table management. + /// + /// Peer addresses matching these CIDR ranges are excluded from the Kademlia DHT before + /// being added. This prevents routing to non-routable or private addresses. + /// + /// Defaults to reserved/private ranges (loopback, RFC1918, etc.). 
+ /// + /// Add additional ranges to filter internal addresses specific to your network topology. + /// + /// NOTE: This only affects DHT address filtering, not direct peer connections. #[serde(default = "reserved_cidrs")] exclude_cidr: Vec, + + /// OpenTelemetry Protocol (OTLP) endpoint for exporting telemetry data. + /// + /// Sends metrics, traces, and logs to an OpenTelemetry collector or compatible backend + /// (e.g., Jaeger, Prometheus, Grafana Cloud, ...). + /// + /// If unset, telemetry export is disabled (local logging only). #[serde(alias = "exporter_otlp_endpoint")] - /// OTLP Exporter endpoint for telemetry data. If unset, telemetry is disabled. telemetry_endpoint: Option, + + /// Resource attributes included in all telemetry data. + /// + /// Key-value pairs that identify this data node instance in your observability platform. + /// Useful for filtering and grouping metrics across multiple data nodes. + /// + /// Example Attributes: + /// * service.name: "hypha-data" + /// * service.version: "0.1.0" + /// * deployment.environment: "production" + /// * host.name: "data-01" + /// * dataset.name: "imagenet-train" + /// * storage.type: "ssd" + /// + /// These attributes appear in all exported metrics, traces, and logs. #[serde(alias = "resource_attributes")] - /// Attributes to be included in telemetry. telemetry_attributes: Option, + + /// HTTP/gRPC headers for OTLP endpoint authentication. + /// + /// Used to authenticate with your telemetry backend. Common use cases: + /// * API keys: {"Authorization": "Bearer YOUR_API_KEY"} + /// * Custom headers: {"X-API-Key": "secret"} + /// + /// SECURITY: Protect these credentials. Use environment variables or secrets management + /// instead of hardcoding in config files. Never commit credentials to version control. #[serde(alias = "exporter_otlp_headers")] - /// Headers for OTLP telemetry endpoint request used for authentication. 
telemetry_headers: Option, + + /// Protocol for OTLP telemetry endpoint communication. + /// + /// Choose based on your collector's supported protocols. #[serde(alias = "exporter_otlp_protocol")] - /// Protocol for OTLP telemetry endpoint request. telemetry_protocol: Option, + + /// Trace sampling strategy to control volume and costs. + /// + /// Options: + /// * "always_on" - Sample every trace (high volume, expensive) + /// * "always_off" - Disable tracing (metrics and logs only) + /// * "traceidratio" - Sample traces by probability (cost-effective) + /// * "parentbased_traceidratio" - Honor parent trace decisions with fallback ratio + /// + /// RECOMMENDATION: Use "traceidratio" with sample_ratio for production to balance + /// observability with costs. Start with 0.1 (10%) and adjust based on data volume. #[serde(alias = "traces_sampler")] - /// Traces sampler: one of "always_on", "always_off", "traceidratio", or "parentbased_traceidratio". telemetry_sampler: Option, + + /// Sampling probability for ratio-based trace samplers. + /// + /// Valid range: 0.0 to 1.0 + /// * 1.0 = 100% sampling (sample every trace) + /// * 0.1 = 10% sampling (sample 1 in 10 traces) + /// * 0.01 = 1% sampling (sample 1 in 100 traces) + /// + /// Only applies to "traceidratio" and "parentbased_traceidratio" samplers. + /// + /// NOTE: Lower ratios reduce telemetry costs while maintaining statistical + /// significance. For high-throughput data nodes, 0.01-0.1 is probably sufficient. #[serde(alias = "traces_sampler_arg")] - /// For `traceidratio` and `parentbased_traceidratio` samplers: Sampling probability in [0..1], - /// e.g. "0.25". Default is 1.0. 
telemetry_sample_ratio: Option, } diff --git a/crates/gateway/Cargo.toml b/crates/gateway/Cargo.toml index d4459c7a..1e035b5a 100644 --- a/crates/gateway/Cargo.toml +++ b/crates/gateway/Cargo.toml @@ -15,6 +15,7 @@ hypha-config.workspace = true hypha-network.workspace = true hypha-messages.workspace = true hypha-telemetry.workspace = true +indoc.workspace = true libp2p.workspace = true miette.workspace = true serde.workspace = true @@ -25,6 +26,7 @@ tracing-subscriber.workspace = true [build-dependencies] clap.workspace = true hypha-network.workspace = true +indoc.workspace = true libp2p.workspace = true serde.workspace = true clap-markdown = "0.1.5" diff --git a/crates/gateway/src/bin/hypha-gateway.rs b/crates/gateway/src/bin/hypha-gateway.rs index 36f3e311..bc9ec9ec 100644 --- a/crates/gateway/src/bin/hypha-gateway.rs +++ b/crates/gateway/src/bin/hypha-gateway.rs @@ -173,7 +173,8 @@ async fn main() -> Result<()> { .with_provider(Toml::file(config_file)) .with_provider(Env::prefixed("HYPHA_")) .with_provider(Serialized::defaults(&args)) - .build()?; + .build()? + .validate()?; let exclude_cidrs = config.exclude_cidr().clone(); let (network, driver) = Network::create( @@ -212,7 +213,8 @@ async fn main() -> Result<()> { .with_provider(Env::prefixed("HYPHA_")) .with_provider(Env::prefixed("OTEL_")) .with_provider(Serialized::defaults(args)) - .build()?; + .build()? 
+ .validate()?; return run(config).await; } diff --git a/crates/gateway/src/cli.rs b/crates/gateway/src/cli.rs index eb745bcd..ba0385d8 100644 --- a/crates/gateway/src/cli.rs +++ b/crates/gateway/src/cli.rs @@ -2,6 +2,7 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; use hypha_network::IpNet; +use indoc::indoc; use libp2p::Multiaddr; use serde::Serialize; @@ -9,9 +10,11 @@ use serde::Serialize; #[command( name = "hypha-gateway", version, - about = "Hypha Gateway Node", - long_about = "Runs the Hypha Gateway facilitating network connectivity between peers.", - after_help = "For more information, see the project documentation." + about = "Hypha Gateway - Network Entry Point and Relay", + long_about = indoc!{" + The Hypha Gateway is a stable, publicly accessible entry point for the Hypha P2P + network. It provides bootstrapping, relaying, and DHT participation for discovery. + "} )] pub struct Cli { #[command(subcommand)] @@ -20,86 +23,174 @@ pub struct Cli { #[derive(Debug, Subcommand, Serialize)] pub enum Commands { + #[command( + about = "Generate a default configuration file", + long_about = indoc!{" + Generate a default configuration file + + Creates a TOML configuration file with sensible defaults for running a gateway, + including certificate paths and network addresses. + + IMPORTANT: If the output file exists, it will be overwritten without warning. + "} + )] Init { /// Path where the configuration file will be written - #[clap(short, long, default_value = "config.toml")] + #[clap(short, long, default_value = "config.toml", verbatim_doc_comment)] output: PathBuf, }, - /// Probe a target multiaddr for readiness and exit 0 if healthy. + + #[command( + about = "Check if a remote peer is healthy and reachable", + long_about = indoc!{" + Check if a remote peer is healthy and reachable + + Connects to the specified multiaddr, sends a health check request, and exits + with code 0 if the peer is healthy, or non-zero otherwise. 
+ + Useful for: + * Container health checks (Docker, Kubernetes) + * Monitoring and alerting + * Deployment verification and readiness checks + + NOTE: It's not possible to self-probe using the same certificate used to run the gateway. + "} + )] #[serde(untagged)] Probe { - /// Path to the configuration file. - #[clap(short, long("config"), default_value = "config.toml")] + /// Path to the configuration file + /// + /// Used to load TLS certificates for secure connection to the target peer. + #[arg( + short, + long("config"), + default_value = "config.toml", + verbatim_doc_comment + )] config_file: PathBuf, - /// Path to the certificate pem. - #[clap(long("cert"))] + /// Path to the certificate PEM file (overrides config) + /// + /// Must be a valid X.509 certificate in PEM format. If not provided, uses + /// cert_pem from the configuration file. + #[arg(long("cert"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] cert_pem: Option, - /// Path to the private key pem. - #[clap(long("key"))] + /// Path to the private key PEM file (overrides config) + /// + /// Must correspond to the certificate. If not provided, uses key_pem from + /// the configuration file. + /// + /// SECURITY: Ensure this file has restricted permissions (e.g., chmod 600). + #[arg(long("key"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] key_pem: Option, - /// Path to the trust pem (bundle). - #[clap(long("trust"))] + /// Path to the trust chain PEM file (overrides config) + /// + /// CA bundle containing certificates trusted by this node. If not provided, + /// uses trust_pem from the configuration file. + #[arg(long("trust"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] trust_pem: Option, - /// Path to the certificate revocation list pem. - #[clap(long("crls"))] + /// Path to the certificate revocation list PEM (overrides config) + /// + /// Optional CRL for rejecting compromised certificates. 
If not provided, + /// uses crls_pem from the configuration file if present. + #[arg(long("crls"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] crls_pem: Option, - /// Timeout in milliseconds - #[clap(long, default_value_t = 2000)] + /// Maximum time to wait for health response (milliseconds) + /// + /// If the peer doesn't respond within this duration, the probe fails. + /// Increase for high-latency networks or overloaded peers. + #[arg(long, default_value_t = 2000, verbatim_doc_comment)] timeout: u64, - /// Target multiaddr to probe (e.g., /ip4/127.0.0.1/tcp/8080) - #[clap(index = 1)] + /// Target peer multiaddr to probe + /// + /// Examples: + /// /ip4/192.168.1.100/tcp/8080/ + /// /dns4/gateway.example.com/tcp/443/p2p/12D3KooW... + #[arg(index = 1, verbatim_doc_comment)] address: String, }, + + #[command( + about = "Start the gateway and begin serving the network", + long_about = indoc!{" + Start the gateway and begin serving the network + + Loads configuration, starts listeners, participates in the DHT and relay, + and runs until interrupted (SIGINT/SIGTERM) with a graceful shutdown. + "} + )] #[serde(untagged)] Run { - /// Path to the configuration file. - #[clap(short, long("config"), default_value = "config.toml")] + /// Path to the configuration file + #[arg( + short, + long("config"), + default_value = "config.toml", + verbatim_doc_comment + )] #[serde(skip)] config_file: PathBuf, - /// Path to the certificate pem. - #[clap(long("cert"))] + /// Path to the certificate PEM file (overrides config) + /// + /// Must be a valid X.509 certificate in PEM format. + #[arg(long("cert"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] cert_pem: Option, - /// Path to the private key pem. - #[clap(long("key"))] + /// Path to the private key PEM file (overrides config) + /// + /// Must correspond to the certificate. Security: restrict permissions (e.g., chmod 600). 
+ #[arg(long("key"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] key_pem: Option, - /// Path to the trust pem (bundle). - #[clap(long("trust"))] + /// Path to the trust chain PEM file (overrides config) + /// + /// CA bundle containing certificates trusted by this node. If not provided, + /// uses trust_pem from the configuration file. + #[arg(long("trust"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] trust_pem: Option, - /// Path to the certificate revocation list pem. - #[clap(long("crls"))] + /// Path to the certificate revocation list PEM (overrides config) + /// + /// Optional CRL for rejecting compromised certificates. If not provided, + /// uses crls_pem from the configuration file if present. + #[arg(long("crls"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] crls_pem: Option, - /// Addresses to listen on (can be specified multiple times). - #[clap(long("listen"))] + /// Addresses to listen on (repeatable, overrides config) + /// + /// Where this gateway accepts incoming connections. + /// Examples: /ip4/0.0.0.0/tcp/8080, /ip4/0.0.0.0/udp/8080/quic-v1 + #[arg(long("listen"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] listen_addresses: Option>, - /// External addresses to advertise (can be specified multiple times). - #[clap(long("external"))] + /// External addresses to advertise (repeatable, overrides config) + /// + /// Publicly reachable addresses peers should use to connect. + /// Examples: /ip4/203.0.113.10/tcp/8080, /dns4/gateway.example.com/tcp/8080 + #[arg(long("external"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] external_addresses: Option>, - /// CIDR exclusion (repeatable). Overrides config if provided. 
- /// Example: --exclude-cidr 10.0.0.0/8 --exclude-cidr fc00::/7 - #[clap(long("exclude-cidr"))] + /// CIDR ranges to exclude from DHT (repeatable, overrides config) + /// + /// Filters out peer addresses matching these ranges before adding to the DHT. + /// Examples: 10.0.0.0/8, fc00::/7 + #[arg(long("exclude-cidr"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] exclude_cidr: Option>, }, diff --git a/crates/gateway/src/config.rs b/crates/gateway/src/config.rs index 3cff50ae..3dd9662e 100644 --- a/crates/gateway/src/config.rs +++ b/crates/gateway/src/config.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use documented::{Documented, DocumentedFieldsOpt}; -use hypha_config::TLSConfig; +use hypha_config::{ConfigError, ConfigWithMetadata, TLSConfig, ValidatableConfig}; use hypha_network::{IpNet, reserved_cidrs}; use hypha_telemetry::{ attributes::Attributes, @@ -12,42 +12,159 @@ use libp2p::Multiaddr; use serde::{Deserialize, Serialize}; #[derive(Deserialize, Serialize, Documented, DocumentedFieldsOpt)] -/// Configure gateway network settings, security certificates, and runtime parameters. +/// Gateway configuration for network entry point and relay functionality. +/// +/// The gateway serves as a stable, publicly accessible entry point for the Hypha P2P network. +/// It must be deployed on a machine with a public IP address and stable connectivity. pub struct Config { - /// Path to the certificate pem. + /// Path to the TLS certificate PEM file. + /// + /// Must be a valid X.509 certificate in PEM format that establishes this gateway's identity + /// in the P2P network. The certificate must match the private key and be trusted by all peers. + /// + /// SECURITY: Use certificates from a recognized CA or internal PKI for production deployments. + /// For testing, self-signed certificates are acceptable but must be distributed to all peers. cert_pem: PathBuf, - /// Path to the private key pem. + + /// Path to the private key PEM file. 
+ /// + /// Must correspond to cert_pem. This is the gateway's cryptographic identity. + /// + /// SECURITY: + /// * Restrict file permissions (chmod 600 recommended) + /// * Never commit to version control + /// * Store securely using secrets management systems in production + /// * Keep backups in secure, encrypted storage key_pem: PathBuf, - /// Path to the trust pem (bundle). + + /// Path to the trust chain PEM file (CA bundle). + /// + /// Contains root and intermediate certificates trusted by this gateway. Peers presenting + /// certificates signed by these CAs will be accepted for network connections. + /// + /// For self-signed deployments, include all peer certificates. For production, use + /// certificates from your organization's PKI or a recognized CA. trust_pem: PathBuf, - /// Path to the certificate revocation list pem. + + /// Path to certificate revocation list PEM (optional). + /// + /// Specifies certificates that should no longer be trusted, even if they're in the trust + /// chain. Used for compromised certificates or decommissioned peers. + /// + /// SECURITY: Keep this updated with your certificate authority's latest CRL to maintain + /// network security. Automate CRL updates in production environments. crls_pem: Option, - /// Addresses to listen on. + + /// Network addresses to listen on for incoming connections. + /// + /// Supports TCP and QUIC protocols. Use 0.0.0.0 to bind to all interfaces, or specify + /// particular IPs to restrict to certain interfaces. + /// + /// Examples: + /// * "/ip4/0.0.0.0/tcp/8080" - TCP on all interfaces + /// * "/ip4/0.0.0.0/udp/8080/quic-v1" - QUIC on all interfaces + /// * "/ip6/::/tcp/8080" - TCP on all IPv6 interfaces listen_addresses: Vec, - /// External addresses to advertise. Only list addresses that are guaranteed to be reachable from the internet. + + /// External addresses to advertise for peer discovery. 
+ /// + /// IMPORTANT: Only list addresses that are guaranteed to be reachable from the public + /// internet. These addresses are shared with all peers for network connectivity. + /// + /// REQUIREMENTS: + /// * Must be publicly routable IP addresses or resolvable DNS names + /// * Must have proper port forwarding configured if behind NAT + /// * Should be stable and consistently available + /// * At least one external address required for network to function + /// + /// Examples: + /// * "/ip4/203.0.113.10/tcp/8080" - Public IPv4 address + /// * "/dns4/gateway.example.com/tcp/8080" - DNS name with public IP external_addresses: Vec, + + /// OpenTelemetry Protocol (OTLP) endpoint for exporting telemetry data. + /// + /// Sends metrics, traces, and logs to an OpenTelemetry collector or compatible backend + /// (e.g., Jaeger, Prometheus, Grafana Cloud, ...). + /// + /// If unset, telemetry export is disabled (local logging only). #[serde(alias = "exporter_otlp_endpoint")] - /// OTLP Exporter endpoint for telemetry data. If unset, telemetry is disabled. telemetry_endpoint: Option, + + /// Resource attributes included in all telemetry data. + /// + /// Key-value pairs that identify this gateway instance in your observability platform. + /// Useful for filtering and grouping metrics across multiple gateways. + /// + /// Example Attributes: + /// * service.name: "hypha-gateway" + /// * service.version: "0.1.0" + /// * deployment.environment: "production" + /// * host.name: "gateway-01" + /// * cloud.provider: "aws" + /// * cloud.region: "us-east-1" + /// + /// These attributes appear in all exported metrics, traces, and logs. #[serde(alias = "resource_attributes")] - /// Attributes to be included in telemetry. telemetry_attributes: Option, + + /// HTTP/gRPC headers for OTLP endpoint authentication. + /// + /// Used to authenticate with your telemetry backend. 
Common use cases: + /// * API keys: {"Authorization": "Bearer YOUR_API_KEY"} + /// * Custom headers: {"X-API-Key": "secret"} + /// + /// SECURITY: Protect these credentials. Use environment variables or secrets management + /// instead of hardcoding in config files. Never commit credentials to version control. #[serde(alias = "exporter_otlp_headers")] - /// Headers for OTLP telemetry endpoint request used for authentication. telemetry_headers: Option, + + /// Protocol for OTLP telemetry endpoint communication. + /// + /// Choose based on your collector's supported protocols. #[serde(alias = "exporter_otlp_protocol")] - /// Protocol for OTLP telemetry endpoint request. telemetry_protocol: Option, + + /// Trace sampling strategy to control volume and costs. + /// + /// Options: + /// * "always_on" - Sample every trace (high volume, expensive) + /// * "always_off" - Disable tracing (metrics and logs only) + /// * "traceidratio" - Sample traces by probability (cost-effective) + /// * "parentbased_traceidratio" - Honor parent trace decisions with fallback ratio + /// + /// RECOMMENDATION: Use "traceidratio" with sample_ratio for production to balance + /// observability with costs. Start with 0.1 (10%) and adjust based on traffic volume. #[serde(alias = "traces_sampler")] - /// Traces sampler: one of "always_on", "always_off", "traceidratio", or "parentbased_traceidratio". telemetry_sampler: Option, + + /// Sampling probability for ratio-based trace samplers. + /// + /// Valid range: 0.0 to 1.0 + /// * 1.0 = 100% sampling (sample every trace) + /// * 0.1 = 10% sampling (sample 1 in 10 traces) + /// * 0.01 = 1% sampling (sample 1 in 100 traces) + /// + /// Only applies to "traceidratio" and "parentbased_traceidratio" samplers. + /// + /// NOTE: Lower ratios reduce telemetry costs while maintaining statistical + /// significance. For high-traffic gateways, 0.01-0.1 is typically sufficient. 
#[serde(alias = "traces_sampler_arg")] - /// For `traceidratio` and `parentbased_traceidratio` samplers: Sampling probability in [0..1], - /// e.g. "0.25". Default is 1.0. telemetry_sample_ratio: Option, - /// CIDR address filters applied before adding Identify-reported listen addresses to Kademlia. + + /// CIDR address filters for DHT routing table management. + /// + /// Peer addresses matching these CIDR ranges are excluded from the Kademlia DHT before + /// being added. This prevents routing to non-routable or private addresses. /// - /// Use standard CIDR notation (e.g., "10.0.0.0/8", "fc00::/7"). Defaults to loopback addresses. + /// Defaults to reserved/private ranges: + /// * 127.0.0.0/8, ::1/128 (loopback) + /// * 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 (RFC1918 private) + /// * fc00::/7 (IPv6 unique local) + /// + /// Add additional ranges to filter internal addresses specific to your network topology. + /// + /// Note: This only affects DHT address filtering, not direct peer connections. #[serde(default = "reserved_cidrs")] exclude_cidr: Vec, } @@ -143,3 +260,22 @@ impl TLSConfig for Config { self.crls_pem.as_deref() } } + +impl ValidatableConfig for Config { + fn validate(cfg: &ConfigWithMetadata) -> std::result::Result<(), ConfigError> { + // NOTE: Gateways MUST have external addresses for the network to function properly. + // Without them, peers cannot discover or connect to the gateway. + if cfg.external_addresses().is_empty() { + let metadata = cfg.find_metadata("external_addresses"); + let message = "Gateway must have at least one external address configured. 
\ + External addresses are required for peers to discover and connect \ + to the gateway."; + + return Err(ConfigError::with_metadata(&metadata)(ConfigError::Invalid( + message.to_string(), + ))); + } + + Ok(()) + } +} diff --git a/crates/scheduler/Cargo.toml b/crates/scheduler/Cargo.toml index 76f5663b..ec8b6595 100644 --- a/crates/scheduler/Cargo.toml +++ b/crates/scheduler/Cargo.toml @@ -17,6 +17,7 @@ hypha-leases.workspace = true hypha-messages.workspace = true hypha-network.workspace = true hypha-telemetry.workspace = true +indoc.workspace = true itertools = "0.14.0" libp2p.workspace = true libp2p-stream.workspace = true @@ -34,6 +35,7 @@ tracing-subscriber.workspace = true uuid.workspace = true [build-dependencies] +indoc.workspace = true clap.workspace = true hypha-network.workspace = true libp2p.workspace = true diff --git a/crates/scheduler/src/cli.rs b/crates/scheduler/src/cli.rs index e3aaf9e3..dffb54ac 100644 --- a/crates/scheduler/src/cli.rs +++ b/crates/scheduler/src/cli.rs @@ -2,6 +2,7 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; use hypha_network::IpNet; +use indoc::indoc; use libp2p::Multiaddr; use serde::Serialize; @@ -9,9 +10,11 @@ use serde::Serialize; #[command( name = "hypha-scheduler", version, - about = "Hypha Scheduler", - long_about = "Runs the Hypha Scheduler coordinating workers.", - after_help = "For more information, see the project documentation." + about = "Hypha Scheduler - ML Job Orchestration", + long_about = indoc!{" + The Hypha Scheduler discovers workers via the Hypha network and orchestrates + distributed ML training jobs. 
+ "} )] pub struct Cli { #[command(subcommand)] @@ -20,76 +23,169 @@ pub struct Cli { #[derive(Debug, Subcommand, Serialize)] pub enum Commands { + #[command( + about = "Generate a default configuration file", + long_about = indoc!{" + Generate a default configuration file + + Creates a TOML configuration file with sensible defaults for job orchestration, + including certificate paths, network addresses, gateway connections, and job settings. + + IMPORTANT: If the output file exists, it will be overwritten without warning. + "} + )] Init { /// Path where the configuration file will be written - #[clap(short, long, default_value = "config.toml")] + #[clap(short, long, default_value = "config.toml", verbatim_doc_comment)] output: PathBuf, }, - /// Probe a target multiaddr for readiness and exit 0 if healthy. + + #[command( + about = "Check if a remote peer is healthy and reachable", + long_about = indoc!{" + Check if a remote peer is healthy and reachable + + Connects to the specified multiaddr, sends a health check request, and exits + with code 0 if the peer is healthy, or non-zero otherwise. + + Useful for: + * Verifying gateway connectivity before starting jobs + * Container health checks (Docker, Kubernetes) + * Monitoring scheduler availability + * Deployment verification and readiness checks + + NOTE: It's not possible to self-probe using the same certificate used to run the scheduler. + "} + )] #[serde(untagged)] Probe { - /// Path to the configuration file. - #[clap(short, long("config"), default_value = "config.toml")] + /// Path to the configuration file + /// + /// Used to load TLS certificates for secure connection to the target peer. + #[arg( + short, + long("config"), + default_value = "config.toml", + verbatim_doc_comment + )] config_file: PathBuf, - /// Path to the certificate pem. - #[clap(long("cert"))] + /// Path to the certificate PEM file (overrides config) + /// + /// Must be a valid X.509 certificate in PEM format. 
If not provided, uses + /// cert_pem from the configuration file. + #[arg(long("cert"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] cert_pem: Option, - /// Path to the private key pem. - #[clap(long("key"))] + /// Path to the private key PEM file (overrides config) + /// + /// Must correspond to the certificate. If not provided, uses key_pem from + /// the configuration file. + /// + /// SECURITY: Ensure this file has restricted permissions (e.g., chmod 600). + #[arg(long("key"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] key_pem: Option, - /// Path to the trust pem (bundle). - #[clap(long("trust"))] + /// Path to the trust chain PEM file (overrides config) + /// + /// CA bundle containing certificates trusted by this node. If not provided, + /// uses trust_pem from the configuration file. + #[arg(long("trust"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] trust_pem: Option, - /// Path to the certificate revocation list pem. - #[clap(long("crls"))] + /// Path to the certificate revocation list PEM (overrides config) + /// + /// Optional CRL for rejecting compromised certificates. If not provided, + /// uses crls_pem from the configuration file if present. + #[arg(long("crls"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] crls_pem: Option, - /// Timeout in milliseconds - #[clap(long, default_value_t = 2000)] + /// Maximum time to wait for health response (milliseconds) + /// + /// If the peer doesn't respond within this duration, the probe fails. + /// Increase for high-latency networks or overloaded peers. + #[arg(long, default_value_t = 2000, verbatim_doc_comment)] timeout: u64, - /// Target multiaddr to probe (e.g., /ip4/127.0.0.1/tcp/8080) - #[clap(index = 1)] + /// Target peer multiaddr to probe + /// + /// Examples: + /// /ip4/192.168.1.100/tcp/8080/ + /// /dns4/scheduler.example.com/tcp/443/p2p/12D3KooW... 
+ #[arg(index = 1, verbatim_doc_comment)] address: String, }, + + #[command( + about = "Start the scheduler and begin job orchestration", + long_about = indoc!{" + Start the scheduler and begin job orchestration + + Loads configuration, connects to gateways, discovers workers, and orchestrates + training jobs. Runs until interrupted (SIGINT/SIGTERM) with a graceful shutdown. + "} + )] #[serde(untagged)] Run { - /// Path to the configuration file. - #[clap(short, long("config"), default_value = "config.toml")] + /// Path to the configuration file + #[arg( + short, + long("config"), + default_value = "config.toml", + verbatim_doc_comment + )] #[serde(skip)] config_file: PathBuf, - /// Addresses of the gateways (can be specified multiple times). - #[clap(long("gateway"))] + /// Gateway addresses to connect to (repeatable, overrides config) + /// + /// Gateways provide network bootstrapping, DHT access, and optional relay. + /// + /// Examples: + /// --gateway /ip4/203.0.113.10/tcp/8080/ + /// --gateway /dns4/gateway.hypha.example/tcp/443/ + #[arg(long("gateway"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] gateway_addresses: Option>, - /// Addresses to listen on (can be specified multiple times). - #[clap(long("listen"))] + /// Addresses to listen on (repeatable, overrides config) + /// + /// Where the scheduler accepts incoming connections. + /// + /// Examples: + /// --listen /ip4/0.0.0.0/tcp/9090 + /// --listen /ip4/0.0.0.0/udp/9090/quic-v1 + #[arg(long("listen"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] listen_addresses: Option>, - /// External addresses to advertise (can be specified multiple times). - #[clap(long("external"))] + /// External addresses to advertise (repeatable, overrides config) + /// + /// Publicly reachable addresses peers should use to connect. 
+ /// + /// Examples: + /// --external /ip4/203.0.113.20/tcp/9090 + /// --external /dns4/scheduler.example.com/tcp/9090 + #[arg(long("external"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] external_addresses: Option>, - /// Enable listening via relay P2pCircuit (via gateway). Defaults to true. - #[clap(long("relay-circuit"))] + /// Enable relay circuit listening via gateway (overrides config) + /// + /// true = use relay (default), false = direct connections only. + #[arg(long("relay-circuit"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] relay_circuit: Option, - /// CIDR exclusion (repeatable). Overrides config if provided. - /// Example: --exclude-cidr 10.0.0.0/8 --exclude-cidr fc00::/7 - #[clap(long("exclude-cidr"))] + /// CIDR ranges to exclude from DHT (repeatable, overrides config) + /// + /// Filters out peer addresses matching these ranges before adding to the DHT. + /// Examples: 10.0.0.0/8, fc00::/7 + #[arg(long("exclude-cidr"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] exclude_cidr: Option>, }, diff --git a/crates/scheduler/src/config.rs b/crates/scheduler/src/config.rs index 0c017675..b0b0e7df 100644 --- a/crates/scheduler/src/config.rs +++ b/crates/scheduler/src/config.rs @@ -14,51 +14,181 @@ use serde::{Deserialize, Serialize}; use crate::scheduler_config::SchedulerConfig; #[derive(Deserialize, Serialize, Documented, DocumentedFieldsOpt)] -/// Configure scheduler network settings, security certificates, and runtime parameters. +/// Scheduler configuration for ML job orchestration and coordination. +/// +/// The scheduler discovers worker nodes and orchestrates distributed ML training jobs. +/// It should be deployed with sufficient resources for coordination overhead. pub struct Config { - /// Path to the certificate pem. + /// Path to the TLS certificate PEM file. 
+ /// + /// Must be a valid X.509 certificate in PEM format that establishes this scheduler's + /// identity in the P2P network. The certificate must match the private key and be + /// trusted by all peers. + /// + /// SECURITY: Use certificates from a recognized CA or internal PKI for production deployments. cert_pem: PathBuf, - /// Path to the private key pem. + + /// Path to the private key PEM file. + /// + /// Must correspond to cert_pem. This is the scheduler's cryptographic identity. + /// + /// SECURITY: + /// * Restrict file permissions (chmod 600 recommended) + /// * Never commit to version control + /// * Store securely using secrets management systems in production + /// * Keep backups in secure, encrypted storage key_pem: PathBuf, - /// Path to the trust pem (bundle). + + /// Path to the trust chain PEM file (CA bundle). + /// + /// Contains root and intermediate certificates trusted by this scheduler. Peers presenting + /// certificates signed by these CAs will be accepted for network connections. trust_pem: PathBuf, - /// Path to the certificate revocation list pem. + + /// Path to certificate revocation list PEM (optional). + /// + /// Specifies certificates that should no longer be trusted, even if they're in the trust + /// chain. Used for compromised certificates or decommissioned peers. + /// + /// SECURITY: Keep this updated with your certificate authority's latest CRL to maintain + /// network security. Automate CRL updates in production environments. crls_pem: Option, - /// Addresses of the gateways. + + /// Gateway addresses to connect to (required for network entry). + /// + /// Specifies one or more gateways for network bootstrapping and relay functionality. + /// + /// Multiple gateways provide redundancy; the scheduler attempts to connect to all + /// and succeeds if any are reachable. 
+ /// + /// Examples: + /// * "/ip4/203.0.113.10/tcp/8080/" + /// * "/dns4/gateway.hypha.example/tcp/443/" gateway_addresses: Vec, - /// Addresses to listen on. + + /// Network addresses to listen on for incoming connections. + /// + /// Supports TCP and QUIC protocols. Use port 0 to let the OS assign available ports. + /// + /// Examples: + /// * "/ip4/0.0.0.0/tcp/0" - TCP on all interfaces, OS-assigned port + /// * "/ip4/0.0.0.0/udp/0/quic-v1" - QUIC on all interfaces, OS-assigned port listen_addresses: Vec, - /// External addresses to advertise. Only list addresses that are guaranteed to be reachable from the internet. + + /// External addresses to advertise for peer discovery (optional). + /// + /// Only advertise addresses that workers can reliably reach. Most schedulers rely on + /// relay circuits and don't need external addresses. + /// + /// Examples: + /// * "/ip4/203.0.113.20/tcp/9090" + /// * "/dns4/scheduler.example.com/tcp/9090" external_addresses: Vec, - /// CIDR address filters applied before adding Identify-reported listen addresses to Kademlia. - /// Use standard CIDR notation (e.g., "10.0.0.0/8", "fc00::/7"). + + /// CIDR address filters for DHT routing table management. + /// + /// Peer addresses matching these CIDR ranges are excluded from the Kademlia DHT before + /// being added. This prevents routing to non-routable or private addresses. + /// + /// Defaults to reserved/private ranges (loopback, RFC1918, etc.). + /// + /// Add additional ranges to filter internal addresses specific to your network topology. + /// + /// NOTE: This only affects DHT address filtering, not direct peer connections. #[serde(default = "reserved_cidrs")] exclude_cidr: Vec, - /// Enable listening via relay P2pCircuit through the gateway. - /// Default is true to ensure inbound connectivity via relays. + + /// Enable listening via relay circuit through the gateway. 
+ /// + /// When enabled (default), the scheduler establishes a listening address via the gateway's + /// relay circuit (/p2p-circuit). This allows workers to reach the scheduler even if it's + /// behind a NAT or firewall. + /// + /// RECOMMENDATION: Keep enabled (true) unless the scheduler has a public IP and external + /// addresses configured for direct connectivity. relay_circuit: bool, + + /// OpenTelemetry Protocol (OTLP) endpoint for exporting telemetry data. + /// + /// Sends metrics, traces, and logs to an OpenTelemetry collector or compatible backend + /// (e.g., Jaeger, Prometheus, Grafana Cloud, ...). + /// + /// If unset, telemetry export is disabled (local logging only). #[serde(alias = "exporter_otlp_endpoint")] - /// OTLP Exporter endpoint for telemetry data. If unset, telemetry is disabled. telemetry_endpoint: Option, + + /// Resource attributes included in all telemetry data. + /// + /// Key-value pairs that identify this scheduler instance in your observability platform. + /// Useful for filtering and grouping metrics across multiple schedulers. + /// + /// Example Attributes: + /// * service.name: "hypha-scheduler" + /// * service.version: "0.1.0" + /// * deployment.environment: "production" + /// * host.name: "scheduler-01" + /// * job.type: "training" + /// + /// These attributes appear in all exported metrics, traces, and logs. #[serde(alias = "resource_attributes")] - /// Attributes to be included in telemetry. telemetry_attributes: Option, + + /// HTTP/gRPC headers for OTLP endpoint authentication. + /// + /// Used to authenticate with your telemetry backend. Common use cases: + /// * API keys: {"Authorization": "Bearer YOUR_API_KEY"} + /// * Custom headers: {"X-API-Key": "secret"} + /// + /// SECURITY: Protect these credentials. Use environment variables or secrets management + /// instead of hardcoding in config files. Never commit credentials to version control. 
#[serde(alias = "exporter_otlp_headers")] - /// Headers for OTLP telemetry endpoint request used for authentication. telemetry_headers: Option, + + /// Protocol for OTLP telemetry endpoint communication. + /// + /// Choose based on your collector's supported protocols. #[serde(alias = "exporter_otlp_protocol")] - /// Protocol for OTLP telemetry endpoint request. telemetry_protocol: Option, + + /// Trace sampling strategy to control volume and costs. + /// + /// Options: + /// * "always_on" - Sample every trace (high volume, expensive) + /// * "always_off" - Disable tracing (metrics and logs only) + /// * "traceidratio" - Sample traces by probability (cost-effective) + /// * "parentbased_traceidratio" - Honor parent trace decisions with fallback ratio + /// + /// RECOMMENDATION: Use "traceidratio" with sample_ratio for production to balance + /// observability with costs. Start with 0.1 (10%) and adjust based on job volume. #[serde(alias = "traces_sampler")] - /// Traces sampler: one of "always_on", "always_off", "traceidratio", or "parentbased_traceidratio". telemetry_sampler: Option, + + /// Sampling probability for ratio-based trace samplers. + /// + /// Valid range: 0.0 to 1.0 + /// * 1.0 = 100% sampling (sample every trace) + /// * 0.1 = 10% sampling (sample 1 in 10 traces) + /// * 0.01 = 1% sampling (sample 1 in 100 traces) + /// + /// Only applies to "traceidratio" and "parentbased_traceidratio" samplers. + /// + /// NOTE: Lower ratios reduce telemetry costs while maintaining statistical + /// significance. For high-volume schedulers, 0.01-0.1 is probably sufficient. #[serde(alias = "traces_sampler_arg")] - /// For `traceidratio` and `parentbased_traceidratio` samplers: Sampling probability in [0..1], - /// e.g. "0.25". Default is 1.0. telemetry_sample_ratio: Option, - // path to AIM relay server + + /// AIM relay server address for real-time training metrics (optional). + /// + /// Connects to an AIM server to stream training metrics in real-time. 
Useful for + /// monitoring job progress and visualizing training curves. + /// + /// Example: "0.0.0.0:61000" status_bridge: Option, - /// Scheduler configuration. + + /// Scheduler-specific configuration for job orchestration. + /// + /// Contains settings for resource allocation, job scheduling policies, and worker + /// management strategies. scheduler: SchedulerConfig, } diff --git a/crates/worker/Cargo.toml b/crates/worker/Cargo.toml index b6c55435..0d2926ae 100644 --- a/crates/worker/Cargo.toml +++ b/crates/worker/Cargo.toml @@ -20,6 +20,7 @@ hypha-leases.workspace = true hypha-messages.workspace = true hypha-network.workspace = true hypha-telemetry.workspace = true +indoc.workspace = true libp2p.workspace = true libp2p-stream.workspace = true miette.workspace = true @@ -41,6 +42,7 @@ utoipa.workspace = true uuid.workspace = true [build-dependencies] +indoc.workspace = true clap.workspace = true hypha-network.workspace = true libp2p.workspace = true diff --git a/crates/worker/src/cli.rs b/crates/worker/src/cli.rs index bd8ad6c2..94d39d2a 100644 --- a/crates/worker/src/cli.rs +++ b/crates/worker/src/cli.rs @@ -2,6 +2,7 @@ use std::path::PathBuf; use clap::{Parser, Subcommand, ValueEnum}; use hypha_network::IpNet; +use indoc::indoc; use libp2p::Multiaddr; use serde::{Deserialize, Serialize}; @@ -15,9 +16,11 @@ pub enum Role { #[command( name = "hypha-worker", version, - about = "Hypha Worker Node", - long_about = "Runs a Hypha Worker which executes jobs.", - after_help = "For more information, see the project documentation." + about = "Hypha Worker", + long_about = indoc!{" + The Hypha Worker executes training and inference jobs assigned by schedulers via + the Hypha network. 
+ "} )] pub struct Cli { #[command(subcommand)] @@ -26,88 +29,191 @@ pub struct Cli { #[derive(Debug, Subcommand, Serialize)] pub enum Commands { + #[command( + about = "Generate a default configuration file", + long_about = indoc!{" + Generate a default configuration file + + Creates a TOML configuration file with sensible defaults for job execution, + including certificate paths, network addresses, gateway connections, and + resource/executor settings. + + IMPORTANT: If the output file exists, it will be overwritten without warning. + "} + )] Init { /// Path where the configuration file will be written - #[clap(short, long, default_value = "config.toml")] + #[clap(short, long, default_value = "config.toml", verbatim_doc_comment)] output: PathBuf, }, - /// Probe a target multiaddr for readiness and exit 0 if healthy. + #[command( + about = "Check if a remote peer is healthy and reachable", + long_about = indoc!{" + Check if a remote peer is healthy and reachable + + Connects to the specified multiaddr, sends a health check request, and exits + with code 0 if the peer is healthy, or non-zero otherwise. + + Useful for: + * Verifying gateway connectivity before starting jobs + * Container health checks (Docker, Kubernetes) + * Monitoring worker availability + * Deployment verification and readiness checks + + NOTE: It's not possible to self-probe using the same certificate used to run the worker. + "} + )] #[serde(untagged)] Probe { - /// Path to the configuration file. - #[clap(short, long("config"), default_value = "config.toml")] + /// Path to the configuration file + /// + /// Used to load TLS certificates for secure connection to the target peer. + #[arg( + short, + long("config"), + default_value = "config.toml", + verbatim_doc_comment + )] config_file: PathBuf, - /// Path to the certificate pem. - #[clap(long("cert"))] + /// Path to the certificate PEM file (overrides config) + /// + /// Must be a valid X.509 certificate in PEM format. 
If not provided, uses + /// cert_pem from the configuration file. + #[arg(long("cert"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] cert_pem: Option, - /// Path to the private key pem. - #[clap(long("key"))] + /// Path to the private key PEM file (overrides config) + /// + /// Must correspond to the certificate. If not provided, uses key_pem from + /// the configuration file. + /// + /// SECURITY: Ensure this file has restricted permissions (e.g., chmod 600). + #[arg(long("key"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] key_pem: Option, - /// Path to the trust pem (bundle). - #[clap(long("trust"))] + /// Path to the trust chain PEM file (overrides config) + /// + /// CA bundle containing certificates trusted by this node. If not provided, + /// uses trust_pem from the configuration file. + #[arg(long("trust"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] trust_pem: Option, - /// Path to the certificate revocation list pem. - #[clap(long("crls"))] + /// Path to the certificate revocation list PEM (overrides config) + /// + /// Optional CRL for rejecting compromised certificates. If not provided, + /// uses crls_pem from the configuration file if present. + #[arg(long("crls"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] crls_pem: Option, - /// Target multiaddr to probe (e.g., /ip4/127.0.0.1/tcp/8080) - #[clap(index = 1)] + /// Target peer multiaddr to probe + /// + /// Examples: + /// /ip4/192.168.1.100/tcp/8080/ + /// /dns4/worker.example.com/tcp/443/p2p/12D3KooW... + #[arg(index = 1, verbatim_doc_comment)] address: String, - /// Timeout in milliseconds - #[clap(long, default_value_t = 2000)] + /// Maximum time to wait for health response (milliseconds) + /// + /// If the peer doesn't respond within this duration, the probe fails. + /// Increase for high-latency networks or overloaded peers. 
+ #[arg(long, default_value_t = 2000, verbatim_doc_comment)] timeout: u64, }, + + #[command( + about = "Start the worker and begin accepting jobs", + long_about = indoc!{" + Start the worker and begin accepting jobs + + Loads configuration, connects to gateways, advertises resources, and executes + assigned jobs. Runs until interrupted (SIGINT/SIGTERM) with a graceful shutdown. + "} + )] #[serde(untagged)] Run { - /// Path to the configuration file. - #[clap(short, long("config"), default_value = "config.toml")] + /// Path to the configuration file + #[arg( + short, + long("config"), + default_value = "config.toml", + verbatim_doc_comment + )] #[serde(skip)] config_file: PathBuf, - /// Addresses of the gateways (can be specified multiple times). - #[clap(long("gateway"))] + /// Gateway addresses to connect to (repeatable, overrides config) + /// + /// Gateways provide network bootstrapping, DHT access, and optional relay. + /// Must include the peer ID in the multiaddr. + /// + /// Examples: + /// --gateway /ip4/203.0.113.10/tcp/8080/p2p/12D3KooWAbc... + /// --gateway /dns4/gateway.hypha.example/tcp/443/p2p/12D3KooWAbc... + /// Required: connect to at least one gateway. + #[arg(long("gateway"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] gateway_addresses: Option>, - /// Addresses to listen on (can be specified multiple times). - #[clap(long("listen"))] + /// Addresses to listen on (repeatable, overrides config) + /// + /// Where the worker accepts incoming connections. + /// + /// Examples: + /// --listen /ip4/0.0.0.0/tcp/9091 + /// --listen /ip4/0.0.0.0/udp/9091/quic-v1 + #[arg(long("listen"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] listen_addresses: Option>, - /// External addresses to advertise (can be specified multiple times). - #[clap(long("external"))] + /// External addresses to advertise (repeatable, overrides config) + /// + /// Publicly reachable addresses peers should use to connect. 
+ /// + /// Examples: + /// --external /ip4/203.0.113.30/tcp/9091 + /// --external /dns4/worker.example.com/tcp/9091 + #[arg(long("external"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] external_addresses: Option>, - /// Enable listening via relay P2pCircuit (via gateway). Defaults to true. - #[clap(long("relay-circuit"))] + /// Enable relay circuit listening via gateway (overrides config) + /// + /// true = use relay (default), false = direct connections only. + #[arg(long("relay-circuit"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] relay_circuit: Option, - /// Socket to use for driver communication. - #[clap(long("socket"))] + /// Socket path for driver communication (overrides config) + /// + /// Unix domain socket for worker-executor communication (optional). + #[arg(long("socket"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] socket_address: Option, - /// Base directory for per-job working directories (default: /tmp). - /// Example: --work-dir /mnt/tmp - #[clap(long("work-dir"))] + /// Base directory for job working directories (overrides config) + /// + /// Where per-job working directories are created. + /// + /// Examples: + /// --work-dir /tmp + /// --work-dir /mnt/fast-ssd/hypha + #[arg(long("work-dir"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] work_dir: Option, - /// CIDR exclusion (repeatable). Overrides config if provided. - /// Example: --exclude-cidr 10.0.0.0/8 --exclude-cidr fc00::/7 - #[clap(long("exclude-cidr"))] + /// CIDR ranges to exclude from DHT (repeatable, overrides config) + /// + /// Filters out peer addresses matching these ranges before adding to the DHT. 
+ /// + /// Examples: 10.0.0.0/8, fc00::/7 + #[arg(long("exclude-cidr"), verbatim_doc_comment)] #[serde(skip_serializing_if = "Option::is_none")] exclude_cidr: Option>, }, diff --git a/crates/worker/src/config.rs b/crates/worker/src/config.rs index 42b90d28..a1477f66 100644 --- a/crates/worker/src/config.rs +++ b/crates/worker/src/config.rs @@ -15,41 +15,98 @@ use serde::{Deserialize, Serialize}; use crate::resources::ComputeResources; #[derive(Deserialize, Serialize, Documented, DocumentedFieldsOpt)] -/// Configure available resources. +/// Available compute resources advertised to schedulers for job allocation. +/// +/// Resources are reserved during job allocation. Configure conservatively to avoid +/// overcommitment and ensure jobs have sufficient resources to complete successfully. pub struct ResourceConfig { /// Available CPU cores. + /// + /// Number of CPU cores available for job execution. Jobs can request specific CPU + /// allocations, and the scheduler ensures total allocated CPU doesn't exceed this limit. cpu: u32, + /// Available memory in GB. + /// + /// Total system memory available for jobs. Configure based on actual RAM minus OS overhead + /// and other running processes. Jobs exceeding this limit may be killed by the OS. memory: u32, + /// Available storage in GB. + /// + /// Disk space available for job artifacts (models, checkpoints, datasets). Should match + /// the available space in work_dir. Jobs with large datasets or frequent checkpoints + /// require more storage. storage: u32, - // TODO: How do we want to map multiple GPUs? + /// Available GPU memory in GB. + /// + /// Total GPU memory available across all GPUs. For multi-GPU systems, this represents + /// the combined memory. Set to 0 if no GPU is available. + /// + /// NOTE: Current implementation treats this as aggregate GPU memory. Future versions + /// may support per-GPU resource tracking for heterogeneous GPU clusters. 
gpu: u32, } #[derive(Clone, Deserialize, Serialize, Documented, DocumentedFieldsOpt)] -/// Configure an executor advertised and managed by the worker. +/// Executor configuration defining available job types and their runtime implementations. +/// +/// Workers advertise executors to schedulers. Each executor specifies a class (e.g., "trainer", +/// "parameter-server") and implementation details. Schedulers match jobs to workers based on +/// required executor classes. pub struct ExecutorConfig { /// Executor descriptor exposed to the scheduler (class + name). + /// + /// The class identifies the type of executor (e.g., "trainer", "inference-engine", + /// "parameter-server"). The name is a human-readable identifier for this specific + /// executor instance. #[serde(flatten)] descriptor: ExecutorDescriptor, + /// Runtime implementation used to fulfill this executor. + /// + /// Defines how jobs are executed: built-in implementations or external processes. #[serde(flatten)] runtime: ExecutorRuntime, } #[derive(Clone, Deserialize, Serialize, Documented, DocumentedFieldsOpt)] #[serde(tag = "runtime", rename_all = "kebab-case")] -/// Runtime implementation backing an executor selector. +/// Runtime implementation backing an executor. +/// +/// Determines how the worker executes jobs assigned to this executor. pub enum ExecutorRuntime { /// Built-in parameter server implementation. + /// + /// Uses the worker's native parameter server for distributed training coordination. + /// No external process required. ParameterServer, + /// External process launched by the worker. + /// + /// The worker launches and manages an external executable to handle job execution. + /// Useful for custom training frameworks or specialized inference engines. Process { /// Command to execute for process-based executors. + /// + /// Path to the executable or command name (must be in PATH). The worker spawns + /// this process for each job assigned to this executor. 
+ /// + /// Examples: + /// * "python" - Python interpreter + /// * "/usr/local/bin/custom-trainer" - Custom training binary + /// * "docker" - Container-based execution cmd: String, + /// Arguments passed to the process-based executor. + /// + /// Command-line arguments appended when spawning the executor process. Job-specific + /// parameters are typically passed via environment variables or standard input. + /// + /// Examples: + /// * ["-m", "torch.distributed.run"] - PyTorch distributed launcher + /// * ["--config", "/etc/trainer/config.yaml"] - Config file path #[serde(default, skip_serializing_if = "Vec::is_empty")] args: Vec, }, @@ -80,53 +137,197 @@ impl ExecutorConfig { } #[derive(Deserialize, Serialize, Documented, DocumentedFieldsOpt)] -/// Configure network settings, security certificates, and runtime parameters. +/// Worker configuration for ML job execution and resource management. +/// +/// The worker executes ML training and inference jobs assigned by schedulers. It should be +/// deployed with adequate compute resources (CPU, GPU, memory, storage) for target workloads. pub struct Config { - /// Path to the certificate pem. + /// Path to the TLS certificate PEM file. + /// + /// Must be a valid X.509 certificate in PEM format that establishes this worker's + /// identity in the P2P network. The certificate must match the private key and be + /// trusted by all peers. + /// + /// SECURITY: Use certificates from a recognized CA or internal PKI for production deployments. cert_pem: PathBuf, - /// Path to the private key pem. + + /// Path to the private key PEM file. + /// + /// Must correspond to cert_pem. This is the worker's cryptographic identity. + /// + /// SECURITY: + /// * Restrict file permissions (chmod 600 recommended) + /// * Never commit to version control + /// * Store securely using secrets management systems in production + /// * Keep backups in secure, encrypted storage key_pem: PathBuf, - /// Path to the trust pem (bundle). 
+ + /// Path to the trust chain PEM file (CA bundle). + /// + /// Contains root and intermediate certificates trusted by this worker. Peers presenting + /// certificates signed by these CAs will be accepted for network connections. trust_pem: PathBuf, - /// Path to the certificate revocation list pem. + + /// Path to certificate revocation list PEM (optional). + /// + /// Specifies certificates that should no longer be trusted, even if they're in the trust + /// chain. Used for compromised certificates or decommissioned peers. + /// + /// SECURITY: Keep this updated with your certificate authority's latest CRL to maintain + /// network security. Automate CRL updates in production environments. crls_pem: Option, - /// Addresses of the gateways. + + /// Gateway addresses to connect to (required for network entry). + /// + /// Specifies one or more gateways for network bootstrapping and relay functionality. + /// + /// Multiple gateways provide redundancy; the worker attempts to connect to all + /// and succeeds if any are reachable. + /// + /// Examples: + /// * "/ip4/203.0.113.10/tcp/8080/" + /// * "/dns4/gateway.hypha.example/tcp/443/" gateway_addresses: Vec, - /// Addresses to listen on. + + /// Network addresses to listen on for incoming connections. + /// + /// Supports TCP and QUIC protocols. Use port 0 to let the OS assign available ports. + /// + /// Examples: + /// * "/ip4/0.0.0.0/tcp/0" - TCP on all interfaces, OS-assigned port + /// * "/ip4/0.0.0.0/udp/0/quic-v1" - QUIC on all interfaces, OS-assigned port listen_addresses: Vec, - /// External addresses to advertise. Only list addresses that are guaranteed to be reachable from the internet. + + /// External addresses to advertise for peer discovery (optional). + /// + /// Only advertise addresses that schedulers can reliably reach. Most workers rely on + /// relay circuits and don't need external addresses. 
+ /// + /// Examples: + /// * "/ip4/203.0.113.30/tcp/9091" + /// * "/dns4/worker.example.com/tcp/9091" external_addresses: Vec, - /// CIDR address filters applied before adding Identify-reported listen addresses to Kademlia. - /// Use standard CIDR notation (e.g., "10.0.0.0/8", "fc00::/7"). Defaults to reserved ranges. + + /// CIDR address filters for DHT routing table management. + /// + /// Peer addresses matching these CIDR ranges are filtered out before they are added + /// to the Kademlia DHT. This prevents routing to non-routable or private addresses. + /// + /// Defaults to reserved/private ranges (loopback, RFC1918, etc.). + /// + /// Add additional ranges to filter internal addresses specific to your network topology. + /// + /// NOTE: This only affects DHT address filtering, not direct peer connections. #[serde(default = "reserved_cidrs")] exclude_cidr: Vec, - /// Enable listening via relay P2pCircuit through the gateway. - /// Default is true to ensure inbound connectivity via relays. + + /// Enable listening via relay circuit through the gateway. + /// + /// When enabled (default), the worker establishes a listening address via the gateway's + /// relay circuit (/p2p-circuit). This allows schedulers to reach the worker even if it's + /// behind a NAT or firewall. + /// + /// RECOMMENDATION: Keep enabled (true) unless the worker has a public IP and external + /// addresses configured for direct connectivity. relay_circuit: bool, + /// Base directory for per-job working directories. + /// + /// Each job gets a unique subdirectory named hypha-{job_uuid} containing downloaded + /// models, checkpoints, datasets, and job outputs. 
+ /// + /// REQUIREMENTS: + /// * Sufficient free space matching the configured storage resource + /// * Fast I/O performance (SSD strongly recommended for training) + /// * Proper write permissions for the worker process + /// * Adequate space for concurrent jobs if running multiple + /// + /// Jobs clean up their directories on successful completion, but failures may leave + /// artifacts for debugging. work_dir: PathBuf, - /// Available resources. + + /// Available compute resources advertised to schedulers. + /// + /// Accurately configure resources to enable proper job allocation. Resources are + /// reserved during job allocation and jobs exceeding available resources will be rejected. resources: ResourceConfig, - /// Available executors. + + /// Available executors for different job types. + /// + /// Executors define the runtime implementations available on this worker. Configure + /// executors for training, inference, parameter serving, or custom workloads. + /// + /// Workers with no executors configured cannot accept jobs. executors: Vec, + + /// OpenTelemetry Protocol (OTLP) endpoint for exporting telemetry data. + /// + /// Sends metrics, traces, and logs to an OpenTelemetry collector or compatible backend + /// (e.g., Jaeger, Prometheus, Grafana Cloud). + /// + /// If unset, telemetry export is disabled (local logging only). #[serde(alias = "exporter_otlp_endpoint")] - /// OTLP Exporter endpoint for telemetry data. If unset, telemetry is disabled. telemetry_endpoint: Option, + + /// Resource attributes included in all telemetry data. + /// + /// Key-value pairs that identify this worker instance in your observability platform. + /// Useful for filtering and grouping metrics across multiple workers. 
+ /// + /// Example Attributes: + /// * service.name: "hypha-worker" + /// * service.version: "0.1.0" + /// * deployment.environment: "production" + /// * host.name: "worker-gpu-01" + /// * hardware.gpu: "nvidia-a100" + /// * resource.tier: "high-memory" + /// + /// These attributes appear in all exported metrics, traces, and logs. #[serde(alias = "resource_attributes")] - /// Attributes to be included in telemetry. telemetry_attributes: Option, + + /// HTTP/gRPC headers for OTLP endpoint authentication. + /// + /// Used to authenticate with your telemetry backend. Common use cases: + /// * API keys: {"Authorization": "Bearer YOUR_API_KEY"} + /// * Custom headers: {"X-API-Key": "secret"} + /// + /// SECURITY: Protect these credentials. Use environment variables or secrets management + /// instead of hardcoding in config files. Never commit credentials to version control. #[serde(alias = "exporter_otlp_headers")] - /// Headers for OTLP telemetry endpoint request used for authentication. telemetry_headers: Option, + + /// Protocol for OTLP telemetry endpoint communication. + /// + /// Choose based on your collector's supported protocols. #[serde(alias = "exporter_otlp_protocol")] - /// Protocol for OTLP telemetry endpoint request. telemetry_protocol: Option, + + /// Trace sampling strategy to control volume and costs. + /// + /// Options: + /// * "always_on" - Sample every trace (high volume, expensive) + /// * "always_off" - Disable tracing (metrics and logs only) + /// * "traceidratio" - Sample traces by probability (cost-effective) + /// * "parentbased_traceidratio" - Honor parent trace decisions with fallback ratio + /// + /// RECOMMENDATION: Use "traceidratio" with sample_ratio for production to balance + /// observability with costs. Start with 0.1 (10%) and adjust based on job volume. #[serde(alias = "traces_sampler")] - /// Traces sampler: one of "always_on", "always_off", "traceidratio", or "parentbased_traceidratio". 
telemetry_sampler: Option, + + /// Sampling probability for ratio-based trace samplers. + /// + /// Valid range: 0.0 to 1.0 + /// * 1.0 = 100% sampling (sample every trace) + /// * 0.1 = 10% sampling (sample 1 in 10 traces) + /// * 0.01 = 1% sampling (sample 1 in 100 traces) + /// + /// Only applies to "traceidratio" and "parentbased_traceidratio" samplers. + /// + /// NOTE: Lower ratios reduce telemetry costs while maintaining statistical + /// significance. For high-throughput workers, 0.01-0.1 is probably sufficient. #[serde(alias = "traces_sampler_arg")] - /// For `traceidratio` and `parentbased_traceidratio` samplers: Sampling probability in [0..1], - /// e.g. "0.25". Default is 1.0. telemetry_sample_ratio: Option, }