Skip to content

Commit 1a39979

Browse files
authored
Sgl-router Prometheus metrics endpoint and usage track metrics (#6537)
1 parent 022012a commit 1a39979

File tree

9 files changed

+167
-2
lines changed

9 files changed

+167
-2
lines changed

sgl-router/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ tracing-appender = "0.2.3"
3030
kube = { version = "0.88.1", features = ["runtime", "derive"] }
3131
k8s-openapi = { version = "0.21.0", features = ["v1_29"] }
3232
futures = "0.3"
33+
# Added for metrics
34+
metrics = "0.24.2"
35+
metrics-exporter-prometheus = "0.17.0"
3336
[profile.release]
3437
lto = "thin"
3538
codegen-units = 1

sgl-router/README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,18 @@ router = Router(
8181

8282
Use the `--verbose` flag with the CLI for more detailed logs.
8383

84+
### Metrics
85+
86+
SGL Router exposes a Prometheus HTTP scrape endpoint for monitoring, which by default listens at 127.0.0.1:29000.
87+
88+
To change the endpoint to listen on all network interfaces and set the port to 9000, configure the following options when launching the router:
89+
```
90+
python -m sglang_router.launch_router \
91+
--worker-urls http://localhost:8080 http://localhost:8081 \
92+
--prometheus-host 0.0.0.0 \
93+
--prometheus-port 9000
94+
```
95+
8496
### Kubernetes Service Discovery
8597

8698
SGL Router supports automatic service discovery for worker nodes in Kubernetes environments. When enabled, the router will automatically:

sgl-router/py_src/sglang_router/launch_router.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ class RouterArgs:
4848
selector: Dict[str, str] = dataclasses.field(default_factory=dict)
4949
service_discovery_port: int = 80
5050
service_discovery_namespace: Optional[str] = None
51+
# Prometheus configuration
52+
prometheus_port: Optional[int] = None
53+
prometheus_host: Optional[str] = None
5154

5255
@staticmethod
5356
def add_cli_args(
@@ -176,6 +179,19 @@ def add_cli_args(
176179
type=str,
177180
help="Kubernetes namespace to watch for pods. If not provided, watches all namespaces (requires cluster-wide permissions)",
178181
)
182+
# Prometheus configuration
183+
parser.add_argument(
184+
f"--{prefix}prometheus-port",
185+
type=int,
186+
default=29000,
187+
help="Port to expose Prometheus metrics (default: 29000)",
188+
)
189+
parser.add_argument(
190+
f"--{prefix}prometheus-host",
191+
type=str,
192+
default="127.0.0.1",
193+
help="Host address to bind the Prometheus metrics server",
194+
)
179195

180196
@classmethod
181197
def from_cli_args(
@@ -215,6 +231,8 @@ def from_cli_args(
215231
service_discovery_namespace=getattr(
216232
args, f"{prefix}service_discovery_namespace", None
217233
),
234+
prometheus_port=getattr(args, f"{prefix}prometheus_port", None),
235+
prometheus_host=getattr(args, f"{prefix}prometheus_host", None),
218236
)
219237

220238
@staticmethod
@@ -278,6 +296,8 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
278296
selector=router_args.selector,
279297
service_discovery_port=router_args.service_discovery_port,
280298
service_discovery_namespace=router_args.service_discovery_namespace,
299+
prometheus_port=router_args.prometheus_port,
300+
prometheus_host=router_args.prometheus_host,
281301
)
282302

283303
router.start()

sgl-router/py_src/sglang_router/router.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ class Router:
4040
worker URLs using this port. Default: 80
4141
service_discovery_namespace: Kubernetes namespace to watch for pods. If not provided,
4242
watches pods across all namespaces (requires cluster-wide permissions). Default: None
43+
prometheus_port: Port to expose Prometheus metrics. Default: None (the router then falls back to 29000)
44+
prometheus_host: Host address to bind the Prometheus metrics server. Default: None (the router then falls back to 127.0.0.1)
4345
"""
4446

4547
def __init__(
@@ -62,6 +64,8 @@ def __init__(
6264
selector: Dict[str, str] = None,
6365
service_discovery_port: int = 80,
6466
service_discovery_namespace: Optional[str] = None,
67+
prometheus_port: Optional[int] = None,
68+
prometheus_host: Optional[str] = None,
6569
):
6670
if selector is None:
6771
selector = {}
@@ -85,6 +89,8 @@ def __init__(
8589
selector=selector,
8690
service_discovery_port=service_discovery_port,
8791
service_discovery_namespace=service_discovery_namespace,
92+
prometheus_port=prometheus_port,
93+
prometheus_host=prometheus_host,
8894
)
8995

9096
def start(self) -> None:

sgl-router/py_test/test_launch_server.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ def popen_launch_router(
2828
selector: list = None,
2929
service_discovery_port: int = 80,
3030
service_discovery_namespace: str = None,
31+
prometheus_port: int = None,
32+
prometheus_host: str = None,
3133
):
3234
"""
3335
Launch the router server process.
@@ -45,6 +47,8 @@ def popen_launch_router(
4547
selector: List of label selectors in format ["key1=value1", "key2=value2"]
4648
service_discovery_port: Port to use for service discovery
4749
service_discovery_namespace: Kubernetes namespace to watch for pods. If None, watches all namespaces.
50+
prometheus_port: Port to expose Prometheus metrics. If None, the flag is omitted and the router's default port is used.
51+
prometheus_host: Host address to bind the Prometheus metrics server.
4852
"""
4953
_, host, port = base_url.split(":")
5054
host = host[2:]
@@ -87,6 +91,12 @@ def popen_launch_router(
8791
["--router-service-discovery-namespace", service_discovery_namespace]
8892
)
8993

94+
if prometheus_port is not None:
95+
command.extend(["--router-prometheus-port", str(prometheus_port)])
96+
97+
if prometheus_host is not None:
98+
command.extend(["--router-prometheus-host", prometheus_host])
99+
90100
if log_dir is not None:
91101
command.extend(["--log-dir", log_dir])
92102

sgl-router/src/lib.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
use pyo3::prelude::*;
22
pub mod logging;
33
use std::collections::HashMap;
4+
pub mod prometheus;
45
pub mod router;
56
pub mod server;
67
pub mod service_discovery;
78
pub mod tree;
9+
use crate::prometheus::PrometheusConfig;
810

911
#[pyclass(eq)]
1012
#[derive(Clone, PartialEq, Debug)]
@@ -35,6 +37,8 @@ struct Router {
3537
selector: HashMap<String, String>,
3638
service_discovery_port: u16,
3739
service_discovery_namespace: Option<String>,
40+
prometheus_port: Option<u16>,
41+
prometheus_host: Option<String>,
3842
}
3943

4044
#[pymethods]
@@ -58,7 +62,9 @@ impl Router {
5862
service_discovery = false,
5963
selector = HashMap::new(),
6064
service_discovery_port = 80,
61-
service_discovery_namespace = None
65+
service_discovery_namespace = None,
66+
prometheus_port = None,
67+
prometheus_host = None
6268
))]
6369
fn new(
6470
worker_urls: Vec<String>,
@@ -79,6 +85,8 @@ impl Router {
7985
selector: HashMap<String, String>,
8086
service_discovery_port: u16,
8187
service_discovery_namespace: Option<String>,
88+
prometheus_port: Option<u16>,
89+
prometheus_host: Option<String>,
8290
) -> PyResult<Self> {
8391
Ok(Router {
8492
host,
@@ -99,6 +107,8 @@ impl Router {
99107
selector,
100108
service_discovery_port,
101109
service_discovery_namespace,
110+
prometheus_port,
111+
prometheus_host,
102112
})
103113
}
104114

@@ -136,6 +146,15 @@ impl Router {
136146
None
137147
};
138148

149+
// Always build the Prometheus config, defaulting to 127.0.0.1:29000 when host/port are unset
150+
let prometheus_config = Some(PrometheusConfig {
151+
port: self.prometheus_port.unwrap_or(29000),
152+
host: self
153+
.prometheus_host
154+
.clone()
155+
.unwrap_or_else(|| "127.0.0.1".to_string()),
156+
});
157+
139158
actix_web::rt::System::new().block_on(async move {
140159
server::startup(server::ServerConfig {
141160
host: self.host.clone(),
@@ -146,6 +165,7 @@ impl Router {
146165
max_payload_size: self.max_payload_size,
147166
log_dir: self.log_dir.clone(),
148167
service_discovery_config,
168+
prometheus_config,
149169
})
150170
.await
151171
.map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?;

sgl-router/src/prometheus.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
use metrics_exporter_prometheus::{Matcher, PrometheusBuilder};
2+
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
3+
use std::time::Duration;
4+
5+
/// Settings for the Prometheus metrics HTTP scrape listener.
#[derive(Debug, Clone)]
pub struct PrometheusConfig {
    /// TCP port the metrics listener binds to.
    pub port: u16,
    /// Address the metrics listener binds to; `start_prometheus` parses it as an IP address.
    pub host: String,
}

impl Default for PrometheusConfig {
    /// Returns a configuration that listens on `127.0.0.1:29000`.
    ///
    /// The default binds to loopback so metrics are not exposed on external
    /// interfaces unless a host is configured explicitly.
    fn default() -> Self {
        Self {
            port: 29000,
            host: "127.0.0.1".to_string(),
        }
    }
}
19+
20+
pub fn start_prometheus(config: PrometheusConfig) {
21+
let duration_matcher = Matcher::Suffix(String::from("duration"));
22+
let duration_bucket = [
23+
0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 15.0, 30.0, 45.0,
24+
60.0, 90.0, 120.0, 180.0, 240.0,
25+
];
26+
27+
let ip_addr: IpAddr = config
28+
.host
29+
.parse()
30+
.unwrap_or(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)));
31+
let socket_addr = SocketAddr::new(ip_addr, config.port);
32+
33+
PrometheusBuilder::new()
34+
.with_http_listener(socket_addr)
35+
.upkeep_timeout(Duration::from_secs(5 * 60))
36+
.set_buckets_for_metric(duration_matcher, &duration_bucket)
37+
.expect("failed to set duration bucket")
38+
.install()
39+
.expect("failed to install Prometheus metrics exporter");
40+
}

0 commit comments

Comments
 (0)