diff --git a/.dockerignore b/.dockerignore index 209ff4ff4..d86d894d3 100644 --- a/.dockerignore +++ b/.dockerignore @@ -26,3 +26,4 @@ !/reqactor !/ballot !/redis-derive +!/metrics diff --git a/Cargo.lock b/Cargo.lock index 05eec9054..4f3c710fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6181,6 +6181,7 @@ dependencies = [ "bytemuck", "cap", "cfg-if", + "chrono", "clap 4.5.9", "dotenv", "env_logger", @@ -6195,6 +6196,7 @@ dependencies = [ "raiko-ballot", "raiko-core", "raiko-lib", + "raiko-metrics", "raiko-reqactor", "raiko-reqpool", "raiko-tasks", @@ -6273,6 +6275,15 @@ dependencies = [ "utoipa", ] +[[package]] +name = "raiko-metrics" +version = "0.1.0" +dependencies = [ + "lazy_static", + "prometheus", + "raiko-lib", +] + [[package]] name = "raiko-pipeline" version = "0.1.0" @@ -6308,6 +6319,7 @@ dependencies = [ "raiko-ballot", "raiko-core", "raiko-lib", + "raiko-metrics", "raiko-reqpool", "reth-primitives", "serde", @@ -6330,6 +6342,7 @@ dependencies = [ "quote", "raiko-core", "raiko-lib", + "raiko-metrics", "raiko-redis-derive", "redis", "serde", diff --git a/Cargo.toml b/Cargo.toml index 14665ba36..bec780a58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ members = [ "reqpool", "reqactor", "ballot", + "metrics", ] # Always optimize; building and running the guest takes much longer without optimization. 
@@ -45,6 +46,7 @@ raiko-redis-derive = { path = "./redis-derive" } raiko-reqpool = { path = "./reqpool" } raiko-reqactor = { path = "./reqactor" } raiko-ballot = { path = "./ballot" } +raiko-metrics = { path = "./metrics" } # reth reth-primitives = { git = "https://github.com/taikoxyz/taiko-reth.git", branch = "v1.0.0-rc.2-taiko", default-features = false, features = [ diff --git a/docker/docker-compose.metrics.yml b/docker/docker-compose.metrics.yml index 9200cc89a..4ed5ccc53 100644 --- a/docker/docker-compose.metrics.yml +++ b/docker/docker-compose.metrics.yml @@ -18,6 +18,8 @@ services: - "3000:3000" volumes: - 'grafana_storage:/var/lib/grafana' + depends_on: + - prometheus volumes: grafana_storage: {} prometheus_storage: {} diff --git a/docker/monitoring/grafana/raiko.json b/docker/monitoring/grafana/raiko.json index ec2524d9f..43fb521bd 100644 --- a/docker/monitoring/grafana/raiko.json +++ b/docker/monitoring/grafana/raiko.json @@ -1,41 +1,4 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.4.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -52,29 +15,97 @@ } ] }, - "description": "", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 12, "links": [], "panels": [ { + "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, - "id": 11, - "title": "Traffic", + "id": 100, + "panels": [], + "title": "Overview", "type": "row" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": 
"aefllvr997awwf" + }, + "description": "Number of HTTP requests received in the last 5 minutes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "increase(raiko_http_request_count[5m])", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "HTTP Requests (per 5m)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aefllvr997awwf" }, + "description": "Number of transitions to failed status in the last 5 minutes by request type", "fieldConfig": { "defaults": { "color": { @@ -83,29 +114,27 @@ "custom": { "axisBorderShow": false, "axisCenteredZero": false, - "axisColorMode": "series", + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "opacity", + "fillOpacity": 10, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, - "lineInterpolation": "smooth", - "lineStyle": { - "fill": "solid" - }, + "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -115,62 +144,166 
@@ "mode": "off" } }, - "decimals": 0, - "fieldMinMax": true, "mappings": [], - "min": 0, + "noValue": "No failed requests :D", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "red", + "value": 1 } ] }, - "unit": "reqps" + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 8, - "w": 12, - "x": 0, + "h": 5, + "w": 6, + "x": 6, "y": 1 }, "id": 12, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "10.4.0", + "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(guest) (increase(guest_proof_request_count[$__rate_interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, + "editorMode": "code", + "expr": "sum by(request_type) (increase(raiko_pool_transition_duration_millis_count{proof_type=~\"$proof_type\", request_type=~\"$request_type\", status_to=\"failed\"}[5m]))", + "hide": false, "instant": false, - "legendFormat": "{{guest}}", + "legendFormat": "{{request_type}}", "range": true, - "refId": "A", - "useBackend": false + "refId": "A" } ], - "title": "Traffic", + "title": "Failed Transitions (per 5m)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "aefllvr997awwf" + }, + "description": "Number of requests to the pool in the last 5 minutes by request type and proof type", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 23, + "x": 0, + "y": 6 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (request_type, proof_type) (increase(raiko_pool_request_count{proof_type=~\"$proof_type\", request_type=~\"$request_type\"}[5m]))", + "instant": false, + "legendFormat": "{{request_type}}.{{proof_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Pool Requests by Type (per 5m)", "type": "timeseries" }, { @@ -179,18 +312,19 @@ "h": 1, "w": 24, "x": 0, - "y": 9 + "y": 13 }, - "id": 10, + "id": 101, "panels": [], - "title": "Errors", + "title": "Pool", "type": "row" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "aefllvr997awwf" }, + "description": "Duration of requests transitioning from Register to WIP status (${percentile}th percentile)", "fieldConfig": { "defaults": { "color": { @@ -202,11 +336,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 16, - "axisSoftMin": 0, "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": 
"none", "hideFrom": { "legend": false, @@ -220,7 +353,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -237,61 +370,61 @@ { "color": "green", "value": null - }, - { - "color": "red", - "value": 80 } ] - } + }, + "unit": "ms" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 12, + "w": 8, "x": 0, - "y": 10 + "y": 14 }, "id": 3, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, + "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "rate(guest_proof_error_count[$__rate_interval])", - "fullMetaSearch": false, - "includeNullMetadata": true, + "editorMode": "code", + "expr": "histogram_quantile($percentile/100, sum by(le, request_type) (rate(raiko_pool_transition_duration_millis_bucket{status_from=\"registered\", status_to=\"wip\", request_type=~\"$request_type\"}[5m])))", "instant": false, - "legendFormat": "{{guest}}", + "legendFormat": "{{request_type}}", "range": true, - "refId": "A", - "useBackend": false + "refId": "A" } ], - "title": "Per prover error rate", + "title": "Pool Transition Duration | Register → WIP", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "aefllvr997awwf" }, + "description": "Duration of requests transitioning from WIP to Success status (${percentile}th percentile)", "fieldConfig": { "defaults": { "color": { @@ -303,11 +436,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 16, - "axisSoftMin": 0, "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", 
"hideFrom": { "legend": false, @@ -321,7 +453,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -338,54 +470,53 @@ { "color": "green", "value": null - }, - { - "color": "red", - "value": 0.1 } ] - } + }, + "unit": "ms" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 10 + "w": 8, + "x": 8, + "y": 14 }, - "id": 2, + "id": 11, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, + "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "rate(host_error_count[$__rate_interval])", - "fullMetaSearch": false, - "includeNullMetadata": true, + "editorMode": "code", + "expr": "histogram_quantile($percentile/100, sum by(le, request_type) (rate(raiko_pool_transition_duration_millis_bucket{proof_type=\"sgx\", status_from=\"wip\", status_to=\"success\", request_type=~\"$request_type\"}[5m])))", "instant": false, - "legendFormat": "Error rate", + "legendFormat": "{{request_type}}", "range": true, - "refId": "A", - "useBackend": false + "refId": "A" } ], - "title": "Non prover error rate", + "title": "Pool Transition Duration | WIP → Success", "type": "timeseries" }, { @@ -394,18 +525,19 @@ "h": 1, "w": 24, "x": 0, - "y": 18 + "y": 22 }, - "id": 9, + "id": 102, "panels": [], - "title": "Saturation", + "title": "Actor Channel", "type": "row" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "aefllvr997awwf" }, + "description": "Estimated number of requests currently in the actor channel queue", "fieldConfig": { "defaults": { "color": { @@ -414,12 +546,13 @@ "custom": { "axisBorderShow": false, 
"axisCenteredZero": false, - "axisColorMode": "series", + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -433,7 +566,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -451,13 +584,16 @@ "color": "green", "value": null }, + { + "color": "yellow", + "value": 50 + }, { "color": "red", - "value": 80 + "value": 100 } ] - }, - "unit": "decbytes" + } }, "overrides": [] }, @@ -465,137 +601,143 @@ "h": 8, "w": 12, "x": 0, - "y": 19 + "y": 23 }, - "id": 7, + "id": 5, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, - "timezone": [ - "" - ], "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, + "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "process_virtual_memory_bytes", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Virtual", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "process_resident_memory_bytes", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, + "editorMode": "code", + "expr": "(sum by (request_type, proof_type) (raiko_actor_channel_in_count{proof_type=~\"$proof_type\", request_type=~\"$request_type\"})) - (sum by (request_type, proof_type) (raiko_actor_channel_out_count{proof_type=~\"$proof_type\", request_type=~\"$request_type\"}))", "instant": false, - "legendFormat": 
"Resident", + "legendFormat": "Queue Depth {{request_type}}.{{proof_type}}", "range": true, - "refId": "B", - "useBackend": false + "refId": "A" } ], - "title": "Raiko memory utilization", + "title": "Actor Channel Estimated Channel Length", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "aefllvr997awwf" }, + "description": "Time taken for requests to process through the actor channel (${percentile}th percentile)", "fieldConfig": { "defaults": { "color": { - "mode": "continuous-BlPu" + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, - "fieldMinMax": true, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null - }, - { - "color": "red", - "value": 12 } ] }, - "unit": "none" + "unit": "ms" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 19 + "w": 24, + "x": 0, + "y": 31 }, - "id": 1, - "interval": "1", + "id": 6, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { + "legend": { "calcs": [ + "mean", + "max", "lastNotNull" ], - "fields": "", - "values": false + "displayMode": "table", + "placement": "bottom", + "showLegend": true }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true + "tooltip": { + "hideZeros": false, + "mode": "single", + 
"sort": "none" + } }, - "pluginVersion": "10.4.0", + "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "concurrent_requests", - "fullMetaSearch": false, - "includeNullMetadata": true, + "editorMode": "code", + "expr": "histogram_quantile($percentile/100, sum by (le) (rate(raiko_actor_channel_in_duration_millis_bucket{}[5m])))", "instant": false, - "legendFormat": "Concurrent requests", + "legendFormat": "duration (~0ms by expect)", "range": true, - "refId": "A", - "useBackend": false + "refId": "A" } ], - "title": "Concurrent proof requests", - "type": "stat" + "title": "Actor Channel send.await Duration (${percentile}th Percentile)", + "type": "timeseries" }, { "collapsed": false, @@ -603,18 +745,19 @@ "h": 1, "w": 24, "x": 0, - "y": 27 + "y": 39 }, - "id": 8, + "id": 103, "panels": [], - "title": "Latency", + "title": "Actor Proving Process", "type": "row" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "aefllvr997awwf" }, + "description": "Time taken for the actor to generate input (${percentile}th percentile)", "fieldConfig": { "defaults": { "color": { @@ -623,13 +766,13 @@ "custom": { "axisBorderShow": false, "axisCenteredZero": false, - "axisColorMode": "series", + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMin": 0, "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -637,14 +780,13 @@ "viz": false }, "insertNulls": false, - "lineInterpolation": "smooth", + "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { - "log": 2, - "type": "log" + "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -661,14 +803,10 @@ { "color": "green", "value": null - }, - { - "color": 
"red", - "value": 80 } ] }, - "unit": "s" + "unit": "ms" }, "overrides": [] }, @@ -676,49 +814,50 @@ "h": 8, "w": 12, "x": 0, - "y": 28 + "y": 40 }, - "id": 4, + "id": 7, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "10.4.0", + "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(success) (rate(total_time_histogram_bucket[$__interval]))", - "format": "heatmap", - "fullMetaSearch": false, - "includeNullMetadata": true, + "editorMode": "code", + "expr": "histogram_quantile($percentile/100, sum by (le, request_type) (rate(raiko_actor_generating_input_duration_millis_bucket{request_type=~\"$request_type\"}[5m])))", "instant": false, - "legendFormat": "Request success: {{success}}", + "legendFormat": "{{request_type}}", "range": true, - "refId": "A", - "useBackend": false + "refId": "A" } ], - "title": "Total Raiko latency", + "title": "Actor Generating Input Duration (${percentile}th Percentile)", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "aefllvr997awwf" }, + "description": "Time taken for the actor to generate output (${percentile}th percentile)", "fieldConfig": { "defaults": { "color": { @@ -727,13 +866,13 @@ "custom": { "axisBorderShow": false, "axisCenteredZero": false, - "axisColorMode": "series", + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMin": 0, "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -741,14 +880,13 @@ "viz": false }, "insertNulls": false, - "lineInterpolation": "smooth", + 
"lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { - "log": 2, - "type": "log" + "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -768,7 +906,7 @@ } ] }, - "unit": "s" + "unit": "ms" }, "overrides": [] }, @@ -776,48 +914,50 @@ "h": 8, "w": 12, "x": 12, - "y": 28 + "y": 40 }, - "id": 5, + "id": 8, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, + "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(success, guest) (rate(guest_proof_time_histogram_bucket[$__interval]))", - "format": "heatmap", - "fullMetaSearch": false, - "includeNullMetadata": true, + "editorMode": "code", + "expr": "histogram_quantile($percentile/100, sum by (le, request_type) (rate(raiko_actor_generating_output_duration_millis_bucket{request_type=~\"$request_type\"}[5m])))", "instant": false, - "legendFormat": "{{guest}} success: {{success}}", + "legendFormat": "{{request_type}}", "range": true, - "refId": "A", - "useBackend": false + "refId": "A" } ], - "title": "Prover latency", + "title": "Actor Generating Output Duration (${percentile}th Percentile)", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "aefllvr997awwf" }, + "description": "Time taken for the actor to generate proofs (${percentile}th percentile)", "fieldConfig": { "defaults": { "color": { @@ -826,13 +966,13 @@ "custom": { "axisBorderShow": false, "axisCenteredZero": false, - "axisColorMode": "series", + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMin": 0, "barAlignment": 0, + "barWidthFactor": 0.6, 
"drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -840,14 +980,13 @@ "viz": false }, "insertNulls": false, - "lineInterpolation": "smooth", + "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { - "log": 2, - "type": "log" + "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -867,90 +1006,253 @@ } ] }, - "unit": "s" + "unit": "ms" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 12, + "w": 24, "x": 0, - "y": 36 + "y": 48 }, - "id": 6, + "id": 9, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, + "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(success) (rate(prepare_input_time_histogram_bucket[$__interval]))", - "format": "heatmap", - "fullMetaSearch": false, - "includeNullMetadata": true, + "editorMode": "code", + "expr": "histogram_quantile($percentile/100, sum by (le, request_type, proof_type) (rate(raiko_actor_proving_duration_millis_bucket{proof_type=~\"$proof_type\", request_type=~\"$request_type\"}[5m])))", "instant": false, - "legendFormat": "Request success: {{success}}", + "legendFormat": "{{request_type}}.{{proof_type}}", "range": true, - "refId": "A", - "useBackend": false + "refId": "A" } ], - "title": "Input prepare latency", + "title": "Actor Proving Duration (${percentile}th Percentile)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 56 + }, + "id": 104, + "panels": [], + "title": "Batch Request", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"aefllvr997awwf" + }, + "description": "Distribution of block counts in batch requests in the last 5 minutes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 57 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "increase(raiko_batch_request_block_count_bucket[5m])", + "instant": false, + "legendFormat": "le {{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Batch Request Block Count (per 5m)", "type": "timeseries" } ], + "preload": false, "refresh": "5s", - "schemaVersion": 39, + "schemaVersion": 40, "tags": [ "raiko", - "proof" + "prometheus" ], "templating": { "list": [ { "current": { - "selected": false, - "text": "prometheus", - "value": "ediia2rrpaf40c" + "text": "All", + "value": "$__all" }, - "description": "The Prometheus instance which 
contains the Raiko metrics", - "hide": 0, - "includeAll": false, - "label": "Raiko Prometheus", - "multi": false, - "name": "raiko_prometheus", + "includeAll": true, + "label": "Proof Type", + "multi": true, + "name": "proof_type", "options": [], - "query": "prometheus", - "queryValue": "", + "query": { + "query": "label_values(raiko_pool_request_count, proof_type)", + "refId": "StandardVariableQuery" + }, "refresh": 1, "regex": "", - "skipUrlSync": false, - "type": "datasource" + "type": "query" + }, + { + "current": { + "text": "All", + "value": "$__all" + }, + "includeAll": true, + "label": "Request Type", + "multi": true, + "name": "request_type", + "options": [], + "query": { + "query": "label_values(raiko_pool_request_count, request_type)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": { + "text": "90", + "value": "90" + }, + "includeAll": false, + "label": "Percentile", + "name": "percentile", + "options": [ + { + "selected": false, + "text": "50", + "value": "50" + }, + { + "selected": true, + "text": "90", + "value": "90" + }, + { + "selected": false, + "text": "95", + "value": "95" + }, + { + "selected": false, + "text": "99", + "value": "99" + } + ], + "query": "50,90,95,99", + "type": "custom" } ] }, "time": { - "from": "now-1h", + "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "browser", - "title": "Raiko Metrics", - "uid": "bdishcsl6e2v4e", + "title": "Raiko Dashboard", + "uid": "eegex5m3m7z7kf", "version": 3, "weekStart": "" } diff --git a/host/Cargo.toml b/host/Cargo.toml index de551b303..9cc2060d3 100644 --- a/host/Cargo.toml +++ b/host/Cargo.toml @@ -18,6 +18,7 @@ raiko-tasks = { workspace = true } raiko-reqpool = { workspace = true } raiko-reqactor = { workspace = true } raiko-ballot = { workspace = true } +raiko-metrics = { workspace = true } # alloy alloy-rlp = { workspace = true } @@ -72,6 +73,7 @@ url = { workspace = true } cfg-if = { workspace = true } cap = { 
workspace = true } dotenv = { workspace = true } +chrono = { workspace = true } # reth diff --git a/host/src/lib.rs b/host/src/lib.rs index b65313532..4c2435b9c 100644 --- a/host/src/lib.rs +++ b/host/src/lib.rs @@ -15,7 +15,6 @@ use crate::interfaces::HostResult; pub mod cache; pub mod interfaces; -pub mod metrics; pub mod server; #[derive(Default, Clone, Serialize, Deserialize, Debug, Parser)] diff --git a/host/src/metrics.rs b/host/src/metrics.rs deleted file mode 100644 index 26babdd63..000000000 --- a/host/src/metrics.rs +++ /dev/null @@ -1,169 +0,0 @@ -use std::time::Duration; - -use lazy_static::lazy_static; -use prometheus::{ - labels, register_histogram_vec, register_int_counter_vec, register_int_gauge, HistogramVec, - IntCounterVec, IntGauge, -}; -use raiko_lib::proof_type::ProofType; - -lazy_static! { - pub static ref HOST_REQ_COUNT: IntCounterVec = register_int_counter_vec!( - "host_request_count", - "the number of requests sent to the host", - &["block_id"] - ) - .unwrap(); - pub static ref HOST_ERROR_COUNT: IntCounterVec = register_int_counter_vec!( - "host_error_count", - "the number of failed requests produced by the host", - &["block_id"] - ) - .unwrap(); - pub static ref GUEST_PROOF_REQ_COUNT: IntCounterVec = register_int_counter_vec!( - "guest_proof_request_count", - "the number of requests sent to this guest", - &["guest", "block_id"] - ) - .unwrap(); - pub static ref GUEST_PROOF_SUCCESS_COUNT: IntCounterVec = register_int_counter_vec!( - "guest_proof_success_count", - "the number of successful proofs generated by this guest", - &["guest", "block_id"] - ) - .unwrap(); - pub static ref GUEST_PROOF_ERROR_COUNT: IntCounterVec = register_int_counter_vec!( - "guest_proof_error_count", - "the number of failed proofs generated by this guest", - &["guest", "block_id"] - ) - .unwrap(); - pub static ref GUEST_PROOF_TIME: HistogramVec = register_histogram_vec!( - "guest_proof_time_histogram", - "time taken for proof generation by this guest", - &["guest", 
"block_id", "success"] - ) - .unwrap(); - pub static ref PREPARE_INPUT_TIME: HistogramVec = register_histogram_vec!( - "prepare_input_time_histogram", - "time taken for prepare input", - &["block_id", "success"] - ) - .unwrap(); - pub static ref TOTAL_TIME: HistogramVec = register_histogram_vec!( - "total_time_histogram", - "time taken for the whole request", - &["block_id", "success"] - ) - .unwrap(); - pub static ref CONCURRENT_REQUESTS: IntGauge = register_int_gauge!( - "concurrent_requests", - "number of requests currently being processed" - ) - .unwrap(); -} - -/// Increase the count of requests currently being processed. -pub fn inc_current_req() { - CONCURRENT_REQUESTS.inc(); -} - -/// Decrease the count of requests currently being processed. -pub fn dec_current_req() { - CONCURRENT_REQUESTS.dec(); -} - -/// Increment the request count for the host. -pub fn inc_host_req_count(block_id: u64) { - let block_id = block_id.to_string(); - let labels = labels! { - "block_id" => block_id.as_str(), - }; - HOST_REQ_COUNT.with(&labels).inc(); -} - -/// Increment the error count for the host. -pub fn inc_host_error(block_id: u64) { - let block_id = block_id.to_string(); - let labels = labels! { - "block_id" => block_id.as_str(), - }; - HOST_ERROR_COUNT.with(&labels).inc(); -} - -/// Increment the request count for the given guest. -pub fn inc_guest_req_count(guest: &ProofType, block_id: u64) { - let guest = guest.to_string(); - let block_id = block_id.to_string(); - let labels = labels! { - "guest" => guest.as_str(), - "block_id" => &block_id, - }; - GUEST_PROOF_REQ_COUNT.with(&labels).inc(); -} - -/// Increment the success count for the given guest. -pub fn inc_guest_success(guest: &ProofType, block_id: u64) { - let guest = guest.to_string(); - let block_id = block_id.to_string(); - let labels = labels! { - "guest" => guest.as_str(), - "block_id" => &block_id, - }; - GUEST_PROOF_SUCCESS_COUNT.with(&labels).inc(); -} - -/// Increment the error count for the given guest. 
-pub fn inc_guest_error(guest: &ProofType, block_id: u64) { - let guest = guest.to_string(); - let block_id = block_id.to_string(); - let labels = labels! { - "guest" => guest.as_str(), - "block_id" => &block_id, - }; - GUEST_PROOF_ERROR_COUNT.with(&labels).inc(); -} - -/// Convert a duration to a float with 3 decimal places (seconds,milliseconds). -fn duration_to_f64(d: Duration) -> f64 { - (d.as_secs_f64() * 1_000.0).round() / 1_000.0 -} - -/// Observe the time taken for the given guest to generate a proof. -pub fn observe_guest_time(guest: &ProofType, block_id: u64, time: Duration, success: bool) { - let guest = guest.to_string(); - let block_id = block_id.to_string(); - let success = success.to_string(); - let labels = labels! { - "guest" => guest.as_str(), - "block_id" => &block_id, - "success" => &success, - }; - GUEST_PROOF_TIME - .with(&labels) - .observe(duration_to_f64(time)); -} - -/// Observe the time taken for prepare input. -pub fn observe_prepare_input_time(block_id: u64, time: Duration, success: bool) { - let block_id = block_id.to_string(); - let success = success.to_string(); - let labels = labels! { - "block_id" => block_id.as_str(), - "success" => &success, - }; - PREPARE_INPUT_TIME - .with(&labels) - .observe(duration_to_f64(time)); -} - -/// Observe the time taken for prepare input. -pub fn observe_total_time(block_id: u64, time: Duration, success: bool) { - let block_id = block_id.to_string(); - let success = success.to_string(); - let labels = labels! 
{ - "block_id" => block_id.as_str(), - "success" => &success, - }; - TOTAL_TIME.with(&labels).observe(duration_to_f64(time)); -} diff --git a/host/src/server/api/v1/metrics.rs b/host/src/server/api/metrics.rs similarity index 100% rename from host/src/server/api/v1/metrics.rs rename to host/src/server/api/metrics.rs diff --git a/host/src/server/api/mod.rs b/host/src/server/api/mod.rs index d66700468..50e3c1bb4 100644 --- a/host/src/server/api/mod.rs +++ b/host/src/server/api/mod.rs @@ -13,6 +13,7 @@ use tower_http::{ }; pub mod admin; +pub mod metrics; pub mod v1; pub mod v2; pub mod v3; @@ -39,12 +40,14 @@ pub fn create_router(concurrency_limit: usize, jwt_secret: Option<&str>) -> Rout let v2_api = v2::create_router(); let v3_api = v3::create_router(); let admin_api = admin::create_router(); + let metrics_api = metrics::create_router(); let router = Router::new() .nest("/v1", v1_api) .nest("/v2", v2_api) .nest("/v3", v3_api.clone()) .merge(v3_api) .nest("/admin", admin_api) + .nest("/metrics", metrics_api) .layer(middleware) .layer(DefaultBodyLimit::max(MAX_BODY_SIZE)) .layer(trace) diff --git a/host/src/server/api/v1/mod.rs b/host/src/server/api/v1/mod.rs index b410e499a..162ba9df2 100644 --- a/host/src/server/api/v1/mod.rs +++ b/host/src/server/api/v1/mod.rs @@ -11,7 +11,6 @@ use crate::interfaces::HostError; use raiko_reqactor::Actor; pub mod health; -pub mod metrics; pub mod proof; #[derive(OpenApi)] @@ -43,7 +42,6 @@ pub mod proof; tags( (name = "Proving", description = "Routes that handle proving requests"), (name = "Health", description = "Routes that report the server health status"), - (name = "Metrics", description = "Routes that give detailed insight into the server") ) )] /// The root API struct which is generated from the `OpenApi` derive macro. 
@@ -105,31 +103,24 @@ pub struct GuestOutputDoc { #[must_use] pub fn create_docs() -> utoipa::openapi::OpenApi { - [ - health::create_docs(), - metrics::create_docs(), - proof::create_docs(), - ] - .into_iter() - .fold(Docs::openapi(), |mut doc, sub_doc| { - doc.merge(sub_doc); - doc - }) + [health::create_docs(), proof::create_docs()] + .into_iter() + .fold(Docs::openapi(), |mut doc, sub_doc| { + doc.merge(sub_doc); + doc + }) } pub fn create_router(concurrency_limit: usize) -> Router { let docs = create_docs(); Router::new() - // Only add the concurrency limit to the proof route. We want to still be able to call - // healthchecks and metrics to have insight into the system. .nest( "/proof", proof::create_router() .layer(ServiceBuilder::new().concurrency_limit(concurrency_limit)), ) .nest("/health", health::create_router()) - .nest("/metrics", metrics::create_router()) .merge(SwaggerUi::new("/swagger-ui").url("/api-docs/openapi.json", docs.clone())) .merge(Scalar::with_url("/scalar", docs)) } diff --git a/host/src/server/api/v2/mod.rs b/host/src/server/api/v2/mod.rs index 1b4208a51..d24bdc3cd 100644 --- a/host/src/server/api/v2/mod.rs +++ b/host/src/server/api/v2/mod.rs @@ -44,7 +44,6 @@ pub mod proof; tags( (name = "Proving", description = "Routes that handle proving requests"), (name = "Health", description = "Routes that report the server health status"), - (name = "Metrics", description = "Routes that give detailed insight into the server") ) )] /// The root API struct which is generated from the `OpenApi` derive macro. 
@@ -119,29 +118,22 @@ impl IntoResponse for PruneStatus { #[must_use] pub fn create_docs() -> utoipa::openapi::OpenApi { - [ - v1::health::create_docs(), - v1::metrics::create_docs(), - proof::create_docs(), - ] - .into_iter() - .fold(Docs::openapi(), |mut doc, sub_doc| { - doc.merge(sub_doc); - doc - }) + [v1::health::create_docs(), proof::create_docs()] + .into_iter() + .fold(Docs::openapi(), |mut doc, sub_doc| { + doc.merge(sub_doc); + doc + }) } pub fn create_router() -> Router { let docs = create_docs(); Router::new() - // Only add the concurrency limit to the proof route. We want to still be able to call - // healthchecks and metrics to have insight into the system. .nest("/proof", proof::create_router()) // TODO: Separate task or try to get it into /proof somehow? Probably separate .nest("/aggregate", proof::create_router()) .nest("/health", v1::health::create_router()) - .nest("/metrics", v1::metrics::create_router()) .merge(SwaggerUi::new("/swagger-ui").url("/api-docs/openapi.json", docs.clone())) .merge(Scalar::with_url("/scalar", docs)) } diff --git a/host/src/server/api/v2/proof/mod.rs b/host/src/server/api/v2/proof/mod.rs index 483025ad7..93a880493 100644 --- a/host/src/server/api/v2/proof/mod.rs +++ b/host/src/server/api/v2/proof/mod.rs @@ -9,7 +9,6 @@ use utoipa::OpenApi; use crate::server::utils::{draw_for_zk_any_request, fulfill_sp1_params, is_zk_any_request}; use crate::{ interfaces::HostResult, - metrics::{inc_current_req, inc_guest_req_count, inc_host_req_count}, server::{api::v2::Status, to_v2_status}, }; use raiko_reqactor::Actor; @@ -40,7 +39,7 @@ async fn proof_handler( State(actor): State, Json(mut req): Json, ) -> HostResult { - inc_current_req(); + raiko_metrics::inc_http_request_count(); if is_zk_any_request(&req) { fulfill_sp1_params(&mut req); @@ -68,8 +67,6 @@ async fn proof_handler( // Construct the actual proof request from the available configs. 
let proof_request = ProofRequest::try_from(config)?; - inc_host_req_count(proof_request.block_number); - inc_guest_req_count(&proof_request.proof_type, proof_request.block_number); let (chain_id, blockhash) = get_task_data( &proof_request.network, diff --git a/host/src/server/api/v3/mod.rs b/host/src/server/api/v3/mod.rs index 90b2e543f..09ebd7a10 100644 --- a/host/src/server/api/v3/mod.rs +++ b/host/src/server/api/v3/mod.rs @@ -49,7 +49,6 @@ mod proof; tags( (name = "Proving", description = "Routes that handle proving requests"), (name = "Health", description = "Routes that report the server health status"), - (name = "Metrics", description = "Routes that give detailed insight into the server") ) )] /// The root API struct which is generated from the `OpenApi` derive macro. @@ -57,27 +56,20 @@ pub struct Docs; #[must_use] pub fn create_docs() -> utoipa::openapi::OpenApi { - [ - v1::health::create_docs(), - v1::metrics::create_docs(), - proof::create_docs(), - ] - .into_iter() - .fold(Docs::openapi(), |mut doc, sub_doc| { - doc.merge(sub_doc); - doc - }) + [v1::health::create_docs(), proof::create_docs()] + .into_iter() + .fold(Docs::openapi(), |mut doc, sub_doc| { + doc.merge(sub_doc); + doc + }) } pub fn create_router() -> Router { let docs = create_docs(); Router::new() - // Only add the concurrency limit to the proof route. We want to still be able to call - // healthchecks and metrics to have insight into the system. 
.nest("/proof", proof::create_router()) .nest("/health", v1::health::create_router()) - .nest("/metrics", v1::metrics::create_router()) .merge(SwaggerUi::new("/swagger-ui").url("/api-docs/openapi.json", docs.clone())) .merge(Scalar::with_url("/scalar", docs)) } diff --git a/host/src/server/api/v3/proof/aggregate/mod.rs b/host/src/server/api/v3/proof/aggregate/mod.rs index 3f6745f9e..625f66859 100644 --- a/host/src/server/api/v3/proof/aggregate/mod.rs +++ b/host/src/server/api/v3/proof/aggregate/mod.rs @@ -8,7 +8,6 @@ use utoipa::OpenApi; use crate::{ interfaces::HostResult, - metrics::{inc_current_req, inc_guest_req_count, inc_host_req_count}, server::{api::v3::Status, to_v3_status, HostError}, }; use raiko_reqactor::Actor; @@ -36,7 +35,8 @@ async fn aggregation_handler( State(actor): State, Json(mut aggregation_request): Json, ) -> HostResult { - inc_current_req(); + raiko_metrics::inc_http_request_count(); + // Override the existing proof request config from the config file and command line // options with the request from the client. 
let default_request_config = actor.default_request_config(); @@ -49,8 +49,6 @@ async fn aggregation_handler( .unwrap_or_default(), ) .map_err(HostError::Conversion)?; - inc_host_req_count(0); - inc_guest_req_count(&proof_type, 0); if aggregation_request.proofs.is_empty() { return Err(anyhow::anyhow!("No proofs provided").into()); diff --git a/host/src/server/api/v3/proof/batch.rs b/host/src/server/api/v3/proof/batch.rs index 540963d04..5592d17b5 100644 --- a/host/src/server/api/v3/proof/batch.rs +++ b/host/src/server/api/v3/proof/batch.rs @@ -40,6 +40,8 @@ async fn batch_handler( State(actor): State, Json(batch_request_opt): Json, ) -> HostResult { + raiko_metrics::inc_http_request_count(); + if is_zk_any_request(&batch_request_opt) { return Ok(Status::Ok { proof_type: ProofType::Native, diff --git a/host/src/server/api/v3/proof/mod.rs b/host/src/server/api/v3/proof/mod.rs index f1d905a0d..e89905bd5 100644 --- a/host/src/server/api/v3/proof/mod.rs +++ b/host/src/server/api/v3/proof/mod.rs @@ -1,6 +1,5 @@ use crate::{ interfaces::HostResult, - metrics::{inc_current_req, inc_guest_req_count, inc_host_req_count}, server::{ api::{v2, v3::Status}, prove_aggregation, @@ -42,7 +41,7 @@ async fn proof_handler( State(actor): State, Json(mut aggregation_request): Json, ) -> HostResult { - inc_current_req(); + raiko_metrics::inc_http_request_count(); // Override the existing proof request config from the config file and command line // options with the request from the client. 
@@ -60,9 +59,6 @@ async fn proof_handler( for proof_request_opt in proof_request_opts { let proof_request = ProofRequest::try_from(proof_request_opt)?; - inc_host_req_count(proof_request.block_number); - inc_guest_req_count(&proof_request.proof_type, proof_request.block_number); - let (chain_id, blockhash) = get_task_data( &proof_request.network, proof_request.block_number, diff --git a/host/src/server/handler.rs b/host/src/server/handler.rs index afbf10492..6a5593594 100644 --- a/host/src/server/handler.rs +++ b/host/src/server/handler.rs @@ -16,6 +16,7 @@ pub async fn prove( let action = Action::Prove { request_key, request_entity, + start_time: chrono::Utc::now(), }; act(actor, action).await } diff --git a/metrics/Cargo.toml b/metrics/Cargo.toml new file mode 100644 index 000000000..a323e7fed --- /dev/null +++ b/metrics/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "raiko-metrics" +version = "0.1.0" +authors = ["Taiko Labs"] +edition = "2021" + +[dependencies] +raiko-lib = { workspace = true } +prometheus = { workspace = true } +lazy_static = { workspace = true } diff --git a/metrics/src/lib.rs b/metrics/src/lib.rs new file mode 100644 index 000000000..76d3bd024 --- /dev/null +++ b/metrics/src/lib.rs @@ -0,0 +1,160 @@ +use lazy_static::lazy_static; +use prometheus::{ + register_counter, register_counter_vec, register_histogram, register_histogram_vec, Counter, + CounterVec, Histogram, HistogramVec, +}; +use std::time::Duration; + +mod traits; + +// Re-export +pub use traits::ToLabel; + +lazy_static! 
{ + // HTTP metrics + pub static ref HTTP_REQUEST_COUNT: Counter = register_counter!( + "raiko_http_request_count", + "the number of HTTP requests" + ) + .unwrap(); + + // Pool metrics + pub static ref POOL_REQUEST_COUNT: CounterVec = register_counter_vec!( + "raiko_pool_request_count", + "the number of requests to the pool", + &["request_type", "proof_type"] + ) + .unwrap(); + pub static ref POOL_TRANSITION_DURATION_MILLIS: HistogramVec = register_histogram_vec!( + "raiko_pool_transition_duration_millis", + "the duration of request transiting from one status to another", + &["request_type", "proof_type", "status_from", "status_to"] + ) + .unwrap(); + + // Actor metrics + pub static ref ACTOR_CHANNEL_IN_COUNT: CounterVec = register_counter_vec!( + "raiko_actor_channel_in_count", + "the number of requests sent to the actor", + &["request_type", "proof_type"] + ) + .unwrap(); + + pub static ref ACTOR_CHANNEL_OUT_COUNT: CounterVec = register_counter_vec!( + "raiko_actor_channel_out_count", + "the number of requests received from the actor", + &["request_type", "proof_type"] + ) + .unwrap(); + + pub static ref ACTOR_CHANNEL_IN_DURATION_MILLIS: HistogramVec = register_histogram_vec!( + "raiko_actor_channel_in_duration_millis", + "the duration of requests sent to the actor", + &["request_type", "proof_type"] + ) + .unwrap(); + + // Actor proving metrics + pub static ref ACTOR_GENERATING_INPUT_DURATION_MILLIS: HistogramVec = register_histogram_vec!( + "raiko_actor_generating_input_duration_millis", + "the duration of generating input by the actor", + &["request_type"] + ) + .unwrap(); + + pub static ref ACTOR_GENERATING_OUTPUT_DURATION_MILLIS: HistogramVec = register_histogram_vec!( + "raiko_actor_generating_output_duration_millis", + "the duration of generating output by the actor", + &["request_type"] + ) + .unwrap(); + + pub static ref ACTOR_PROVING_DURATION_MILLIS: HistogramVec = register_histogram_vec!( + "raiko_actor_proving_duration_millis", + "the duration of 
requests being proved by the actor", + &["request_type", "proof_type"] + ) + .unwrap(); + + // Batch request metrics + pub static ref BATCH_REQUEST_BLOCK_COUNT: Histogram = register_histogram!( + "raiko_batch_request_block_count", + "the number of blocks in a batch request", + ) + .unwrap(); +} + +pub fn inc_http_request_count() { + HTTP_REQUEST_COUNT.inc(); +} + +pub fn inc_pool_request_count(request_type: impl ToLabel, proof_type: impl ToLabel) { + POOL_REQUEST_COUNT + .with_label_values(&[request_type.to_label(), proof_type.to_label()]) + .inc(); +} + +pub fn observe_pool_transition_duration( + request_type: impl ToLabel, + proof_type: impl ToLabel, + status_from: impl ToLabel, + status_to: impl ToLabel, + duration: Duration, +) { + POOL_TRANSITION_DURATION_MILLIS + .with_label_values(&[ + request_type.to_label(), + proof_type.to_label(), + status_from.to_label(), + status_to.to_label(), + ]) + .observe(duration.as_millis() as f64); +} + +pub fn inc_actor_channel_in_count(request_type: impl ToLabel, proof_type: impl ToLabel) { + ACTOR_CHANNEL_IN_COUNT + .with_label_values(&[request_type.to_label(), proof_type.to_label()]) + .inc(); +} + +pub fn inc_actor_channel_out_count(request_type: impl ToLabel, proof_type: impl ToLabel) { + ACTOR_CHANNEL_OUT_COUNT + .with_label_values(&[request_type.to_label(), proof_type.to_label()]) + .inc(); +} + +pub fn observe_actor_channel_in_duration( + request_type: impl ToLabel, + proof_type: impl ToLabel, + duration: Duration, +) { + ACTOR_CHANNEL_IN_DURATION_MILLIS + .with_label_values(&[request_type.to_label(), proof_type.to_label()]) + .observe(duration.as_millis() as f64); +} + +pub fn observe_actor_generating_input_duration(request_type: impl ToLabel, duration: Duration) { + ACTOR_GENERATING_INPUT_DURATION_MILLIS + .with_label_values(&[request_type.to_label()]) + .observe(duration.as_millis() as f64); +} + +pub fn observe_actor_generating_output_duration(request_type: impl ToLabel, duration: Duration) { + 
ACTOR_GENERATING_OUTPUT_DURATION_MILLIS + .with_label_values(&[request_type.to_label()]) + .observe(duration.as_millis() as f64); +} + +pub fn observe_actor_proving_duration( + request_type: impl ToLabel, + proof_type: impl ToLabel, + duration: Duration, +) { + ACTOR_PROVING_DURATION_MILLIS + .with_label_values(&[request_type.to_label(), proof_type.to_label()]) + .observe(duration.as_millis() as f64); +} + +pub fn observe_batch_request_block_count(block_count: u64) { + BATCH_REQUEST_BLOCK_COUNT.observe(block_count as f64); +} diff --git a/metrics/src/traits.rs b/metrics/src/traits.rs new file mode 100644 index 000000000..6fcb2b555 --- /dev/null +++ b/metrics/src/traits.rs @@ -0,0 +1,22 @@ +use raiko_lib::proof_type::ProofType; + +pub trait ToLabel { + fn to_label(&self) -> &'static str; +} + +impl ToLabel for &'static str { + fn to_label(&self) -> &'static str { + self + } +} + +impl ToLabel for &ProofType { + fn to_label(&self) -> &'static str { + match self { + ProofType::Native => "native", + ProofType::Sp1 => "sp1", + ProofType::Sgx => "sgx", + ProofType::Risc0 => "risc0", + } + } +} diff --git a/reqactor/Cargo.toml b/reqactor/Cargo.toml index 820ae4aaa..29ba536b8 100644 --- a/reqactor/Cargo.toml +++ b/reqactor/Cargo.toml @@ -8,6 +8,7 @@ raiko-lib = { workspace = true } raiko-core = { workspace = true } raiko-reqpool = { workspace = true } raiko-ballot = { workspace = true } +raiko-metrics = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/reqactor/src/action.rs b/reqactor/src/action.rs index e9184e766..2c8b5167e 100644 --- a/reqactor/src/action.rs +++ b/reqactor/src/action.rs @@ -8,6 +8,7 @@ pub enum Action { Prove { request_key: RequestKey, request_entity: RequestEntity, + start_time: chrono::DateTime, }, Cancel { request_key: RequestKey, diff --git a/reqactor/src/actor.rs b/reqactor/src/actor.rs index e8e3fc606..1292a1c1d 100644 --- a/reqactor/src/actor.rs +++ b/reqactor/src/actor.rs @@ -95,11 +95,25 @@ impl 
Actor { let (resp_tx, resp_rx) = oneshot::channel(); // Send the action to the backend + let start_time = chrono::Utc::now(); + raiko_metrics::inc_actor_channel_in_count( + action.request_key(), + action.request_key().proof_type(), + ); + self.action_tx - .send((action, resp_tx)) + .send((action.clone(), resp_tx)) .await .map_err(|e| format!("failed to send action: {e}"))?; + raiko_metrics::observe_actor_channel_in_duration( + action.request_key(), + action.request_key().proof_type(), + (chrono::Utc::now() - start_time) + .to_std() + .unwrap_or_default(), + ); + // Wait for response of the action resp_rx .await @@ -212,6 +226,7 @@ mod tests { let test_action = Action::Prove { request_key: request_key.clone(), request_entity, + start_time: chrono::Utc::now(), }; // Spawn a task to handle the action and send back a response diff --git a/reqactor/src/backend.rs b/reqactor/src/backend.rs index 5dfd8e56b..ea3b24b50 100644 --- a/reqactor/src/backend.rs +++ b/reqactor/src/backend.rs @@ -75,6 +75,11 @@ impl Backend { tokio::select! { Some((action, resp_tx)) = action_rx.recv() => { let request_key = action.request_key().clone(); + raiko_metrics::inc_actor_channel_out_count( + action.request_key(), + action.request_key().proof_type(), + ); + let response = self.handle_external_action(action.clone()).await; // Signal the request key to the internal channel, to move on to the next step, whatever the result is @@ -113,10 +118,12 @@ impl Backend { Action::Prove { request_key, request_entity, + start_time, } => match self.pool.get_status(&request_key) { Ok(None) => { tracing::debug!("Actor Backend received prove-action {request_key}, and it is not in pool, registering"); - self.register(request_key.clone(), request_entity).await + self.register(request_key.clone(), request_entity, start_time) + .await } Ok(Some(status)) => match status.status() { Status::Registered | Status::WorkInProgress | Status::Success { .. } => { @@ -125,11 +132,11 @@ impl Backend { } Status::Cancelled { .. 
} => { tracing::warn!("Actor Backend received prove-action {request_key}, and it is cancelled, re-registering"); - self.register(request_key, request_entity).await + self.register(request_key, request_entity, start_time).await } Status::Failed { .. } => { tracing::warn!("Actor Backend received prove-action {request_key}, and it is failed, re-registering"); - self.register(request_key, request_entity).await + self.register(request_key, request_entity, start_time).await } }, Err(err) => { @@ -245,12 +252,13 @@ impl Backend { &mut self, request_key: RequestKey, request_entity: RequestEntity, + start_time: chrono::DateTime, ) -> Result { // 1. Register to the pool - let status = StatusWithContext::new_registered(); + let status = StatusWithContext::new(Status::Registered, start_time); if let Err(err) = self .pool - .add(request_key.clone(), request_entity, status.clone()) + .add_new(request_key.clone(), request_entity, status.clone()) { return Err(err); } @@ -517,22 +525,52 @@ pub async fn do_prove_single( .map_err(|err| format!("failed to create rpc block data provider: {err:?}"))?; // 1. Generate the proof input - let input = raiko - .generate_input(provider) - .await - .map_err(|e| format!("failed to generate input: {e:?}"))?; + let input = { + let start_time = chrono::Utc::now(); + let input = raiko + .generate_input(provider) + .await + .map_err(|e| format!("failed to generate input: {e:?}"))?; + raiko_metrics::observe_actor_generating_input_duration( + &request_key, + (chrono::Utc::now() - start_time) + .to_std() + .unwrap_or_default(), + ); + input + }; // 2. 
Generate the proof output - let output = raiko - .get_output(&input) - .map_err(|e| format!("failed to get output: {e:?}"))?; + let output = { + let start_time = chrono::Utc::now(); + let output = raiko + .get_output(&input) + .map_err(|e| format!("failed to get output: {e:?}"))?; + raiko_metrics::observe_actor_generating_output_duration( + &request_key, + (chrono::Utc::now() - start_time) + .to_std() + .unwrap_or_default(), + ); + output + }; // 3. Generate the proof - let proof = raiko - .prove(input, &output, Some(pool)) - .await - .map_err(|err| format!("failed to generate single proof: {err:?}"))?; - + let proof = { + let start_time = chrono::Utc::now(); + let proof = raiko + .prove(input, &output, Some(pool)) + .await + .map_err(|err| format!("failed to generate single proof: {err:?}"))?; + raiko_metrics::observe_actor_proving_duration( + &request_key, + request_key.proof_type(), + (chrono::Utc::now() - start_time) + .to_std() + .unwrap_or_default(), + ); + proof + }; Ok(proof) } @@ -601,18 +639,54 @@ async fn do_prove_batch( l2_block_numbers: all_prove_blocks.clone(), }; let raiko = Raiko::new(l1_chain_spec, taiko_chain_spec, proof_request); - let input = raiko - .generate_batch_input(provider) - .await - .map_err(|e| format!("failed to generateg guest batch input: {e:?}"))?; + + let input = { + let start_time = chrono::Utc::now(); + let input = raiko + .generate_batch_input(provider) + .await + .map_err(|e| format!("failed to generateg guest batch input: {e:?}"))?; + raiko_metrics::observe_actor_generating_input_duration( + &request_key, + (chrono::Utc::now() - start_time) + .to_std() + .unwrap_or_default(), + ); + raiko_metrics::observe_batch_request_block_count(input.inputs.len() as u64); + input + }; trace!("batch guest input: {input:?}"); - let output = raiko - .get_batch_output(&input) - .map_err(|e| format!("failed to get guest batch output: {e:?}"))?; + + let output = { + let start_time = chrono::Utc::now(); + let output = raiko + 
.get_batch_output(&input) + .map_err(|e| format!("failed to get guest batch output: {e:?}"))?; + raiko_metrics::observe_actor_generating_output_duration( + &request_key, + (chrono::Utc::now() - start_time) + .to_std() + .unwrap_or_default(), + ); + output + }; debug!("batch guest output: {output:?}"); - let proof = raiko - .batch_prove(input, &output, Some(pool)) - .await - .map_err(|e| format!("failed to generate batch proof: {e:?}"))?; + + let proof = { + let start_time = chrono::Utc::now(); + let proof = raiko + .batch_prove(input, &output, Some(pool)) + .await + .map_err(|e| format!("failed to generate batch proof: {e:?}"))?; + raiko_metrics::observe_actor_proving_duration( + &request_key, + request_key.proof_type(), + (chrono::Utc::now() - start_time) + .to_std() + .unwrap_or_default(), + ); + proof + }; + Ok(proof) } diff --git a/reqpool/Cargo.toml b/reqpool/Cargo.toml index 7d63bbfe1..065bd93dc 100644 --- a/reqpool/Cargo.toml +++ b/reqpool/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" [dependencies] raiko-lib = { workspace = true } raiko-core = { workspace = true } +raiko-metrics = { workspace = true } raiko-redis-derive = { workspace = true } chrono = { workspace = true, features = ["serde"] } serde = { workspace = true } diff --git a/reqpool/src/pool.rs b/reqpool/src/pool.rs index 52759c8d6..33fb3e37d 100644 --- a/reqpool/src/pool.rs +++ b/reqpool/src/pool.rs @@ -17,6 +17,16 @@ pub struct Pool { } impl Pool { + pub fn add_new( + &mut self, + request_key: RequestKey, + request_entity: RequestEntity, + status: StatusWithContext, + ) -> Result<(), String> { + raiko_metrics::inc_pool_request_count(&request_key, request_key.proof_type()); + self.add(request_key, request_entity, status) + } + pub fn add( &mut self, request_key: RequestKey, @@ -77,6 +87,16 @@ impl Pool { tracing::info!("RedisPool.update_status: {request_key}, {status}"); match self.get(&request_key)? 
{ Some((entity, old_status)) => { + raiko_metrics::observe_pool_transition_duration( + &request_key, + request_key.proof_type(), + old_status.status(), + status.status(), + (chrono::Utc::now() - old_status.timestamp()) + .to_std() + .unwrap_or_default(), + ); + self.add(request_key, entity, status)?; Ok(old_status) } diff --git a/reqpool/src/request.rs b/reqpool/src/request.rs index 4acae2313..17ad9b117 100644 --- a/reqpool/src/request.rs +++ b/reqpool/src/request.rs @@ -355,7 +355,6 @@ pub enum RequestEntity { SingleProof(SingleProofRequestEntity), Aggregation(AggregationRequestEntity), BatchProof(BatchProofRequestEntity), - //todo: AggregationBatch(AggregationBatchRequestEntity), } impl From for RequestEntity { @@ -406,3 +405,25 @@ impl std::fmt::Display for StatusWithContext { write!(f, "{}", self.status()) } } + +impl raiko_metrics::ToLabel for &RequestKey { + fn to_label(&self) -> &'static str { + match self { + RequestKey::SingleProof(_) => "single", + RequestKey::Aggregation(_) => "aggr", + RequestKey::BatchProof(_) => "batch", + } + } +} + +impl raiko_metrics::ToLabel for &Status { + fn to_label(&self) -> &'static str { + match self { + Status::Registered => "registered", + Status::WorkInProgress => "wip", + Status::Success { .. } => "success", + Status::Cancelled => "cancelled", + Status::Failed { .. } => "failed", + } + } +}