open-telemetry · jerbly · May 24, 2025 · May 4, 2025 · May 4, 2025 · May 4, 2025
@@ -2,6 +2,10 @@
 
 All notable changes to this project will be documented in this file.
 
+# NEXT
+
+- Add support for metrics in Live Check. ([#728](https://github.com/open-telemetry/weaver/pull/728) by @jerbly)
+
 # [0.15.0] - 2025-05-01
 
 - Add support for attributes of type `any`. ([#707](https://github.com/open-telemetry/weaver/pull/707) by @lquerel)

@@ -96,6 +96,7 @@ tonic = { version = "0.12.3", default-features = false, features = [
     "tls-roots",
 ] }
 env_logger = "0.11.8"
+chrono = "0.4.41"
 
 # workspace dependencies
 serde.workspace = true
@@ -247,7 +248,7 @@ install-updater = false
 install-path = "CARGO_HOME"
 # Whether to enable GitHub Attestations
 github-attestations = true
-github-custom-runners = { global="ubuntu-latest", x86_64-unknown-linux-gnu="ubuntu-24.04" }
+github-custom-runners = { global = "ubuntu-latest", x86_64-unknown-linux-gnu = "ubuntu-24.04" }
 
 
 #exhaustive_enums = "warn"

@@ -23,6 +23,7 @@ serde_yaml.workspace = true
 walkdir.workspace = true
 globset.workspace = true
 miette.workspace = true
+schemars.workspace = true
 
 regorus = { version = "0.4.0", default-features = false, features = [
     "std",

@@ -2,6 +2,7 @@
 
 //! Definition of a policy violation.
 
+use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 use std::fmt::{Display, Formatter};
@@ -72,7 +73,9 @@ impl Violation {
 }
 
 /// The level of an advice
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, PartialOrd, Ord, Eq, Hash)]
+#[derive(
+    Debug, Clone, PartialEq, Serialize, Deserialize, PartialOrd, Ord, Eq, Hash, JsonSchema,
+)]
 #[serde(rename_all = "snake_case")]
 pub enum AdviceLevel {
     /// Useful context without action needed
@@ -84,7 +87,7 @@ pub enum AdviceLevel {
 }
 
 /// Represents a live check advice
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)]
 pub struct Advice {
     /// The type of advice e.g. "is_deprecated"
     pub advice_type: String,

@@ -19,6 +19,7 @@ thiserror.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 miette.workspace = true
+schemars.workspace = true
 
 [dev-dependencies]
 tempfile = "3.20.0"

@@ -107,6 +107,9 @@ As mentioned, a list of `Advice` is returned in the report for each sample entit
 }
 ```
 
+> **Note**  
+> The `live_check_result` object augments the sample entity at the pertinent level in the structure. If the structure is `metric`->`[number_data_point]`->`[attribute]`, advice should be given at the `number_data_point` level for, say, required attributes that have not been supplied, whereas attribute advice, like `missing_attribute` in the JSON above, is given at the attribute level.
+
 ### Custom advisors
 
 Use the `--advice-policies` command line option to provide a path to a directory containing Rego policies with the `live_check_advice` package name. Here's a very simple example that rejects any attribute name containing the string "test":
@@ -118,8 +121,8 @@ import rego.v1
 
 # checks attribute name contains the word "test"
 deny contains make_advice(advice_type, advice_level, value, message) if {
-  input.attribute
-  value := input.attribute.name
+  input.sample.attribute
+  value := input.sample.attribute.name
   contains(value, "test")
   advice_type := "contains_test"
   advice_level := "violation"
@@ -135,7 +138,13 @@ make_advice(advice_type, advice_level, value, message) := {
 }
 ```
 
-`input` contains the sample entity for assessment wrapped in a type e.g. `input.attribute` or `input.span`. `data` contains a structure derived from the supplied `Registry`. A jq preprocessor takes the `Registry` (and maps for attributes and templates) to produce the `data` for the policy. If the jq is simply `.` this will passthrough as-is. Preprocessing is used to improve Rego performance and to simplify policy definitions. With this model `data` is processed once whereas the Rego policy runs for every sample entity as it arrives in the stream.
+`input.sample` contains the sample entity for assessment wrapped in a type e.g. `input.sample.attribute` or `input.sample.span`.
+
+`input.registry_attribute`, when present, contains the matching attribute definition from the supplied registry.
+
+`input.registry_group`, when present, contains the matching group definition from the supplied registry.
+
+`data` contains a structure derived from the supplied `Registry`. A jq preprocessor takes the `Registry` (and maps for attributes and templates) to produce the `data` for the policy. If the jq is simply `.`, the input is passed through as-is. Preprocessing is used to improve Rego performance and to simplify policy definitions. With this model, `data` is processed once, whereas the Rego policy runs for every sample entity as it arrives in the stream.
 
 To override the default Otel jq preprocessor provide a path to the jq file through the `--advice-preprocessor` option.
 
@@ -202,7 +211,9 @@ These should be self-explanatory, but:
 - `no_advice_count` is the number of samples that received no advice
 - `seen_registry_attributes` is a record of how many times each attribute in the registry was seen in the samples
 - `seen_non_registry_attributes` is a record of how many times each non-registry attribute was seen in the samples
-- `registry_coverage` is the fraction of seen registry attributes over the total registry attributes
+- `seen_registry_metrics` is a record of how many times each metric in the registry was seen in the samples
+- `seen_non_registry_metrics` is a record of how many times each non-registry metric was seen in the samples
+- `registry_coverage` is the fraction of seen registry entities over the total registry entities
 
 This could be parsed for a more sophisticated way to determine pass/fail in CI for example.
 

@@ -0,0 +1,98 @@
+[
+    {
+        "metric": {
+            "data_points": [
+                {
+                    "attributes": [
+                        {
+                            "name": "state",
+                            "value": "used"
+                        }
+                    ],
+                    "value": 26963050496
+                },
+                {
+                    "attributes": [
+                        {
+                            "name": "state",
+                            "value": "free"
+                        }
+                    ],
+                    "value": 586153984
+                },
+                {
+                    "attributes": [
+                        {
+                            "name": "system.memory.state",
+                            "value": "inactive"
+                        }
+                    ],
+                    "value": 681053388.8
+                }
+            ],
+            "instrument": "updowncounter",
+            "name": "system.memory.usage",
+            "unit": "By"
+        }
+    },
+    {
+        "metric": {
+            "data_points": [
+                {
+                    "attributes": [],
+                    "bucket_counts": [
+                        0,
+                        0,
+                        1,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0,
+                        0
+                    ],
+                    "count": 1,
+                    "max": 7.0015,
+                    "min": 7.0015,
+                    "sum": 7.0015
+                }
+            ],
+            "instrument": "histogram",
+            "name": "rpc.client.duration",
+            "unit": "ms"
+        }
+    },
+    {
+        "metric": {
+            "data_points": [
+                {
+                    "attributes": [],
+                    "value": 151552000
+                }
+            ],
+            "instrument": "gauge",
+            "name": "otelcol_process_memory_rss",
+            "unit": "By"
+        }
+    },
+    {
+        "metric": {
+            "data_points": [
+                {
+                    "attributes": [],
+                    "value": 39585616
+                }
+            ],
+            "instrument": "counter",
+            "name": "otelcol_process_runtime_total_alloc_bytes",
+            "unit": "By"
+        }
+    }
+]
@@ -0,0 +1,54 @@
+groups:
+  - id: registry.system.memory
+    type: attribute_group
+    display_name: System Memory Attributes
+    brief: "Describes System Memory attributes"
+    attributes:
+      - id: system.memory.state
+        type:
+          members:
+            - id: used
+              value: "used"
+              stability: development
+            - id: free
+              value: "free"
+              stability: development
+            - id: shared
+              value: "shared"
+              stability: development
+              deprecated: "Removed, report shared memory usage with `metric.system.memory.shared` metric"
+            - id: buffers
+              value: "buffers"
+              stability: development
+            - id: cached
+              value: "cached"
+              stability: development
+        stability: development
+        brief: "The memory state"
+        examples: ["free", "cached"]
+
+  # system.* metrics
+  - id: metric.system.uptime
+    type: metric
+    metric_name: system.uptime
+    stability: development
+    brief: "The time the system has been running"
+    note: |
+      Instrumentations SHOULD use a gauge with type `double` and measure uptime in seconds as a floating point number with the highest precision available.
+      The actual accuracy would depend on the instrumentation and operating system.
+    instrument: gauge
+    unit: "s"
+
+  # system.memory.* metrics
+  - id: metric.system.memory.usage
+    type: metric
+    metric_name: system.memory.usage
+    stability: development
+    brief: "Reports memory in use by state."
+    note: |
+      The sum over all `system.memory.state` values SHOULD equal the total memory
+      available on the system, that is `system.memory.limit`.
+    instrument: updowncounter
+    unit: "By"
+    attributes:
+      - ref: system.memory.state
@@ -4,8 +4,8 @@ import rego.v1
 
 # checks attribute name contains the word "test"
 deny contains make_advice(advice_type, advice_level, value, message) if {
-	input.attribute
-	value := input.attribute.name
+	input.sample.attribute
+	value := input.sample.attribute.name
 	contains(value, "test")
 	advice_type := "contains_test"
 	advice_level := "violation"
@@ -14,8 +14,8 @@ deny contains make_advice(advice_type, advice_level, value, message) if {
 
 # checks span name contains the word "test"
 deny contains make_advice(advice_type, advice_level, value, message) if {
-	input.span
-	value := input.span.name
+	input.sample.span
+	value := input.sample.span.name
 	contains(value, "test")
 	advice_type := "contains_test"
 	advice_level := "violation"
@@ -24,8 +24,8 @@ deny contains make_advice(advice_type, advice_level, value, message) if {
 
 # checks span status message contains the word "test"
 deny contains make_advice(advice_type, advice_level, value, message) if {
-	input.span
-	value := input.span.status.message
+	input.sample.span
+	value := input.sample.span.status.message
 	contains(value, "test")
 	advice_type := "contains_test_in_status"
 	advice_level := "violation"
@@ -34,14 +34,37 @@ deny contains make_advice(advice_type, advice_level, value, message) if {
 
 # checks span_event name contains the word "test"
 deny contains make_advice(advice_type, advice_level, value, message) if {
-	input.span_event
-	value := input.span_event.name
+	input.sample.span_event
+	value := input.sample.span_event.name
 	contains(value, "test")
 	advice_type := "contains_test"
 	advice_level := "violation"
 	message := "Name must not contain 'test'"
 }
 
+# This example shows how to use the registry_group provided in the input.
+# If the metric's unit is "By" the value in this data-point must be an integer.
+deny contains make_advice(advice_type, advice_level, value, message) if {
+	input.sample.number_data_point
+	value := input.sample.number_data_point.value
+	input.registry_group.unit == "By"
+	value != floor(value) # not a good type check, but serves as an example
+	advice_type := "invalid_data_point_value"
+	advice_level := "violation"
+	message := "Value must be an integer when unit is 'By'"
+}
+
+# As above, but for exemplars which are nested two levels deep.
+deny contains make_advice(advice_type, advice_level, value, message) if {
+	input.sample.exemplar
+	value := input.sample.exemplar.value
+	input.registry_group.unit == "s"
+	value < 1.0
+	advice_type := "low_value"
+	advice_level := "information"
+	message := "This is a low number of seconds"
+}
+
 make_advice(advice_type, advice_level, value, message) := {
 	"type": "advice",
 	"advice_type": advice_type,