splunk · greatestusername-splunk · Apr 10, 2025 · Apr 3, 2025 · greatestusername-splunk · Apr 3, 2025
diff --git a/dashboards-and-dashboard-groups/inferred-services-dg/Dashboard_Group_Inferred Services.json b/dashboards-and-dashboard-groups/inferred-services-dg/Dashboard_Group_Inferred Services.json
diff --git a/...-and-dashboard-groups/inferred-services-dg/Inferred-services-DashboardGroup.png b/...-and-dashboard-groups/inferred-services-dg/Inferred-services-DashboardGroup.png
diff --git a/dashboards-and-dashboard-groups/inferred-services-dg/README.md b/dashboards-and-dashboard-groups/inferred-services-dg/README.md
@@ -0,0 +1,22 @@
+# Inferred Services - assets to help with observability
+
+1. [Dashboard Group - Inferred Services](./Dashboard_Group_Inferred%20Services.json)
+
+Feel free to also use
+
+2. [Sample Detectors: Latency Spike (>3s for 90% of 5min); Error Rate (>50%, sudden change)](../../detectors/inferred-services-detectors/README.md)
+
+Learn more about Inferred Services: 
+- [What are Inferred Services](https://docs.splunk.com/observability/en/apm/apm-spans-traces/inferred-services.html)
+- [Metrics available for Inferred Services](https://docs.splunk.com/observability/en/apm/span-tags/metricsets.html#available-default-mms-metrics-and-dimensions)
+
+## Inferred Services - Dashboard Group
+
+1. Import Dashboard Group
+*From UI:*
+Click the '+' button in the top right and select Import -> Dashboard Group.
+
+2. Find your dashboard group `Inferred Services` and use it as a starting point to create charts.
+
+Screenshot:
+![Dashboard Group 'Inferred Services'](./Inferred-services-DashboardGroup.png)
diff --git a/detectors/inferred-services-detectors/POST_Detector_error_rate.sh b/detectors/inferred-services-detectors/POST_Detector_error_rate.sh
@@ -0,0 +1,90 @@
+curl --location 'https://api.us1.signalfx.com/v2/detector' \
+--header 'Content-Type:  application/json' \
+--header 'X-SF-TOKEN: REPLACEME' \
+--data '{
+    "authorizedWriters": {
+        "teams": [],
+        "users": []
+    },
+    "customProperties": null,
+    "description": "",
+    "detectorOrigin": "Standard",
+    "labelResolutions": {
+        "Error rate >50%": 2000,
+        "Sudden change in Error rate for last 5min": 2000
+    },
+    "maxDelay": 0,
+    "minDelay": 0,
+    "name": "[sample] Inferred Services - Error Rate per minute",
+    "overMTSLimit": false,
+    "programText": "from signalfx.detectors.against_recent import against_recent\nA = histogram('\''inferred.services'\'', filter=filter('\''sf_service'\'', '\''*'\'') and filter('\''sf_environment'\'', '\''*'\'')).count(by=['\''sf_environment'\'', '\''sf_service'\'']).sum(over='\''1m'\'').publish(label='\''A'\'', enable=False)\nB = histogram('\''inferred.services'\'', filter=filter('\''sf_service'\'', '\''*'\'') and filter('\''sf_environment'\'', '\''*'\'') and filter('\''sf_error'\'', '\''false'\'')).count(by=['\''sf_environment'\'', '\''sf_service'\'']).sum(over='\''1m'\'').publish(label='\''B'\'', enable=False)\nC = histogram('\''inferred.services'\'', filter=filter('\''sf_service'\'', '\''*'\'') and filter('\''sf_environment'\'', '\''*'\'') and filter('\''sf_error'\'', '\''true'\'')).count(by=['\''sf_environment'\'', '\''sf_service'\'']).sum(over='\''1m'\'').publish(label='\''C'\'', enable=False)\nD = combine(100*((C if C is not None else 0) / A)).publish(label='\''D'\'')\ndetect(when(D > threshold(50), lasting='\''5m'\'', at_least=0.9), auto_resolve_after='\''30m'\'').publish('\''Error rate >50%'\'')\nagainst_recent.detector_mean_std(stream=D, current_window='\''5m'\'', historical_window='\''15m'\'', fire_num_stddev=3.5, clear_num_stddev=3, orientation='\''above'\'', ignore_extremes=True, calculation_mode='\''vanilla'\'').publish('\''Sudden change in Error rate for last 5min'\'')",
+    "rules": [
+        {
+            "description": "The value of Error rate per min is above 50 for 90% of 5m.",
+            "detectLabel": "Error rate >50%",
+            "disabled": false,
+            "notifications": [],
+            "parameterizedBody": "{{#if anomalous}}\n\tRule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" triggered at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\n\tRule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" cleared at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n{{#if anomalous}}\nTriggering condition: {{{readableRule}}}\n{{/if}}\n\n{{#if anomalous}}Signal value for Error rate per min: {{inputs.D.value}}\n{{else}}Current signal value for Error rate per min: {{inputs.D.value}}\n{{/if}}\n\n{{#notEmpty dimensions}}\nSignal details:\n{{{dimensions}}}\n{{/notEmpty}}\n\n{{#if anomalous}}\n{{#if runbookUrl}}Runbook: {{{runbookUrl}}}{{/if}}\n{{#if tip}}Tip: {{{tip}}}{{/if}}\n{{/if}}",
+            "severity": "Major"
+        },
+        {
+            "description": "All the values of Error rate per min in the last 5m are more than 3.5 standard deviation(s) above the mean of its preceding 15m.",
+            "detectLabel": "Sudden change in Error rate for last 5min",
+            "disabled": false,
+            "notifications": [],
+            "parameterizedBody": "{{#if anomalous}}\n\tRule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" triggered at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\n\tRule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" cleared at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n{{#if anomalous}}\nTriggering condition: {{{readableRule}}}\n{{/if}}\n\nMinimum value of signal in the last {{event_annotations.current_window}}: {{inputs.recent_min.value}}\n{{#if anomalous}}Trigger threshold: {{inputs.f_top.value}}\n{{else}}Clear threshold: {{inputs.c_top.value}}\n{{/if}}\n\n{{#notEmpty dimensions}}\nSignal details:\n{{{dimensions}}}\n{{/notEmpty}}\n\n{{#if anomalous}}\n{{#if runbookUrl}}Runbook: {{{runbookUrl}}}{{/if}}\n{{#if tip}}Tip: {{{tip}}}{{/if}}\n{{/if}}",
+            "severity": "Warning"
+        }
+    ],
+    "sf_metricsInObjectProgramText": [
+        "inferred.services"
+    ],
+    "status": "ACTIVE",
+    "tags": [],
+    "teams": [],
+    "timezone": "",
+    "visualizationOptions": {
+        "disableSampling": false,
+        "publishLabelOptions": [
+            {
+                "displayName": "Total Requests / min",
+                "label": "A",
+                "paletteIndex": 14,
+                "valuePrefix": "",
+                "valueSuffix": "",
+                "valueUnit": null
+            },
+            {
+                "displayName": "Successful Requests / min",
+                "label": "B",
+                "paletteIndex": 6,
+                "valuePrefix": "",
+                "valueSuffix": "",
+                "valueUnit": null
+            },
+            {
+                "displayName": "Errors / min",
+                "label": "C",
+                "paletteIndex": 8,
+                "valuePrefix": "",
+                "valueSuffix": "",
+                "valueUnit": null
+            },
+            {
+                "displayName": "Error rate per min",
+                "label": "D",
+                "paletteIndex": null,
+                "valuePrefix": null,
+                "valueSuffix": "%",
+                "valueUnit": null
+            }
+        ],
+        "showDataMarkers": true,
+        "showEventLines": false,
+        "time": {
+            "range": 172800000,
+            "rangeEnd": 0,
+            "type": "relative"
+        }
+    }
+}'
diff --git a/detectors/inferred-services-detectors/POST_Detector_latency_spike.sh b/detectors/inferred-services-detectors/POST_Detector_latency_spike.sh
@@ -0,0 +1,57 @@
+curl --location 'https://api.us1.signalfx.com/v2/detector' \
+--header 'Content-Type:  application/json' \
+--header 'X-SF-TOKEN: REPLACEME' \
+--data '{
+    "authorizedWriters": {
+        "teams": [],
+        "users": []
+    },
+    "customProperties": null,
+    "description": "",
+    "detectorOrigin": "Standard",
+    "labelResolutions": {
+        "Latency >3s": 1000
+    },
+    "maxDelay": 0,
+    "minDelay": 0,
+    "name": "[sample] Inferred Services - Latency Spike",
+    "overMTSLimit": false,
+    "programText": "AB = alerts(detector_name='\''[sample] Inferred Services - Latency Spike'\'').publish(label='\''AB'\'')\nA = histogram('\''inferred.services'\'', filter=filter('\''sf_service'\'', '\''*'\'') and filter('\''sf_environment'\'', '\''*'\'')).max(by=['\''sf_operation'\'', '\''sf_environment'\'', '\''sf_service'\'', '\''sf_error'\'']).mean(over='\''1m'\'').publish(label='\''A'\'')\ndetect(when(A > threshold(3000000000), lasting='\''5m'\'', at_least=0.9), auto_resolve_after='\''30m'\'').publish('\''Latency >3s'\'')",
+    "rules": [
+        {
+            "description": "The value of Latency for Operation/Endpoint (1 min avg) is above 3000000000 for 90% of 5m.",
+            "detectLabel": "Latency >3s",
+            "disabled": false,
+            "notifications": [],
+            "parameterizedBody": "{{#if anomalous}}\n\tRule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" triggered at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\n\tRule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" cleared at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n{{#if anomalous}}\nTriggering condition: {{{readableRule}}}\n{{/if}}\n\n{{#if anomalous}}Signal value for Latency for Operation/Endpoint (1 min avg): {{inputs.A.value}}\n\n{{else}}Current signal value for Latency for Operation/Endpoint (1 min avg): {{inputs.A.value}}\n{{/if}}\nService: {{dimensions.sf_service}}\nEnvironment: {{dimensions.sf_environment}}\nOperation: {{dimensions.sf_operation}}\nError: {{dimensions.sf_error}}\n{{#notEmpty dimensions}}\nSignal details:\n{{{dimensions}}}\n{{/notEmpty}}\n\n{{#if anomalous}}\n{{#if runbookUrl}}Runbook: {{{runbookUrl}}}{{/if}}\n{{#if tip}}Tip: {{{tip}}}{{/if}}\n{{/if}}",
+            "severity": "Warning"
+        }
+    ],
+    "sf_metricsInObjectProgramText": [
+        "inferred.services"
+    ],
+    "status": "ACTIVE",
+    "tags": [],
+    "teams": [],
+    "timezone": "",
+    "visualizationOptions": {
+        "disableSampling": true,
+        "publishLabelOptions": [
+            {
+                "displayName": "Latency for Operation/Endpoint (1 min avg)",
+                "label": "A",
+                "paletteIndex": null,
+                "valuePrefix": null,
+                "valueSuffix": null,
+                "valueUnit": "Nanosecond"
+            }
+        ],
+        "showDataMarkers": true,
+        "showEventLines": false,
+        "time": {
+            "range": 900000,
+            "rangeEnd": 0,
+            "type": "relative"
+        }
+    }
+}'
diff --git a/detectors/inferred-services-detectors/README.md b/detectors/inferred-services-detectors/README.md
@@ -0,0 +1,24 @@
+# Inferred Services - assets to help with observability
+
+1. [Detector: Latency Spike (>3s for 90% of 5min)](./POST_Detector_latency_spike.sh)
+
+2. [Detector: Error Rate (>50%, sudden change)](./POST_Detector_error_rate.sh)
+
+Feel free to also use
+
+3. [Dashboard Group - Inferred Services](../../dashboards-and-dashboard-groups/inferred-services-dg/README.md)
+
+Learn more about Inferred Services: 
+- [What are Inferred Services](https://docs.splunk.com/observability/en/apm/apm-spans-traces/inferred-services.html)
+- [Metrics available for Inferred Services](https://docs.splunk.com/observability/en/apm/span-tags/metricsets.html#available-default-mms-metrics-and-dimensions)
+
+## Inferred Services - Sample Detectors
+![Sample Detectors for Latency and Error rate of Inferred Services](../../detectors/inferred-services-detectors/detectors-1.png)
+
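+Use the curl commands in the scripts above to post the detectors (replace the `REPLACEME` placeholder in the `X-SF-TOKEN` header with your API token, and `us1` in the API URL with your realm, as required).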
+
+These can be used as a starting point to customise signals, thresholds, messaging etc.
+
+Screenshots:
+![Error Rate Detector](../../detectors/inferred-services-detectors/detectors-errors.png)
+![Latency Spike Detector](../../detectors/inferred-services-detectors/detectors-latency.png)
diff --git a/detectors/inferred-services-detectors/detectors-1.png b/detectors/inferred-services-detectors/detectors-1.png
diff --git a/detectors/inferred-services-detectors/detectors-errors.png b/detectors/inferred-services-detectors/detectors-errors.png
diff --git a/detectors/inferred-services-detectors/detectors-latency.png b/detectors/inferred-services-detectors/detectors-latency.png