Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions alert-policies/amd-gpu/HighMemoryUtilization.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition: fires when VRAM usage on an AMD GPU stays above 90%.
name: High GPU Memory Utilization

description: |+
  This alert is triggered when the AMD GPU memory utilization is above 90%.

# STATIC conditions compare the query result against the fixed thresholds in `terms`.
type: STATIC
nrql:
  # VRAM used as a percentage of total VRAM.
  # FACET makes the condition evaluate every host/GPU combination separately;
  # without it, latest() collapses all reporting GPUs into a single value and a
  # saturated GPU can be hidden by a sample from a healthy one.
  query: "SELECT latest(vram_used_mb/vram_total_mb*100) FROM AMDGpuSample FACET hostname, device_name"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation (percent of VRAM in use)
    threshold: 90
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
262 changes: 262 additions & 0 deletions dashboards/amd-gpu/amd-gpu.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
{
"name": "AMD GPU Monitoring",
"description": null,
"pages": [
{
"name": "AMD GPU Monitoring",
"description": null,
"widgets": [
{
"title": null,
"layout": {
"column": 1,
"row": 1,
"width": 4,
"height": 2
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.markdown"
},
"rawConfiguration": {
"text": "[![AMD ROCm-SMI](https://logos-world.net/wp-content/uploads/2020/03/AMD-Logo.png)](https://github.com/RadeonOpenCompute/rocm_smi_lib)"
}
},
{
"title": "Current Clock Speeds",
"layout": {
"column": 5,
"row": 1,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"dataFormatters": [],
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "FROM AMDGpuSample SELECT latest(gfx_0_clk_mhz) as 'GFX Clock MHz', latest(socclk_0_clk_mhz) as 'SOC Clock MHz', latest(fclk_0_clk_mhz) as 'Fabric Clock MHz', latest(temp_mem_c_0_clk) as 'Memory Clock MHz'"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Current Clock MHz",
"layout": {
"column": 7,
"row": 1,
"width": 6,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.line"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"legend": {
"enabled": true
},
"markers": {
"displayedTypes": {
"criticalViolations": false,
"deployments": true,
"relatedDeployments": true,
"warningViolations": false
}
},
"nrqlQueries": [
{
"accountIds": [],
"query": "FROM AMDGpuSample SELECT latest(gfx_0_clk_mhz) as 'GFX Clock MHz', latest(socclk_0_clk_mhz) as 'SOC Clock MHz', latest(fclk_0_clk_mhz) as 'Fabric Clock MHz', latest(temp_mem_c_0_clk) as 'Memory Clock MHz' TIMESERIES"
}
],
"platformOptions": {
"ignoreTimeRange": false
},
"thresholds": {
"isLabelVisible": true
},
"yAxisLeft": {
"zero": true
},
"yAxisRight": {
"zero": true
}
}
},
{
"title": "Select GPU",
"layout": {
"column": 1,
"row": 3,
"width": 4,
"height": 1
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.bar"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "FROM AMDGpuSample SELECT latest(device_id) FACET device_name, hostname"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Temperature and Power",
"layout": {
"column": 1,
"row": 4,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"dataFormatters": [],
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "FROM AMDGpuSample SELECT latest(temp_hotspot_c) as 'GPU Temp °C', latest(temp_mem_c) as 'Memory Temp °C', latest(socket_power_w) as 'Socket Power W'"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Power Usage",
"layout": {
"column": 3,
"row": 4,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"dataFormatters": [],
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "FROM AMDGpuSample SELECT latest(socket_power_w) as 'Socket Power Watts', latest(gfx_0_max_clk_mhz) as 'Max GFX Clock MHz', latest(size) as 'GPU Size MB'"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Memory Usage",
"layout": {
"column": 5,
"row": 4,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"dataFormatters": [],
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "FROM AMDGpuSample SELECT latest(vram_free_mb) as 'VRAM Free MB', latest(vram_used_mb) as 'VRAM Used MB', latest(vram_total_mb) as 'VRAM Total MB'"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Utilization",
"layout": {
"column": 7,
"row": 4,
"width": 6,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.line"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"legend": {
"enabled": true
},
"markers": {
"displayedTypes": {
"criticalViolations": false,
"deployments": true,
"relatedDeployments": true,
"warningViolations": false
}
},
"nrqlQueries": [
{
"accountIds": [],
"query": "FROM AMDGpuSample SELECT latest(vram_used_mb/vram_total_mb) * 100 as 'VRAM Used %', latest(gfx_activity_percent) as 'GFX Activity %', latest(umc_activity_percent) as 'UMC Activity %', latest(socket_power_w) as 'Socket Power W' TIMESERIES"
}
],
"platformOptions": {
"ignoreTimeRange": false
},
"thresholds": {
"isLabelVisible": true
},
"yAxisLeft": {
"zero": true
},
"yAxisRight": {
"zero": true
}
}
}
]
}
],
"variables": []
}
Binary file added dashboards/amd-gpu/amd-gpu.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 7 additions & 1 deletion dashboards/ebpf/ebpf.json
Original file line number Diff line number Diff line change
Expand Up @@ -1135,7 +1135,13 @@
{
"name": "entity_name",
"items": null,
"defaultValues": [],
"defaultValues": [
{
"value": {
"string": "*"
}
}
],
"nrqlQuery": {
"accountIds": [],
"query": "FROM Metric SELECT uniques(entity.name) limit max"
Expand Down
23 changes: 23 additions & 0 deletions data-sources/amd-gpu/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Data source entry for AMD GPUs (data-sources/amd-gpu/config.yml).
id: amd-gpu
displayName: AMD GPUs
description: |
  Monitor AMD GPUs based on the AMD SMI utility.

# Primary install action: opens the marketplace install nerdlet for this data source.
# NOTE(review): nesting below was reconstructed from flattened text - confirm
# against the repository's data-source schema.
install:
  primary:
    nerdlet:
      nerdletId: marketplace.install-data-source
      nerdletState:
        dataSourceId: amd-gpu
        frameworkConfigId: amd-gpu
      requiresAccount: false

# Logo file, added alongside this config in the same change.
icon: logo.png

# Search keywords for this data source.
keywords:
  - infrastructure
  - amd
  - gpu

categoryTerms:
  - infrastructure
Binary file added data-sources/amd-gpu/logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
41 changes: 41 additions & 0 deletions quickstarts/amd-gpu/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Quickstart definition for AMD GPU monitoring (quickstarts/amd-gpu/config.yml).
id: 10c9906d-935c-468a-b1b4-c280c5229a22
slug: amd-gpu

title: AMD GPU Monitoring

# Keep this text in sync with the widgets in dashboards/amd-gpu/amd-gpu.json:
# it should only advertise metrics the bundled dashboard actually shows.
description: |
  Our AMD GPU integration helps you monitor the status of your GPUs.
  This integration uses our infrastructure agent and the Flex integration, which collects data through the AMD SMI utility.
  It provides you with a pre-built dashboard containing crucial GPU metrics, including GFX and UMC activity, VRAM usage,
  clock speeds, temperatures, and socket power, as well as an alert for high GPU memory utilization.

summary: |
  Monitor AMD GPU performance and state.

level: Community

authors:
  - New Relic

keywords:
  - amd
  - gpu
  - infrastructure
  - NR1_addData
  - NR1_sys

documentation:
  - name: AMD GPU integration
    url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/amd-gpu-integration/
    description: Monitor the status and performance of AMD GPUs.

icon: logo.png

# Components bundled with this quickstart; each entry names a directory under
# dashboards/, alert-policies/, and data-sources/ respectively.
dashboards:
  - amd-gpu

alertPolicies:
  - amd-gpu

dataSourceIds:
  - amd-gpu
Binary file added quickstarts/amd-gpu/logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion quickstarts/ebpf/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ documentation:
dataSourceIds:
- ebpf
keywords:
- infrastructure
- language agent
- eAPM
- eBPF
dashboards:
Expand Down
4 changes: 2 additions & 2 deletions utils/schema/artifact.json
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@
"aerospike",
"akamai",
"alert-quality-management",
"amd-gpu",
"amazon-cloudwatch-metric-streams",
"amazon-eks-on-aws-fargate",
"ansible-automation-controller",
Expand Down Expand Up @@ -616,5 +617,4 @@
]
}
}
}

}
Loading