diff --git a/alert-policies/amd-gpu/HighMemoryUtilization.yml b/alert-policies/amd-gpu/HighMemoryUtilization.yml
new file mode 100644
index 0000000000..1a31c2d133
--- /dev/null
+++ b/alert-policies/amd-gpu/HighMemoryUtilization.yml
@@ -0,0 +1,27 @@
+name: High GPU Memory Utilization
+
+description: |+
+  This alert is triggered when the AMD GPU memory utilization is above 90%.
+
+type: STATIC
+nrql:
+  query: "SELECT latest(vram_used_mb/vram_total_mb*100) FROM AMDGpuSample FACET hostname, device_id"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation
+    threshold: 90
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/dashboards/amd-gpu/amd-gpu.json b/dashboards/amd-gpu/amd-gpu.json
new file mode 100644
index 0000000000..2f5e14685d
--- /dev/null
+++ b/dashboards/amd-gpu/amd-gpu.json
@@ -0,0 +1,262 @@
+{
+  "name": "AMD GPU Monitoring",
+  "description": null,
+  "pages": [
+    {
+      "name": "AMD GPU Monitoring",
+      "description": null,
+      "widgets": [
+        {
+          "title": null,
+          "layout": {
+            "column": 1,
+            "row": 1,
+            "width": 4,
+            "height": 2
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.markdown"
+          },
+          "rawConfiguration": {
+            "text": "[![AMD ROCm-SMI](https://logos-world.net/wp-content/uploads/2020/03/AMD-Logo.png)](https://github.com/RadeonOpenCompute/rocm_smi_lib)"
+          }
+        },
+        {
+          "title": "Current Clock Speeds",
+          "layout": {
+            "column": 5,
+            "row": 1,
+            "width": 2,
+            "height": 3
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.billboard"
+          },
+          "rawConfiguration": {
+            "dataFormatters": [],
+            "facet": {
+              "showOtherSeries": false
+            },
+            "nrqlQueries": [
+              {
+                "accountIds": [],
+                "query": "FROM AMDGpuSample SELECT latest(gfx_0_clk_mhz) as 'GFX Clock MHz', latest(socclk_0_clk_mhz) as 'SOC Clock MHz', latest(fclk_0_clk_mhz) as 'Fabric Clock MHz', latest(temp_mem_c_0_clk) as 'Memory Clock MHz'"
+              }
+            ],
+            "platformOptions": {
+              "ignoreTimeRange": false
+            }
+          }
+        },
+        {
+          "title": "Current Clock MHz",
+          "layout": {
+            "column": 7,
+            "row": 1,
+            "width": 6,
+            "height": 3
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.line"
+          },
+          "rawConfiguration": {
+            "facet": {
+              "showOtherSeries": false
+            },
+            "legend": {
+              "enabled": true
+            },
+            "markers": {
+              "displayedTypes": {
+                "criticalViolations": false,
+                "deployments": true,
+                "relatedDeployments": true,
+                "warningViolations": false
+              }
+            },
+            "nrqlQueries": [
+              {
+                "accountIds": [],
+                "query": "FROM AMDGpuSample SELECT latest(gfx_0_clk_mhz) as 'GFX Clock MHz', latest(socclk_0_clk_mhz) as 'SOC Clock MHz', latest(fclk_0_clk_mhz) as 'Fabric Clock MHz', latest(temp_mem_c_0_clk) as 'Memory Clock MHz' TIMESERIES"
+              }
+            ],
+            "platformOptions": {
+              "ignoreTimeRange": false
+            },
+            "thresholds": {
+              "isLabelVisible": true
+            },
+            "yAxisLeft": {
+              "zero": true
+            },
+            "yAxisRight": {
+              "zero": true
+            }
+          }
+        },
+        {
+          "title": "Select GPU",
+          "layout": {
+            "column": 1,
+            "row": 3,
+            "width": 4,
+            "height": 1
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.bar"
+          },
+          "rawConfiguration": {
+            "facet": {
+              "showOtherSeries": false
+            },
+            "nrqlQueries": [
+              {
+                "accountIds": [],
+                "query": "FROM AMDGpuSample SELECT latest(device_id) FACET device_name, hostname"
+              }
+            ],
+            "platformOptions": {
+              "ignoreTimeRange": false
+            }
+          }
+        },
+        {
+          "title": "Temps",
+          "layout": {
+            "column": 1,
+            "row": 4,
+            "width": 2,
+            "height": 3
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.billboard"
+          },
+          "rawConfiguration": {
+            "dataFormatters": [],
+            "facet": {
+              "showOtherSeries": false
+            },
+            "nrqlQueries": [
+              {
+                "accountIds": [],
+                "query": "FROM AMDGpuSample SELECT latest(temp_hotspot_c) as 'GPU Temp °C', latest(temp_mem_c) as 'Memory Temp °C', latest(socket_power_w) as 'Socket Power W'"
+              }
+            ],
+            "platformOptions": {
+              "ignoreTimeRange": false
+            }
+          }
+        },
+        {
+          "title": "Power Usage",
+          "layout": {
+            "column": 3,
+            "row": 4,
+            "width": 2,
+            "height": 3
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.billboard"
+          },
+          "rawConfiguration": {
+            "dataFormatters": [],
+            "facet": {
+              "showOtherSeries": false
+            },
+            "nrqlQueries": [
+              {
+                "accountIds": [],
+                "query": "FROM AMDGpuSample SELECT latest(socket_power_w) as 'Socket Power Watts', latest(gfx_0_max_clk_mhz) as 'Max GFX Clock MHz', latest(size) as 'GPU Size MB'"
+              }
+            ],
+            "platformOptions": {
+              "ignoreTimeRange": false
+            }
+          }
+        },
+        {
+          "title": "Memory Usage",
+          "layout": {
+            "column": 5,
+            "row": 4,
+            "width": 2,
+            "height": 3
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.billboard"
+          },
+          "rawConfiguration": {
+            "dataFormatters": [],
+            "facet": {
+              "showOtherSeries": false
+            },
+            "nrqlQueries": [
+              {
+                "accountIds": [],
+                "query": "FROM AMDGpuSample SELECT latest(vram_free_mb) as 'VRAM Free MB', latest(vram_used_mb) as 'VRAM Used MB', latest(vram_total_mb) as 'VRAM Total MB'"
+              }
+            ],
+            "platformOptions": {
+              "ignoreTimeRange": false
+            }
+          }
+        },
+        {
+          "title": "Utilization",
+          "layout": {
+            "column": 7,
+            "row": 4,
+            "width": 6,
+            "height": 3
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.line"
+          },
+          "rawConfiguration": {
+            "facet": {
+              "showOtherSeries": false
+            },
+            "legend": {
+              "enabled": true
+            },
+            "markers": {
+              "displayedTypes": {
+                "criticalViolations": false,
+                "deployments": true,
+                "relatedDeployments": true,
+                "warningViolations": false
+              }
+            },
+            "nrqlQueries": [
+              {
+                "accountIds": [],
+                "query": "FROM AMDGpuSample SELECT latest(vram_used_mb/vram_total_mb) * 100 as 'VRAM Used %', latest(gfx_activity_percent) as 'GFX Activity %', latest(umc_activity_percent) as 'UMC Activity %', latest(socket_power_w) as 'Socket Power W' TIMESERIES"
+              }
+            ],
+            "platformOptions": {
+              "ignoreTimeRange": false
+            },
+            "thresholds": {
+              "isLabelVisible": true
+            },
+            "yAxisLeft": {
+              "zero": true
+            },
+            "yAxisRight": {
+              "zero": true
+            }
+          }
+        }
+      ]
+    }
+  ],
+  "variables": []
+}
diff --git a/dashboards/amd-gpu/amd-gpu.png b/dashboards/amd-gpu/amd-gpu.png
new file mode 100644
index 0000000000..7cf9e42439
Binary files /dev/null and b/dashboards/amd-gpu/amd-gpu.png differ
diff --git a/data-sources/amd-gpu/config.yml b/data-sources/amd-gpu/config.yml
new file mode 100644
index 0000000000..b6b186e065
--- /dev/null
+++ b/data-sources/amd-gpu/config.yml
@@ -0,0 +1,23 @@
+id: amd-gpu
+displayName: AMD GPUs
+description: |
+  Monitor AMD GPUs based on the AMD SMI utility.
+
+install:
+  primary:
+    nerdlet:
+      nerdletId: marketplace.install-data-source
+      nerdletState:
+        dataSourceId: amd-gpu
+        frameworkConfigId: amd-gpu
+      requiresAccount: false
+
+icon: logo.png
+
+keywords:
+  - infrastructure
+  - amd
+  - gpu
+
+categoryTerms:
+  - infrastructure
diff --git a/data-sources/amd-gpu/logo.png b/data-sources/amd-gpu/logo.png
new file mode 100644
index 0000000000..6666b270fa
Binary files /dev/null and b/data-sources/amd-gpu/logo.png differ
diff --git a/quickstarts/amd-gpu/config.yml b/quickstarts/amd-gpu/config.yml
new file mode 100644
index 0000000000..02401c1a76
--- /dev/null
+++ b/quickstarts/amd-gpu/config.yml
@@ -0,0 +1,40 @@
+slug: amd-gpu
+
+title: AMD GPU Monitoring
+
+description: |
+  Our AMD GPU integration helps you monitor the status and performance of your AMD GPUs.
+  This integration uses our infrastructure agent and the Flex integration to collect data from the AMD SMI utility.
+  It provides you with a pre-built dashboard containing key GPU metrics, including GPU and memory controller activity,
+  VRAM usage, clock speeds, temperatures, and socket power, as well as an alert for high GPU memory utilization.
+
+summary: |
+  Monitor AMD GPU performance and state.
+
+level: Community
+
+authors:
+  - New Relic
+
+keywords:
+  - amd
+  - gpu
+  - infrastructure
+  - NR1_addData
+  - NR1_sys
+
+documentation:
+  - name: AMD GPU integration
+    url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/amd-gpu-integration/
+    description: Monitor the status and performance of AMD GPUs.
+
+icon: logo.png
+
+dashboards:
+  - amd-gpu
+
+alertPolicies:
+  - amd-gpu
+
+dataSourceIds:
+  - amd-gpu
diff --git a/quickstarts/amd-gpu/logo.png b/quickstarts/amd-gpu/logo.png
new file mode 100644
index 0000000000..6666b270fa
Binary files /dev/null and b/quickstarts/amd-gpu/logo.png differ
diff --git a/utils/schema/artifact.json b/utils/schema/artifact.json
index fb9d5a9674..ca8b95bf51 100644
--- a/utils/schema/artifact.json
+++ b/utils/schema/artifact.json
@@ -285,6 +285,7 @@
         "aerospike",
         "akamai",
         "alert-quality-management",
         "amazon-cloudwatch-metric-streams",
         "amazon-eks-on-aws-fargate",
+        "amd-gpu",
         "ansible-automation-controller",
@@ -616,5 +617,4 @@
       ]
     }
   }
 }
-