diff --git a/dashboards/databricks-cluster-health/databricks-cluster-health.json b/dashboards/databricks-cluster-health/databricks-cluster-health.json new file mode 100644 index 0000000000..fda0375c71 --- /dev/null +++ b/dashboards/databricks-cluster-health/databricks-cluster-health.json @@ -0,0 +1,1255 @@ +{ + "name": "Databricks Cluster Health", + "description": "View key health metrics for your Databricks clusters, including CPU, memory, disk, and network usage for both driver and worker nodes.", + "pages": [ + { + "name": "Driver Node Metrics", + "description": null, + "widgets": [ + { + "title": "Average CPU %", + "layout": { + "column": 1, + "row": 1, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SystemSample\nSELECT average(cpuPercent)\nWHERE databricksIsDriverNode = 'true'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{driverEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "max": 100, + "min": 0, + "zero": false + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "5 minute load average", + "layout": { + "column": 7, + "row": 1, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + 
"criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SystemSample\nSELECT latest(loadAverageFiveMinute)\nWHERE databricksIsDriverNode = 'true'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{driverEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average memory free", + "layout": { + "column": 1, + "row": 4, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SystemSample\nSELECT average(memoryFreeBytes)\nWHERE databricksIsDriverNode = 'true'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{driverEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "BYTES" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average used memory %", + "layout": { + "column": 7, + "row": 4, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + 
"rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SystemSample\nSELECT average(memoryUsedPercent)\nWHERE databricksIsDriverNode = 'true'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{driverEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "max": 100, + "min": 0, + "zero": false + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average disk free", + "layout": { + "column": 1, + "row": 7, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM StorageSample\nSELECT average(diskFreeBytes) AS 'Byte'\nWHERE databricksIsDriverNode = 'true'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{driverEntityName}})\nFACET databricksClusterName, entityName, mountPoint" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Average disk free %", + "layout": { + "column": 7, + "row": 7, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + 
"deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM StorageSample\nSELECT average(diskFreePercent)\nWHERE databricksIsDriverNode = 'true'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{driverEntityName}})\nFACET databricksClusterName, entityName, mountPoint\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "max": 100, + "min": 0, + "zero": false + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average bytes/s received", + "layout": { + "column": 1, + "row": 10, + "width": 3, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM NetworkSample\nSELECT average(receiveBytesPerSecond)\nWHERE databricksIsDriverNode = 'true'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{driverEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "BYTES_PER_SECOND" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average bytes/s transmitted", + "layout": { + "column": 4, + "row": 10, + "width": 3, + "height": 3 + }, + "linkedEntityGuids": null, + 
"visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM NetworkSample\nSELECT average(transmitBytesPerSecond)\nWHERE databricksIsDriverNode = 'true'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{driverEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "BYTES_PER_SECOND" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average receive errors/s", + "layout": { + "column": 7, + "row": 10, + "width": 3, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM NetworkSample\nSELECT average(receiveErrorsPerSecond)\nWHERE databricksIsDriverNode = 'true'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{driverEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "BYTES_PER_SECOND" + }, + 
"yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average transmit errors/s", + "layout": { + "column": 10, + "row": 10, + "width": 3, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM NetworkSample\nSELECT average(transmitErrorsPerSecond)\nWHERE databricksIsDriverNode = 'true'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{driverEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "BYTES_PER_SECOND" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + } + ] + }, + { + "name": "Driver Logs", + "description": null, + "widgets": [ + { + "title": "Stdout", + "layout": { + "column": 1, + "row": 1, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "logger.log-table-widget" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH position(databricksWorkspaceHost, '.') AS firstDot\nFROM Log\nSELECT substring(databricksWorkspaceHost, 0, firstDot) AS 'Workspace', databricksClusterName AS 'Cluster', hostname AS Hostname, message AS Message\nWHERE databricksClusterId IS NOT NULL\n AND databricksLogType = 'driver-stdout'\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND hostname IN 
({{driverEntityName}})\nLIMIT 1000" + } + ] + } + }, + { + "title": "Stderr", + "layout": { + "column": 1, + "row": 5, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "logger.log-table-widget" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH position(databricksWorkspaceHost, '.') AS firstDot\nFROM Log\nSELECT substring(databricksWorkspaceHost, 0, firstDot) AS 'Workspace', databricksClusterName AS 'Cluster', hostname AS Hostname, message AS Message\nWHERE databricksClusterId IS NOT NULL\n AND databricksLogType = 'driver-stderr'\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND hostname IN ({{driverEntityName}})\nLIMIT 1000" + } + ] + } + }, + { + "title": "Log4j", + "layout": { + "column": 1, + "row": 9, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "logger.log-table-widget" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH position(databricksWorkspaceHost, '.') AS firstDot\nFROM Log\nSELECT substring(databricksWorkspaceHost, 0, firstDot) AS 'Workspace', databricksClusterName AS 'Cluster', hostname AS Hostname, level AS Level, message AS Message\nWHERE databricksClusterId IS NOT NULL\n AND databricksLogType = 'driver-log4j'\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND hostname IN ({{driverEntityName}})\nLIMIT 1000" + } + ] + } + }, + { + "title": "Event Log", + "layout": { + "column": 1, + "row": 13, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "logger.log-table-widget" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH position(databricksWorkspaceHost, '.') AS firstDot,\n capture(filePath, r'/databricks/driver/eventlogs/(?P<sparkContextId>[^/]+)/.*') AS sparkContextId\nFROM 
Log\nSELECT substring(databricksWorkspaceHost, 0, firstDot) AS 'Workspace', databricksClusterName AS 'Cluster', sparkContextId AS 'Spark Context ID', Event, `App Name`, `Job ID`, `Stage Info.Stage ID` AS 'Stage ID', `Task Info.Task ID` AS 'Task ID'\nWHERE databricksClusterId IS NOT NULL\n AND databricksLogType = 'spark-eventlog'\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND hostname IN ({{driverEntityName}})\nLIMIT 1000" + } + ] + } + }, + { + "title": "Infrastructure Agent Logs", + "layout": { + "column": 1, + "row": 17, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "logger.log-table-widget" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH position(databricksWorkspaceHost, '.') AS firstDot\nFROM Log\nSELECT substring(databricksWorkspaceHost, 0, firstDot) AS 'Workspace', databricksClusterName AS 'Cluster', hostname AS Hostname, MESSAGE.context.component AS Component, MESSAGE.msg AS Message\nWHERE SYSLOG_IDENTIFIER = 'newrelic-infra-service'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND hostname IN ({{driverEntityName}})\nLIMIT 1000" + } + ] + } + }, + { + "title": "Databricks Integration Logs (APM)", + "layout": { + "column": 1, + "row": 21, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "logger.log-table-widget" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM Log\nSELECT hostname AS Hostname, level AS Level, message AS Message\nWHERE `newrelic.source` = 'logs.APM'\n AND `entity.name` = 'New Relic Databricks Integration'\n AND hostname IN ({{driverEntityName}})\nLIMIT 1000" + } + ] + } + } + ] + }, + { + "name": "Worker Node Metrics", + "description": null, + "widgets": [ + { + "title": "Average 
CPU %", + "layout": { + "column": 1, + "row": 1, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SystemSample\nSELECT average(cpuPercent)\nWHERE databricksIsDriverNode = 'false'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{workerEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "max": 100, + "min": 0, + "zero": false + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "5 minute load average", + "layout": { + "column": 7, + "row": 1, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SystemSample\nSELECT latest(loadAverageFiveMinute)\nWHERE databricksIsDriverNode = 'false'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{workerEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + 
"isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average memory free", + "layout": { + "column": 1, + "row": 4, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SystemSample\nSELECT average(memoryFreeBytes)\nWHERE databricksIsDriverNode = 'false'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{workerEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "BYTES" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average used memory %", + "layout": { + "column": 7, + "row": 4, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SystemSample\nSELECT average(memoryUsedPercent)\nWHERE databricksIsDriverNode = 'false'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN 
({{workerEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "max": 100, + "min": 0, + "zero": false + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average disk free", + "layout": { + "column": 1, + "row": 7, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM StorageSample\nSELECT average(diskFreeBytes) AS 'Byte'\nWHERE databricksIsDriverNode = 'false'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{workerEntityName}})\nFACET databricksClusterName, entityName, mountPoint" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Average disk free %", + "layout": { + "column": 7, + "row": 7, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM StorageSample\nSELECT average(diskFreePercent)\nWHERE databricksIsDriverNode = 'false'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{workerEntityName}})\nFACET databricksClusterName, entityName, mountPoint\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + 
"isLabelVisible": true + }, + "yAxisLeft": { + "max": 100, + "min": 0, + "zero": false + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average bytes/s received", + "layout": { + "column": 1, + "row": 10, + "width": 3, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM NetworkSample\nSELECT average(receiveBytesPerSecond)\nWHERE databricksIsDriverNode = 'false'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{workerEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "BYTES_PER_SECOND" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average bytes/s transmitted", + "layout": { + "column": 4, + "row": 10, + "width": 3, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM NetworkSample\nSELECT average(transmitBytesPerSecond)\nWHERE databricksIsDriverNode = 'false'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN 
({{databricksClusterName}})\n AND entityName IN ({{workerEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "BYTES_PER_SECOND" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average receive errors/s", + "layout": { + "column": 7, + "row": 10, + "width": 3, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM NetworkSample\nSELECT average(receiveErrorsPerSecond)\nWHERE databricksIsDriverNode = 'false'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{workerEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "BYTES_PER_SECOND" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average transmit errors/s", + "layout": { + "column": 10, + "row": 10, + "width": 3, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM 
NetworkSample\nSELECT average(transmitErrorsPerSecond)\nWHERE databricksIsDriverNode = 'false'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND entityName IN ({{workerEntityName}})\nFACET databricksClusterName, entityName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "BYTES_PER_SECOND" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + } + ] + }, + { + "name": "Executor Logs", + "description": null, + "widgets": [ + { + "title": "Stdout", + "layout": { + "column": 1, + "row": 1, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "logger.log-table-widget" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH position(databricksWorkspaceHost, '.') AS firstDot,\n capture(filePath, r'/databricks/spark/work/(?P<appName>[^/]+)/.*') AS appName\nFROM Log\nSELECT substring(databricksWorkspaceHost, 0, firstDot) AS 'Workspace', databricksClusterName AS 'Cluster', hostname AS Hostname, appName AS Application, message AS Message\nWHERE databricksClusterId IS NOT NULL\n AND databricksLogType = 'executor-stdout'\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND hostname IN ({{workerEntityName}})\nLIMIT 1000" + } + ] + } + }, + { + "title": "Stderr", + "layout": { + "column": 1, + "row": 5, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "logger.log-table-widget" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH position(databricksWorkspaceHost, '.') AS firstDot,\n capture(filePath, r'/databricks/spark/work/(?P<appName>[^/]+)/.*') AS appName\nFROM Log\nSELECT substring(databricksWorkspaceHost, 0, firstDot) AS 
'Workspace', databricksClusterName AS 'Cluster', hostname AS Hostname, appName as Application, level AS Level, message AS Message\nWHERE databricksClusterId IS NOT NULL\n AND databricksLogType = 'executor-stderr'\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND hostname IN ({{workerEntityName}})\nLIMIT 1000" + } + ] + } + }, + { + "title": "Infrastructure Agent Logs", + "layout": { + "column": 1, + "row": 9, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "logger.log-table-widget" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH position(databricksWorkspaceHost, '.') AS firstDot\nFROM Log\nSELECT substring(databricksWorkspaceHost, 0, firstDot) AS 'Workspace', databricksClusterName AS 'Cluster', hostname AS Hostname, MESSAGE.context.component AS Component, MESSAGE.msg AS Message\nWHERE SYSLOG_IDENTIFIER = 'newrelic-infra-service'\n AND databricksClusterId IS NOT NULL\n AND databricksWorkspaceHost IN ({{databricksWorkspaceHost}})\n AND databricksClusterName IN ({{databricksClusterName}})\n AND hostname IN ({{workerEntityName}})\nLIMIT 1000" + } + ] + } + } + ] + } + ], + "variables": [ + { + "name": "databricksWorkspaceHost", + "items": null, + "defaultValues": [ + { + "value": { + "string": "*" + } + } + ], + "nrqlQuery": { + "accountIds": [], + "query": "SELECT uniques(databricksWorkspaceHost) FROM Log WHERE databricksWorkspaceHost IS NOT NULL" + }, + "options": { + "ignoreTimeRange": false, + "excluded": false + }, + "title": "Workspace", + "type": "NRQL", + "isMultiSelection": true, + "replacementStrategy": "STRING" + }, + { + "name": "databricksClusterName", + "items": null, + "defaultValues": [ + { + "value": { + "string": "*" + } + } + ], + "nrqlQuery": { + "accountIds": [], + "query": "SELECT uniques(databricksClusterName) FROM Log WHERE databricksClusterName IS NOT NULL" + }, + "options": { + 
"ignoreTimeRange": false, + "excluded": false + }, + "title": "Cluster", + "type": "NRQL", + "isMultiSelection": true, + "replacementStrategy": "STRING" + }, + { + "name": "driverEntityName", + "items": null, + "defaultValues": [ + { + "value": { + "string": "*" + } + } + ], + "nrqlQuery": { + "accountIds": [], + "query": "SELECT uniques(entityName) FROM SystemSample WHERE databricksClusterId IS NOT NULL AND databricksIsDriverNode = 'true'" + }, + "options": { + "ignoreTimeRange": false, + "excluded": false + }, + "title": "Driver Node", + "type": "NRQL", + "isMultiSelection": true, + "replacementStrategy": "STRING" + }, + { + "name": "workerEntityName", + "items": null, + "defaultValues": [ + { + "value": { + "string": "*" + } + } + ], + "nrqlQuery": { + "accountIds": [], + "query": "SELECT uniques(entityName) FROM SystemSample WHERE databricksClusterId IS NOT NULL AND databricksIsDriverNode = 'false'" + }, + "options": { + "ignoreTimeRange": false, + "excluded": false + }, + "title": "Worker Node", + "type": "NRQL", + "isMultiSelection": true, + "replacementStrategy": "STRING" + } + ] +} diff --git a/dashboards/databricks-cluster-health/databricks-cluster-health01.png b/dashboards/databricks-cluster-health/databricks-cluster-health01.png new file mode 100644 index 0000000000..b5945d3ac6 Binary files /dev/null and b/dashboards/databricks-cluster-health/databricks-cluster-health01.png differ diff --git a/dashboards/databricks-consumption-cost/databricks-consumption-cost.json b/dashboards/databricks-consumption-cost/databricks-consumption-cost.json new file mode 100644 index 0000000000..45a1dba3fd --- /dev/null +++ b/dashboards/databricks-consumption-cost/databricks-consumption-cost.json @@ -0,0 +1,840 @@ +{ + "name": "Databricks Consumption & Cost", + "description": "Monitor your Databricks consumption and associated costs. 
Track total and SKU-level usage and costs over time, analyze spending by clusters and warehouses, and gain insights into job-related expenses to optimize your Databricks environment.", + "pages": [ + { + "name": "Billable Usage", + "description": null, + "widgets": [ + { + "title": "Total consumption last 30 days compared to 60 days ago", + "layout": { + "column": 1, + "row": 1, + "width": 4, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksUsage SELECT sum(usage_quantity) AS 'DBUs' SINCE 31 days ago UNTIL TODAY COMPARE WITH 61 days ago" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Total cost last 30 days compared to 60 days ago", + "layout": { + "column": 5, + "row": 1, + "width": 4, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(usage_quantity * list_price) AS '$$' FROM DatabricksUsage JOIN (FROM lookup(DatabricksListPrices) SELECT list_price, sku_name AS price_sku_name) ON sku_name = price_sku_name SINCE 31 days ago UNTIL today COMPARE WITH 61 days ago" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Total cost by SKU last 30 days compared to 60 days ago", + "layout": { + "column": 9, + "row": 1, + "width": 4, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(usage_quantity * list_price) AS '$$' FROM DatabricksUsage JOIN (FROM lookup(DatabricksListPrices) SELECT list_price, sku_name AS price_sku_name) ON sku_name = price_sku_name 
FACET sku_name SINCE 31 days ago UNTIL today COMPARE WITH 61 days ago" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Daily consumption last 30 days", + "layout": { + "column": 1, + "row": 3, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksUsage SELECT sum(usage_quantity) AS 'DBUs' SINCE 31 days ago UNTIL today TIMESERIES 1 day" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Daily cost last 30 days", + "layout": { + "column": 7, + "row": 3, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(usage_quantity * list_price) AS '$$' FROM DatabricksUsage JOIN (FROM lookup(DatabricksListPrices) SELECT list_price, sku_name AS price_sku_name) ON sku_name = price_sku_name SINCE 31 days ago UNTIL today TIMESERIES 1 day" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Daily consumption by SKU last 30 days", + "layout": { + "column": 1, + "row": 6, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksUsage SELECT sum(usage_quantity) FACET sku_name SINCE 31 days ago UNTIL today TIMESERIES 1 day" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + 
"thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Daily cost by SKU last 30 days", + "layout": { + "column": 7, + "row": 6, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(usage_quantity * list_price) FROM DatabricksUsage JOIN (FROM lookup(DatabricksListPrices) SELECT list_price, sku_name AS price_sku_name) ON sku_name = price_sku_name SINCE 31 days ago UNTIL today FACET sku_name TIMESERIES 1 day" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Daily consumption by cluster last 30 days", + "layout": { + "column": 1, + "row": 9, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksUsage SELECT sum(usage_quantity) WHERE cluster_name IS NOT NULL AND cluster_name NOT LIKE 'job%' FACET cluster_name SINCE 31 days ago UNTIL today TIMESERIES 1 day" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Daily consumption by warehouse last 30 days", + "layout": { + "column": 5, + "row": 9, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksUsage SELECT sum(usage_quantity) WHERE warehouse_name IS NOT NULL FACET 
warehouse_name SINCE 31 days ago UNTIL today TIMESERIES 1 day" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Daily consumption by job cluster last 30 days", + "layout": { + "column": 9, + "row": 9, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksUsage SELECT sum(usage_quantity) WHERE cluster_name IS NOT NULL AND cluster_name LIKE 'job%' FACET cluster_name SINCE 31 days ago UNTIL today TIMESERIES 1 day" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Daily cost by cluster last 30 days", + "layout": { + "column": 1, + "row": 12, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(usage_quantity * list_price) AS '$$' FROM DatabricksUsage JOIN (FROM lookup(DatabricksListPrices) SELECT list_price, sku_name AS price_sku_name) ON sku_name = price_sku_name WHERE cluster_name IS NOT NULL AND cluster_name NOT LIKE 'job%' FACET cluster_name SINCE 31 days ago UNTIL today TIMESERIES 1 day" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Daily cost by warehouse last 30 days", + "layout": { + "column": 5, + "row": 12, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + 
"showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(usage_quantity * list_price) AS '$$' FROM DatabricksUsage JOIN (FROM lookup(DatabricksListPrices) SELECT list_price, sku_name AS price_sku_name) ON sku_name = price_sku_name WHERE warehouse_name IS NOT NULL FACET warehouse_name SINCE 31 days ago UNTIL today TIMESERIES 1 day" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Daily cost by job cluster last 30 days", + "layout": { + "column": 9, + "row": 12, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(usage_quantity * list_price) AS '$$' FROM DatabricksUsage JOIN (FROM lookup(DatabricksListPrices) SELECT list_price, sku_name AS price_sku_name) ON sku_name = price_sku_name WHERE cluster_name IS NOT NULL AND cluster_name LIKE 'job%' FACET cluster_name SINCE 31 days ago UNTIL today TIMESERIES 1 day" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + } + ] + }, + { + "name": "Job Cost", + "description": null, + "widgets": [ + { + "title": "Total job spend last 30 days compared to 60 days ago", + "layout": { + "column": 1, + "row": 1, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Total list_cost", + "precision": 2, + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(list_cost)\nWITH usage_quantity * list_price AS list_cost\nFROM DatabricksUsage\nJOIN (\n SELECT list_price, 
sku_name AS price_sku_name\n FROM lookup(DatabricksListPrices)\n) ON sku_name = price_sku_name\nWHERE sku_name LIKE '%JOBS%'\nSINCE 31 days ago UNTIL today\nCOMPARE with 61 days ago\nLIMIT MAX" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Weekly job spend by workspace last 3 months compared to 6 months ago", + "layout": { + "column": 5, + "row": 1, + "width": 8, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(list_cost)\nWITH usage_quantity * list_price AS list_cost\nFROM DatabricksUsage\nJOIN (\n SELECT list_price, sku_name AS price_sku_name\n FROM lookup(DatabricksListPrices)\n) ON sku_name = price_sku_name\nWHERE sku_name LIKE '%JOBS%'\nSINCE 3 months ago\nCOMPARE WITH 6 months ago\nFACET workspace_instance_name\nLIMIT 100\nTIMESERIES 1 week" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Most expensive jobs by workspace and user last 30 days", + "layout": { + "column": 1, + "row": 4, + "width": 6, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.pie" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": true + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(list_cost)\nWITH usage_quantity * list_price AS list_cost\nFROM DatabricksUsage\nJOIN (\n SELECT list_price, sku_name AS price_sku_name\n FROM lookup(DatabricksListPrices)\n) ON sku_name = price_sku_name\nWHERE sku_name LIKE '%JOBS%' AND job_id IS NOT NULL\nSINCE 31 days ago UNTIL today\nFACET job_name, workspace_instance_name, run_as\nLIMIT 20" + } + ], + "platformOptions": { + 
"ignoreTimeRange": false + } + } + }, + { + "title": "Most expensive jobs by workspace this week compared to 2 weeks ago", + "layout": { + "column": 7, + "row": 4, + "width": 6, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(list_cost)\nFROM DatabricksJobCost\nWHERE query_id = 'jobs_cost_list_cost_per_job'\nSINCE 8 days ago UNTIL today\nCOMPARE with 15 days ago\nFACET job_name, workspace_instance_name\nLIMIT 20\nTIMESERIES 1 day" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Highest change in weekly spend last 2 weeks", + "layout": { + "column": 1, + "row": 8, + "width": 12, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Last 7 Day Growth", + "precision": 2, + "type": "decimal" + }, + { + "name": "Last 14 day cost", + "precision": 2, + "type": "decimal" + }, + { + "name": "Last 7 Day Cost", + "precision": 2, + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(list_cost) as 'Last 7 Day Cost',\n latest(list_cost_14_day) as 'Last 14 day cost',\n sum(list_cost) - latest(list_cost_14_day) as 'Last 7 Day Growth'\nFROM DatabricksJobCost\nJOIN (\n SELECT sum(list_cost) as list_cost_14_day\n FROM DatabricksJobCost\n WHERE query_id = 'jobs_cost_list_cost_per_job'\n SINCE 15 days ago until 8 days ago\n FACET workspace_id, job_id, job_name, run_as\n LIMIT 100\n) ON job_id\nWHERE query_id = 'jobs_cost_list_cost_per_job'\nSINCE 8 days ago UNTIL today\nFACET workspace_id, job_id, job_name, run_as\nLIMIT 100" + } 
+ ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Most expensive jobs last 30 days", + "layout": { + "column": 1, + "row": 11, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "list_cost", + "precision": 2, + "type": "decimal" + }, + { + "name": "runs", + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(list_cost), latest(runs), latest(last_seen_date)\nFROM DatabricksJobCost\nWHERE query_id = 'jobs_cost_list_cost_per_job'\nSINCE 31 days ago UNTIL today\nFACET workspace_id, workspace_instance_name, job_id, job_name, run_as\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Most expensive job runs last 30 days", + "layout": { + "column": 1, + "row": 15, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "list_cost", + "precision": 2, + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(list_cost), latest(last_seen_date)\nFROM DatabricksJobCost\nWHERE query_id = 'jobs_cost_list_cost_per_job_run'\nSINCE 31 days ago UNTIL today\nFACET workspace_id, workspace_instance_name, job_id, job_name, run_id, run_as\nLIMIT 100\n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Highest failure jobs last 30 days", + "layout": { + "column": 1, + "row": 19, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Failure Cost", + "precision": 2, + "type": "decimal" + }, + { + "name": "failures", + "type": "decimal" + }, + { + "name": "runs", + 
"type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT sum(runs), sum(failures), sum(list_cost) AS 'Failure Cost', max(last_seen_date) AS 'Last Seen Date'\nFROM DatabricksJobCost\nWHERE query_id = 'jobs_cost_frequent_failures'\nSINCE 31 days ago UNTIL today\nFACET workspace_id, workspace_instance_name, job_id, job_name, run_as\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Most retried runs last 30 days", + "layout": { + "column": 1, + "row": 23, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(run_as), sum(repairs), sum(repair_time_seconds), sum(list_cost) AS 'Repair Cost'\nFROM DatabricksJobCost\nWHERE query_id = 'jobs_cost_most_retries'\nSINCE 31 days ago UNTIL today\nFACET workspace_id, workspace_instance_name, job_id, job_name, run_id\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Job cost deviation across runs last 30 days", + "layout": { + "column": 1, + "row": 27, + "width": 12, + "height": 5 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "List Cost Deviation", + "precision": 2, + "type": "decimal" + }, + { + "name": "list_cost", + "precision": 2, + "type": "decimal" + }, + { + "name": "Max list_cost", + "precision": 2, + "type": "decimal" + }, + { + "name": "Avg list_cost", + "precision": 2, + "type": "decimal" + }, + { + "name": "Total list_cost", + "precision": 2, + "type": "decimal" + }, + { + "name": "Runs", + "precision": 0, + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT\n count(run_id) as 
'Runs',\n sum(list_cost),\n average(list_cost),\n max(list_cost),\n percentile(list_cost, 90),\n max(list_cost) - percentile(list_cost, 90) AS 'List Cost Deviation'\nFROM DatabricksJobCost\nWHERE query_id = 'jobs_cost_list_cost_per_job_run'\nSINCE 31 days ago UNTIL today\nFACET workspace_id, workspace_instance_name, job_id, job_name\nLIMIT 100 " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + } + ], + "variables": [] +} diff --git a/dashboards/databricks-consumption-cost/databricks-consumption-cost01.png b/dashboards/databricks-consumption-cost/databricks-consumption-cost01.png new file mode 100644 index 0000000000..48ee3b48ce Binary files /dev/null and b/dashboards/databricks-consumption-cost/databricks-consumption-cost01.png differ diff --git a/dashboards/databricks-consumption-cost/databricks-consumption-cost02.png b/dashboards/databricks-consumption-cost/databricks-consumption-cost02.png new file mode 100644 index 0000000000..f598b04bff Binary files /dev/null and b/dashboards/databricks-consumption-cost/databricks-consumption-cost02.png differ diff --git a/dashboards/databricks-job-runs/databricks-job-runs.json b/dashboards/databricks-job-runs/databricks-job-runs.json new file mode 100644 index 0000000000..d1418825c8 --- /dev/null +++ b/dashboards/databricks-job-runs/databricks-job-runs.json @@ -0,0 +1,1049 @@ +{ + "name": "Databricks Job Runs", + "description": "Monitor the status and performance of your Databricks job runs and tasks, including success and failure rates and job and task durations.", + "pages": [ + { + "name": "Job Runs", + "description": null, + "widgets": [ + { + "title": "Jobs running", + "layout": { + "column": 1, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT 
latest(runningJobRunCount) AS Runs\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Job runs succeeded", + "layout": { + "column": 3, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRun\nSELECT count(*) AS Runs\nWHERE event = 'complete' AND terminationType = 'SUCCESS' AND databricksWorkspaceName = {{workspace}}\n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Job runs failed", + "layout": { + "column": 5, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRun \nSELECT count(*) AS Runs\nWHERE event = 'complete' AND terminationType != 'SUCCESS' AND databricksWorkspaceName = {{workspace}}\n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Tasks running", + "layout": { + "column": 7, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT latest(runningTaskRunCount) AS Runs\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Task runs succeeded", + "layout": { + "column": 9, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + 
"nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksTaskRun \nSELECT count(*) AS Runs\nWHERE event = 'complete' AND terminationType = 'SUCCESS' AND databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}}\n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Task runs failed", + "layout": { + "column": 11, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksTaskRun \nSELECT count(*) AS Runs\nWHERE event = 'complete' AND terminationType != 'SUCCESS' AND databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}}\n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Job run count", + "layout": { + "column": 1, + "row": 3, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRun\nSELECT filter(count(*), WHERE event = 'start') AS 'Runs started', filter(count(*), WHERE event = 'complete') AS 'Runs complete'\nLIMIT MAX\nWHERE databricksWorkspaceName = {{workspace}}\nFACET substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.'))\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average job run duration", + "layout": { + "column": 7, + "row": 3, + "width": 6, 
+ "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRun\nSELECT average(duration / 1000) AS 'Run duration'\nWHERE event = 'complete' AND databricksWorkspaceName = {{workspace}}\nFACET substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.'))\nTIMESERIES\nLIMIT MAX" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Running jobs", + "layout": { + "column": 1, + "row": 6, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n concat(databricksWorkspaceUrl, '/jobs/', jobId, '/runs/', jobRunId) AS databricksJobRunURL\nSELECT count(*)\nFROM (\n FROM DatabricksJobRun\n SELECT\n uniqueCount(event) as 'total',\n latest(event) as 'state'\n FACET databricksWorkspaceName, databricksWorkspaceUrl, jobId, jobRunId, jobRunName, toDatetime(jobRunStartTime, 'MMMM dd, yyyy HH:mm:ss') AS startTime\n ORDER BY max(timestamp)\n LIMIT 100)\nWHERE total = 1 AND state = 'start'\nFACET workspace AS Workspace, jobRunId AS 'Job Run ID', jobRunName AS 'Job Run Name', startTime AS 'Job Run Start Time', databricksJobRunURL AS 'Databricks Link'\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + 
"title": "Running tasks", + "layout": { + "column": 7, + "row": 6, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n concat(databricksWorkspaceUrl, '/jobs/', jobId, '/runs/', taskRunId) AS databricksJobRunURL\nSELECT count(*)\nFROM (\n FROM DatabricksTaskRun\n SELECT\n uniqueCount(event) as 'total',\n latest(event) as 'state'\n FACET databricksWorkspaceName, databricksWorkspaceUrl, jobId, taskRunId, taskName, toDatetime(taskRunStartTime, 'MMMM dd, yyyy HH:mm:ss') AS startTime\n ORDER BY max(timestamp)\n LIMIT 100)\nWHERE total = 1 AND state = 'start'\nFACET workspace AS Workspace, taskRunId AS 'Task Run ID', taskName AS 'Task Name', startTime AS 'Task Run Start Time', databricksJobRunURL AS 'Databricks Link'\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Blocked job runs", + "layout": { + "column": 1, + "row": 9, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT latest(blockedJobRunCount) AS Runs\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Waiting job runs", + "layout": { + "column": 3, + "row": 9, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT latest(waitingJobRunCount) AS Runs\nWHERE 
databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Pending job runs", + "layout": { + "column": 5, + "row": 9, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT latest(pendingJobRunCount) AS Runs\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Queued job runs", + "layout": { + "column": 7, + "row": 9, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT latest(queuedJobRunCount) AS Runs\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Terminating job runs", + "layout": { + "column": 9, + "row": 9, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT latest(terminatingJobRunCount) AS Runs\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Completed job runs by duration", + "layout": { + "column": 1, + "row": 11, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "duration", + "precision": null, + "type": "decimal" + }, + { + "name": "attempt", + "type": "decimal" + }, 
+ { + "name": "jobRunId", + "precision": null, + "type": "decimal" + }, + { + "name": "jobId", + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n concat(databricksWorkspaceUrl, '/jobs/', jobId, '/runs/', jobRunId) AS databricksJobRunURL\nFROM DatabricksJobRun\nSELECT workspace AS Workspace,\n jobId AS 'Job ID',\n jobRunId AS 'Job Run ID',\n jobRunName, attempt,\n jobRunStartTime,\n jobRunEndTime,\n jobRunType,\n jobRunTrigger,\n terminationCode,\n terminationType,\n duration,\n databricksJobRunURL AS 'Databricks Link'\nWHERE event = 'complete' AND databricksWorkspaceName = {{workspace}}\nORDER BY duration DESC\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Blocked task runs", + "layout": { + "column": 1, + "row": 15, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT latest(blockedTaskRunCount) AS Runs\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Waiting task runs", + "layout": { + "column": 3, + "row": 15, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT latest(waitingTaskRunCount) AS Runs\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Pending task runs", + "layout": { + "column": 5, + "row": 15, + "width": 2, + "height": 
2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT latest(pendingTaskRunCount) AS Runs\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Queued task runs", + "layout": { + "column": 7, + "row": 15, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT latest(queuedTaskRunCount) AS Runs\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Terminating task runs", + "layout": { + "column": 9, + "row": 15, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT latest(terminatingTaskRunCount) AS Runs\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Completed task runs by duration", + "layout": { + "column": 1, + "row": 17, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Task Run ID", + "type": "decimal" + }, + { + "name": "duration", + "precision": null, + "type": "decimal" + }, + { + "name": "attempt", + "type": "decimal" + }, + { + "name": "Job Run ID", + "type": "decimal" + }, + { + "name": "Job ID", + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + 
"nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n concat(databricksWorkspaceUrl, '/jobs/', jobId, '/runs/', taskRunId) AS databricksJobRunURL\nFROM DatabricksTaskRun\nSELECT workspace AS Workspace,\n databricksClusterName AS Cluster,\n jobId AS 'Job ID',\n jobRunId AS 'Job Run ID',\n jobRunName,\n taskRunId AS 'Task Run ID',\n taskName,\n attempt,\n taskRunStartTime,\n taskRunEndTime,\n terminationCode,\n terminationType,\n duration,\n databricksJobRunURL AS 'Databricks Link'\nWHERE event = 'complete' AND databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}}\nORDER BY duration DESC\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Average duration by job", + "layout": { + "column": 1, + "row": 21, + "width": 6, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRun\nSELECT average(duration / 1000)\nWHERE event = 'complete' AND state = 'TERMINATED' AND databricksWorkspaceName = {{workspace}}\nFACET jobRunName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average duration by task", + "layout": { + "column": 7, + "row": 21, + "width": 6, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + 
"legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksTaskRun\nSELECT average(duration / 1000)\nWHERE event = 'complete' AND state = 'TERMINATED' AND databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}}\nFACET taskName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average queue duration by job", + "layout": { + "column": 1, + "row": 25, + "width": 6, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksJobRun\nSELECT average(queueDuration / 1000)\nWHERE event = 'complete' AND state = 'TERMINATED' AND databricksWorkspaceName = {{workspace}}\nFACET jobRunName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average queue duration by task", + "layout": { + "column": 7, + "row": 25, + "width": 6, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + 
"displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksTaskRun\nSELECT average(queueDuration / 1000)\nWHERE event = 'complete' AND state = 'TERMINATED' AND databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}}\nFACET taskName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + } + ] + }, + { + "name": "Spark Metrics", + "description": null, + "widgets": [ + { + "title": "Spark jobs by job run", + "layout": { + "column": 1, + "row": 1, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Spark Job ID", + "precision": 0, + "type": "decimal" + }, + { + "name": "Duration", + "precision": 2, + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n concat(databricksWorkspaceUrl, '/jobs/', databricksJobId, '/runs/', databricksJobRunTaskRunId) AS databricksJobRunURL\nFROM SparkJob\nSELECT\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS Workspace,\n databricksClusterName AS Cluster,\n sparkAppName AS 'Spark App Name',\n description AS 'Job Description',\n jobId AS 'Spark Job ID',\n duration / 1000 AS Duration,\n databricksJobId AS 'Databricks Job ID',\n databricksJobRunTaskRunId as 'Databricks Task Run ID',\n databricksJobRunURL AS 'Databricks Link'\nWHERE event = 'complete' AND databricksJobId IS NOT NULL AND databricksJobRunTaskRunId IS NOT NULL AND databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}}\nLIMIT MAX " + } + ], 
+ "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Spark stages by job run", + "layout": { + "column": 1, + "row": 5, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Spark Stage ID", + "type": "decimal" + }, + { + "name": "Duration", + "precision": 2, + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n concat(databricksWorkspaceUrl, '/jobs/', databricksJobId, '/runs/', databricksJobRunTaskRunId) AS databricksJobRunURL\nFROM SparkStage\nSELECT\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS Workspace,\n databricksClusterName AS Cluster,\n sparkAppName AS 'Spark App Name',\n jobDescription,\n stageId AS 'Spark Stage ID',\n duration / 1000 AS Duration,\n databricksJobId AS 'Job ID',\n databricksJobRunTaskRunId as 'Task Run ID',\n databricksJobRunURL AS 'Databricks Link'\nWHERE event = 'complete' AND databricksJobId IS NOT NULL AND databricksJobRunTaskRunId IS NOT NULL AND databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}}\nLIMIT MAX " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Spark tasks by job run", + "layout": { + "column": 1, + "row": 9, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Spark Task ID", + "type": "decimal" + }, + { + "name": "Duration", + "precision": 2, + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n concat(databricksWorkspaceUrl, '/jobs/', databricksJobId, '/runs/', databricksJobRunTaskRunId) AS databricksJobRunURL\nFROM SparkTask\nSELECT\n substring(databricksWorkspaceName, 0, 
position(databricksWorkspaceName, '.')) AS Workspace,\n databricksClusterName AS Cluster,\n sparkAppName AS 'Spark App Name',\n jobDescription,\n taskId AS 'Spark Task ID',\n duration / 1000 AS Duration,\n databricksJobId AS 'Job ID',\n databricksJobRunTaskRunId as 'Task Run ID',\n databricksJobRunURL AS 'Databricks Link'\nWHERE event = 'complete' AND databricksJobId IS NOT NULL AND databricksJobRunTaskRunId IS NOT NULL AND databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}}\nLIMIT MAX " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + } + ], + "variables": [ + { + "name": "workspace", + "items": null, + "defaultValues": [], + "nrqlQuery": { + "accountIds": [], + "query": "FROM DatabricksJobRunSummary\nSELECT uniques(databricksWorkspaceName)" + }, + "options": { + "ignoreTimeRange": true, + "excluded": true + }, + "title": "Workspace", + "type": "NRQL", + "isMultiSelection": false, + "replacementStrategy": "STRING" + }, + { + "name": "cluster", + "items": null, + "defaultValues": [], + "nrqlQuery": { + "accountIds": [], + "query": "FROM DatabricksTaskRun\nSELECT uniques(databricksClusterName)" + }, + "options": { + "ignoreTimeRange": true, + "excluded": true + }, + "title": "Cluster", + "type": "NRQL", + "isMultiSelection": false, + "replacementStrategy": "STRING" + } + ] +} diff --git a/dashboards/databricks-job-runs/databricks-job-runs01.png b/dashboards/databricks-job-runs/databricks-job-runs01.png new file mode 100644 index 0000000000..2fe4188b51 Binary files /dev/null and b/dashboards/databricks-job-runs/databricks-job-runs01.png differ diff --git a/dashboards/databricks-job-runs/databricks-job-runs02.png b/dashboards/databricks-job-runs/databricks-job-runs02.png new file mode 100644 index 0000000000..672396bd40 Binary files /dev/null and b/dashboards/databricks-job-runs/databricks-job-runs02.png differ diff --git a/dashboards/databricks-pipeline-updates/databricks-pipeline-updates.json 
b/dashboards/databricks-pipeline-updates/databricks-pipeline-updates.json new file mode 100644 index 0000000000..1d0c5f5506 --- /dev/null +++ b/dashboards/databricks-pipeline-updates/databricks-pipeline-updates.json @@ -0,0 +1,1102 @@ +{ + "name": "Databricks Pipeline Updates", + "description": "Monitor the status and performance of your Databricks pipeline updates, including update and flow durations and flow data quality metrics.", + "pages": [ + { + "name": "Pipeline Updates", + "description": null, + "widgets": [ + { + "title": "Pipelines running", + "layout": { + "column": 1, + "row": 1, + "width": 4, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksPipelineSummary \nSELECT latest(runningPipelineCount) AS 'Pipelines'\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Pipelines failed", + "layout": { + "column": 5, + "row": 1, + "width": 4, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksPipelineSummary \nSELECT latest(failedPipelineCount) AS 'Pipelines'\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Pipelines idle", + "layout": { + "column": 9, + "row": 1, + "width": 4, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksPipelineSummary \nSELECT latest(idlePipelineCount) AS 'Pipelines'\nWHERE databricksWorkspaceName = {{workspace}}" + } + 
], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Updates running", + "layout": { + "column": 1, + "row": 3, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksPipelineSummary \nSELECT latest(runningUpdateCount) AS 'Updates'\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Updates waiting for resources", + "layout": { + "column": 3, + "row": 3, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksPipelineSummary \nSELECT latest(waitingForResourcesUpdateCount) AS 'Updates'\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Updates setting up tables", + "layout": { + "column": 5, + "row": 3, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksPipelineSummary \nSELECT latest(settingUpTablesUpdateCount) AS 'Updates'\nWHERE databricksWorkspaceName = {{workspace}}" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Updates completed", + "layout": { + "column": 7, + "row": 3, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksPipelineUpdate\nSELECT 
count(*) AS 'Updates'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete' AND status = 'COMPLETED'\nLIMIT MAX" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Updates failed", + "layout": { + "column": 9, + "row": 3, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksPipelineUpdate\nSELECT count(*) AS 'Updates'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete' AND status = 'FAILED'\nLIMIT MAX" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Updates canceled", + "layout": { + "column": 11, + "row": 3, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksPipelineUpdate\nSELECT count(*) AS 'Updates'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete' AND status = 'CANCELED'\nLIMIT MAX" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Update count by workspace", + "layout": { + "column": 1, + "row": 5, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, 
position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineUpdate\nSELECT filter(count(*), WHERE event = 'start') AS 'Updates started', filter(count(*), WHERE event = 'complete') AS 'Updates complete'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}}\nFACET workspace \nTIMESERIES\nLIMIT MAX" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average update duration by workspace", + "layout": { + "column": 7, + "row": 5, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineUpdate\nSELECT average(duration) / 1000 AS 'Update duration'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete' \nFACET workspace\nTIMESERIES\nLIMIT MAX" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Running updates", + "layout": { + "column": 1, + "row": 8, + "width": 12, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS 
workspace,\n concat(databricksWorkspaceUrl, '/pipelines/', databricksPipelineId, '/updates/', databricksPipelineUpdateId) AS databricksPipelineUpdateUrl\nSELECT count(*)\nFROM (\n FROM DatabricksPipelineUpdate\n SELECT\n uniqueCount(event) as 'total',\n latest(event) as 'state'\n FACET databricksWorkspaceName, databricksWorkspaceUrl, databricksPipelineId, pipelineName, databricksPipelineUpdateId, toDatetime(creationTime, 'MMMM dd, yyyy HH:mm:ss') AS creationTime\n ORDER BY max(timestamp)\n LIMIT 100)\nWHERE total = 1 AND state = 'start'\nFACET workspace AS Workspace, pipelineName AS Pipeline, substring(databricksPipelineUpdateId, 0, 6) as Update, creationTime AS 'Update Creation Time', databricksPipelineUpdateUrl AS 'Databricks Link'\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Recent updates", + "layout": { + "column": 1, + "row": 11, + "width": 12, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Duration", + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n substring(databricksPipelineUpdateId, 0, 6) AS update,\n concat(databricksWorkspaceUrl, '/pipelines/', databricksPipelineId, '/updates/', databricksPipelineUpdateId) AS databricksPipelineUpdateUrl\nFROM DatabricksPipelineUpdate\nSELECT workspace,\n databricksClusterName AS Cluster,\n pipelineName AS Pipeline,\n update,\n creationTime,\n completionTime,\n status,\n duration / 1000 AS Duration,\n databricksPipelineUpdateUrl AS 'Databricks Link'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete'\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Average update duration by 
workspace and pipeline", + "layout": { + "column": 1, + "row": 14, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineUpdate \nSELECT average(duration) / 1000 AS 'seconds'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete'\nFACET workspace, pipelineName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average update wait duration by workspace and pipeline", + "layout": { + "column": 5, + "row": 14, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineUpdate\nSELECT average(waitDuration) / 1000 AS 'seconds'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete'\nFACET workspace, pipelineName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + 
"thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average update run duration by workspace and pipeline", + "layout": { + "column": 9, + "row": 14, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineUpdate \nSELECT average(runDuration) / 1000 AS 'seconds'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete'\nFACET workspace, pipelineName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Running flows", + "layout": { + "column": 1, + "row": 17, + "width": 12, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n concat(databricksWorkspaceUrl, '/pipelines/', databricksPipelineId, '/updates/', databricksPipelineUpdateId) AS databricksPipelineUpdateUrl\nSELECT count(*)\nFROM (\n FROM DatabricksPipelineFlow\n SELECT\n uniqueCount(event) as 'total',\n latest(event) as 'state'\n FACET databricksWorkspaceName, 
databricksWorkspaceUrl, databricksPipelineId, pipelineName, databricksPipelineUpdateId, databricksPipelineFlowName, toDatetime(queueStartTime, 'MMMM dd, yyyy HH:mm:ss') AS queueStartTime\n ORDER BY max(timestamp)\n LIMIT 100)\nWHERE total = 1 AND state = 'start'\nFACET workspace AS Workspace, pipelineName AS Pipeline, substring(databricksPipelineUpdateId, 0, 6) as Update, databricksPipelineFlowName AS Flow, queueStartTime AS 'Queue Start Time', databricksPipelineUpdateUrl AS 'Databricks Link'\nLIMIT 100 " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Recent flows", + "layout": { + "column": 1, + "row": 20, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Duration", + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n substring(databricksPipelineUpdateId, 0, 6) AS update,\n concat(databricksWorkspaceUrl, '/pipelines/', databricksPipelineId, '/updates/', databricksPipelineUpdateId) AS databricksPipelineUpdateUrl\nFROM DatabricksPipelineFlow\nSELECT workspace,\n databricksClusterName AS Cluster,\n pipelineName AS Pipeline,\n update,\n databricksPipelineFlowName AS Flow,\n queueStartTime,\n completionTime,\n status,\n duration / 1000 AS Duration,\n databricksPipelineUpdateUrl AS 'Databricks Link'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete'\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Average flow duration by workspace, pipeline, and flow", + "layout": { + "column": 1, + "row": 24, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + 
"facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineFlow \nSELECT average(duration) / 1000 AS 'seconds'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete'\nFACET workspace, pipelineName, databricksPipelineFlowName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average flow queue duration by workspace, pipeline, and flow", + "layout": { + "column": 5, + "row": 24, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineFlow \nSELECT average(queueDuration) / 1000 AS 'seconds'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete'\nFACET workspace, pipelineName, databricksPipelineFlowName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + 
"zero": true + } + } + }, + { + "title": "Average flow plan duration by workspace, pipeline, and flow", + "layout": { + "column": 9, + "row": 24, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineFlow\nSELECT average(planDuration) / 1000 AS 'seconds'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete'\nFACET workspace, pipelineName, databricksPipelineFlowName\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average rows written by workspace, pipeline, and flow", + "layout": { + "column": 1, + "row": 27, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineFlow\nSELECT average(outputRowCount)\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete'\nFACET workspace, pipelineName, databricksPipelineFlowName" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Average records dropped by workspace, pipeline, and flow", + "layout": { 
+ "column": 7, + "row": 27, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineFlow\nSELECT average(droppedRecordCount)\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete'\nFACET workspace, pipelineName, databricksPipelineFlowName" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Average backlog bytes by workspace, pipeline, and flow", + "layout": { + "column": 1, + "row": 30, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineFlow\nSELECT average(backlogBytes)\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete'\nFACET workspace, pipelineName, databricksPipelineFlowName" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Average backlog files by workspace, pipeline, and flow", + "layout": { + "column": 7, + "row": 30, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace\nFROM DatabricksPipelineFlow\nSELECT average(backlogFileCount)\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} 
AND event = 'complete'\nFACET workspace, pipelineName, databricksPipelineFlowName" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Expectation records passed", + "layout": { + "column": 1, + "row": 33, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Records", + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n substring(databricksPipelineUpdateId, 0, 6) AS update\nFROM DatabricksPipelineFlowExpectation\nSELECT workspace, databricksClusterName AS Cluster, pipelineName AS Pipeline, update, databricksPipelineFlowName AS Flow, name AS Expectation, dataset, passedRecordCount AS Records\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND failedRecordCount = 0\nLIMIT MAX\n " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Expectation records failed", + "layout": { + "column": 1, + "row": 37, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Records", + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n substring(databricksPipelineUpdateId, 0, 6) AS update\nFROM DatabricksPipelineFlowExpectation\nSELECT workspace, databricksClusterName AS Cluster, pipelineName AS Pipeline, update, databricksPipelineFlowName AS Flow, name AS Expectation, dataset, failedRecordCount AS Records\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND 
failedRecordCount > 0\nLIMIT MAX\n " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + }, + { + "name": "Spark Metrics", + "description": null, + "widgets": [ + { + "title": "Spark jobs by pipeline update", + "layout": { + "column": 1, + "row": 1, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Spark Job ID", + "precision": 0, + "type": "decimal" + }, + { + "name": "Duration", + "precision": 2, + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n substring(databricksPipelineUpdateId, 0, 6) as update,\n concat(databricksWorkspaceUrl, '/pipelines/', databricksPipelineId, '/updates/', databricksPipelineUpdateId) AS databricksPipelineUpdateUrl\nFROM SparkJob\nSELECT\n workspace,\n databricksClusterName AS Cluster,\n sparkAppName AS 'Spark App Name',\n description AS 'Job Description',\n jobId AS 'Spark Job ID',\n duration / 1000 AS Duration,\n databricksPipelineId AS 'Pipeline ID',\n update as 'Update',\n databricksPipelineUpdateUrl AS 'Databricks Link'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete' AND databricksPipelineId IS NOT NULL AND databricksPipelineUpdateId IS NOT NULL\nLIMIT MAX " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Spark stages by pipeline update", + "layout": { + "column": 1, + "row": 5, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "Spark Stage ID", + "precision": 0, + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": 
false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n substring(databricksPipelineUpdateId, 0, 6) as update,\n concat(databricksWorkspaceUrl, '/pipelines/', databricksPipelineId, '/updates/', databricksPipelineUpdateId) AS databricksPipelineUpdateUrl\nFROM SparkStage\nSELECT\n workspace,\n databricksClusterName AS Cluster,\n sparkAppName AS 'Spark App Name',\n jobDescription,\n stageId AS 'Spark Stage ID',\n duration / 1000 AS Duration,\n databricksPipelineId AS 'Pipeline ID',\n update as 'Update',\n databricksPipelineUpdateUrl AS 'Databricks Link'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete' AND databricksPipelineId IS NOT NULL AND databricksPipelineUpdateId IS NOT NULL\nLIMIT MAX " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Spark tasks by pipeline update", + "layout": { + "column": 1, + "row": 9, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "Spark Task ID", + "precision": 0, + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n substring(databricksWorkspaceName, 0, position(databricksWorkspaceName, '.')) AS workspace,\n substring(databricksPipelineUpdateId, 0, 6) as update,\n concat(databricksWorkspaceUrl, '/pipelines/', databricksPipelineId, '/updates/', databricksPipelineUpdateId) AS databricksPipelineUpdateUrl\nFROM SparkTask\nSELECT\n workspace,\n databricksClusterName AS Cluster,\n sparkAppName AS 'Spark App Name',\n jobDescription,\n taskId AS 'Spark Task ID',\n duration / 1000 AS Duration,\n databricksPipelineId AS 'Pipeline ID',\n update as 'Update',\n 
databricksPipelineUpdateUrl AS 'Databricks Link'\nWHERE databricksWorkspaceName = {{workspace}} AND databricksClusterName = {{cluster}} AND event = 'complete' AND databricksPipelineId IS NOT NULL AND databricksPipelineUpdateId IS NOT NULL\nLIMIT MAX " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + } + ], + "variables": [ + { + "name": "workspace", + "items": null, + "defaultValues": [], + "nrqlQuery": { + "accountIds": [], + "query": "FROM DatabricksPipelineSummary\nSELECT uniques(databricksWorkspaceName)" + }, + "options": { + "ignoreTimeRange": true, + "excluded": true + }, + "title": "Workspace", + "type": "NRQL", + "isMultiSelection": false, + "replacementStrategy": "STRING" + }, + { + "name": "cluster", + "items": null, + "defaultValues": [], + "nrqlQuery": { + "accountIds": [], + "query": "FROM DatabricksPipelineUpdate SELECT uniques(databricksClusterName)" + }, + "options": { + "ignoreTimeRange": true, + "excluded": true + }, + "title": "Cluster", + "type": "NRQL", + "isMultiSelection": false, + "replacementStrategy": "STRING" + } + ] +} diff --git a/dashboards/databricks-pipeline-updates/databricks-pipeline-updates01.png b/dashboards/databricks-pipeline-updates/databricks-pipeline-updates01.png new file mode 100644 index 0000000000..8c4ed8fa67 Binary files /dev/null and b/dashboards/databricks-pipeline-updates/databricks-pipeline-updates01.png differ diff --git a/dashboards/databricks-query-metrics/databricks-query-metrics.json b/dashboards/databricks-query-metrics/databricks-query-metrics.json new file mode 100644 index 0000000000..5e98ef8ca8 --- /dev/null +++ b/dashboards/databricks-query-metrics/databricks-query-metrics.json @@ -0,0 +1,840 @@ +{ + "name": "Databricks Queries", + "description": "Monitor the performance of your Databricks SQL queries, including query completion rates, latencies, error rates, and I/O metrics.", + "pages": [ + { + "name": "Databricks Queries", + "description": null, + "widgets": [ + { + "title": 
"Completed queries", + "layout": { + "column": 1, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Queries", + "precision": 3, + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT count(*) AS Queries\nWHERE status = 'FINISHED'\n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Failed queries", + "layout": { + "column": 3, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Queries", + "precision": 3, + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT count(*) AS Queries\nWHERE status = 'FAILED'\n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Queries with data spilling", + "layout": { + "column": 5, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Queries", + "precision": 3, + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT count(*) AS Queries\nWHERE diskBytesSpilled > 0\n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Queries with shuffling", + "layout": { + "column": 7, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Queries", + "precision": 3, + "type": "decimal" + } + ], + "facet": { + 
"showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT count(*) AS Queries\nWHERE networkBytesSent > 0\n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Queries with pruning", + "layout": { + "column": 9, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Queries", + "precision": 3, + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT count(*) AS Queries\nWHERE bytesPruned > 0 OR filesPruned > 0\n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Queries queued due to overloading", + "layout": { + "column": 11, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Queries", + "precision": 3, + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT count(*) AS Queries\nWHERE overloadingQueueStartTime > 0\n" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Queries per second", + "layout": { + "column": 1, + "row": 3, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT rate(count(*), 1 second) AS Queries\nWHERE status = 
'FINISHED'\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average latency", + "layout": { + "column": 5, + "row": 3, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT average(duration) / 1000 AS Duration\nTIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Error rate", + "layout": { + "column": 9, + "row": 3, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT percentage(count(*), WHERE status = 'FAILED')\nTIMESERIES 1 minute" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Average latency by query", + "layout": { + "column": 1, + "row": 6, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": 
"viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT average(duration) / 1000 AS Duration\nTIMESERIES\nFACET query" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Error rate by query", + "layout": { + "column": 7, + "row": 6, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT percentage(count(*), WHERE status = 'FAILED')\nTIMESERIES 1 minute\nFACET query" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Query executions by warehouse", + "layout": { + "column": 1, + "row": 9, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM 
DatabricksQuery\nSELECT count(*) AS Executions\nTIMESERIES\nFACET warehouseName" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "COUNT" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Total duration by warehouse", + "layout": { + "column": 7, + "row": 9, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT sum(duration) / 1000 AS Duration\nTIMESERIES\nFACET warehouseName" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Query executions by user", + "layout": { + "column": 1, + "row": 12, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT count(*) AS Executions\nTIMESERIES\nFACET userName" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "COUNT" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": 
"Total duration by user", + "layout": { + "column": 7, + "row": 12, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM DatabricksQuery\nSELECT sum(duration) / 1000 AS Duration\nTIMESERIES\nFACET userName" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "units": { + "unit": "SECONDS" + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Query history", + "layout": { + "column": 1, + "row": 15, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Duration", + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n concat(workspaceUrl, '/sql/warehouses/', warehouseId, '/monitoring?queryId=', id) AS queryLink\nFROM DatabricksQuery\nSELECT\n status as Status,\n substring(query, 0, 100) as Query,\n startTime as Started,\n duration as Duration,\n userName as User,\n queryLink" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Failed queries", + "layout": { + "column": 1, + "row": 19, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Duration", + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n concat(workspaceUrl, '/sql/warehouses/', warehouseId, 
'/monitoring?queryId=', id) AS queryLink\nFROM DatabricksQuery\nSELECT\n substring(query, 0, 100) as Query,\n errorMessage as 'Error Message',\n queryLink\nWHERE error = true" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Rows returned versus read", + "layout": { + "column": 1, + "row": 23, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Ratio %", + "precision": 2, + "type": "humanized" + }, + { + "name": "Rows Read", + "type": "decimal" + }, + { + "name": "Rows Returned", + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n concat(workspaceUrl, '/sql/warehouses/', warehouseId, '/monitoring?queryId=', id) AS queryLink,\n rowsReturned / rowsRead * 100 as ratio\nFROM DatabricksQuery\nSELECT\n rowsReturned AS 'Rows Returned',\n rowsRead AS 'Rows Read',\n ratio AS 'Ratio %',\n substring(query, 0, 100) as Query,\n queryLink AS 'Query Link'\nORDER BY ratio ASC\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Queries with data spilling", + "layout": { + "column": 1, + "row": 27, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "rowsReturned", + "type": "decimal" + }, + { + "name": "rowsRead", + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n concat(workspaceUrl, '/sql/warehouses/', warehouseId, '/monitoring?queryId=', id) AS queryLink\nFROM DatabricksQuery\nSELECT\n diskBytesSpilled,\n query,\n queryLink\nWHERE diskBytesSpilled > 0\nORDER BY diskBytesSpilled DESC\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Queries with 
data shuffling", + "layout": { + "column": 1, + "row": 31, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "rowsReturned", + "type": "decimal" + }, + { + "name": "rowsRead", + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH\n concat(workspaceUrl, '/sql/warehouses/', warehouseId, '/monitoring?queryId=', id) AS queryLink\nFROM DatabricksQuery\nSELECT\n networkBytesSent,\n query,\n queryLink\nWHERE networkBytesSent > 0\nORDER BY networkBytesSent DESC\nLIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + } + ], + "variables": [] +} diff --git a/dashboards/databricks-query-metrics/databricks-query-metrics01.png b/dashboards/databricks-query-metrics/databricks-query-metrics01.png new file mode 100644 index 0000000000..d3e4cecee2 Binary files /dev/null and b/dashboards/databricks-query-metrics/databricks-query-metrics01.png differ diff --git a/dashboards/databricks-spark/databricks-spark.json b/dashboards/databricks-spark/databricks-spark.json index f75fb807b9..cfa498a885 100644 --- a/dashboards/databricks-spark/databricks-spark.json +++ b/dashboards/databricks-spark/databricks-spark.json @@ -1,18 +1,18 @@ { - "name": "Databricks Spark", - "description": null, + "name": "Apache Spark", + "description": "Monitor the performance of your Apache Spark applications running on Databricks. 
Gain insights into job execution, stage performance, task efficiency, and executor utilization to optimize your Spark workloads.", "pages": [ { "name": "Jobs", "description": null, "widgets": [ { - "title": "Jobs Running", + "title": "Jobs succeeded", "layout": { "column": 1, "row": 1, - "width": 3, - "height": 3 + "width": 2, + "height": 2 }, "linkedEntityGuids": null, "visualization": { @@ -25,7 +25,7 @@ "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.jobs) AS 'Jobs' WHERE sparkAppJobStatus = 'running'" + "query": "FROM SparkJob\nSELECT count(*) AS 'Jobs'\nWHERE status = 'succeeded'" } ], "platformOptions": { @@ -34,12 +34,12 @@ } }, { - "title": "Stages Running", + "title": "Jobs failed", "layout": { - "column": 4, + "column": 3, "row": 1, - "width": 3, - "height": 3 + "width": 2, + "height": 2 }, "linkedEntityGuids": null, "visualization": { @@ -52,7 +52,7 @@ "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.stages) AS 'Stages' WHERE sparkAppStageStatus = 'active'" + "query": "FROM SparkJob\nSELECT count(*) AS 'Jobs'\nWHERE status = 'failed'" } ], "platformOptions": { @@ -61,12 +61,39 @@ } }, { - "title": "Tasks Running", + "title": "Stages completed", + "layout": { + "column": 5, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SparkStage\nSELECT count(*) AS 'Stages'\nWHERE status = 'complete'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Stages failed", "layout": { "column": 7, "row": 1, - "width": 3, - "height": 3 + "width": 2, + "height": 2 }, "linkedEntityGuids": null, "visualization": { @@ -79,7 +106,7 @@ "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.job.tasks) AS 'Tasks' WHERE sparkAppTaskStatus = 
'active'" + "query": "FROM SparkStage\nSELECT count(*) AS 'Stages'\nWHERE status = 'failed'" } ], "platformOptions": { @@ -88,12 +115,12 @@ } }, { - "title": "Executors", + "title": "Tasks completed", "layout": { - "column": 10, + "column": 9, "row": 1, - "width": 3, - "height": 3 + "width": 2, + "height": 2 }, "linkedEntityGuids": null, "visualization": { @@ -106,7 +133,7 @@ "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT uniqueCount(sparkAppExecutorId) AS 'Executors' WHERE metricName = 'spark.app.executor.maxMemory'" + "query": "FROM SparkTask\nSELECT count(*) AS 'Tasks'\nWHERE status = 'success'" } ], "platformOptions": { @@ -115,11 +142,38 @@ } }, { - "title": "Jobs By Status", + "title": "Tasks failed", + "layout": { + "column": 11, + "row": 1, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SparkTask\nSELECT count(*) AS 'Tasks'\nWHERE status = 'failed'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Job count", "layout": { "column": 1, - "row": 4, - "width": 4, + "row": 3, + "width": 6, "height": 3 }, "linkedEntityGuids": null, @@ -133,15 +187,20 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.jobs) AS 'Jobs' FACET sparkAppJobStatus TIMESERIES" + "query": "FROM SparkJob\nSELECT filter(count(*), WHERE event = 'start') AS 'Jobs started', filter(count(*), WHERE event = 'complete') AS 'Jobs complete'\nLIMIT MAX\nTIMESERIES" } ], - "nullValues": { - "nullValue": "zero" - }, "platformOptions": { "ignoreTimeRange": false }, @@ -157,11 +216,11 @@ } }, { - "title": "Job Tasks By Status", + 
"title": "Average job duration", "layout": { - "column": 5, - "row": 4, - "width": 4, + "column": 7, + "row": 3, + "width": 6, "height": 3 }, "linkedEntityGuids": null, @@ -175,21 +234,29 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.job.tasks) AS 'Tasks' FACET sparkAppTaskStatus TIMESERIES" + "query": "FROM SparkJob\nSELECT average(duration / 1000) AS 'Job duration'\nWHERE event = 'complete'\nLIMIT MAX\nTIMESERIES" } ], - "nullValues": { - "nullValue": "zero" - }, "platformOptions": { "ignoreTimeRange": false }, "thresholds": { "isLabelVisible": true }, + "units": { + "unit": "SECONDS" + }, "yAxisLeft": { "zero": true }, @@ -199,60 +266,315 @@ } }, { - "title": "Job Stages By Status", + "title": "Running jobs", "layout": { - "column": 9, - "row": 4, - "width": 4, + "column": 1, + "row": 6, + "width": 6, "height": 3 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": "viz.table" }, "rawConfiguration": { + "dataFormatters": [ + { + "name": "jobId", + "type": "decimal" + }, + { + "name": "taskCount", + "type": "decimal" + }, + { + "name": "Skipped Stages", + "type": "decimal" + }, + { + "name": "Failed Stages", + "type": "decimal" + }, + { + "name": "Completed Stages", + "type": "decimal" + }, + { + "name": "jobDuration", + "precision": null, + "type": "humanized" + } + ], "facet": { "showOtherSeries": false }, - "legend": { - "enabled": true + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT count(*)\nFROM (\n FROM SparkJob\n SELECT\n uniqueCount(event) as 'total',\n latest(event) as 'state'\n FACET jobId, description, submissionTime\n ORDER BY max(timestamp)\n LIMIT MAX)\nWHERE total = 1 AND state = 'start'\nFACET jobId AS 'Job ID', description\nLIMIT MAX" + } + ], + "platformOptions": { + 
"ignoreTimeRange": false + } + } + }, + { + "title": "Running stages", + "layout": { + "column": 7, + "row": 6, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "jobId", + "type": "decimal" + }, + { + "name": "taskCount", + "type": "decimal" + }, + { + "name": "Skipped Stages", + "type": "decimal" + }, + { + "name": "Failed Stages", + "type": "decimal" + }, + { + "name": "Completed Stages", + "type": "decimal" + }, + { + "name": "jobDuration", + "precision": null, + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.job.stages) AS 'Stages' FACET sparkAppStageStatus TIMESERIES" + "query": "SELECT count(*)\nFROM (\n FROM SparkStage\n SELECT\n uniqueCount(event) as 'total',\n latest(event) as 'state'\n FACET stageId, jobDescription, firstTaskLaunchedTime\n ORDER BY max(timestamp)\n LIMIT MAX)\nWHERE total = 1 AND state = 'start'\nFACET stageId AS 'Stage ID', jobDescription\nLIMIT MAX" } ], - "nullValues": { - "nullValue": "zero" + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Completed jobs by duration", + "layout": { + "column": 1, + "row": 9, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "jobId", + "type": "decimal" + }, + { + "name": "taskCount", + "type": "decimal" + }, + { + "name": "Skipped Stages", + "type": "decimal" + }, + { + "name": "Failed Stages", + "type": "decimal" + }, + { + "name": "Completed Stages", + "type": "decimal" + }, + { + "name": "jobDuration", + "precision": null, + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH duration / 1000 AS jobDuration FROM SparkJob\nSELECT sparkAppName AS 
Application, jobId AS 'Job ID', description, status, submissionTime, completionTime, completedStageCount AS 'Completed Stages', failedStageCount AS 'Failed Stages', skippedStageCount AS 'Skipped Stages', taskCount\nWHERE event = 'complete'\nLIMIT MAX\nORDER BY jobDuration DESC" + } + ], "platformOptions": { "ignoreTimeRange": false + } + } + }, + { + "title": "Completed stages by duration", + "layout": { + "column": 1, + "row": 13, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "attemptId", + "type": "decimal" + }, + { + "name": "stageId", + "type": "decimal" + }, + { + "name": "Killed Tasks", + "type": "decimal" + }, + { + "name": "Failed Tasks", + "type": "decimal" + }, + { + "name": "Completed Tasks", + "type": "decimal" + }, + { + "name": "stageDuration", + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false }, - "thresholds": { - "isLabelVisible": true + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH duration / 1000 AS stageDuration FROM SparkStage\nSELECT sparkAppName AS Application, stageId AS 'Stage ID', jobDescription, attemptId AS 'Attempt ID', status, firstTaskLaunchedTime, completionTime, completedTaskCount AS 'Completed Tasks', failedTaskCount AS 'Failed Tasks', killedTaskCount AS 'Killed Tasks'\nWHERE event = 'complete'\nLIMIT MAX\nORDER BY stageDuration DESC" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Failed stage details", + "layout": { + "column": 1, + "row": 17, + "width": 12, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false }, - "yAxisLeft": { - "zero": true + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SparkStage\nSELECT sparkAppName as Application, stageId AS 'Stage ID', jobDescription, attemptId AS 'Attempt ID', 
firstTaskLaunchedTime, completionTime, stageName, details\nWHERE event = 'complete' AND status = 'failed'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Completed tasks by duration", + "layout": { + "column": 1, + "row": 20, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "taskDuration", + "type": "humanized" + }, + { + "name": "gettingResultDuration", + "type": "decimal" + }, + { + "name": "schedulerDelay", + "type": "decimal" + }, + { + "name": "executorId", + "type": "decimal" + }, + { + "name": "taskId", + "type": "decimal" + }, + { + "name": "stageAttemptId", + "type": "decimal" + }, + { + "name": "attemptId", + "type": "decimal" + }, + { + "name": "stageId", + "type": "decimal" + } + ], + "facet": { + "showOtherSeries": false }, - "yAxisRight": { - "zero": true + "nrqlQueries": [ + { + "accountIds": [], + "query": "WITH duration / 1000 AS taskDuration FROM SparkTask\nSELECT sparkAppName AS Application, stageId AS 'Stage ID', stageAttemptId AS 'Stage Attempt ID', taskId AS 'Task ID', attemptId AS 'Attempt ID', executorId AS 'Executor ID', status, launchTime, completionTime, schedulerDelay, gettingResultDuration\nWHERE event = 'complete'\nLIMIT MAX\nORDER BY taskDuration DESC" + } + ], + "platformOptions": { + "ignoreTimeRange": false } } - } - ] - }, - { - "name": "Stages", - "description": null, - "widgets": [ + }, { - "title": "Average Executor Total Task Run Time", + "title": "Stage I/O rate per second", "layout": { "column": 1, - "row": 1, - "width": 4, - "height": 3 + "row": 24, + "width": 6, + "height": 4 }, "linkedEntityGuids": null, "visualization": { @@ -265,10 +587,18 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": 
[], - "query": "FROM Metric SELECT average(spark.app.stage.executor.runTime) WHERE spark.app.stage.executor.runTime IS NOT NULL TIMESERIES" + "query": "FROM SparkStage\nSELECT rate(sum(inputBytes), 1 second) AS 'Bytes in', rate(sum(inputRecords), 1 second) AS 'Records in', rate(sum(outputBytes), 1 second) AS 'Bytes out', rate(sum(outputRecords), 1 second) AS 'Records out'\nWHERE event = 'complete'\nTIMESERIES " } ], "platformOptions": { @@ -278,7 +608,24 @@ "isLabelVisible": true }, "units": { - "unit": "MS" + "seriesOverrides": [ + { + "seriesName": "Bytes in", + "unit": "BYTES_PER_SECOND" + }, + { + "seriesName": "Records in", + "unit": "REQUESTS_PER_SECOND" + }, + { + "seriesName": "Bytes out", + "unit": "BYTES_PER_SECOND" + }, + { + "seriesName": "Records out", + "unit": "REQUESTS_PER_SECOND" + } + ] }, "yAxisLeft": { "zero": true @@ -289,12 +636,12 @@ } }, { - "title": "Average Executor Total Task CPU Time", + "title": "Average stage shuffle I/O rate per second", "layout": { - "column": 5, - "row": 1, - "width": 4, - "height": 3 + "column": 7, + "row": 24, + "width": 6, + "height": 4 }, "linkedEntityGuids": null, "visualization": { @@ -307,10 +654,18 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.stage.executor.cpuTime) / 1000000 WHERE spark.app.stage.executor.cpuTime IS NOT NULL TIMESERIES" + "query": "FROM SparkStage\nSELECT rate(sum(shuffleReadBytes), 1 second) AS 'Bytes read', rate(sum(shuffleReadRecords), 1 second) AS 'Records read', rate(sum(shuffleWriteBytes), 1 second) AS 'Bytes written', rate(sum(shuffleWriteRecords), 1 second) AS 'Records written'\nWHERE event = 'complete'\nTIMESERIES " } ], "platformOptions": { @@ -320,7 +675,24 @@ "isLabelVisible": true }, "units": { - "unit": "MS" + "seriesOverrides": [ + { +
"seriesName": "Bytes read", + "unit": "BYTES_PER_SECOND" + }, + { + "seriesName": "Records read", + "unit": "REQUESTS_PER_SECOND" + }, + { + "seriesName": "Bytes written", + "unit": "BYTES_PER_SECOND" + }, + { + "seriesName": "Records written", + "unit": "REQUESTS_PER_SECOND" + } + ] }, "yAxisLeft": { "zero": true @@ -331,12 +703,12 @@ } }, { - "title": "Average JVM GC Time", + "title": "Average stage memory bytes spilled to disk", "layout": { - "column": 9, - "row": 1, - "width": 4, - "height": 3 + "column": 1, + "row": 28, + "width": 6, + "height": 4 }, "linkedEntityGuids": null, "visualization": { @@ -349,10 +721,18 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.stage.jvmGcTime) WHERE spark.app.stage.jvmGcTime IS NOT NULL TIMESERIES" + "query": "FROM SparkStage\nSELECT average(memorySpilledBytes) AS 'Memory bytes spilled'\nWHERE event = 'complete'\nTIMESERIES " } ], "platformOptions": { @@ -362,7 +742,24 @@ "isLabelVisible": true }, "units": { - "unit": "MS" + "seriesOverrides": [ + { + "seriesName": "Bytes In", + "unit": "BYTES_PER_SECOND" + }, + { + "seriesName": "Records In", + "unit": "REQUESTS_PER_SECOND" + }, + { + "seriesName": "Bytes Out", + "unit": "BYTES_PER_SECOND" + }, + { + "seriesName": "Records Out", + "unit": "REQUESTS_PER_SECOND" + } + ] }, "yAxisLeft": { "zero": true @@ -373,12 +770,12 @@ } }, { - "title": "Average Task Deserialization Time", + "title": "Average stage disk bytes spilled during shuffles", "layout": { - "column": 1, - "row": 4, - "width": 4, - "height": 3 + "column": 7, + "row": 28, + "width": 6, + "height": 4 }, "linkedEntityGuids": null, "visualization": { @@ -391,10 +788,18 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, +
"relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.stage.executor.deserializeTime) WHERE spark.app.stage.executor.deserializeTime IS NOT NULL TIMESERIES" + "query": "FROM SparkStage\nSELECT average(diskSpilledBytes) AS 'Disk bytes spilled'\nWHERE event = 'complete'\nTIMESERIES " } ], "platformOptions": { @@ -404,7 +809,24 @@ "isLabelVisible": true }, "units": { - "unit": "MS" + "seriesOverrides": [ + { + "seriesName": "Bytes In", + "unit": "BYTES_PER_SECOND" + }, + { + "seriesName": "Records In", + "unit": "REQUESTS_PER_SECOND" + }, + { + "seriesName": "Bytes Out", + "unit": "BYTES_PER_SECOND" + }, + { + "seriesName": "Records Out", + "unit": "REQUESTS_PER_SECOND" + } + ] }, "yAxisLeft": { "zero": true @@ -415,180 +837,284 @@ } }, { - "title": "Average Task Deserialization CPU Time", + "title": "Executor statistics by stage", "layout": { - "column": 5, - "row": 4, - "width": 4, - "height": 3 + "column": 1, + "row": 32, + "width": 12, + "height": 4 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Result Serialization Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "GC Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "attemptId", + "type": "decimal" + }, + { + "name": "stageId", + "type": "decimal" + }, + { + "name": "CPU Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "Run Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "Deserialize CPU Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "Deserialize Duration", + "precision": 2, + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SparkStage\nSELECT sparkAppName as Application, stageId AS 'Stage ID', jobDescription, attemptId AS 
'Attempt ID', submissionTime, executorDeserializeDuration / 1000 AS 'Deserialize Duration', executorDeserializeCpuDuration / 1000000000 AS 'Deserialize CPU Duration', executorRunDuration / 1000 AS 'Run Duration', executorCpuDuration / 1000000000 AS 'CPU Duration', gcDuration / 1000 AS 'GC Duration', resultSerializationDuration / 1000 AS 'Result Serialization Duration'\nWHERE event = 'complete'\nLIMIT MAX" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Executor statistics by task", + "layout": { + "column": 1, + "row": 36, + "width": 12, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" }, "rawConfiguration": { + "dataFormatters": [ + { + "name": "executorId", + "type": "decimal" + }, + { + "name": "Result Serialization Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "GC Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "attemptId", + "type": "decimal" + }, + { + "name": "taskId", + "type": "decimal" + }, + { + "name": "stageAttemptId", + "type": "decimal" + }, + { + "name": "stageId", + "type": "decimal" + }, + { + "name": "CPU Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "Run Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "Deserialize CPU Duration", + "precision": 2, + "type": "humanized" + }, + { + "name": "Deserialize Duration", + "precision": 2, + "type": "humanized" + } + ], "facet": { "showOtherSeries": false }, - "legend": { - "enabled": true - }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.stage.executor.deserializeCpuTime) / 1000000 WHERE spark.app.stage.executor.deserializeCpuTime IS NOT NULL TIMESERIES" + "query": "FROM SparkTask\nSELECT sparkAppName as Application, stageId AS 'Stage ID', stageAttemptId AS 'Stage Attempt ID', taskId AS 'Task ID', attemptId AS 'Attempt ID', launchTime, executorId AS 'Executor ID', executorDeserializeDuration / 
1000 AS 'Deserialize Duration', executorDeserializeCpuDuration / 1000000000 AS 'Deserialize CPU Duration', executorRunDuration / 1000 AS 'Run Duration', executorCpuDuration / 1000000000 AS 'CPU Duration', gcDuration / 1000 AS 'GC Duration', resultSerializationDuration / 1000 AS 'Result Serialization Duration'\nWHERE event = 'complete'\nLIMIT MAX" } ], "platformOptions": { "ignoreTimeRange": false - }, - "thresholds": { - "isLabelVisible": true - }, - "units": { - "unit": "MS" - }, - "yAxisLeft": { - "zero": true - }, - "yAxisRight": { - "zero": true } } - }, + } + ] + }, + { + "name": "Executors", + "description": null, + "widgets": [ { - "title": "Average Result Serialization Time", + "title": "Executors", "layout": { - "column": 9, - "row": 4, - "width": 4, - "height": 3 + "column": 1, + "row": 1, + "width": 2, + "height": 2 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": "viz.billboard" }, "rawConfiguration": { "facet": { "showOtherSeries": false }, - "legend": { - "enabled": true - }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.stage.resultSerializationTime) WHERE spark.app.stage.resultSerializationTime IS NOT NULL TIMESERIES" + "query": "FROM SparkExecutorSample\nSELECT uniqueCount(executorId) AS Executors" } ], "platformOptions": { "ignoreTimeRange": false - }, - "thresholds": { - "isLabelVisible": true - }, - "units": { - "unit": "MS" - }, - "yAxisLeft": { - "zero": true - }, - "yAxisRight": { - "zero": true } } }, { - "title": "I/O Bytes", + "title": "Current executors by application", "layout": { - "column": 1, - "row": 7, - "width": 6, - "height": 3 + "column": 3, + "row": 1, + "width": 10, + "height": 2 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": "viz.table" }, "rawConfiguration": { + "dataFormatters": [ + { + "name": "Total Memory (GiB)", + "precision": 2, + "type": "humanized" + }, + { + "name": "maxTasks", + "type": "decimal" + }, + { + "name": 
"coreCount", + "type": "decimal" + } + ], "facet": { "showOtherSeries": false }, - "legend": { - "enabled": true - }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.stage.inputBytes) AS 'Bytes In' WHERE spark.app.stage.inputBytes IS NOT NULL TIMESERIES" - }, - { - "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.stage.outputBytes) AS 'Bytes Out' WHERE spark.app.stage.outputBytes IS NOT NULL TIMESERIES" + "query": "FROM SparkExecutorSample\nSELECT latest(isActive), latest(addTime), latest(coreCount), latest(maxTasks), latest(memoryTotalBytes) / 1024 / 1024 / 1024 AS 'Total Memory (GiB)'\nFACET sparkAppName AS Application, executorId AS 'Executor ID'" } ], "platformOptions": { - "ignoreTimeRange": false - }, - "thresholds": { - "isLabelVisible": true - }, - "yAxisLeft": { - "zero": true - }, - "yAxisRight": { - "zero": true + "ignoreTimeRange": true } } }, { - "title": "I/O Records", + "title": "Current executor task counters by application", "layout": { - "column": 7, - "row": 7, - "width": 6, + "column": 1, + "row": 3, + "width": 12, "height": 3 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": "viz.table" }, "rawConfiguration": { + "dataFormatters": [ + { + "name": "GC Duration (s)", + "type": "humanized" + }, + { + "name": "Task Duration (s)", + "type": "humanized" + }, + { + "name": "Failed Tasks", + "type": "decimal" + }, + { + "name": "Completed Tasks", + "type": "decimal" + }, + { + "name": "Active Tasks", + "type": "decimal" + }, + { + "name": "Total Tasks", + "type": "decimal" + } + ], "facet": { "showOtherSeries": false }, - "legend": { - "enabled": true - }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.stage.inputRecords) AS 'Records In' WHERE spark.app.stage.inputRecords IS NOT NULL TIMESERIES" - }, - { - "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.stage.outputRecords) AS 'Records Out' WHERE 
spark.app.stage.outputRecords IS NOT NULL TIMESERIES" + "query": "FROM SparkExecutorSample\nSELECT latest(taskCount) AS 'Total Tasks', latest(completedTaskCount) AS 'Completed Tasks', latest(failedTaskCount) AS 'Failed Tasks', latest(taskDuration) / 1000 AS 'Task Duration (s)', latest(gcDuration) / 1000 AS 'GC Duration (s)'\nFACET sparkAppName AS Application, executorId AS 'Executor ID'" } ], "platformOptions": { - "ignoreTimeRange": false - }, - "thresholds": { - "isLabelVisible": true - }, - "yAxisLeft": { - "zero": true - }, - "yAxisRight": { - "zero": true + "ignoreTimeRange": true } } }, { - "title": "Shuffle I/O Bytes", + "title": "Storage memory usage by application and executor", "layout": { "column": 1, - "row": 10, + "row": 6, "width": 6, "height": 3 }, @@ -603,14 +1129,18 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.stage.shuffle.readBytes) AS 'Bytes Written' WHERE spark.app.stage.shuffle.readBytes IS NOT NULL TIMESERIES" - }, - { - "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.stage.shuffle.writeBytes) AS 'Bytes Out' WHERE spark.app.stage.shuffle.writeBytes IS NOT NULL TIMESERIES" + "query": "FROM SparkExecutorSample\nSELECT average(memoryUsedBytes)\nFACET sparkAppName, executorId\nTIMESERIES" } ], "platformOptions": { @@ -619,6 +1149,9 @@ "thresholds": { "isLabelVisible": true }, + "units": { + "unit": "BYTES" + }, "yAxisLeft": { "zero": true }, @@ -628,10 +1161,10 @@ } }, { - "title": "Shuffle I/O Records", + "title": "Disk usage by application and executor", "layout": { "column": 7, - "row": 10, + "row": 6, "width": 6, "height": 3 }, @@ -646,14 +1179,18 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": 
true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.stage.shuffle.readRecords) AS 'Records Read' WHERE spark.app.stage.shuffle.readRecords IS NOT NULL TIMESERIES" - }, - { - "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.stage.shuffle.writeRecords) AS 'Records Written' WHERE spark.app.stage.shuffle.writeRecords IS NOT NULL TIMESERIES" + "query": "FROM SparkExecutorSample\nSELECT average(diskUsedBytes)\nFACET sparkAppName, executorId\nTIMESERIES" } ], "platformOptions": { @@ -662,6 +1199,9 @@ "thresholds": { "isLabelVisible": true }, + "units": { + "unit": "BYTES" + }, "yAxisLeft": { "zero": true }, @@ -669,19 +1209,13 @@ "zero": true } } - } - ] - }, - { - "name": "Executors", - "description": null, - "widgets": [ + }, { - "title": "Average Driver Memory Used", + "title": "Average on-heap storage memory usage by application and executor", "layout": { "column": 1, - "row": 1, - "width": 6, + "row": 9, + "width": 3, "height": 3 }, "linkedEntityGuids": null, @@ -695,15 +1229,20 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.executor.memoryUsed) AS 'Bytes' WHERE sparkAppExecutorId = 'driver' AND spark.app.executor.memoryUsed IS NOT NULL TIMESERIES" + "query": "FROM SparkExecutorSample\nSELECT average(onHeapMemoryUsedBytes)\nFACET sparkAppName, executorId\nTIMESERIES" } ], - "nullValues": { - "nullValue": "default" - }, "platformOptions": { "ignoreTimeRange": false }, @@ -722,11 +1261,11 @@ } }, { - "title": "Average Driver Disk Used", + "title": "Average on-heap storage memory total by application and executor", "layout": { - "column": 7, - "row": 1, - "width": 6, + "column": 4, + "row": 9, + "width": 3, "height": 3 }, "linkedEntityGuids": null, 
@@ -740,15 +1279,20 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.executor.diskUsed) AS 'Bytes' WHERE sparkAppExecutorId = 'driver' AND spark.app.executor.diskUsed IS NOT NULL TIMESERIES " + "query": "FROM SparkExecutorSample\nSELECT average(onHeapMemoryTotalBytes)\nFACET sparkAppName, executorId\nTIMESERIES" } ], - "nullValues": { - "nullValue": "default" - }, "platformOptions": { "ignoreTimeRange": false }, @@ -767,11 +1311,11 @@ } }, { - "title": "Average Executor Memory Used", + "title": "Average off-heap storage memory usage by application and executor", "layout": { - "column": 1, - "row": 4, - "width": 6, + "column": 7, + "row": 9, + "width": 3, "height": 3 }, "linkedEntityGuids": null, @@ -785,15 +1329,20 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.executor.memoryUsed) as 'Bytes' WHERE sparkAppExecutorId != 'driver' AND spark.app.executor.memoryUsed IS NOT NULL TIMESERIES FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT average(offHeapMemoryUsedBytes)\nFACET sparkAppName, executorId\nTIMESERIES" } ], - "nullValues": { - "nullValue": "default" - }, "platformOptions": { "ignoreTimeRange": false }, @@ -812,11 +1361,11 @@ } }, { - "title": "Average Executor Disk Used", + "title": "Average off-heap storage memory total by application and executor", "layout": { - "column": 7, - "row": 4, - "width": 6, + "column": 10, + "row": 9, + "width": 3, "height": 3 }, "linkedEntityGuids": null, @@ -830,15 +1379,20 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + 
"criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.executor.diskUsed) AS 'Bytes' WHERE sparkAppExecutorId != 'driver' AND spark.app.executor.diskUsed IS NOT NULL TIMESERIES FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT average(offHeapMemoryTotalBytes)\nFACET sparkAppName, executorId\nTIMESERIES" } ], - "nullValues": { - "nullValue": "default" - }, "platformOptions": { "ignoreTimeRange": false }, @@ -857,229 +1411,268 @@ } }, { - "title": "Active Tasks", + "title": "Current peak on-heap JVM memory usage (MiB) by application and executor", "layout": { "column": 1, - "row": 7, - "width": 4, - "height": 3 + "row": 12, + "width": 6, + "height": 2 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": "viz.billboard" }, "rawConfiguration": { + "billboardSettings": { + "gridOptions": { + "value": 35 + }, + "visual": { + "alignment": "stacked", + "display": "auto" + } + }, + "dataFormatters": [ + { + "name": "Used (MiB)", + "precision": 4, + "type": "humanized" + } + ], "facet": { "showOtherSeries": false }, - "legend": { - "enabled": true - }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.executor.activeTasks) AS 'Tasks' TIMESERIES WHERE sparkAppExecutorId != 'driver' FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT latest(peakJvmHeapMemoryUsedBytes) / 1024 / 1024\nFACET sparkAppName, executorId\n" } ], "platformOptions": { - "ignoreTimeRange": false - }, - "thresholds": { - "isLabelVisible": true - }, - "yAxisLeft": { - "zero": true - }, - "yAxisRight": { - "zero": true + "ignoreTimeRange": true } } }, { - "title": "Complete Tasks", + "title": "Current peak off-heap JVM memory usage (MiB) by application and executor", "layout": { - "column": 5, - "row": 7, - "width": 4, - "height": 3 + "column": 7, + "row": 12, + 
"width": 6, + "height": 2 }, "linkedEntityGuids": null, "visualization": { "id": "viz.billboard" }, "rawConfiguration": { + "billboardSettings": { + "gridOptions": { + "value": 35 + }, + "visual": { + "alignment": "stacked", + "display": "auto" + } + }, + "dataFormatters": [ + { + "name": "Used (MiB)", + "precision": 4, + "type": "humanized" + } + ], "facet": { "showOtherSeries": false }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.executor.completedTasks) AS 'Tasks' WHERE sparkAppExecutorId != 'driver' FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT latest(peakJvmOffHeapMemoryUsedBytes) / 1024 / 1024\nFACET sparkAppName, executorId" } ], "platformOptions": { - "ignoreTimeRange": false + "ignoreTimeRange": true } } }, { - "title": "Failed Tasks", + "title": "Current peak on-heap tracked execution memory usage (MiB) by application and executor", "layout": { - "column": 9, - "row": 7, - "width": 4, - "height": 3 + "column": 1, + "row": 14, + "width": 6, + "height": 2 }, "linkedEntityGuids": null, "visualization": { "id": "viz.billboard" }, "rawConfiguration": { + "billboardSettings": { + "gridOptions": { + "value": 35 + }, + "visual": { + "alignment": "stacked", + "display": "auto" + } + }, + "dataFormatters": [ + { + "name": "Used (MiB)", + "precision": 4, + "type": "humanized" + } + ], "facet": { "showOtherSeries": false }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.executor.failedTasks) AS 'Tasks' WHERE sparkAppExecutorId != 'driver' FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT latest(peakOnHeapExecutionMemoryUsedBytes) / 1024 / 1024\nFACET sparkAppName, executorId\n" } ], "platformOptions": { - "ignoreTimeRange": false + "ignoreTimeRange": true } } }, { - "title": "Average Executor Used On Heap Memory", + "title": "Current peak off-heap tracked execution memory usage (MiB) by application and executor", "layout": { - "column": 1, - 
"row": 10, + "column": 7, + "row": 14, "width": 6, - "height": 3 + "height": 2 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": "viz.billboard" }, "rawConfiguration": { + "billboardSettings": { + "gridOptions": { + "value": 35 + }, + "visual": { + "alignment": "stacked", + "display": "auto" + } + }, + "dataFormatters": [ + { + "name": "Used (MiB)", + "precision": 4, + "type": "humanized" + } + ], "facet": { "showOtherSeries": false }, - "legend": { - "enabled": true - }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.executor.memory.usedOnHeapStorageMemory) WHERE sparkAppExecutorId != 'driver' AND spark.app.executor.memory.usedOnHeapStorageMemory IS NOT NULL TIMESERIES FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT latest(peakOffHeapExecutionMemoryUsedBytes) / 1024 / 1024\nFACET sparkAppName, executorId" } ], "platformOptions": { - "ignoreTimeRange": false - }, - "thresholds": { - "isLabelVisible": true - }, - "units": { - "unit": "BYTES" - }, - "yAxisLeft": { - "zero": true - }, - "yAxisRight": { - "zero": true + "ignoreTimeRange": true } } }, { - "title": "Average Executor Used Off Heap Memory", + "title": "Current peak on-heap storage memory usage (MiB) by application and executor", "layout": { - "column": 7, - "row": 10, + "column": 1, + "row": 16, "width": 6, - "height": 3 + "height": 2 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": "viz.billboard" }, "rawConfiguration": { + "billboardSettings": { + "gridOptions": { + "value": 35 + }, + "visual": { + "alignment": "stacked", + "display": "auto" + } + }, + "dataFormatters": [ + { + "name": "Used (MiB)", + "precision": 4, + "type": "humanized" + } + ], "facet": { "showOtherSeries": false }, - "legend": { - "enabled": true - }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.executor.memory.usedOffHeapStorageMemory) WHERE sparkAppExecutorId != 'driver' AND 
spark.app.executor.memory.usedOffHeapStorageMemory IS NOT NULL TIMESERIES FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT latest(peakOnHeapStorageMemoryUsedBytes) / 1024 / 1024\nFACET sparkAppName, executorId" } ], "platformOptions": { - "ignoreTimeRange": false - }, - "thresholds": { - "isLabelVisible": true - }, - "units": { - "unit": "BYTES" - }, - "yAxisLeft": { - "zero": true - }, - "yAxisRight": { - "zero": true + "ignoreTimeRange": true } } }, { - "title": "Average Executor Peak JVM Heap Memory", + "title": "Current peak off-heap storage memory usage (MiB) by application and executor", "layout": { - "column": 1, - "row": 13, + "column": 7, + "row": 16, "width": 6, - "height": 3 + "height": 2 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": "viz.billboard" }, "rawConfiguration": { + "billboardSettings": { + "gridOptions": { + "value": 35 + }, + "visual": { + "alignment": "stacked", + "display": "auto" + } + }, + "dataFormatters": [ + { + "name": "Used (MiB)", + "precision": 4, + "type": "humanized" + } + ], "facet": { "showOtherSeries": false }, - "legend": { - "enabled": true - }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.executor.memory.peak.jvmHeap) WHERE sparkAppExecutorId != 'driver' AND spark.app.executor.memory.peak.jvmHeap IS NOT NULL TIMESERIES FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT latest(peakOffHeapStorageMemoryUsedBytes) / 1024 / 1024\nFACET sparkAppName, executorId" } ], "platformOptions": { - "ignoreTimeRange": false - }, - "thresholds": { - "isLabelVisible": true - }, - "units": { - "unit": "BYTES" - }, - "yAxisLeft": { - "zero": true - }, - "yAxisRight": { - "zero": true + "ignoreTimeRange": true } } }, { - "title": "Average Executor Peak JVM Off Heap Memory", + "title": "Average RDD blocks by application and executor", "layout": { - "column": 7, - "row": 13, + "column": 1, + "row": 18, "width": 6, "height": 3 }, @@ 
-1094,10 +1687,18 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.executor.memory.peak.jvmOffHeap) WHERE sparkAppExecutorId != 'driver' AND spark.app.executor.memory.peak.jvmOffHeap IS NOT NULL TIMESERIES FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT average(rddBlockCount)\nFACET sparkAppName, executorId\nTIMESERIES" } ], "platformOptions": { @@ -1106,9 +1707,6 @@ "thresholds": { "isLabelVisible": true }, - "units": { - "unit": "BYTES" - }, "yAxisLeft": { "zero": true }, @@ -1118,11 +1716,11 @@ } }, { - "title": "Total Executor JVM Task Duration (seconds)", + "title": "Current input bytes (MiB) by application and executor", "layout": { - "column": 1, - "row": 16, - "width": 4, + "column": 7, + "row": 18, + "width": 6, "height": 3 }, "linkedEntityGuids": null, @@ -1136,20 +1734,20 @@ "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.executor.totalDuration) / 1000 AS 'Seconds' WHERE sparkAppExecutorId != 'driver' FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT latest(inputBytes) / 1024 / 1024\nFACET sparkAppName, executorId" } ], "platformOptions": { - "ignoreTimeRange": false + "ignoreTimeRange": true } } }, { - "title": "Total Executor JVM GC Time (seconds)", + "title": "Current shuffle read bytes (MiB) by application and executor", "layout": { - "column": 5, - "row": 16, - "width": 4, + "column": 1, + "row": 21, + "width": 6, "height": 3 }, "linkedEntityGuids": null, @@ -1163,79 +1761,67 @@ "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.executor.totalGCTime) / 1000 AS 'Seconds' WHERE sparkAppExecutorId != 'driver' FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT latest(shuffleReadBytes) / 1024 / 
1024\nFACET sparkAppName, executorId" } ], "platformOptions": { - "ignoreTimeRange": false + "ignoreTimeRange": true } } }, { - "title": "RDD Blocks", + "title": "Current shuffle write bytes (MiB) by application and executor", "layout": { - "column": 9, - "row": 16, - "width": 4, + "column": 7, + "row": 21, + "width": 6, "height": 3 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": "viz.bar" }, "rawConfiguration": { "facet": { "showOtherSeries": false }, - "legend": { - "enabled": true - }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.executor.rddBlocks) AS 'RDD Blocks' WHERE sparkAppExecutorId != 'driver' TIMESERIES FACET sparkAppExecutorId" + "query": "FROM SparkExecutorSample\nSELECT latest(shuffleWriteBytes) / 1024 / 1024\nFACET sparkAppName, executorId" } ], "platformOptions": { - "ignoreTimeRange": false - }, - "thresholds": { - "isLabelVisible": true - }, - "yAxisLeft": { - "zero": true - }, - "yAxisRight": { - "zero": true + "ignoreTimeRange": true } } - }, + } + ] + }, + { + "name": "Storage", + "description": null, + "widgets": [ { - "title": "Total Input Bytes Summed", + "title": "RDDs", "layout": { "column": 1, - "row": 19, + "row": 1, "width": 4, - "height": 3 + "height": 2 }, "linkedEntityGuids": null, "visualization": { "id": "viz.billboard" }, "rawConfiguration": { - "dataFormatters": [ - { - "name": "Bytes", - "type": "humanized" - } - ], "facet": { "showOtherSeries": false }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.executor.totalInputBytes) AS 'Bytes' WHERE sparkAppExecutorId != 'driver' FACET sparkAppExecutorId" + "query": "FROM SparkRDDSample\nSELECT uniqueCount(concat(rddId, rddName)) AS 'RDDs'" } ], "platformOptions": { @@ -1244,31 +1830,25 @@ } }, { - "title": "Total Shuffle Read Bytes Summed", + "title": "Distributions", "layout": { "column": 5, - "row": 19, + "row": 1, "width": 4, - "height": 3 + "height": 2 }, "linkedEntityGuids": 
null, "visualization": { "id": "viz.billboard" }, "rawConfiguration": { - "dataFormatters": [ - { - "name": "Bytes", - "type": "humanized" - } - ], "facet": { "showOtherSeries": false }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.executor.totalShuffleRead) AS 'Bytes' WHERE sparkAppExecutorId != 'driver' FACET sparkAppExecutorId" + "query": "FROM SparkRDDDistributionSample\nSELECT uniqueCount(concat(rddId, rddName, distributionIndex)) AS 'Distributions'" } ], "platformOptions": { @@ -1277,50 +1857,38 @@ } }, { - "title": "Total Shuffle Write Bytes Summed", + "title": "Partitions", "layout": { "column": 9, - "row": 19, + "row": 1, "width": 4, - "height": 3 + "height": 2 }, "linkedEntityGuids": null, "visualization": { "id": "viz.billboard" }, "rawConfiguration": { - "dataFormatters": [ - { - "name": "Bytes", - "type": "humanized" - } - ], "facet": { "showOtherSeries": false }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.executor.totalShuffleWrite) AS 'Bytes' WHERE sparkAppExecutorId != 'driver' FACET sparkAppExecutorId" + "query": "FROM SparkRDDPartitionSample\nSELECT uniqueCount(concat(rddId, rddName, blockName)) AS 'Partitions'" } ], "platformOptions": { "ignoreTimeRange": false } } - } - ] - }, - { - "name": "RDDs", - "description": null, - "widgets": [ + }, { - "title": "Total Partitions By App & RDD", + "title": "RDDs by application", "layout": { "column": 1, - "row": 1, - "width": 6, + "row": 3, + "width": 12, "height": 3 }, "linkedEntityGuids": null, @@ -1328,40 +1896,31 @@ "id": "viz.table" }, "rawConfiguration": { - "facet": { - "showOtherSeries": false - }, - "nrqlQueries": [ + "dataFormatters": [ { - "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.storage.rdd.partitions) AS 'Partitions' WHERE spark.app.storage.rdd.partitions IS NOT NULL FACET sparkAppName, sparkAppRDDName " + "name": "Average Disk Used (MiB)", + "type": "decimal" + }, + { + "name": "Average
Memory Used (MiB)", + "type": "decimal" + }, + { + "name": "Cached Partitions", + "type": "decimal" + }, + { + "name": "Partitions", + "type": "decimal" } ], - "platformOptions": { - "ignoreTimeRange": false - } - } - }, - { - "title": "Cached Partitions By App & RDD", - "layout": { - "column": 7, - "row": 1, - "width": 6, - "height": 3 - }, - "linkedEntityGuids": null, - "visualization": { - "id": "viz.table" - }, - "rawConfiguration": { "facet": { "showOtherSeries": false }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT latest(spark.app.storage.rdd.cachedPartitions) AS 'Partitions' WHERE spark.app.storage.rdd.cachedPartitions IS NOT NULL FACET sparkAppName, sparkAppRDDName " + "query": "FROM SparkRDDSample\nSELECT latest(partitionCount) AS 'Partitions', latest(cachedPartitionCount) AS 'Cached Partitions', average(memoryUsedBytes) / 1024 / 1024 AS 'Average Memory Used (MiB)', average(diskUsedBytes) / 1024 / 1024 AS 'Average Disk Used (MiB)'\nFACET sparkAppName AS Application, rddId AS 'RDD ID', rddName AS 'RDD Name'" } ], "platformOptions": { @@ -1370,10 +1929,10 @@ } }, { - "title": "Average Memory Used By App & RDD", + "title": "Average memory used by application and RDD", "layout": { "column": 1, - "row": 4, + "row": 6, "width": 6, "height": 3 }, @@ -1388,10 +1947,18 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.storage.rdd.memory.used) WHERE spark.app.storage.rdd.memory.used IS NOT NULL TIMESERIES FACET sparkAppName, sparkAppRDDName" + "query": "FROM SparkRDDSample\nSELECT average(memoryUsedBytes)\nFACET sparkAppName, rddId, rddName \nTIMESERIES" } ], "platformOptions": { @@ -1412,10 +1979,10 @@ } }, { - "title": "Average Disk Used By App & RDD", + "title": "Average disk used by application and RDD", "layout":
{ "column": 7, - "row": 4, + "row": 6, "width": 6, "height": 3 }, @@ -1430,10 +1997,18 @@ "legend": { "enabled": true }, + "markers": { + "displayedTypes": { + "criticalViolations": false, + "deployments": true, + "relatedDeployments": true, + "warningViolations": false + } + }, "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.storage.rdd.disk.used) WHERE spark.app.storage.rdd.disk.used IS NOT NULL TIMESERIES FACET sparkAppName, sparkAppRDDName" + "query": "FROM SparkRDDSample\nSELECT average(diskUsedBytes)\nFACET sparkAppName, rddId, rddName\nTIMESERIES" } ], "platformOptions": { @@ -1454,20 +2029,20 @@ } }, { - "title": "Average Memory Used by RDD Partition Block Name", + "title": "Average partition size (MiB) by application and RDD", "layout": { "column": 1, - "row": 7, + "row": 9, "width": 6, "height": 3 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": "viz.pie" }, "rawConfiguration": { "facet": { - "showOtherSeries": false + "showOtherSeries": true }, "legend": { "enabled": true @@ -1475,41 +2050,29 @@ "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.storage.rdd.partition.memory.used) WHERE spark.app.storage.rdd.partition.memory.used IS NOT NULL TIMESERIES FACET sparkAppRDDName, sparkAppRddPartitionBlockName" + "query": "FROM SparkRDDPartitionSample\nSELECT average(memoryUsedBytes + diskUsedBytes) / 1024 / 1024 AS 'MiB'\nFACET sparkAppName, rddId, rddName, blockName" } ], "platformOptions": { "ignoreTimeRange": false - }, - "thresholds": { - "isLabelVisible": true - }, - "units": { - "unit": "BYTES" - }, - "yAxisLeft": { - "zero": true - }, - "yAxisRight": { - "zero": true } } }, { - "title": "Average Disk Used by RDD Partition Block Name", + "title": "Average distribution size (MiB) by application and RDD", "layout": { "column": 7, - "row": 7, + "row": 9, "width": 6, "height": 3 }, "linkedEntityGuids": null, "visualization": { - "id": "viz.line" + "id": 
"viz.pie" }, "rawConfiguration": { "facet": { - "showOtherSeries": false + "showOtherSeries": true }, "legend": { "enabled": true @@ -1517,27 +2080,98 @@ "nrqlQueries": [ { "accountIds": [], - "query": "FROM Metric SELECT average(spark.app.storage.rdd.partition.disk.used) WHERE spark.app.storage.rdd.partition.disk.used IS NOT NULL TIMESERIES FACET sparkAppRDDName, sparkAppRddPartitionBlockName" + "query": "FROM SparkRDDDistributionSample\nSELECT average(memoryUsedBytes + diskUsedBytes) / 1024 / 1024 AS 'MiB'\nFACET sparkAppName, rddId, rddName, distributionIndex" } ], "platformOptions": { "ignoreTimeRange": false + } + } + }, + { + "title": "Partitions by application and RDD", + "layout": { + "column": 1, + "row": 12, + "width": 12, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Average disk used (MiB)", + "type": "humanized" + }, + { + "name": "Average memory used (MiB)", + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false }, - "thresholds": { - "isLabelVisible": true - }, - "units": { - "unit": "BYTES" - }, - "yAxisLeft": { - "zero": true + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SparkRDDPartitionSample\nSELECT latest(storageLevel), latest(executorIds) AS 'Executor IDs', average(memoryUsedBytes) / 1024 / 1024 AS 'Average Memory Used (MiB)', average(diskUsedBytes) / 1024 / 1024 AS 'Average Disk Used (MiB)'\nFACET sparkAppName AS Application, rddId AS 'RDD ID', rddName AS 'RDD Name', blockName AS 'Block Name'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Distributions by application and RDD", + "layout": { + "column": 1, + "row": 15, + "width": 12, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "dataFormatters": [ + { + "name": "Average disk used (MiB)", + "type": "humanized" + }, + { + "name": "Average 
Memory Used (MiB)", + "type": "humanized" + }, + { + "name": "Average On-heap Memory Used (MiB)", + "type": "humanized" + }, + { + "name": "Average Off-heap Memory Used (MiB)", + "type": "humanized" + } + ], + "facet": { + "showOtherSeries": false }, - "yAxisRight": { - "zero": true + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM SparkRDDDistributionSample\nSELECT average(memoryUsedBytes) / 1024 / 1024 AS 'Average Memory Used (MiB)', average(diskUsedBytes) / 1024 / 1024 AS 'Average Disk Used (MiB)', average(onHeapMemoryUsedBytes) / 1024 / 1024 AS 'Average On-heap Memory Used (MiB)', average(offHeapMemoryUsedBytes) / 1024 / 1024 AS 'Average Off-heap Memory Used (MiB)'\nFACET sparkAppName AS Application, rddId AS 'RDD ID', rddName AS 'RDD Name', distributionIndex AS 'Distribution Index'" + } + ], + "platformOptions": { + "ignoreTimeRange": false } } } ] } - ] -} \ No newline at end of file + ], + "variables": [] +} diff --git a/dashboards/databricks-spark/databricks-spark01.png b/dashboards/databricks-spark/databricks-spark01.png index 95fe120a05..b4e29ec1bf 100644 Binary files a/dashboards/databricks-spark/databricks-spark01.png and b/dashboards/databricks-spark/databricks-spark01.png differ diff --git a/dashboards/databricks-spark/databricks-spark02.png b/dashboards/databricks-spark/databricks-spark02.png index 66d4c858e3..3249772d73 100644 Binary files a/dashboards/databricks-spark/databricks-spark02.png and b/dashboards/databricks-spark/databricks-spark02.png differ diff --git a/dashboards/databricks-spark/databricks-spark03.png b/dashboards/databricks-spark/databricks-spark03.png index 0bbff226fb..f77fec768b 100644 Binary files a/dashboards/databricks-spark/databricks-spark03.png and b/dashboards/databricks-spark/databricks-spark03.png differ diff --git a/dashboards/databricks-spark/databricks-spark04.png b/dashboards/databricks-spark/databricks-spark04.png deleted file mode 100644 index deee19c229..0000000000 Binary files a/dashboards/databricks-spark/databricks-spark04.png and /dev/null
differ diff --git a/data-sources/databricks/config.yml b/data-sources/databricks/config.yml index ab70d2bbc8..1578fe282d 100644 --- a/data-sources/databricks/config.yml +++ b/data-sources/databricks/config.yml @@ -1,12 +1,15 @@ id: databricks displayName: Databricks Integration description: | - This integration collects Spark telemetry, Workflow telemetry, and Cost and Billing information from Databricks. + The Databricks Integration collects Apache Spark application metrics, + Databricks Lakeflow job run metrics, Databricks Lakeflow Spark Declarative + Pipeline update metrics, Databricks query metrics, Databricks cluster health + metrics and logs, and Databricks consumption and cost data. icon: logo.png install: primary: link: - url: https://github.com/newrelic-experimental/newrelic-databricks-integration + url: https://github.com/newrelic/newrelic-databricks-integration?tab=readme-ov-file#getting-started keywords: - nrlabs - nrlabs-data diff --git a/quickstarts/databricks/config.yml b/quickstarts/databricks/config.yml index 19243e0869..db18869b81 100644 --- a/quickstarts/databricks/config.yml +++ b/quickstarts/databricks/config.yml @@ -1,16 +1,38 @@ id: 533cdd19-8232-42cb-b134-e7d17bfff581 slug: databricks -title: Databricks Spark Integration +title: Databricks Integration description: | - Databricks is an orchestration platform for Apache Spark. Instantly monitor Databricks Spark clusters with our New Relic Spark integration. - This integration collects Spark telemetry, Workflow telemetry, and Cost and Billing information from Databricks. + ## Why monitor Databricks - The New Relic Databricks integration can collect telemetry from Spark running on Databricks. By default, the integration will automatically connect to and collect telemetry from the Spark deployments in all clusters created via the UI or API in the specified workspace. + In the world of big data, Databricks is a mission-critical platform. 
But + making sure your workloads are running efficiently, cost-effectively, and + reliably can be challenging. + + The Databricks Integration from New Relic delivers total visibility for your + entire Databricks estate, allowing you to troubleshoot, optimize, and connect + performance directly to cost - all from a single, unified observability + platform. + + ### Databricks quickstart highlights + + The Databricks integration collects a comprehensive suite of telemetry data, + including the following: + + - Apache Spark application metrics + - Databricks Lakeflow job run metrics + - Databricks Lakeflow Spark Declarative Pipeline update metrics + - Databricks query metrics + - Databricks cluster health metrics and logs + - Databricks consumption and cost data + + With the pre-built dashboards in this quickstart, you can quickly visualize + and analyze your Databricks workloads to ensure optimal performance and cost + efficiency. summary: | - Monitor Databricks Spark clusters with the New Relic Databricks integration + Gain full visibility into your entire Databricks estate with the comprehensive + suite of telemetry data collected by the Databricks Integration icon: logo.png level: Community - keywords: - nrlabs - nrlabs-data @@ -23,11 +45,22 @@ keywords: authors: - New Relic Labs documentation: - - name: Databricks integration docs + - name: Getting Started + description: | + Follow our Getting Started documentation to quickly instrument your + Databricks environment and start visualizing your data in New Relic. + url: https://github.com/newrelic/newrelic-databricks-integration?tab=readme-ov-file#getting-started + - name: Learn More description: | - Collect Spark telemetry data with the New Relic Databricks integration - url: https://github.com/newrelic-experimental/newrelic-databricks-integration + Learn more about the Databricks Integration in our Usage Guide, including + configuration options, telemetry collected, and dashboards included. 
+ url: https://github.com/newrelic/newrelic-databricks-integration?tab=readme-ov-file#usage-guide dataSourceIds: - databricks dashboards: - databricks-spark + - databricks-job-runs + - databricks-pipeline-updates + - databricks-query-metrics + - databricks-cluster-health + - databricks-consumption-cost