Skip to content

Commit 1be77f1

Browse files
authored
feat: add GPU Collector based on DCGM (alibaba#2360)
* feat: add GPU Collector based on DCGM
* fix: correct spelling error and add Linux platform protection
* fix: update copyright year to 2025 and modify comments to English
1 parent ebfd563 commit 1be77f1

16 files changed

Lines changed: 1114 additions & 2 deletions

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
[submodule "core/_thirdparty/coolbpf"]
22
path = core/_thirdparty/coolbpf
33
url = https://gitee.com/anolis/coolbpf.git
4+
[submodule "core/_thirdparty/DCGM"]
5+
path = core/_thirdparty/DCGM
6+
url = https://github.com/NVIDIA/DCGM.git

core/_thirdparty/DCGM

Submodule DCGM added at 6e947dc

core/host_monitor/Constants.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,8 @@ const int EXECUTE_FAIL = -1;
4545
const std::filesystem::path PROCESS_NET_IF_INET6 = "net/if_inet6";
4646
const int64_t SYSTEM_HERTZ = sysconf(_SC_CLK_TCK);
4747
const long PAGE_SIZE = sysconf(_SC_PAGESIZE);
48+
// GPU-related constants added for the DCGM-based GPU collector.
// "nvidia-smi" — presumably the name of the NVIDIA SMI command-line tool; its use is not shown here (TODO confirm).
const std::string NVSMI = "nvidia-smi";
49+
// Library identifier passed to the DCGMCollector constructor by LinuxSystemInterface.
const std::string LIB_DCGM = "dcgm";
50+
// Filesystem path "/dev/nvidiactl" — presumably probed to detect an NVIDIA driver; verify against the caller.
const std::filesystem::path NVIDIACTL = "/dev/nvidiactl";
4851

4952
} // namespace logtail

core/host_monitor/Constants.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#pragma once
1818

1919
#include <filesystem>
20+
#include <string>
2021

2122
#include "common/StringView.h"
2223

@@ -56,5 +57,8 @@ inline constexpr StringView DEFAULT_USER_ID_LABEL = "user_id";
5657
#else
5758
inline constexpr StringView DEFAULT_HOST_IP_LABEL = "host_ip";
5859
#endif
60+
// Declarations of the GPU-related constants; the definitions live in Constants.cpp.
// NVSMI holds "nvidia-smi".
const extern std::string NVSMI;
61+
// LIB_DCGM holds "dcgm" and is used to construct the DCGMCollector member of LinuxSystemInterface.
const extern std::string LIB_DCGM;
62+
// NVIDIACTL holds the path "/dev/nvidiactl".
const extern std::filesystem::path NVIDIACTL;
5963

6064
} // namespace logtail

core/host_monitor/HostMonitorInputRunner.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include "host_monitor/HostMonitorTimerEvent.h"
3737
#include "host_monitor/collector/CPUCollector.h"
3838
#include "host_monitor/collector/DiskCollector.h"
39+
#include "host_monitor/collector/GPUCollector.h"
3940
#include "host_monitor/collector/MemCollector.h"
4041
#include "host_monitor/collector/NetCollector.h"
4142
#include "host_monitor/collector/ProcessCollector.h"
@@ -63,6 +64,7 @@ HostMonitorInputRunner::HostMonitorInputRunner() {
6364
RegisterCollector<DiskCollector>();
6465
RegisterCollector<ProcessCollector>();
6566
RegisterCollector<NetCollector>();
67+
RegisterCollector<GPUCollector>();
6668

6769
size_t threadPoolSize = 1;
6870
// threadPoolSize should be greater than 0

core/host_monitor/LinuxSystemInterface.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ using namespace std::chrono;
3636
#include "common/FileSystemUtil.h"
3737
#include "common/StringTools.h"
3838
#include "host_monitor/Constants.h"
39+
#include "host_monitor/SystemInformationTools.h"
3940
#include "host_monitor/common/FastFieldParser.h"
4041
#include "logger/Logger.h"
4142

@@ -1148,4 +1149,48 @@ bool LinuxSystemInterface::GetProcessOpenFilesOnce(pid_t pid, ProcessFd& process
11481149

11491150
return true;
11501151
}
1152+
1153+
// Prepares the DCGM collector member (mDcgmCollector) for GPU metric collection.
//
// @param fieldMap  forwarded unchanged to mDcgmCollector.Initialize().
// @return true when the collector is initialized by this call or was already
//         fully initialized; false when CheckGPUDevice() fails, the DCGM
//         library is not loaded, the collector is in a state that can neither
//         be initialized nor is fully initialized, or Initialize() fails.
bool LinuxSystemInterface::InitGPUCollectorOnce(const FieldMap& fieldMap) {
1154+
// No log on this path: a failed device check returns false silently.
// NOTE(review): presumably this means "no GPU present on the host" — confirm in CheckGPUDevice().
if (!CheckGPUDevice()) {
1155+
return false;
1156+
}
1157+
1158+
if (!mDcgmCollector.IsLibraryLoaded()) {
1159+
LOG_ERROR(sLogger, ("GPU collector initialization failed", "DCGM library not loaded"));
1160+
return false;
1161+
}
1162+
1163+
// When initialization is not possible, distinguish "already done" (success)
// from any other state (failure).
if (!mDcgmCollector.CanInitialize()) {
1164+
if (mDcgmCollector.IsFullyInitialized()) {
1165+
LOG_INFO(sLogger, ("GPU collector already initialized", "skipping initialization"));
1166+
return true;
1167+
} else {
1168+
LOG_ERROR(sLogger, ("GPU collector initialization failed", "DCGM collector in invalid state"));
1169+
return false;
1170+
}
1171+
}
1172+
1173+
if (!mDcgmCollector.Initialize(fieldMap)) {
1174+
LOG_ERROR(sLogger, ("GPU collector initialization failed", "DCGM initialization error"));
1175+
return false;
1176+
}
1177+
1178+
return true;
1179+
}
1180+
1181+
// Collects GPU metrics through mDcgmCollector into gpuInfo.
//
// @param gpuInfo  output object passed to mDcgmCollector.Collect().
// @return false when the collector is not fully initialized; otherwise the
//         result of mDcgmCollector.Collect().
bool LinuxSystemInterface::GetGPUInformationOnce(GPUInformation& gpuInfo) {
1182+
// Collection is refused (and logged as an error) unless the collector is fully initialized.
if (!mDcgmCollector.IsFullyInitialized()) {
1183+
LOG_ERROR(sLogger, ("GPU data retrieval failed", "DCGM collector not ready"));
1184+
return false;
1185+
}
1186+
1187+
bool success = mDcgmCollector.Collect(gpuInfo);
1188+
if (!success) {
1189+
LOG_ERROR(sLogger, ("GPU data retrieval failed", "collection operation failed"));
1190+
} else {
1191+
// On success, the number of entries in gpuInfo.stats is logged as "gpu_count".
LOG_DEBUG(sLogger, ("GPU data retrieval successful", "metrics collected")("gpu_count", gpuInfo.stats.size()));
1192+
}
1193+
1194+
return success;
1195+
}
11511196
} // namespace logtail

core/host_monitor/LinuxSystemInterface.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
#pragma once
1818

1919
#include "common/ProcParser.h"
20+
#include "host_monitor/Constants.h"
21+
#include "host_monitor/SystemInformationTools.h"
2022
#include "host_monitor/SystemInterface.h"
2123

2224
namespace logtail {
@@ -32,7 +34,7 @@ class LinuxSystemInterface : public SystemInterface {
3234
}
3335

3436
private:
35-
explicit LinuxSystemInterface() : mProcParser("") {}
37+
explicit LinuxSystemInterface() : mProcParser(""), mDcgmCollector(LIB_DCGM) {}
3638
~LinuxSystemInterface() = default;
3739

3840
bool GetSystemInformationOnce(SystemInformation& systemInfo) override;
@@ -52,6 +54,9 @@ class LinuxSystemInterface : public SystemInterface {
5254
bool GetDiskStateInformationOnce(DiskStateInformation& diskStateInfo) override;
5355
bool GetFileSystemInformationOnce(std::string dirName, FileSystemInformation& fileSystemInfo) override;
5456

57+
bool InitGPUCollectorOnce(const FieldMap& fieldMap) override;
58+
bool GetGPUInformationOnce(GPUInformation& gpuInfo) override;
59+
5560
uint64_t GetMemoryValue(char unit, uint64_t value);
5661
bool GetProcessCmdlineStringOnce(pid_t pid, ProcessCmdlineString& cmdline) override;
5762
bool GetProcessStatmOnce(pid_t pid, ProcessMemoryInformation& processMemory) override;
@@ -67,5 +72,6 @@ class LinuxSystemInterface : public SystemInterface {
6772
bool GetInterfaceConfig(InterfaceConfig& interfaceConfig, const std::string& name);
6873

6974
ProcParser mProcParser;
75+
DCGMCollector mDcgmCollector;
7076
};
7177
} // namespace logtail

0 commit comments

Comments
 (0)