Skip to content

Commit 8e5ac51

Browse files
feat(autoware_system_monitor): change diagnostic logic for mem_monitor (#11971)
* feat(autoware_system_monitor): change diagnostic logic for mem_monitor (#11971) --------- Signed-off-by: Takayuki AKAMINE <takayuki.akamine@tier4.jp> Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
1 parent 6a6a2ec commit 8e5ac51

File tree

7 files changed

+164
-114
lines changed

7 files changed

+164
-114
lines changed

system/autoware_system_monitor/CMakeLists.txt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ rclcpp_components_register_node(voltage_monitor_lib
194194
)
195195

196196
if(BUILD_TESTING)
197+
# Process Monitor Test
197198
ament_add_ros_isolated_gtest(test_process_monitor
198199
test/src/process_monitor/test_process_monitor.cpp
199200
)
@@ -211,6 +212,7 @@ if(BUILD_TESTING)
211212
COMMENT "Copying test data files to the build directory after build"
212213
)
213214

215+
# CPU Monitor Test
214216
ament_add_ros_isolated_gtest(test_cpu_monitor
215217
test/src/cpu_monitor/test_${CMAKE_CPU_PLATFORM}_cpu_monitor.cpp
216218
${CPU_MONITOR_SOURCE}
@@ -222,6 +224,28 @@ if(BUILD_TESTING)
222224

223225
target_link_libraries(test_cpu_monitor cpu_monitor_lib ${Boost_LIBRARIES} ${LIBRARIES})
224226

227+
# Mem Monitor Test
228+
ament_add_ros_isolated_gtest(test_mem_monitor
229+
test/src/mem_monitor/test_mem_monitor.cpp
230+
src/mem_monitor/mem_monitor.cpp
231+
)
232+
233+
target_include_directories(test_mem_monitor
234+
PRIVATE "include"
235+
)
236+
237+
target_link_libraries(test_mem_monitor mem_monitor_lib ${Boost_LIBRARIES} ${LIBRARIES})
238+
239+
# helper free command to failed free command.
240+
add_executable(free1 test/src/mem_monitor/free1.cpp)
241+
242+
add_custom_command(
243+
TARGET test_mem_monitor
244+
POST_BUILD
245+
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:free1> $<TARGET_FILE_DIR:test_mem_monitor>
246+
COMMENT "Copying free1 dummy executable to the test directory"
247+
)
248+
225249
endif()
226250

227251
# TODO(yunus.caliskan): Port the tests to ROS 2, robustify the tests.
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
/**:
22
ros__parameters:
3-
available_size: 1024 # MB
3+
available_size: 1024 # MiB
4+
warning_margin: 0 # MiB
5+
swap_error_threshold: 1024 # MiB

system/autoware_system_monitor/docs/ros_parameters.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,11 @@ hdd_monitor:
5252

5353
mem_monitor:
5454

55-
| Name | Type | Unit | Default | Notes |
56-
| :---------- | :---: | :-----: | :-----: | :-------------------------------------------------------------------------------- |
57-
| usage_warn | float | %(1e-2) | 0.95 | Generates warning when physical memory usage reaches a specified value or higher. |
58-
| usage_error | float | %(1e-2) | 0.99 | Generates error when physical memory usage reaches a specified value or higher. |
55+
| Name | Type | Unit | Default | Notes |
56+
| :------------------- | :--: | :--: | :-----: | :----------------------------------------------------------------------------------------------------------------------------------: |
57+
| available_size | int | MiB | 1024 | Generates error when physical memory usage reaches a specified value defined by `available_size`. |
58+
| warning_margin | int | MiB | 0 | Generates warning when physical memory usage reaches a specified value defined by addition of `available_size` and `warning_margin`. |
59+
| swap_error_threshold | int | MiB | 1024 | Generates error when swap usage reaches a specified value or higher. |
5960

6061
## <u>Net Monitor</u>
6162

system/autoware_system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,14 @@ class MemMonitor : public rclcpp::Node
6868
*/
6969
std::string toHumanReadable(const std::string & str);
7070

71+
/**
72+
* @brief determine diagnostic level based on available memory and swap usage
73+
* @param [in] mem_available available memory size in bytes
74+
* @param [in] swap_used swap usage size in bytes
75+
* @return diagnostic level (OK, WARN, or ERROR)
76+
*/
77+
uint8_t determineDiagnosticLevel(size_t mem_available, size_t swap_used);
78+
7179
/**
7280
* @brief publish memory status
7381
* @param [in] usage memory usage
@@ -78,15 +86,17 @@ class MemMonitor : public rclcpp::Node
7886

7987
char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name
8088

81-
size_t available_size_; //!< @brief Memory available size to generate error
89+
size_t error_available_size_; //!< @brief Memory available size to generate error
90+
size_t warning_available_size_; //!< @brief Memory available size to generate warning
91+
size_t swap_error_threshold_; //!< @brief Swap usage size to generate error
8292

8393
rclcpp::Publisher<tier4_external_api_msgs::msg::MemoryStatus>::SharedPtr
8494
pub_memory_status_; //!< @brief publisher
8595

8696
/**
8797
* @brief Memory usage status messages
8898
*/
89-
const std::map<int, const char *> usage_dict_ = {
99+
const std::map<uint8_t, const char *> usage_dict_ = {
90100
{DiagStatus::OK, "OK"}, {DiagStatus::WARN, "high load"}, {DiagStatus::ERROR, "very high load"}};
91101
};
92102

system/autoware_system_monitor/src/mem_monitor/mem_monitor.cpp

Lines changed: 59 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,35 @@ namespace bp = boost::process;
3434
MemMonitor::MemMonitor(const rclcpp::NodeOptions & options)
3535
: Node("mem_monitor", options),
3636
updater_(this),
37-
available_size_(declare_parameter<int>("available_size", 1024) * 1024 * 1024)
37+
error_available_size_(
38+
declare_parameter<int>(
39+
"available_size", 1024,
40+
rcl_interfaces::msg::ParameterDescriptor().set__read_only(true).set__description(
41+
"Memory available size to generate error [MiB]. Cannot be changed after "
42+
"initialization.")) *
43+
1024 * 1024),
44+
warning_available_size_(0),
45+
swap_error_threshold_(
46+
declare_parameter<int>(
47+
"swap_error_threshold", 1024,
48+
rcl_interfaces::msg::ParameterDescriptor().set__read_only(true).set__description(
49+
"Swap usage size to generate error [MiB]. Cannot be changed after "
50+
"initialization.")) *
51+
1024 * 1024)
3852
{
53+
// Define warning_available_size_
54+
int warning_margin =
55+
declare_parameter<int>(
56+
"warning_margin", 0,
57+
rcl_interfaces::msg::ParameterDescriptor().set__read_only(true).set__description(
58+
"Warning margin with error threshold [MiB] to generate Warn [MiB]. Cannot be changed after "
59+
"initialization.")) *
60+
1024 * 1024;
61+
if (warning_margin < 0) {
62+
warning_margin = 0;
63+
}
64+
warning_available_size_ = error_available_size_ + warning_margin;
65+
3966
gethostname(hostname_, sizeof(hostname_));
4067
updater_.setHardwareID(hostname_);
4168
updater_.add("Memory Usage", this, &MemMonitor::checkUsage);
@@ -99,9 +126,8 @@ void MemMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat)
99126
std::vector<std::string> list;
100127
float usage = 0.0f;
101128
size_t mem_total = 0;
102-
size_t mem_shared = 0;
103129
size_t mem_available = 0;
104-
size_t used_plus = 0;
130+
size_t swap_used = 0;
105131

106132
/*
107133
Output example of `free -tb`
@@ -124,15 +150,18 @@ void MemMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat)
124150
// Physical memory
125151
if (index == 1) {
126152
mem_total = std::atoll(list[1].c_str());
127-
mem_shared = std::atoll(list[4].c_str());
128153
mem_available = std::atoll(list[6].c_str());
129154

130155
// available divided by total is available memory including calculation for buff/cache,
131156
// so the subtraction of this from 1 gives real usage.
132157
usage = 1.0f - static_cast<double>(mem_available) / mem_total;
133158
stat.addf(fmt::format("{} usage", list[0]), "%.2f%%", usage * 1e+2);
159+
} else if (index == 2) {
160+
// Swap memory
161+
swap_used = std::atoll(list[2].c_str());
134162
}
135163

164+
// Add additional information for each memory type and total
136165
stat.add(fmt::format("{} total", list[0]), toHumanReadable(list[1]));
137166
stat.add(fmt::format("{} used", list[0]), toHumanReadable(list[2]));
138167
stat.add(fmt::format("{} free", list[0]), toHumanReadable(list[3]));
@@ -142,26 +171,14 @@ void MemMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat)
142171
stat.add(fmt::format("{} shared", list[0]), toHumanReadable(list[4]));
143172
stat.add(fmt::format("{} buff/cache", list[0]), toHumanReadable(list[5]));
144173
stat.add(fmt::format("{} available", list[0]), toHumanReadable(list[6]));
145-
} else if (index == 3) {
146-
// Total:used + Mem:shared
147-
used_plus = std::atoll(list[2].c_str()) + mem_shared;
148-
double giga = static_cast<double>(used_plus) / (1024 * 1024 * 1024);
149-
stat.add(fmt::format("{} used+", list[0]), fmt::format("{:.1f}{}", giga, "G"));
150174
} else {
151-
/* nothing */
175+
// Swap (index 2) and Total (index 3) do not have their specific entry.
176+
// So do nothing here.
152177
}
153178
++index;
154179
}
155180

156-
int level;
157-
if (mem_total > used_plus) {
158-
level = DiagStatus::OK;
159-
} else if (mem_available >= available_size_) {
160-
level = DiagStatus::WARN;
161-
} else {
162-
level = DiagStatus::ERROR;
163-
}
164-
181+
const uint8_t level = determineDiagnosticLevel(mem_available, swap_used);
165182
stat.summary(level, usage_dict_.at(level));
166183

167184
publishMemoryStatus(usage);
@@ -235,6 +252,29 @@ std::string MemMonitor::toHumanReadable(const std::string & str)
235252
return fmt::format(format, size, units[count]);
236253
}
237254

255+
uint8_t MemMonitor::determineDiagnosticLevel(size_t mem_available, size_t swap_used)
256+
{
257+
uint8_t level = DiagStatus::OK;
258+
if (mem_available < error_available_size_) {
259+
level = DiagStatus::ERROR;
260+
} else if (mem_available < warning_available_size_) {
261+
level = DiagStatus::WARN;
262+
}
263+
264+
// Swap usage checking
265+
if (swap_used > 0) {
266+
if (level == DiagStatus::OK) {
267+
level = DiagStatus::WARN;
268+
}
269+
270+
if (swap_used > swap_error_threshold_) {
271+
level = DiagStatus::ERROR;
272+
}
273+
}
274+
275+
return level;
276+
}
277+
238278
void MemMonitor::publishMemoryStatus(float usage)
239279
{
240280
tier4_external_api_msgs::msg::MemoryStatus memory_status;

system/autoware_system_monitor/test/src/mem_monitor/free1.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,7 @@
1919

2020
int main(int argc, char ** argv)
2121
{
22+
(void)argc;
23+
(void)argv;
2224
return -1;
2325
}

0 commit comments

Comments
 (0)