Skip to content

Commit eff3a26

Browse files
committed
Address review comments
1 parent 9e66dd9 commit eff3a26

File tree

2 files changed

+9
-24
lines changed

2 files changed

+9
-24
lines changed

src/callbacks/gpu_memory_usage.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "lbann/models/model.hpp"
3131
#include "lbann/utils/gpu/helpers.hpp"
3232
#include "lbann/utils/serialize.hpp"
33+
#include <h2/gpu/memory_utils.hpp>
3334
#include <iomanip>
3435
#include <sstream>
3536

@@ -79,13 +80,7 @@ void gpu_memory_usage::write_specific_proto(lbann_data::Callback& proto) const
7980
void gpu_memory_usage::on_epoch_begin(model* m)
8081
{
8182
#ifdef LBANN_HAS_GPU
82-
size_t available;
83-
size_t total;
84-
#ifdef LBANN_HAS_CUDA
85-
FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total));
86-
#elif defined(LBANN_HAS_ROCM)
87-
FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total));
88-
#endif
83+
auto const [available, total] = h2::gpu::mem_info();
8984
size_t used = total - available;
9085
auto comm = m->get_comm();
9186
if (comm->am_trainer_master()) {

src/callbacks/memory_profiler.cpp

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include "lbann/proto/callbacks.pb.h"
3838

3939
#include <algorithm>
40+
#include <h2/gpu/memory_utils.hpp>
4041
#include <string>
4142

4243
namespace lbann {
@@ -45,17 +46,12 @@ namespace callback {
4546
/**
4647
* @brief Returns the currently used GPU memory, or 0 if LBANN was not compiled with
4748
* GPU support.
49+
* TODO(later): Gather across all ranks?
4850
*/
4951
static inline size_t get_used_gpu_memory()
5052
{
5153
#ifdef LBANN_HAS_GPU
52-
size_t available;
53-
size_t total;
54-
#ifdef LBANN_HAS_CUDA
55-
FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total));
56-
#elif defined(LBANN_HAS_ROCM)
57-
FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total));
58-
#endif
54+
auto const [available, total] = h2::gpu::mem_info();
5955
// TODO(later): Might be nicer to return a struct with gathered information
6056
// (min, max, median across ranks)
6157
return total - available;
@@ -71,14 +67,7 @@ static inline size_t get_used_gpu_memory()
7167
static inline size_t get_total_gpu_memory()
7268
{
7369
#ifdef LBANN_HAS_GPU
74-
size_t available;
75-
size_t total;
76-
#ifdef LBANN_HAS_CUDA
77-
FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total));
78-
#elif defined(LBANN_HAS_ROCM)
79-
FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total));
80-
#endif
81-
return total;
70+
return h2::gpu::mem_info().total;
8271
#else
8372
return 0;
8473
#endif
@@ -103,8 +92,9 @@ memory_profiler::memory_profiler(bool detailed_first_step)
10392
: callback_base(), m_detailed_first_step(detailed_first_step)
10493
{
10594
#ifndef LBANN_HAS_GPU
106-
LBANN_WARNING("Memory profiler callback only provides unaccounted memory "
107-
"information with GPU support.");
95+
LBANN_WARNING(
96+
"Without GPU support, the memory profiler callback does not ",
97+
"provide raw memory usage information, only expected allocation size.");
10898
#endif
10999
}
110100

0 commit comments

Comments
 (0)