3737#include " lbann/proto/callbacks.pb.h"
3838
3939#include < algorithm>
40+ #include < h2/gpu/memory_utils.hpp>
4041#include < string>
4142
4243namespace lbann {
@@ -45,17 +46,12 @@ namespace callback {
4546/* *
4647 * @brief Returns the amount of GPU memory currently in use, or 0 if LBANN was
4748 * not compiled with GPU support.
49+ * TODO(later): Gather across all ranks?
4850 */
4951static inline size_t get_used_gpu_memory ()
5052{
5153#ifdef LBANN_HAS_GPU
52- size_t available;
53- size_t total;
54- #ifdef LBANN_HAS_CUDA
55- FORCE_CHECK_CUDA (cudaMemGetInfo (&available, &total));
56- #elif defined(LBANN_HAS_ROCM)
57- FORCE_CHECK_ROCM (hipMemGetInfo (&available, &total));
58- #endif
54+ auto const [available, total] = h2::gpu::mem_info ();
5955 // TODO(later): Might be nicer to return a struct with gathered information
6056 // (min, max, median across ranks)
6157 return total - available;
@@ -71,14 +67,7 @@ static inline size_t get_used_gpu_memory()
7167static inline size_t get_total_gpu_memory ()
7268{
7369#ifdef LBANN_HAS_GPU
74- size_t available;
75- size_t total;
76- #ifdef LBANN_HAS_CUDA
77- FORCE_CHECK_CUDA (cudaMemGetInfo (&available, &total));
78- #elif defined(LBANN_HAS_ROCM)
79- FORCE_CHECK_ROCM (hipMemGetInfo (&available, &total));
80- #endif
81- return total;
70+ return h2::gpu::mem_info ().total ;
8271#else
8372 return 0 ;
8473#endif
@@ -103,8 +92,9 @@ memory_profiler::memory_profiler(bool detailed_first_step)
10392 : callback_base(), m_detailed_first_step(detailed_first_step)
10493{
10594#ifndef LBANN_HAS_GPU
106- LBANN_WARNING (" Memory profiler callback only provides unaccounted memory "
107- " information with GPU support." );
95+ LBANN_WARNING (
96+ " Without GPU support, the memory profiler callback does not " ,
97+ " provide raw memory usage information, only expected allocation size." );
10898#endif
10999}
110100
0 commit comments