Description
I'm not sure whether my issue is related to issue #446, but here is what I experienced. The first time I load an ONNXRuntime-genai model into GPU memory (CUDA), the memory is not freed even after I call OgaDestroyModel(). After that, if I have to load another model during the same execution, I have to work around the memory that is still occupied. The problem does not seem to occur after the first load, and the memory is then freed correctly, but I don't understand why the first load occupies GPU memory permanently.
Here is a C program that reproduces the issue, inspired by the C example from the onnxruntime-genai repository:
#include <cuda_runtime.h>

#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>

#include "ort_genai.h"
// Convert an OgaResult error into a C++ exception.
static void CheckResult(OgaResult* result) {
  if (result) {
    std::string string = OgaResultGetError(result);
    OgaDestroyResult(result);
    throw std::runtime_error(string);
  }
}
static void C_API_stream(const char* model_path) {
  cudaError_t cudaStatus;

  OgaModel* model;
  std::cout << "Creating model..." << std::endl;
  CheckResult(OgaCreateModel(model_path, &model));

  // Measure GPU memory right after the model is loaded.
  size_t freeMem, totalMem;
  cudaStatus = cudaMemGetInfo(&freeMem, &totalMem);
  if (cudaStatus != cudaSuccess) {
    std::ostringstream error_stream;
    error_stream << "cudaMemGetInfo error: " << cudaGetErrorString(cudaStatus) << std::endl;
    throw std::runtime_error(error_stream.str());
  }
  std::cout << "GPU memory used by the model after loading: " << (totalMem - freeMem) / (1024 * 1024) << " MB" << std::endl;
  std::cout << std::endl;

  OgaTokenizer* tokenizer;
  std::cout << "Creating tokenizer..." << std::endl;
  CheckResult(OgaCreateTokenizer(model, &tokenizer));

  const char* prompt = "A simple recipe of Carbonara is ";
  std::cout << "Prompt: " << std::endl
            << prompt << std::endl;

  OgaSequences* sequences;
  CheckResult(OgaCreateSequences(&sequences));
  CheckResult(OgaTokenizerEncode(tokenizer, prompt, sequences));

  OgaGeneratorParams* params;
  CheckResult(OgaCreateGeneratorParams(model, &params));
  CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 256));
  CheckResult(OgaGeneratorParamsSetSearchNumber(params, "repetition_penalty", 1.1));
  CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", 0.8));
  CheckResult(OgaGeneratorParamsSetInputSequences(params, sequences));

  OgaGenerator* generator;
  CheckResult(OgaCreateGenerator(model, params, &generator));

  std::cout << "Output: " << std::endl;
  size_t offset = 0;
  while (!OgaGenerator_IsDone(generator)) {
    CheckResult(OgaGenerator_ComputeLogits(generator));
    CheckResult(OgaGenerator_GenerateNextToken(generator));
    // Decode and stream only the tokens generated since the last iteration.
    size_t num_tokens = OgaGenerator_GetSequenceCount(generator, 0);
    const int32_t* sequence_data = OgaGenerator_GetSequenceData(generator, 0);
    const char* out_string;
    CheckResult(OgaTokenizerDecode(tokenizer, sequence_data + offset, num_tokens - offset, &out_string));
    std::cout << out_string;
    OgaDestroyString(out_string);
    offset = num_tokens;
  }
  std::cout << std::endl;

  // Destroy everything that was created, in reverse order.
  OgaDestroyGenerator(generator);
  OgaDestroyGeneratorParams(params);
  OgaDestroySequences(sequences);
  OgaDestroyTokenizer(tokenizer);
  OgaDestroyModel(model);

  // Measure GPU memory again; on the first run the model's memory is still reported as used.
  cudaStatus = cudaMemGetInfo(&freeMem, &totalMem);
  if (cudaStatus != cudaSuccess) {
    std::ostringstream error_stream;
    error_stream << "cudaMemGetInfo error: " << cudaGetErrorString(cudaStatus) << std::endl;
    throw std::runtime_error(error_stream.str());
  }
  std::cout << "GPU memory used by the model after freeing: " << (totalMem - freeMem) / (1024 * 1024) << " MB" << std::endl;
  std::cout << std::endl;
}
static void print_usage(int /*argc*/, char** argv) {
  std::cerr << "usage: " << argv[0] << " model_path" << std::endl;
}

int main(int argc, char** argv) {
  if (argc != 2) {
    print_usage(argc, argv);
    return -1;
  }

  // Responsible for cleaning up the library during shutdown
  OgaHandle handle;

  cudaError_t cudaStatus;
  int deviceCount;
  cudaStatus = cudaGetDeviceCount(&deviceCount);
  if (cudaStatus != cudaSuccess) {
    std::cerr << "cudaGetDeviceCount error: " << cudaGetErrorString(cudaStatus) << std::endl;
    return 1;
  }
  if (deviceCount == 0) {
    std::cerr << "No CUDA device found." << std::endl;
    return 1;
  }

  int device = 0;
  cudaStatus = cudaSetDevice(device);
  if (cudaStatus != cudaSuccess) {
    std::cerr << "cudaSetDevice error: " << cudaGetErrorString(cudaStatus) << std::endl;
    return 1;
  }

  // Load and free the model twice: the memory from the first load is never reclaimed.
  for (std::size_t i = 0; i < 2; i++) {
    std::cout << "C API" << std::endl;
    try {
      C_API_stream(argv[1]);
    } catch (const std::exception& e) {
      std::cout << e.what() << std::endl;
      return -1;
    }
  }
  return 0;
}
After compiling, run the executable with the path to the genai model's config directory as the only argument. You must link against the onnxruntime-genai and CUDA runtime libraries.
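For reference, a possible build command on Linux (just a sketch: it assumes the CUDA toolkit lives under /usr/local/cuda and the onnxruntime-genai headers and libraries are on the default search paths; repro.cpp is a placeholder file name, so adjust everything to your setup):

g++ -std=c++17 repro.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lonnxruntime-genai -lcudart -o repro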
Language: C/C++
ONNXRuntime version: 1.18.0
ONNXRuntime-genai version: 0.4.0-dev
Execution provider: CUDA (v12.3)