Description
I'm not sure whether my issue is related to issue #446, but here is what I experienced. The first time I load an ONNXRuntime-genai model into GPU memory (CUDA), the memory is not freed even after I call OgaDestroyModel(). After that, if I have to load another model during the same execution, I have to work around the memory that is still occupied. The problem does not seem to occur after the first load, and the memory is then freed correctly, but I don't understand why the first load occupies GPU memory permanently.
Here is a C program that reproduces the issue, inspired by the C example from the onnxruntime-genai repository:
#include <cuda_runtime.h>

#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>

#include "ort_genai.h"
// Convert an OgaResult error into a C++ exception.
static void CheckResult(OgaResult* result) {
  if (result) {
    std::string string = OgaResultGetError(result);
    OgaDestroyResult(result);
    throw std::runtime_error(string);
  }
}
static void C_API_stream(const char* model_path) {
  cudaError_t cudaStatus;

  OgaModel* model;
  std::cout << "Creating model..." << std::endl;
  CheckResult(OgaCreateModel(model_path, &model));

  // Measure GPU memory right after the model is loaded.
  size_t freeMem, totalMem;
  cudaStatus = cudaMemGetInfo(&freeMem, &totalMem);
  if (cudaStatus != cudaSuccess) {
    std::ostringstream error_stream;
    error_stream << "cudaMemGetInfo error: " << cudaGetErrorString(cudaStatus) << std::endl;
    throw std::runtime_error(error_stream.str());
  }
  std::cout << "GPU memory used by the model after loading: " << (totalMem - freeMem) / (1024 * 1024) << " MB" << std::endl;
  std::cout << std::endl;

  OgaTokenizer* tokenizer;
  std::cout << "Creating tokenizer..." << std::endl;
  CheckResult(OgaCreateTokenizer(model, &tokenizer));

  const char* prompt = "A simple recipe of Carbonara is ";
  std::cout << "Prompt: " << std::endl
            << prompt << std::endl;

  OgaSequences* sequences;
  CheckResult(OgaCreateSequences(&sequences));
  CheckResult(OgaTokenizerEncode(tokenizer, prompt, sequences));

  OgaGeneratorParams* params;
  CheckResult(OgaCreateGeneratorParams(model, &params));
  CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 256));
  CheckResult(OgaGeneratorParamsSetSearchNumber(params, "repetition_penalty", 1.1));
  CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", 0.8));
  CheckResult(OgaGeneratorParamsSetInputSequences(params, sequences));

  OgaGenerator* generator;
  CheckResult(OgaCreateGenerator(model, params, &generator));

  std::cout << "Output: " << std::endl;
  size_t offset = 0;
  while (!OgaGenerator_IsDone(generator)) {
    CheckResult(OgaGenerator_ComputeLogits(generator));
    CheckResult(OgaGenerator_GenerateNextToken(generator));
    // Decode and stream only the tokens generated since the last iteration.
    size_t num_tokens = OgaGenerator_GetSequenceCount(generator, 0);
    const int32_t* sequence_data = OgaGenerator_GetSequenceData(generator, 0);
    const char* out_string;
    CheckResult(OgaTokenizerDecode(tokenizer, sequence_data + offset, num_tokens - offset, &out_string));
    std::cout << out_string;
    OgaDestroyString(out_string);
    offset = num_tokens;
  }
  std::cout << std::endl;

  // Destroy everything that was created, in reverse order.
  OgaDestroyGenerator(generator);
  OgaDestroyGeneratorParams(params);
  OgaDestroySequences(sequences);
  OgaDestroyTokenizer(tokenizer);
  OgaDestroyModel(model);

  // Measure GPU memory again; on the first run the model's memory is still reported as used.
  cudaStatus = cudaMemGetInfo(&freeMem, &totalMem);
  if (cudaStatus != cudaSuccess) {
    std::ostringstream error_stream;
    error_stream << "cudaMemGetInfo error: " << cudaGetErrorString(cudaStatus) << std::endl;
    throw std::runtime_error(error_stream.str());
  }
  std::cout << "GPU memory used by the model after freeing: " << (totalMem - freeMem) / (1024 * 1024) << " MB" << std::endl;
  std::cout << std::endl;
}
static void print_usage(int /*argc*/, char** argv) {
  std::cerr << "usage: " << argv[0] << " model_path" << std::endl;
}

int main(int argc, char** argv) {
  if (argc != 2) {
    print_usage(argc, argv);
    return -1;
  }

  // Responsible for cleaning up the library during shutdown
  OgaHandle handle;

  cudaError_t cudaStatus;
  int deviceCount;
  cudaStatus = cudaGetDeviceCount(&deviceCount);
  if (cudaStatus != cudaSuccess) {
    std::cerr << "cudaGetDeviceCount error: " << cudaGetErrorString(cudaStatus) << std::endl;
    return 1;
  }
  if (deviceCount == 0) {
    std::cerr << "No CUDA device found." << std::endl;
    return 1;
  }

  int device = 0;
  cudaStatus = cudaSetDevice(device);
  if (cudaStatus != cudaSuccess) {
    std::cerr << "cudaSetDevice error: " << cudaGetErrorString(cudaStatus) << std::endl;
    return 1;
  }

  // Load and free the model twice: the memory from the first load is never reclaimed.
  for (std::size_t i = 0; i < 2; i++) {
    std::cout << "C API" << std::endl;
    try {
      C_API_stream(argv[1]);
    } catch (const std::exception& e) {
      std::cout << e.what() << std::endl;
      return -1;
    }
  }
  return 0;
}
After compiling, run the executable with the path to the genai model's config directory as the only argument. You must link against the onnxruntime-genai and CUDA runtime libraries.
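For reference, a possible build command on Linux (just a sketch: it assumes the CUDA toolkit lives under /usr/local/cuda and the onnxruntime-genai headers and libraries are on the default search paths; repro.cpp is a placeholder file name, so adjust everything to your setup):

g++ -std=c++17 repro.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lonnxruntime-genai -lcudart -o repro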
Language: C/C++
ONNXRuntime version: 1.18.0
ONNXRuntime-genai version: 0.4.0-dev
Execution provider: CUDA (v12.3)