
[QST]: How to run multi GPU cugraph leiden for a large graph #4884

Open · @abs51295
What is your question?

Hey, I am trying to run multi-GPU Leiden clustering on a large graph with 3.7 billion edges. I have 4 NVIDIA A100 GPUs with 80 GB of VRAM each. I run into out-of-memory errors when I try it, and I was wondering if you have any suggestions for handling a graph of this size.
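
For context, a back-of-the-envelope estimate of the edge-list footprint alone (my own arithmetic, assuming the int64/int64/float32 dtypes I cast to below):

edges = 3.7e9
bytes_per_edge = 8 + 8 + 4  # int64 source + int64 destination + float32 weight
print(f"{edges * bytes_per_edge / 1e9:.0f} GB")  # ~74 GB, vs. 4 x 80 GB = 320 GB of total VRAM

So the raw edge list is already roughly 74 GB, before cuGraph renumbers and (for an undirected graph) symmetrizes the edges, which as I understand it multiplies that footprint several times over. Here's my code: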

import argparse
import os

import dask
import dask_cudf
import numpy as np
from cugraph import Graph
from cugraph.dask import leiden as culeiden
from cugraph.dask.comms import comms as Comms
from cugraph.dask.common.mg_utils import get_visible_devices
from cuml.dask.common import utils as dask_utils
from dask.distributed import wait
from dask_cuda import LocalCUDACluster
from distributed import Client

def parse_args():
    parser = argparse.ArgumentParser(description='Compute leiden using multiple GPUs')
    parser.add_argument('--file', required=True, help='Input parquet edge-list file path')
    parser.add_argument('--output-dir', required=True, help='Output directory')
    parser.add_argument('--resolution', type=float, default=0.5, help='Resolution for leiden')
    parser.add_argument('--memory-fraction', type=float, default=0.7, 
                       help='Fraction of GPU memory to use for RMM pool')
    return parser.parse_args()

def save_leiden_output(dask_df, resolution, output_dir):
    file_path = os.path.join(output_dir, f"leiden_resolution_{resolution}.parquet")
    print(f"Saving leiden output to {file_path}")
    dask_df.to_parquet(file_path)
    
def load_and_persist_data(client, file_path):
    df = dask_cudf.read_parquet(
         path=file_path, columns=['source', 'destination', 'weights'], index=False, blocksize='1GiB',
    ).astype({'source': np.int64, 'destination': np.int64, 'weights': np.float32})
    
    (persisted_df,) = dask_utils.persist_across_workers(
        client, [df], workers=list(client.has_what().keys())
    )
    return persisted_df

def create_graph(persisted_df):
    g = Graph()
    g.from_dask_cudf_edgelist(
        persisted_df, source="source", destination="destination", weight="weights"
    )
    return g

def main():
    args = parse_args()

    dask.config.set({
        "dataframe.backend": "cudf",
        "distributed.scheduler.worker-ttl": None,
    })

    print(f"Devices available are: {get_visible_devices()}")

    client = Client(LocalCUDACluster(
        CUDA_VISIBLE_DEVICES="4,5,6,7",  # omit this to use all visible devices
        rmm_pool_size=args.memory_fraction,
        memory_limit=0.85,
        device_memory_limit=0.85,
        enable_cudf_spill=True,
        local_directory="/tmp",
        cudf_spill_stats=1,
        protocol="ucx",
        enable_tcp_over_ucx=True,
        enable_nvlink=True,
        enable_infiniband=False
    ))

    client.wait_for_workers(n_workers=4)
    Comms.initialize(p2p=True)
    
    print(f"Number of workers started are {len(client.has_what().keys())}")
    
    persisted_df = load_and_persist_data(client=client, file_path=args.file)
    wait(persisted_df)
    
    # explicitly check that every worker holds an equal number of partitions
    dist = np.array([len(v) for v in client.has_what().values()])
    print(f"Worker distribution is {dist}")
    #  assert np.all(dist == dist[0])

    print(f"Loaded data with shape: {persisted_df.shape} and number of partitions are {persisted_df.npartitions}")

    print("Creating a dask undirected weighted graph to run leiden.......")
    graph = create_graph(persisted_df)

    print("Running multi-gpu leiden algorithm for 100 iterations (default).........")
    leiden_parts, modularity = culeiden(
        graph,
        resolution=args.resolution,
        random_state=0,
        max_iter=100,
    )

    print(f"Leiden completed with modularity score of {modularity}")
    save_leiden_output(dask_df=leiden_parts, resolution=args.resolution, output_dir=args.output_dir)
    
    print("Processing completed successfully")

    client.shutdown()

if __name__ == "__main__":
    main()
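
For completeness, I invoke the script like this (the script name and paths are placeholders):

python mg_leiden.py --file /data/edges.parquet --output-dir /data/leiden_out --resolution 0.5 --memory-fraction 0.7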

Here's the error message I get:

Worker distribution is [8 8 8 8]
Loaded data with shape: (<dask_expr.expr.Scalar: expr=FromGraph(bca9c83).size() // 3, dtype=int64>, 3) and number of partitions are 32
Creating a dask undirected weighted graph to run leiden.......
/home/shaha4/miniforge3/envs/rapids-25.02/lib/python3.11/site-packages/cudf/core/reshape.py:384: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.
  warnings.warn(
[2025-01-22 12:58:34.783] [RMM] [error] [A][Stream 0x2][Upstream 7451880192B][FAILURE maximum pool size exceeded]
[2025-01-22 12:58:37.504] [RMM] [error] [A][Stream 0x2][Upstream 7451880192B][FAILURE maximum pool size exceeded]
[2025-01-22 12:58:38.862] [RMM] [error] [A][Stream 0x2][Upstream 7451880192B][FAILURE maximum pool size exceeded]
[2025-01-22 12:58:42.402] [RMM] [error] [A][Stream 0x2][Upstream 1862663936B][FAILURE maximum pool size exceeded]
2025-01-22 12:58:43,473 - distributed.worker - ERROR - Compute Failed
Key:       _make_plc_graph-266693e6-ddc3-4888-8e4c-5ac4fc9bef3a
State:     executing
Task:  <Task '_make_plc_graph-266693e6-ddc3-4888-8e4c-5ac4fc9bef3a' _make_plc_graph(...)>
Exception: "RuntimeError('non-success value returned from cugraph_mg_graph_create(): CUGRAPH_UNKNOWN_ERROR NCCL error encountered at: file=/home/shaha4/miniforge3/envs/rapids-25.02/include/raft/comms/detail/std_comms.hpp line=632: ')"
Traceback: '  File "/home/shaha4/miniforge3/envs/rapids-25.02/lib/python3.11/site-packages/cugraph/structure/graph_implementation/simpleDistributedGraph.py", line 140, in _make_plc_graph\n    plc_graph = MGGraph(\n                ^^^^^^^^\n  File "graphs.pyx", line 485, in pylibcugraph.graphs.MGGraph.__cinit__\n  File "utils.pyx", line 53, in pylibcugraph.utils.assert_success\n'

2025-01-22 12:58:43,473 - distributed.worker - ERROR - Compute Failed
Key:       _make_plc_graph-e77be12f-3695-45b5-a3d7-a8f9db4c9e0f
State:     executing
Task:  <Task '_make_plc_graph-e77be12f-3695-45b5-a3d7-a8f9db4c9e0f' _make_plc_graph(...)>
Exception: "RuntimeError('non-success value returned from cugraph_mg_graph_create(): CUGRAPH_UNKNOWN_ERROR NCCL error encountered at: file=/home/shaha4/miniforge3/envs/rapids-25.02/include/raft/comms/detail/std_comms.hpp line=632: ')"
Traceback: '  File "/home/shaha4/miniforge3/envs/rapids-25.02/lib/python3.11/site-packages/cugraph/structure/graph_implementation/simpleDistributedGraph.py", line 140, in _make_plc_graph\n    plc_graph = MGGraph(\n                ^^^^^^^^\n  File "graphs.pyx", line 485, in pylibcugraph.graphs.MGGraph.__cinit__\n  File "utils.pyx", line 53, in pylibcugraph.utils.assert_success\n'
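
The RMM "maximum pool size exceeded" failures all come from _make_plc_graph, so the workers seem to run out of pool during MG graph construction rather than during Leiden itself. Would switching the workers to RMM managed memory, so allocations can oversubscribe into host memory, be a reasonable thing to try? A minimal sketch of what I mean (using rmm_managed_memory here is my assumption from the dask_cuda docs, not something I have verified on this workload):

from dask_cuda import LocalCUDACluster
from distributed import Client

# Same topology as above, but with managed (unified) memory instead of a
# fixed-size RMM pool, trading some speed for extra headroom.
cluster = LocalCUDACluster(
    CUDA_VISIBLE_DEVICES="4,5,6,7",
    rmm_managed_memory=True,  # assumption: lets the driver page to host RAM
    enable_cudf_spill=True,
    local_directory="/tmp",
    protocol="ucx",
    enable_tcp_over_ucx=True,
    enable_nvlink=True,
    enable_infiniband=False,
)
client = Client(cluster)

Alternatively, if managed memory does not play well with UCX/NVLink, is there a recommended way to estimate the peak device memory cuGraph needs for MG graph creation, so I can size the pool or the number of GPUs accordingly?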

