Skip to content

[BUG] Can't support NVIDIA B200 #1340

@cmubioinformatics

Description

@cmubioinformatics

It runs well on an A100, but when I run it on a B200, it throws the exception below.
I searched for this error and, although I am not sure, it looks like Cellpose's GPU acceleration module has not been updated to support the Blackwell architecture.

Is it possible that this will be supported in the future?

AcceleratorError                          Traceback (most recent call last)
Cell In[6], line 1
----> 1 img,final_mask = run_segmentation_pipeline(
      2     image_path='/data2/core-med1/public/Spatial_project/Xenium/tif/fullimage/AlphaSMA_Vimentin.tif',
      3     model_path='/data2/core-med1/public/Spatial_project/Xenium/cellpose/Models/X4.2',
      4     patch_seg = True,
      5     patch_size=20000,              
      6     patch_overlap=300,
      7     flow_threshold=0.5,             
      8     cellprob_threshold=0,
      9     tile_norm_blocksize=0,
     10     selected_channels=[0], 
     11     patch_masks_save_path='./result/patch_masks_mem_v3.npz',
     12     final_mask_save_path='./result/final_mask_mem_v3.tif'
     13 )
     15 # 2h 8m 48s

Cell In[3], line 274, in run_segmentation_pipeline(image_path, model_path, patch_seg, patch_size, patch_overlap, selected_channels, flow_threshold, cellprob_threshold, tile_norm_blocksize, patch_masks_save_path, final_mask_save_path)
    272 pbar = tqdm(patches, desc="Infer patches", total=len(patches))
    273 for patch in pbar:
--> 274     m, _, _ = model.eval(
    275         patch,
    276         batch_size=8,
    277         flow_threshold=FLOW_THRESHOLD,
    278         cellprob_threshold=CELLLPROB_THRESHOLD,
    279         normalize={'tile_norm_blocksize': TILE_NORM_BLOCKSIZE}
    280     )
    281     masks.append(m.astype(np.uint32, copy=False))
    282     pbar.update(1)

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/models.py:338](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/models.py#line=337), in CellposeModel.eval(self, x, batch_size, resample, channels, channel_axis, z_axis, normalize, invert, rescale, diameter, flow_threshold, cellprob_threshold, do_3D, anisotropy, flow3D_smooth, stitch_threshold, min_size, max_size_fraction, niter, augment, tile_overlap, bsize, compute_masks, progress)
    336     niter_scale = 1 if image_scaling is None else image_scaling
    337     niter = int(200/niter_scale) if niter is None or niter == 0 else niter
--> 338     masks = self._compute_masks(x.shape, dP, cellprob, flow_threshold=flow_threshold,
    339                     cellprob_threshold=cellprob_threshold, min_size=min_size,
    340                 max_size_fraction=max_size_fraction, niter=niter,
    341                 stitch_threshold=stitch_threshold, do_3D=do_3D)
    342 else:
    343     masks = np.zeros(0) #pass back zeros if not compute_masks

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/models.py:524](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/models.py#line=523), in CellposeModel._compute_masks(self, shape, dP, cellprob, flow_threshold, cellprob_threshold, min_size, max_size_fraction, niter, do_3D, stitch_threshold)
    521 for i in iterator:
    522     # turn off min_size for 3D stitching
    523     min_size0 = min_size if stitch_threshold == 0 or nimg == 1 else -1
--> 524     outputs = dynamics.resize_and_compute_masks(
    525         dP[:, i], cellprob[i],
    526         niter=niter, cellprob_threshold=cellprob_threshold,
    527         flow_threshold=flow_threshold, resize=resize,
    528         min_size=min_size0, max_size_fraction=max_size_fraction,
    529         device=self.device)
    530     if i==0 and nimg > 1:
    531         masks = np.zeros((nimg, shape[1], shape[2]), outputs.dtype)

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:610](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=609), in resize_and_compute_masks(dP, cellprob, niter, cellprob_threshold, flow_threshold, do_3D, min_size, max_size_fraction, resize, device)
    587 def resize_and_compute_masks(dP, cellprob, niter=200, cellprob_threshold=0.0,
    588                              flow_threshold=0.4, do_3D=False, min_size=15,
    589                              max_size_fraction=0.4, resize=None, device=torch.device("cpu")):
    590     """Compute masks using dynamics from dP and cellprob, and resizes masks if resize is not None.
    591 
    592     Args:
   (...)
    608         tuple: A tuple containing the computed masks and the final pixel locations.
    609     """
--> 610     mask = compute_masks(dP, cellprob, niter=niter,
    611                             cellprob_threshold=cellprob_threshold,
    612                             flow_threshold=flow_threshold, do_3D=do_3D,
    613                             max_size_fraction=max_size_fraction, 
    614                             device=device)
    616     if resize is not None:
    617         dynamics_logger.warning("Resizing is depricated in v4.0.1+")

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:672](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=671), in compute_masks(dP, cellprob, p, niter, cellprob_threshold, flow_threshold, do_3D, min_size, max_size_fraction, device)
    669 if not do_3D:
    670     if mask.max() > 0 and flow_threshold is not None and flow_threshold > 0:
    671         # make sure labels are unique at output of get_masks
--> 672         mask = remove_bad_flow_masks(mask, dP, threshold=flow_threshold,
    673                                      device=device)
    675 if mask.max() < 2**16 and mask.dtype != "uint16":
    676     mask = mask.astype("uint16")

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:443](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=442), in remove_bad_flow_masks(masks, flows, threshold, device)
    440         dynamics_logger.info("turn off QC step with flow_threshold=0 if too slow")
    441         device0 = torch.device("cpu")
--> 443 merrors, _ = flow_error(masks, flows, device0)
    444 badi = 1 + (merrors > threshold).nonzero()[0]
    445 masks[np.isin(masks, badi)] = 0

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:300](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=299), in flow_error(maski, dP_net, device)
    297     return
    299 # flows predicted from estimated masks
--> 300 dP_masks = masks_to_flows_gpu(maski, device=device)
    301 # difference between predicted flows vs mask flows
    302 flow_errors = np.zeros(maski.max())

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:139](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=138), in masks_to_flows_gpu(masks, device, niter)
    137 ### run diffusion
    138 n_iter = 2 * ext.max() if niter is None else niter
--> 139 mu = _extend_centers_gpu(neighbors, meds_p, isneighbor, shape, n_iter=n_iter,
    140                         device=device)
    141 mu = mu.astype("float64")
    143 # new normalization

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:50](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=49), in _extend_centers_gpu(***failed resolving arguments***)
     47 del meds, isneighbor, Tneigh
     49 if T.ndim == 2:
---> 50     grads = T[neighbors[0, [2, 1, 4, 3]], neighbors[1, [2, 1, 4, 3]]]
     51     del neighbors
     52     dy = grads[0] - grads[1]

AcceleratorError: CUDA error: invalid configuration argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Here is my PyTorch version and GPU information:

import torch
print(torch.__version__)
print(torch.cuda.get_device_name(0))
print(torch.version.cuda)

2.8.0+cu128
NVIDIA B200
12.8

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions