[BUG] Can't support NVIDIA B200

It runs well on an A100, but when I run it on a B200, it throws the exception below.
I searched for this error and, although I'm not sure, it looks like Cellpose's GPU acceleration module has not been updated to support the Blackwell architecture.

Could this be supported in the future?

```
AcceleratorError                          Traceback (most recent call last)
Cell In[6], line 1
----> 1 img,final_mask = run_segmentation_pipeline(
      2     image_path='/data2/core-med1/public/Spatial_project/Xenium/tif/fullimage/AlphaSMA_Vimentin.tif',
      3     model_path='/data2/core-med1/public/Spatial_project/Xenium/cellpose/Models/X4.2',
      4     patch_seg = True,
      5     patch_size=20000,              
      6     patch_overlap=300,
      7     flow_threshold=0.5,             
      8     cellprob_threshold=0,
      9     tile_norm_blocksize=0,
     10     selected_channels=[0], 
     11     patch_masks_save_path='./result/patch_masks_mem_v3.npz',
     12     final_mask_save_path='./result/final_mask_mem_v3.tif'
     13 )
     15 # 2h 8m 48s

Cell In[3], line 274, in run_segmentation_pipeline(image_path, model_path, patch_seg, patch_size, patch_overlap, selected_channels, flow_threshold, cellprob_threshold, tile_norm_blocksize, patch_masks_save_path, final_mask_save_path)
    272 pbar = tqdm(patches, desc="Infer patches", total=len(patches))
    273 for patch in pbar:
--> 274     m, _, _ = model.eval(
    275         patch,
    276         batch_size=8,
    277         flow_threshold=FLOW_THRESHOLD,
    278         cellprob_threshold=CELLLPROB_THRESHOLD,
    279         normalize={'tile_norm_blocksize': TILE_NORM_BLOCKSIZE}
    280     )
    281     masks.append(m.astype(np.uint32, copy=False))
    282     pbar.update(1)

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/models.py:338](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/models.py#line=337), in CellposeModel.eval(self, x, batch_size, resample, channels, channel_axis, z_axis, normalize, invert, rescale, diameter, flow_threshold, cellprob_threshold, do_3D, anisotropy, flow3D_smooth, stitch_threshold, min_size, max_size_fraction, niter, augment, tile_overlap, bsize, compute_masks, progress)
    336     niter_scale = 1 if image_scaling is None else image_scaling
    337     niter = int(200/niter_scale) if niter is None or niter == 0 else niter
--> 338     masks = self._compute_masks(x.shape, dP, cellprob, flow_threshold=flow_threshold,
    339                     cellprob_threshold=cellprob_threshold, min_size=min_size,
    340                 max_size_fraction=max_size_fraction, niter=niter,
    341                 stitch_threshold=stitch_threshold, do_3D=do_3D)
    342 else:
    343     masks = np.zeros(0) #pass back zeros if not compute_masks

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/models.py:524](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/models.py#line=523), in CellposeModel._compute_masks(self, shape, dP, cellprob, flow_threshold, cellprob_threshold, min_size, max_size_fraction, niter, do_3D, stitch_threshold)
    521 for i in iterator:
    522     # turn off min_size for 3D stitching
    523     min_size0 = min_size if stitch_threshold == 0 or nimg == 1 else -1
--> 524     outputs = dynamics.resize_and_compute_masks(
    525         dP[:, i], cellprob[i],
    526         niter=niter, cellprob_threshold=cellprob_threshold,
    527         flow_threshold=flow_threshold, resize=resize,
    528         min_size=min_size0, max_size_fraction=max_size_fraction,
    529         device=self.device)
    530     if i==0 and nimg > 1:
    531         masks = np.zeros((nimg, shape[1], shape[2]), outputs.dtype)

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:610](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=609), in resize_and_compute_masks(dP, cellprob, niter, cellprob_threshold, flow_threshold, do_3D, min_size, max_size_fraction, resize, device)
    587 def resize_and_compute_masks(dP, cellprob, niter=200, cellprob_threshold=0.0,
    588                              flow_threshold=0.4, do_3D=False, min_size=15,
    589                              max_size_fraction=0.4, resize=None, device=torch.device("cpu")):
    590     """Compute masks using dynamics from dP and cellprob, and resizes masks if resize is not None.
    591 
    592     Args:
   (...)
    608         tuple: A tuple containing the computed masks and the final pixel locations.
    609     """
--> 610     mask = compute_masks(dP, cellprob, niter=niter,
    611                             cellprob_threshold=cellprob_threshold,
    612                             flow_threshold=flow_threshold, do_3D=do_3D,
    613                             max_size_fraction=max_size_fraction, 
    614                             device=device)
    616     if resize is not None:
    617         dynamics_logger.warning("Resizing is depricated in v4.0.1+")

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:672](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=671), in compute_masks(dP, cellprob, p, niter, cellprob_threshold, flow_threshold, do_3D, min_size, max_size_fraction, device)
    669 if not do_3D:
    670     if mask.max() > 0 and flow_threshold is not None and flow_threshold > 0:
    671         # make sure labels are unique at output of get_masks
--> 672         mask = remove_bad_flow_masks(mask, dP, threshold=flow_threshold,
    673                                      device=device)
    675 if mask.max() < 2**16 and mask.dtype != "uint16":
    676     mask = mask.astype("uint16")

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:443](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=442), in remove_bad_flow_masks(masks, flows, threshold, device)
    440         dynamics_logger.info("turn off QC step with flow_threshold=0 if too slow")
    441         device0 = torch.device("cpu")
--> 443 merrors, _ = flow_error(masks, flows, device0)
    444 badi = 1 + (merrors > threshold).nonzero()[0]
    445 masks[np.isin(masks, badi)] = 0

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:300](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=299), in flow_error(maski, dP_net, device)
    297     return
    299 # flows predicted from estimated masks
--> 300 dP_masks = masks_to_flows_gpu(maski, device=device)
    301 # difference between predicted flows vs mask flows
    302 flow_errors = np.zeros(maski.max())

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:139](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=138), in masks_to_flows_gpu(masks, device, niter)
    137 ### run diffusion
    138 n_iter = 2 * ext.max() if niter is None else niter
--> 139 mu = _extend_centers_gpu(neighbors, meds_p, isneighbor, shape, n_iter=n_iter,
    140                         device=device)
    141 mu = mu.astype("float64")
    143 # new normalization

File [/data2/core-med1/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py:50](http://127.0.0.1:8891/lab/tree/project/xenium/code/miniconda/envs/cellpose_b200/lib/python3.10/site-packages/cellpose/dynamics.py#line=49), in _extend_centers_gpu(***failed resolving arguments***)
     47 del meds, isneighbor, Tneigh
     49 if T.ndim == 2:
---> 50     grads = T[neighbors[0, [2, 1, 4, 3]], neighbors[1, [2, 1, 4, 3]]]
     51     del neighbors
     52     dy = grads[0] - grads[1]

AcceleratorError: CUDA error: invalid configuration argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
```


Here are my torch version, GPU name, and CUDA version:
```
import torch
print(torch.__version__)
print(torch.cuda.get_device_name(0))
print(torch.version.cuda)

2.8.0+cu128
NVIDIA B200
12.8
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[BUG]can't support nvidia b200 #1340

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[BUG]can't support nvidia b200 #1340

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions