Skip to content

RuntimeError: CUDA error: device-side assert triggered #8

@Manusinh-Thakor

Description

@Manusinh-Thakor

I'm facing the following issue at the generation stage.

code:

# Minimal reproduction: load the MedGemma image-text-to-text model, build a
# chat prompt containing one chest X-ray image, and generate a description.
#
# Prerequisite (shell command, not Python):
#   pip install accelerate

from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import requests
import torch

model_id = "google/medgemma-4b-it"

# Load the model weights in bfloat16 and let `device_map="auto"` decide
# where the weights are placed.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

# Image attribution: Stillwaterising, CC0, via Wikimedia Commons
image_url = "https://upload.wikimedia.org/wikipedia/commons/c/c8/Chest_Xray_PA_3-8-2010.png"
image = Image.open(requests.get(image_url, headers={"User-Agent": "example"}, stream=True).raw)

# Chat-style prompt: a system instruction followed by a user turn that
# carries both the text request and the image object.
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are an expert radiologist."}]
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this X-ray"},
            {"type": "image", "image": image}
        ]
    }
]

# Render the chat template to tensors and move them to the model's device,
# passing bfloat16 as the target dtype.
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt"
).to(model.device, dtype=torch.bfloat16)

# Prompt length in tokens; used below to slice the prompt off the output.
input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    # do_sample=False requests deterministic (greedy) decoding.
    generation = model.generate(**inputs, max_new_tokens=200, do_sample=False)
    # Keep only the newly generated tokens of the first (only) sequence.
    generation = generation[0][input_len:]

decoded = processor.decode(generation, skip_special_tokens=True)
print(decoded)

error:
at this stage

# Failing step: generation under inference mode (no autograd tracking).
with torch.inference_mode():
    generation = model.generate(**inputs, max_new_tokens=200, do_sample=False)
    # Drop the prompt tokens, keeping only the newly generated ones.
    generation = generation[0][input_len:]

as below:


RuntimeError Traceback (most recent call last)
Cell In[2], line 82
80 if __name__ == "__main__":
81 torch.cuda.empty_cache()
---> 82 main()

Cell In[2], line 76, in main()
73 image = process_image(image_url)
75 # Generate and display report
---> 76 report = generate_radiology_report(image, model, processor)
77 print("\n=== RADIOLOGY REPORT ===")
78 print(report)

Cell In[2], line 52, in generate_radiology_report(image, model, processor)
43 inputs = processor.apply_chat_template(
44 messages,
45 add_generation_prompt=True,
(...)
48 return_tensors="pt"
49 ).to(DEVICE, dtype=DTYPE)
51 with torch.inference_mode():
---> 52 outputs = model.generate(
53 **inputs,
54 max_new_tokens=256, # Optimal for detailed reports
55 do_sample=True, # Better quality than greedy
56 temperature=0.7,
57 top_p=0.9,
58 pad_token_id=processor.tokenizer.pad_token_id
59 )
61 return processor.decode(
62 outputs[0][inputs["input_ids"].shape[-1]:],
63 skip_special_tokens=True
64 )

File /opt/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)

File /opt/venv/lib/python3.10/site-packages/transformers/generation/utils.py:2597, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, use_model_defaults, custom_generate, **kwargs)
2589 input_ids, model_kwargs = self._expand_inputs_for_generation(
2590 input_ids=input_ids,
2591 expand_size=generation_config.num_return_sequences,
2592 is_encoder_decoder=self.config.is_encoder_decoder,
2593 **model_kwargs,
2594 )
2596 # 12. run sample (it degenerates to greedy search when generation_config.do_sample=False)
-> 2597 result = self._sample(
2598 input_ids,
2599 logits_processor=prepared_logits_processor,
2600 stopping_criteria=prepared_stopping_criteria,
2601 generation_config=generation_config,
2602 synced_gpus=synced_gpus,
2603 streamer=streamer,
2604 **model_kwargs,
2605 )
2607 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2608 # 11. interleave input_ids with num_beams additional sequences per batch
2609 input_ids, model_kwargs = self._expand_inputs_for_generation(
2610 input_ids=input_ids,
2611 expand_size=generation_config.num_beams,
2612 is_encoder_decoder=self.config.is_encoder_decoder,
2613 **model_kwargs,
2614 )

File /opt/venv/lib/python3.10/site-packages/transformers/generation/utils.py:3602, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3600 probs = nn.functional.softmax(next_token_scores, dim=-1)
3601 # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
-> 3602 next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
3603 else:
3604 next_tokens = torch.argmax(next_token_scores, dim=-1)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.

Metadata

Metadata

Assignees

No one assigned

    Labels

    type: bug — Something isn't working

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions