@@ -185,6 +185,106 @@ To do multi-LoRA batch inference, you need to set LoRA related parameters in `en
        batch_size=64,
    )

.. _vision_language_model:

Batch inference with a vision language model (VLM)
---------------------------------------------------

Ray Data LLM also supports batch inference with vision language models.
This example shows how to prepare a dataset that contains images and run
batch inference with a vision language model.

This example makes two adjustments on top of the previous example:

- Set ``has_image=True`` in ``vLLMEngineProcessorConfig``.
- Prepare the image input inside the preprocessor.

.. testcode::

    # The image handling below additionally needs Pillow and the Hugging Face
    # "datasets" library; the other names come from the previous examples.
    from io import BytesIO

    import datasets
    from PIL import Image

    # Load the "LMMs-Eval-Lite" dataset from Hugging Face.
    vision_dataset_llms_lite = datasets.load_dataset("lmms-lab/LMMs-Eval-Lite", "coco2017_cap_val")
    vision_dataset = ray.data.from_huggingface(vision_dataset_llms_lite["lite"])

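    # Configure the vLLM engine processor. Setting `has_image=True` enables image
    # content in the chat messages that the preprocessor produces; the remaining
    # fields follow the same pattern as the previous examples.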
    vision_processor_config = vLLMEngineProcessorConfig(
        model_source="Qwen/Qwen2.5-VL-3B-Instruct",
        engine_kwargs=dict(
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
            max_model_len=4096,
            enable_chunked_prefill=True,
            max_num_batched_tokens=2048,
        ),
        # Override Ray's runtime env to include the Hugging Face token.
        # Ray Data uses Ray under the hood to orchestrate the inference pipeline.
        runtime_env=dict(
            env_vars=dict(
                HF_TOKEN=HF_TOKEN,
                VLLM_USE_V1="1",
            ),
        ),
        batch_size=16,
        accelerator_type="L4",
        concurrency=1,
        has_image=True,
    )

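    # The preprocessor converts each dataset row into OpenAI-style chat messages.
    # The user message combines text parts with an image part built from the
    # row's image bytes.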
    def vision_preprocess(row: dict) -> dict:
        choice_indices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
        return dict(
            messages=[
                {
                    "role": "system",
                    "content": """Analyze the image and question carefully, using step-by-step reasoning.
    First, describe any image provided in detail. Then, present your reasoning. And finally your final answer in this format:
    Final Answer: <answer>
    where <answer> is:
    - The single correct letter choice A, B, C, D, E, F, etc. when options are provided. Only include the letter.
    - Your direct answer if no options are given, as a single phrase or number.
    - If your answer is a number, only include the number without any unit.
    - If your answer is a word or phrase, do not paraphrase or reformat the text you see in the image.
    - You cannot answer that the question is unanswerable. You must either pick an option or provide a direct answer.
    IMPORTANT: Remember, to end your answer with Final Answer: <answer>.""",
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": row["question"] + "\n\n",
                        },
                        {
                            "type": "image",
                            # Ray Data accepts a PIL Image or an image URL here.
                            "image": Image.open(BytesIO(row["image"]["bytes"])),
                        },
                        {
                            "type": "text",
                            "text": "\n\nChoices:\n" + "\n".join([f"{choice_indices[i]}. {choice}" for i, choice in enumerate(row["answer"])]),
                        },
                    ],
                },
            ],
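            # Per-row sampling parameters are returned alongside the messages.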
            sampling_params=dict(
                temperature=0.3,
                max_tokens=150,
                detokenize=False,
            ),
        )

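    # The postprocessor maps the generated text to a `resp` column.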
    def vision_postprocess(row: dict) -> dict:
        return {
            "resp": row["generated_text"],
        }

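    # Chain the preprocessor, the vLLM engine, and the postprocessor into a
    # single Ray Data LLM processor.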
    vision_processor = build_llm_processor(
        vision_processor_config,
        preprocess=vision_preprocess,
        postprocess=vision_postprocess,
    )

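    # Run the processor; materialize() executes the pipeline and stores the
    # results in Ray's object store.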
    vision_processed_ds = vision_processor(vision_dataset).materialize()
    vision_processed_ds.show(3)

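The postprocessor above maps each generated output to a ``resp`` column. As a
minimal sketch, you can read a few responses back from the materialized dataset
with ``take``, which returns a list of row dictionaries:

.. testcode::

    # Inspect a few generated answers; "resp" is the column produced by
    # vision_postprocess.
    for output_row in vision_processed_ds.take(3):
        print(output_row["resp"])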

.. _openai_compatible_api_endpoint:

Batch inference with an OpenAI-compatible endpoint