llava-next Fp8 (#209)

yuanwu2017 · Thanaji Rao Thakkalapelli · regisss · web-flow · commit 2985503900f1 · 2024-08-26T16:53:08.000+02:00
Signed-off-by: yuanwu <yuan.wu@intel.com>
Co-authored-by: Thanaji Rao Thakkalapelli <tthakkalapelli@habana.ai>
Co-authored-by: regisss <15324346+regisss@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -28,7 +28,9 @@ limitations under the License.
     - [LLama 7b FP8 on 1 Gaudi2 card](#llama-7b-fp8-on-1-gaudi2-card)
     - [LLama 70b BF16 on 8 Gaudi2 card](#llama-70b-bf16-on-8-gaudi2-card)
     - [LLama 70b FP8 on 8 Gaudi2 card](#llama-70b-fp8-on-8-gaudi2-card)
-    - [LLava-next 7B BF16 on 1 Gaudi2 card](#llava-next-7b-bf16-on-1-gaudi2-card)
+    - [Llava-next](#llava-next)
+      - [llava-v1.6-mistral-7b-hf BF16 on 1 Gaudi2 card](#llava-v16-mistral-7b-hf-bf16-on-1-gaudi2-card)
+      - [llava-v1.6-mistral-7b-hf FP8 on 1 Gaudi2 card](#llava-v16-mistral-7b-hf-fp8-on-1-gaudi2-card)
   - [Environment variables](#environment-variables)
   - [Profiler](#profiler)
 
@@ -264,8 +266,9 @@ docker run -p 8080:80 \
    --sharded true \
    --num-shard 8
 ```
+### Llava-next
 
-### LLava-next 7B BF16 on 1 Gaudi2 card
+#### llava-v1.6-mistral-7b-hf BF16 on 1 Gaudi2 card
 
 An image usually accounts for 2000 input tokens. For example, an image of size 512x512 is represented by 2800 tokens. Thus, `max-input-tokens` must be larger than the number of tokens associated with the image. Otherwise the image may be truncated. We set `BASE_IMAGE_TOKENS=2048` as the default image token number. This is the minimum value of `max-input-tokens`. You can override the environment variable `BASE_IMAGE_TOKENS` to change this value. The warmup will generate graphs with input length from `BASE_IMAGE_TOKENS` to `max-input-tokens`. For LLava-next 7B, the value of `max-batch-prefill-tokens` is 16384, which is calculated as follows: `prefill_batch_size` = `max-batch-prefill-tokens` / `max-input-tokens` (with `max-input-tokens` set to 4096, this gives a `prefill_batch_size` of 16384 / 4096 = 4).
 
@@ -281,7 +284,44 @@ docker run -p 8080:80 \
    -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
    -e HF_HUB_ENABLE_HF_TRANSFER=1 \
    -e HUGGING_FACE_HUB_TOKEN=$hf_token \
-   -e PREFILL_BATCH_BUCKET_SIZE=1 \
+   --cap-add=sys_nice \
+   --ipc=host \
+   ghcr.io/huggingface/tgi-gaudi:2.0.1 \
+   --model-id $model \
+   --max-input-tokens 4096 \
+   --max-batch-prefill-tokens 16384 \
+   --max-total-tokens 8192
+```
+
+Send a simple request.
+```bash
+curl -N 127.0.0.1:8080/generate_stream \
+    -X POST \
+    -d '{"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n","parameters":{"max_new_tokens":16, "seed": 42}}' \
+    -H 'Content-Type: application/json'
+```
+
+Multi-card Llava-next inference is currently not supported.
+
+#### llava-v1.6-mistral-7b-hf FP8 on 1 Gaudi2 card
+
+```bash
+model=llava-hf/llava-v1.6-mistral-7b-hf
+hf_token=YOUR_ACCESS_TOKEN   # HF access token
+volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e HABANA_VISIBLE_DEVICES=all \
+   -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+   -e HF_HUB_ENABLE_HF_TRANSFER=1 \
+   -e HUGGING_FACE_HUB_TOKEN=$hf_token \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e USE_FLASH_ATTENTION=true \
+   -e FLASH_ATTENTION_RECOMPUTE=true \
    --cap-add=sys_nice \
    --ipc=host \
    ghcr.io/huggingface/tgi-gaudi:2.0.1 \
diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py
@@ -86,6 +86,7 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         token_idx: Optional[torch.Tensor] = None,
+        use_flash_attention: Optional[bool] = False,
     ):
 
         if token_idx is not None:
@@ -107,6 +108,8 @@ def forward(
                 output_hidden_states=output_hidden_states,
                 return_dict=return_dict,
                 token_idx=token_idx,
+                use_flash_attention=use_flash_attention,
+                flash_attention_recompute=use_flash_attention,
             )
 
             logits = outputs[0]
@@ -145,7 +148,7 @@ def prepare_inputs_for_generation(
                     **kwargs,
                 )
             else:
-                
+                use_flash_attention = kwargs.get("use_flash_attention", False)
                 position_ids = kwargs.get("position_ids", None)
                 labels = kwargs.get("labels", None)
                 if past_key_values is None and pixel_values is not None and input_ids.shape[1] != 1:
@@ -166,7 +169,7 @@ def prepare_inputs_for_generation(
                     batch_size, num_patches, num_channels, height, width = pixel_values.shape
                     reshaped_pixel_values = pixel_values.view(batch_size * num_patches, num_channels, height, width)
                     image_features = self.vision_tower(
-                        reshaped_pixel_values, output_hidden_states=True
+                        reshaped_pixel_values, output_hidden_states=True, use_flash_attention=use_flash_attention
                     )
 
                     selected_image_feature = image_features.hidden_states[vision_feature_layer]
@@ -279,6 +282,7 @@ def prepare_inputs_for_generation(
                         "attention_mask": attention_mask,
                         "token_idx": token_idx,
                         "labels": labels,
+                        "use_flash_attention": use_flash_attention,
                     }
                 )
 
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
@@ -591,7 +591,7 @@ def __init__(
             "return_dict": True,
         }
 
-        if model.config.model_type in ["llama", "mistral"]:
+        if model.config.model_type in ["llama", "mistral", "llava_next"]:
             kwargs["attn_softmax_bf16"] = True
             kwargs["trim_logits"] = True
 

Original file line number	Diff line number	Diff line change
`@@ -591,7 +591,7 @@ def __init__(`
`591`	`591`	`"return_dict": True,`
`592`	`592`	`}`
`593`	`593`
`594`		`- if model.config.model_type in ["llama", "mistral"]:`
	`594`	`+ if model.config.model_type in ["llama", "mistral", "llava_next"]:`
`595`	`595`	`kwargs["attn_softmax_bf16"] = True`
`596`	`596`	`kwargs["trim_logits"] = True`
`597`	`597`