Commit 3a73309

Add bedrock llama4 pricing + handle llama4 templating on bedrock invoke route (#10582)
* build(model_prices_and_context_window.json): add bedrock llama4 models to model cost map
* fix template conversion for Llama 4 models in Bedrock (#10557)
* test: add testing to repro #10557
* test: add unit testing
* test(test_main.py): refactor where test is kept

Co-authored-by: aswny <[email protected]>
1 parent ae2a9cf commit 3a73309
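
For a quick end-to-end check of what this commit enables, here is a minimal usage sketch (not part of the commit): it assumes AWS credentials and region are already configured for Bedrock, and uses litellm's `bedrock/invoke/` model prefix so the request goes through the Invoke route whose Llama 4 templating is fixed here. The cost lookup at the end relies on the new pricing entries added below.

```python
import litellm

# Minimal sketch (not from this commit): call a Llama 4 instruct model over the
# Bedrock Invoke route. Assumes AWS credentials/region are set in the environment.
response = litellm.completion(
    model="bedrock/invoke/us.meta.llama4-scout-17b-instruct-v1:0",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=64,
)

print(response.choices[0].message.content)
# With the new model-cost-map entries, completion_cost should be able to price this call.
print(litellm.completion_cost(completion_response=response))
```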

6 files changed: +191 -39 lines changed

litellm/litellm_core_utils/prompt_templates/factory.py

+1-1
@@ -3633,7 +3633,7 @@ def prompt_factory(
             return mistral_instruct_pt(messages=messages)
         elif "llama2" in model and "chat" in model:
             return llama_2_chat_pt(messages=messages)
-        elif "llama3" in model and "instruct" in model:
+        elif ("llama3" in model or "llama4" in model) and "instruct" in model:
             return hf_chat_template(
                 model="meta-llama/Meta-Llama-3-8B-Instruct",
                 messages=messages,
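
In plain terms: Llama 4 instruct model IDs on Bedrock now reuse the same Hugging Face chat template as Llama 3 instruct models. A self-contained sketch of just that routing rule (the helper name and example model IDs are illustrative, not code from the repo):

```python
def uses_llama3_chat_template(model: str) -> bool:
    # Mirrors the condition added above: Llama 3 and Llama 4 instruct models
    # share the Meta-Llama-3-8B-Instruct chat template.
    model = model.lower()
    return ("llama3" in model or "llama4" in model) and "instruct" in model


assert uses_llama3_chat_template("meta.llama4-maverick-17b-instruct-v1:0")
assert uses_llama3_chat_template("meta.llama3-70b-instruct-v1:0")
assert not uses_llama3_chat_template("meta.llama2-70b-chat-v1")  # still routed to llama_2_chat_pt
```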

litellm/model_prices_and_context_window_backup.json

+60
@@ -10105,6 +10105,66 @@
         "supports_function_calling": true,
         "supports_tool_choice": false
     },
+    "meta.llama4-maverick-17b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00024e-3,
+        "input_cost_per_token_batches": 0.00012e-3,
+        "output_cost_per_token": 0.00097e-3,
+        "output_cost_per_token_batches": 0.000485e-3,
+        "litellm_provider": "bedrock_converse",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "supports_tool_choice": false,
+        "supported_modalities": ["text", "image"],
+        "supported_output_modalities": ["text", "code"]
+    },
+    "us.meta.llama4-maverick-17b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00024e-3,
+        "input_cost_per_token_batches": 0.00012e-3,
+        "output_cost_per_token": 0.00097e-3,
+        "output_cost_per_token_batches": 0.000485e-3,
+        "litellm_provider": "bedrock_converse",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "supports_tool_choice": false,
+        "supported_modalities": ["text", "image"],
+        "supported_output_modalities": ["text", "code"]
+    },
+    "meta.llama4-scout-17b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00017e-3,
+        "input_cost_per_token_batches": 0.000085e-3,
+        "output_cost_per_token": 0.00066e-3,
+        "output_cost_per_token_batches": 0.00033e-3,
+        "litellm_provider": "bedrock_converse",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "supports_tool_choice": false,
+        "supported_modalities": ["text", "image"],
+        "supported_output_modalities": ["text", "code"]
+    },
+    "us.meta.llama4-scout-17b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00017e-3,
+        "input_cost_per_token_batches": 0.000085e-3,
+        "output_cost_per_token": 0.00066e-3,
+        "output_cost_per_token_batches": 0.00033e-3,
+        "litellm_provider": "bedrock_converse",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "supports_tool_choice": false,
+        "supported_modalities": ["text", "image"],
+        "supported_output_modalities": ["text", "code"]
+    },
     "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
         "max_tokens": 77,
         "max_input_tokens": 77,

model_prices_and_context_window.json

+60
@@ -10105,6 +10105,66 @@
         "supports_function_calling": true,
         "supports_tool_choice": false
     },
+    "meta.llama4-maverick-17b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00024e-3,
+        "input_cost_per_token_batches": 0.00012e-3,
+        "output_cost_per_token": 0.00097e-3,
+        "output_cost_per_token_batches": 0.000485e-3,
+        "litellm_provider": "bedrock_converse",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "supports_tool_choice": false,
+        "supported_modalities": ["text", "image"],
+        "supported_output_modalities": ["text", "code"]
+    },
+    "us.meta.llama4-maverick-17b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00024e-3,
+        "input_cost_per_token_batches": 0.00012e-3,
+        "output_cost_per_token": 0.00097e-3,
+        "output_cost_per_token_batches": 0.000485e-3,
+        "litellm_provider": "bedrock_converse",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "supports_tool_choice": false,
+        "supported_modalities": ["text", "image"],
+        "supported_output_modalities": ["text", "code"]
+    },
+    "meta.llama4-scout-17b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00017e-3,
+        "input_cost_per_token_batches": 0.000085e-3,
+        "output_cost_per_token": 0.00066e-3,
+        "output_cost_per_token_batches": 0.00033e-3,
+        "litellm_provider": "bedrock_converse",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "supports_tool_choice": false,
+        "supported_modalities": ["text", "image"],
+        "supported_output_modalities": ["text", "code"]
+    },
+    "us.meta.llama4-scout-17b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00017e-3,
+        "input_cost_per_token_batches": 0.000085e-3,
+        "output_cost_per_token": 0.00066e-3,
+        "output_cost_per_token_batches": 0.00033e-3,
+        "litellm_provider": "bedrock_converse",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "supports_tool_choice": false,
+        "supported_modalities": ["text", "image"],
+        "supported_output_modalities": ["text", "code"]
+    },
     "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
         "max_tokens": 77,
         "max_input_tokens": 77,

tests/litellm/litellm_core_utils/prompt_templates/test_litellm_core_utils_prompt_templates_factory.py

+37-29
@@ -1,8 +1,14 @@
-import pytest
 import json
 from unittest.mock import patch
+
+import pytest
+
 import litellm
-from litellm.litellm_core_utils.prompt_templates.factory import ollama_pt, BAD_MESSAGE_ERROR_STR
+from litellm.litellm_core_utils.prompt_templates.factory import (
+    BAD_MESSAGE_ERROR_STR,
+    ollama_pt,
+)
+
 
 def test_ollama_pt_simple_messages():
     """Test basic functionality with simple text messages"""
@@ -11,14 +17,15 @@ def test_ollama_pt_simple_messages():
         {"role": "assistant", "content": "How can I help you?"},
         {"role": "user", "content": "Hello"},
     ]
-
+
     result = ollama_pt(model="llama2", messages=messages)
-
+
     expected_prompt = "### System:\nYou are a helpful assistant\n\n### Assistant:\nHow can I help you?\n\n### User:\nHello\n\n"
     assert isinstance(result, dict)
    assert result["prompt"] == expected_prompt
     assert result["images"] == []
 
+
 def test_ollama_pt_consecutive_user_messages():
     """Test handling consecutive user messages"""
     messages = [
@@ -28,14 +35,15 @@ def test_ollama_pt_consecutive_user_messages():
         {"role": "assistant", "content": "I'm good, thanks!"},
         {"role": "user", "content": "I am well too."},
     ]
-
+
     result = ollama_pt(model="llama2", messages=messages)
-
+
     # Consecutive user messages should be merged
     expected_prompt = "### User:\nHello\n\n### Assistant:\nHow can I help you?\n\n### User:\nHow are you?\n\n### Assistant:\nI'm good, thanks!\n\n### User:\nI am well too.\n\n"
     assert isinstance(result, dict)
     assert result["prompt"] == expected_prompt
 
+
 # def test_ollama_pt_consecutive_system_messages():
 #     """Test handling consecutive system messages"""
 #     messages = [
@@ -44,9 +52,9 @@ def test_ollama_pt_consecutive_user_messages():
 #         {"role": "system", "content": "Be concise and polite"},
 #         {"role": "assistant", "content": "How can I help you?"}
 #     ]
-
+
 #     result = ollama_pt(model="llama2", messages=messages)
-
+
 #     # Consecutive system messages should be merged
 #     expected_prompt = "### User:\nHello\n\n### System:\nYou are a helpful assistantBe concise and polite\n\n### Assistant:\nHow can I help you?\n\n"
 #     assert result == expected_prompt
@@ -59,9 +67,9 @@ def test_ollama_pt_consecutive_user_messages():
 #         {"role": "assistant", "content": "How can I help you?"},
 #         {"role": "user", "content": "Tell me a joke"}
 #     ]
-
+
 #     result = ollama_pt(model="llama2", messages=messages)
-
+
 #     # Consecutive assistant messages should be merged
 #     expected_prompt = "### User:\nHello\n\n### Assistant:\nHi there!How can I help you?\n\n### User:\nTell me a joke\n\n"
 #     assert result["prompt"] == expected_prompt
@@ -75,9 +83,9 @@ def test_ollama_pt_consecutive_user_messages():
 #     ]},
 #     {"role": "assistant", "content": "That's a cat."}
 #     ]
-
+
 #     result = ollama_pt(model="llama2", messages=messages)
-
+
 #     expected_prompt = "### User:\nWhat's in this image?\n\n### Assistant:\nThat's a cat.\n\n"
 #     assert result["prompt"] == expected_prompt
 #     assert result["images"] == ["http://example.com/image.jpg"]
@@ -91,9 +99,9 @@ def test_ollama_pt_consecutive_user_messages():
 #     ]},
 #     {"role": "assistant", "content": "That's a cat."}
 #     ]
-
+
 #     result = ollama_pt(model="llama2", messages=messages)
-
+
 #     expected_prompt = "### User:\nWhat's in this image?\n\n### Assistant:\nThat's a cat.\n\n"
 #     assert result["prompt"] == expected_prompt
 #     assert result["images"] == ["http://example.com/image.jpg"]
@@ -116,9 +124,9 @@ def test_ollama_pt_consecutive_user_messages():
 #     },
 #     {"role": "tool", "content": "Sunny, 72°F"}
 #     ]
-
+
 #     result = ollama_pt(model="llama2", messages=messages)
-
+
 #     # Check if tool call is included in the prompt
 #     assert "### User:\nWhat's the weather in San Francisco?" in result["prompt"]
 #     assert "### Assistant:\nI'll check the weather for you.Tool Calls:" in result["prompt"]
@@ -131,18 +139,18 @@ def test_ollama_pt_consecutive_user_messages():
 #     messages = [
 #         {"role": "invalid_role", "content": "This is an invalid role"}
 #     ]
-
+
 #     with pytest.raises(litellm.BadRequestError) as excinfo:
 #         ollama_pt(model="llama2", messages=messages)
-
+
 #     assert BAD_MESSAGE_ERROR_STR in str(excinfo.value)
 
 # def test_ollama_pt_empty_messages():
 #     """Test with empty messages list"""
 #     messages = []
-
+
 #     result = ollama_pt(model="llama2", messages=messages)
-
+
 #     assert result["prompt"] == ""
 #     assert result["images"] == []
 
@@ -155,9 +163,9 @@ def test_ollama_pt_consecutive_user_messages():
 #         {"role": "assistant", "content": "To get to the other side!"},
 #         {"role": "tool", "content": "Joke rating: 5/10"}
 #     ]
-
+
 #     result = ollama_pt(model="llama2", messages=messages)
-
+
 #     assert "### User:\nTell me a joke" in result["prompt"]
 #     assert "### Assistant:\nWhy did the chicken cross the road?" in result["prompt"]
 #     assert "### User:\nWhy?" in result["prompt"]
@@ -171,9 +179,9 @@ def test_ollama_pt_consecutive_user_messages():
 #         {"role": "function", "content": "The result is 4"},
 #         {"role": "assistant", "content": "The answer is 4."}
 #     ]
-
+
 #     result = ollama_pt(model="llama2", messages=messages)
-
+
 #     assert "### User:\nWhat's 2+2?The result is 4\n\n" in result["prompt"]
 #     assert "### Assistant:\nThe answer is 4.\n\n" in result["prompt"]
 
@@ -187,9 +195,9 @@ def test_ollama_pt_consecutive_user_messages():
 #     ]},
 #     {"role": "assistant", "content": "Both images show cats, but different breeds."}
 #     ]
-
+
 #     result = ollama_pt(model="llama2", messages=messages)
-
+
 #     expected_prompt = "### User:\nCompare these images:\n\n### Assistant:\nBoth images show cats, but different breeds.\n\n"
 #     assert result["prompt"] == expected_prompt
 #     assert result["images"] == ["http://example.com/image1.jpg", "http://example.com/image2.jpg"]
@@ -206,12 +214,12 @@ def test_ollama_pt_consecutive_user_messages():
 #         {"role": "system", "content": "Be helpful"},
 #         {"role": "assistant", "content": "I see a cat in the image."}
 #     ]
-
+
 #     result = ollama_pt(model="llama2", messages=messages)
-
+
 #     assert "### User:\nHello\n\n" in result["prompt"]
 #     assert "### Assistant:\nHi there!\n\n" in result["prompt"]
 #     assert "### User:\nLook at this:\n\n" in result["prompt"]
 #     assert "### System:\nBe helpful\n\n" in result["prompt"]
 #     assert "### Assistant:\nI see a cat in the image.\n\n" in result["prompt"]
-#     assert result["images"] == ["http://example.com/image.jpg"]
+#     assert result["images"] == ["http://example.com/image.jpg"]
