BerriAI · krrishdholakia · Mar 21, 2026 · Mar 21, 2026 · Mar 21, 2026 · Mar 21, 2026
diff --git a/docs/my-website/blog/litellm_observatory/index.md b/docs/my-website/blog/litellm_observatory/index.md
@@ -93,7 +93,7 @@ Our focus moving forward is on being the first to detect issues, even when they
 The `TestOAIAzureRelease` test is designed to catch a class of bugs that only surface after sustained runtime:
 
 - **Duration**: Runs continuously for 3 hours
-- **Behavior**: Cycles through specified models (such as `gpt-4` and `gpt-3.5-turbo`), issuing requests continuously
+- **Behavior**: Cycles through specified models (such as `gpt-4` and `gpt-4o`), issuing requests continuously
 - **Why 3 Hours**: This helps catch issues where HTTP clients degrade or fail after extended use (for example, a bug observed in LiteLLM v1.81.3)
 - **Pass / Fail Criteria**: The test passes if fewer than 1% of requests fail. If the failure rate exceeds 1%, the test fails and we are notified in Slack
 - **Key Detail**: The same HTTP client is reused for the entire run, allowing us to detect lifecycle-related bugs that only appear under prolonged reuse

diff --git a/docs/my-website/docs/adding_provider/adding_guardrail_support.md b/docs/my-website/docs/adding_provider/adding_guardrail_support.md
@@ -321,7 +321,7 @@ curl -X POST 'http://localhost:4000/{my_endpoint}' \
 -H 'Content-Type: application/json' \
 -H 'Authorization: Bearer your-api-key' \
 -d '{
-    "model": "gpt-3.5-turbo",
+    "model": "gpt-4o",
     "messages": [{"role": "user", "content": "Hello"}],
     "guardrails": ["test"]
 }'

diff --git a/docs/my-website/docs/adding_provider/generic_prompt_management_api.md b/docs/my-website/docs/adding_provider/generic_prompt_management_api.md
@@ -131,9 +131,9 @@ Add to `config.yaml`:
 
 ```yaml
 model_list:
-  - model_name: gpt-3.5-turbo
+  - model_name: gpt-4o
     litellm_params:
-      model: openai/gpt-3.5-turbo
+      model: openai/gpt-4o
       api_key: os.environ/OPENAI_API_KEY
 
 prompts:

diff --git a/docs/my-website/docs/budget_manager.md b/docs/my-website/docs/budget_manager.md
@@ -51,7 +51,7 @@ if not budget_manager.is_valid_user(user):
 
 # check if a given call can be made
 if budget_manager.get_current_cost(user=user) <= budget_manager.get_total_budget(user):
-    response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}])
+    response = completion(model="gpt-4o", messages=[{"role": "user", "content": "Hey, how's it going?"}])
     budget_manager.update_cost(completion_obj=response, user=user)
 else:
     response = "Sorry - no budget!"
@@ -72,7 +72,7 @@ budget_manager.create_budget(total_budget=10, user=user, duration="daily")
 
 input_text = "hello world"
 output_text = "it's a sunny day in san francisco"
-model = "gpt-3.5-turbo"
+model = "gpt-4o"
 
 budget_manager.update_cost(user=user, model=model, input_text=input_text, output_text=output_text) # 👈
 print(budget_manager.get_current_cost(user))
@@ -108,7 +108,7 @@ if not budget_manager.is_valid_user(user):
 
 # check if a given call can be made
 if budget_manager.get_current_cost(user=user) <= budget_manager.get_total_budget(user):
-    response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}])
+    response = completion(model="gpt-4o", messages=[{"role": "user", "content": "Hey, how's it going?"}])
     budget_manager.update_cost(completion_obj=response, user=user)
 else:
     response = "Sorry - no budget!"
@@ -138,7 +138,7 @@ if not budget_manager.is_valid_user(user):
 
 # check if a given call can be made
 if budget_manager.get_current_cost(user=user) <= budget_manager.get_total_budget(user):
-    response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey, how's it going?"}])
+    response = completion(model="gpt-4o", messages=[{"role": "user", "content": "Hey, how's it going?"}])
     budget_manager.update_cost(completion_obj=response, user=user)
 else:
     response = "Sorry - no budget!"

diff --git a/docs/my-website/docs/caching/all_caches.md b/docs/my-website/docs/caching/all_caches.md
@@ -39,11 +39,11 @@ litellm.cache = Cache(type="redis", host=<host>, port=<port>, password=<password
 
 # Make completion calls
 response1 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[{"role": "user", "content": "Tell me a joke."}]
 )
 response2 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[{"role": "user", "content": "Tell me a joke."}]
 )
 
@@ -77,11 +77,11 @@ litellm.cache = RedisClusterCache(
 
 # Make completion calls
 response1 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[{"role": "user", "content": "Tell me a joke."}]
 )
 response2 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[{"role": "user", "content": "Tell me a joke."}]
 )
 
@@ -132,11 +132,11 @@ from litellm.caching.caching import Cache
 litellm.cache = Cache(type="gcs", gcs_bucket_name="my-cache-bucket", gcs_path_service_account="/path/to/service_account.json")
 
 response1 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[{"role": "user", "content": "Tell me a joke."}]
 )
 response2 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[{"role": "user", "content": "Tell me a joke."}]
 )
 
@@ -170,11 +170,11 @@ litellm.cache = Cache(type="s3", s3_bucket_name="cache-bucket-litellm", s3_regio
 
 # Make completion calls
 response1 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}]
 )
 response2 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}]
 )
 
@@ -201,11 +201,11 @@ litellm.cache = Cache(type="azure-blob", azure_account_url="https://example.blob
 
 # Make completion calls
 response1 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[{"role": "user", "content": "Tell me a joke."}]
 )
 response2 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[{"role": "user", "content": "Tell me a joke."}]
 )
 
@@ -244,7 +244,7 @@ litellm.cache = Cache(
     redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
 )
 response1 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[
         {
             "role": "user",
@@ -258,7 +258,7 @@ print(f"response1: {response1}")
 random_number = random.randint(1, 100000)
 
 response2 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[
         {
             "role": "user",
@@ -301,7 +301,7 @@ litellm.cache = Cache(
 )
 
 response1 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[
         {
             "role": "user",
@@ -315,7 +315,7 @@ print(f"response1: {response1}")
 random_number = random.randint(1, 100000)
 
 response2 = completion(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages=[
         {
             "role": "user",
@@ -343,12 +343,12 @@ litellm.cache = Cache()
 
 # Make completion calls
 response1 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}],
     caching=True
 )
 response2 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}],
     caching=True
 )
@@ -379,12 +379,12 @@ litellm.cache = Cache(type="disk")
 
 # Make completion calls
 response1 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}],
     caching=True
 )
 response2 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}],
     caching=True
 )
@@ -416,7 +416,7 @@ Example usage `no-cache` - When `True`, Will not return a cached response
 
 ```python
 response = litellm.completion(
-        model="gpt-3.5-turbo",
+        model="gpt-4o",
         messages=[
             {
                 "role": "user",
@@ -435,7 +435,7 @@ Example usage `no-store` - When `True`, Will not cache the response.
 
 ```python
 response = litellm.completion(
-        model="gpt-3.5-turbo",
+        model="gpt-4o",
         messages=[
             {
                 "role": "user",
@@ -453,7 +453,7 @@ Example usage `ttl` - cache the response for 10 seconds
 
 ```python
 response = litellm.completion(
-        model="gpt-3.5-turbo",
+        model="gpt-4o",
         messages=[
             {
                 "role": "user",
@@ -471,7 +471,7 @@ Example usage `s-maxage` - Will only accept cached responses for 60 seconds
 
 ```python
 response = litellm.completion(
-        model="gpt-3.5-turbo",
+        model="gpt-4o",
         messages=[
             {
                 "role": "user",

diff --git a/docs/my-website/docs/caching/caching_api.md b/docs/my-website/docs/caching/caching_api.md
@@ -11,13 +11,13 @@ litellm.cache = Cache(type="hosted") # init cache to use api.litellm.ai
 
 # Make completion calls
 response1 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
-    messages=[{"role": "user", "content": "Tell me a joke."}]
+    messages=[{"role": "user", "content": "Tell me a joke."}],
     caching=True
 )
 
 response2 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}],
     caching=True
 )
@@ -59,7 +59,7 @@ litellm.cache = Cache(type="hosted")
 
 # Make completion calls
 response1 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}], 
     stream=True,
     caching=True)
@@ -69,7 +69,7 @@ for chunk in response1:
 time.sleep(1) # cache is updated asynchronously
 
 response2 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}], 
     stream=True,
     caching=True)

diff --git a/docs/my-website/docs/caching/local_caching.md b/docs/my-website/docs/caching/local_caching.md
@@ -18,12 +18,12 @@ litellm.cache = Cache()
 
 # Make completion calls
 response1 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
-    messages=[{"role": "user", "content": "Tell me a joke."}]
+    messages=[{"role": "user", "content": "Tell me a joke."}],
     caching=True
 )
 response2 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}],
     caching=True
 )
@@ -55,14 +55,14 @@ litellm.cache = Cache()
 
 # Make completion calls
 response1 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}], 
     stream=True,
     caching=True)
 for chunk in response1:
     print(chunk)
 response2 = completion(
-    model="gpt-3.5-turbo", 
+    model="gpt-4o", 
     messages=[{"role": "user", "content": "Tell me a joke."}], 
     stream=True,
     caching=True)

diff --git a/docs/my-website/docs/completion/audio.md b/docs/my-website/docs/completion/audio.md
@@ -216,8 +216,8 @@ Use `litellm.supports_audio_input(model="")` -> returns `True` if model can acce
 assert litellm.supports_audio_output(model="gpt-4o-audio-preview") == True
 assert litellm.supports_audio_input(model="gpt-4o-audio-preview") == True
 
-assert litellm.supports_audio_output(model="gpt-3.5-turbo") == False
-assert litellm.supports_audio_input(model="gpt-3.5-turbo") == False
+assert litellm.supports_audio_output(model="gpt-4o") == False
+assert litellm.supports_audio_input(model="gpt-4o") == False
 ```
 </TabItem>
 

diff --git a/docs/my-website/docs/completion/batching.md b/docs/my-website/docs/completion/batching.md
@@ -68,7 +68,7 @@ os.environ['OPENAI_API_KEY'] = ""
 os.environ['COHERE_API_KEY'] = ""
 
 response = batch_completion_models(
-    models=["gpt-3.5-turbo", "claude-instant-1.2", "command-nightly"], 
+    models=["gpt-4o", "claude-instant-1.2", "command-nightly"], 
     messages=[{"role": "user", "content": "Hey, how's it going"}]
 )
-print(result)
+print(response)
@@ -203,7 +203,7 @@ os.environ['OPENAI_API_KEY'] = ""
 os.environ['COHERE_API_KEY'] = ""
 
 responses = batch_completion_models_all_responses(
-    models=["gpt-3.5-turbo", "claude-instant-1.2", "command-nightly"], 
+    models=["gpt-4o", "claude-instant-1.2", "command-nightly"], 
     messages=[{"role": "user", "content": "Hey, how's it going"}]
 )
 print(responses)
@@ -259,7 +259,7 @@ print(responses)
   "id": "chatcmpl-80szFnKHzCxObW0RqCMw1hWW1Icrq",
   "object": "chat.completion",
   "created": 1695222061,
-  "model": "gpt-3.5-turbo-0613",
+  "model": "gpt-4o-2024-08-06",
   "choices": [
     {
       "index": 0,

diff --git a/docs/my-website/docs/completion/drop_params.md b/docs/my-website/docs/completion/drop_params.md
@@ -210,7 +210,7 @@ client = openai.OpenAI(
 )
 
 response = client.chat.completions.create(
-    model="gpt-3.5-turbo",
+    model="gpt-4o",
     messages = [
         {
             "role": "user",