From 95df3a00e1cd58375ee4e5559f3483064a64f49f Mon Sep 17 00:00:00 2001
From: Jim Burtoft <39492751+jimburtoft@users.noreply.github.com>
Date: Wed, 5 Feb 2025 22:11:11 -0500
Subject: [PATCH] display available cached versions in error message

If a model is cached with a different configuration, I want to display
alternative options to the user. If someone copies the deploy code from
Hugging Face and changes something (e.g. sequence length), it is not
obvious why it isn't working. Based on a true story!

Added some carriage returns to make the error message more readable.

get_hub_cached_entries does raise an error if it is fed a model whose
cached config does not include a model_type. For example (randomly
selected):

model_id = "hexgrad/Kokoro-82M"

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/optimum/neuron/utils/hub_cache_utils.py", line 431, in get_hub_cached_entries
    model_type = target_entry.config["model_type"]
KeyError: 'model_type'

However, we already call that function inside of is_cached, so I don't
know whether we are filtering for certain model types or not. If we are
not, the error should be raised there before execution ever gets here.
---
 .../server/text_generation_server/model.py    | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/text-generation-inference/server/text_generation_server/model.py b/text-generation-inference/server/text_generation_server/model.py
index e8cb34ee1..c6313cde3 100644
--- a/text-generation-inference/server/text_generation_server/model.py
+++ b/text-generation-inference/server/text_generation_server/model.py
@@ -99,10 +99,28 @@ def fetch_model(
     if not is_cached(model_id, neuron_config):
         hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache"
         neuron_export_url = "https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-neuronx-tgi"
+
+        entries = get_hub_cached_entries(model_id, "inference")
+        available_configs = ""
+        if entries:
+            config_list = []
+            for entry in entries:
+                config = (
+                    f"batch_size={entry['batch_size']}, "
+                    f"sequence_length={entry['sequence_length']}, "
+                    f"num_cores={entry['num_cores']}, "
+                    f"auto_cast_type={entry['auto_cast_type']}"
+                )
+                config_list.append(config)
+            available_configs = "\nAvailable cached configurations for this model:\n- " + "\n- ".join(config_list)
+        else:
+            available_configs = "\nNo cached versions are currently available for that model with any configuration."
+
         error_msg = (
             f"No cached version found for {model_id} with {neuron_config}."
-            f"You can start a discussion to request it on {hub_cache_url}"
-            f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}"
+            f"{available_configs}"
+            f"\nYou can start a discussion to request it on {hub_cache_url}"
+            f"\nAlternatively, you can export your own neuron model as explained in {neuron_export_url}"
         )
         raise ValueError(error_msg)
     logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
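
Note: as described in the commit message, get_hub_cached_entries can raise
a KeyError for models whose cached config lacks a model_type. If the call
added by this patch can be reached for such a model, it may be worth
guarding it so the new lookup never masks the "no cached version" error.
A minimal sketch, not part of this patch, assuming get_hub_cached_entries
keeps its current signature and that an empty entry list is an acceptable
fallback:

    # Hypothetical guard (not in this patch): treat a missing model_type
    # in the Hub cache metadata the same as "no cached entries", so the
    # error message below is still produced.
    try:
        entries = get_hub_cached_entries(model_id, "inference")
    except KeyError:
        entries = []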