Commit c3994a2

switch granite-7b for 3.1-8b-instruct (#214)
* switch granite-7b for 3.1-8b-instruct, adjust max-length
* add RedHatAI for HF org
1 parent 8f21778 commit c3994a2

19 files changed (+42, -75 lines)

.github/.wordlist.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -209,6 +209,7 @@ README
 readonly
 recog
 redhat
+RedHatAI
 repo
 repoURL
 RespectIgnoreDifferences
```

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -48,7 +48,7 @@ Useful link: [https://redhat-scholars.github.io/build-course/rhs-build-course/de
 
 - Python 3.11
 - Nodejs > 18
-- An existing instance of an LLM served through an OpenAI compatible API at `INFERENCE_SERVER_URL`. This application is based on Granite-7b-Instruct Prompt format. You will need to modify this format if you are using a different model.
+- An existing instance of an LLM served through an OpenAI compatible API at `INFERENCE_SERVER_URL`. This application is based on Granite-3.1-8B-Instruct Prompt format. You will need to modify this format if you are using a different model.
 
 ### Installation
 
```
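The prerequisite above assumes an OpenAI-compatible server. As a minimal sketch of what such a client call looks like (the model name, default URL, and `build_chat_request` helper here are illustrative assumptions, not code from this repo), the request body for `/chat/completions` can be built like this:

```python
import json
import os


def build_chat_request(model: str, user_message: str, max_tokens: int = 512) -> dict:
    """Build the JSON body for an OpenAI-compatible /chat/completions call."""
    return {
        "model": model,
        "messages": [{"role": "user", "content": user_message}],
        "max_tokens": max_tokens,
    }


# Base URL comes from the INFERENCE_SERVER_URL variable named in the
# prerequisite; the localhost fallback is a hypothetical default.
base_url = os.environ.get("INFERENCE_SERVER_URL", "http://localhost:8080/v1")
endpoint = f"{base_url}/chat/completions"
payload = build_chat_request("granite-3-1-8b-instruct", "Summarize this claim.")
print(endpoint)
print(json.dumps(payload)[:60])
```

Posting `payload` to `endpoint` (with any HTTP client) is all the application needs, which is why swapping models only requires changing the URL, the model name, and the prompt format.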

bootstrap/granite-modelcar-image/Containerfile

Lines changed: 0 additions & 23 deletions
This file was deleted.

bootstrap/granite-modelcar-image/README.md

Lines changed: 0 additions & 15 deletions
This file was deleted.

bootstrap/ic-shared-app/deployment-app.yaml

Lines changed: 2 additions & 2 deletions
```diff
@@ -29,9 +29,9 @@ spec:
             fieldRef:
               fieldPath: metadata.namespace
         - name: INFERENCE_SERVER_URL
-          value: http://granite-7b-instruct-predictor.ic-shared-llm.svc.cluster.local:8080/v1
+          value: http://granite-3-1-8b-instruct-predictor.ic-shared-llm.svc.cluster.local:8080/v1
         - name: MODEL_NAME
-          value: 'granite-7b-instruct'
+          value: 'granite-3-1-8b-instruct'
         - name: MAX_TOKENS
           value: '512'
         - name: TOP_P
```
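The deployment injects the model endpoint and name into the app through environment variables. A minimal sketch of how an application might read them (the `config` dict and fallback defaults are assumptions for illustration; the defaults mirror the values set in this deployment):

```python
import os

# Hypothetical config loader; in the cluster these variables are set by
# bootstrap/ic-shared-app/deployment-app.yaml, so the defaults rarely apply.
config = {
    "inference_server_url": os.environ.get(
        "INFERENCE_SERVER_URL",
        "http://granite-3-1-8b-instruct-predictor.ic-shared-llm.svc.cluster.local:8080/v1",
    ),
    "model_name": os.environ.get("MODEL_NAME", "granite-3-1-8b-instruct"),
    "max_tokens": int(os.environ.get("MAX_TOKENS", "512")),
}
print(config["model_name"])
```

Keeping the app's `MODEL_NAME` in sync with the server's `--served-model-name` matters: an OpenAI-compatible server rejects requests whose `model` field does not match a served model name.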

bootstrap/ic-shared-llm/inference-service-granite-modelcar.yaml

Lines changed: 12 additions & 3 deletions
```diff
@@ -2,15 +2,15 @@ apiVersion: serving.kserve.io/v1beta1
 kind: InferenceService
 metadata:
   annotations:
-    openshift.io/display-name: granite-7b-instruct
+    openshift.io/display-name: granite-3-1-8b-instruct
     serving.knative.openshift.io/enablePassthrough: 'true'
     sidecar.istio.io/inject: 'true'
     sidecar.istio.io/rewriteAppHTTPProbers: 'true'
     argocd.argoproj.io/sync-wave: "2"
     serving.kserve.io/deploymentMode: RawDeployment
     argocd.argoproj.io/compare-options: IgnoreExtraneous
     argocd.argoproj.io/sync-options: Prune=false
-  name: granite-7b-instruct
+  name: granite-3-1-8b-instruct
   namespace: ic-shared-llm
   labels:
     opendatahub.io/dashboard: 'true'
@@ -19,6 +19,15 @@ spec:
     maxReplicas: 1
     minReplicas: 1
     model:
+      args:
+        - '--port=8080'
+        - '--model=/mnt/models'
+        - '--served-model-name=granite-3-1-8b-instruct'
+        - '--max-model-len=15000'
+        - '--dtype=half'
+        - '--enable-auto-tool-choice'
+        - '--tool-call-parser'
+        - granite
       modelFormat:
         name: vLLM
       name: ''
@@ -32,7 +41,7 @@ spec:
           memory: 8Gi
           nvidia.com/gpu: '1'
       runtime: vllm
-      storageUri: oci://quay.io/rh-aiservices-bu/granite-7b-instruct-modelcar:0.2
+      storageUri: oci://registry.redhat.io/rhelai1/modelcar-granite-3-1-8b-instruct:1.5
     tolerations:
       - effect: NoSchedule
         key: nvidia.com/gpu
```
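The `--dtype=half` flag loads weights in 16-bit floats, which roughly halves GPU memory versus fp32 and is why an 8B-class model fits on a single GPU here. A back-of-the-envelope sketch (the ~8.1e9 parameter count is an approximation for an 8B-class model, not an exact figure from this repo):

```python
# Rough GPU memory needed for the model weights alone, ignoring KV cache
# and activation overhead (which --max-model-len also influences).
params = 8.1e9                # approximate parameter count, assumption
bytes_per_param_fp16 = 2      # --dtype=half: 16-bit floats
bytes_per_param_fp32 = 4      # full precision, for comparison

fp16_gib = params * bytes_per_param_fp16 / 2**30
fp32_gib = params * bytes_per_param_fp32 / 2**30
print(f"fp16 weights: ~{fp16_gib:.1f} GiB, fp32 weights: ~{fp32_gib:.1f} GiB")
```

The remaining GPU memory budget is shared with the KV cache, which is why `--max-model-len=15000` caps the context length rather than using the model's full window.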

bootstrap/ic-shared-llm/serving-runtime-vllm-granite-modelcar.yaml

Lines changed: 2 additions & 7 deletions
```diff
@@ -19,19 +19,14 @@ spec:
     prometheus.io/path: /metrics
     prometheus.io/port: '8080'
   containers:
-    - args:
-        - '--port=8080'
-        - '--model=/mnt/models'
-        - '--served-model-name={{.Name}}'
-        - '--distributed-executor-backend=mp'
-      command:
+    - command:
         - python
         - '-m'
        - vllm.entrypoints.openai.api_server
      env:
        - name: HF_HOME
          value: /tmp/hf_home
-      image: 'quay.io/modh/vllm@sha256:b51fde66f162f1a78e8c027320dddf214732d5345953b1599a84fe0f0168c619'
+      image: 'quay.io/modh/vllm:rhoai-2.19-cuda'
      name: kserve-container
      ports:
        - containerPort: 8080
```

content/modules/ROOT/pages/03-04-comparing-model-servers.adoc

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,7 +1,7 @@
 = Comparing two LLMs
 include::_attributes.adoc[]
 
-So far, for this {ic-lab}, we have used the model https://huggingface.co/ibm-granite/granite-7b-instruct[Granite 7B Instruct,window=_blank]. Although lighter than other models, it is still quite heavy and we need a large GPU to run it. Would we get as good results with a smaller model running on a CPU only? Let's try!
+So far, for this {ic-lab}, we have used the model https://huggingface.co/RedHatAI/granite-3.1-8b-instruct[Granite 3.1 8B Instruct,window=_blank]. Although lighter than other models, it is still quite heavy and we need a large GPU to run it. Would we get as good results with a smaller model running on a CPU only? Let's try!
 
 In this exercise, we'll pitch our previous model against a much smaller LLM called https://huggingface.co/google/flan-t5-large[flan-t5-large,window=_blank]. We'll compare the results and see if the smaller model is good enough for our use case.
 
```
content/modules/ROOT/pages/06-01-potential-imp-ref.adoc

Lines changed: 1 addition & 1 deletion
```diff
@@ -39,7 +39,7 @@ If you want to read what **we** thought could be improved, read below! (response
 ** Mismatch in license plate, if visible in the picture.
 * We've only scratched the surface with gitops and Data Science pipelines here
 ** There was no performance testing done. If too many users connect at the same time, it might overwhelm either the app, the database, the LLM, etc...
-* Currently, most simple changes would probably end up breaking the application. And the person who, for example decides to change Granite-7B for Flan-T5-Large would not necessarily realize that.
+* Currently, most simple changes would probably end up breaking the application. And the person who, for example decides to change Granite-3.1-8B-Instruct for Flan-T5-Large would not necessarily realize that.
 ** It would be critical to have multiple instances (Dev/Test/UAT/Prod) of the application.
 ** It would also be required to have integration pipelines run in these environments to confirm that changes made do not break the overall application.
 * We could ask the LLM to start writing a response to the customer.
```

lab-materials/02/02-05-validating.ipynb

Lines changed: 1 addition & 1 deletion
```diff
@@ -69,7 +69,7 @@
     "    services_to_check = [\n",
     "        (\"minio.ic-shared-minio.svc.cluster.local\", 9000, \"Minio\"),\n",
     "        (\"claimdb.ic-shared-db.svc.cluster.local\", 5432, \"Postgres Database\"),\n",
-    "        (\"granite-7b-instruct-predictor.ic-shared-llm.svc.cluster.local\", 8080, \"LLM Service\"),\n",
+    "        (\"granite-3-1-8b-instruct-predictor.ic-shared-llm.svc.cluster.local\", 8080, \"LLM Service\"),\n",
     "        (\"llm-flant5.ic-shared-llm.svc.cluster.local\", 3000, \"LLM Service-FlanT5\"),\n",
     "        (\"modelmesh-serving.ic-shared-img-det.svc.cluster.local\", 8033, \"ModelMesh\"),\n",
     "        (\"vectordb-milvus.ic-shared-milvus.svc.cluster.local\", 19530, \"Milvus Vector DB\"),\n",
```
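The validation notebook checks each `(host, port)` pair by attempting a TCP connection. A self-contained sketch of that check (the function name and the in-cluster hostname in the comment are illustrative; the notebook's own implementation may differ):

```python
import socket


def check_service(host: str, port: int, timeout: float = 2.0) -> bool:
    """Return True if a TCP connection to host:port succeeds within timeout."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False


# Hypothetical usage against the renamed predictor service:
# check_service("granite-3-1-8b-instruct-predictor.ic-shared-llm.svc.cluster.local", 8080)
```

Because the check resolves the Kubernetes service DNS name, renaming the InferenceService (granite-7b-instruct to granite-3-1-8b-instruct) silently changes the predictor hostname, which is exactly why this notebook entry had to be updated in the same commit.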

0 commit comments
