Update meta/llama models to latest supported and add AI functions tests

sudosalim · sudosalim · commit 27b8792444d6 · 2025-11-12T10:55:13.000Z
+ Update all llama3 8B models spec to use meta/llama3-8b-instruct + Add ai functions tests Change-Id: I06a427bf5bd90aff6cc8e0c7ab8c810b6dc07dd1 Reviewed-on: https://review.couchbase.org/c/perfrunner/+/234284 Reviewed-by: Daniel Nagy <daniel.nagy@couchbase.com> Tested-by: Build Bot <build@couchbase.com>
diff --git a/cloud/infrastructure/ai_services/capella_aws_1c_1llm_g6_xlarge_llama_3_8b_instruct.spec b/cloud/infrastructure/ai_services/capella_aws_1c_1llm_g6_xlarge_llama_3_8b_instruct.spec
@@ -24,7 +24,7 @@ volume_size = 100
 iops = 3000
 
 [text-generation]
-model_name = meta/llama-3.2-3b-instruct
+model_name = meta/llama3-8b-instruct
 instance_type = g6.xlarge
 instance_capacity = 1
 
diff --git a/cloud/infrastructure/ai_services/capella_aws_1c_1llm_g6e_12xlarge_llama_3_8b_instruct.spec b/cloud/infrastructure/ai_services/capella_aws_1c_1llm_g6e_12xlarge_llama_3_8b_instruct.spec
@@ -24,7 +24,7 @@ volume_size = 100
 iops = 3000
 
 [text-generation]
-model_name = meta/llama-3.1-8b-instruct
+model_name = meta/llama3-8b-instruct
 instance_type = g6e.12xlarge
 instance_capacity = 1
 
diff --git a/cloud/infrastructure/ai_services/capella_aws_1c_1llm_g6e_xlarge_llama_3_1_8b_instruct.spec b/cloud/infrastructure/ai_services/capella_aws_1c_1llm_g6e_xlarge_llama_3_1_8b_instruct.spec
diff --git a/cloud/infrastructure/ai_services/capella_aws_1c_1llm_g6e_xlarge_llama_3_8b_instruct.spec b/cloud/infrastructure/ai_services/capella_aws_1c_1llm_g6e_xlarge_llama_3_8b_instruct.spec
@@ -24,7 +24,7 @@ volume_size = 100
 iops = 3000
 
 [text-generation]
-model_name = meta/llama-3.2-3b-instruct
+model_name = meta/llama3-8b-instruct
 instance_type = g6e.xlarge
 instance_capacity = 1
 
diff --git a/cloud/infrastructure/ai_services/capella_aws_1c_1llm_p4de_24xlarge_llama_3_3_70b_instruct.spec b/cloud/infrastructure/ai_services/capella_aws_1c_1llm_p4de_24xlarge_llama_3_3_70b_instruct.spec
@@ -24,7 +24,7 @@ volume_size = 100
 iops = 3000
 
 [text-generation]
-model_name = meta-llama/Llama-3.1-8B-Instruct
+model_name = meta/llama3.3-70b-instruct
 instance_type = p4de.24xlarge
 instance_capacity = 1
 
diff --git a/cloud/infrastructure/ai_services/capella_aws_1c_1llm_p4de_24xlarge_llama_3_8b_instruct.spec b/cloud/infrastructure/ai_services/capella_aws_1c_1llm_p4de_24xlarge_llama_3_8b_instruct.spec
@@ -24,7 +24,7 @@ volume_size = 100
 iops = 3000
 
 [text-generation]
-model_name = meta/llama3-70b-instruct
+model_name = meta/llama3-8b-instruct
 instance_type = p4de.24xlarge
 instance_capacity = 1
 
diff --git a/cloud/infrastructure/ai_services/capella_aws_1c_1llm_p5_48xlarge_llama_3_3_70b_instruct.spec b/cloud/infrastructure/ai_services/capella_aws_1c_1llm_p5_48xlarge_llama_3_3_70b_instruct.spec
@@ -24,7 +24,7 @@ volume_size = 100
 iops = 3000
 
 [text-generation]
-model_name = meta/llama3-70b-instruct
+model_name = meta/llama3.3-70b-instruct
 instance_type = p5.48xlarge
 instance_capacity = 1
 
diff --git a/cloud/infrastructure/ai_services/capella_aws_7s_1c_c5_2xlarge_1llm_g6e_xlarge_llama_3_8b_instruct.spec b/cloud/infrastructure/ai_services/capella_aws_7s_1c_c5_2xlarge_1llm_g6e_xlarge_llama_3_8b_instruct.spec
@@ -51,7 +51,7 @@ data = var/cb/data
 source = default_capella
 
 [text-generation]
-model_name = meta/llama-3.1-8b-instruct
+model_name = meta/llama3-8b-instruct
 instance_type = g6e.xlarge
 instance_capacity = 1
 
diff --git a/perfrunner/helpers/monitor.py b/perfrunner/helpers/monitor.py
@@ -1870,3 +1870,35 @@ def wait_for_ai_gateway_models_health(self, model_status_func: Callable[[], dict
             time.sleep(self.POLLING_INTERVAL)
 
         logger.info(f"AI Gateway models are healthy: {misc.pretty_dict(models_status)}")
+
+    def wait_for_ai_functions_healthy(self, host: str, deployed_functions: list[str]):
+        """Block until every function named in `deployed_functions` is healthy on `host`.
+
+        Polls the AI functions listing every MONITORING_DELAY seconds. Errors raised while
+        polling are logged and count as a failed attempt.
+
+        Raises:
+            Exception: if the functions are not all healthy after MAX_RETRY attempts.
+        """
+        logger.info(f"Waiting for AI functions to be healthy: {deployed_functions}")
+        retries = 0
+        while deployed_functions:
+            try:
+                healthy_functions = {
+                    f.get("data", {}).get("name")
+                    for f in self.rest.list_ai_functions(host)
+                    if f.get("data", {}).get("functionStatus") == "healthy"
+                }
+                # Match by name, not by count: the listing may contain other functions.
+                pending_functions = sorted(set(deployed_functions) - healthy_functions)
+                if not pending_functions:
+                    logger.info(f"All AI functions are healthy: {deployed_functions}")
+                    return
+                if retries % 60 == 0:
+                    logger.info(f"AI functions not healthy yet: {pending_functions}")
+            except Exception as e:
+                logger.error(f"Error while checking AI functions health status: {e}")
+            retries += 1
+            if retries >= self.MAX_RETRY:
+                raise Exception(f"AI functions are not healthy after {retries} retries.")
+            time.sleep(self.MONITORING_DELAY)
diff --git a/perfrunner/tests/ai_services.py b/perfrunner/tests/ai_services.py
@@ -563,16 +563,23 @@ def create_openai_integration(self, cluster_uuid: str):
         )
         logger.info(f"Created openAI integration: {self.openai_integration_id}")
 
+    @timeit
     def deploy_ai_functions(self):
         uuid = self.cluster_spec.infrastructure_settings.get("uuid", uuid4().hex[:6])
         self.create_openai_integration(uuid)
         payload = self._create_ai_functions_payload()
         logger.info(f"Deploying AI functions with payload: {pretty_dict(payload)}")
         self.rest.create_ai_functions(self.master_node, payload)
-        # Cant deteministically monitor deployment due to AV-108636, so wait for 30 seconds
-        sleep(30)
+        self.monitor.wait_for_ai_functions_healthy(
+            self.master_node, self.ai_services_settings.functions_names
+        )
 
     def run(self):
+        functions_deployment_time = self.deploy_ai_functions()
+        logger.info(f"AI Functions deployment time: {functions_deployment_time} seconds")
+        # Workaround for AV-110058
+        self.rest.refresh_cluster_allowlist(self.master_node)
+
         self.load()
         self.wait_for_persistence()
         self.check_num_items()
@@ -581,10 +588,6 @@ def run(self):
         self.wait_for_indexing()
         self.store_plans()
 
-        self.deploy_ai_functions()
-        # Workaround for AV-110058
-        self.rest.refresh_cluster_allowlist(self.master_node)
-
         self.access_bg()
         self.access()
 
diff --git a/tests/cloud/ai_services/ai_functions_classification_read_only_1M.test b/tests/cloud/ai_services/ai_functions_classification_read_only_1M.test
@@ -0,0 +1,53 @@
+[test_case]
+test = perfrunner.tests.ai_services.QueryThroughputWithAIFunctionsTest
+
+[showfast]
+title = Classification, 1M x 1KB, read only, 7 nodes, s=1 c=1, AI functions
+component = aiservices
+category = functions
+sub_category = {provider}
+
+[cluster]
+initial_nodes = 7
+num_buckets = 1
+enable_cpu_cores = false
+enable_n2n_encryption = all
+ui_http = disabled
+
+[collection]
+config = collections/1bucket_1scope_1collection_basic.json
+
+[bucket]
+replica_number = 1
+eviction_policy = fullEviction
+backend_storage = magma
+
+[secondary]
+indexer.settings.storage_mode = plasma
+
+[index]
+fields = email
+indexes_per_collection = 1
+
+[load]
+items = 1000000
+size = 1024
+workers = 80
+doc_gen = reverse_lookup
+ssl_mode = capella
+
+[access]
+items = 1000000
+workers = 0
+n1ql_queries = classification
+n1ql_workers = 120
+time = 1200
+ssl_mode = capella
+
+[n1ql-classification]
+statement = SELECT email, default:ai_classification({"text": email, "labels": ["valid", "invalid"]}) AS email_result FROM `bucket-1` USE KEYS[$1];
+args = ["{key}"]
+
+[ai_services]
+provider = capella
+functions_names = ai_classification
diff --git a/tests/cloud/ai_services/ai_functions_classification_read_only_1M_openai_gpt_4_1.test b/tests/cloud/ai_services/ai_functions_classification_read_only_1M_openai_gpt_4_1.test
@@ -0,0 +1,54 @@
+[test_case]
+test = perfrunner.tests.ai_services.QueryThroughputWithAIFunctionsTest
+
+[showfast]
+title = Classification, 1M x 1KB, read only, 7 nodes, s=1 c=1, AI functions (OpenAI GPT-4.1)
+component = aiservices
+category = functions
+sub_category = {provider}
+
+[cluster]
+initial_nodes = 7
+num_buckets = 1
+enable_cpu_cores = false
+enable_n2n_encryption = all
+ui_http = disabled
+
+[collection]
+config = collections/1bucket_1scope_1collection_basic.json
+
+[bucket]
+replica_number = 1
+eviction_policy = fullEviction
+backend_storage = magma
+
+[secondary]
+indexer.settings.storage_mode = plasma
+
+[index]
+fields = email
+indexes_per_collection = 1
+
+[load]
+items = 1000000
+size = 1024
+workers = 80
+doc_gen = reverse_lookup
+ssl_mode = capella
+
+[access]
+items = 1000000
+workers = 0
+n1ql_queries = classification
+n1ql_workers = 120
+time = 1200
+ssl_mode = capella
+
+[n1ql-classification]
+statement = SELECT email, default:ai_classification({"text": email, "labels": ["valid", "invalid"]}) AS email_result FROM `bucket-1` USE KEYS[$1];
+args = ["{key}"]
+
+[ai_services]
+provider = openai
+functions_names = ai_classification
+model_name = gpt-4.1
diff --git a/tests/cloud/ai_services/ai_functions_classification_update_1M.test b/tests/cloud/ai_services/ai_functions_classification_update_1M.test
@@ -0,0 +1,54 @@
+[test_case]
+test = perfrunner.tests.ai_services.QueryThroughputWithAIFunctionsTest
+
+[showfast]
+title = Classification, 1M x 1KB, Update, 7 nodes, s=1 c=1, AI functions
+component = aiservices
+category = functions
+sub_category = {provider}
+
+[cluster]
+initial_nodes = 7
+num_buckets = 1
+enable_cpu_cores = false
+enable_n2n_encryption = all
+ui_http = disabled
+
+[collection]
+config = collections/1bucket_1scope_1collection_basic.json
+
+[bucket]
+replica_number = 1
+eviction_policy = fullEviction
+backend_storage = magma
+
+[secondary]
+indexer.settings.storage_mode = plasma
+
+[index]
+fields = email
+indexes_per_collection = 1
+
+[load]
+items = 1000000
+size = 1024
+workers = 80
+doc_gen = reverse_lookup
+ssl_mode = capella
+
+[access]
+items = 1000000
+workers = 0
+n1ql_queries = classification
+n1ql_workers = 120
+time = 1200
+ssl_mode = capella
+n1ql_op = update
+
+[n1ql-classification]
+statement = UPDATE `bucket-1` USE KEYS $1 SET email_status = default:ai_classification({"text": email, "labels": ["valid", "invalid"]})[0].response;
+args = ["{key}"]
+
+[ai_services]
+provider = capella
+functions_names = ai_classification
diff --git a/tests/cloud/ai_services/ai_functions_classification_update_1M_openai_gpt_4_1.test b/tests/cloud/ai_services/ai_functions_classification_update_1M_openai_gpt_4_1.test
@@ -0,0 +1,55 @@
+[test_case]
+test = perfrunner.tests.ai_services.QueryThroughputWithAIFunctionsTest
+
+[showfast]
+title = Classification, 1M x 1KB, Update, 7 nodes, s=1 c=1, AI functions (OpenAI GPT-4.1)
+component = aiservices
+category = functions
+sub_category = {provider}
+
+[cluster]
+initial_nodes = 7
+num_buckets = 1
+enable_cpu_cores = false
+enable_n2n_encryption = all
+ui_http = disabled
+
+[collection]
+config = collections/1bucket_1scope_1collection_basic.json
+
+[bucket]
+replica_number = 1
+eviction_policy = fullEviction
+backend_storage = magma
+
+[secondary]
+indexer.settings.storage_mode = plasma
+
+[index]
+fields = email
+indexes_per_collection = 1
+
+[load]
+items = 1000000
+size = 1024
+workers = 80
+doc_gen = reverse_lookup
+ssl_mode = capella
+
+[access]
+items = 1000000
+workers = 0
+n1ql_queries = classification
+n1ql_workers = 120
+time = 1200
+ssl_mode = capella
+n1ql_op = update
+
+[n1ql-classification]
+statement = UPDATE `bucket-1` USE KEYS $1 SET email_status = default:ai_classification({"text": email, "labels": ["valid", "invalid"]})[0].response;
+args = ["{key}"]
+
+[ai_services]
+provider = openai
+functions_names = ai_classification
+model_name = gpt-4.1
diff --git a/tests/cloud/ai_services/ai_gateway_latency_llm_text_generation_2000_200_type_task.test b/tests/cloud/ai_services/ai_gateway_latency_llm_text_generation_2000_200_type_task.test
@@ -0,0 +1,15 @@
+[test_case]
+test = perfrunner.tests.ai_services.AIGatewayTest
+
+[showfast]
+title = AI Gateway LLM model serving, random text, 2000/200 type tasks
+component = aiservices
+category = models
+sub_category = {provider}
+
+[ai_bench]
+model_kind = text-generation
+time = 0
+ops = 1000
+dataset = datasets/templates/long_form_input
+max_tokens = 200