Skip to content

Commit 67ce640

Browse files
Merge pull request #239 from carbonin/fix-eval
Make eval tests more reliable
2 parents 9dc8780 + 1980a64 commit 67ce640

14 files changed

Lines changed: 378 additions & 103 deletions

Containerfile.add_llama_to_lightspeed

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@ RUN cd /app-root/llama-stack && python3.12 -m pip install --editable .
1111

1212
RUN cd /app-root/ && python3.12 -m pip install .
1313

14+
RUN microdnf install -y patch
15+
16+
# Patch created based on https://github.com/llamastack/llama-stack/pull/3957
17+
COPY llama-stack-make-tool-call-args-fully-recursive.patch /tmp/
18+
RUN cat /tmp/llama-stack-make-tool-call-args-fully-recursive.patch \
19+
| patch -p1 -d "$(dirname "$(dirname "$(python3.12 -c "import llama_stack; print(llama_stack.__file__)")")")"
20+
1421
COPY migrate.py /app/migrate.py
1522
ENTRYPOINT ["/bin/sh", "-c", "python3.12 /app/migrate.py && python3.12 src/lightspeed_stack.py"]
1623

Containerfile.assisted-chat

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,15 @@
22
# This is the digest of quay.io/lightspeed-core/lightspeed-stack:0.3.0
33
FROM quay.io/lightspeed-core/lightspeed-stack@sha256:d1805df92f4de55d662e6274328830e2c1300f308c258abfcfad825a241cb50d
44

5+
USER root
6+
7+
RUN microdnf install -y patch
8+
9+
# Patch created based on https://github.com/llamastack/llama-stack/pull/3957
10+
COPY llama-stack-make-tool-call-args-fully-recursive.patch /tmp/
11+
RUN cat /tmp/llama-stack-make-tool-call-args-fully-recursive.patch \
12+
| patch -p1 -d "$(dirname "$(dirname "$(python3.12 -c "import llama_stack; print(llama_stack.__file__)")")")"
13+
514
COPY migrate.py /app/migrate.py
615
ENTRYPOINT ["/bin/sh", "-c", "python3.12 /app/migrate.py && python3.12 src/lightspeed_stack.py"]
716

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
diff --git a/llama_stack/models/llama/datatypes.py b/llama_stack/models/llama/datatypes.py
2+
index 7f1ebed5..f5e26ec3 100644
3+
--- a/llama_stack/models/llama/datatypes.py
4+
+++ b/llama_stack/models/llama/datatypes.py
5+
@@ -31,7 +31,7 @@ class BuiltinTool(Enum):
6+
7+
8+
Primitive = str | int | float | bool | None
9+
-RecursiveType = Primitive | list[Primitive] | dict[str, Primitive]
10+
+type RecursiveType = Primitive | list[RecursiveType] | dict[str, RecursiveType]
11+
12+
13+
class ToolCall(BaseModel):

test/evals/eval_data.yaml

Lines changed: 18 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22
conversation:
33
- eval_id: basic_introduction
44
eval_query: Hi!
5-
eval_types: [response_eval:accuracy]
6-
expected_response: Hi! I'm the Red Hat OpenShift Lightspeed Intelligent Assistant, and I'm here to guide you through installing OpenShift using the Assisted Installer.
7-
description: Basic greeting test using keyword matching for reliability (avoids LLM judge flapping)
5+
eval_types: [response_eval:intent]
6+
expected_intent: A basic greeting that indicates willingness to help with installing OpenShift
87

98
- conversation_group: basic_cluster_request_conv
109
conversation:
@@ -56,18 +55,8 @@
5655
conversation:
5756
- eval_id: create_eval_test_sno
5857
eval_query: create a new single node cluster named eval-test-singlenode-uniq-cluster-name, running on version 4.19.7 with the x86_64 CPU architecture, configured under the base domain example.com, using the provided SSH key "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAAgQCmeaBFhSJ/MLECmqUaKweRgo10ABpwdvJ7v76qLYfP0pzfzYsF3hGP/fH5OQfHi9pTbWynjaEcPHVfaTaFWHvyMtv8PEMUIDgQPWlBSYzb+3AgQ5AsChhzTJCYnRdmCdzENlV+azgtb3mVfXiyCfjxhyy3QAV4hRrMaVtJGuUQfQ== example@example.com".
59-
eval_types: [tool_eval, response_eval:sub-string, response_eval:accuracy, action_eval]
60-
expected_tool_calls:
61-
- - tool_name: create_cluster
62-
arguments:
63-
name: "eval-test-singlenode-uniq-cluster-name"
64-
version: "4\\.19\\.7"
65-
base_domain: "example\\.com"
66-
single_node: "(?i:true)"
67-
cpu_architecture: "x86_64"
68-
ssh_public_key: 'ssh-rsa\s+[A-Za-z0-9+/]+[=]{0,3}(\s+.+)?\s*'
69-
platform: "none"
70-
eval_verify_script: ../scripts/cluster_created_check.sh
58+
eval_types: [response_eval:sub-string, response_eval:accuracy, action_eval]
59+
eval_verify_script: ../scripts/verify_create_eval_test_sno.sh
7160
expected_keywords: ["eval-test-singlenode-uniq-cluster-name", "ID", "Discovery ISO", "download", "cluster"]
7261
expected_response: I have created a cluster with name eval-test-singlenode-uniq-cluster-name. Next, you'll need to download the Discovery ISO, then boot your hosts with it. Would you like me to get the Discovery ISO download URL?
7362
- eval_id: get_iso_eval_test_sno
@@ -84,19 +73,9 @@
8473
cleanup_script: ../scripts/delete_cluster.sh
8574
conversation:
8675
- eval_id: create_eval_test_multinode
87-
eval_types: [tool_eval, response_eval:accuracy, response_eval:sub-string, action_eval]
76+
eval_types: [response_eval:accuracy, response_eval:sub-string, action_eval]
8877
eval_query: Create a multi-node cluster named 'eval-test-multinode-uniq-cluster-name' with OpenShift 4.18.22 and domain test.local and with the x86_64 CPU architecture.
89-
expected_tool_calls:
90-
- - tool_name: create_cluster
91-
arguments:
92-
name: "eval-test-multinode-uniq-cluster-name"
93-
version: "4\\.18\\.22"
94-
base_domain: "test\\.local"
95-
single_node: "(?i:false)"
96-
cpu_architecture: "x86_64"
97-
ssh_public_key: ""
98-
platform: "baremetal"
99-
eval_verify_script: ../scripts/cluster_created_check.sh
78+
eval_verify_script: ../scripts/verify_create_eval_test_multinode.sh
10079
expected_keywords: ["eval-test-multinode-uniq-cluster-name", "ID", "Discovery ISO", "cluster"]
10180
expected_response: I have created a cluster with name eval-test-multinode-uniq-cluster-name. Before downloading the Discovery ISO, would you like to configure static network configuration for the hosts? If not, the cluster will use Dynamic Host Configuration Protocol (DHCP) for host networking configuration by default.
10281
- eval_id: set_ssh_key_eval_test_ssh
@@ -135,8 +114,8 @@
135114
- eval_id: cluster_info_tool_call
136115
eval_query: Give me details about cluster named 'abc123abc'
137116
description: Test handling for non-existent cluster name
138-
eval_types: [tool_eval, response_eval:accuracy]
139-
expected_response: Retrieval failed for cluster 'abc123abc' because the resource was not found.
117+
eval_types: [tool_eval, response_eval:intent]
118+
expected_intent: An explanation indicating that a cluster with the name abc123abc does not exist
140119
expected_tool_calls:
141120
- - tool_name: list_clusters
142121
arguments: {}
@@ -161,16 +140,16 @@
161140
conversation:
162141
- eval_id: refuse_azure_request
163142
eval_query: Can you create a cluster on Azure?
164-
eval_types: [response_eval:accuracy]
165-
expected_response: Sorry, I can't create clusters directly on public clouds like Azure. I am designed to assist with on-premise OpenShift installations using the Assisted Installer. For installing OpenShift on Azure, you can explore Red Hat OpenShift Service on Azure or use the standard OpenShift Installer.
143+
eval_types: [response_eval:intent]
144+
expected_intent: An explanation that indicates clusters cannot be created on public clouds and a reminder that the scope of this installer is on-premise installations
166145
description: Refuse out-of-scope request to create clusters on Azure; clarify on-prem scope and Assisted Installer
167146

168147
- conversation_group: capabilities_scope_conv
169148
conversation:
170149
- eval_id: supported_platforms_summary
171150
eval_query: Which platforms can you help me install on?
172-
eval_types: [response_eval:accuracy]
173-
expected_response: I can help you install OpenShift on-premise using the Assisted Installer, either on bare metal servers or virtual machines (VMs) like vSphere, KVM or libvirt. I do not support public cloud platforms like Amazon Web Services (AWS), Azure, or Google Cloud Platform (GCP).
151+
eval_types: [response_eval:intent]
152+
expected_intent: An explanation that indicates on-premise platforms are supported, but public cloud platforms are not.
174153
description: Ensure the assistant concisely states supported on-prem baremetal/VM scope and references Assisted Installer
175154

176155
- conversation_group: cluster_id_from_name
@@ -195,33 +174,14 @@
195174
conversation:
196175
- eval_id: create_eval_test_sno
197176
eval_query: create a new single node cluster named eval-test-static-net-cluster-uniq-cluster-name, running on version 4.19.7 with the x86_64 CPU architecture, configured under the base domain example.com, without an ssh key.
198-
eval_types: [tool_eval, action_eval]
199-
eval_verify_script: ../scripts/cluster_created_check.sh
200-
expected_tool_calls:
201-
- - tool_name: create_cluster
202-
arguments:
203-
name: "eval-test-static-net-cluster-uniq-cluster-name"
204-
version: "4\\.19\\.7"
205-
base_domain: "example\\.com"
206-
single_node: "(?i:true)"
207-
cpu_architecture: "x86_64"
208-
ssh_public_key: ""
209-
platform: "none"
177+
eval_types: [action_eval]
178+
eval_verify_script: ../scripts/verify_create_eval_test_sno_static_net.sh
210179
- eval_id: configure_hosts
211180
eval_query: |
212181
I want to configure static networking. Create configs with a single vlan interface backed by an ethernet interface. It should have an ethernet interface with mac address c5:d6:bc:f0:05:20, and the vlan interface has ip address 10.0.0.5/24. The vlan id is 400. Use the name eth0 for the ethernet interface and vlan0 as the name of the vlan interface. Also I want DNS config with a DNS server 8.8.8.8.
213-
eval_types: [tool_eval]
214-
expected_tool_calls:
215-
- - tool_name: generate_nmstate_yaml
216-
arguments:
217-
params: |-
218-
(?s)^(?=.*"ethernet_ifaces":\s*\[\s*\{(?=.*"mac_address":\s*"c5:d6:bc:f0:05:20")(?=.*"name":\s*"eth0").*?\}\s*\])(?=.*"vlan_ifaces":\s*\[\s*\{(?=.*?"vlan_id":\s*400\b)(?=.*?"name":\s*"vlan0")(?=.*?"base_interface_name":\s*"eth0")(?=.*?"ipv4_address":\s*\{(?=.*?"address":\s*"10\.0\.0\.5")(?=.*?"cidr_length":\s*24\b).*?\}).*?\}\s*\])(?=.*"dns":\s*\{\s*"dns_servers":\s*\[\s*"8\.8\.8\.8"\s*\]\s*\}).*$
182+
eval_types: [response_eval:intent]
183+
expected_intent: Acknowledgement of the desired configuration and a request for the user to validate the configuration before applying it.
219184
- eval_id: apply_to_cluster
220185
eval_query: Yes, apply it to the cluster you just created.
221-
eval_types: [tool_eval]
222-
expected_tool_calls:
223-
- - tool_name: alter_static_network_config_nmstate_for_host
224-
arguments:
225-
cluster_id: "[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}"
226-
index: null
227-
new_nmstate_yaml: "10.0.0.5"
186+
eval_types: [action_eval]
187+
eval_verify_script: ../scripts/verify_static_net_apply_to_cluster.sh

test/prow/Dockerfile.plus-llama

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,12 @@ RUN python3.12 -m ensurepip
1111
RUN cd /app-root/llama-stack && python3.12 -m pip install --editable .
1212
RUN cd /app-root/ && python3.12 -m pip install .
1313

14-
# Apply upstream patch required by current revision (make it optional if it fails)
14+
RUN microdnf install -y patch
1515

16-
# Patch taken from https://github.com/meta-llama/llama-stack/commit/fd466b0459bfa7cc696ac80dba90b6e02d5869bd.patch
17-
COPY meta-llama.llama-stack.commit.fd466b0459bfa7cc696ac80dba90b6e02d5869bd.patch /tmp/
18-
19-
RUN microdnf install -y patch && \
20-
LLAMA_STACK_PATH=$(python3.12 -c "import llama_stack; import os; print(os.path.dirname(os.path.dirname(llama_stack.__file__)))") && \
21-
echo "Applying patch to: $LLAMA_STACK_PATH" && \
22-
cat /tmp/meta-llama.llama-stack.commit.fd466b0459bfa7cc696ac80dba90b6e02d5869bd.patch \
23-
| sed '/^diff --git a\/tests\/unit\/server\/test_replace_env_vars.py b\/tests\/unit\/server\/test_replace_env_vars.py/,/^diff --git /{ /^diff --git /!d }' \
24-
| patch -p1 -d "$LLAMA_STACK_PATH" || echo "Patch application failed, continuing anyway"
16+
# Patch created based on https://github.com/llamastack/llama-stack/pull/3957
17+
COPY llama-stack-make-tool-call-args-fully-recursive.patch /tmp/
18+
RUN cat /tmp/llama-stack-make-tool-call-args-fully-recursive.patch \
19+
| patch -p1 -d "$(dirname "$(dirname "$(python3.12 -c "import llama_stack; print(llama_stack.__file__)")")")"
2520

2621
USER 1001
2722
EXPOSE 8080

test/scripts/cluster_create_test_setup.sh

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
#!/bin/bash
22

3-
set -o nounset
4-
set -o errexit
5-
set -o pipefail
3+
# Source the common helper functions
4+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
5+
source "${SCRIPT_DIR}/common.sh"
6+
7+
setup_shell_options
8+
validate_environment
69

7-
: "${OCM_TOKEN:?OCM_TOKEN is required}"
8-
: "${UNIQUE_ID:?UNIQUE_ID is required}"
910
OCM_BASE_URL=${OCM_BASE_URL:-https://api.stage.openshift.com}
10-
ASSISTED_SERVICE_URL="${OCM_BASE_URL}/api/assisted-install/v2"
11+
ASSISTED_SERVICE_URL=$(get_assisted_service_url)
1112

1213
PULL_SECRET_RAW="$(
1314
curl -sSf -X POST \
@@ -55,7 +56,7 @@ curl -sSf -X POST \
5556
COUNTER=0
5657
while ! curl -sSf -H "Authorization: Bearer ${OCM_TOKEN}" "${ASSISTED_SERVICE_URL}/clusters/${CLUSTER_ID}"; do
5758
if [[ $COUNTER -gt 3 ]]; then
58-
echo "Cluster creation timed out"
59+
echo_err "Cluster creation timed out"
5960
exit 1
6061
fi
6162
((COUNTER++))

test/scripts/cluster_created_check.sh

Lines changed: 0 additions & 21 deletions
This file was deleted.

test/scripts/common.sh

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#!/bin/bash
2+
3+
# Print all arguments, joined by single spaces, as one line on stderr.
# printf is used instead of echo so that arguments which look like echo
# options (e.g. "-n", "-e") are printed literally rather than consumed.
echo_err() {
  local IFS=' '
  printf '%s\n' "$*" >&2
}
7+
8+
# Print all arguments, joined by single spaces, as one line on stdout.
# Exists as the stdout counterpart of echo_err so call sites read clearly.
# printf is used instead of echo so that arguments which look like echo
# options (e.g. "-n", "-e") are printed literally rather than consumed.
echo_out() {
  local IFS=' '
  printf '%s\n' "$*"
}
12+
13+
# Turn on strict-mode shell options for the calling script:
#   nounset  - expanding an unset variable is an error
#   errexit  - exit when a command fails outside a tested context
#   pipefail - a pipeline fails if any of its stages fails
# `set` options are shell-wide, so they stay in effect after this returns.
setup_shell_options() {
  set -o nounset -o errexit -o pipefail
}
19+
20+
# Assert that the environment variables the test scripts rely on are set
# and non-empty.
# Globals read: OCM_TOKEN, UNIQUE_ID
# On a missing/empty variable the ${var:?msg} expansion prints msg to stderr
# and, in a non-interactive shell, terminates the shell with a failure status.
validate_environment() {
  : "${OCM_TOKEN:?OCM_TOKEN is required}"
  : "${UNIQUE_ID:?UNIQUE_ID is required}"
}
25+
26+
# Print the base URL of the assisted-install v2 API on stdout.
# Globals read: OCM_BASE_URL (optional; defaults to the stage API host).
# A single trailing "/" on OCM_BASE_URL is dropped so the result never
# contains a doubled slash before "api/".
get_assisted_service_url() {
  local base_url="${OCM_BASE_URL:-https://api.stage.openshift.com}"
  printf '%s\n' "${base_url%/}/api/assisted-install/v2"
}
30+
31+
# Fetch cluster data by name.
# Usage: fetch_cluster_data <cluster_name>
# Globals read: OCM_TOKEN (bearer token); OCM_BASE_URL indirectly via
#               get_assisted_service_url.
# Outputs: the JSON object(s) from the /clusters listing whose .name equals
#          <cluster_name>; nothing when no cluster matches.
# Returns: the pipeline status (non-zero curl failures surface only when the
#          caller has pipefail enabled).
fetch_cluster_data() {
  local cluster_name="$1"
  local service_url
  # Declared separately so a failing substitution is not masked by `local`.
  service_url=$(get_assisted_service_url) || return

  # The name is handed to jq as data via --arg instead of being spliced into
  # the filter text, so quotes or backslashes in it cannot alter the program.
  curl -sSf -H "Authorization: Bearer ${OCM_TOKEN}" "${service_url}/clusters" \
    | jq -r --arg name "$cluster_name" '.[] | select(.name == $name)'
}
39+
40+
# Fetch infra env data by name.
# Usage: fetch_infra_env_data <infra_env_name>
# Globals read: OCM_TOKEN (bearer token); OCM_BASE_URL indirectly via
#               get_assisted_service_url.
# Outputs: the JSON object(s) from the /infra-envs listing whose .name equals
#          <infra_env_name>; nothing when no infra env matches.
# Returns: the pipeline status (non-zero curl failures surface only when the
#          caller has pipefail enabled).
fetch_infra_env_data() {
  local infra_env_name="$1"
  local service_url
  # Declared separately so a failing substitution is not masked by `local`.
  service_url=$(get_assisted_service_url) || return

  # The name is handed to jq as data via --arg instead of being spliced into
  # the filter text, so quotes or backslashes in it cannot alter the program.
  curl -sSf -H "Authorization: Bearer ${OCM_TOKEN}" "${service_url}/infra-envs" \
    | jq -r --arg name "$infra_env_name" '.[] | select(.name == $name)'
}
48+
49+
# Extract cluster properties from cluster data.
# Usage: extract_cluster_properties <cluster_data>
#   <cluster_data> is a JSON cluster object (as printed by fetch_cluster_data).
# Globals written:
#   ACTUAL_VERSION      - .openshift_version
#   ACTUAL_DOMAIN       - .base_dns_domain
#   ACTUAL_SINGLE_NODE  - "true" when .high_availability_mode is "None",
#                         otherwise "false"
#   ACTUAL_CPU_ARCH     - .cpu_architecture
#   ACTUAL_SSH_KEY      - .ssh_public_key
# A field that is absent from the JSON yields the literal string "null".
extract_cluster_properties() {
  local cluster_data="$1"
  # Here-strings feed jq directly; no echo subshell per field.
  ACTUAL_VERSION=$(jq -r '.openshift_version' <<<"$cluster_data")
  ACTUAL_DOMAIN=$(jq -r '.base_dns_domain' <<<"$cluster_data")
  ACTUAL_SINGLE_NODE=$(jq -r '.high_availability_mode == "None"' <<<"$cluster_data")
  ACTUAL_CPU_ARCH=$(jq -r '.cpu_architecture' <<<"$cluster_data")
  ACTUAL_SSH_KEY=$(jq -r '.ssh_public_key' <<<"$cluster_data")
}
60+
61+
# Validate cluster properties (version, domain, single_node, cpu_arch, ssh_key).
# Usage: validate_cluster_properties <expected_version> <expected_domain> <expected_single_node> <expected_cpu_arch> [expected_ssh_key]
# Globals read: ACTUAL_VERSION, ACTUAL_DOMAIN, ACTUAL_SINGLE_NODE,
#               ACTUAL_CPU_ARCH, ACTUAL_SSH_KEY (as set by
#               extract_cluster_properties).
# Returns: 0 when every checked property is an exact string match, else 1.
#          The SSH key is compared only when a non-empty expected_ssh_key
#          is given.
validate_cluster_properties() {
  local expected_version="$1"
  local expected_domain="$2"
  local expected_single_node="$3"
  local expected_cpu_arch="$4"
  local expected_ssh_key="${5:-}"

  # Right-hand sides are quoted so they compare literally, not as patterns.
  [[ "$ACTUAL_VERSION" == "$expected_version" ]] || return 1
  [[ "$ACTUAL_DOMAIN" == "$expected_domain" ]] || return 1
  [[ "$ACTUAL_SINGLE_NODE" == "$expected_single_node" ]] || return 1
  [[ "$ACTUAL_CPU_ARCH" == "$expected_cpu_arch" ]] || return 1

  # If SSH key is provided, validate it too
  if [[ -n "$expected_ssh_key" && "$ACTUAL_SSH_KEY" != "$expected_ssh_key" ]]; then
    return 1
  fi
  return 0
}
88+
89+
90+
# Wait for cluster to exist and validate.
# Usage: wait_and_validate_cluster <cluster_name_prefix> <expected_version> <expected_domain> <expected_single_node> <expected_cpu_arch> <cluster_type> [expected_ssh_key]
#
# Polls fetch_cluster_data for the cluster named
# "<cluster_name_prefix>-${UNIQUE_ID}". As soon as the cluster is found its
# properties are extracted and validated once; the function then TERMINATES
# THE SHELL with `exit 0` (match, summary on stdout) or `exit 1` (mismatch,
# details on stderr). If the cluster is still missing after the retries are
# used up, it prints a timeout message on stderr and exits 1.
#
# Globals read:
#   UNIQUE_ID                      - suffix appended to the cluster name
#   CLUSTER_WAIT_MAX_RETRIES       - optional; retries after the first lookup
#                                    (default 3)
#   CLUSTER_WAIT_INTERVAL_SECONDS  - optional; pause between lookups
#                                    (default 10)
#   ACTUAL_*                       - set by extract_cluster_properties
wait_and_validate_cluster() {
  local cluster_name_prefix="$1"
  local expected_version="$2"
  local expected_domain="$3"
  local expected_single_node="$4"
  local expected_cpu_arch="$5"
  local cluster_type="$6"
  local expected_ssh_key="${7:-}"

  local max_retries="${CLUSTER_WAIT_MAX_RETRIES:-3}"
  local retry_interval="${CLUSTER_WAIT_INTERVAL_SECONDS:-10}"
  local cluster_name="${cluster_name_prefix}-${UNIQUE_ID}"
  local cluster_data
  local counter=0

  while true; do
    # A failed lookup (e.g. a transient HTTP error) is deliberately treated
    # as "not there yet" so the loop retries instead of aborting under errexit.
    cluster_data=$(fetch_cluster_data "$cluster_name") || cluster_data=""

    if [[ -n "$cluster_data" && "$cluster_data" != "null" ]]; then
      extract_cluster_properties "$cluster_data"
      if validate_cluster_properties "$expected_version" "$expected_domain" "$expected_single_node" "$expected_cpu_arch" "$expected_ssh_key"; then
        echo_out "The ${cluster_type} cluster was successfully created with correct configuration:"
        echo_out " Name: ${cluster_name}"
        echo_out " Version: ${ACTUAL_VERSION}"
        echo_out " Domain: ${ACTUAL_DOMAIN}"
        echo_out " Single Node: ${ACTUAL_SINGLE_NODE}"
        echo_out " CPU Architecture: ${ACTUAL_CPU_ARCH}"
        if [[ -n "$expected_ssh_key" ]]; then
          echo_out " SSH Key: ${ACTUAL_SSH_KEY}"
        fi
        exit 0
      else
        echo_err "Cluster found but configuration mismatch:"
        echo_err " Expected version: ${expected_version}, got: ${ACTUAL_VERSION}"
        echo_err " Expected domain: ${expected_domain}, got: ${ACTUAL_DOMAIN}"
        echo_err " Expected single node: ${expected_single_node}, got: ${ACTUAL_SINGLE_NODE}"
        echo_err " Expected CPU architecture: ${expected_cpu_arch}, got: ${ACTUAL_CPU_ARCH}"
        if [[ -n "$expected_ssh_key" ]]; then
          echo_err " Expected SSH key: ${expected_ssh_key}, got: ${ACTUAL_SSH_KEY}"
        fi
        exit 1
      fi
    fi

    if [[ "$counter" -ge "$max_retries" ]]; then
      echo_err "Cluster creation timed out"
      exit 1
    fi
    # Not ((counter++)): that returns status 1 when counter is 0, which
    # would kill the script under errexit before the first retry.
    counter=$((counter + 1))
    sleep "$retry_interval"
  done
}
141+

0 commit comments

Comments
 (0)