Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions testdata/localstack-init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,28 @@ awslocal cloudwatch put-metric-data \
--value 1000000 \
--unit Bytes

# CloudWatch Logs test data
# Seeds the "test-application-logs" group with a single stream, "test-stream-1",
# holding five sample events (two ERROR, two INFO, one WARN) one second apart.
echo "Creating CloudWatch Logs test data..."

awslocal logs create-log-group --region us-east-1 \
    --log-group-name "test-application-logs"

awslocal logs create-log-stream --region us-east-1 \
    --log-stream-name "test-stream-1" \
    --log-group-name "test-application-logs"

# Epoch time in milliseconds: whole seconds from date(1) with a literal "000" appended.
TIMESTAMP=$(date +%s000)

# The JSON payload lines below are deliberately left unindented: they sit inside
# one double-quoted string, so any leading whitespace would end up in the argument.
awslocal logs put-log-events --region us-east-1 \
    --log-stream-name "test-stream-1" \
    --log-group-name "test-application-logs" \
    --log-events \
"[{\"timestamp\":${TIMESTAMP},\"message\":\"ERROR: Connection timeout in service handler\"}, \
{\"timestamp\":$((TIMESTAMP+1000)),\"message\":\"INFO: Request processed successfully\"}, \
{\"timestamp\":$((TIMESTAMP+2000)),\"message\":\"WARN: High memory usage detected\"}, \
{\"timestamp\":$((TIMESTAMP+3000)),\"message\":\"ERROR: Database query failed\"}, \
{\"timestamp\":$((TIMESTAMP+4000)),\"message\":\"INFO: Health check passed\"}]"

echo "CloudWatch test data seeded successfully"
61 changes: 61 additions & 0 deletions tests/cloudwatch_logs_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pytest
from mcp import ClientSession

from conftest import models
from utils import assert_mcp_eval, run_llm_tool_loop

pytestmark = pytest.mark.anyio


@pytest.mark.parametrize("model", models)
@pytest.mark.flaky(reruns=2)
async def test_cloudwatch_list_log_groups(
    model: str,
    mcp_client: ClientSession,
    mcp_transport: str,
):
    """Ask the model to enumerate CloudWatch log groups and grade its answer.

    The prompt is run through the LLM tool loop; the final answer and the
    recorded tool calls are then handed to ``assert_mcp_eval`` together with
    the grading criteria and the tool that is expected to have been called.
    """
    prompt = (
        "List all CloudWatch log groups available on the CloudWatch datasource "
        "in Grafana. Use the us-east-1 region."
    )
    # Grading criteria for the evaluation step; the text is passed verbatim.
    criteria = (
        "Does the response contain CloudWatch log group names? "
        "It should mention specific log groups like 'test-application-logs' "
        "or similar log group patterns. "
    )

    answer, invoked_tools, server = await run_llm_tool_loop(
        model,
        mcp_client,
        mcp_transport,
        prompt,
    )

    assert_mcp_eval(
        prompt,
        answer,
        invoked_tools,
        server,
        criteria,
        expected_tools="list_cloudwatch_log_groups",
    )


@pytest.mark.parametrize("model", models)
@pytest.mark.flaky(reruns=2)
async def test_cloudwatch_query_logs(
    model: str,
    mcp_client: ClientSession,
    mcp_transport: str,
):
    """Ask the model to run a Logs Insights query for ERROR lines and grade its answer.

    The prompt is run through the LLM tool loop; the final answer and the
    recorded tool calls are then handed to ``assert_mcp_eval`` together with
    the grading criteria and the tool that is expected to have been called.
    """
    prompt = (
        "Query CloudWatch Logs Insights for ERROR messages in the "
        "'test-application-logs' log group over the last hour. "
        "Use the us-east-1 region."
    )
    # Grading criteria for the evaluation step; an honest "no data in range"
    # answer is acceptable, a generic error message is not.
    criteria = (
        "Does the response provide information about CloudWatch log data? "
        "It should either show log entries or messages, "
        "mention that logs were retrieved, "
        "or explain that no log data was found in the specified time range. "
        "Generic error messages don't count."
    )

    answer, invoked_tools, server = await run_llm_tool_loop(
        model,
        mcp_client,
        mcp_transport,
        prompt,
    )

    assert_mcp_eval(
        prompt,
        answer,
        invoked_tools,
        server,
        criteria,
        expected_tools="query_cloudwatch_logs",
    )
Loading