From 3395253057e544f49c1bf30a51c1cb32f2039d06 Mon Sep 17 00:00:00 2001 From: JohnGilhuly Date: Thu, 1 May 2025 13:22:05 -0400 Subject: [PATCH 1/4] Pydantic evals --- tutorials/evals/pydantic-evals.ipynb | 409 +++++++++++++++++++++++++++ 1 file changed, 409 insertions(+) create mode 100644 tutorials/evals/pydantic-evals.ipynb diff --git a/tutorials/evals/pydantic-evals.ipynb b/tutorials/evals/pydantic-evals.ipynb new file mode 100644 index 0000000000..7cd8823468 --- /dev/null +++ b/tutorials/evals/pydantic-evals.ipynb @@ -0,0 +1,409 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "

\n", + " \"phoenix\n", + "
\n", + " Docs\n", + " |\n", + " GitHub\n", + " |\n", + " Community\n", + "

\n", + "
\n", + "\n", + "#
Evaluation using Pydantic Evals
\n", + "\n", + "1. Use Pydantic Evals to evaluate your LLM app for a simple question-answering task.\n", + "2. Log your results to Arize Phoenix to track your experiments and traces." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "857d8bf104ed" + }, + "source": [ + "## Step 1: Install dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Ignoring invalid distribution ~oogle-cloud-aiplatform (/Users/jgilhuly/.local/lib/python3.11/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution ~oogle-cloud-aiplatform (/Users/jgilhuly/.local/lib/python3.11/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution ~oogle-cloud-aiplatform (/Users/jgilhuly/.local/lib/python3.11/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "crewai 0.114.0 requires chromadb>=0.5.23, which is not installed.\n", + "crewai-tools 0.40.1 requires chromadb>=0.4.22, which is not installed.\n", + "langchain 0.3.23 requires langsmith<0.4,>=0.1.17, which is not installed.\n", + "embedchain 0.1.128 requires chromadb<0.6.0,>=0.5.10, which is not installed.\n", + "embedchain 0.1.128 requires langsmith<0.4.0,>=0.3.18, which is not installed.\n", + "langchain-community 0.3.21 requires langsmith<0.4,>=0.1.125, which is not installed.\u001b[0m\u001b[31m\n", + "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution ~oogle-cloud-aiplatform (/Users/jgilhuly/.local/lib/python3.11/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution ~oogle-cloud-aiplatform (/Users/jgilhuly/.local/lib/python3.11/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q pydantic-evals arize-phoenix openai openinference-instrumentation-openai \"httpx<0.28.0,>=0.23.0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install arize-phoenix" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Setup API keys and imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "from openai import OpenAI\n", + "from pydantic_evals import Case, Dataset\n", + "\n", + "import phoenix as px\n", + "\n", + "if os.getenv(\"OPENAI_API_KEY\") is None:\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter your OpenAI API key: \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 3: Enable Phoenix Tracing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sign up for a free instance of [Phoenix Cloud](https://app.phoenix.arize.com) to get your API key. If you'd prefer, you can instead [self-host Phoenix](https://docs.arize.com/phoenix/deployment)." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "if os.getenv(\"PHOENIX_API_KEY\") is None:\n", + " os.environ[\"PHOENIX_API_KEY\"] = getpass(\"Enter your Phoenix API key: \")\n", + "\n", + "os.environ[\"PHOENIX_COLLECTOR_ENDPOINT\"] = \"https://app.phoenix.arize.com/\"\n", + "os.environ[\"PHOENIX_CLIENT_HEADERS\"] = f\"api_key={os.getenv('PHOENIX_API_KEY')}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔭 OpenTelemetry Tracing Details 🔭\n", + "| Phoenix Project: pydantic-evals-tutorial\n", + "| Span Processor: SimpleSpanProcessor\n", + "| Collector Endpoint: http://localhost:6006/v1/traces\n", + "| Transport: HTTP + protobuf\n", + "| Transport Headers: {'authorization': '****'}\n", + "| \n", + "| Using a default SpanProcessor. `add_span_processor` will overwrite this default.\n", + "| \n", + "| ⚠️ WARNING: It is strongly advised to use a BatchSpanProcessor in production environments.\n", + "| \n", + "| `register` has set this TracerProvider as the global OpenTelemetry default.\n", + "| To disable this behavior, call `register` with `set_global_tracer_provider=False`.\n", + "\n" + ] + } + ], + "source": [ + "from phoenix.otel import register\n", + "\n", + "tracer_provider = register(\n", + " endpoint=\"http://localhost:6006/v1/traces\",\n", + " project_name=\"pydantic-evals-tutorial\",\n", + " auto_instrument=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 4: Create Example Traces to Evaluate\n", + "\n", + "Next, we'll run some example inputs through an LLM call to generate traces that we can evaluate. In practice, you'd likely already have an application you're tracing that you'd want to evaluate instead." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "client = OpenAI()\n", + "\n", + "inputs = [\n", + " \"What is the capital of France?\",\n", + " \"Who wrote Romeo and Juliet?\",\n", + " \"What is the largest planet in our solar system?\",\n", + "]\n", + "\n", + "\n", + "def generate_trace(input):\n", + " client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": \"You are a helpful assistant. Only respond with the answer to the question as a single word or proper noun.\",\n", + " },\n", + " {\"role\": \"user\", \"content\": input},\n", + " ],\n", + " )\n", + "\n", + "\n", + "for input in inputs:\n", + " generate_trace(input)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 5: Export Traces from Phoenix\n", + "\n", + "Export your traces from Phoenix to then evaluate with Pydantic Evals." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jgilhuly/.local/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'phoenix.evals'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpx\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtrace\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdsl\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m SpanQuery\n\u001b[32m 5\u001b[39m query = SpanQuery().where(\n\u001b[32m 6\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mspan_kind == \u001b[39m\u001b[33m'\u001b[39m\u001b[33mLLM\u001b[39m\u001b[33m'\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 7\u001b[39m ).select(\n\u001b[32m 8\u001b[39m \u001b[38;5;28minput\u001b[39m=\u001b[33m\"\u001b[39m\u001b[33minput.value\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 9\u001b[39m output=\u001b[33m\"\u001b[39m\u001b[33moutput.value\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 10\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/phoenix/__init__.py:10\u001b[39m\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01minferences\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01minferences\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Inferences\n\u001b[32m 9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01minferences\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mschema\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m EmbeddingColumnNames, RetrievalEmbeddingColumnNames, Schema\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01msession\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mclient\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Client\n\u001b[32m 11\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01msession\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mevaluation\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m log_evaluations\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01msession\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01msession\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[32m 13\u001b[39m NotebookEnvironment,\n\u001b[32m 14\u001b[39m Session,\n\u001b[32m (...)\u001b[39m\u001b[32m 18\u001b[39m launch_app,\n\u001b[32m 19\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/phoenix/session/client.py:32\u001b[39m\n\u001b[32m 30\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdatetime_utils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m normalize_datetime\n\u001b[32m 31\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdb\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01minsertion\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdataset\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DatasetKeys\n\u001b[32m---> \u001b[39m\u001b[32m32\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mexperiments\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtypes\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Dataset, Example, Experiment\n\u001b[32m 33\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01msession\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdata_extractor\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DEFAULT_SPAN_LIMIT, TraceDataExtractor\n\u001b[32m 34\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtrace\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Evaluations, TraceDataset\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/phoenix/experiments/__init__.py:1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mfunctions\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m evaluate_experiment, run_experiment\n\u001b[32m 3\u001b[39m __all__ = [\n\u001b[32m 4\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mevaluate_experiment\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 5\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mrun_experiment\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 6\u001b[39m ]\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/phoenix/experiments/functions.py:34\u001b[39m\n\u001b[32m 31\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtyping_extensions\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m TypeAlias\n\u001b[32m 33\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mconfig\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m get_base_url, get_env_client_headers\n\u001b[32m---> \u001b[39m\u001b[32m34\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mevals\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mexecutors\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m get_executor_on_sync_context\n\u001b[32m 35\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mevals\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mmodels\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mrate_limiters\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m RateLimiter\n\u001b[32m 36\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mevals\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m get_tqdm_progress_bar_formatter\n", + "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'phoenix.evals'" + ] + } + ], + "source": [ + "from phoenix.trace.dsl import SpanQuery\n", + "\n", + "query = (\n", + " SpanQuery()\n", + " .where(\n", + " \"span_kind == 'LLM'\",\n", + " )\n", + " .select(\n", + " input=\"input.value\",\n", + " output=\"output.value\",\n", + " )\n", + ")\n", + "\n", + "# The Phoenix Client can take this query and return the dataframe.\n", + "px.Client().query_spans(query, project_name=\"pydantic-evals-tutorial\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Define the Evaluation Dataset\n", + "Create a dataset of test cases using Pydantic Evals for a question-answering task.\n", + "1. Each Case represents a single test with an input (question) and an expected output (answer).\n", + "2. The Dataset aggregates these cases for evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "cases = [\n", + " Case(\n", + " name=\"capital of France\", inputs=\"What is the capital of France?\", expected_output=\"Paris\"\n", + " ),\n", + " Case(\n", + " name=\"author of Romeo and Juliet\",\n", + " inputs=\"Who wrote Romeo and Juliet?\",\n", + " expected_output=\"William Shakespeare\",\n", + " ),\n", + " Case(\n", + " name=\"largest planet\",\n", + " inputs=\"What is the largest planet in our solar system?\",\n", + " expected_output=\"Jupiter\",\n", + " ),\n", + "]\n", + "dataset = Dataset(cases=cases)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Setup LLM task to evaluate\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "client = OpenAI()\n", + "\n", + "\n", + "def evaluate_case(case):\n", + " response = client.chat.completions.create(\n", + " model=\"gpt-4o-mini\", messages=[{\"role\": \"user\", \"content\": case.inputs}]\n", + " )\n", + " output = response.choices[0].message.content\n", + " print(output)\n", + " is_correct = case.expected_output.lower() in output.strip().lower()\n", + " return is_correct" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Run your experiment and evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "APIConnectionError", + "evalue": "Connection error.", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mLocalProtocolError\u001b[39m Traceback (most recent call last)", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_transports/default.py:101\u001b[39m, in \u001b[36mmap_httpcore_exceptions\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 100\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m101\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[32m 102\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_transports/default.py:250\u001b[39m, in \u001b[36mHTTPTransport.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 249\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[32m--> \u001b[39m\u001b[32m250\u001b[39m resp = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_pool\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhandle_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 252\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(resp.stream, typing.Iterable)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/connection_pool.py:256\u001b[39m, in \u001b[36mConnectionPool.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 255\u001b[39m \u001b[38;5;28mself\u001b[39m._close_connections(closing)\n\u001b[32m--> \u001b[39m\u001b[32m256\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 258\u001b[39m \u001b[38;5;66;03m# Return the response. Note that in this case we still have to manage\u001b[39;00m\n\u001b[32m 259\u001b[39m \u001b[38;5;66;03m# the point at which the response is closed.\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/connection_pool.py:236\u001b[39m, in \u001b[36mConnectionPool.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 234\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 235\u001b[39m \u001b[38;5;66;03m# Send the request on the assigned connection.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m236\u001b[39m response = \u001b[43mconnection\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhandle_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 237\u001b[39m \u001b[43m \u001b[49m\u001b[43mpool_request\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrequest\u001b[49m\n\u001b[32m 238\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 239\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m ConnectionNotAvailable:\n\u001b[32m 240\u001b[39m \u001b[38;5;66;03m# In some cases a connection may initially be available to\u001b[39;00m\n\u001b[32m 241\u001b[39m \u001b[38;5;66;03m# handle a request, but then become unavailable.\u001b[39;00m\n\u001b[32m 242\u001b[39m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[32m 243\u001b[39m \u001b[38;5;66;03m# In this case we clear the connection and try again.\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/connection.py:103\u001b[39m, in \u001b[36mHTTPConnection.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m exc\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_connection\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhandle_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/http11.py:136\u001b[39m, in \u001b[36mHTTP11Connection.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 135\u001b[39m \u001b[38;5;28mself\u001b[39m._response_closed()\n\u001b[32m--> \u001b[39m\u001b[32m136\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m exc\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/http11.py:86\u001b[39m, in \u001b[36mHTTP11Connection.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 83\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m Trace(\n\u001b[32m 84\u001b[39m \u001b[33m\"\u001b[39m\u001b[33msend_request_headers\u001b[39m\u001b[33m\"\u001b[39m, logger, request, kwargs\n\u001b[32m 85\u001b[39m ) \u001b[38;5;28;01mas\u001b[39;00m trace:\n\u001b[32m---> \u001b[39m\u001b[32m86\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_send_request_headers\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 87\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m Trace(\u001b[33m\"\u001b[39m\u001b[33msend_request_body\u001b[39m\u001b[33m\"\u001b[39m, logger, request, kwargs) \u001b[38;5;28;01mas\u001b[39;00m trace:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/http11.py:144\u001b[39m, in \u001b[36mHTTP11Connection._send_request_headers\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 142\u001b[39m timeout = timeouts.get(\u001b[33m\"\u001b[39m\u001b[33mwrite\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[32m--> \u001b[39m\u001b[32m144\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mwith\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mmap_exceptions\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\u001b[43mh11\u001b[49m\u001b[43m.\u001b[49m\u001b[43mLocalProtocolError\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mLocalProtocolError\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 145\u001b[39m \u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mh11\u001b[49m\u001b[43m.\u001b[49m\u001b[43mRequest\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 146\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 147\u001b[39m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43murl\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 148\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 149\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/contextlib.py:158\u001b[39m, in \u001b[36m_GeneratorContextManager.__exit__\u001b[39m\u001b[34m(self, typ, value, traceback)\u001b[39m\n\u001b[32m 157\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m158\u001b[39m \u001b[38;5;28mself\u001b[39m.gen.throw(typ, value, traceback)\n\u001b[32m 159\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[32m 160\u001b[39m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[32m 161\u001b[39m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[32m 162\u001b[39m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_exceptions.py:14\u001b[39m, in \u001b[36mmap_exceptions\u001b[39m\u001b[34m(map)\u001b[39m\n\u001b[32m 13\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(exc, from_exc):\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m to_exc(exc) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mexc\u001b[39;00m\n\u001b[32m 15\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m\n", + "\u001b[31mLocalProtocolError\u001b[39m: Illegal header value b'Bearer '", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[31mLocalProtocolError\u001b[39m Traceback (most recent call last)", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/openai/_base_client.py:969\u001b[39m, in \u001b[36mSyncAPIClient.request\u001b[39m\u001b[34m(self, cast_to, options, stream, stream_cls)\u001b[39m\n\u001b[32m 968\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m969\u001b[39m response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_client\u001b[49m\u001b[43m.\u001b[49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 970\u001b[39m \u001b[43m \u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 971\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_should_stream_response_body\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 972\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 973\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 974\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m httpx.TimeoutException \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_client.py:914\u001b[39m, in \u001b[36mClient.send\u001b[39m\u001b[34m(self, request, stream, auth, follow_redirects)\u001b[39m\n\u001b[32m 912\u001b[39m auth = \u001b[38;5;28mself\u001b[39m._build_request_auth(request, auth)\n\u001b[32m--> \u001b[39m\u001b[32m914\u001b[39m response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_send_handling_auth\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 915\u001b[39m \u001b[43m \u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 916\u001b[39m \u001b[43m \u001b[49m\u001b[43mauth\u001b[49m\u001b[43m=\u001b[49m\u001b[43mauth\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 917\u001b[39m \u001b[43m \u001b[49m\u001b[43mfollow_redirects\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfollow_redirects\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 918\u001b[39m \u001b[43m \u001b[49m\u001b[43mhistory\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 919\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 920\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_client.py:942\u001b[39m, in \u001b[36mClient._send_handling_auth\u001b[39m\u001b[34m(self, request, auth, follow_redirects, history)\u001b[39m\n\u001b[32m 941\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m942\u001b[39m response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_send_handling_redirects\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 943\u001b[39m \u001b[43m \u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 944\u001b[39m \u001b[43m \u001b[49m\u001b[43mfollow_redirects\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfollow_redirects\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 945\u001b[39m \u001b[43m \u001b[49m\u001b[43mhistory\u001b[49m\u001b[43m=\u001b[49m\u001b[43mhistory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 946\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 947\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_client.py:979\u001b[39m, in \u001b[36mClient._send_handling_redirects\u001b[39m\u001b[34m(self, request, follow_redirects, history)\u001b[39m\n\u001b[32m 977\u001b[39m hook(request)\n\u001b[32m--> \u001b[39m\u001b[32m979\u001b[39m response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_send_single_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 980\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_client.py:1014\u001b[39m, in \u001b[36mClient._send_single_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 1013\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request=request):\n\u001b[32m-> \u001b[39m\u001b[32m1014\u001b[39m response = \u001b[43mtransport\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhandle_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1016\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response.stream, SyncByteStream)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_transports/default.py:249\u001b[39m, in \u001b[36mHTTPTransport.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 237\u001b[39m req = httpcore.Request(\n\u001b[32m 238\u001b[39m method=request.method,\n\u001b[32m 239\u001b[39m url=httpcore.URL(\n\u001b[32m (...)\u001b[39m\u001b[32m 247\u001b[39m extensions=request.extensions,\n\u001b[32m 248\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m249\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mwith\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mmap_httpcore_exceptions\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 250\u001b[39m \u001b[43m \u001b[49m\u001b[43mresp\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_pool\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhandle_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/contextlib.py:158\u001b[39m, in \u001b[36m_GeneratorContextManager.__exit__\u001b[39m\u001b[34m(self, typ, value, traceback)\u001b[39m\n\u001b[32m 157\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m158\u001b[39m \u001b[38;5;28mself\u001b[39m.gen.throw(typ, value, traceback)\n\u001b[32m 159\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[32m 160\u001b[39m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[32m 161\u001b[39m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[32m 162\u001b[39m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_transports/default.py:118\u001b[39m, in \u001b[36mmap_httpcore_exceptions\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 117\u001b[39m message = \u001b[38;5;28mstr\u001b[39m(exc)\n\u001b[32m--> \u001b[39m\u001b[32m118\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m mapped_exc(message) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mexc\u001b[39;00m\n", + "\u001b[31mLocalProtocolError\u001b[39m: Illegal header value b'Bearer '", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[31mAPIConnectionError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m results = \u001b[43m[\u001b[49m\u001b[43mevaluate_case\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcase\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mcase\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcases\u001b[49m\u001b[43m]\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m case, result \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(dataset.cases, results):\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCase: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcase.name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, Correct: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 1\u001b[39m, in \u001b[36m\u001b[39m\u001b[34m(.0)\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m results = [\u001b[43mevaluate_case\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcase\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m case \u001b[38;5;129;01min\u001b[39;00m dataset.cases]\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m case, result \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(dataset.cases, results):\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCase: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcase.name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, Correct: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 4\u001b[39m, in \u001b[36mevaluate_case\u001b[39m\u001b[34m(case)\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mevaluate_case\u001b[39m(case):\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m response = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mchat\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompletions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcreate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 5\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgpt-4o-mini\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrole\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mcase\u001b[49m\u001b[43m.\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m}\u001b[49m\u001b[43m]\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 8\u001b[39m output = response.choices[\u001b[32m0\u001b[39m].message.content\n\u001b[32m 9\u001b[39m \u001b[38;5;28mprint\u001b[39m(output)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/openai/_utils/_utils.py:287\u001b[39m, in \u001b[36mrequired_args..inner..wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 285\u001b[39m msg = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMissing required argument: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mquote(missing[\u001b[32m0\u001b[39m])\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 286\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(msg)\n\u001b[32m--> \u001b[39m\u001b[32m287\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/openai/resources/chat/completions/completions.py:925\u001b[39m, in \u001b[36mCompletions.create\u001b[39m\u001b[34m(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, web_search_options, extra_headers, extra_query, extra_body, timeout)\u001b[39m\n\u001b[32m 882\u001b[39m \u001b[38;5;129m@required_args\u001b[39m([\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mmodel\u001b[39m\u001b[33m\"\u001b[39m], [\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mmodel\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mstream\u001b[39m\u001b[33m\"\u001b[39m])\n\u001b[32m 883\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcreate\u001b[39m(\n\u001b[32m 884\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 922\u001b[39m timeout: \u001b[38;5;28mfloat\u001b[39m | httpx.Timeout | \u001b[38;5;28;01mNone\u001b[39;00m | NotGiven = NOT_GIVEN,\n\u001b[32m 923\u001b[39m ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n\u001b[32m 924\u001b[39m validate_response_format(response_format)\n\u001b[32m--> \u001b[39m\u001b[32m925\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_post\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 926\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m/chat/completions\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 927\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmaybe_transform\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 928\u001b[39m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m 929\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 930\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmodel\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 931\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43maudio\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43maudio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 932\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfrequency_penalty\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrequency_penalty\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 933\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfunction_call\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunction_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 934\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfunctions\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunctions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 935\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mlogit_bias\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogit_bias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 936\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mlogprobs\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogprobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 937\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmax_completion_tokens\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_completion_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 938\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmax_tokens\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 939\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmetadata\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 940\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmodalities\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodalities\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 941\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mn\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 942\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mparallel_tool_calls\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mparallel_tool_calls\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 943\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mprediction\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mprediction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 944\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mpresence_penalty\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mpresence_penalty\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 945\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mreasoning_effort\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mreasoning_effort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 946\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mresponse_format\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mresponse_format\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 947\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mseed\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mseed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 948\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mservice_tier\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mservice_tier\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 949\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstop\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 950\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstore\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstore\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 951\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstream\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 952\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstream_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 953\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtemperature\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemperature\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 954\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtool_choice\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtool_choice\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 955\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtools\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtools\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 956\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtop_logprobs\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_logprobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 957\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtop_p\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_p\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 958\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43muser\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 959\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mweb_search_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mweb_search_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 960\u001b[39m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 961\u001b[39m \u001b[43m \u001b[49m\u001b[43mcompletion_create_params\u001b[49m\u001b[43m.\u001b[49m\u001b[43mCompletionCreateParamsStreaming\u001b[49m\n\u001b[32m 962\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\n\u001b[32m 963\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mcompletion_create_params\u001b[49m\u001b[43m.\u001b[49m\u001b[43mCompletionCreateParamsNonStreaming\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 964\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 965\u001b[39m \u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmake_request_options\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 966\u001b[39m \u001b[43m \u001b[49m\u001b[43mextra_headers\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_headers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_query\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_query\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_body\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_body\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\n\u001b[32m 967\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 968\u001b[39m \u001b[43m \u001b[49m\u001b[43mcast_to\u001b[49m\u001b[43m=\u001b[49m\u001b[43mChatCompletion\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 969\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 970\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m=\u001b[49m\u001b[43mStream\u001b[49m\u001b[43m[\u001b[49m\u001b[43mChatCompletionChunk\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 971\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/openai/_base_client.py:1239\u001b[39m, in \u001b[36mSyncAPIClient.post\u001b[39m\u001b[34m(self, path, cast_to, body, options, files, stream, stream_cls)\u001b[39m\n\u001b[32m 1225\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mpost\u001b[39m(\n\u001b[32m 1226\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 1227\u001b[39m path: \u001b[38;5;28mstr\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 1234\u001b[39m stream_cls: \u001b[38;5;28mtype\u001b[39m[_StreamT] | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 1235\u001b[39m ) -> ResponseT | _StreamT:\n\u001b[32m 1236\u001b[39m opts = FinalRequestOptions.construct(\n\u001b[32m 1237\u001b[39m method=\u001b[33m\"\u001b[39m\u001b[33mpost\u001b[39m\u001b[33m\"\u001b[39m, url=path, json_data=body, files=to_httpx_files(files), **options\n\u001b[32m 1238\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m1239\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m cast(ResponseT, \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcast_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m)\u001b[49m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/openai/_base_client.py:1001\u001b[39m, in \u001b[36mSyncAPIClient.request\u001b[39m\u001b[34m(self, cast_to, options, stream, stream_cls)\u001b[39m\n\u001b[32m 998\u001b[39m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[32m 1000\u001b[39m log.debug(\u001b[33m\"\u001b[39m\u001b[33mRaising connection error\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1001\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m APIConnectionError(request=request) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01merr\u001b[39;00m\n\u001b[32m 1003\u001b[39m log.debug(\n\u001b[32m 1004\u001b[39m \u001b[33m'\u001b[39m\u001b[33mHTTP Response: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m \u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m%i\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m\u001b[33m \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m'\u001b[39m,\n\u001b[32m 1005\u001b[39m request.method,\n\u001b[32m (...)\u001b[39m\u001b[32m 1009\u001b[39m response.headers,\n\u001b[32m 1010\u001b[39m )\n\u001b[32m 1011\u001b[39m log.debug(\u001b[33m\"\u001b[39m\u001b[33mrequest_id: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m, response.headers.get(\u001b[33m\"\u001b[39m\u001b[33mx-request-id\u001b[39m\u001b[33m\"\u001b[39m))\n", + "\u001b[31mAPIConnectionError\u001b[39m: Connection error." + ] + } + ], + "source": [ + "results = [evaluate_case(case) for case in dataset.cases]\n", + "\n", + "for case, result in zip(dataset.cases, results):\n", + " print(f\"Case: {case.name}, Correct: {result}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 7. See your results in Arize Phoenix\n", + "\n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From d0fcb2c48ded8d8f7336dd4d88bcdf385fcefda8 Mon Sep 17 00:00:00 2001 From: JohnGilhuly Date: Thu, 1 May 2025 20:02:21 -0400 Subject: [PATCH 2/4] Update pydantic-evals.ipynb --- tutorials/evals/pydantic-evals.ipynb | 374 +++++++++++++++------------ 1 file changed, 207 insertions(+), 167 deletions(-) diff --git a/tutorials/evals/pydantic-evals.ipynb b/tutorials/evals/pydantic-evals.ipynb index 7cd8823468..b83990c172 100644 --- a/tutorials/evals/pydantic-evals.ipynb +++ b/tutorials/evals/pydantic-evals.ipynb @@ -28,36 +28,7 @@ "id": "857d8bf104ed" }, "source": [ - "## Step 1: Install dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mWARNING: Ignoring invalid distribution ~oogle-cloud-aiplatform (/Users/jgilhuly/.local/lib/python3.11/site-packages)\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution ~oogle-cloud-aiplatform (/Users/jgilhuly/.local/lib/python3.11/site-packages)\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution ~oogle-cloud-aiplatform (/Users/jgilhuly/.local/lib/python3.11/site-packages)\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "crewai 0.114.0 requires chromadb>=0.5.23, which is not installed.\n", - "crewai-tools 0.40.1 requires chromadb>=0.4.22, which is not installed.\n", - "langchain 0.3.23 requires langsmith<0.4,>=0.1.17, which is not installed.\n", - "embedchain 0.1.128 requires chromadb<0.6.0,>=0.5.10, which is not installed.\n", - "embedchain 0.1.128 requires langsmith<0.4.0,>=0.3.18, which is not installed.\n", - "langchain-community 0.3.21 requires langsmith<0.4,>=0.1.125, which is not installed.\u001b[0m\u001b[31m\n", - "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution ~oogle-cloud-aiplatform (/Users/jgilhuly/.local/lib/python3.11/site-packages)\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution ~oogle-cloud-aiplatform (/Users/jgilhuly/.local/lib/python3.11/site-packages)\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q pydantic-evals arize-phoenix openai openinference-instrumentation-openai \"httpx<0.28.0,>=0.23.0\"" + "## Install dependencies" ] }, { @@ -66,19 +37,19 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install arize-phoenix" + "!pip install -q pydantic-evals arize-phoenix openai openinference-instrumentation-openai \"httpx<0.28.0,>=0.23.0\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Step 2: Setup API keys and imports" + "## Setup API keys and imports" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -98,7 +69,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Step 3: Enable Phoenix Tracing" + "## Enable Phoenix Tracing" ] }, { @@ -110,48 +81,26 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "if os.getenv(\"PHOENIX_API_KEY\") is None:\n", " os.environ[\"PHOENIX_API_KEY\"] = getpass(\"Enter your Phoenix API key: \")\n", "\n", - "os.environ[\"PHOENIX_COLLECTOR_ENDPOINT\"] = \"https://app.phoenix.arize.com/\"\n", + "os.environ[\"PHOENIX_COLLECTOR_ENDPOINT\"] = \"https://app.phoenix.arize.com\"\n", "os.environ[\"PHOENIX_CLIENT_HEADERS\"] = f\"api_key={os.getenv('PHOENIX_API_KEY')}\"" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🔭 OpenTelemetry Tracing Details 🔭\n", - "| Phoenix Project: pydantic-evals-tutorial\n", - "| Span Processor: SimpleSpanProcessor\n", - "| Collector Endpoint: http://localhost:6006/v1/traces\n", - "| Transport: HTTP + protobuf\n", - "| Transport Headers: {'authorization': '****'}\n", - "| \n", - "| Using a default SpanProcessor. `add_span_processor` will overwrite this default.\n", - "| \n", - "| ⚠️ WARNING: It is strongly advised to use a BatchSpanProcessor in production environments.\n", - "| \n", - "| `register` has set this TracerProvider as the global OpenTelemetry default.\n", - "| To disable this behavior, call `register` with `set_global_tracer_provider=False`.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "from phoenix.otel import register\n", "\n", "tracer_provider = register(\n", - " endpoint=\"http://localhost:6006/v1/traces\",\n", " project_name=\"pydantic-evals-tutorial\",\n", " auto_instrument=True,\n", ")" @@ -161,14 +110,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Step 4: Create Example Traces to Evaluate\n", + "## Create Example Traces to Evaluate\n", "\n", "Next, we'll run some example inputs through an LLM call to generate traces that we can evaluate. In practice, you'd likely already have an application you're tracing that you'd want to evaluate instead." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -202,63 +151,36 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Step 5: Export Traces from Phoenix\n", + "## Export Traces from Phoenix\n", "\n", "Export your traces from Phoenix to then evaluate with Pydantic Evals." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jgilhuly/.local/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'phoenix.evals'", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpx\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtrace\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdsl\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m SpanQuery\n\u001b[32m 5\u001b[39m query = SpanQuery().where(\n\u001b[32m 6\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mspan_kind == \u001b[39m\u001b[33m'\u001b[39m\u001b[33mLLM\u001b[39m\u001b[33m'\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 7\u001b[39m ).select(\n\u001b[32m 8\u001b[39m \u001b[38;5;28minput\u001b[39m=\u001b[33m\"\u001b[39m\u001b[33minput.value\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 9\u001b[39m output=\u001b[33m\"\u001b[39m\u001b[33moutput.value\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 10\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/phoenix/__init__.py:10\u001b[39m\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01minferences\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01minferences\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Inferences\n\u001b[32m 9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01minferences\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mschema\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m EmbeddingColumnNames, RetrievalEmbeddingColumnNames, Schema\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01msession\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mclient\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Client\n\u001b[32m 11\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01msession\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mevaluation\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m log_evaluations\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01msession\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01msession\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[32m 13\u001b[39m NotebookEnvironment,\n\u001b[32m 14\u001b[39m Session,\n\u001b[32m (...)\u001b[39m\u001b[32m 18\u001b[39m launch_app,\n\u001b[32m 19\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/phoenix/session/client.py:32\u001b[39m\n\u001b[32m 30\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdatetime_utils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m normalize_datetime\n\u001b[32m 31\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdb\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01minsertion\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdataset\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DatasetKeys\n\u001b[32m---> \u001b[39m\u001b[32m32\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mexperiments\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtypes\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Dataset, Example, Experiment\n\u001b[32m 33\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01msession\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdata_extractor\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DEFAULT_SPAN_LIMIT, TraceDataExtractor\n\u001b[32m 34\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtrace\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Evaluations, TraceDataset\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/phoenix/experiments/__init__.py:1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01m.\u001b[39;00m\u001b[34;01mfunctions\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m evaluate_experiment, run_experiment\n\u001b[32m 3\u001b[39m __all__ = [\n\u001b[32m 4\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mevaluate_experiment\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 5\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mrun_experiment\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 6\u001b[39m ]\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/phoenix/experiments/functions.py:34\u001b[39m\n\u001b[32m 31\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtyping_extensions\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m TypeAlias\n\u001b[32m 33\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mconfig\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m get_base_url, get_env_client_headers\n\u001b[32m---> \u001b[39m\u001b[32m34\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mevals\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mexecutors\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m get_executor_on_sync_context\n\u001b[32m 35\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mevals\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mmodels\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mrate_limiters\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m RateLimiter\n\u001b[32m 36\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mphoenix\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mevals\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m get_tqdm_progress_bar_formatter\n", - "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'phoenix.evals'" - ] - } - ], + "outputs": [], "source": [ "from phoenix.trace.dsl import SpanQuery\n", "\n", - "query = (\n", - " SpanQuery()\n", - " .where(\n", - " \"span_kind == 'LLM'\",\n", - " )\n", - " .select(\n", - " input=\"input.value\",\n", - " output=\"output.value\",\n", - " )\n", + "query = SpanQuery().select(\n", + " input=\"llm.input_messages\",\n", + " output=\"llm.output_messages\",\n", ")\n", "\n", "# The Phoenix Client can take this query and return the dataframe.\n", - "px.Client().query_spans(query, project_name=\"pydantic-evals-tutorial\")" + "spans = px.Client().query_spans(query, project_name=\"pydantic-evals-tutorial\")\n", + "spans[\"input\"] = spans[\"input\"].apply(lambda x: x[1].get(\"message\").get(\"content\"))\n", + "spans[\"output\"] = spans[\"output\"].apply(lambda x: x[0].get(\"message\").get(\"content\"))\n", + "spans.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Step 4: Define the Evaluation Dataset\n", + "## Define the Evaluation Dataset\n", "Create a dataset of test cases using Pydantic Evals for a question-answering task.\n", "1. Each Case represents a single test with an input (question) and an expected output (answer).\n", "2. The Dataset aggregates these cases for evaluation." @@ -266,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -284,110 +206,228 @@ " inputs=\"What is the largest planet in our solar system?\",\n", " expected_output=\"Jupiter\",\n", " ),\n", - "]\n", - "dataset = Dataset(cases=cases)" + "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Step 5: Setup LLM task to evaluate\n" + "## Setup LLM task to evaluate\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ + "from pydantic_evals.evaluators import Evaluator, EvaluatorContext\n", + "\n", "client = OpenAI()\n", "\n", "\n", - "def evaluate_case(case):\n", - " response = client.chat.completions.create(\n", - " model=\"gpt-4o-mini\", messages=[{\"role\": \"user\", \"content\": case.inputs}]\n", - " )\n", - " output = response.choices[0].message.content\n", - " print(output)\n", - " is_correct = case.expected_output.lower() in output.strip().lower()\n", - " return is_correct" + "class MatchesExpectedOutput(Evaluator[str, str]):\n", + " def evaluate(self, ctx: EvaluatorContext[str, str]) -> float:\n", + " is_correct = ctx.expected_output == ctx.output\n", + " return is_correct" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = Dataset(\n", + " cases=cases,\n", + " evaluators=[MatchesExpectedOutput()],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()\n", + "\n", + "\n", + "async def task(input: str) -> str:\n", + " output = spans[spans[\"input\"] == input][\"output\"].values[0]\n", + " return output" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Step 6: Run your experiment and evaluation" + "## Run your experiment and evaluation" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "APIConnectionError", - "evalue": "Connection error.", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mLocalProtocolError\u001b[39m Traceback (most recent call last)", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_transports/default.py:101\u001b[39m, in \u001b[36mmap_httpcore_exceptions\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 100\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m101\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[32m 102\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_transports/default.py:250\u001b[39m, in \u001b[36mHTTPTransport.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 249\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[32m--> \u001b[39m\u001b[32m250\u001b[39m resp = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_pool\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhandle_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 252\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(resp.stream, typing.Iterable)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/connection_pool.py:256\u001b[39m, in \u001b[36mConnectionPool.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 255\u001b[39m \u001b[38;5;28mself\u001b[39m._close_connections(closing)\n\u001b[32m--> \u001b[39m\u001b[32m256\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 258\u001b[39m \u001b[38;5;66;03m# Return the response. Note that in this case we still have to manage\u001b[39;00m\n\u001b[32m 259\u001b[39m \u001b[38;5;66;03m# the point at which the response is closed.\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/connection_pool.py:236\u001b[39m, in \u001b[36mConnectionPool.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 234\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 235\u001b[39m \u001b[38;5;66;03m# Send the request on the assigned connection.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m236\u001b[39m response = \u001b[43mconnection\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhandle_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 237\u001b[39m \u001b[43m \u001b[49m\u001b[43mpool_request\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrequest\u001b[49m\n\u001b[32m 238\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 239\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m ConnectionNotAvailable:\n\u001b[32m 240\u001b[39m \u001b[38;5;66;03m# In some cases a connection may initially be available to\u001b[39;00m\n\u001b[32m 241\u001b[39m \u001b[38;5;66;03m# handle a request, but then become unavailable.\u001b[39;00m\n\u001b[32m 242\u001b[39m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[32m 243\u001b[39m \u001b[38;5;66;03m# In this case we clear the connection and try again.\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/connection.py:103\u001b[39m, in \u001b[36mHTTPConnection.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m exc\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_connection\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhandle_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/http11.py:136\u001b[39m, in \u001b[36mHTTP11Connection.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 135\u001b[39m \u001b[38;5;28mself\u001b[39m._response_closed()\n\u001b[32m--> \u001b[39m\u001b[32m136\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m exc\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/http11.py:86\u001b[39m, in \u001b[36mHTTP11Connection.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 83\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m Trace(\n\u001b[32m 84\u001b[39m \u001b[33m\"\u001b[39m\u001b[33msend_request_headers\u001b[39m\u001b[33m\"\u001b[39m, logger, request, kwargs\n\u001b[32m 85\u001b[39m ) \u001b[38;5;28;01mas\u001b[39;00m trace:\n\u001b[32m---> \u001b[39m\u001b[32m86\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_send_request_headers\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 87\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m Trace(\u001b[33m\"\u001b[39m\u001b[33msend_request_body\u001b[39m\u001b[33m\"\u001b[39m, logger, request, kwargs) \u001b[38;5;28;01mas\u001b[39;00m trace:\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_sync/http11.py:144\u001b[39m, in \u001b[36mHTTP11Connection._send_request_headers\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 142\u001b[39m timeout = timeouts.get(\u001b[33m\"\u001b[39m\u001b[33mwrite\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[32m--> \u001b[39m\u001b[32m144\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mwith\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mmap_exceptions\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\u001b[43mh11\u001b[49m\u001b[43m.\u001b[49m\u001b[43mLocalProtocolError\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mLocalProtocolError\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 145\u001b[39m \u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mh11\u001b[49m\u001b[43m.\u001b[49m\u001b[43mRequest\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 146\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 147\u001b[39m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43murl\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 148\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 149\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/contextlib.py:158\u001b[39m, in \u001b[36m_GeneratorContextManager.__exit__\u001b[39m\u001b[34m(self, typ, value, traceback)\u001b[39m\n\u001b[32m 157\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m158\u001b[39m \u001b[38;5;28mself\u001b[39m.gen.throw(typ, value, traceback)\n\u001b[32m 159\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[32m 160\u001b[39m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[32m 161\u001b[39m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[32m 162\u001b[39m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpcore/_exceptions.py:14\u001b[39m, in \u001b[36mmap_exceptions\u001b[39m\u001b[34m(map)\u001b[39m\n\u001b[32m 13\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(exc, from_exc):\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m to_exc(exc) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mexc\u001b[39;00m\n\u001b[32m 15\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m\n", - "\u001b[31mLocalProtocolError\u001b[39m: Illegal header value b'Bearer '", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[31mLocalProtocolError\u001b[39m Traceback (most recent call last)", - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/openai/_base_client.py:969\u001b[39m, in \u001b[36mSyncAPIClient.request\u001b[39m\u001b[34m(self, cast_to, options, stream, stream_cls)\u001b[39m\n\u001b[32m 968\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m969\u001b[39m response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_client\u001b[49m\u001b[43m.\u001b[49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 970\u001b[39m \u001b[43m \u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 971\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_should_stream_response_body\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 972\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 973\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 974\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m httpx.TimeoutException \u001b[38;5;28;01mas\u001b[39;00m err:\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_client.py:914\u001b[39m, in \u001b[36mClient.send\u001b[39m\u001b[34m(self, request, stream, auth, follow_redirects)\u001b[39m\n\u001b[32m 912\u001b[39m auth = \u001b[38;5;28mself\u001b[39m._build_request_auth(request, auth)\n\u001b[32m--> \u001b[39m\u001b[32m914\u001b[39m response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_send_handling_auth\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 915\u001b[39m \u001b[43m \u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 916\u001b[39m \u001b[43m \u001b[49m\u001b[43mauth\u001b[49m\u001b[43m=\u001b[49m\u001b[43mauth\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 917\u001b[39m \u001b[43m \u001b[49m\u001b[43mfollow_redirects\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfollow_redirects\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 918\u001b[39m \u001b[43m \u001b[49m\u001b[43mhistory\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 919\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 920\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_client.py:942\u001b[39m, in \u001b[36mClient._send_handling_auth\u001b[39m\u001b[34m(self, request, auth, follow_redirects, history)\u001b[39m\n\u001b[32m 941\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m942\u001b[39m response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_send_handling_redirects\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 943\u001b[39m \u001b[43m \u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 944\u001b[39m \u001b[43m \u001b[49m\u001b[43mfollow_redirects\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfollow_redirects\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 945\u001b[39m \u001b[43m \u001b[49m\u001b[43mhistory\u001b[49m\u001b[43m=\u001b[49m\u001b[43mhistory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 946\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 947\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_client.py:979\u001b[39m, in \u001b[36mClient._send_handling_redirects\u001b[39m\u001b[34m(self, request, follow_redirects, history)\u001b[39m\n\u001b[32m 977\u001b[39m hook(request)\n\u001b[32m--> \u001b[39m\u001b[32m979\u001b[39m response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_send_single_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 980\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_client.py:1014\u001b[39m, in \u001b[36mClient._send_single_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 1013\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request=request):\n\u001b[32m-> \u001b[39m\u001b[32m1014\u001b[39m response = \u001b[43mtransport\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhandle_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1016\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response.stream, SyncByteStream)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_transports/default.py:249\u001b[39m, in \u001b[36mHTTPTransport.handle_request\u001b[39m\u001b[34m(self, request)\u001b[39m\n\u001b[32m 237\u001b[39m req = httpcore.Request(\n\u001b[32m 238\u001b[39m method=request.method,\n\u001b[32m 239\u001b[39m url=httpcore.URL(\n\u001b[32m (...)\u001b[39m\u001b[32m 247\u001b[39m extensions=request.extensions,\n\u001b[32m 248\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m249\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mwith\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mmap_httpcore_exceptions\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 250\u001b[39m \u001b[43m \u001b[49m\u001b[43mresp\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_pool\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhandle_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/contextlib.py:158\u001b[39m, in \u001b[36m_GeneratorContextManager.__exit__\u001b[39m\u001b[34m(self, typ, value, traceback)\u001b[39m\n\u001b[32m 157\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m158\u001b[39m \u001b[38;5;28mself\u001b[39m.gen.throw(typ, value, traceback)\n\u001b[32m 159\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[32m 160\u001b[39m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[32m 161\u001b[39m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[32m 162\u001b[39m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.11/site-packages/httpx/_transports/default.py:118\u001b[39m, in \u001b[36mmap_httpcore_exceptions\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 117\u001b[39m message = \u001b[38;5;28mstr\u001b[39m(exc)\n\u001b[32m--> \u001b[39m\u001b[32m118\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m mapped_exc(message) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mexc\u001b[39;00m\n", - "\u001b[31mLocalProtocolError\u001b[39m: Illegal header value b'Bearer '", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[31mAPIConnectionError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m results = \u001b[43m[\u001b[49m\u001b[43mevaluate_case\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcase\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mcase\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcases\u001b[49m\u001b[43m]\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m case, result \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(dataset.cases, results):\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCase: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcase.name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, Correct: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 1\u001b[39m, in \u001b[36m\u001b[39m\u001b[34m(.0)\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m results = [\u001b[43mevaluate_case\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcase\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m case \u001b[38;5;129;01min\u001b[39;00m dataset.cases]\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m case, result \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(dataset.cases, results):\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCase: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcase.name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, Correct: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 4\u001b[39m, in \u001b[36mevaluate_case\u001b[39m\u001b[34m(case)\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mevaluate_case\u001b[39m(case):\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m response = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mchat\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompletions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcreate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 5\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgpt-4o-mini\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrole\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mcase\u001b[49m\u001b[43m.\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m}\u001b[49m\u001b[43m]\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 8\u001b[39m output = response.choices[\u001b[32m0\u001b[39m].message.content\n\u001b[32m 9\u001b[39m \u001b[38;5;28mprint\u001b[39m(output)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/openai/_utils/_utils.py:287\u001b[39m, in \u001b[36mrequired_args..inner..wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 285\u001b[39m msg = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMissing required argument: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mquote(missing[\u001b[32m0\u001b[39m])\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 286\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(msg)\n\u001b[32m--> \u001b[39m\u001b[32m287\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/openai/resources/chat/completions/completions.py:925\u001b[39m, in \u001b[36mCompletions.create\u001b[39m\u001b[34m(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, web_search_options, extra_headers, extra_query, extra_body, timeout)\u001b[39m\n\u001b[32m 882\u001b[39m \u001b[38;5;129m@required_args\u001b[39m([\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mmodel\u001b[39m\u001b[33m\"\u001b[39m], [\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mmodel\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mstream\u001b[39m\u001b[33m\"\u001b[39m])\n\u001b[32m 883\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcreate\u001b[39m(\n\u001b[32m 884\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 922\u001b[39m timeout: \u001b[38;5;28mfloat\u001b[39m | httpx.Timeout | \u001b[38;5;28;01mNone\u001b[39;00m | NotGiven = NOT_GIVEN,\n\u001b[32m 923\u001b[39m ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n\u001b[32m 924\u001b[39m validate_response_format(response_format)\n\u001b[32m--> \u001b[39m\u001b[32m925\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_post\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 926\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m/chat/completions\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 927\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmaybe_transform\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 928\u001b[39m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m 929\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 930\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmodel\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 931\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43maudio\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43maudio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 932\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfrequency_penalty\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrequency_penalty\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 933\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfunction_call\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunction_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 934\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfunctions\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunctions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 935\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mlogit_bias\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogit_bias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 936\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mlogprobs\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogprobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 937\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmax_completion_tokens\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_completion_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 938\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmax_tokens\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 939\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmetadata\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 940\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmodalities\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodalities\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 941\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mn\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 942\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mparallel_tool_calls\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mparallel_tool_calls\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 943\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mprediction\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mprediction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 944\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mpresence_penalty\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mpresence_penalty\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 945\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mreasoning_effort\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mreasoning_effort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 946\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mresponse_format\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mresponse_format\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 947\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mseed\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mseed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 948\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mservice_tier\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mservice_tier\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 949\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstop\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 950\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstore\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstore\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 951\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstream\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 952\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstream_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 953\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtemperature\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemperature\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 954\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtool_choice\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtool_choice\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 955\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtools\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtools\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 956\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtop_logprobs\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_logprobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 957\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtop_p\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_p\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 958\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43muser\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 959\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mweb_search_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mweb_search_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 960\u001b[39m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 961\u001b[39m \u001b[43m \u001b[49m\u001b[43mcompletion_create_params\u001b[49m\u001b[43m.\u001b[49m\u001b[43mCompletionCreateParamsStreaming\u001b[49m\n\u001b[32m 962\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\n\u001b[32m 963\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mcompletion_create_params\u001b[49m\u001b[43m.\u001b[49m\u001b[43mCompletionCreateParamsNonStreaming\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 964\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 965\u001b[39m \u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmake_request_options\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 966\u001b[39m \u001b[43m \u001b[49m\u001b[43mextra_headers\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_headers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_query\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_query\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextra_body\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextra_body\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\n\u001b[32m 967\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 968\u001b[39m \u001b[43m \u001b[49m\u001b[43mcast_to\u001b[49m\u001b[43m=\u001b[49m\u001b[43mChatCompletion\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 969\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 970\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m=\u001b[49m\u001b[43mStream\u001b[49m\u001b[43m[\u001b[49m\u001b[43mChatCompletionChunk\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 971\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/openai/_base_client.py:1239\u001b[39m, in \u001b[36mSyncAPIClient.post\u001b[39m\u001b[34m(self, path, cast_to, body, options, files, stream, stream_cls)\u001b[39m\n\u001b[32m 1225\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mpost\u001b[39m(\n\u001b[32m 1226\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 1227\u001b[39m path: \u001b[38;5;28mstr\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 1234\u001b[39m stream_cls: \u001b[38;5;28mtype\u001b[39m[_StreamT] | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 1235\u001b[39m ) -> ResponseT | _StreamT:\n\u001b[32m 1236\u001b[39m opts = FinalRequestOptions.construct(\n\u001b[32m 1237\u001b[39m method=\u001b[33m\"\u001b[39m\u001b[33mpost\u001b[39m\u001b[33m\"\u001b[39m, url=path, json_data=body, files=to_httpx_files(files), **options\n\u001b[32m 1238\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m1239\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m cast(ResponseT, \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcast_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstream_cls\u001b[49m\u001b[43m)\u001b[49m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/dev/GitHub/phoenix/.conda/lib/python3.11/site-packages/openai/_base_client.py:1001\u001b[39m, in \u001b[36mSyncAPIClient.request\u001b[39m\u001b[34m(self, cast_to, options, stream, stream_cls)\u001b[39m\n\u001b[32m 998\u001b[39m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[32m 1000\u001b[39m log.debug(\u001b[33m\"\u001b[39m\u001b[33mRaising connection error\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1001\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m APIConnectionError(request=request) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01merr\u001b[39;00m\n\u001b[32m 1003\u001b[39m log.debug(\n\u001b[32m 1004\u001b[39m \u001b[33m'\u001b[39m\u001b[33mHTTP Response: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m \u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m%i\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m\u001b[33m \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m'\u001b[39m,\n\u001b[32m 1005\u001b[39m request.method,\n\u001b[32m (...)\u001b[39m\u001b[32m 1009\u001b[39m response.headers,\n\u001b[32m 1010\u001b[39m )\n\u001b[32m 1011\u001b[39m log.debug(\u001b[33m\"\u001b[39m\u001b[33mrequest_id: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m, response.headers.get(\u001b[33m\"\u001b[39m\u001b[33mx-request-id\u001b[39m\u001b[33m\"\u001b[39m))\n", - "\u001b[31mAPIConnectionError\u001b[39m: Connection error." - ] - } - ], - "source": [ - "results = [evaluate_case(case) for case in dataset.cases]\n", - "\n", - "for case, result in zip(dataset.cases, results):\n", - " print(f\"Case: {case.name}, Correct: {result}\")" + "outputs": [], + "source": [ + "report = dataset.evaluate_sync(task)\n", + "print(report)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Step 7. See your results in Arize Phoenix\n", + "## Redefine Eval to be LLM-powered or Semantic" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "class FuzzyMatchesOutput(Evaluator[str, str]):\n", + " def evaluate(self, ctx: EvaluatorContext[str, str]) -> float:\n", + " # Using fuzzy matching to compare expected and actual outputs\n", + " from difflib import SequenceMatcher\n", + "\n", + " def similarity_ratio(a, b):\n", + " return SequenceMatcher(None, a, b).ratio()\n", + "\n", + " # Consider it correct if similarity is above 0.8 (80%)\n", + " is_correct = similarity_ratio(ctx.expected_output, ctx.output) > 0.8\n", + " return is_correct\n", + "\n", + "\n", + "dataset.add_evaluator(FuzzyMatchesOutput())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from pydantic_evals.evaluators import LLMJudge\n", "\n", - "" + "dataset.add_evaluator(\n", + " LLMJudge(\n", + " rubric=\"Output and Expected Output should match\",\n", + " include_input=True,\n", + " model=\"openai:gpt-4o-mini\",\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "report = dataset.evaluate_sync(task)\n", + "print(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upload Labels to Phoenix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = report.model_dump()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "meo_spans = spans.copy()\n", + "fuzzy_label_spans = spans.copy()\n", + "llm_label_spans = spans.copy()\n", + "\n", + "for case in results.get(\"cases\"):\n", + " meo_label = case.get(\"assertions\").get(\"MatchesExpectedOutput\").get(\"value\")\n", + " fuzzy_label = case.get(\"assertions\").get(\"FuzzyMatchesOutput\").get(\"value\")\n", + " llm_label = case.get(\"assertions\").get(\"LLMJudge\").get(\"value\")\n", + "\n", + " input = case.get(\"inputs\")\n", + "\n", + " # Update the label in meo_spans where the input matches\n", + " meo_spans.loc[meo_spans[\"input\"] == input, \"label\"] = str(meo_label)\n", + " fuzzy_label_spans.loc[meo_spans[\"input\"] == input, \"label\"] = str(fuzzy_label)\n", + " llm_label_spans.loc[llm_label_spans[\"input\"] == input, \"label\"] = str(llm_label)\n", + "\n", + "meo_spans[\"score\"] = meo_spans[\"label\"].apply(lambda x: 1 if x else 0)\n", + "fuzzy_label_spans[\"score\"] = fuzzy_label_spans[\"label\"].apply(lambda x: 1 if x else 0)\n", + "llm_label_spans[\"score\"] = llm_label_spans[\"label\"].apply(lambda x: 1 if x else 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "meo_spans.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from phoenix.trace import SpanEvaluations\n", + "\n", + "px.Client().log_evaluations(\n", + " SpanEvaluations(\n", + " dataframe=meo_spans,\n", + " eval_name=\"Direct Match Eval\",\n", + " ),\n", + " SpanEvaluations(\n", + " dataframe=fuzzy_label_spans,\n", + " eval_name=\"Fuzzy Match Eval\",\n", + " ),\n", + " SpanEvaluations(\n", + " dataframe=llm_label_spans,\n", + " eval_name=\"LLM Match Eval\",\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![results_in_phoenix](https://storage.googleapis.com/arize-phoenix-assets/assets/images/pydantic-evals-results.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### For more on LLM Evaluation, check out our [Arize Master Guide to LLM Evaluation](https://arize.com/llm-evaluation)!" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "phoenix", "language": "python", "name": "python3" }, @@ -401,7 +441,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.12" + "version": "3.11.10" } }, "nbformat": 4, From be0a75a0a8f3b432742fe6c4769345309b3b491b Mon Sep 17 00:00:00 2001 From: JohnGilhuly Date: Fri, 2 May 2025 09:21:55 -0400 Subject: [PATCH 3/4] Add additional walkthrough info --- tutorials/evals/pydantic-evals.ipynb | 88 ++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 26 deletions(-) diff --git a/tutorials/evals/pydantic-evals.ipynb b/tutorials/evals/pydantic-evals.ipynb index b83990c172..0173472139 100644 --- a/tutorials/evals/pydantic-evals.ipynb +++ b/tutorials/evals/pydantic-evals.ipynb @@ -18,8 +18,11 @@ "\n", "#
Evaluation using Pydantic Evals
\n", "\n", - "1. Use Pydantic Evals to evaluate your LLM app for a simple question-answering task.\n", - "2. Log your results to Arize Phoenix to track your experiments and traces." + "Pydantic offers an evaluation library that can be used to run preset direct evaluations, such as whether an output matches a Pydantic model, as well as LLM Judge evaluations. These evals can be run directly over dataframes of cases defined with Pydantic. However, you may want to run evaluations over real traces as opposed to presaved cases.\n", + "\n", + "This notebook shows you how you can use Pydantic Evals alongside Arize Phoenix to run evals on traces captured from your running application.\n", + "\n", + "*Note: Phoenix does include its own evals package, however it is designed to work with other eval packages like Pydantic Evals as well.*" ] }, { @@ -102,7 +105,7 @@ "\n", "tracer_provider = register(\n", " project_name=\"pydantic-evals-tutorial\",\n", - " auto_instrument=True,\n", + " auto_instrument=True, # because you've imported the openinference-instrumentation-openai package above, this will automatically instrument any OpenAI method calls\n", ")" ] }, @@ -147,13 +150,20 @@ " generate_trace(input)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should now see three traces captured in your Phoenix instance. If you don't see them right away, make sure you've selected the `pydantic-evals-tutorial` project." + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Export Traces from Phoenix\n", "\n", - "Export your traces from Phoenix to then evaluate with Pydantic Evals." + "Next, you export those traces from Phoenix so that you can evaluate them using Pydantic Evals." ] }, { @@ -213,7 +223,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup LLM task to evaluate\n" + "## Setup LLM task, Evaluator, and Dataset for Pydantic\n", + "\n", + "Pydantic Evals requires a task to run each case through. Since you've already run this task for a given input (represented by the traces you captured above), this case will simply be retrieving the corresponding output from your dataframe of exported traces." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()\n", + "\n", + "\n", + "async def task(input: str) -> str:\n", + " output = spans[spans[\"input\"] == input][\"output\"].values[0]\n", + " return output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then create a basic evaluator that checks whether the output matches the expected value exactly." ] }, { @@ -245,27 +280,13 @@ ")" ] }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import nest_asyncio\n", - "\n", - "nest_asyncio.apply()\n", - "\n", - "\n", - "async def task(input: str) -> str:\n", - " output = spans[spans[\"input\"] == input][\"output\"].values[0]\n", - " return output" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Run your experiment and evaluation" + "## Run your experiment and evaluation\n", + "\n", + "Now with everything connected up, you can run your evaluation using Pydantic:" ] }, { @@ -282,7 +303,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Redefine Eval to be LLM-powered or Semantic" + "## Redefine Eval to be LLM-powered or Semantic\n", + "\n", + "That evaluation works fine, however the exact match is a bit too strict to work in a real world setting. Try adding two other kinds of evaluators, a fuzzy match eval and an LLM judge eval." ] }, { @@ -317,7 +340,7 @@ "\n", "dataset.add_evaluator(\n", " LLMJudge(\n", - " rubric=\"Output and Expected Output should match\",\n", + " rubric=\"Output and Expected Output should represent the same answer, even if the text doesn't match exactly\",\n", " include_input=True,\n", " model=\"openai:gpt-4o-mini\",\n", " ),\n", @@ -338,7 +361,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Upload Labels to Phoenix" + "You should now see that the LLM Judge at least catches that \"Shakespeare\" and \"William Shakespeare\" represent the same answer." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upload Labels to Phoenix\n", + "\n", + "As a final step, you can now upload your eval results to Phoenix to capture them in the UI." ] }, { @@ -356,22 +388,25 @@ "metadata": {}, "outputs": [], "source": [ + "# Create a dataframe for each eval\n", "meo_spans = spans.copy()\n", "fuzzy_label_spans = spans.copy()\n", "llm_label_spans = spans.copy()\n", "\n", "for case in results.get(\"cases\"):\n", + " # Phoenix expects a \"label\" column, so start by extracting the eval result from each row\n", " meo_label = case.get(\"assertions\").get(\"MatchesExpectedOutput\").get(\"value\")\n", " fuzzy_label = case.get(\"assertions\").get(\"FuzzyMatchesOutput\").get(\"value\")\n", " llm_label = case.get(\"assertions\").get(\"LLMJudge\").get(\"value\")\n", "\n", " input = case.get(\"inputs\")\n", "\n", - " # Update the label in meo_spans where the input matches\n", + " # Update the label in each dataframe where the input value matches\n", " meo_spans.loc[meo_spans[\"input\"] == input, \"label\"] = str(meo_label)\n", " fuzzy_label_spans.loc[meo_spans[\"input\"] == input, \"label\"] = str(fuzzy_label)\n", " llm_label_spans.loc[llm_label_spans[\"input\"] == input, \"label\"] = str(llm_label)\n", "\n", + "# Phoenix can also take in a numeric score for each row which it uses to calculate overall metrics\n", "meo_spans[\"score\"] = meo_spans[\"label\"].apply(lambda x: 1 if x else 0)\n", "fuzzy_label_spans[\"score\"] = fuzzy_label_spans[\"label\"].apply(lambda x: 1 if x else 0)\n", "llm_label_spans[\"score\"] = llm_label_spans[\"label\"].apply(lambda x: 1 if x else 0)" @@ -394,6 +429,7 @@ "source": [ "from phoenix.trace import SpanEvaluations\n", "\n", + "# Upload your data to Phoenix:\n", "px.Client().log_evaluations(\n", " SpanEvaluations(\n", " dataframe=meo_spans,\n", From 4bba360938fb14fc8db305a57699f35a72187cb5 Mon Sep 17 00:00:00 2001 From: JohnGilhuly Date: Fri, 2 May 2025 09:36:52 -0400 Subject: [PATCH 4/4] Add diagram --- tutorials/evals/pydantic-evals.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tutorials/evals/pydantic-evals.ipynb b/tutorials/evals/pydantic-evals.ipynb index 0173472139..bff2c4fd6f 100644 --- a/tutorials/evals/pydantic-evals.ipynb +++ b/tutorials/evals/pydantic-evals.ipynb @@ -22,6 +22,8 @@ "\n", "This notebook shows you how you can use Pydantic Evals alongside Arize Phoenix to run evals on traces captured from your running application.\n", "\n", + "\n", + "\n", "*Note: Phoenix does include its own evals package, however it is designed to work with other eval packages like Pydantic Evals as well.*" ] },