From 290a0ebfdd16825ceeb0f756b62dc7151b0b2ecd Mon Sep 17 00:00:00 2001
From: Manuel Saelices
Date: Wed, 24 May 2023 12:37:38 +0200
Subject: [PATCH 1/4] feat: New is_correct() evaluation function which asks an
 LLM to return whether a response is correct

---
 examples/readme_examples.py |  6 ++++
 promptimize/evals.py        | 56 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/examples/readme_examples.py b/examples/readme_examples.py
index cce28db..8ab55e9 100644
--- a/examples/readme_examples.py
+++ b/examples/readme_examples.py
@@ -17,6 +17,12 @@
     # Prompting "hello there" and making sure there's "hi" or "hello"
     # somewhere in the answer
     PromptCase("hello there!", lambda x: evals.any_word(x.response, ["hi", "hello"])),
+    # Prompting "2+2" and making sure the answer is mathematically correct
+    PromptCase(
+        "What is 2+2?",
+        # the result can be given in digits or in words, as GPT understands both
+        lambda x: evals.is_correct(x.response, question=x.prompt, predicted='four'),
+    ),
     # Making sure 3 specific guitar players are in the top 10
     # the score here is a percentage of the words found
     PromptCase(
diff --git a/promptimize/evals.py b/promptimize/evals.py
index b93f4f0..b5ec74d 100644
--- a/promptimize/evals.py
+++ b/promptimize/evals.py
@@ -8,7 +8,11 @@
 success, and a range in-between
 """
 
-from typing import List
+import os
+from typing import List, Optional
+
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import PromptTemplate
 
 
 def percentage_of_words(response: str, words: List[str], case_sensitive: bool = False) -> float:
@@ -120,3 +124,53 @@ def all(iteratable):
 
 def any(iteratable):
     return 1 if base_any([i == 1 for i in iteratable]) else 0
+
+
+def is_correct(response: str, question: str, predicted: str, model_name: Optional[str] = None) -> int:
+    """
+    Query an LLM to judge whether the given response matches the predicted answer.
+
+    Args:
+        question (str): The question to be answered.
+        response (str): The answer given by the LLM.
+        predicted (str): The predicted answer.
+
+    Returns:
+        int: 1 if the LLM grades the response as CORRECT against the predicted answer; otherwise, 0.
+
+    Examples:
+        >>> is_correct("5", "What is 3+4?", "7")
+        0
+        >>> is_correct("5", "What is 10/2?", "5.0")
+        1
+        >>> is_correct("a dog", "Which animal says meow?", "a cat")
+        0
+    """
+    model_name = model_name or 'gpt-4'  # GPT-4 works great for evaluating correctness
+    llm = ChatOpenAI(model_name=model_name, openai_api_key=os.environ.get("OPENAI_API_KEY"))
+    prompt = PromptTemplate(
+        input_variables=["response", "predicted", "question"],
+        template=IS_CORRECT_TEMPLATE,
+    ).format(response=response, predicted=predicted, question=question)
+
+    grade = llm.predict(prompt)
+
+    return 0 if "INCORRECT" in grade else 1  # can't test for "CORRECT": it is a substring of "INCORRECT"
+
+
+IS_CORRECT_TEMPLATE = """
+You are a teacher grading an answer.
+You are given a predicted answer and the actual answer. You are asked to score the answer as either CORRECT or INCORRECT, based on the context.
+
+Example Format:
+QUESTION: question here
+PREDICTED ANSWER: predicted answer here
+ANSWER: actual answer here
+GRADE: CORRECT or INCORRECT here
+
+Grade the answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the answer and the true answer. It is OK if the answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin!
+
+QUESTION: {question}
+PREDICTED ANSWER: {predicted}
+ANSWER: {response}
+GRADE: """
\ No newline at end of file
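With this first patch applied, the new eval can also be exercised outside of a
PromptCase. A minimal sketch, assuming the same "from promptimize import evals"
import style the readme examples rely on; the question and predicted values
here are illustrative, OPENAI_API_KEY must be set, and each call makes a live
OpenAI request:

    from promptimize import evals

    score = evals.is_correct(
        "The answer is 4",       # response from the model under test
        question="What is 2+2?",
        predicted="four",        # digits or words both work
    )
    print(score)  # 1 if the grader replies CORRECT, otherwise 0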
From ff6ac673e0bf946d91616792b44b9de698ac67bf Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 24 May 2023 10:43:01 +0000
Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 examples/readme_examples.py | 2 +-
 promptimize/evals.py        | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/examples/readme_examples.py b/examples/readme_examples.py
index 8ab55e9..081f54b 100644
--- a/examples/readme_examples.py
+++ b/examples/readme_examples.py
@@ -21,7 +21,7 @@
     PromptCase(
         "What is 2+2?",
         # the result can be given in digits or in words, as GPT understands both
-        lambda x: evals.is_correct(x.response, question=x.prompt, predicted='four'),
+        lambda x: evals.is_correct(x.response, question=x.prompt, predicted="four"),
     ),
     # Making sure 3 specific guitar players are in the top 10
     # the score here is a percentage of the words found
diff --git a/promptimize/evals.py b/promptimize/evals.py
index b5ec74d..b3e0ac1 100644
--- a/promptimize/evals.py
+++ b/promptimize/evals.py
@@ -126,7 +126,9 @@ def any(iteratable):
     return 1 if base_any([i == 1 for i in iteratable]) else 0
 
 
-def is_correct(response: str, question: str, predicted: str, model_name: Optional[str] = None) -> int:
+def is_correct(
+    response: str, question: str, predicted: str, model_name: Optional[str] = None
+) -> int:
     """
     Query an LLM to judge whether the given response matches the predicted answer.
 
@@ -146,7 +148,7 @@ def is_correct(response: str, question: str, predicted: str, model_name: Optiona
         >>> is_correct("a dog", "Which animal says meow?", "a cat")
         0
     """
-    model_name = model_name or 'gpt-4'  # GPT-4 works great for evaluating correctness
+    model_name = model_name or "gpt-4"  # GPT-4 works great for evaluating correctness
     llm = ChatOpenAI(model_name=model_name, openai_api_key=os.environ.get("OPENAI_API_KEY"))
     prompt = PromptTemplate(
         input_variables=["response", "predicted", "question"],
@@ -173,4 +175,4 @@
 QUESTION: {question}
 PREDICTED ANSWER: {predicted}
 ANSWER: {response}
-GRADE: """
\ No newline at end of file
+GRADE: """

From 2af42a558617c48788e25ae52663d93cbca9d221 Mon Sep 17 00:00:00 2001
From: Manuel Saelices
Date: Thu, 25 May 2023 19:07:40 +0200
Subject: [PATCH 3/4] feat: Improve the prompt to prioritize the expected
 answer over actual factual accuracy, and rename predicted to expected

---
 promptimize/evals.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/promptimize/evals.py b/promptimize/evals.py
index b3e0ac1..63e2430 100644
--- a/promptimize/evals.py
+++ b/promptimize/evals.py
@@ -127,18 +127,18 @@ def any(iteratable):
 
 
 def is_correct(
-    response: str, question: str, predicted: str, model_name: Optional[str] = None
+    response: str, question: str, expected: str, model_name: Optional[str] = None
 ) -> int:
     """
-    Query an LLM to judge whether the given response matches the predicted answer.
+    Query an LLM to judge whether the given response matches the expected answer.
 
     Args:
         question (str): The question to be answered.
         response (str): The answer given by the LLM.
-        predicted (str): The predicted answer.
+        expected (str): The expected answer.
 
     Returns:
-        int: 1 if the LLM grades the response as CORRECT against the predicted answer; otherwise, 0.
+        int: 1 if the LLM grades the response as CORRECT against the expected answer; otherwise, 0.
 
     Examples:
         >>> is_correct("5", "What is 3+4?", "7")
         0
         >>> is_correct("5", "What is 10/2?", "5.0")
         1
         >>> is_correct("a dog", "Which animal says meow?", "a cat")
         0
     """
@@ -151,9 +151,9 @@ def is_correct(
     model_name = model_name or "gpt-4"  # GPT-4 works great for evaluating correctness
     llm = ChatOpenAI(model_name=model_name, openai_api_key=os.environ.get("OPENAI_API_KEY"))
     prompt = PromptTemplate(
-        input_variables=["response", "predicted", "question"],
+        input_variables=["response", "expected", "question"],
         template=IS_CORRECT_TEMPLATE,
-    ).format(response=response, predicted=predicted, question=question)
+    ).format(response=response, expected=expected, question=question)
 
     grade = llm.predict(prompt)
 
@@ -162,17 +162,17 @@
 
 IS_CORRECT_TEMPLATE = """
 You are a teacher grading an answer.
-You are given a predicted answer and the actual answer. You are asked to score the answer as either CORRECT or INCORRECT, based on the context.
+You are given an expected answer and the actual answer. You are asked to score the answer as either CORRECT or INCORRECT, based on the context.
 
 Example Format:
 QUESTION: question here
-PREDICTED ANSWER: predicted answer here
+EXPECTED ANSWER: expected answer here
 ANSWER: actual answer here
 GRADE: CORRECT or INCORRECT here
 
-Grade the answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the answer and the true answer. It is OK if the answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin!
+Grade the answers based ONLY on how well they match the expected answer, regardless of their actual factual accuracy. Ignore differences in punctuation and phrasing between the answer and the expected answer. It is OK if the answer contains more information than the expected answer, as long as it does not contain any conflicting statements. Begin!
 
 QUESTION: {question}
-PREDICTED ANSWER: {predicted}
+EXPECTED ANSWER: {expected}
 ANSWER: {response}
 GRADE: """
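The behavior change in patch 3 is easiest to see with an expected answer that
is deliberately wrong. A sketch with illustrative values (OPENAI_API_KEY must
be set, and the grade ultimately depends on the live model, so this shows a
tendency rather than a guarantee):

    from promptimize import evals

    # "4" is factually right, but it contradicts the expected answer;
    # under the reworked prompt the grader should now return 0
    score = evals.is_correct("4", question="What is 2+2?", expected="5")
    print(score)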
From b77e30c2d06ef0d52ed6ff6a2d67e514fc787a38 Mon Sep 17 00:00:00 2001
From: Manuel Saelices
Date: Thu, 25 May 2023 19:09:43 +0200
Subject: [PATCH 4/4] fix: Update the readme example after renaming predicted
 to expected

---
 examples/readme_examples.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/readme_examples.py b/examples/readme_examples.py
index 081f54b..4212f70 100644
--- a/examples/readme_examples.py
+++ b/examples/readme_examples.py
@@ -21,7 +21,7 @@
     PromptCase(
         "What is 2+2?",
         # the result can be given in digits or in words, as GPT understands both
-        lambda x: evals.is_correct(x.response, question=x.prompt, predicted="four"),
+        lambda x: evals.is_correct(x.response, question=x.prompt, expected="four"),
     ),
     # Making sure 3 specific guitar players are in the top 10
     # the score here is a percentage of the words found
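With the full series applied, the exact prompt sent to the grading model can be
inspected without any API call, since IS_CORRECT_TEMPLATE is a plain
module-level string. A sketch using the readme example's values:

    from promptimize.evals import IS_CORRECT_TEMPLATE

    rendered = IS_CORRECT_TEMPLATE.format(
        question="What is 2+2?",
        expected="four",
        response="4",
    )
    # the rendered prompt ends with "GRADE: ", which the model
    # is expected to complete with CORRECT or INCORRECT
    print(rendered)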