# Purpose of this module

This module shows how to set up LLM-based evaluation. The example here is a quiz app, and
we want to validate the quality of the quiz the LLM generates. We do this by
checking the format, factuality, and relevance of the quiz.

## Example Usage

### Inputs
These are the inputs you can provide:

 - *question*: The user question that specifies what the quiz should be about.

### Overrides
With Hamilton you can easily override a function and provide a value for it. For example, if you're
iterating, you might want to override these two values rather than modifying the functions (see the sketch below):

 - *quiz_bank*: a fixed string of context containing the facts for the quiz bank.
 - *llm_quiz_response*: what the LLM returns as the quiz for the user. You can fix this value to test the grader functions.
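
For instance, here is a minimal sketch that pins both overrides and runs only the factcheck grader; the override strings are made up for illustration:

```python
from hamilton import driver
from hamilton.contrib.dagworks import simple_eval_grader

dr = driver.Builder().with_modules(simple_eval_grader).build()

# Pin both the facts and the quiz so only the factcheck grader calls the LLM.
result = dr.execute(
    ["eval_factcheck_response"],
    overrides={
        "quiz_bank": "The largest telescope in space is the James Webb Space Telescope.",
        "llm_quiz_response": "Question 1:#### What is the largest telescope in space?",
    },
)
print(result["eval_factcheck_response"])
```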

### Outputs
You might want to use the following outputs:

 - *llm_quiz_response*: The result of the LLM call to produce the quiz.
 - *eval_format_response*: The result of the format grader.
 - *eval_factcheck_response*: The result of the factcheck grader.
 - *eval_relevance_check_response*: The result of the relevance grader.

### Execution
You can ask for the result of any intermediate function by providing its name in the `execute` call.
Here we just ask for the final results, but if you wanted to, you could ask for the outputs of any of the functions,
which you can then introspect or log for debugging/evaluation purposes. Note that if you want more platform
integrations, you can add adapters that will do this automatically for you, e.g. Hamilton's `PrintLn` adapter,
sketched below.
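
As a rough sketch of wiring in an adapter (assuming the `PrintLn` lifecycle adapter shipped with Hamilton; the exact import path and constructor arguments may vary by version):

```python
from hamilton import driver, lifecycle
from hamilton.contrib.dagworks import simple_eval_grader

# PrintLn logs each node as it executes; swap in other adapters for your platform.
dr = (
    driver.Builder()
    .with_modules(simple_eval_grader)
    .with_adapters(lifecycle.PrintLn())  # assumed default constructor; check your Hamilton version
    .build()
)
```
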
```python
import pprint

from hamilton import driver

# import the module that defines the quiz and grader functions
from hamilton.contrib.dagworks import simple_eval_grader

dr = (
    driver.Builder()
    .with_modules(simple_eval_grader)
    .build()
)
dr.display_all_functions("dag.png")
good_response = """
Question 1:#### What is the largest telescope in space called and what material is its mirror made of?

Question 2:#### True or False: Water slows down the speed of light.

Question 3:#### What did Marie and Pierre Curie discover in Paris?
"""
result = dr.execute(["eval_format_response"], overrides={"llm_quiz_response": good_response})
print(result)
assert result["eval_format_response"] == "Y"

bad_response = "There are lots of interesting facts. Tell me more about what you'd like to know"
result = dr.execute(["eval_format_response"], overrides={"llm_quiz_response": bad_response})

print(result)
assert result["eval_format_response"] == "N"

quiz_request = "Write me a quiz about books."
eval_response = dr.execute([
    "llm_quiz_response",
    "eval_format_response",
    "eval_factcheck_response",
], inputs={"question": quiz_request})
pprint.pprint(eval_response)
# The request asks about a subject (books) that isn't in the quiz bank; the generated quiz
# still passes the format and factcheck graders, so the relevance grader below is the one
# that should flag it.
assert eval_response["eval_format_response"] == "Y"
assert "Decision: Yes" in eval_response["eval_factcheck_response"]

result = dr.execute(
    ["eval_relevance_check_response"],
    inputs={"question": quiz_request},
    overrides={
        "llm_quiz_response": "Great! Here's a customized quiz about books:\n\n"
        "Question 1:####\n"
        "Subject: Leonardo DaVinci\nCategory: Art, Science\n"
        "Fact: Leonardo DaVinci is known for his artistic masterpiece, the Mona Lisa. "
        "Can you name any other field of study that DaVinci was interested in?\n\n"
        "Question 2:####\nSubject: Paris\nCategory: Art, Geography\nFact: Paris is home to"
        " the Louvre, one of the world's largest and most famous museums. Can you name the"
        " painting that is displayed in the Louvre and is considered one of the most iconic"
        " artworks of all time?\n\n"
        "Question 3:####\nSubject: Starry Night\nCategory: Art\nFact: Vincent van Gogh's "
        "painting, Starry Night, is a famous artwork that captures the east-facing view of "
        "his room in Saint-Rémy-de-Provence. Can you name any other famous painting by "
        "van Gogh?\n\n"
        "Feel free to answer the questions and let me know when you're ready for the answers!",
    },
)
pprint.pprint(result)
```

# How to extend this module
With this example you would likely change all the prompts for your specific use case,
since the prompts provided are specific to the quiz app.
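
One way to do that, as a rough sketch: copy the module into your project, edit the prompt strings there, and point the driver at your copy. The module name `my_eval_grader` below is hypothetical:

```python
# my_eval_grader.py is a hypothetical local copy of simple_eval_grader
# with the prompt strings edited for your own use case.
import my_eval_grader

from hamilton import driver

dr = driver.Builder().with_modules(my_eval_grader).build()
```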

# Configuration Options
There is no configuration needed for this module.

# Limitations

You need to have the `OPENAI_API_KEY` set in your environment.
It should be accessible from your code via `os.environ["OPENAI_API_KEY"]`.

The code does not check the context length, so it may fail if the context passed is too long
for the LLM you send it to.
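
A quick sketch of a pre-flight check you could add before building the driver; the 4-characters-per-token ratio and the 3,000-token budget are rough assumptions, not values taken from this module:

```python
import os

# Fail fast if the key is missing rather than erroring mid-run.
if "OPENAI_API_KEY" not in os.environ:
    raise RuntimeError("Set OPENAI_API_KEY before running the graders.")

# Very rough context-length guard: ~4 characters per token is a common heuristic.
quiz_bank_text = "...your quiz bank string..."
approx_tokens = len(quiz_bank_text) / 4
if approx_tokens > 3000:
    raise ValueError("quiz_bank is likely too long for the model's context window.")
```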

# Pytest Integration
Here's an example of how you might use this module with pytest.
```python
import pandas as pd
import pytest

from hamilton import driver
from hamilton.contrib.dagworks import simple_eval_grader


@pytest.fixture
def driver_fixture():
    dr = (
        driver.Builder()
        .with_modules(simple_eval_grader)
        .build()
    )
    return dr


def test_format_grader_works(driver_fixture):
    good_response = """
    Question 1:#### What is the largest telescope in space called and what material is its mirror made of?

    Question 2:#### True or False: Water slows down the speed of light.

    Question 3:#### What did Marie and Pierre Curie discover in Paris?
    """
    result = driver_fixture.execute(
        ["eval_format_response"],
        overrides={"llm_quiz_response": good_response}
    )
    assert result["eval_format_response"] == "Y"

    bad_response = "There are lots of interesting facts. Tell me more about what you'd like to know"
    result = driver_fixture.execute(
        ["eval_format_response"],
        overrides={"llm_quiz_response": bad_response}
    )
    assert result["eval_format_response"] == "N"


def test_factcheck_grader_works(driver_fixture):
    good_response = """
    Question 1:#### What is the largest telescope in space called and what material is its mirror made of?
    """
    result = driver_fixture.execute(
        ["eval_factcheck_response"],
        overrides={"llm_quiz_response": good_response,
                   "quiz_bank": "The largest telescope in space is called the Hubble Space Telescope"
                                " and its mirror is made of glass."}
    )
    assert "Decision: Yes" in result["eval_factcheck_response"]


@pytest.fixture
def quiz_bank() -> str:
    return (
        """1. Subject: Leonardo DaVinci
   Categories: Art, Science
   Facts:
    - Painted the Mona Lisa
    - Studied zoology, anatomy, geology, optics
    - Designed a flying machine

2. Subject: Paris
   Categories: Art, Science
   Facts:
    - Location of the Louvre, the museum where the Mona Lisa is displayed
    - Capital of France
    - Most populous city in France
    - Where Radium and Polonium were discovered by scientists Marie and Pierre Curie

3. Subject: Telescopes
   Category: Science
   Facts:
    - Device to observe different objects
    - The first refracting telescopes were invented in the Netherlands in the 17th Century
    - The James Webb space telescope is the largest telescope in space. It uses a gold-beryllium mirror

4. Subject: Starry Night
   Category: Art
   Facts:
    - Painted by Vincent van Gogh in 1889
    - Captures the east-facing view of van Gogh's room in Saint-Rémy-de-Provence

5. Subject: Physics
   Category: Science
   Facts:
    - The sun doesn't change color during sunset.
    - Water slows the speed of light
    - The Eiffel Tower in Paris is taller in the summer than the winter due to expansion of the metal.
""")


test_dataset = [
    {"input": "I'm trying to learn about science, can you give me a quiz to test my knowledge",
     "expectation": "PASS"},
    {"input": "I'm a geography expert, give me a quiz to prove it?", "expectation": "FAIL"},
    {"input": "Quiz me about Italy", "expectation": "FAIL"},
    {"input": "Write me a quiz about books", "expectation": "FAIL"},
]


def test_quiz_creation_with_llm_grader(driver_fixture):
    eval_results = []
    for test_case in test_dataset:
        eval_result = {}
        results = driver_fixture.execute([
            "llm_quiz_response",
            "eval_format_response",
            "eval_factcheck_response",
            "eval_relevance_check_response",
        ], inputs={"question": test_case["input"]})
        eval_result["input"] = test_case["input"]
        eval_result["output"] = results["llm_quiz_response"]
        eval_result["format"] = results["eval_format_response"]
        eval_result["factuality"] = results["eval_factcheck_response"]
        eval_result["relevance"] = results["eval_relevance_check_response"]
        eval_result["expectation"] = test_case["expectation"]
        # A case PASSes only if all three graders approve the generated quiz.
        if all([results["eval_format_response"] == "Y",
                "Decision: Yes" in results["eval_factcheck_response"],
                "Decision: Yes" in results["eval_relevance_check_response"]]):
            eval_result["actual"] = "PASS"
        else:
            eval_result["actual"] = "FAIL"
        eval_results.append(eval_result)
    df = pd.DataFrame(eval_results)
    df_html = df.to_html().replace("\\n", "<br>")
    print(df_html)
    # don't assert anything, just run things and save the results to a dataframe that you
    # would probably save/push somewhere.
```
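
To run these, save them to a test file (e.g. a hypothetical `test_simple_eval_grader.py`) and invoke `pytest -s test_simple_eval_grader.py`; the `-s` flag disables output capture so the printed HTML table reaches your console. Note that the last test calls the LLM for every entry in `test_dataset`, so it will consume API credits and take a little while.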