|
1 |
| -"""Regression models using LLMs.""" |
| 1 | +"""Regression modelling using LLMs.""" |
| 2 | +from __future__ import annotations |
| 3 | + |
2 | 4 | import re
|
| 5 | +from logging import getLogger |
| 6 | +from typing import Literal |
3 | 7 |
|
4 | 8 | import numpy as np
|
5 | 9 | from dotenv import load_dotenv
|
6 |
| -from numpy.random import default_rng |
7 |
| -from openai import BadRequestError, OpenAI |
| 10 | +from numpy import ndarray |
| 11 | +from openai import OpenAI |
8 | 12 | from pandas import DataFrame
|
9 |
| -from sklearn.linear_model import LinearRegression |
10 |
| -from sklearn.metrics import mean_absolute_error, r2_score |
11 |
| -from sklearn.model_selection import train_test_split |
12 | 13 | from tqdm import tqdm
|
13 | 14 |
|
| 15 | +OpenAiModel = Literal["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"] |
14 | 16 |
|
15 |
| -def predict( |
16 |
| - test_data: DataFrame, train_data: DataFrame, *, verbose: bool = False |
17 |
| -) -> DataFrame: |
18 |
| - """Score a dataset using an LLM. |
19 |
| -
|
20 |
| - Args: |
21 |
| - ---- |
22 |
| - test_data: Dataframe of features/variables to use for prediction. |
23 |
| - train_data: Dataframe of labelled features/variables to use for training. |
24 |
| - verbose: Print prompt for first test data instances? |
25 |
| -
|
26 |
| - Returns: |
27 |
| - ------- |
28 |
| - A dataframe with predicted values for the test data. |
29 |
| - """ |
30 |
| - load_dotenv() # load OPEN_API_KEY from .env file (if present) |
31 |
| - client = OpenAI() |
32 |
| - |
33 |
| - system_prompt = ( |
34 |
| - "Your task is to provide your best estimate for ”Output”. Please provide that " |
35 |
| - "and only that, without any additional text." |
36 |
| - ) |
37 |
| - |
38 |
| - prompt_train_data = [ |
39 |
| - f"Feature 0: {row.x}\nOutput: {row.y}" for row in train_data.itertuples() |
40 |
| - ] |
41 |
| - |
42 |
| - y_pred: list[float] = [] |
43 |
| - for row in tqdm( |
44 |
| - test_data.itertuples(), |
45 |
| - total=test_data.shape[0], |
46 |
| - ): |
47 |
| - prompt_test_data = [f"Feature 0: {row.x}\nOutput:"] |
48 |
| - |
49 |
| - user_prompt = "\n\n".join(prompt_train_data + prompt_test_data) |
50 |
| - if verbose: |
51 |
| - print(user_prompt) |
52 |
| - |
53 |
| - completion = client.chat.completions.create( |
54 |
| - model="gpt-4o", |
55 |
| - messages=[ |
56 |
| - {"role": "system", "content": system_prompt}, |
57 |
| - {"role": "user", "content": user_prompt}, |
58 |
| - ], |
59 |
| - temperature=0, |
60 |
| - response_format={"type": "text"}, |
61 |
| - seed=42, |
62 |
| - ) |
63 |
| - try: |
64 |
| - prediction = completion.choices[0].message.content |
65 |
| - except BadRequestError as e: |
66 |
| - raise ModelError("API call to LLM failed") from e |
67 |
| - if prediction: |
68 |
| - y_pred += [_parse_model_output(prediction)] |
69 |
| - else: |
70 |
| - raise ModelError("prediciton failed") |
| 17 | +log = getLogger("OpenAIRegressionLogger") |
| 18 | + |
| 19 | + |
class OpenAiRegressor:
    """Generic regression using Open AI LLMs.

    `fit` renders the labelled training rows into a few-shot prompt; `predict`
    then sends one chat-completion request per test row and parses the first
    number found in each reply.
    """

    # Matches an optionally signed decimal number, including leading-dot forms
    # (".5") and scientific notation ("1.5e-05"), which Python itself emits when
    # formatting very small or very large floats into the training prompt.
    _NUMBER_PATTERN = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")

    def __init__(self, model: OpenAiModel = "gpt-3.5-turbo", seed: int = 42):
        """Initialise object.

        Args:
        ----
            model: Open AI model to use. Defaults to "gpt-3.5-turbo".
            seed: Random seed to use with OpenAI model. Defaults to 42.
        """
        load_dotenv()  # load OPENAI_API_KEY from .env file (if present)
        self._client = OpenAI()
        self._model = model
        self._model_seed = seed
        self._prompt_instruction = (
            "Your task is to provide your best estimate for ”Output”. Please provide "
            "that and only that, without any additional text."
        )
        # Empty until `fit` is called; `predict` uses this to detect an unfitted model.
        self._prompt_train_data: str = ""

    def __repr__(self) -> str:
        """Create string representation."""
        return f"OpenAiRegressor(model={self._model})"

    def fit(self, X: DataFrame | ndarray, y: DataFrame | ndarray) -> OpenAiRegressor:
        """Create a prompt based on training data to use when predicting with an LLM.

        Args:
        ----
            X: Feature data, one row per observation.
            y: Labels, one row per observation. Only the first column is used.

        Raises:
        ------
            ValueError: If the dimensions of X or y are invalid and/or inconsistent with
                one another.

        Returns:
        -------
            The OpenAiRegressor object.
        """
        if X.ndim < 2:
            raise ValueError("X.ndim must be >= 2")
        # Each label row is reduced to its first element, so y must be exactly 2-D.
        if y.ndim != 2:
            raise ValueError("y.ndim must be == 2")
        if len(X) != len(y):
            raise ValueError("len(y) != len(X)")

        _X = X.tolist() if isinstance(X, ndarray) else X.values.tolist()
        _y = y.tolist() if isinstance(y, ndarray) else y.values.tolist()

        self._prompt_train_data = "\n\n".join(
            self._format_data_row(x_row, y_row) for x_row, y_row in zip(_X, _y)
        )

        return self

    def predict(self, X: DataFrame | ndarray, logging: bool = True) -> ndarray:
        """Predict labels using model and feature data.

        Any prediction failures will return `numpy.nan` - prediction won't be halted,
        given the expense of querying LLMs.

        Args:
        ----
            X: Feature data to use for predictions.
            logging: Enable logging. Default to True.

        Raises:
        ------
            RuntimeError: If `.fit` has not been called.

        Returns:
        -------
            Model predictions as a column vector with one row per row of X.
        """
        if not self._prompt_train_data:
            raise RuntimeError("please fit model before trying to generate predictions")

        _X = X if isinstance(X, ndarray) else X.values
        y_pred: list[float] = []

        for n, row in tqdm(enumerate(_X), total=len(_X)):
            # Deliberately broad: one failed request or unparsable reply must not
            # discard the (paid-for) predictions already collected.
            try:
                prediction_prompt = self._compose_prediction_prompt(
                    self._prompt_instruction,
                    self._prompt_train_data,
                    self._format_data_row(row),
                )
                llm_response = self._client.chat.completions.create(
                    model=self._model,
                    messages=[{"role": "user", "content": prediction_prompt}],
                    temperature=0,
                    response_format={"type": "text"},
                    seed=self._model_seed,
                )
                llm_generation = llm_response.choices[0].message.content
                if llm_generation:
                    y_pred.append(self._parse_model_output(llm_generation))
                else:
                    if logging:
                        log.warning(
                            "LLM returned an empty response for test data row #%d", n
                        )
                    y_pred.append(np.nan)
            except Exception as e:
                if logging:
                    log.warning("LLM error for test data row #%d - %s", n, e)
                y_pred.append(np.nan)

        return np.array(y_pred).reshape(-1, 1)

    @staticmethod
    def _compose_prediction_prompt(
        instruction: str, train_data: str, test_data: str
    ) -> str:
        """Compose full prompt from constituent parts."""
        return instruction + "\n" + train_data + "\n\n" + test_data

    @staticmethod
    def _format_data_row(
        x_row: ndarray | list, y_row: ndarray | list | None = None
    ) -> str:
        """Format a data row for inclusion in model prompt.

        Args:
        ----
            x_row: Feature values for a single observation.
            y_row: Label values for the observation; only the first is used. When
                omitted (or empty) the "Output:" field is left blank for the model
                to complete.

        Returns:
        -------
            One "Feature n: value" line per feature followed by an "Output:" line.
        """
        # Explicit None/length checks: truth-testing a numpy array is ambiguous for
        # more than one element and wrong for a single zero-valued label.
        has_label = y_row is not None and len(y_row) > 0
        output = y_row[0] if has_label else ""
        lines = [f"Feature {n}: {x}" for n, x in enumerate(x_row)]
        lines.append(f"Output: {output}")
        return "\n".join(lines)

    @staticmethod
    def _parse_model_output(output: str) -> float:
        """Parse the model's output.

        Args:
        ----
            output: Raw text generated by the model.

        Raises:
        ------
            ValueError: If the text contains no number.

        Returns:
        -------
            The first number found in the text.
        """
        match = OpenAiRegressor._NUMBER_PATTERN.search(output)
        if match is None:
            raise ValueError(f"no number found in model output: {output!r}")
        return float(match.group())
|
80 |
| - except (ValueError, IndexError) as e: |
81 |
| - raise ModelError("invalid model prediction") from e |
82 |
| - |
83 |
| - |
84 |
| -def make_univariate_linear_test_data( |
85 |
| - n_samples: int = 1000, *, rho: float = 0.75, seed: int = 42 |
86 |
| -) -> DataFrame: |
87 |
| - """Simulate a y = rho * x + sqrt(1 - rho ** 2) * epsilon. |
88 |
| -
|
89 |
| - Args: |
90 |
| - ---- |
91 |
| - n_samples: Number of samples to generate. Defaults to 1000. |
92 |
| - rho: Rho coeffcient (correlation coefficient). Defaults to 0.75. |
93 |
| - seed: Random seed. Defaults to 42. |
94 |
| -
|
95 |
| - Returns: |
96 |
| - ------- |
97 |
| - Dataframe of test data. |
98 |
| - """ |
99 |
| - if not (rho >= 0 and rho <= 1): |
100 |
| - raise ValueError(f"rho = {rho} - must in [0, 1]") |
101 |
| - rng = default_rng(seed) |
102 |
| - x = rng.standard_normal(n_samples) |
103 |
| - epsilon = rng.standard_normal(n_samples) |
104 |
| - y = rho * x + np.sqrt(1 - rho * rho) * epsilon |
105 |
| - return DataFrame({"x": x, "y": y}) |
106 |
| - |
107 |
| - |
108 |
| -class ModelError(Exception): |
109 |
| - """Custom exception class for model errors.""" |
110 |
| - |
111 |
| - pass |
112 |
| - |
113 |
| - |
114 |
| -if __name__ == "__main__": |
115 |
| - # make datasets |
116 |
| - n_samples = 1000 |
117 |
| - dataset = make_univariate_linear_test_data(n_samples, rho=0.9) |
118 |
| - train_data, test_data = train_test_split(dataset, test_size=0.05, random_state=42) |
119 |
| - |
120 |
| - # ols regression |
121 |
| - ols_regressor = LinearRegression() |
122 |
| - ols_regressor.fit(train_data[["x"]], train_data[["y"]]) |
123 |
| - y_pred_ols = ols_regressor.predict(test_data[["x"]]) |
124 |
| - |
125 |
| - ols_results = test_data.copy().reset_index(drop=True).assign(y_pred=y_pred_ols) |
126 |
| - mean_abs_err_ols = mean_absolute_error(ols_results["y"], ols_results["y_pred"]) |
127 |
| - r_squared_ols = r2_score(ols_results["y"], ols_results["y_pred"]) |
128 |
| - print(f"mean_abs_error = {mean_abs_err_ols}") |
129 |
| - print(f"r_squared = {r_squared_ols}") |
130 |
| - |
131 |
| - # llm regression |
132 |
| - y_pred = predict(test_data, train_data) |
133 |
| - |
134 |
| - llm_results = ( |
135 |
| - test_data.copy().reset_index(drop=True).assign(y_pred=y_pred["y_pred"]) |
136 |
| - ) |
137 |
| - mean_abs_err_llm = mean_absolute_error(llm_results["y"], llm_results["y_pred"]) |
138 |
| - r_squared_llm = r2_score(llm_results["y"], llm_results["y_pred"]) |
139 |
| - print(f"mean_abs_error = {mean_abs_err_llm}") |
140 |
| - print(f"r_squared = {r_squared_llm}") |
141 |
| - |
142 |
| -# mean_abs_error = 0.4107320869725583 |
143 |
| -# r_squared = 0.7865828324377897 |
144 |
| -# mean_abs_error = 0.38392287248603985 |
145 |
| -# r_squared = 0.8083485333779725 |
|
0 commit comments