Skip to content

Commit 37ca575

Browse files
AlexIoannidesalexioannides
and
alexioannides
authored
Refactor experimental code into OpenAiRegressor class (#2)
Co-authored-by: alexioannides <[email protected]>
1 parent 3fe6a90 commit 37ca575

File tree

6 files changed

+324
-138
lines changed

6 files changed

+324
-138
lines changed

LICENSE

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1818
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1919
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2020
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21-
SOFTWARE.
21+
SOFTWARE.

src/llm_regression/__init__.py

+7
Original file line numberDiff line numberDiff line change
@@ -1 +1,8 @@
11
"""The llm_regression package."""
2+
from .models import OpenAiRegressor
3+
from .utils import make_univariate_linear_test_data
4+
5+
__all__ = [
6+
"OpenAiRegressor",
7+
"make_univariate_linear_test_data",
8+
]

src/llm_regression/models.py

+136-132
Original file line numberDiff line numberDiff line change
@@ -1,145 +1,149 @@
1-
"""Regression models using LLMs."""
1+
"""Regression modelling using LLMs."""
2+
from __future__ import annotations
3+
24
import re
5+
from logging import getLogger
6+
from typing import Literal
37

48
import numpy as np
59
from dotenv import load_dotenv
6-
from numpy.random import default_rng
7-
from openai import BadRequestError, OpenAI
10+
from numpy import ndarray
11+
from openai import OpenAI
812
from pandas import DataFrame
9-
from sklearn.linear_model import LinearRegression
10-
from sklearn.metrics import mean_absolute_error, r2_score
11-
from sklearn.model_selection import train_test_split
1213
from tqdm import tqdm
1314

15+
OpenAiModel = Literal["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
1416

15-
def predict(
16-
test_data: DataFrame, train_data: DataFrame, *, verbose: bool = False
17-
) -> DataFrame:
18-
"""Score a dataset using an LLM.
19-
20-
Args:
21-
----
22-
test_data: Dataframe of features/variables to use for prediction.
23-
train_data: Dataframe of labelled features/variables to use for training.
24-
verbose: Print prompt for first test data instances?
25-
26-
Returns:
27-
-------
28-
A dataframe with predicted values for the test data.
29-
"""
30-
load_dotenv() # load OPEN_API_KEY from .env file (if present)
31-
client = OpenAI()
32-
33-
system_prompt = (
34-
"Your task is to provide your best estimate for ”Output”. Please provide that "
35-
"and only that, without any additional text."
36-
)
37-
38-
prompt_train_data = [
39-
f"Feature 0: {row.x}\nOutput: {row.y}" for row in train_data.itertuples()
40-
]
41-
42-
y_pred: list[float] = []
43-
for row in tqdm(
44-
test_data.itertuples(),
45-
total=test_data.shape[0],
46-
):
47-
prompt_test_data = [f"Feature 0: {row.x}\nOutput:"]
48-
49-
user_prompt = "\n\n".join(prompt_train_data + prompt_test_data)
50-
if verbose:
51-
print(user_prompt)
52-
53-
completion = client.chat.completions.create(
54-
model="gpt-4o",
55-
messages=[
56-
{"role": "system", "content": system_prompt},
57-
{"role": "user", "content": user_prompt},
58-
],
59-
temperature=0,
60-
response_format={"type": "text"},
61-
seed=42,
62-
)
63-
try:
64-
prediction = completion.choices[0].message.content
65-
except BadRequestError as e:
66-
raise ModelError("API call to LLM failed") from e
67-
if prediction:
68-
y_pred += [_parse_model_output(prediction)]
69-
else:
70-
raise ModelError("prediciton failed")
17+
log = getLogger("OpenAIRegressionLogger")
18+
19+
20+
class OpenAiRegressor:
    """Generic regression using Open AI LLMs.

    No weights are trained. `fit` renders the labelled rows into few-shot prompt
    text and `predict` sends that text, followed by one unlabelled row, to the chat
    completions endpoint once per row, parsing the first number out of each reply.
    """

    # Signed decimal with an optional exponent, e.g. "-3", "2.5", ".5" or "1.5e-3".
    # Compiled once because it is applied to every LLM response.
    _NUMBER_PATTERN = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")

    def __init__(self, model: OpenAiModel = "gpt-3.5-turbo", seed: int = 42):
        """Initialise object.

        Args:
        ----
            model: Open AI model to use. Defaults to "gpt-3.5-turbo".
            seed: Random seed to use with OpenAI model. Defaults to 42.
        """
        load_dotenv()  # pick up API credentials from a .env file (if present)
        self._client = OpenAI()
        self._model = model
        self._model_seed = seed
        self._prompt_instruction = (
            "Your task is to provide your best estimate for ”Output”. Please provide "
            "that and only that, without any additional text."
        )
        # Empty until `fit` is called; `predict` uses this to detect an unfitted model.
        self._prompt_train_data: str = ""

    def __repr__(self) -> str:
        """Create string representation."""
        return f"OpenAiRegressor(model={self._model})"

    def fit(self, X: DataFrame | ndarray, y: DataFrame | ndarray) -> OpenAiRegressor:
        """Create a prompt based on training data to use when predicting with an LLM.

        Args:
        ----
            X: Feature data, one row per training example.
            y: Labels, one row per training example (a single column is used).

        Raises:
        ------
            ValueError: If the dimensions of X or y are invalid and/or inconsistent with
                one another, or if there are no training rows.

        Returns:
        -------
            The OpenAiRegressor object.
        """
        if X.ndim < 2:
            raise ValueError("X.ndim must be >= 2")
        if y.ndim != 2:
            raise ValueError("y.ndim must be == 2")
        if len(X) != len(y):
            raise ValueError("len(y) != len(X)")
        if len(X) == 0:
            # An empty prompt is indistinguishable from "not fitted" in `predict`,
            # so fail here with an accurate message instead.
            raise ValueError("X and y must contain at least one row")

        feature_rows = X.tolist() if isinstance(X, ndarray) else X.values.tolist()
        label_rows = y.tolist() if isinstance(y, ndarray) else y.values.tolist()

        self._prompt_train_data = "\n\n".join(
            self._format_data_row(x_row, y_row)
            for x_row, y_row in zip(feature_rows, label_rows)
        )

        return self

    def predict(self, X: DataFrame | ndarray, logging: bool = True) -> ndarray:
        """Predict labels using model and feature data.

        Any prediction failures will return `numpy.nan` - prediction won't be halted,
        given the expense of querying LLMs.

        Args:
        ----
            X: Feature data to use for predictions.
            logging: Enable logging. Default to True.

        Raises:
        ------
            RuntimeError: If `.fit` has not been called.

        Returns:
        -------
            Model predictions as a column vector of shape (n_rows, 1).
        """
        if not self._prompt_train_data:
            raise RuntimeError("please fit model before trying to generate predictions")

        rows = X if isinstance(X, ndarray) else X.values
        y_pred: list[float] = []

        for n, row in tqdm(enumerate(rows), total=len(rows)):
            try:
                prediction_prompt = self._compose_prediction_prompt(
                    self._prompt_instruction,
                    self._prompt_train_data,
                    self._format_data_row(row),
                )
                llm_response = self._client.chat.completions.create(
                    model=self._model,
                    messages=[{"role": "user", "content": prediction_prompt}],
                    temperature=0,
                    response_format={"type": "text"},
                    seed=self._model_seed,
                )
                llm_generation = llm_response.choices[0].message.content
                if llm_generation:
                    y_pred.append(self._parse_model_output(llm_generation))
                else:
                    y_pred.append(np.nan)
            except Exception as e:
                # Deliberately broad: one failed row (API error, unparseable reply)
                # must not discard the predictions already paid for.
                if logging:
                    log.warning("LLM error for test data row #%s - %s", n, e)
                y_pred.append(np.nan)

        return np.array(y_pred).reshape(-1, 1)

    @staticmethod
    def _compose_prediction_prompt(
        instruction: str, train_data: str, test_data: str
    ) -> str:
        """Compose full prompt from constituent parts."""
        return instruction + "\n" + train_data + "\n\n" + test_data

    @staticmethod
    def _format_data_row(x_row: ndarray, y_row: ndarray | None = None) -> str:
        """Format a data row for inclusion in model prompt.

        Args:
        ----
            x_row: Feature values for one example.
            y_row: Label row for the example; only its first element is used. When
                omitted (or empty) the "Output:" slot is left blank for the LLM.

        Returns:
        -------
            One "Feature n: value" line per feature followed by an "Output:" line.
        """
        # Test presence explicitly: truthiness would drop a label of array([0.0])
        # and raises for arrays with more than one element.
        has_label = y_row is not None and len(y_row) > 0
        output = y_row[0] if has_label else ""
        lines = [f"Feature {n}: {x}" for n, x in enumerate(x_row)]
        lines.append(f"Output: {output}")
        return "\n".join(lines)

    @staticmethod
    def _parse_model_output(output: str) -> float:
        """Parse the model's output.

        Args:
        ----
            output: Raw text generated by the LLM.

        Raises:
        ------
            ValueError: If the text contains no number.

        Returns:
        -------
            The first number found in the text.
        """
        match = OpenAiRegressor._NUMBER_PATTERN.search(output)
        if match is None:
            raise ValueError(f"no numeric value found in model output: {output!r}")
        return float(match.group())
80-
except (ValueError, IndexError) as e:
81-
raise ModelError("invalid model prediction") from e
82-
83-
84-
def make_univariate_linear_test_data(
85-
n_samples: int = 1000, *, rho: float = 0.75, seed: int = 42
86-
) -> DataFrame:
87-
"""Simulate a y = rho * x + sqrt(1 - rho ** 2) * epsilon.
88-
89-
Args:
90-
----
91-
n_samples: Number of samples to generate. Defaults to 1000.
92-
rho: Rho coeffcient (correlation coefficient). Defaults to 0.75.
93-
seed: Random seed. Defaults to 42.
94-
95-
Returns:
96-
-------
97-
Dataframe of test data.
98-
"""
99-
if not (rho >= 0 and rho <= 1):
100-
raise ValueError(f"rho = {rho} - must in [0, 1]")
101-
rng = default_rng(seed)
102-
x = rng.standard_normal(n_samples)
103-
epsilon = rng.standard_normal(n_samples)
104-
y = rho * x + np.sqrt(1 - rho * rho) * epsilon
105-
return DataFrame({"x": x, "y": y})
106-
107-
108-
class ModelError(Exception):
109-
"""Custom exception class for model errors."""
110-
111-
pass
112-
113-
114-
if __name__ == "__main__":
115-
# make datasets
116-
n_samples = 1000
117-
dataset = make_univariate_linear_test_data(n_samples, rho=0.9)
118-
train_data, test_data = train_test_split(dataset, test_size=0.05, random_state=42)
119-
120-
# ols regression
121-
ols_regressor = LinearRegression()
122-
ols_regressor.fit(train_data[["x"]], train_data[["y"]])
123-
y_pred_ols = ols_regressor.predict(test_data[["x"]])
124-
125-
ols_results = test_data.copy().reset_index(drop=True).assign(y_pred=y_pred_ols)
126-
mean_abs_err_ols = mean_absolute_error(ols_results["y"], ols_results["y_pred"])
127-
r_squared_ols = r2_score(ols_results["y"], ols_results["y_pred"])
128-
print(f"mean_abs_error = {mean_abs_err_ols}")
129-
print(f"r_squared = {r_squared_ols}")
130-
131-
# llm regression
132-
y_pred = predict(test_data, train_data)
133-
134-
llm_results = (
135-
test_data.copy().reset_index(drop=True).assign(y_pred=y_pred["y_pred"])
136-
)
137-
mean_abs_err_llm = mean_absolute_error(llm_results["y"], llm_results["y_pred"])
138-
r_squared_llm = r2_score(llm_results["y"], llm_results["y_pred"])
139-
print(f"mean_abs_error = {mean_abs_err_llm}")
140-
print(f"r_squared = {r_squared_llm}")
141-
142-
# mean_abs_error = 0.4107320869725583
143-
# r_squared = 0.7865828324377897
144-
# mean_abs_error = 0.38392287248603985
145-
# r_squared = 0.8083485333779725

src/llm_regression/utils.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
"""Helpful functions."""
2+
from numpy import sqrt
3+
from numpy.random import default_rng
4+
from pandas import DataFrame
5+
6+
7+
def make_univariate_linear_test_data(
    n_samples: int = 1000, *, rho: float = 0.75, seed: int = 42
) -> DataFrame:
    """Simulate y = rho * x + sqrt(1 - rho ** 2) * epsilon.

    x and epsilon are independent standard normal draws, so this paradigm gives x
    and y a population standard deviation of 1 and a population correlation of rho
    (sample statistics will differ slightly).

    Args:
    ----
        n_samples: Number of samples to generate. Defaults to 1000.
        rho: Rho coefficient (correlation coefficient). Defaults to 0.75.
        seed: Random seed. Defaults to 42.

    Raises:
    ------
        ValueError: If rho is outside [0, 1].

    Returns:
    -------
        Dataframe of test data with columns "x" and "y".
    """
    # The chained comparison is also False for NaN, so NaN is rejected too.
    if not 0 <= rho <= 1:
        raise ValueError(f"rho = {rho} - must be in [0, 1]")
    rng = default_rng(seed)
    # Draw order matters for reproducibility: x first, then epsilon.
    x = rng.standard_normal(n_samples)
    epsilon = rng.standard_normal(n_samples)
    y = rho * x + sqrt(1 - rho * rho) * epsilon
    return DataFrame({"x": x, "y": y})

0 commit comments

Comments
 (0)