From db24498c30ad48304f418d608b800f52e914c8dc Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Tue, 11 Mar 2025 21:42:31 -0700 Subject: [PATCH 01/15] add is_valid check for extract usage --- eureka_ml_insights/data_utils/transform.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index f1bf574e..034702a1 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -464,9 +464,21 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: # if the model is one for which the usage of completion tokens is known, use that corresponding column for the model # otherwise, use the default "n_output_tokens" which is computed with a universal tokenizer as shown in TokenCounterTransform() if usage_completion_read_col: - df[self.usage_completion_output_col] = df[self.prepend_completion_read_col + "usage"].apply(lambda x: x[usage_completion_read_col]) + df[self.usage_completion_output_col] = df.apply(lambda x: self._extract_usage(x, usage_completion_read_col), axis=1) elif self.prepend_completion_read_col + "n_output_tokens" in df.columns: df[self.usage_completion_output_col] = df[self.prepend_completion_read_col + "n_output_tokens"] else: df[self.usage_completion_output_col] = np.nan return df + + def _extract_usage(self, row, usage_completion_read_col): + """ + Extracts the token usage for a given row is is_valid is True. + Args: + row (pd.Series): A row of the dataframe. + Returns: + int: The token usage for the row. + """ + if row[self.prepend_completion_read_col + "is_valid"]: + return row[self.prepend_completion_read_col + "usage"][usage_completion_read_col] + return np.nan From 7a31781dcefe049be2af2d4cc5b0cf3bd60d3f06 Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Wed, 12 Mar 2025 11:17:24 -0700 Subject: [PATCH 02/15] comment fix --- eureka_ml_insights/data_utils/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index 034702a1..455bf926 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -473,7 +473,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: def _extract_usage(self, row, usage_completion_read_col): """ - Extracts the token usage for a given row is is_valid is True. + Extracts the token usage for a given row if is_valid is True. Args: row (pd.Series): A row of the dataframe. Returns: From 196c3857f9f9852cfeef83ea60c286cbc46cbed9 Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Wed, 12 Mar 2025 12:10:04 -0700 Subject: [PATCH 03/15] changed check --- eureka_ml_insights/data_utils/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index 455bf926..869cd835 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -479,6 +479,6 @@ def _extract_usage(self, row, usage_completion_read_col): Returns: int: The token usage for the row. """ - if row[self.prepend_completion_read_col + "is_valid"]: + if not pd.isna(row['usage']) and usage_completion_read_col in row['usage']: return row[self.prepend_completion_read_col + "usage"][usage_completion_read_col] return np.nan From 7bf52aab9fa096b4a31ba18ed52596093521f6e6 Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Wed, 12 Mar 2025 14:53:02 -0700 Subject: [PATCH 04/15] add column validation --- eureka_ml_insights/data_utils/transform.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index 869cd835..cca38009 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -463,6 +463,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: usage_completion_read_col = "completion_tokens" # if the model is one for which the usage of completion tokens is known, use that corresponding column for the model # otherwise, use the default "n_output_tokens" which is computed with a universal tokenizer as shown in TokenCounterTransform() + self.validate(df) if usage_completion_read_col: df[self.usage_completion_output_col] = df.apply(lambda x: self._extract_usage(x, usage_completion_read_col), axis=1) elif self.prepend_completion_read_col + "n_output_tokens" in df.columns: @@ -471,6 +472,13 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: df[self.usage_completion_output_col] = np.nan return df + def validate(self, df: pd.DataFrame, usage_completion_read_col: str) -> pd.DataFrame: + """Check that all columns to be transformed are present actually in the data frame.""" + if usage_completion_read_col and self.prepend_completion_read_col+'usage' not in df.columns: + raise ValueError(f"The {self.prepend_completion_read_col + 'usage'} column is not present in the data frame.") + if self.prepend_completion_read_col + "n_output_tokens" not in df.columns: + raise ValueError(f"The {self.prepend_completion_read_col + 'n_output_tokens'} column is not present in the data frame.") + def _extract_usage(self, row, usage_completion_read_col): """ Extracts the token usage for a given row if is_valid is True. @@ -479,6 +487,6 @@ def _extract_usage(self, row, usage_completion_read_col): Returns: int: The token usage for the row. """ - if not pd.isna(row['usage']) and usage_completion_read_col in row['usage']: + if not pd.isna(row[self.prepend_completion_read_col + 'usage']) and usage_completion_read_col in row[self.prepend_completion_read_col + 'usage']: return row[self.prepend_completion_read_col + "usage"][usage_completion_read_col] return np.nan From 4914d5772165e19700124350e0a9b6c9698bef6e Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Wed, 12 Mar 2025 14:53:35 -0700 Subject: [PATCH 05/15] add column validation --- eureka_ml_insights/data_utils/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index cca38009..595e1335 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -476,7 +476,7 @@ def validate(self, df: pd.DataFrame, usage_completion_read_col: str) -> pd.DataF """Check that all columns to be transformed are present actually in the data frame.""" if usage_completion_read_col and self.prepend_completion_read_col+'usage' not in df.columns: raise ValueError(f"The {self.prepend_completion_read_col + 'usage'} column is not present in the data frame.") - if self.prepend_completion_read_col + "n_output_tokens" not in df.columns: + elif self.prepend_completion_read_col + "n_output_tokens" not in df.columns: raise ValueError(f"The {self.prepend_completion_read_col + 'n_output_tokens'} column is not present in the data frame.") def _extract_usage(self, row, usage_completion_read_col): From 2aa917a876d705ad091f33c5e4b2f315bb9a9b7e Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Wed, 12 Mar 2025 14:54:52 -0700 Subject: [PATCH 06/15] add column validation --- eureka_ml_insights/data_utils/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index 595e1335..2ba80ffb 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -481,7 +481,7 @@ def validate(self, df: pd.DataFrame, usage_completion_read_col: str) -> pd.DataF def _extract_usage(self, row, usage_completion_read_col): """ - Extracts the token usage for a given row if is_valid is True. + Extracts the token usage for a given row if usage column and corresponding completion column exists. Args: row (pd.Series): A row of the dataframe. Returns: From 697d21c87571e8a07efca08a78453e50e6daace6 Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Wed, 12 Mar 2025 15:29:16 -0700 Subject: [PATCH 07/15] rename variables --- eureka_ml_insights/data_utils/transform.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index 2ba80ffb..57c9f46f 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -436,7 +436,8 @@ class ExtractUsageTransform: """ model_config: ModelConfig usage_completion_output_col: str = "usage_completion" - prepend_completion_read_col: str = "" + usage_column: str = "usage" + n_tokens_column: str = "n_output_tokens" def transform(self, df: pd.DataFrame) -> pd.DataFrame: """ @@ -466,18 +467,18 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: self.validate(df) if usage_completion_read_col: df[self.usage_completion_output_col] = df.apply(lambda x: self._extract_usage(x, usage_completion_read_col), axis=1) - elif self.prepend_completion_read_col + "n_output_tokens" in df.columns: - df[self.usage_completion_output_col] = df[self.prepend_completion_read_col + "n_output_tokens"] + elif self.n_tokens_column in df.columns: + df[self.usage_completion_output_col] = df[self.self.n_tokens_column] else: df[self.usage_completion_output_col] = np.nan return df def validate(self, df: pd.DataFrame, usage_completion_read_col: str) -> pd.DataFrame: """Check that all columns to be transformed are present actually in the data frame.""" - if usage_completion_read_col and self.prepend_completion_read_col+'usage' not in df.columns: - raise ValueError(f"The {self.prepend_completion_read_col + 'usage'} column is not present in the data frame.") - elif self.prepend_completion_read_col + "n_output_tokens" not in df.columns: - raise ValueError(f"The {self.prepend_completion_read_col + 'n_output_tokens'} column is not present in the data frame.") + if usage_completion_read_col and self.usage_column not in df.columns: + raise ValueError(f"The {self.usage_column} column is not present in the data frame.") + elif self.n_tokens_column not in df.columns: + raise ValueError(f"The {self.n_tokens_column + 'n_output_tokens'} column is not present in the data frame.") def _extract_usage(self, row, usage_completion_read_col): """ From f8f3e68870ee2bd5b9d0318ebfd8859563e710c5 Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Wed, 12 Mar 2025 15:31:39 -0700 Subject: [PATCH 08/15] rename variables --- eureka_ml_insights/data_utils/transform.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index 57c9f46f..b334e691 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -478,7 +478,7 @@ def validate(self, df: pd.DataFrame, usage_completion_read_col: str) -> pd.DataF if usage_completion_read_col and self.usage_column not in df.columns: raise ValueError(f"The {self.usage_column} column is not present in the data frame.") elif self.n_tokens_column not in df.columns: - raise ValueError(f"The {self.n_tokens_column + 'n_output_tokens'} column is not present in the data frame.") + raise ValueError(f"The {self.n_tokens_column} column is not present in the data frame.") def _extract_usage(self, row, usage_completion_read_col): """ @@ -488,6 +488,6 @@ def _extract_usage(self, row, usage_completion_read_col): Returns: int: The token usage for the row. """ - if not pd.isna(row[self.prepend_completion_read_col + 'usage']) and usage_completion_read_col in row[self.prepend_completion_read_col + 'usage']: - return row[self.prepend_completion_read_col + "usage"][usage_completion_read_col] + if not pd.isna(row[self.usage_column]) and usage_completion_read_col in row[self.usage_column]: + return row[self.usage_column][usage_completion_read_col] return np.nan From aaa501ce8210479d30571f76d152b8a6b42806f7 Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Thu, 13 Mar 2025 14:02:15 -0700 Subject: [PATCH 09/15] update comments --- eureka_ml_insights/data_utils/transform.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index b334e691..817b9eec 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -431,8 +431,9 @@ class ExtractUsageTransform: Extracts token usage completion numbers (except prompt input tokens) for all models. args: model_config: config used for the experiment. - usage_completion_output_col: str, default name of the column where completion numbers will be stored for all models - prepend_completion_read_col: str, prepend string to add to the name of the usage column from which to read. Useful for cases when the usage column might have been renamed earlier in the pipeline. + usage_completion_output_col: str, default name of the column where completion numbers will be stored for model + usage_column: str, default name of the column where usage information is stored for model + n_tokens_column: str, default name of the column where number of tokens is stored for model """ model_config: ModelConfig usage_completion_output_col: str = "usage_completion" From fd2064962836b3de4a16c302a23acabd435e5d40 Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Thu, 13 Mar 2025 23:34:32 -0700 Subject: [PATCH 10/15] update comment --- eureka_ml_insights/data_utils/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index 817b9eec..25c6bf5e 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -475,7 +475,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: return df def validate(self, df: pd.DataFrame, usage_completion_read_col: str) -> pd.DataFrame: - """Check that all columns to be transformed are present actually in the data frame.""" + """Check that usage_columns or n_tokens_columns are present actually in the data frame.""" if usage_completion_read_col and self.usage_column not in df.columns: raise ValueError(f"The {self.usage_column} column is not present in the data frame.") elif self.n_tokens_column not in df.columns: From b66d95e71171f958615a84712d5ef77aad94714b Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Thu, 13 Mar 2025 23:39:39 -0700 Subject: [PATCH 11/15] update comment --- eureka_ml_insights/data_utils/transform.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index 25c6bf5e..e11efd4f 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -448,7 +448,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: df (pd.DataFrame): Input dataframe of inference results retrieved with the model_config. Returns: - pd.DataFrame: Transformed dataframe with completion token numbers in completion_usage_col. + pd.DataFrame: Transformed dataframe with completion token numbers in usage_completion_output_col. """ usage_completion_read_col = None if (self.model_config.class_name is GeminiModel): @@ -475,7 +475,11 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: return df def validate(self, df: pd.DataFrame, usage_completion_read_col: str) -> pd.DataFrame: - """Check that usage_columns or n_tokens_columns are present actually in the data frame.""" + """Check that usage_columns or n_tokens_columns are present actually in the data frame. + Args: + df (pd.DataFrame): Input dataframe containing model_output_col and id_col. + usage_completion_read_col (str): The column name for token extraction. + """ if usage_completion_read_col and self.usage_column not in df.columns: raise ValueError(f"The {self.usage_column} column is not present in the data frame.") elif self.n_tokens_column not in df.columns: @@ -486,6 +490,7 @@ def _extract_usage(self, row, usage_completion_read_col): Extracts the token usage for a given row if usage column and corresponding completion column exists. Args: row (pd.Series): A row of the dataframe. + usage_completion_read_col (str): The column name to extract the token usage from. Returns: int: The token usage for the row. """ From eb1d7c28ad3446a7d3edf0511f0fe11a600e61db Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Fri, 14 Mar 2025 10:48:11 -0700 Subject: [PATCH 12/15] updating func --- eureka_ml_insights/data_utils/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index e11efd4f..84fe8ff2 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -465,7 +465,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: usage_completion_read_col = "completion_tokens" # if the model is one for which the usage of completion tokens is known, use that corresponding column for the model # otherwise, use the default "n_output_tokens" which is computed with a universal tokenizer as shown in TokenCounterTransform() - self.validate(df) + self.validate(df, usage_completion_read_col) if usage_completion_read_col: df[self.usage_completion_output_col] = df.apply(lambda x: self._extract_usage(x, usage_completion_read_col), axis=1) elif self.n_tokens_column in df.columns: From 3fa1cb445adf06482596ddc00a338e1466f07e09 Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Fri, 14 Mar 2025 11:09:39 -0700 Subject: [PATCH 13/15] updated tests --- eureka_ml_insights/data_utils/transform.py | 2 +- tests/test_utils.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/eureka_ml_insights/data_utils/transform.py b/eureka_ml_insights/data_utils/transform.py index 7a1822f8..6597b355 100644 --- a/eureka_ml_insights/data_utils/transform.py +++ b/eureka_ml_insights/data_utils/transform.py @@ -475,7 +475,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: if usage_completion_read_col: df[self.usage_completion_output_col] = df.apply(lambda x: self._extract_usage(x, usage_completion_read_col), axis=1) elif self.n_tokens_column in df.columns: - df[self.usage_completion_output_col] = df[self.self.n_tokens_column] + df[self.usage_completion_output_col] = df[self.n_tokens_column] else: df[self.usage_completion_output_col] = np.nan return df diff --git a/tests/test_utils.py b/tests/test_utils.py index dc8ae164..e9af285e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -111,7 +111,9 @@ def __init__(self, model_name="generic_test_model"): self.name = model_name def generate(self, text_prompt, *args, **kwargs): - return {"model_output": "Generic model output", "is_valid": random.choice([True, False])} + return {"model_output": "Generic model output", + "is_valid": random.choice([True, False]), + "n_output_tokens": 3} class DNAEvaluationInferenceTestModel: From 13058797ab9b53eeef972a84a8aee22995dbcd3f Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Fri, 14 Mar 2025 13:57:02 -0700 Subject: [PATCH 14/15] updated test model --- tests/pipeline_tests.py | 1 - tests/test_utils.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipeline_tests.py b/tests/pipeline_tests.py index 547d125d..be068e8c 100644 --- a/tests/pipeline_tests.py +++ b/tests/pipeline_tests.py @@ -540,7 +540,6 @@ def get_config(self): return TEST_KITAB_ONE_BOOK_CONSTRAINT_PIPELINE().pipeline_config -@unittest.skipIf("skip_tests_with_missing_ds" in os.environ, "Missing public dataset. TODO: revert") class GPQA_PipelineTest(PipelineTest, unittest.TestCase): def get_config(self): return TEST_GPQA_PIPELINE().pipeline_config diff --git a/tests/test_utils.py b/tests/test_utils.py index e9af285e..824bbabb 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -51,7 +51,8 @@ def __init__(self): def generate(self, text_prompt, query_images=None): return {"model_output": random.choice(["Final Answer: A", "Final Answer: B", "Final Answer: C", "Final Answer: D"]), - "is_valid": random.choice([True, False])} + "is_valid": random.choice([True, False]), + "n_output_tokens": 3} def name(self): return self.name From 9306d6a6a9e53ed44b74e350b08a3c8fad8fa1c9 Mon Sep 17 00:00:00 2001 From: Vidhisha Balachandran Date: Fri, 14 Mar 2025 14:11:27 -0700 Subject: [PATCH 15/15] skipped gpqa --- tests/pipeline_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pipeline_tests.py b/tests/pipeline_tests.py index be068e8c..547d125d 100644 --- a/tests/pipeline_tests.py +++ b/tests/pipeline_tests.py @@ -540,6 +540,7 @@ def get_config(self): return TEST_KITAB_ONE_BOOK_CONSTRAINT_PIPELINE().pipeline_config +@unittest.skipIf("skip_tests_with_missing_ds" in os.environ, "Missing public dataset. TODO: revert") class GPQA_PipelineTest(PipelineTest, unittest.TestCase): def get_config(self): return TEST_GPQA_PIPELINE().pipeline_config