Commit 7215a7c

Author: Siba Rajendran (committed)

dspy changes
1 parent 6853f8d commit 7215a7c

10 files changed (+138 -76 lines)
+1 -1

@@ -1,3 +1,3 @@
 from fmcore.experimental.metrics.base_metric import BaseMetric
 from fmcore.experimental.metrics.deepeval_geval import DeepEvalGEvalMetric
-from fmcore.experimental.metrics.custom_metric import CustomMetric
+from fmcore.experimental.metrics.custom_metric import CustomMetric

src/fmcore/experimental/metrics/custom_metric.py  +10 -5

@@ -1,21 +1,19 @@
 import json_repair
 from typing import Dict, List
-from langchain_core.messages import BaseMessage,HumanMessage
+from langchain_core.messages import BaseMessage, HumanMessage
 from jinja2 import Template
 from fmcore.experimental.llm.base_llm import BaseLLM
 from fmcore.experimental.metrics.base_metric import BaseMetric
 from fmcore.experimental.types.enums.metric_enums import SupportedMetrics
 from fmcore.experimental.types.metric_types import CustomMetricResult, MetricConfig, MetricResult
 
 
-
 class CustomMetric(BaseMetric):
     aliases = [SupportedMetrics.CUSTOM]
 
     metric_name: str
     llm: BaseLLM
     prompt_template: Template
-
 
     @classmethod
     def _get_constructor_parameters(cls, *, metric_config: MetricConfig) -> Dict:
@@ -25,8 +23,13 @@ def _get_constructor_parameters(cls, *, metric_config: MetricConfig) -> Dict:
         prompt_template = Template(prompt)
 
         metric_name: str = metric_config.metric_params["name"]
-
-        return {"config": metric_config, "prompt_template": prompt_template, "llm": llm, "metric_name": metric_name}
+
+        return {
+            "config": metric_config,
+            "prompt_template": prompt_template,
+            "llm": llm,
+            "metric_name": metric_name,
+        }
 
     def evaluate(self, data: Dict) -> MetricResult:
         prompt: str = self.prompt_template.render(**data)
@@ -43,3 +46,5 @@ async def aevaluate(self, data: Dict) -> MetricResult:
         result: Dict = json_repair.loads(response.content)
 
         return CustomMetricResult(**result)
+
+
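The CustomMetric flow kept by this change is: render the Jinja2 prompt template with the row's fields, send the prompt to the configured LLM, then parse the (possibly malformed) JSON reply with json_repair. Below is a minimal, self-contained sketch of that flow, not the committed code; call_llm is a hypothetical stand-in for the real BaseLLM (whose interface is not shown in this diff), and the prompt and response shapes are invented for illustration.

# Sketch of the render -> call LLM -> repair JSON pattern used by CustomMetric.evaluate.
import json_repair
from jinja2 import Template

PROMPT = Template("Rate the answer '{{ answer }}' to the question '{{ question }}'. Reply as JSON.")

def call_llm(prompt: str) -> str:
    # Placeholder only: a real implementation would invoke the configured BaseLLM.
    return '{"score": 0.9, "reason": "relevant and correct"}'

def evaluate(data: dict) -> dict:
    prompt = PROMPT.render(**data)            # fill the template with the row's fields
    response_text = call_llm(prompt)          # get the model's raw text reply
    return json_repair.loads(response_text)   # tolerate slightly malformed JSON

print(evaluate({"question": "What is 2 + 2?", "answer": "4"}))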

src/fmcore/experimental/metrics/deepeval_geval.py  +7 -3

@@ -7,7 +7,11 @@
 from fmcore.experimental.adapters.deepeval_adapter import DeepEvalLLMAdapter
 from fmcore.experimental.llm.base_llm import BaseLLM
 from fmcore.experimental.metrics.base_metric import BaseMetric
-from fmcore.experimental.types.enums.metric_enums import MetricFramework, SupportedMetrics, EvaluationFieldType
+from fmcore.experimental.types.enums.metric_enums import (
+    MetricFramework,
+    SupportedMetrics,
+    EvaluationFieldType,
+)
 from fmcore.experimental.types.llm_types import LLMConfig
 from fmcore.experimental.types.metric_types import (
     MetricConfig,
@@ -36,12 +40,12 @@ def _get_constructor_parameters(cls, *, metric_config: MetricConfig) -> Dict:
         geval_metric_params["evaluation_params"] = DeepEvalUtils.infer_evaluation_params(
             field_mapping=metric_config.field_mapping
         )
-
+
         if not metric_config.framework:
             metric_config.framework = MetricFramework.DEEPEVAL
 
         geval_metric_params["model"] = model
-
+
         return {"config": metric_config, "geval_metric_params": geval_metric_params}
 
     def evaluate(self, data: Dict) -> MetricResult:
@@ -1,13 +1,13 @@
 import dspy
+import mlflow.dspy
 from pandas import DataFrame
-from typing import Callable, Dict, Optional, List, Tuple, Type
+from typing import Callable, Dict, Optional, List, Tuple, Type, Any, Union
 
 from dspy.teleprompt import Teleprompter
 from dspy.teleprompt.mipro_optimizer_v2 import MIPROv2
 from dspy.teleprompt.bootstrap import BootstrapFewShot
 from dspy import Signature, Module
 
-
 from fmcore.experimental.metrics.base_metric import BaseMetric
 from fmcore.experimental.prompt_tuner.base_prompt_tuner import BasePromptTuner
 from fmcore.experimental.types.enums.prompt_tuner_enums import PromptTunerFramework
@@ -16,27 +16,52 @@
 from fmcore.experimental.adapters.dspy_adapter import DSPyLLMAdapter
 from fmcore.experimental.utils.introspection_utils import IntrospectionUtils
 from fmcore.experimental.prompt_tuner.utils.dspy_utils import DSPyUtils
-from py_expression_eval import Parser
 from asteval import Interpreter
 
 
-
-
 class DSPyPromptTuner(BasePromptTuner):
+    """
+    A prompt tuner implementation using the DSPy framework.
+
+    This class provides functionality to optimize prompts using various DSPy optimizers
+    such as MIPROv2 or BootstrapFewShot. It uses a student model for generating responses
+    and evaluates them using a configured metric to iteratively improve the prompt.
+
+    Attributes:
+        aliases (List[PromptTunerFramework]): Framework identifiers for this tuner.
+        student (dspy.LM): The student language model used for prompt optimization.
+        teacher (Optional[dspy.LM]): The teacher language model used in some optimization techniques.
+        optimizer_metric (BaseMetric): The metric used to evaluate prompt performance.
+    """
+
     aliases = [PromptTunerFramework.DSPY]
     student: dspy.LM
     teacher: Optional[dspy.LM]
     optimizer_metric: BaseMetric
 
     @classmethod
-    def _get_constructor_parameters(cls, *, config: PromptTunerConfig) -> Dict:
+    def _get_constructor_parameters(cls, *, config: PromptTunerConfig) -> Dict[str, Any]:
+        """
+        Creates and configures the necessary components for DSPy prompt tuning.
+
+        Args:
+            config: Configuration containing all necessary parameters for the prompt tuner.
+                Must include student model config and optionally teacher model config.
+
+        Returns:
+            Dictionary of parameters needed to initialize the DSPyPromptTuner instance.
+        """
+        # Initialize student model and configure DSPy to use it
         student_model = DSPyLLMAdapter(llm_config=config.optimzer_config.student_config)
         dspy.configure(lm=student_model)
 
+        # Initialize teacher model (or use student if not specified)
         if config.optimzer_config.teacher_config:
             teacher_model = DSPyLLMAdapter(llm_config=config.optimzer_config.teacher_config)
         else:
             teacher_model = student_model
+
+        # Initialize metric for optimization
         optimizer_metric = BaseMetric.of(metric_config=config.optimzer_config.metric_config)
 
         return {
@@ -46,19 +71,33 @@ def _get_constructor_parameters(cls, *, config: PromptTunerConfig) -> Dict:
             "config": config,
         }
 
-    def _create_evaluation_function(self):
+    def _create_evaluation_function(self) -> Callable:
         """
         Creates an evaluation function that uses the configured metric.
-
+
+        The function evaluates DSPy predictions by applying the metric and interpreting
+        the criteria expression to determine the quality of the prediction.
+
         Returns:
-            Evaluation function that takes an example and prediction
+            A callable function that takes an example and prediction and returns a
+            numerical or boolean evaluation score.
         """
-
         # Store criteria once to avoid re-fetching it in each evaluation call
         criteria = self.optimizer_metric.config.metric_params["criteria"]
 
-        def evaluate_func(example: dspy.Example, prediction: dspy.Prediction, trace=None):
-            # Get evaluation results
+        def evaluate_func(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> Union[float, bool]:
+            """
+            Evaluates a single example-prediction pair using the configured metric.
+
+            Args:
+                example: The DSPy example containing input data
+                prediction: The model's prediction to evaluate
+                trace: Optional trace information from DSPy (not used)
+
+            Returns:
+                Evaluation score as determined by the configured criteria
+            """
+            # Get evaluation results from the metric
             evaluation_response: dict = DSPyUtils.evaluate(
                 example=example,
                 prediction=prediction,
@@ -72,23 +111,33 @@ def evaluate_func(example: dspy.Example, prediction: dspy.Prediction, trace=None
 
         return evaluate_func
 
-
-
     def tune(self, data: DataFrame) -> str:
         """
-        Tunes a prompt using the configured DSPy optimizer.
-
+        Tunes a prompt using the configured DSPy optimizer and training data.
+
+        This method:
+        1. Converts the input data to DSPy examples
+        2. Creates a DSPy signature and module based on the prompt configuration
+        3. Configures an evaluation function using the specified metric
+        4. Applies the DSPy optimizer to generate an optimized prompt
+
         Args:
-            data: DataFrame containing the training data
-            prompt_config: Configuration containing input and output fields
-
+            data: DataFrame containing the training data with input and expected output fields
+
         Returns:
             The optimized prompt as a string
+
+        Raises:
+            ValueError: If the optimization process fails or returns invalid results
         """
 
+        import mlflow
+        mlflow.dspy.autolog(log_traces=True, log_traces_from_compile=True, log_traces_from_eval=True, disable=False, silent=False)
+
         # Convert data to DSPy examples
         dspy_examples = DSPyUtils.convert_to_dspy_examples(
-            data=data, prompt_config=self.config.prompt_config
+            data=data,
+            prompt_config=self.config.prompt_config
         )
 
         # Create signature and module separately
@@ -108,13 +157,20 @@ def tune(self, data: DataFrame) -> str:
             evaluate_func=evaluate_func,
         )
 
+        # Filter optimizer parameters to only include those accepted by the compile method
        filtered_optimizer_params = IntrospectionUtils.filter_params(
-            func=optimizer.compile, params=self.config.optimzer_config.params
+            func=optimizer.compile,
+            params=self.config.optimzer_config.params
        )
+
         # Compile the module with the optimizer
         optimized_module = optimizer.compile(
             student=module,
             trainset=dspy_examples,
             requires_permission_to_run=False,
             **filtered_optimizer_params,
         )
+
+        dspy.inspect_history(optimized_module)
+
+        optimized_module.signature.prompt
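The evaluate_func defined in this file follows the metric contract DSPy optimizers expect: a callable taking (example, prediction, trace=None) and returning a score that compile() can use. Below is a minimal sketch of that contract with plain dspy objects; the keyword check and field names are invented for illustration and are not fmcore's criteria logic. Note also that the final added line of tune(), optimized_module.signature.prompt, is evaluated but never returned, although the docstring promises the optimized prompt as a string, so a return statement appears to be missing.

# Sketch of the metric callable shape that BootstrapFewShot / MIPROv2 consume.
import dspy

def keyword_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> bool:
    # Toy criterion: the predicted answer must contain the expected answer.
    return example.answer.lower() in prediction.answer.lower()

example = dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question")
prediction = dspy.Prediction(answer="The answer is 4.")
print(keyword_metric(example, prediction))  # True

# With an LM configured, the same callable plugs straight into an optimizer:
# optimizer = dspy.teleprompt.BootstrapFewShot(metric=keyword_metric)
# optimized = optimizer.compile(student=dspy.Predict("question -> answer"), trainset=[example])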

src/fmcore/experimental/prompt_tuner/utils/dspy_utils.py  +22 -21

@@ -20,14 +20,17 @@ def get_optimizer(
         optimzer_config: OptimizerConfig,
         evaluate_func: Callable,
     ) -> Teleprompter:
-        if optimzer_config.type == DspyOptimizerType.MIPRO_V2:
-            return MIPROv2(
+
+        # This will be a factory method that returns the appropriate optimizer based on the optimizer type
+        # TODO: Add more optimizers
+        optimizer: Teleprompter = MIPROv2(
             prompt_model=teacher,
             task_model=student,
             metric=evaluate_func,
             **optimzer_config.params,
         )
-        return BootstrapFewShot(**optimzer_config.params, metric=evaluate_func)
+
+        return optimizer
 
     @staticmethod
     def create_dspy_signature(prompt_config: PromptConfig) -> Type[dspy.Signature]:
@@ -43,24 +46,24 @@ def create_dspy_signature(prompt_config: PromptConfig) -> Type[dspy.Signature]:
 
         # Create a DSPy Signature class dictionary with annotations
         attrs = {
-            '__annotations__': {},
-            '__doc__': prompt_config.prompt if prompt_config.prompt else ""
+            "__annotations__": {},
+            "__doc__": prompt_config.prompt if prompt_config.prompt else "",
         }
 
         # Dynamically add input and output fields with type annotations
         for field in prompt_config.input_fields:
             # Assume field has a type attribute, otherwise default to str
-            field_type = getattr(field, 'type', str)
-            attrs['__annotations__'][field.name] = field_type
+            field_type = getattr(field, "type", str)
+            attrs["__annotations__"][field.name] = field_type
             attrs[field.name] = dspy.InputField(desc=field.description)
-
+
         for field in prompt_config.output_fields:
-            field_type = getattr(field, 'type', str)
-            attrs['__annotations__'][field.name] = field_type
+            field_type = getattr(field, "type", str)
+            attrs["__annotations__"][field.name] = field_type
             attrs[field.name] = dspy.OutputField(desc=field.description)
 
         # Create the class dynamically with type annotations
-        TaskSignature = type('TaskSignature', (dspy.Signature,), attrs)
+        TaskSignature = type("TaskSignature", (dspy.Signature,), attrs)
 
         return TaskSignature
 
@@ -85,7 +88,7 @@ def __init__(self, signature: dspy.Signature):
             def forward(self, **kwargs):
                 prediction = self.predictor(**kwargs)
                 return prediction
-
+
         return TaskModule(signature=signature)
 
     @staticmethod
@@ -106,20 +109,18 @@ def convert_to_dspy_examples(
 
         loader = DataLoader()
         input_keys = [field.name for field in prompt_config.input_fields]
-        examples = loader.from_pandas(
-            data,
-            fields=data.columns.tolist(),
-            input_keys=input_keys
-        )
-
+        examples = loader.from_pandas(data, fields=data.columns.tolist(), input_keys=input_keys)
+
         return examples
 
     @staticmethod
-    def evaluate(example: dspy.Example, prediction: dspy.Prediction, metric: BaseMetric) -> MetricResult:
+    def evaluate(
+        example: dspy.Example, prediction: dspy.Prediction, metric: BaseMetric
+    ) -> MetricResult:
         row = {
             EvaluationFieldType.INPUT.name: example.toDict(),
             EvaluationFieldType.OUTPUT.name: prediction.toDict(),
         }
-
+
         metric_result: MetricResult = metric.evaluate(data=row)
-        return metric_result.model_dump(exclude_none=True)
+        return metric_result.model_dump(exclude_none=True)
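Two things worth noting in this file: after the change, get_optimizer always constructs MIPROv2 (the BootstrapFewShot branch was dropped, with a TODO to add more optimizers), and create_dspy_signature builds the Signature class at runtime via type(), filling __annotations__ and attaching dspy.InputField/OutputField attributes. A self-contained sketch of that dynamic-signature pattern follows; the field names and docstring are made up for illustration and are not taken from the diff.

# Sketch of building a dspy.Signature subclass dynamically, as create_dspy_signature does.
import dspy

attrs = {
    "__annotations__": {"question": str, "answer": str},   # type annotations per field
    "__doc__": "Answer the question concisely.",           # becomes the signature's instructions
    "question": dspy.InputField(desc="User question"),
    "answer": dspy.OutputField(desc="Short answer"),
}

TaskSignature = type("TaskSignature", (dspy.Signature,), attrs)

print(TaskSignature.instructions)         # "Answer the question concisely."
print(list(TaskSignature.input_fields))   # ["question"]
print(list(TaskSignature.output_fields))  # ["answer"]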

0 commit comments
