From 352829288df286ec0011c48137c01e5e6c6f1315 Mon Sep 17 00:00:00 2001 From: erandeutsch Date: Mon, 26 May 2025 17:20:12 -0700 Subject: [PATCH 1/4] Support custom imported module serialization with cloudpickle --- dspy/primitives/module.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/dspy/primitives/module.py b/dspy/primitives/module.py index 3ddaf74d66..45d790b229 100644 --- a/dspy/primitives/module.py +++ b/dspy/primitives/module.py @@ -163,7 +163,7 @@ def load_state(self, state): for name, param in self.named_parameters(): param.load_state(state[name]) - def save(self, path, save_program=False): + def save(self, path, save_program=False, modules_to_serialize=None): """Save the module. Save the module to a directory or a file. There are two modes: @@ -172,6 +172,10 @@ def save(self, path, save_program=False): - `save_program=True`: Save the whole module to a directory via cloudpickle, which contains both the state and architecture of the model. + If save_program=True and modules_to_serialize are provided, it will register those modules for serialization + with cloudpickle's `register_pickle_by_value`. This is useful when you have custom modules that need to be + serialized with cloudpickle. If None, then no modules will be registered for serialization. + We also save the dependency versions, so that the loaded model can check if there is a version mismatch on critical dependencies or DSPy version. @@ -180,6 +184,9 @@ def save(self, path, save_program=False): and a directory when `save_program=True`. save_program (bool): If True, save the whole module to a directory via cloudpickle, otherwise only save the state. + modules_to_serialize (list): A list of modules to serialize with cloudpickle's `register_pickle_by_value`. + If None, then no modules will be registered for serialization. 
+ """ metadata = {} metadata["dependency_versions"] = get_dependency_versions() @@ -198,6 +205,10 @@ def save(self, path, save_program=False): path.mkdir(parents=True) try: + if modules_to_serialize is not None: + for module in modules_to_serialize: + cloudpickle.register_pickle_by_value(module) + with open(path / "program.pkl", "wb") as f: cloudpickle.dump(self, f) except Exception as e: From da0eb71eadcb488ce06d94b61319b80229fa9fed Mon Sep 17 00:00:00 2001 From: erandeutsch Date: Tue, 27 May 2025 18:51:47 -0700 Subject: [PATCH 2/4] Fix styling and modules_to_serialize's prevention of None values --- dspy/primitives/module.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dspy/primitives/module.py b/dspy/primitives/module.py index 45d790b229..1120eace58 100644 --- a/dspy/primitives/module.py +++ b/dspy/primitives/module.py @@ -172,7 +172,7 @@ def save(self, path, save_program=False, modules_to_serialize=None): - `save_program=True`: Save the whole module to a directory via cloudpickle, which contains both the state and architecture of the model. - If save_program=True and modules_to_serialize are provided, it will register those modules for serialization + If `save_program=True` and `modules_to_serialize` are provided, it will register those modules for serialization with cloudpickle's `register_pickle_by_value`. This is useful when you have custom modules that need to be serialized with cloudpickle. If None, then no modules will be registered for serialization. 
@@ -205,9 +205,9 @@ def save(self, path, save_program=False, modules_to_serialize=None): path.mkdir(parents=True) try: - if modules_to_serialize is not None: - for module in modules_to_serialize: - cloudpickle.register_pickle_by_value(module) + modules_to_serialize = modules_to_serialize or [] + for module in modules_to_serialize: + cloudpickle.register_pickle_by_value(module) with open(path / "program.pkl", "wb") as f: cloudpickle.dump(self, f) From c243a386b4b7da0c88192129105e4b5de4841c73 Mon Sep 17 00:00:00 2001 From: erandeutsch Date: Tue, 27 May 2025 19:37:05 -0700 Subject: [PATCH 3/4] Update the tutorial for saving programs with the new module serialization functionality --- docs/docs/tutorials/saving/index.md | 31 +++++++++++++++++++++++++++++ dspy/primitives/module.py | 6 ++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/docs/docs/tutorials/saving/index.md b/docs/docs/tutorials/saving/index.md index 66acb6470d..862d264d08 100644 --- a/docs/docs/tutorials/saving/index.md +++ b/docs/docs/tutorials/saving/index.md @@ -96,6 +96,37 @@ assert str(compiled_dspy_program.signature) == str(loaded_dspy_program.signature With whole program saving, you don't need to recreate the program, but can directly load the architecture along with the state. You can pick the suitable saviing approach based on your needs. +### Serializing Imported Modules + +When saving a program with `save_program=True`, you might need to include custom modules that your program depends on. + +You can specify which custom modules should be serialized with your program by passing them to the `modules_to_serialize` +parameter when calling `save`. This ensures that any dependencies your program relies on are included during serialization and +available when loading the program later. + +This uses cloudpickle's `cloudpickle.register_pickle_by_value` function in order to register a module as picklable by value. 
When +a module is registered this way, cloudpickle will serialize the module by value rather than by reference, ensuring that the +module contents are preserved with the saved program. + +For example, if your program uses custom modules: + +```python +import my_custom_module + +module = dspy.ChainOfThought(my_custom_module.custom_signature) + +# Save the program with the custom module +compiled_dspy_program.save( + "./dspy_program/", + save_program=True, + modules_to_serialize=[my_custom_module] +) +``` + +This ensures that the required modules are properly serialized and available when loading the program later. Any number of +modules can be passed to `modules_to_serialize`. If you don't specify `modules_to_serialize`, no additional modules will be +registered for serialization. + ## Backward Compatibility As of `dspy<2.7`, we don't guarantee the backward compatibility of the saved program. For example, if you save the program with `dspy==2.5.35`, diff --git a/dspy/primitives/module.py b/dspy/primitives/module.py index 1120eace58..1b9ebb6e54 100644 --- a/dspy/primitives/module.py +++ b/dspy/primitives/module.py @@ -173,8 +173,10 @@ def save(self, path, save_program=False, modules_to_serialize=None): architecture of the model. If `save_program=True` and `modules_to_serialize` are provided, it will register those modules for serialization - with cloudpickle's `register_pickle_by_value`. This is useful when you have custom modules that need to be - serialized with cloudpickle. If None, then no modules will be registered for serialization. + with cloudpickle's `register_pickle_by_value`. This causes cloudpickle to serialize the module by value rather + than by reference, ensuring the module is fully preserved along with the saved program. This is useful + when you have custom modules that need to be serialized alongside your program. If + `modules_to_serialize` is None, no modules will be registered for serialization. 
We also save the dependency versions, so that the loaded model can check if there is a version mismatch on critical dependencies or DSPy version. From e4b7148dc339f4f8ac2bf5ef15a735b2c3ae8df0 Mon Sep 17 00:00:00 2001 From: erandeutsch Date: Tue, 27 May 2025 20:09:06 -0700 Subject: [PATCH 4/4] Fix code in new saving tutorial and small typos --- docs/docs/tutorials/saving/index.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/docs/tutorials/saving/index.md b/docs/docs/tutorials/saving/index.md index 862d264d08..d30d880312 100644 --- a/docs/docs/tutorials/saving/index.md +++ b/docs/docs/tutorials/saving/index.md @@ -7,7 +7,7 @@ This guide demonstrates how to save and load your DSPy program. At a high level, ## State-only Saving -State represents the DSPy program's internal state, including the signature, demos (few-shot examples), and other informaiton like +State represents the DSPy program's internal state, including the signature, demos (few-shot examples), and other information like the `lm` to use for each `dspy.Predict` in the program. It also includes configurable attributes of other DSPy modules like `k` for `dspy.retrievers.Retriever`. To save the state of a program, use the `save` method and set `save_program=False`. You can choose to save the state to a JSON file or a pickle file. We recommend saving the state to a JSON file because it is safer and readable. @@ -94,9 +94,9 @@ assert str(compiled_dspy_program.signature) == str(loaded_dspy_program.signature ``` With whole program saving, you don't need to recreate the program, but can directly load the architecture along with the state. -You can pick the suitable saviing approach based on your needs. +You can pick the suitable saving approach based on your needs. -### Serializing Imported Modules +### Serializing Imported Modules When saving a program with `save_program=True`, you might need to include custom modules that your program depends on. 
@@ -111,9 +111,10 @@ module contents are preserved with the saved program. For example, if your program uses custom modules: ```python +import dspy import my_custom_module -module = dspy.ChainOfThought(my_custom_module.custom_signature) +compiled_dspy_program = dspy.ChainOfThought(my_custom_module.custom_signature) # Save the program with the custom module compiled_dspy_program.save( @@ -135,4 +136,4 @@ are that loading a saved file in a different version of DSPy will not raise an e the program was saved. Starting from `dspy>=2.7`, we will guarantee the backward compatibility of the saved program in major releases, i.e., programs saved in `dspy==2.7.0` -should be loadeable in `dspy==2.7.10`. +should be loadable in `dspy==2.7.10`.