diff --git a/src/nemotron/recipes/embed/stage0_sdg/pyproject.toml b/src/nemotron/recipes/embed/stage0_sdg/pyproject.toml index c7578444..463c8c0a 100644 --- a/src/nemotron/recipes/embed/stage0_sdg/pyproject.toml +++ b/src/nemotron/recipes/embed/stage0_sdg/pyproject.toml @@ -12,7 +12,7 @@ dependencies = [ [tool.uv.sources] # Point to vendored retriever-sdg package -retriever-sdg = { path = "vendor/retriever-sdg" } +retriever-sdg = { path = "vendor/retriever-sdg", editable = true } [tool.uv.extra-build-dependencies] # Build dependencies for vendored packages diff --git a/src/nemotron/recipes/embed/stage0_sdg/vendor/retriever-sdg/src/retriever_sdg/deduplication.py b/src/nemotron/recipes/embed/stage0_sdg/vendor/retriever-sdg/src/retriever_sdg/deduplication.py index 49a9a729..783e702f 100644 --- a/src/nemotron/recipes/embed/stage0_sdg/vendor/retriever-sdg/src/retriever_sdg/deduplication.py +++ b/src/nemotron/recipes/embed/stage0_sdg/vendor/retriever-sdg/src/retriever_sdg/deduplication.py @@ -7,9 +7,14 @@ import numpy as np from data_designer.config.base import SingleColumnConfig -from data_designer.engine.column_generators.generators.base import ColumnGeneratorCellByCell from data_designer.plugins import Plugin, PluginType +__all__ = [ + "DDRetrievalDedupConfig", + "DDRetrievalDedup", + "dd_retrieval_dedup_plugin", +] + logger = logging.getLogger(__name__) @@ -30,8 +35,19 @@ def side_effect_columns(self) -> list[str]: return [] -class DDRetrievalDedup(ColumnGeneratorCellByCell[DDRetrievalDedupConfig]): +# Data Designer may re-enter this module during plugin discovery while the +# generator base class import is still in progress. Publish the plugin after +# the config exists so the re-entrant import can resolve it cleanly. +dd_retrieval_dedup_plugin = Plugin( + impl_qualified_name="retriever_sdg.deduplication.DDRetrievalDedup", + config_qualified_name="retriever_sdg.deduplication.DDRetrievalDedupConfig", + plugin_type=PluginType.COLUMN_GENERATOR, +) + +from data_designer.engine.column_generators.generators.base import ColumnGeneratorCellByCell # noqa: E402 + +class DDRetrievalDedup(ColumnGeneratorCellByCell[DDRetrievalDedupConfig]): @property def embedder(self): return self.resource_provider.model_registry.get_model( @@ -123,10 +139,3 @@ def generate(self, data: dict) -> dict: retained_qa_pairs = [qa_pairs[i] for i in retained_qa_pair_indexes] return data | {self.config.name: retained_qa_pairs} - - -dd_retrieval_dedup_plugin = Plugin( - impl_qualified_name="retriever_sdg.deduplication.DDRetrievalDedup", - config_qualified_name="retriever_sdg.deduplication.DDRetrievalDedupConfig", - plugin_type=PluginType.COLUMN_GENERATOR, -) \ No newline at end of file