NVIDIA-NeMo
diff --git a/‎docs/user-guide/configuration.md‎
Lines changed: 5 additions & 0 deletions b/‎docs/user-guide/configuration.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/user-guide/getting-started.md‎
Lines changed: 2 additions & 0 deletions b/‎docs/user-guide/getting-started.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 4 additions & 4 deletions b/‎pyproject.toml‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎pytest.ini‎
Lines changed: 1 addition & 0 deletions b/‎pytest.ini‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/nemo_safe_synthesizer/config/training.py‎
Lines changed: 5 additions & 1 deletion b/‎src/nemo_safe_synthesizer/config/training.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎src/nemo_safe_synthesizer/evaluation/components/attribute_inference_protection.py‎
Lines changed: 14 additions & 34 deletions b/‎src/nemo_safe_synthesizer/evaluation/components/attribute_inference_protection.py‎
Lines changed: 14 additions & 34 deletions
diff --git a/‎src/nemo_safe_synthesizer/evaluation/components/membership_inference_protection.py‎
Lines changed: 17 additions & 31 deletions b/‎src/nemo_safe_synthesizer/evaluation/components/membership_inference_protection.py‎
Lines changed: 17 additions & 31 deletions
@@ -104,6 +104,11 @@ We have extensively tested the following models for synthetic data use in NSS, a
 Benchmarking data for additional models will be added as they are
 validated. To understand the trade-offs with model selection, see [Training](running.md#training).
 
+When `training.pretrained_model` is set to a Hugging Face Hub model ID, the model is downloaded from the Hub; if a local path or an offline cache is provided, no download is performed. See [Pre-Caching Models](environment.md#pre-caching-models) for details.
+
+!!! warning "Security Note: Pretrained models from Hugging Face Hub"
+    Loading and using pretrained models from the Hugging Face Hub (or any public source) can expose your environment to significant risks, including arbitrary code execution (ACE) or remote code execution (RCE) vulnerabilities. Only use models that you have reviewed yourself or that come from organizations and authors you explicitly trust. Malicious or modified models may contain embedded code, backdoors, or privacy-leaking mechanisms.
+
 ---
 
 ## Generation
 
@@ -192,6 +192,8 @@ differential privacy). Both perform LoRA fine-tuning; see
 
 The default model is `HuggingFaceTB/SmolLM3-3B`. Safe Synthesizer has tested support for `HuggingFaceTB/SmolLM3-3B`, `TinyLlama/TinyLlama-1.1B-Chat-v1.0`, and `mistralai/Mistral-7B-Instruct-v0.3` (see [Configuration -- Training](configuration.md#training) for details on how to change the backend or model).
 
+Training requires a single NVIDIA GPU (A100 or larger). Multi-GPU training is not supported.
+
 !!! tip "Differential privacy"
     For formal privacy guarantees, enable Differentially Private Stochastic Gradient Descent (DP-SGD) when fine-tuning via `privacy.dp_enabled: true`. See [Configuration -- Differential Privacy](configuration.md#differential-privacy).
 
 
@@ -98,6 +98,7 @@ engine = [
     "pandas>=2.1.3, <3",
     "plotly",
     "ratelimit",
+    "scikit-learn",
     "range_regex>=0.1.0",
     "tenacity==9.1.4",
     "tiktoken>=0.7.0,<1.0",
@@ -119,7 +120,6 @@ cpu = [
   "bitsandbytes==0.49.1",
   "flashinfer-python==0.6.1; sys_platform=='linux'",
   "flashinfer-cubin==0.6.1; sys_platform=='linux'",
-  "faiss-cpu==1.13.2",
   "gliner",
   "kernels>=0.12.1",
   "peft",
@@ -141,7 +141,6 @@ cpu = [
 cu128 = [
   "accelerate",
   "bitsandbytes==0.49.1",
-  "faiss-gpu-cu12==1.13.2; sys_platform == 'linux'",
   "flashinfer-python==0.6.1; sys_platform == 'linux'",
   "flashinfer-cubin==0.6.1; sys_platform == 'linux'",
   "flashinfer-jit-cache==0.6.1+cu128; sys_platform == 'linux'",
@@ -226,10 +225,9 @@ constraint-dependencies = ["torch==2.9.1", "regex==2025.07.34", "pandas<3"]
   flashinfer-jit-cache = [
     { index = "flashinfer-jit-cache", marker = "sys_platform=='linux'", extra="cu128"},
   ]
-   nvidia-cublas-cu12 = [
+  nvidia-cublas-cu12 = [
     { index = "pytorch-cu128" },
   ]
-
   nvidia-cuda-cupti-cu12 = [
     { index = "pytorch-cu128" },
   ]
@@ -298,6 +296,7 @@ name = "nvidia-pypi-public"
 url = "https://pypi.nvidia.com"
 explicit = true
 
+
 [build-system]
 requires = ["hatchling", "uv-dynamic-versioning"]
 build-backend = "hatchling.build"
@@ -335,4 +334,5 @@ exclude = [
     "./src/nemo_safe_synthesizer/pii_replacer/",
     "./tests/",
     "./tools/",
+    "./script/slurm/nss_top.py",
 ]
@@ -22,6 +22,7 @@ markers =
     smollm2: SmolLM2 Hub download tests (used by Makefile for process isolation)
     unsloth: Unsloth backend tests (process-isolated from DP tests)
     noautouse: Marker to skip autouse fixtures for specific tests
+    benchmark: Performance benchmark tests
 
 # Note: Unit tests (testing single classes/functions with no infrastructure dependencies)
 # do not need markers and are the default test type.
 
@@ -207,7 +207,11 @@ class TrainingHyperparams(Parameters):
         str,
         Field(
             title="pretrained_model",
-            description="Pretrained model to use for fine-tuning. Defaults to SmolLM3.",
+            description=(
+                "Pretrained model to use for fine-tuning. Defaults to SmolLM3. "
+                "May be a Hugging Face model ID (loaded from the Hugging Face Hub or cache) "
+                "or a local path. See security note in docs before using untrusted sources."
+            ),
         ),
     ] = "HuggingFaceTB/SmolLM3-3B"
 
 
@@ -27,16 +27,9 @@
 from ..components.component import Component
 from ..data_model.evaluation_dataset import EvaluationDataset
 from ..data_model.evaluation_score import EvaluationScore, PrivacyGrade
+from ..nearest_neighbors import NearestNeighborSearch
 from . import multi_modal_figures as figures
 
-faiss_available = False
-try:
-    import faiss
-
-    faiss_available = True
-except (ImportError, ModuleNotFoundError):
-    pass
-
 logger = get_logger(__name__)
 
 
@@ -77,10 +70,6 @@ def from_evaluation_dataset(
         evaluation_dataset: EvaluationDataset, config: SafeSynthesizerParameters | None = None
     ) -> AttributeInferenceProtection:
         """Run the attribute inference attack and return the protection score."""
-        if not faiss_available:
-            logger.info("FAISS is not available, skipping Attribute Inference Attack.")
-            return AttributeInferenceProtection(score=EvaluationScore())
-
         quasi_identifier_count = config.evaluation.quasi_identifier_count if config else QUASI_IDENTIFIER_COUNT
 
         score, col_accuracy_df = AttributeInferenceProtection._aia(
@@ -276,23 +265,17 @@ def _get_synth_nn(
                     df_train_use, df_synth_use
                 )
 
-        # If all tabular, just use FAISS to get NN
+        # If all tabular, use nearest neighbor search (torch CUDA or sklearn CPU fallback)
         if len(text_columns) == 0:
-            # Create the faiss index on the synthetic data
-            dim = df_synth_norm.shape[1]
-            index = faiss.IndexFlatL2(dim)  # ty: ignore[possibly-unbound-attribute]
-
-            # This usage matches documentation. Specifying n= and x= parameters as
-            # the type annotation for IndexFlatL2.add suggests seems unnecessary, possibly related
-            # to swig handling that ty is not aware of. Similar for other faiss calls below
-            # which are using swig-generated code.
-            index.add(np.float32(np.ascontiguousarray(np.array(df_synth_norm))))  # ty: ignore[missing-argument]
+            # Create the nearest neighbors index on the synthetic data
+            nn = NearestNeighborSearch(n_neighbors=k)
+            nn.fit(np.ascontiguousarray(np.array(df_synth_norm)).astype(np.float32))
 
             # Get nearest neighbors to this attack record
-            _, indexes = index.search(np.float32(np.ascontiguousarray(np.array(df_train_norm))), k)  # ty: ignore[missing-argument]
+            _, indexes = nn.kneighbors(np.ascontiguousarray(np.array(df_train_norm)).astype(np.float32))
             synth_rows = pd.DataFrame()
-            for index in indexes:
-                synth_rows = pd.concat([synth_rows, df_synth.iloc[index].copy()])
+            for idx_row in indexes:
+                synth_rows = pd.concat([synth_rows, df_synth.iloc[idx_row].copy()])
             return synth_rows
 
         # If all text, just use Sentence Transformer to get NN
@@ -339,23 +322,20 @@ def _get_synth_nn(
             corpus_ids.append(corpus_id)
             synth_NN = pd.concat([synth_NN, pd.DataFrame([df_synth_norm.iloc[int(corpus_id)]])], ignore_index=True)
 
-        # Now get the tabular similarity for these 1000 NN
-
-        dim = synth_NN.shape[1]
-        index = faiss.IndexFlatL2(dim)  # ty: ignore[possibly-unbound-attribute]
-        index.add(np.float32(np.ascontiguousarray(np.array(synth_NN))))  # ty: ignore[missing-argument]
-        dists, indexes = index.search(np.float32(np.ascontiguousarray(np.array(df_train_norm))), search_synth_k)  # ty: ignore[missing-argument]
+        # Now get the tabular similarity for these 1000 NN using nearest neighbor search
+        nn = NearestNeighborSearch(n_neighbors=search_synth_k)
+        nn.fit(np.ascontiguousarray(np.array(synth_NN)).astype(np.float32))
+        dists, indexes = nn.kneighbors(np.ascontiguousarray(np.array(df_train_norm)).astype(np.float32))
         # Scale the Euclidean distance to [0,1]
-        dists = np.sqrt(dists)
         max_dist = np.amax(dists)
         if max_dist > 0:
             dist_scaled = dists / max_dist
         else:
             dist_scaled = dists
         tab_dist = {}
         for i in range(search_synth_k):
-            index = indexes[0][i]
-            tab_dist[index] = dist_scaled[0][i]
+            idx = indexes[0][i]
+            tab_dist[idx] = dist_scaled[0][i]
 
         # Now get the hybrid distance
 
 
@@ -21,18 +21,10 @@
 from ...evaluation.components.component import Component
 from ...evaluation.data_model.evaluation_dataset import EvaluationDataset
 from ...evaluation.data_model.evaluation_score import EvaluationScore, PrivacyGrade
+from ...evaluation.nearest_neighbors import NearestNeighborSearch
 from ...observability import get_logger
 from . import multi_modal_figures as figures
 
-faiss_available = False
-try:
-    import faiss
-
-    faiss_available = True
-except (ImportError, ModuleNotFoundError):
-    pass
-
-
 logger = get_logger(__name__)
 
 
@@ -79,9 +71,6 @@ def from_evaluation_dataset(
         evaluation_dataset: EvaluationDataset, config: SafeSynthesizerParameters | None = None
     ) -> MembershipInferenceProtection:
         """Run the membership inference attack and return the protection score."""
-        if not faiss_available:
-            return MembershipInferenceProtection(score=EvaluationScore())
-
         score, attack_sum_df, tps_values, fps_values = MembershipInferenceProtection.mia(
             df_train=evaluation_dataset.reference,
             df_synth=evaluation_dataset.output,
@@ -249,7 +238,7 @@ def _compute_mia(
         df_train_norm: pd.DataFrame,
         df_test_norm: pd.DataFrame,
         df_synth_norm: pd.DataFrame,
-        index: faiss.IndexFlatL2 | None,  # ty: ignore[possibly-unbound-attribute]
+        nn_index: NearestNeighborSearch | None,
         run: int,
         text_cnt: int,
         tabular_cnt: int,
@@ -263,14 +252,14 @@ def _compute_mia(
 
         Builds an attack dataset from a slice of training rows mixed with
         test rows, computes nearest-neighbor distances to the synthetic
-        data (text via semantic search, tabular via FAISS L2), and
+        data (text via semantic search, tabular via L2 nearest neighbor), and
         classifies each record as member or non-member.
 
         Args:
             df_train_norm: Normalized training dataframe.
             df_test_norm: Normalized holdout (test) dataframe.
             df_synth_norm: Normalized synthetic dataframe.
-            index: Pre-built FAISS L2 index over the tabular columns of
+            nn_index: Pre-built NearestNeighborSearch index over the tabular columns of
                 the synthetic data, or ``None`` if no tabular columns exist.
             run: Zero-based run index controlling which training slice to use.
             text_cnt: Number of text columns in the dataset.
@@ -334,18 +323,16 @@ def _compute_mia(
                 attacker_data_tabular = real_data.copy()
                 k = 1
 
-            if index is None:
-                raise RuntimeError("faiss index not provided for MIA calculation when expected.")
+            if nn_index is None:
+                raise RuntimeError("Nearest neighbor index not provided for MIA calculation when expected.")
 
-            # This usage matches documentation despite type annotation for
-            # IndexFlatL2.search, possibly related to swig handling that ty is
-            # not aware of. Similar for other calls for faiss indexes.
-            dists, indices = index.search(
-                np.float32(np.ascontiguousarray(np.array(attacker_data_tabular))),
-                len(df_synth_norm),
-            )  # ty: ignore[missing-argument]
+            # Use nearest neighbor search (torch GPU or sklearn CPU fallback) for distance calculation
+            dists, indices = nn_index.kneighbors(
+                np.ascontiguousarray(np.array(attacker_data_tabular)).astype(np.float32),
+                n_neighbors=int(k),
+            )
             # Scale the Euclidean distance to [0,1]
-            dists = np.sqrt(dists)
+            # NearestNeighborSearch.kneighbors() returns L2 distance directly, not squared
             max_dist = np.amax(dists)
             if max_dist > 0:
                 dist_scaled = dists / max_dist
@@ -555,15 +542,14 @@ def mia(
                     df_train_norm, df_test_norm, df_synth_norm = MembershipInferenceProtection._normalize_onehot(
                         df_train_use, df_test, df_synth
                     )
-                # Create the faiss index on the synthetic tabular data
-                dim = df_synth_norm.shape[1]
-                index = faiss.IndexFlatL2(dim)  # ty: ignore[possibly-unbound-attribute]
-                index.add(np.float32(np.ascontiguousarray(np.array(df_synth_norm))))  # ty: ignore[missing-argument]
+                # Create nearest neighbor index on the synthetic tabular data (torch GPU or sklearn CPU fallback)
+                nn_index = NearestNeighborSearch(n_neighbors=len(df_synth_norm))
+                nn_index.fit(np.ascontiguousarray(np.array(df_synth_norm)).astype(np.float32))
             else:
                 df_train_norm = pd.DataFrame()
                 df_test_norm = pd.DataFrame()
                 df_synth_norm = pd.DataFrame()
-                index = None
+                nn_index = None
 
             # Create embeddings for text fields and combine the normalized tabular and the
             # new text embeddings into one dataframe.
@@ -588,7 +574,7 @@ def mia(
                     df_train_norm,
                     df_test_norm,
                     df_synth_norm,
-                    index,
+                    nn_index,
                     i,
                     text_cnt,
                     tabular_cnt,