NVIDIA-NeMo
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 3 additions & 4 deletions
diff --git a/pytest.ini b/pytest.ini
Lines changed: 1 addition & 0 deletions
diff --git a/src/nemo_safe_synthesizer/evaluation/components/attribute_inference_protection.py b/src/nemo_safe_synthesizer/evaluation/components/attribute_inference_protection.py
Lines changed: 14 additions & 34 deletions
diff --git a/src/nemo_safe_synthesizer/evaluation/components/membership_inference_protection.py b/src/nemo_safe_synthesizer/evaluation/components/membership_inference_protection.py
Lines changed: 17 additions & 31 deletions
@@ -98,6 +98,7 @@ engine = [
     "pandas>=2.1.3, <3",
     "plotly",
     "ratelimit",
+    "scikit-learn",
     "range_regex>=0.1.0",
     "tenacity==9.1.4",
     "tiktoken>=0.7.0,<1.0",
@@ -119,7 +120,6 @@ cpu = [
   "bitsandbytes==0.49.1",
   "flashinfer-python==0.6.1; sys_platform=='linux'",
   "flashinfer-cubin==0.6.1; sys_platform=='linux'",
-  "faiss-cpu==1.13.2",
   "gliner",
   "kernels>=0.12.1",
   "peft",
@@ -141,7 +141,6 @@ cpu = [
 cu128 = [
   "accelerate",
   "bitsandbytes==0.49.1",
-  "faiss-gpu-cu12==1.13.2; sys_platform == 'linux'",
   "flashinfer-python==0.6.1; sys_platform == 'linux'",
   "flashinfer-cubin==0.6.1; sys_platform == 'linux'",
   "flashinfer-jit-cache==0.6.1+cu128; sys_platform == 'linux'",
@@ -225,10 +224,9 @@ constraint-dependencies = ["torch==2.9.1", "regex==2025.07.34", "pandas<3"]
   flashinfer-jit-cache = [
     { index = "flashinfer-jit-cache", marker = "sys_platform=='linux'", extra="cu128"},
   ]
-   nvidia-cublas-cu12 = [
+  nvidia-cublas-cu12 = [
     { index = "pytorch-cu128" },
   ]
-
   nvidia-cuda-cupti-cu12 = [
     { index = "pytorch-cu128" },
   ]
@@ -297,6 +295,7 @@ name = "nvidia-pypi-public"
 url = "https://pypi.nvidia.com"
 explicit = true
 
+
 [build-system]
 requires = ["hatchling", "uv-dynamic-versioning"]
 build-backend = "hatchling.build"
 
@@ -22,6 +22,7 @@ markers =
     smollm2: SmolLM2 Hub download tests (used by Makefile for process isolation)
     unsloth: Unsloth backend tests (process-isolated from DP tests)
     noautouse: Marker to skip autouse fixtures for specific tests
+    benchmark: Test the performance of the code
 
 # Note: Unit tests (testing single classes/functions with no infrastructure dependencies)
 # do not need markers and are the default test type.
 
@@ -27,16 +27,9 @@
 from ..components.component import Component
 from ..data_model.evaluation_dataset import EvaluationDataset
 from ..data_model.evaluation_score import EvaluationScore, PrivacyGrade
+from ..nearest_neighbors import NearestNeighborSearch
 from . import multi_modal_figures as figures
 
-faiss_available = False
-try:
-    import faiss
-
-    faiss_available = True
-except (ImportError, ModuleNotFoundError):
-    pass
-
 logger = get_logger(__name__)
 
 
@@ -77,10 +70,6 @@ def from_evaluation_dataset(
         evaluation_dataset: EvaluationDataset, config: SafeSynthesizerParameters | None = None
     ) -> AttributeInferenceProtection:
         """Run the attribute inference attack and return the protection score."""
-        if not faiss_available:
-            logger.info("FAISS is not available, skipping Attribute Inference Attack.")
-            return AttributeInferenceProtection(score=EvaluationScore())
-
         quasi_identifier_count = config.evaluation.quasi_identifier_count if config else QUASI_IDENTIFIER_COUNT
 
         score, col_accuracy_df = AttributeInferenceProtection._aia(
@@ -276,23 +265,17 @@ def _get_synth_nn(
                     df_train_use, df_synth_use
                 )
 
-        # If all tabular, just use FAISS to get NN
+        # If all tabular, use nearest neighbor search (torch CUDA or sklearn CPU fallback)
         if len(text_columns) == 0:
-            # Create the faiss index on the synthetic data
-            dim = df_synth_norm.shape[1]
-            index = faiss.IndexFlatL2(dim)  # ty: ignore[possibly-unbound-attribute]
-
-            # This usage matches documentation. Specifying n= and x= parameters as
-            # the type annotation for IndexFlatL2.add suggests seems unnecessary, possibly related
-            # to swig handling that ty is not aware of. Similar for other faiss calls below
-            # which are using swig-generated code.
-            index.add(np.float32(np.ascontiguousarray(np.array(df_synth_norm))))  # ty: ignore[missing-argument]
+            # Create the nearest neighbors index on the synthetic data
+            nn = NearestNeighborSearch(n_neighbors=k)
+            nn.fit(np.ascontiguousarray(np.array(df_synth_norm)).astype(np.float32))
 
             # Get nearest neighbors to this attack record
-            _, indexes = index.search(np.float32(np.ascontiguousarray(np.array(df_train_norm))), k)  # ty: ignore[missing-argument]
+            _, indexes = nn.kneighbors(np.ascontiguousarray(np.array(df_train_norm)).astype(np.float32))
             synth_rows = pd.DataFrame()
-            for index in indexes:
-                synth_rows = pd.concat([synth_rows, df_synth.iloc[index].copy()])
+            for idx_row in indexes:
+                synth_rows = pd.concat([synth_rows, df_synth.iloc[idx_row].copy()])
             return synth_rows
 
         # If all text, just use Sentence Transformer to get NN
@@ -339,23 +322,20 @@ def _get_synth_nn(
             corpus_ids.append(corpus_id)
             synth_NN = pd.concat([synth_NN, pd.DataFrame([df_synth_norm.iloc[int(corpus_id)]])], ignore_index=True)
 
-        # Now get the tabular similarity for these 1000 NN
-
-        dim = synth_NN.shape[1]
-        index = faiss.IndexFlatL2(dim)  # ty: ignore[possibly-unbound-attribute]
-        index.add(np.float32(np.ascontiguousarray(np.array(synth_NN))))  # ty: ignore[missing-argument]
-        dists, indexes = index.search(np.float32(np.ascontiguousarray(np.array(df_train_norm))), search_synth_k)  # ty: ignore[missing-argument]
+        # Now get the tabular similarity for these 1000 NN using nearest neighbor search
+        nn = NearestNeighborSearch(n_neighbors=search_synth_k)
+        nn.fit(np.ascontiguousarray(np.array(synth_NN)).astype(np.float32))
+        dists, indexes = nn.kneighbors(np.ascontiguousarray(np.array(df_train_norm)).astype(np.float32))
         # Scale the Euclidean distance to [0,1]
-        dists = np.sqrt(dists)
         max_dist = np.amax(dists)
         if max_dist > 0:
             dist_scaled = dists / max_dist
         else:
             dist_scaled = dists
         tab_dist = {}
         for i in range(search_synth_k):
-            index = indexes[0][i]
-            tab_dist[index] = dist_scaled[0][i]
+            idx = indexes[0][i]
+            tab_dist[idx] = dist_scaled[0][i]
 
         # Now get the hybrid distance
 
 
@@ -21,18 +21,10 @@
 from ...evaluation.components.component import Component
 from ...evaluation.data_model.evaluation_dataset import EvaluationDataset
 from ...evaluation.data_model.evaluation_score import EvaluationScore, PrivacyGrade
+from ...evaluation.nearest_neighbors import NearestNeighborSearch
 from ...observability import get_logger
 from . import multi_modal_figures as figures
 
-faiss_available = False
-try:
-    import faiss
-
-    faiss_available = True
-except (ImportError, ModuleNotFoundError):
-    pass
-
-
 logger = get_logger(__name__)
 
 
@@ -79,9 +71,6 @@ def from_evaluation_dataset(
         evaluation_dataset: EvaluationDataset, config: SafeSynthesizerParameters | None = None
     ) -> MembershipInferenceProtection:
         """Run the membership inference attack and return the protection score."""
-        if not faiss_available:
-            return MembershipInferenceProtection(score=EvaluationScore())
-
         score, attack_sum_df, tps_values, fps_values = MembershipInferenceProtection.mia(
             df_train=evaluation_dataset.reference,
             df_synth=evaluation_dataset.output,
@@ -249,7 +238,7 @@ def _compute_mia(
         df_train_norm: pd.DataFrame,
         df_test_norm: pd.DataFrame,
         df_synth_norm: pd.DataFrame,
-        index: faiss.IndexFlatL2 | None,  # ty: ignore[possibly-unbound-attribute]
+        nn_index: NearestNeighborSearch | None,
         run: int,
         text_cnt: int,
         tabular_cnt: int,
@@ -263,14 +252,14 @@ def _compute_mia(
 
         Builds an attack dataset from a slice of training rows mixed with
         test rows, computes nearest-neighbor distances to the synthetic
-        data (text via semantic search, tabular via FAISS L2), and
+        data (text via semantic search, tabular via L2 nearest neighbor), and
         classifies each record as member or non-member.
 
         Args:
             df_train_norm: Normalized training dataframe.
             df_test_norm: Normalized holdout (test) dataframe.
             df_synth_norm: Normalized synthetic dataframe.
-            index: Pre-built FAISS L2 index over the tabular columns of
+            nn_index: Pre-built NearestNeighborSearch index over the tabular columns of
                 the synthetic data, or ``None`` if no tabular columns exist.
             run: Zero-based run index controlling which training slice to use.
             text_cnt: Number of text columns in the dataset.
@@ -334,18 +323,16 @@ def _compute_mia(
                 attacker_data_tabular = real_data.copy()
                 k = 1
 
-            if index is None:
-                raise RuntimeError("faiss index not provided for MIA calculation when expected.")
+            if nn_index is None:
+                raise RuntimeError("Nearest neighbor index not provided for MIA calculation when expected.")
 
-            # This usage matches documentation despite type annotation for
-            # IndexFlatL2.search, possibly related to swig handling that ty is
-            # not aware of. Similar for other calls for faiss indexes.
-            dists, indices = index.search(
-                np.float32(np.ascontiguousarray(np.array(attacker_data_tabular))),
-                len(df_synth_norm),
-            )  # ty: ignore[missing-argument]
+            # Use nearest neighbor search (torch GPU or sklearn CPU fallback) for distance calculation
+            dists, indices = nn_index.kneighbors(
+                np.ascontiguousarray(np.array(attacker_data_tabular)).astype(np.float32),
+                n_neighbors=int(k),
+            )
             # Scale the Euclidean distance to [0,1]
-            dists = np.sqrt(dists)
+            # NearestNeighborSearch.kneighbors() returns L2 distance directly, not squared
             max_dist = np.amax(dists)
             if max_dist > 0:
                 dist_scaled = dists / max_dist
@@ -555,15 +542,14 @@ def mia(
                     df_train_norm, df_test_norm, df_synth_norm = MembershipInferenceProtection._normalize_onehot(
                         df_train_use, df_test, df_synth
                     )
-                # Create the faiss index on the synthetic tabular data
-                dim = df_synth_norm.shape[1]
-                index = faiss.IndexFlatL2(dim)  # ty: ignore[possibly-unbound-attribute]
-                index.add(np.float32(np.ascontiguousarray(np.array(df_synth_norm))))  # ty: ignore[missing-argument]
+                # Create nearest neighbor index on the synthetic tabular data (torch GPU or sklearn CPU fallback)
+                nn_index = NearestNeighborSearch(n_neighbors=len(df_synth_norm))
+                nn_index.fit(np.ascontiguousarray(np.array(df_synth_norm)).astype(np.float32))
             else:
                 df_train_norm = pd.DataFrame()
                 df_test_norm = pd.DataFrame()
                 df_synth_norm = pd.DataFrame()
-                index = None
+                nn_index = None
 
             # Create embeddings for text fields and combine the normalized tabular and the
             # new text embeddings into one dataframe.
@@ -588,7 +574,7 @@ def mia(
                     df_train_norm,
                     df_test_norm,
                     df_synth_norm,
-                    index,
+                    nn_index,
                     i,
                     text_cnt,
                     tabular_cnt,