Skip to content

Commit d0d08fa

Browse files
authored
Merge pull request #65 from m-misiura/fix_tests_hf_runtime
Fixing Tier 1 - Hugging Face Runtime unit tests
2 parents 2190e36 + 81f7219 commit d0d08fa

File tree

7 files changed

+82
-81
lines changed

7 files changed

+82
-81
lines changed

.github/workflows/test-huggingface-runtime.yaml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,16 @@ jobs:
7474
HF_HOME: /tmp/huggingface
7575
TRANSFORMERS_CACHE: /tmp/transformers_cache
7676
TOKENIZERS_PARALLELISM: false
77+
MODEL_DIR: tests/dummy_models/bert/BertForSequenceClassification
7778
run: |
7879
python -c "
7980
try:
80-
from detectors.huggingface.detector import HuggingFaceDetector
81-
print('HuggingFaceDetector import successful')
81+
from detectors.huggingface.detector import Detector
82+
print('Detector import successful')
8283
8384
# Test basic initialization
84-
detector = HuggingFaceDetector()
85-
print('HuggingFaceDetector initialization successful')
85+
detector = Detector()
86+
print('Detector initialization successful')
8687
except Exception as e:
8788
print(f'Error testing HF detector: {e}')
8889
exit(1)

detectors/common/requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ pre-commit==3.8.0
44
pytest==8.3.2
55
tls-test-tools
66
protobuf==6.33.0
7+
torch==2.9.0

tests/detectors/huggingface/test_method_initialize_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import pytest
44

55
# local imports
6-
from detectors.huggingface.scheme import ContentAnalysisResponse
6+
from detectors.common.scheme import ContentAnalysisResponse
77
from detectors.huggingface.detector import Detector
88

99

tests/detectors/huggingface/test_method_process_causal_lm.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,9 @@ def validate_results(self, results, input_text, detector):
6060
"detection",
6161
"detection_type",
6262
"score",
63-
"sequence_classification",
64-
"sequence_probability",
65-
"token_classifications",
66-
"token_probabilities",
6763
"text",
6864
"evidences",
65+
"metadata",
6966
]
7067

7168
for field in expected_fields:
@@ -79,16 +76,12 @@ def validate_results(self, results, input_text, detector):
7976
assert isinstance(result.detection, str)
8077
assert isinstance(result.detection_type, str)
8178
assert isinstance(result.score, float)
82-
assert isinstance(result.sequence_classification, str)
83-
assert isinstance(result.sequence_probability, float)
8479
assert isinstance(result.text, str)
8580
assert isinstance(result.evidences, list)
8681

8782
assert 0 <= result.start <= len(input_text)
8883
assert 0 <= result.end <= len(input_text)
8984
assert 0.0 <= result.score <= 1.0
90-
assert 0.0 <= result.sequence_probability <= 1.0
91-
assert result.sequence_classification in detector.risk_names
9285

9386
def test_process_causal_lm_single_short_input(self, detector_instance):
9487
text = "This is a test."

tests/detectors/huggingface/test_method_process_sequence_classification.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,9 @@ def validate_results(self, results, input_text):
4040
"detection",
4141
"detection_type",
4242
"score",
43-
"sequence_classification",
44-
"sequence_probability",
45-
"token_classifications",
46-
"token_probabilities",
4743
"text",
4844
"evidences",
45+
"metadata",
4946
]
5047

5148
for field in expected_fields:
@@ -59,12 +56,6 @@ def validate_results(self, results, input_text):
5956
assert isinstance(result.detection, str), "detection should be string"
6057
assert isinstance(result.detection_type, str), "detection_type should be string"
6158
assert isinstance(result.score, float), "score should be float"
62-
assert isinstance(
63-
result.sequence_classification, str
64-
), "sequence_classification should be string"
65-
assert isinstance(
66-
result.sequence_probability, float
67-
), "sequence_probability should be float"
6859
assert isinstance(result.text, str), "text should be string"
6960
assert isinstance(result.evidences, list), "evidences should be list"
7061

@@ -73,9 +64,6 @@ def validate_results(self, results, input_text):
7364
), "start should be within text bounds"
7465
assert 0 <= result.end <= len(input_text), "end should be within text bounds"
7566
assert 0.0 <= result.score <= 1.0, "score should be between 0 and 1"
76-
assert (
77-
0.0 <= result.sequence_probability <= 1.0
78-
), "sequence_probability should be between 0 and 1"
7967

8068
return result
8169

tests/detectors/huggingface/test_method_run.py

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
from unittest.mock import Mock, patch
66

77
# relative imports
8-
from detectors.huggingface.detector import Detector, ContentAnalysisResponse
9-
from scheme import ContentAnalysisHttpRequest
8+
from detectors.huggingface.detector import Detector
9+
from detectors.common.scheme import ContentAnalysisResponse, ContentAnalysisHttpRequest
1010

1111

1212
@pytest.fixture
@@ -60,58 +60,63 @@ def detector_causal_lm(self):
6060
detector.is_causal_lm = True
6161
detector.is_sequence_classifier = False
6262
detector.risk_names = ["harm", "bias"]
63+
detector.function_name = "test_causal_lm"
64+
detector.instruments = {} # Initialize empty instruments dict
6365

6466
return detector
6567

6668
def test_run_sequence_classifier_single_short_input(self, detector_sequence):
67-
request = ContentAnalysisHttpRequest(contents=["Test content"])
69+
request = ContentAnalysisHttpRequest(contents=["Test content"], detector_params=None)
6870
results = detector_sequence.run(request)
6971

7072
assert len(results) == 1
7173
assert isinstance(results[0][0], ContentAnalysisResponse)
72-
assert results[0][0].detection_type == "sequence_classification"
74+
# detection_type is the label from the model (e.g., "LABEL_1", not "sequence_classification")
75+
assert results[0][0].detection_type in detector_sequence.model.config.id2label.values()
7376

7477
def test_run_sequence_classifier_single_long_input(self, detector_sequence):
7578
request = ContentAnalysisHttpRequest(
7679
contents=[
7780
"This is a long content. " * 1_000,
78-
]
81+
],
82+
detector_params=None
7983
)
8084
results = detector_sequence.run(request)
8185

8286
assert len(results) == 1
8387
assert isinstance(results[0][0], ContentAnalysisResponse)
84-
assert results[0][0].detection_type == "sequence_classification"
88+
assert results[0][0].detection_type in detector_sequence.model.config.id2label.values()
8589

8690
def test_run_sequence_classifier_empty_input(self, detector_sequence):
87-
request = ContentAnalysisHttpRequest(contents=[""])
91+
request = ContentAnalysisHttpRequest(contents=[""], detector_params=None)
8892
results = detector_sequence.run(request)
8993

9094
assert len(results) == 1
9195
assert isinstance(results[0][0], ContentAnalysisResponse)
92-
assert results[0][0].detection_type == "sequence_classification"
96+
assert results[0][0].detection_type in detector_sequence.model.config.id2label.values()
9397

9498
def test_run_sequence_classifier_multiple_contents(self, detector_sequence):
95-
request = ContentAnalysisHttpRequest(contents=["Content 1", "Content 2"])
99+
request = ContentAnalysisHttpRequest(contents=["Content 1", "Content 2"], detector_params=None)
96100
results = detector_sequence.run(request)
97101

98102
assert len(results) == 2
99103
for content_analysis in results:
100104
assert len(content_analysis) == 1
101105
assert isinstance(content_analysis[0], ContentAnalysisResponse)
102-
assert content_analysis[0].detection_type == "sequence_classification"
106+
assert content_analysis[0].detection_type in detector_sequence.model.config.id2label.values()
103107

104108
def test_run_unsupported_model(self):
105109
detector = Detector.__new__(Detector)
106110
detector.is_causal_lm = False
107111
detector.is_sequence_classifier = False
112+
detector.function_name = "test_detector"
108113

109-
request = ContentAnalysisHttpRequest(contents=["Test content"])
114+
request = ContentAnalysisHttpRequest(contents=["Test content"], detector_params=None)
110115
with pytest.raises(ValueError, match="Unsupported model type for analysis"):
111116
detector.run(request)
112117

113118
def test_run_causal_lm_single_short_input(self, detector_causal_lm):
114-
request = ContentAnalysisHttpRequest(contents=["Test content"])
119+
request = ContentAnalysisHttpRequest(contents=["Test content"], detector_params=None)
115120
results = detector_causal_lm.run(request)
116121

117122
assert len(results) == 1
@@ -122,7 +127,8 @@ def test_run_causal_lm_single_long_input(self, detector_causal_lm):
122127
request = ContentAnalysisHttpRequest(
123128
contents=[
124129
"This is a long content. " * 1_000,
125-
]
130+
],
131+
detector_params=None
126132
)
127133
results = detector_causal_lm.run(request)
128134

@@ -131,15 +137,15 @@ def test_run_causal_lm_single_long_input(self, detector_causal_lm):
131137
assert results[0][0].detection_type == "causal_lm"
132138

133139
def test_run_causal_lm_empty_input(self, detector_causal_lm):
134-
request = ContentAnalysisHttpRequest(contents=[""])
140+
request = ContentAnalysisHttpRequest(contents=[""], detector_params=None)
135141
results = detector_causal_lm.run(request)
136142

137143
assert len(results) == 1
138144
assert isinstance(results[0][0], ContentAnalysisResponse)
139145
assert results[0][0].detection_type == "causal_lm"
140146

141147
def test_run_causal_lm_multiple_contents(self, detector_causal_lm):
142-
request = ContentAnalysisHttpRequest(contents=["Content 1", "Content 2"])
148+
request = ContentAnalysisHttpRequest(contents=["Content 1", "Content 2"], detector_params=None)
143149
results = detector_causal_lm.run(request)
144150

145151
assert len(results) == 2

tests/detectors/huggingface/test_metrics.py

Lines changed: 52 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pytest
66
import torch
77
from starlette.testclient import TestClient
8+
from prometheus_client import REGISTRY
89

910
# DO NOT IMPORT THIS VALUE, if we import common.app before the test fixtures we can break prometheus multiprocessing
1011
METRIC_PREFIX = "trustyai_guardrails"
@@ -25,8 +26,10 @@ def send_request(client: TestClient, detect: bool, slow: bool = False):
2526

2627

2728
def get_metric_dict(client: TestClient):
28-
metrics = client.get("/metrics")
29-
metrics = metrics.content.decode().split("\n")
29+
# In test mode with TestClient, we're running in a single process,
30+
# so multiprocess mode doesn't work. Use the default REGISTRY directly.
31+
from prometheus_client import generate_latest, REGISTRY
32+
metrics = generate_latest(REGISTRY).decode().split("\n")
3033
metric_dict = {}
3134

3235
for m in metrics:
@@ -36,45 +39,54 @@ def get_metric_dict(client: TestClient):
3639

3740
return metric_dict
3841

42+
@pytest.fixture(scope="session")
43+
def client(prometheus_multiproc_dir):
44+
# Clear any existing metrics from the REGISTRY before importing the app
45+
# This is needed because even in multiprocess mode, metrics are registered to REGISTRY
46+
collectors_to_unregister = [
47+
c for c in list(REGISTRY._collector_to_names.keys())
48+
if hasattr(c, '_name') and 'trustyai_guardrails' in c._name
49+
]
50+
for collector in collectors_to_unregister:
51+
try:
52+
REGISTRY.unregister(collector)
53+
except Exception:
54+
pass
55+
56+
current_dir = os.path.dirname(__file__)
57+
parent_dir = os.path.dirname(os.path.dirname(current_dir))
58+
os.environ["MODEL_DIR"] = os.path.join(parent_dir, "dummy_models", "bert/BertForSequenceClassification")
59+
60+
from detectors.huggingface.app import app
61+
from detectors.huggingface.detector import Detector
62+
detector = Detector()
63+
64+
# patch the model to allow for control over detections - long messages will flag
65+
def detection_fn(*args, **kwargs):
66+
output = Mock()
67+
if kwargs["input_ids"].shape[-1] > 10:
68+
output.logits = torch.tensor([[0.0, 1.0]])
69+
else:
70+
output.logits = torch.tensor([[1.0, 0.0]])
71+
72+
if kwargs["input_ids"].shape[-1] > 100:
73+
time.sleep(.25)
74+
return output
75+
76+
class ModelMock:
77+
def __init__(self):
78+
self.config = Mock()
79+
self.config.id2label = detector.model.config.id2label
80+
self.config.problem_type = detector.model.config.problem_type
81+
def __call__(self, *args, **kwargs):
82+
return detection_fn(*args, **kwargs)
83+
84+
detector.model = ModelMock()
85+
app.set_detector(detector, detector.registry_name)
86+
detector.set_instruments(app.state.instruments)
87+
return TestClient(app)
88+
3989
class TestMetrics:
40-
@pytest.fixture
41-
def client(self):
42-
current_dir = os.path.dirname(__file__)
43-
parent_dir = os.path.dirname(os.path.dirname(current_dir))
44-
os.environ["MODEL_DIR"] = os.path.join(parent_dir, "dummy_models", "bert/BertForSequenceClassification")
45-
46-
from detectors.huggingface.app import app
47-
# clear the metric registry at the start of each test, but AFTER the multiprocessing metrics is set up
48-
import prometheus_client
49-
prometheus_client.REGISTRY._names_to_collectors.clear()
50-
51-
from detectors.huggingface.detector import Detector
52-
detector = Detector()
53-
54-
# patch the model to allow for control over detections - long messages will flag
55-
def detection_fn(*args, **kwargs):
56-
output = Mock()
57-
if kwargs["input_ids"].shape[-1] > 10:
58-
output.logits = torch.tensor([[0.0, 1.0]])
59-
else:
60-
output.logits = torch.tensor([[1.0, 0.0]])
61-
62-
if kwargs["input_ids"].shape[-1] > 100:
63-
time.sleep(.25)
64-
return output
65-
66-
class ModelMock:
67-
def __init__(self):
68-
self.config = Mock()
69-
self.config.id2label = detector.model.config.id2label
70-
self.config.problem_type = detector.model.config.problem_type
71-
def __call__(self, *args, **kwargs):
72-
return detection_fn(*args, **kwargs)
73-
74-
detector.model = ModelMock()
75-
app.set_detector(detector, detector.registry_name)
76-
detector.set_instruments(app.state.instruments)
77-
return TestClient(app)
7890

7991

8092

0 commit comments

Comments (0)