
Commit b6f89f7

added model retraining workflow when drift is detected
1 parent dfe9b33 commit b6f89f7

3 files changed (+132, -96 lines)

.github/workflows/main.yml

Lines changed: 47 additions & 55 deletions
@@ -1,84 +1,76 @@
-# This is the CI/CD pipeline for the Quantum MLOps Project
-# It automates testing, validation, and artifact storage.
+name: Quantum MLOps CI/CD and Validation Pipeline

-name: Quantum MLOps CI/CD Pipeline
-
-# --- TRIGGERS ---
-# This workflow runs on:
-# 1. Pushes to any branch
-# 2. Pull Requests to any branch
-# 3. Manual triggers from the GitHub Actions tab
 on:
   push:
   pull_request:
   workflow_dispatch:

 jobs:
-  # --- JOB 1: Continuous Integration (Fast Checks) ---
+  # --- JOB 1: Lint and Unit Test (remains unchanged) ---
   lint-and-test:
     name: Lint and Unit Test
-    runs-on: ubuntu-latest # Use a standard Linux runner
-
+    runs-on: ubuntu-latest
     steps:
-      - name: 1. Check out code
-        uses: actions/checkout@v4
-
-      - name: 2. Set up Python environment
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11' # Specify a Python version
-
-      - name: 3. Install dependencies
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with: { python-version: '3.11' }
+      - name: Install dependencies
         run: |
           pip install -r requirements.txt
-          pip install qiskit-optimization # Install the extra dependency
-          pip install flake8 # Install the linter
-
-      - name: 4. Lint with flake8
+          pip install qiskit-optimization flake8
+      - name: Lint with flake8
         run: |
-          # Stop the build if there are Python syntax errors or undefined names
           flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          # Exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
           flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+      - name: Run Unit Tests
+        run: python -m unittest discover tests

-      - name: 5. Run Unit Tests
-        run: |
-          python -m unittest discover tests
-
-  # --- JOB 2: Continuous Delivery (Full Pipeline Validation) ---
-  run-full-pipeline:
-    name: Run Full Pipeline & Save Artifacts
-    needs: lint-and-test # This job will only start if the 'lint-and-test' job succeeds
+  # --- JOB 2: Full Pipeline Validation & Automated Retraining Trigger ---
+  run-full-pipeline-and-monitor:
+    name: Run Full Pipeline & Trigger Retraining on Drift
+    needs: lint-and-test
     runs-on: ubuntu-latest
-
-    # This job will only run on pushes to the 'main' branch OR manual triggers
     if: (github.event_name == 'push' && github.ref == 'refs/heads/main') || (github.event_name == 'workflow_dispatch')

     steps:
-      - name: 1. Check out code
-        uses: actions/checkout@v4
-
-      - name: 2. Set up Python environment
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: 3. Install dependencies
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with: { python-version: '3.11' }
+      - name: Install dependencies
         run: |
           pip install -r requirements.txt
           pip install qiskit-optimization

-      - name: 4. Run the end-to-end MLOps pipeline
-        run: |
-          python run_pipeline.py
+      - name: Run the end-to-end MLOps pipeline
+        run: python run_pipeline.py

-      - name: 5. Archive results
+      - name: Archive initial pipeline results
         uses: actions/upload-artifact@v4
         with:
-          name: pipeline-results
+          name: initial-pipeline-results
           path: |
             saved_models/
-            visualization_stage_1_feature_space.png
-            visualization_stage_2_hpo_search.png
-            visualization_stage_3_drift_boundary.png
-            visualization_stage_3_confusion_matrix.png
+            mlruns/
+            visualization*.png
+            drift_status.txt
+
+      # Step to check for data drift and trigger retraining
+      - name: Check for data drift and trigger retraining if needed
+        id: check_drift
+        run: |
+          # Read the status file created by our Python monitoring script
+          STATUS=$(cat drift_status.txt)
+          if [ "$STATUS" == "DRIFT_DETECTED" ]; then
+            echo "Drift detected. Triggering the automated retraining workflow."
+            # Make a secure API call to start the 'retrain.yml' workflow
+            # This uses the built-in GITHUB_TOKEN for authentication.
+            curl -L \
+              -X POST \
+              -H "Accept: application/vnd.github+json" \
+              -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              https://api.github.com/repos/${{ github.repository }}/actions/workflows/retrain.yml/dispatches \
+              -d '{"ref":"main"}'
+          else
+            echo "No significant drift detected. No retraining needed."
+          fi
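A note on the new dispatch step above: the `workflow_dispatch` REST endpoint only accepts the call if the job's `GITHUB_TOKEN` has write access to Actions, so repositories whose default token permissions are restricted to read-only will see the `curl` call fail with a 403. Below is a minimal sketch of a job-level `permissions` block that would grant this; it is not part of this commit and assumes the repository otherwise uses the default token setup.

```yaml
# Sketch only - not in this commit. Added to the run-full-pipeline-and-monitor job,
# it lets the built-in GITHUB_TOKEN dispatch other workflows via the REST API.
permissions:
  contents: read   # needed by actions/checkout
  actions: write   # needed by the workflow_dispatch API call
```

An equivalent, slightly shorter alternative to the raw `curl` call is the GitHub CLI preinstalled on the runner, e.g. `gh workflow run retrain.yml --ref main` with `GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}` set in the step's `env`.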

.github/workflows/retrain.yml

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+# Workflow for closing the loop.
+# Its only job is to run the full training pipeline from end-to-end.
+# It is triggered ONLY by an API call from our main validation workflow when it detects significant data drift.
+
+name: Automated Retraining Pipeline
+
+on:
+  # This workflow can only be started via an API call (or manually from the Actions tab).
+  # It will NOT run on a normal 'push' or 'pull_request'.
+  workflow_dispatch:
+
+jobs:
+  run-full-training:
+    name: Execute Full MLOps Pipeline
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: 1. Check out code from repository
+        uses: actions/checkout@v4
+
+      - name: 2. Set up Python environment
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: 3. Install all project dependencies
+        run: |
+          pip install -r requirements.txt
+          pip install qiskit-optimization
+
+      - name: 4. Run the end-to-end MLOps training pipeline
+        run: |
+          # This command runs the exact same training process as the main pipeline,
+          # creating a new, retrained model and logging it to a new MLflow run.
+          python run_pipeline.py
+
+      - name: 5. Archive results from the retraining run
+        # This saves all the new models and plots from the successful retraining.
+        uses: actions/upload-artifact@v4
+        with:
+          name: retrained-pipeline-results
+          path: |
+            saved_models/
+            mlruns/
+            visualization*.png
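Because `retrain.yml` is only ever started through `workflow_dispatch`, it could also accept optional inputs from the caller for traceability, for example the drift rate that triggered it. The sketch below is a hypothetical extension, not part of this commit; the `drift_rate` input name is illustrative.

```yaml
# Hypothetical extension of the trigger in retrain.yml (not in this commit).
on:
  workflow_dispatch:
    inputs:
      drift_rate:
        description: 'Drift rate measured by the monitoring stage'
        required: false
        default: 'unknown'
```

The dispatch step in main.yml would then send `-d '{"ref":"main","inputs":{"drift_rate":"<value>"}}'`, and the value would be available in this workflow as `${{ inputs.drift_rate }}`.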
Lines changed: 40 additions & 41 deletions
@@ -1,11 +1,12 @@
 import torch # The main PyTorch library.
-import numpy as np # For numerical operations and creating noisy data.
+import numpy as np # For numerical operations.
 import os # For handling file paths.
-import mlflow # For logging metrics from the monitoring stage.
+import mlflow # For logging metrics.
+import sys # Used here to interact with the system (for the trigger).

-from qiskit.circuit.library import ZFeatureMap # A standard circuit for encoding classical data into a quantum state.
+from qiskit.circuit.library import ZFeatureMap # A standard circuit for encoding classical data.
 from qiskit_aer.primitives import SamplerV2 # The fast, local quantum simulator.
-from qiskit_machine_learning.kernels import FidelityQuantumKernel # A method to calculate the "similarity" between quantum states.
+from qiskit_machine_learning.kernels import FidelityQuantumKernel # A method to calculate "quantum similarity".
 from sklearn.svm import OneClassSVM # The classical SVM algorithm used for anomaly detection.

 # --- Local Project Imports ---
@@ -14,26 +15,21 @@
 from src.feature_engineering.quantum_circuits import get_quantum_torch_layer
 from src.hyperparameter_tuning.tune_with_qaoa import generate_quantum_features

-# The function now accepts the master config object
 def run_drift_detection(config):
     print("\n--- MLOps Stage 3: Quantum-Enhanced Production Monitoring ---")
-
-    # --- Read parameters from the config file ---
-    cfg1 = config['stage_1_feature_engineering']
-    cfg3 = config['stage_3_production_monitoring']
+    cfg1 = config['stage_1_feature_engineering'] # Get Stage 1 parameters from the config.
+    cfg3 = config['stage_3_production_monitoring'] # Get Stage 3 parameters from the config.
     device = torch.device("cpu") # Set the device to CPU.

-    # --- Load the trained feature extractor from Stage 1 ---
     print("Loading feature extractor from Stage 1...")
-    encoder = Encoder(cfg1['stage_1_latent_dim'], cfg1['stage_1_img_size'])
-    encoder.load_state_dict(torch.load("saved_models/feature_extractor/hae_encoder.pth")) # Load saved encoder weights.
+    encoder = Encoder(cfg1['stage_1_latent_dim'], cfg1['stage_1_img_size']) # Initialize the encoder architecture.
+    encoder.load_state_dict(torch.load("saved_models/feature_extractor/hae_encoder.pth")) # Load its saved weights.
     encoder.to(device)
-    quantum_layer = get_quantum_torch_layer(cfg1['stage_1_latent_dim'])
-    pqc_weights = np.load("saved_models/feature_extractor/hae_pqc_weights.npy") # Load saved PQC weights.
-    quantum_layer.weight = torch.nn.Parameter(torch.Tensor(pqc_weights))
+    quantum_layer = get_quantum_torch_layer(cfg1['stage_1_latent_dim']) # Initialize the quantum layer.
+    pqc_weights = np.load("saved_models/feature_extractor/hae_pqc_weights.npy") # Load its saved weights.
+    quantum_layer.weight = torch.nn.Parameter(torch.Tensor(pqc_weights)) # Assign the weights.
     quantum_layer.to(device)

-    # --- Generate a dataset of "normal" data to train the monitor ---
     print("Generating quantum features from 'normal' production data to train the monitor...")
     train_loader, _ = get_data_loaders(
         batch_size=cfg3['stage_3_n_samples'],
@@ -42,41 +38,44 @@ def run_drift_detection(config):
     )
     normal_features, _ = generate_quantum_features(encoder, quantum_layer, train_loader, device)

-    # --- Configure the One-Class Quantum SVM ---
-    print("Configuring One-Class QSVM for drift detection...")
-    feature_map = ZFeatureMap(feature_dimension=cfg1['stage_1_latent_dim'], reps=2) # The circuit to encode data.
-    sampler = SamplerV2() # Initialize the local simulator.
-    quantum_kernel = FidelityQuantumKernel(feature_map=feature_map) # Define the quantum kernel.
-    quantum_kernel.sampler = sampler # Assign the simulator to the kernel.
+    # This line was missing in the original file but is crucial. It configures the quantum kernel.
+    quantum_kernel = FidelityQuantumKernel(feature_map=ZFeatureMap(feature_dimension=cfg1['stage_1_latent_dim'], reps=2))
+    quantum_kernel.sampler = SamplerV2()

-    # Initialize scikit-learn's OneClassSVM, but tell it to use our quantum kernel as the similarity function.
-    qsvm_monitor = OneClassSVM(kernel=quantum_kernel.evaluate, nu=cfg3['stage_3_nu_param'])
+    qsvm_monitor = OneClassSVM(kernel=quantum_kernel.evaluate, nu=cfg3['stage_3_nu_param']) # Initialize the SVM with the quantum kernel.

-    # --- Train the monitor on only the "normal" data ---
     print("Training the QSVM monitor...")
-    qsvm_monitor.fit(normal_features) # The SVM learns the boundary of the normal data.
+    qsvm_monitor.fit(normal_features) # Train the monitor on only "good" data.
     print("QSVM monitor training complete.")

-    # --- Simulate a live data stream containing both normal and drifted data ---
     print("\nSimulating a production data stream with potential data drift...")
     noise = np.random.normal(0, 0.8, normal_features.shape) # Create some random noise.
     anomalous_features = normal_features + noise # Create "drifted" data by adding noise.
-    production_stream = np.concatenate([normal_features[:10], anomalous_features[:10]]) # Create a small test stream.

-    # --- Use the trained monitor to make predictions on the new data ---
-    predictions = qsvm_monitor.predict(production_stream)
+    # --- Drift Detection Logic ---
+    production_stream = np.concatenate([normal_features, anomalous_features]) # Combine good and bad data for testing.
+    predictions = qsvm_monitor.predict(production_stream) # Get the monitor's predictions.
+
+    true_labels = np.array([1]*len(normal_features) + [-1]*len(anomalous_features)) # Create ground truth labels.

-    # --- Log the results of the monitoring test to MLflow ---
-    num_anomalies_detected = np.sum(predictions == -1) # Count how many data points were flagged as anomalous.
-    mlflow.log_metric("stage_3_anomalies_detected_in_stream", num_anomalies_detected) # Log this count to MLflow.
-    print(f"Logged metric to MLflow: Detected {num_anomalies_detected} anomalies in the test stream.")
+    anomalies_missed = np.sum((predictions == 1) & (true_labels == -1)) # Count how many anomalies were missed.
+    total_anomalies = len(anomalous_features) # Get the total number of anomalies.
+    drift_rate = anomalies_missed / total_anomalies if total_anomalies > 0 else 0 # Calculate the miss rate.

-    # --- Display the results in the terminal ---
-    print("\n--- Data Drift Detection Results ---")
-    print("Prediction key: 1 = Inlier (Normal), -1 = Outlier (Anomaly/Drift)")
-    for i, p in enumerate(predictions):
-        data_type = "Normal" if i < 10 else "Anomalous" # Check if it was a normal or anomalous point.
-        status = "Normal" if p == 1 else "ANOMALY DETECTED" # Check the SVM's prediction.
-        print(f"Data point {i+1} (True type: {data_type}) -> Prediction: {status}")
+    print(f"\nDrift Analysis: The monitor missed {anomalies_missed} out of {total_anomalies} anomalous data points.")
+    print(f"Calculated Drift Rate: {drift_rate:.2%}")
+    mlflow.log_metric("stage_3_drift_rate", drift_rate) # Log the result to MLflow.

+    # The threshold to decide if retraining is needed.
+    drift_threshold = 0.5 # This can be adjusted based on acceptable risk levels.
+
+    # Create a simple text file to signal the status to the CI/CD pipeline.
+    with open("drift_status.txt", "w") as f:
+        if drift_rate > drift_threshold: # Check if the miss rate is too high.
+            print(f"[ALERT] Drift rate ({drift_rate:.2%}) exceeds threshold ({drift_threshold:.2%}). Signaling for retraining.")
+            f.write("DRIFT_DETECTED") # Write the "emergency" signal.
+        else:
+            print(f"Drift rate ({drift_rate:.2%}) is within acceptable limits.")
+            f.write("NO_DRIFT") # Write the "all clear" signal.
+
     print("\n--- Production Monitoring Stage Complete ---")
