[feat]: Add common functions for prediction files validation (#26)

vpchung · web-flow · commit 293eb5177dac · 2024-05-17T12:43:44.000-07:00
* add common functions for validating prediction files

* add to docs site

* add function for checking binary values

* add example use case to docs

* use spacy section style to match with synapseclient
diff --git a/cnb_tools/validation_toolkit.py b/cnb_tools/validation_toolkit.py
@@ -0,0 +1,144 @@
+import pandas as pd
+
+
+def check_missing_keys(
+    gold_col: pd.Series, pred_col: pd.Series, verbose: bool = False
+) -> str:
+    """Check for missing keys.
+
+    Tip: Example Use Case
+      There is at least one prediction for every patient / sample / etc.
+
+    Args:
+      gold_col: Dataframe column containing the true keys
+      pred_col: Dataframe column containing the keys to validate
+      verbose: Include list of affected keys in error message
+
+    Returns:
+       An error message, if any (default is an empty string)
+
+    """
+    error = ""
+    missing_ids = gold_col[~gold_col.isin(pred_col)]
+    # Test emptiness, not truthiness: `.any()` would skip falsy IDs (0, "").
+    if not missing_ids.empty:
+        error = f"Found {missing_ids.shape[0]} missing ID(s)"
+        if verbose:
+            error += f": {missing_ids.to_list()}"
+    return error
+
+
+def check_unknown_keys(
+    gold_col: pd.Series, pred_col: pd.Series, verbose: bool = False
+) -> str:
+    """Check for unknown keys.
+
+    Tip: Example Use Case
+      There are no predictions without a corresponding groundtruth value.
+
+    Args:
+      gold_col: Dataframe column containing the true keys
+      pred_col: Dataframe column containing the keys to validate
+      verbose: Include list of affected keys in error message
+
+    Returns:
+       An error message, if any (default is an empty string)
+
+    """
+    error = ""
+    unknown_ids = pred_col[~pred_col.isin(gold_col)]
+    # Test emptiness, not truthiness: `.any()` would skip falsy IDs (0, "").
+    if not unknown_ids.empty:
+        error = f"Found {unknown_ids.shape[0]} unknown ID(s)"
+        if verbose:
+            error += f": {unknown_ids.to_list()}"
+    return error
+
+
+def check_duplicate_keys(pred_col: pd.Series, verbose: bool = False) -> str:
+    """Report keys that occur more than once.
+
+    Tip: Example Use Case
+      There is exactly one prediction for a patient / sample / etc.
+
+    Args:
+      pred_col: Dataframe column containing the keys to validate
+      verbose: Append the list of repeated keys to the error message
+
+    Returns:
+       An error message, or an empty string when every key is unique
+
+    """
+    # Marks every occurrence after the first as a duplicate.
+    is_repeat = pred_col.duplicated()
+    if not is_repeat.any():
+        return ""
+    message = f"Found {is_repeat.sum()} duplicate ID(s)"
+    if verbose:
+        message += f": {pred_col[is_repeat].to_list()}"
+    return message
+
+
+def check_nan_values(pred_col: pd.Series) -> str:
+    """Report how many values in the column are NaN / null.
+
+    Tip: Example Use Case
+      Predictions must not be null / None.
+
+    Args:
+      pred_col: Dataframe column containing the values to validate
+
+    Returns:
+       An error message, or an empty string when no value is missing
+
+    """
+    null_total = pred_col.isna().sum()
+    if not null_total:
+        return ""
+    return f"'{pred_col.name}' column contains {null_total} NaN value(s)."
+
+
+def check_binary_values(
+    pred_col: pd.Series, label1: int = 0, label2: int = 1
+) -> str:
+    """Verify that every value is one of two labels (default: 0 or 1).
+
+    Tip: Example Use Case
+      Predictions can only be 0 (no disease present) or 1 (disease present).
+
+    Args:
+        pred_col: Dataframe column containing the values to validate.
+        label1: First acceptable binary value.
+        label2: Second acceptable binary value.
+
+    Returns:
+        An error message, or an empty string when all values are allowed
+
+    """
+    is_valid = pred_col.isin([label1, label2]).all()
+    message = f"'{pred_col.name}' values should only be {label1} or {label2}."
+    return "" if is_valid else message
+
+
+# pylint: disable=unsupported-binary-operation
+def check_values_range(
+    pred_col: pd.Series, min_val: int | float = 0, max_val: int | float = 1
+) -> str:
+    """Verify that every value lies within [min_val, max_val], inclusive.
+
+    Tip: Example Use Case
+      Predictions must be a probability from 0 (disease not likely) to 1
+      (disease likely).
+
+    Args:
+      pred_col: Dataframe column containing the values to validate
+      min_val: Lower limit of range
+      max_val: Upper limit of range
+
+    Returns:
+       An error message, or an empty string when all values are in range
+
+    """
+    out_of_bounds = (pred_col < min_val) | (pred_col > max_val)
+    message = f"'{pred_col.name}' values should be between [{min_val}, {max_val}]."
+    return message if out_of_bounds.any() else ""
diff --git a/docs/reference/validation-toolkit.md b/docs/reference/validation-toolkit.md
@@ -0,0 +1 @@
+::: cnb_tools.validation_toolkit
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -18,6 +18,7 @@ nav:
     - Contributing: user-guide/contributing.md
   - Reference:
     - CLI: reference/cli.md
+    - Validation Toolkit: reference/validation-toolkit.md
   - Changelog: 
     - changelog/release-notes.md
     - How to upgrade: changelog/upgrade.md
@@ -54,7 +55,11 @@ theme:
 
 plugins:
 - search
-- mkdocstrings
+- mkdocstrings:
+    handlers:
+      python:
+        options:
+          docstring_section_style: spacy
 - autorefs
 - termynal:
     prompt_literal_start: