Skip to content

Commit 293eb51

Browse files
authored
[feat]: Add common functions for prediction files validation (#26)
* add common functions for validating prediction files
* add to docs site
* add function for checking binary values
* add example use case to docs
* use spacy section style to match with synapseclient
1 parent d67d819 commit 293eb51

File tree

3 files changed

+151
-1
lines changed

3 files changed

+151
-1
lines changed

cnb_tools/validation_toolkit.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
import pandas as pd
2+
3+
4+
def check_missing_keys(
    gold_col: pd.Series, pred_col: pd.Series, verbose: bool = False
) -> str:
    """Check for missing keys.

    Tip: Example Use Case
        There is at least one prediction for every patient / sample / etc.

    Args:
        gold_col: Dataframe column containing the true keys
        pred_col: Dataframe column containing the keys to validate
        verbose: Include list of affected keys in error message

    Returns:
        An error message, if any (default is an empty string)

    """
    error = ""
    # Keys that appear in the gold standard but not in the predictions.
    missing_ids = gold_col[~gold_col.isin(pred_col)]
    # Test for emptiness rather than `.any()`: `.any()` evaluates the
    # truthiness of the IDs themselves, so falsy missing keys such as 0 or
    # "" would otherwise go unreported.
    if not missing_ids.empty:
        error = f"Found {missing_ids.shape[0]} missing ID(s)"

        if verbose:
            error += f": {missing_ids.to_list()}"
    return error
29+
30+
31+
def check_unknown_keys(
    gold_col: pd.Series, pred_col: pd.Series, verbose: bool = False
) -> str:
    """Check for unknown keys.

    Tip: Example Use Case
        There are no predictions without a corresponding groundtruth value.

    Args:
        gold_col: Dataframe column containing the true keys
        pred_col: Dataframe column containing the keys to validate
        verbose: Include list of affected keys in error message

    Returns:
        An error message, if any (default is an empty string)

    """
    error = ""
    # Keys that appear in the predictions but not in the gold standard.
    unknown_ids = pred_col[~pred_col.isin(gold_col)]
    # Test for emptiness rather than `.any()`: `.any()` evaluates the
    # truthiness of the IDs themselves, so falsy unknown keys such as 0 or
    # "" would otherwise go unreported.
    if not unknown_ids.empty:
        error = f"Found {unknown_ids.shape[0]} unknown ID(s)"

        if verbose:
            error += f": {unknown_ids.to_list()}"
    return error
56+
57+
58+
def check_duplicate_keys(pred_col: pd.Series, verbose: bool = False) -> str:
    """Report keys that occur more than once.

    Tip: Example Use Case
        There is exactly one prediction for a patient / sample / etc.

    Args:
        pred_col: Dataframe column containing the keys to validate
        verbose: Include list of affected keys in error message

    Returns:
        An error message, if any (default is an empty string)

    """
    # Boolean mask flagging every repeat occurrence after the first one.
    is_repeat = pred_col.duplicated()
    if not is_repeat.any():
        return ""

    message = f"Found {is_repeat.sum()} duplicate ID(s)"
    if verbose:
        message += f": {pred_col[is_repeat].to_list()}"
    return message
80+
81+
82+
def check_nan_values(pred_col: pd.Series) -> str:
    """Report NaN / null entries in a column.

    Tip: Example Use Case
        Predictions must not be null / None.

    Args:
        pred_col: Dataframe column containing the values to validate

    Returns:
        An error message, if any (default is an empty string)

    """
    null_total = pred_col.isna().sum()
    if not null_total:
        return ""
    return f"'{pred_col.name}' column contains {null_total} NaN value(s)."
99+
100+
101+
def check_binary_values(
    pred_col: pd.Series, label1: int = 0, label2: int = 1
) -> str:
    """Verify that a column holds only the two permitted labels (default: 0 or 1).

    Tip: Example Use Case
        Predictions can only be 0 (no disease present) or 1 (disease present).

    Args:
        pred_col: Dataframe column containing the values to validate.
        label1: First acceptable binary value.
        label2: Second acceptable binary value.

    Returns:
        An error message, if any (default is an empty string)

    """
    allowed_labels = [label1, label2]
    if pred_col.isin(allowed_labels).all():
        return ""
    return f"'{pred_col.name}' values should only be {label1} or {label2}."
121+
122+
123+
# pylint: disable=unsupported-binary-operation
124+
def check_values_range(
125+
pred_col: pd.Series, min_val: int | float = 0, max_val: int | float = 1
126+
) -> str:
127+
"""Check that values are between min and max values, inclusive.
128+
129+
Tip: Example Use Case
130+
Predictions must be a probability from 0 (disease not likely) to 1
131+
(disease likely).
132+
133+
Args:
134+
pred_col: Dataframe column containing the values to validate
135+
min_val: Lower limit of range
136+
max_val: Upper limit of range
137+
138+
Returns:
139+
An error message, if any (default is an empty string)
140+
141+
"""
142+
if (pred_col < min_val).any() or (pred_col > max_val).any():
143+
return f"'{pred_col.name}' values should be between [{min_val}, {max_val}]."
144+
return ""
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
::: cnb_tools.validation_toolkit

mkdocs.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ nav:
1818
- Contributing: user-guide/contributing.md
1919
- Reference:
2020
- CLI: reference/cli.md
21+
- Validation Toolkit: reference/validation-toolkit.md
2122
- Changelog:
2223
- changelog/release-notes.md
2324
- How to upgrade: changelog/upgrade.md
@@ -54,7 +55,11 @@ theme:
5455

5556
plugins:
5657
- search
57-
- mkdocstrings
58+
- mkdocstrings:
59+
handlers:
60+
python:
61+
options:
62+
docstring_section_style: spacy
5863
- autorefs
5964
- termynal:
6065
prompt_literal_start:

0 commit comments

Comments
 (0)