|
| 1 | +import pandas as pd |
| 2 | + |
| 3 | + |
def check_missing_keys(
    gold_col: pd.Series, pred_col: pd.Series, verbose: bool = False
) -> str:
    """Check for missing keys.

    A key is "missing" when it appears in `gold_col` but not in `pred_col`.

    Tip: Example Use Case
      There is at least one prediction for every patient / sample / etc.

    Args:
        gold_col: Dataframe column containing the true keys
        pred_col: Dataframe column containing the keys to validate
        verbose: Include list of affected keys in error message

    Returns:
        An error message, if any (default is an empty string)

    """
    missing_ids = gold_col[~gold_col.isin(pred_col)]

    # Test for emptiness rather than `.any()`: `.any()` evaluates the
    # truthiness of the key values themselves, so a missing key that is
    # falsy (0, "", False) would otherwise go unreported.
    if missing_ids.empty:
        return ""

    error = f"Found {missing_ids.shape[0]} missing ID(s)"
    if verbose:
        error += f": {missing_ids.to_list()}"
    return error
| 29 | + |
| 30 | + |
def check_unknown_keys(
    gold_col: pd.Series, pred_col: pd.Series, verbose: bool = False
) -> str:
    """Check for unknown keys.

    A key is "unknown" when it appears in `pred_col` but not in `gold_col`.

    Tip: Example Use Case
      There are no predictions without a corresponding groundtruth value.

    Args:
        gold_col: Dataframe column containing the true keys
        pred_col: Dataframe column containing the keys to validate
        verbose: Include list of affected keys in error message

    Returns:
        An error message, if any (default is an empty string)

    """
    unknown_ids = pred_col[~pred_col.isin(gold_col)]

    # Test for emptiness rather than `.any()`: `.any()` evaluates the
    # truthiness of the key values themselves, so an unknown key that is
    # falsy (0, "", False) would otherwise go unreported.
    if unknown_ids.empty:
        return ""

    error = f"Found {unknown_ids.shape[0]} unknown ID(s)"
    if verbose:
        error += f": {unknown_ids.to_list()}"
    return error
| 56 | + |
| 57 | + |
def check_duplicate_keys(pred_col: pd.Series, verbose: bool = False) -> str:
    """Report keys that occur more than once.

    Tip: Example Use Case
      There is exactly one prediction for a patient / sample / etc.

    Args:
        pred_col: Dataframe column containing the keys to validate
        verbose: Include list of affected keys in error message

    Returns:
        An error message, if any (default is an empty string)

    """
    # Boolean mask flagging every repeat occurrence after the first.
    is_repeat = pred_col.duplicated()
    if not is_repeat.any():
        return ""

    message = f"Found {is_repeat.sum()} duplicate ID(s)"
    if verbose:
        message += f": {pred_col[is_repeat].to_list()}"
    return message
| 80 | + |
| 81 | + |
def check_nan_values(pred_col: pd.Series) -> str:
    """Report how many values in the column are NaN.

    Tip: Example Use Case
      Predictions must not be null / None.

    Args:
        pred_col: Dataframe column containing the values to validate

    Returns:
        An error message, if any (default is an empty string)

    """
    n_missing = pred_col.isna().sum()
    return (
        f"'{pred_col.name}' column contains {n_missing} NaN value(s)."
        if n_missing
        else ""
    )
| 99 | + |
| 100 | + |
def check_binary_values(
    pred_col: pd.Series, label1: int = 0, label2: int = 1
) -> str:
    """Verify that every value is one of two allowed labels (default: 0 or 1).

    Tip: Example Use Case
      Predictions can only be 0 (no disease present) or 1 (disease present).

    Args:
        pred_col: Dataframe column containing the values to validate.
        label1: First acceptable binary value.
        label2: Second acceptable binary value.

    Returns:
        An error message, if any (default is an empty string)

    """
    allowed_labels = [label1, label2]
    only_allowed = pred_col.isin(allowed_labels).all()
    if only_allowed:
        return ""
    return f"'{pred_col.name}' values should only be {label1} or {label2}."
| 121 | + |
| 122 | + |
| 123 | +# pylint: disable=unsupported-binary-operation |
| 124 | +def check_values_range( |
| 125 | + pred_col: pd.Series, min_val: int | float = 0, max_val: int | float = 1 |
| 126 | +) -> str: |
| 127 | + """Check that values are between min and max values, inclusive. |
| 128 | +
|
| 129 | + Tip: Example Use Case |
| 130 | + Predictions must be a probability from 0 (disease not likely) to 1 |
| 131 | + (disease likely). |
| 132 | +
|
| 133 | + Args: |
| 134 | + pred_col: Dataframe column containing the values to validate |
| 135 | + min_val: Lower limit of range |
| 136 | + max_val: Upper limit of range |
| 137 | +
|
| 138 | + Returns: |
| 139 | + An error message, if any (default is an empty string) |
| 140 | +
|
| 141 | + """ |
| 142 | + if (pred_col < min_val).any() or (pred_col > max_val).any(): |
| 143 | + return f"'{pred_col.name}' values should be between [{min_val}, {max_val}]." |
| 144 | + return "" |
0 commit comments