Update docstrings of maf functions (#577)

rxu17 · web-flow · commit 95fc7c12372c · 2024-08-13T22:49:57.000-07:00
* update docstrings of maf functions

* fix linting
diff --git a/genie_registry/maf.py b/genie_registry/maf.py
@@ -11,44 +11,90 @@
 logger = logging.getLogger(__name__)
 
 
-def _check_allele_col_validity(df):
-    """There are two linked validation rules in this function:
-
-    1) If maf file has ALL three of the following columns:
-        - TUMOR_SEQ_ALLELE1 (TSA1)
-        - TUMOR_SEQ_ALLELE2 (TSA2)
-        - REFERENCE ALLELE (REF)
-        THEN
-        ALL rows of TSA1 must equal REF
-        OR
-        ALL rows of TSA1 must equal TSA2
-
-        TSA1 is used by Genome Nexus (GN) to annotate data when it senses there is ambiguity
-        regarding which variant (TSA1 vs TSA2) to use. This is
-        why there cannot be mixed rows where some rows have TSA1 == REF and some rows
-        have TSA1 == TSA2.
-
-        e.g:
-        VALID
-        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
-        | C                | C                 | A
-        | T                | T                 | C
-
-        VALID
-        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
-        | C                | A                 | A
-        | T                | C                 | C
-
-        INVALID
-        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
-        | C                | C                 | A
-        | C                | A                 | A
-
-        See https://github.com/genome-nexus/annotation-tools/issues/26 for
-        more background regarding why this validation rule was implemented.
-
-    2) There can't be ANY rows where REF == TSA2. This is a missense mutation
-    flagged as invalid by GN
+def _check_allele_col_validity(df: pd.DataFrame) -> str:
+    """
+    This function checks specific columns in a MAF (Mutation Annotation Format)
+    file for certain conditions.
+
+    The following conditions must be met:
+        **If the MAF file has all three of these columns**
+
+            - TUMOR_SEQ_ALLELE1 (TSA1)
+            - TUMOR_SEQ_ALLELE2 (TSA2)
+            - REFERENCE_ALLELE (REF)
+
+        **Then, one of the following must be true**
+
+            - In every row, the value in TSA1 must be the same as the value in REF
+            - In every row, the value in TSA1 must be the same as the value in TSA2
+
+        **Additionally, if the MAF file has at least these two columns**
+
+            - REFERENCE_ALLELE (REF)
+            - TUMOR_SEQ_ALLELE2 (TSA2)
+
+        **Then**
+
+            No row may have the same value in REF and TSA2
+
+        These rules are important because Genome Nexus (GN) uses `TSA1` to annotate data
+        when it's not clear which variant to use. So, there can't be a mix of rows where
+        some have `TSA1` equal to `REF` and some have `TSA1` equal to `TSA2`.
+
+    Example: Valid Examples
+        ```
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
+        | ---------------- | ----------------- | ----------------- |
+        | C                | C                 | A                 |
+        | T                | T                 | C                 |
+        ```
+
+        ```
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
+        | ---------------- | ----------------- | ----------------- |
+        | C                | A                 | A                 |
+        | T                | C                 | C                 |
+        ```
+
+        ```
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 |
+        | ---------------- | ----------------- |
+        | C                | A                 |
+        | T                | C                 |
+        ```
+
+
+    Example: Invalid Examples
+        ```
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
+        | ---------------- | ----------------- | ----------------- |
+        | C                | C                 | A                 |
+        | C                | A                 | A                 |
+        ```
+
+        ```
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
+        | ---------------- | ----------------- | ----------------- |
+        | A                | C                 | A                 |
+        | T                | C                 | T                 |
+        ```
+
+        ```
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 |
+        | ---------------- | ----------------- |
+        | C                | C                 |
+        | T                | C                 |
+        ```
+
+
+    See this [Genome Nexus issue](https://github.com/genome-nexus/annotation-tools/issues/26) for
+    more background regarding why this validation rule was implemented.
+
+    Args:
+        df: input mutation dataframe
+
+    Returns:
+        str: the error message
     """
     tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
     tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")
@@ -135,6 +181,32 @@ def _validate(self, mutationDF):
         This function validates the mutation file to make sure it
         adheres to the mutation SOP.
 
+        t_depth: This column is conditionally optional.
+        1. If this column is missing, the data must include the t_ref_count column; otherwise validation fails with an error.
+        2. If this column is present, it must have one of the following:
+            - A mix of numeric values and NAs
+            - All NAs
+            - All numeric values
+
+        There are no other checks on the actual values in this column.
+
+        t_ref_count: This column is conditionally optional.
+        1. If this column is missing, the data must include the t_depth column; otherwise validation fails with an error.
+        2. If this column is present, it must have one of the following:
+            - A mix of numeric values and NAs
+            - All NAs
+            - All numeric values
+
+        There are no other checks on the actual values in this column.
+
+        t_alt_count: This column is entirely optional.
+        1. If this column is present, it must have one of the following:
+            - A mix of numeric values and NAs
+            - All NAs
+            - All numeric values
+
+        There are no other checks on the actual values in this column.
+
         Args:
             mutationDF: mutation dataframe
 
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -110,15 +110,14 @@ plugins:
         options:
           members_order: source
           # shows all functions and attributes even hidden
-          members: yes
-          show_if_no_docstring: False
-          show_root_heading: True
-          show_root_full_path: True
-          show_category_heading: True
+          members: true
+          show_private_members: true
+          show_if_no_docstring: false
+          show_root_heading: true
+          show_root_full_path: true
+          show_category_heading: true
           docstring_style: google
           docstring_section_style: spacy
-          filters:
-          - "!^_"
 # Allows external links to open in a new tab
 - open-in-new-tab