Sage-Bionetworks-Workflows · rxu17 · Jan 9, 2026 · Jan 9, 2026 · Jan 13, 2026
@@ -19,17 +19,9 @@ docker run -it -e SYNAPSE_AUTH_TOKEN=<insert_synapse_token> <docker_image_name>
 
 ## Create Patient and Sample Tracking Table Script
 
-### Input
+### Updating the Table Schema
 
-The input data model expects a format like the following:
-
-![data_model_picture.png](/img/data_model_picture.png)
-
-With the following required columns:
-
-- Attribute
-- Valid Values
-- Validation Rules
+When a new BPC cohort or sponsored project (SP) is released, the `STRING_COLS` and `BOOLEAN_COLS` lists in `create_patient_sample_tracking_table_schema.py` need to be updated. Please create a PR with the updated values.
 
 ### How to Run
 
@@ -49,14 +41,6 @@ python create_patient_sample_tracking_table_schema.py \
     --project-synid syn7208886
 ```
 
-Run with these settings to create a table from a different input
-data model that is not the default
-
-```shell
-python create_patient_sample_tracking_table_schema.py \
-    --data-model-synid syn1241241
-```
-
 ### Output
 
 The output will look like the following:

@@ -8,122 +8,236 @@
 import argparse
 from typing import List
 
-import pandas as pd
 import synapseclient
+from synapseclient import Wiki
 from synapseclient.models import Column, Table
 
-syn = synapseclient.login()
 
-def get_data_model(synid: str) -> pd.DataFrame:
-    """Converts the data model into pandas dataframe for
-        parsing later
+# Names of the text columns. create_columns() creates each one as a Synapse
+# STRING column (maximum_size=250): SAMPLE_ID and PATIENT_ID are emitted first,
+# the remaining names are emitted after the boolean flag columns.
+STRING_COLS = [
+    "SAMPLE_ID",
+    "PATIENT_ID",
+    "MAIN_GENIE_RELEASE",
+    "BPC_CRC2_RELEASE",
+    "BPC_PANC_RELEASE",
+    "BPC_RENAL_RELEASE",
+    "BPC_BLADDER_RELEASE",
+    "BPC_BRCA_RELEASE",
+    "BPC_NSCLC_RELEASE",
+    "BPC_PROSTATE_RELEASE",
+]
 
-    Args:
-        synid (str): synapse id of the data model file
+# Names of the membership-flag columns. create_columns() creates each one as a
+# Synapse BOOLEAN column, in the order listed here.
+BOOLEAN_COLS = [
+    "IN_LATEST_MAIN_GENIE",
+    "IN_AKT1_PROJECT",
+    "IN_BRCA_DDR_PROJECT",
+    "IN_ERBB2_PROJECT",
+    "IN_FGFE4_PROJECT",  # NOTE(review): possibly a typo for FGFR4 -- confirm before renaming
+    "IN_KRAS_PROJECT",
+    "IN_NTRK_PROJECT",
+    "IN_BPC_CRC_RELEASE",
+    "IN_BPC_CRC2_RELEASE",
+    "IN_BPC_PANC_RELEASE",
+    "IN_BPC_RENAL_RELEASE",
+    "IN_BPC_BLADDER_RELEASE",
+    "IN_BPC_BRCA_RELEASE",
+    "IN_BPC_NSCLC_RELEASE",
+    "IN_BPC_PROSTATE_RELEASE",
+]
 
-    Returns:
-        pd.DataFrame: data model as pandas dataframe
+def create_columns() -> List[Column]:
     """
-    data_model = pd.read_csv(syn.get(synid).path, sep="\t")
-    return data_model
-
-
-def get_synapse_col_type(validation_rule: str) -> str:
-    """Helper to map validation rules to
-        Synapse column types
-
-    Args:
-        validation_rule (str): the value,
-            current supported values are
-            ['bool', 'int', 'float', 'date', 'str']
+    Creates the columns of the schema.
+    Build Synapse Column objects in the desired order:
+      SAMPLE_ID, PATIENT_ID, then IN_* booleans, then release-name strings.
 
     Returns:
-        str: string representation of the rule translated
-        to synapse table column data types
+        List[Column]: list of the column schemas in the table
     """
-    rules_map = {
-        "boolean": "BOOLEAN",
-        "int": "INTEGER",
-        "float": "DOUBLE",
-        "date": "DATE",
-        "str": "STRING",
-    }
-    if pd.isna(validation_rule):
-        return "STRING"
-    rule = validation_rule.lower()
-
-    if rule in rules_map.keys():
-        return rules_map[rule]
-    else:
-        raise ValueError(
-            f"{rule} is not one of the supported rules: {rules_map.keys()}"
-        )
-
+    columns: List[Column] = []
 
-def create_columns(data_model: pd.DataFrame) -> List[Column]:
-    """Creates the columns of the schema with
-        column type, valid values and enum values filled out
-        where applicable
+    # strings first
+    for name in ["SAMPLE_ID", "PATIENT_ID"]:
+        columns.append(
+            Column(
+                name=name,
+                column_type="STRING",
+                maximum_size=250,
+            )
+        )
 
-    Args:
-        data_model (pd.DataFrame): table of the data model
-            to parse for the schema values
+    # boolean flags
+    for name in BOOLEAN_COLS:
+        # SAMPLE_ID and PATIENT_ID are already added; skip if present by mistake
+        if name in ("SAMPLE_ID", "PATIENT_ID"):
+            continue
+        columns.append(
+            Column(
+                name=name,
+                column_type="BOOLEAN",
+            )
+        )
 
-    Returns:
-        List[Column]: list of the column schemas in the table
-    """
-    # Build list of Columns (with Restrict Values)
-    columns = []
-    for _, row in data_model.iterrows():
-        name = row["Attribute"].strip()
-        col_type = get_synapse_col_type(row.get("Validation Rules", ""))
-        valid_values = row.get("Valid Values")
-
-        # Parse Restrict Values if available
-        enum_values = None
-        if isinstance(valid_values, str) and "," in valid_values:
-            # Split on commas and strip whitespace
-            enum_values = [v.strip() for v in valid_values.split(",") if v.strip()]
-
-        col = Column(
-            name=name,
-            column_type=col_type,
-            enum_values=enum_values,  # this is the "Restrict Values" list
-            maximum_size=250 if col_type == "STRING" else None,
+    # release-name strings (and any other non-boolean strings)
+    # NOTE: STRING_COLS already includes SAMPLE_ID/PATIENT_ID; skip duplicates
+    for name in STRING_COLS:
+        if name in ("SAMPLE_ID", "PATIENT_ID"):
+            continue
+        columns.append(
+            Column(
+                name=name,
+                column_type="STRING",
+                maximum_size=250,
+            )
         )
-        columns.append(col)
+
     return columns
 
 
-def create_table(data_model_synid: str, project_synid: str, table_name: str) -> None:
+def create_table(project_synid: str, table_name: str) -> Table:
-    """Create and initializes the empty Synapse Table
-        using the table schema generated from the data model
+    """Create and store the empty Synapse Table
+        using the column schema returned by create_columns()
 
     Args:
-        data_model_synid (str): synapse id of the input data model to parse
-        project_synid (str): synapse if of the synapse project to create table in
+        project_synid (str): synapse id of the synapse project to create table in
         table_name (str): name of the table to create
+
+    Returns:
+        Table: Synapse table entity that was created
     """
-    data_model = get_data_model(synid=data_model_synid)
-    columns = create_columns(data_model)
-    # Create an empty table instance with the schema
+    columns = create_columns()
+
     table = Table(
         name=table_name,
         columns=columns,
         parent_id=project_synid,
     )
     table = table.store()
     print(f"Created table: {table.name} ({table.id})")
+    return table
+
+
+def add_table_wiki(syn: synapseclient.Synapse, table: Table) -> None:
+    """Create and store the wiki page attached to the tracking table.
+
+    The wiki contains the data dictionary and example queries for the table.
+
+    Args:
+        syn (synapseclient.Synapse): synapse client connection
+        table (Table): synapse table entity that owns the wiki
+    """
+    # Function-scope import: only this function needs it.
+    import textwrap
+
+    # NOTE(review): the "available projects" query and Scenario 1 filter on
+    # RELEASE_PROJECT_TYPE and target syn71708167, whereas the data dictionary
+    # below and Scenario 2 (syn72246564) only use per-project flag columns.
+    # Confirm which table/columns these examples are meant to describe.
+    content = """
+    # Patient and Sample Tracking Table
+    ${toc}
+    ##Overview
+    This Patient-Sample tracking table contains ALL of the `SAMPLE_ID`, `PATIENT_ID` across all of the **latest** versions of each project type: BPC, MAIN Genie and SP.
+
+    You can query for available projects:
+    ```sql
+    SELECT
+    DISTINCT RELEASE_PROJECT_TYPE
+    FROM syn71708167;
+    ```
+
+    Here is the data dictionary for the table and its attributes
+    | ATTRIBUTE                 | DESCRIPTION                                                                                                                                                                                                    | REQUIRED |
+    | ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- |
+    | `SAMPLE_ID`               | Sample identifier for the given sample-patient pair.                                                                                                                                                           | YES      |
+    | `PATIENT_ID`              | Patient identifier for the given sample-patient pair.                                                                                                                                                          | YES      |
+    | `IN_LATEST_MAIN_GENIE`    | Whether the sample-patient pair is present in the latest MAIN GENIE release.                                                                                                                                   | YES      |
+    | `IN_AKT1_PROJECT`         | Whether the sample-patient pair exists in the AKT1 sponsored project dataset (regardless of MAIN GENIE membership).                                                                                            | YES      |
+    | `IN_BRCA_DDR_PROJECT`     | Whether the sample-patient pair exists in the BRCA_DDR sponsored project dataset (regardless of MAIN GENIE membership).                                                                                        | YES      |
+    | `IN_ERBB2_PROJECT`        | Whether the sample-patient pair exists in the ERBB2 sponsored project dataset (regardless of MAIN GENIE membership).                                                                                           | YES      |
+    | `IN_FGFE4_PROJECT`        | Whether the sample-patient pair exists in the FGFE4 sponsored project dataset (regardless of MAIN GENIE membership).                                                                                           | YES      |
+    | `IN_KRAS_PROJECT`         | Whether the sample-patient pair exists in the KRAS sponsored project dataset (regardless of MAIN GENIE membership).                                                                                            | YES      |
+    | `IN_NTRK_PROJECT`         | Whether the sample-patient pair exists in the NTRK sponsored project dataset (regardless of MAIN GENIE membership).                                                                                            | YES      |
+    | `IN_BPC_CRC_RELEASE`      | Whether the sample-patient pair exists in the latest CRC BPC cohort release slice used for tracking (latest-per-cohort).                                                                                       | YES      |
+    | `IN_BPC_CRC2_RELEASE`     | Whether the sample-patient pair exists in the latest CRC2 BPC cohort release slice used for tracking (latest-per-cohort).                                                                                      | YES      |
+    | `IN_BPC_PANC_RELEASE`     | Whether the sample-patient pair exists in the latest PANC BPC cohort release slice used for tracking (latest-per-cohort).                                                                                      | YES      |
+    | `IN_BPC_RENAL_RELEASE`    | Whether the sample-patient pair exists in the latest RENAL BPC cohort release slice used for tracking (latest-per-cohort).                                                                                     | YES      |
+    | `IN_BPC_BLADDER_RELEASE`  | Whether the sample-patient pair exists in the latest BLADDER BPC cohort release slice used for tracking (latest-per-cohort).                                                                                   | YES      |
+    | `IN_BPC_BRCA_RELEASE`     | Whether the sample-patient pair exists in the latest BRCA BPC cohort release slice used for tracking (latest-per-cohort).                                                                                      | YES      |
+    | `IN_BPC_NSCLC_RELEASE`    | Whether the sample-patient pair exists in the latest NSCLC BPC cohort release slice used for tracking (latest-per-cohort).                                                                                     | YES      |
+    | `IN_BPC_PROSTATE_RELEASE` | Whether the sample-patient pair exists in the latest PROSTATE BPC cohort release slice used for tracking (latest-per-cohort).                                                                                  | YES      |
+    | `MAIN_GENIE_RELEASE`      | Release identifier for the latest MAIN GENIE release used for tracking (e.g., `NN.N-public`). Populated for pairs where `IN_LATEST_MAIN_GENIE` is true; otherwise may be null/blank.                           | YES      |
+    | `BPC_CRC2_RELEASE`        | Release identifier for the latest CRC2 BPC cohort release used for tracking (e.g., `CRC2_17.0-consortium` or similar). Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank. | YES      |
+    | `BPC_PANC_RELEASE`        | Release identifier for the latest PANC BPC cohort release used for tracking (e.g., `PANC_17.0-consortium` or similar). Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank. | YES      |
+    | `BPC_RENAL_RELEASE`       | Release identifier for the latest RENAL BPC cohort release used for tracking. Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank.                                          | YES      |
+    | `BPC_BLADDER_RELEASE`     | Release identifier for the latest BLADDER BPC cohort release used for tracking. Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank.                                        | YES      |
+    | `BPC_BRCA_RELEASE`        | Release identifier for the latest BRCA BPC cohort release used for tracking. Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank.                                           | YES      |
+    | `BPC_NSCLC_RELEASE`       | Release identifier for the latest NSCLC BPC cohort release used for tracking. Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank.                                          | YES      |
+    | `BPC_PROSTATE_RELEASE`    | Release identifier for the latest PROSTATE BPC cohort release used for tracking. Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank.                                       | YES      |
+
+    ##Getting Started
+
+    ### Query templates
+
+    #### Scenario 1
+    You want to query which sample-patient pairs are in the latest main genie consortium release for a specific sponsored project (SP):
+    ```sql
+    SELECT
+    SAMPLE_ID,
+    PATIENT_ID
+    FROM <TABLE_SYNAPSE_ID>
+    WHERE RELEASE_PROJECT_TYPE = '<SP_PROJECT_TYPE>'
+    AND IN_LATEST_MAIN_GENIE = TRUE;
+    ```
+
+    **Example**
+
+    ```sql
+    SELECT
+    SAMPLE_ID,
+    PATIENT_ID
+    FROM syn71708167
+    WHERE RELEASE_PROJECT_TYPE = 'SP_KRAS'
+    AND IN_LATEST_MAIN_GENIE = TRUE;
+    ```
+
+    *Sample Result*
+    | SAMPLE_ID     | PATIENT_ID    |
+    | ------------- | ------------- |
+    | P-0001234-T01 | GENIE-0005678 |
+    | P-0009876-T01 | GENIE-0009999 |
+
+
+    #### Scenario 2
+    You want to filter on which sample-patient pairs were not present in a specific project(s) but present in the latest main genie release
+    ```sql
+    SELECT SAMPLE_ID, PATIENT_ID
+    FROM syn72246564
+    WHERE IN_LATEST_MAIN_GENIE = TRUE
+    AND <PROJECT_A_FLAG> = FALSE
+    AND <PROJECT_B_FLAG> = FALSE;
+    ```
+    **Example**
+    Here we filter on sample-patient ids that are not present in BPC's breast cancer cohort's latest release and not present in the AKT1 sponsored project
+    ```sql
+    SELECT SAMPLE_ID, PATIENT_ID
+    FROM syn72246564
+    WHERE IN_LATEST_MAIN_GENIE = TRUE
+    AND IN_BPC_BRCA_RELEASE = FALSE
+    AND IN_AKT1_PROJECT = FALSE;
+    ```
+
+    *Sample Result*
+    | SAMPLE_ID     | PATIENT_ID    |
+    | ------------- | ------------- |
+    | P-0011111-T01 | GENIE-0002222 |
+    | P-0033333-T02 | GENIE-0004444 |
+
+    """
+
+    # The literal above is indented to match the function body; strip that
+    # common indent so headings, tables and fences start at column 0 instead
+    # of being treated as an indented code block by the markdown renderer.
+    wiki = Wiki(
+        title='Patient and Sample Tracking Table',
+        owner=table,
+        markdown=textwrap.dedent(content),
+    )
+    # Store first so the success message is only printed once the wiki exists.
+    syn.store(wiki)
+    print(f"Created wiki for table: {table.name} ({table.id})")
 
 
 def main():
     parser = argparse.ArgumentParser(
-        description="Generate a Synapse table schema from a data model TSV."
-    )
-    parser.add_argument(
-        "--data-model-synid",
-        default="syn71411200",
-        help="Synapse ID of the data model TSV (e.g. syn71411200).",
+        description="Generate a Synapse table schema from a data model."
     )
     parser.add_argument(
         "--project-synid",
@@ -137,11 +251,9 @@ def main():
     )
 
     args = parser.parse_args()
-    create_table(
-        data_model_synid=args.data_model_synid,
-        project_synid=args.project_synid,
-        table_name=args.table_name,
-    )
+    syn = synapseclient.login()
+    # add_table_wiki needs the stored table (the wiki owner) as well as the client.
+    table = create_table(project_synid=args.project_synid, table_name=args.table_name)
+    add_table_wiki(syn, table)
 
 
 if __name__ == "__main__":

@@ -1,2 +1 @@
 synapseclient>=4.9,<5.0
-pandas>=2.0,<3.0