Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified img/schema_output_picture.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
20 changes: 2 additions & 18 deletions scripts/table_schemas/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,9 @@ docker run -it -e SYNAPSE_AUTH_TOKEN=<insert_synapse_token> <docker_image_name>

## Create Patient and Sample Tracking Table Script

### Input
### Updating the Table Schema

The input data model expects a format like the following:

![data_model_picture.png](/img/data_model_picture.png)

With the following required columns:

- Attribute
- Valid Values
- Validation Rules
When a new BPC cohort or sponsored project (SP) is released, the `STRING_COLS` and `BOOLEAN_COLS` lists in `create_patient_sample_tracking_table_schema.py` need to be updated. Please create a PR with the updated values.

### How to Run

Expand All @@ -49,14 +41,6 @@ python create_patient_sample_tracking_table_schema.py \
--project-synid syn7208886
```

Run with these settings to create a table from a different input
data model that is not the default

```shell
python create_patient_sample_tracking_table_schema.py \
--data-model-synid syn1241241
```

### Output

The output will look like the following:
Expand Down
282 changes: 197 additions & 85 deletions scripts/table_schemas/create_patient_sample_tracking_table_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,122 +8,236 @@
import argparse
from typing import List

import pandas as pd
import synapseclient
from synapseclient import Wiki
from synapseclient.models import Column, Table

syn = synapseclient.login()

def get_data_model(synid: str) -> pd.DataFrame:
"""Converts the data model into pandas dataframe for
parsing later
# Names of the STRING-typed columns of the tracking table. create_columns()
# creates each one with column_type="STRING" and maximum_size=250.
# SAMPLE_ID and PATIENT_ID are emitted first by create_columns() and skipped
# when it iterates this list, so listing them here does not create duplicates.
# Per the README, update this list when a new BPC cohort or SP project is
# released.
# NOTE(review): BOOLEAN_COLS contains IN_BPC_CRC_RELEASE but there is no
# BPC_CRC_RELEASE entry here — confirm the omission is intentional.
STRING_COLS = [
"SAMPLE_ID",
"PATIENT_ID",
"MAIN_GENIE_RELEASE",
"BPC_CRC2_RELEASE",
"BPC_PANC_RELEASE",
"BPC_RENAL_RELEASE",
"BPC_BLADDER_RELEASE",
"BPC_BRCA_RELEASE",
"BPC_NSCLC_RELEASE",
"BPC_PROSTATE_RELEASE",
]

Args:
synid (str): synapse id of the data model file
# Names of the BOOLEAN membership-flag columns of the tracking table.
# create_columns() creates each one with column_type="BOOLEAN", in the order
# listed here, directly after SAMPLE_ID and PATIENT_ID.
# Per the README, update this list when a new BPC cohort or SP project is
# released.
# NOTE(review): "IN_FGFE4_PROJECT" looks like a typo for "IN_FGFR4_PROJECT" —
# confirm against the sponsored project name before the table is created,
# since the column name becomes part of the stored schema.
BOOLEAN_COLS = [
"IN_LATEST_MAIN_GENIE",
"IN_AKT1_PROJECT",
"IN_BRCA_DDR_PROJECT",
"IN_ERBB2_PROJECT",
"IN_FGFE4_PROJECT",
"IN_KRAS_PROJECT",
"IN_NTRK_PROJECT",
"IN_BPC_CRC_RELEASE",
"IN_BPC_CRC2_RELEASE",
"IN_BPC_PANC_RELEASE",
"IN_BPC_RENAL_RELEASE",
"IN_BPC_BLADDER_RELEASE",
"IN_BPC_BRCA_RELEASE",
"IN_BPC_NSCLC_RELEASE",
"IN_BPC_PROSTATE_RELEASE",
]

Returns:
pd.DataFrame: data model as pandas dataframe
def create_columns() -> List[Column]:
"""
data_model = pd.read_csv(syn.get(synid).path, sep="\t")
return data_model


def get_synapse_col_type(validation_rule: str) -> str:
"""Helper to map validation rules to
Synapse column types

Args:
validation_rule (str): the value,
current supported values are
['bool', 'int', 'float', 'date', 'str']
Creates the columns of the schema.
Build Synapse Column objects in the desired order:
SAMPLE_ID, PATIENT_ID, then IN_* booleans, then release-name strings.

Returns:
str: string representation of the rule translated
to synapse table column data types
List[Column]: list of the column schemas in the table
"""
rules_map = {
"boolean": "BOOLEAN",
"int": "INTEGER",
"float": "DOUBLE",
"date": "DATE",
"str": "STRING",
}
if pd.isna(validation_rule):
return "STRING"
rule = validation_rule.lower()

if rule in rules_map.keys():
return rules_map[rule]
else:
raise ValueError(
f"{rule} is not one of the supported rules: {rules_map.keys()}"
)

columns: List[Column] = []

def create_columns(data_model: pd.DataFrame) -> List[Column]:
"""Creates the columns of the schema with
column type, valid values and enum values filled out
where applicable
# strings first
for name in ["SAMPLE_ID", "PATIENT_ID"]:
columns.append(
Column(
name=name,
column_type="STRING",
maximum_size=250,
)
)

Args:
data_model (pd.DataFrame): table of the data model
to parse for the schema values
# boolean flags
for name in BOOLEAN_COLS:
# SAMPLE_ID and PATIENT_ID are already added; skip if present by mistake
if name in ("SAMPLE_ID", "PATIENT_ID"):
continue
columns.append(
Column(
name=name,
column_type="BOOLEAN",
)
)

Returns:
List[Column]: list of the column schemas in the table
"""
# Build list of Columns (with Restrict Values)
columns = []
for _, row in data_model.iterrows():
name = row["Attribute"].strip()
col_type = get_synapse_col_type(row.get("Validation Rules", ""))
valid_values = row.get("Valid Values")

# Parse Restrict Values if available
enum_values = None
if isinstance(valid_values, str) and "," in valid_values:
# Split on commas and strip whitespace
enum_values = [v.strip() for v in valid_values.split(",") if v.strip()]

col = Column(
name=name,
column_type=col_type,
enum_values=enum_values, # this is the "Restrict Values" list
maximum_size=250 if col_type == "STRING" else None,
# release-name strings (and any other non-boolean strings)
# NOTE: STRING_COLS already includes SAMPLE_ID/PATIENT_ID; skip duplicates
for name in STRING_COLS:
if name in ("SAMPLE_ID", "PATIENT_ID"):
continue
columns.append(
Column(
name=name,
column_type="STRING",
maximum_size=250,
)
)
columns.append(col)

return columns


def create_table(data_model_synid: str, project_synid: str, table_name: str) -> None:
def create_table(project_synid: str, table_name: str) -> Table:
"""Creates and initializes the empty Synapse Table
using the generated table schema

Args:
syn (synapseclient.Synapse): synapse client connection
data_model_synid (str): synapse id of the input data model to parse
project_synid (str): Synapse ID of the Synapse project to create the table in
table_name (str): name of the table to create

Returns:
Table: Synapse table entity that was created
"""
data_model = get_data_model(synid=data_model_synid)
columns = create_columns(data_model)
# Create an empty table instance with the schema
columns = create_columns()

table = Table(
name=table_name,
columns=columns,
parent_id=project_synid,
)
table = table.store()
print(f"Created table: {table.name} ({table.id})")
return table


def add_table_wiki(syn: synapseclient.Synapse, table: Table) -> None:
    """Attach a wiki page with usage instructions and example queries to the table.

    The wiki contains an overview, a data dictionary of the table's columns
    and SQL query templates with sample results.

    Args:
        syn (synapseclient.Synapse): synapse client connection
        table (Table): synapse table entity the wiki is attached to
    """
    # NOTE(review): the overview query and Scenario 1 filter on
    # RELEASE_PROJECT_TYPE, which is neither in the data dictionary below nor in
    # STRING_COLS/BOOLEAN_COLS, and the examples mix syn71708167 with
    # syn72246564 — confirm which table and columns are intended.
    content = """
# Patient and Sample Tracking Table
${toc}
##Overview
This Patient-Sample tracking table contains ALL of the `SAMPLE_ID`, `PATIENT_ID` across all of the **latest** versions of each project type: BPC, MAIN Genie and SP.

You can query for available projects:
```sql
SELECT
DISTINCT RELEASE_PROJECT_TYPE
FROM syn71708167;
```

Here is the data dictionary for the table and its attributes
| ATTRIBUTE | DESCRIPTION | REQUIRED |
| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- |
| `SAMPLE_ID` | Sample identifier for the given sample-patient pair. | YES |
| `PATIENT_ID` | Patient identifier for the given sample-patient pair. | YES |
| `IN_LATEST_MAIN_GENIE` | Whether the sample-patient pair is present in the latest MAIN GENIE release. | YES |
| `IN_AKT1_PROJECT` | Whether the sample-patient pair exists in the AKT1 sponsored project dataset (regardless of MAIN GENIE membership). | YES |
| `IN_BRCA_DDR_PROJECT` | Whether the sample-patient pair exists in the BRCA_DDR sponsored project dataset (regardless of MAIN GENIE membership). | YES |
| `IN_ERBB2_PROJECT` | Whether the sample-patient pair exists in the ERBB2 sponsored project dataset (regardless of MAIN GENIE membership). | YES |
| `IN_FGFE4_PROJECT` | Whether the sample-patient pair exists in the FGFE4 sponsored project dataset (regardless of MAIN GENIE membership). | YES |
| `IN_KRAS_PROJECT` | Whether the sample-patient pair exists in the KRAS sponsored project dataset (regardless of MAIN GENIE membership). | YES |
| `IN_NTRK_PROJECT` | Whether the sample-patient pair exists in the NTRK sponsored project dataset (regardless of MAIN GENIE membership). | YES |
| `IN_BPC_CRC_RELEASE` | Whether the sample-patient pair exists in the latest CRC BPC cohort release slice used for tracking (latest-per-cohort). | YES |
| `IN_BPC_CRC2_RELEASE` | Whether the sample-patient pair exists in the latest CRC2 BPC cohort release slice used for tracking (latest-per-cohort). | YES |
| `IN_BPC_PANC_RELEASE` | Whether the sample-patient pair exists in the latest PANC BPC cohort release slice used for tracking (latest-per-cohort). | YES |
| `IN_BPC_RENAL_RELEASE` | Whether the sample-patient pair exists in the latest RENAL BPC cohort release slice used for tracking (latest-per-cohort). | YES |
| `IN_BPC_BLADDER_RELEASE` | Whether the sample-patient pair exists in the latest BLADDER BPC cohort release slice used for tracking (latest-per-cohort). | YES |
| `IN_BPC_BRCA_RELEASE` | Whether the sample-patient pair exists in the latest BRCA BPC cohort release slice used for tracking (latest-per-cohort). | YES |
| `IN_BPC_NSCLC_RELEASE` | Whether the sample-patient pair exists in the latest NSCLC BPC cohort release slice used for tracking (latest-per-cohort). | YES |
| `IN_BPC_PROSTATE_RELEASE` | Whether the sample-patient pair exists in the latest PROSTATE BPC cohort release slice used for tracking (latest-per-cohort). | YES |
| `MAIN_GENIE_RELEASE` | Release identifier for the latest MAIN GENIE release used for tracking (e.g., `NN.N-public`). Populated for pairs where `IN_LATEST_MAIN_GENIE` is true; otherwise may be null/blank. | YES |
| `BPC_CRC2_RELEASE` | Release identifier for the latest CRC2 BPC cohort release used for tracking (e.g., `CRC2_17.0-consortium` or similar). Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank. | YES |
| `BPC_PANC_RELEASE` | Release identifier for the latest PANC BPC cohort release used for tracking (e.g., `PANC_17.0-consortium` or similar). Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank. | YES |
| `BPC_RENAL_RELEASE` | Release identifier for the latest RENAL BPC cohort release used for tracking. Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank. | YES |
| `BPC_BLADDER_RELEASE` | Release identifier for the latest BLADDER BPC cohort release used for tracking. Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank. | YES |
| `BPC_BRCA_RELEASE` | Release identifier for the latest BRCA BPC cohort release used for tracking. Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank. | YES |
| `BPC_NSCLC_RELEASE` | Release identifier for the latest NSCLC BPC cohort release used for tracking. Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank. | YES |
| `BPC_PROSTATE_RELEASE` | Release identifier for the latest PROSTATE BPC cohort release used for tracking. Populated for pairs present in that cohort’s latest slice; otherwise may be null/blank. | YES |

##Getting Started

### Query templates

#### Scenario 1
You want to query which sample-patient pairs are in the latest main genie consortium release for a specific sponsored project (SP):
```sql
SELECT
SAMPLE_ID,
PATIENT_ID
FROM <TABLE_SYNAPSE_ID>
WHERE RELEASE_PROJECT_TYPE = '<PROJECT_TYPE>'
AND IN_LATEST_MAIN_GENIE = TRUE;
```

**Example**

```sql
SELECT
SAMPLE_ID,
PATIENT_ID
FROM syn71708167
WHERE RELEASE_PROJECT_TYPE = 'SP_KRAS'
AND IN_LATEST_MAIN_GENIE = TRUE;
```

*Sample Result*
| SAMPLE_ID | PATIENT_ID |
| ------------- | ------------- |
| P-0001234-T01 | GENIE-0005678 |
| P-0009876-T01 | GENIE-0009999 |


#### Scenario 2
You want to filter on which sample-patient pairs were not present in a specific project(s) but present in the latest main genie release
```sql
SELECT SAMPLE_ID, PATIENT_ID
FROM syn72246564
WHERE IN_LATEST_MAIN_GENIE = TRUE
AND <PROJECT_A_FLAG> = FALSE
AND <PROJECT_B_FLAG> = FALSE;
```
**Example**
Here we filter on sample-patient ids that are not present in BPC's breast cancer cohort's latest release and not present in the AKT1 sponsored project
```sql
SELECT SAMPLE_ID, PATIENT_ID
FROM syn72246564
WHERE IN_LATEST_MAIN_GENIE = TRUE
AND IN_BPC_BRCA_RELEASE = FALSE
AND IN_AKT1_PROJECT = FALSE;
```

*Sample Result*
| SAMPLE_ID | PATIENT_ID |
| ------------- | ------------- |
| P-0011111-T01 | GENIE-0002222 |
| P-0033333-T02 | GENIE-0004444 |

"""

    wiki = Wiki(
        title="Patient and Sample Tracking Table",
        owner=table,
        markdown=content,
    )
    # Store first so the confirmation is only printed once the wiki exists.
    syn.store(wiki)
    print(f"Created wiki for table: {table.name} ({table.id})")


def main():
parser = argparse.ArgumentParser(
description="Generate a Synapse table schema from a data model TSV."
)
parser.add_argument(
"--data-model-synid",
default="syn71411200",
help="Synapse ID of the data model TSV (e.g. syn71411200).",
description="Generate a Synapse table schema from a data model."
)
parser.add_argument(
"--project-synid",
Expand All @@ -137,11 +251,9 @@ def main():
)

args = parser.parse_args()
create_table(
data_model_synid=args.data_model_synid,
project_synid=args.project_synid,
table_name=args.table_name,
)
syn = synapseclient.login()
# Keep the stored table entity: add_table_wiki requires both the client and
# the table, and calling it with only `syn` raises a TypeError.
table = create_table(project_synid=args.project_synid, table_name=args.table_name)
add_table_wiki(syn, table)


if __name__ == "__main__":
Expand Down
1 change: 0 additions & 1 deletion scripts/table_schemas/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
synapseclient>=4.9,<5.0
pandas>=2.0,<3.0
Loading