Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ python3 clinical.py
--cli_to_oncotree_mapping_synid syn66313842 \
--datahub_tools_path /<some_path>/datahub-study-curation-tools \
--lens_id_mapping_synid syn68826836 \
--neoantigen-data-synid syn21841882
--neoantigen_data_synid syn21841882
```

Run maf processing
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
[project]
name = "cbioportal_export"
version = "0.1.0"
requires-python = ">=3.9,<3.12"
dependencies = [
"synapseclient[pandas]>=4,<5",
"pandas>=2.2",
"pyyaml==6.0"
]

[dependency-groups]
Expand Down
25 changes: 3 additions & 22 deletions src/iatlascbioportalexport/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import sys

import synapseclient
import synapseutils

import utils

Expand Down Expand Up @@ -49,44 +48,26 @@ def save_to_synapse(
dataset_name (str): name of the iatlas dataset to save to
synapse
datahub_tools_path (str): Path to the datahub tools repo
output_folder_synid (str): Synapse id of the output folder
output_folder_synid (str): Synapse id of the outputs folder
version_comment (str): Version comment for this iteration of files on synapse. Optional.
Defaults to None.
"""
# TODO: Make into argument
dataset_dir = os.path.join(datahub_tools_path, "add-clinical-header", dataset_name)
# see if dataset_folder exists
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove code as we just need to input the output folder synapse id directly instead of trying to create one inside the project

dataset_folder_exists = False
for _, directory_names, _ in synapseutils.walk(syn=syn, synId=output_folder_synid):
directories = directory_names # top level directories
break

for dataset_folder in directories:
if dataset_name == dataset_folder[0]:
dataset_folder_exists = True
dataset_folder_id = dataset_folder[1]
break

if not dataset_folder_exists:
new_dataset_folder = synapseclient.Folder(
dataset_name, parent=output_folder_synid
)
dataset_folder_id = syn.store(new_dataset_folder).id

# store required files
for file in utils.REQUIRED_OUTPUT_FILES:
syn.store(
synapseclient.File(
f"{dataset_dir}/{file}",
name=file,
parent=dataset_folder_id,
parent=output_folder_synid,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want to save all files at the same level of the repo?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that is the expected folder structure. Case list files get their own folder within the output folder

version_comment=version_comment
)
)

# store case lists
case_list_files = os.listdir(os.path.join(dataset_dir, "case_lists"))
case_list_folder = synapseclient.Folder("case_lists", parent=dataset_folder_id)
case_list_folder = synapseclient.Folder("case_lists", parent=output_folder_synid)
try:
case_list_folder_id = syn.store(case_list_folder).id
except:
Expand Down
139 changes: 1 addition & 138 deletions src/iatlascbioportalexport/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,130 +6,10 @@
from typing import Dict

import pandas as pd
import synapseclient

import utils

my_agent = "iatlas-cbioportal/0.0.0"
syn = synapseclient.Synapse(user_agent=my_agent).login()

# Column names that validate_that_required_columns_are_present compares
# against the columns of the maf dataframe it is given.
REQUIRED_MAF_COLS = [
"Hugo_Symbol",
"Entrez_Gene_Id",
"Center",
"NCBI_Build",
"Chromosome",
"Start_Position",
"End_Position",
"Strand",
"Consequence",
"Variant_Classification",
"Variant_Type",
"Reference_Allele",
"Tumor_Seq_Allele1",
"Tumor_Seq_Allele2",
"dbSNP_RS",
"dbSNP_Val_Status",
"Tumor_Sample_Barcode",
"Matched_Norm_Sample_Barcode",
"Match_Norm_Seq_Allele1",
"Match_Norm_Seq_Allele2",
"Tumor_Validation_Allele1",
"Tumor_Validation_Allele2",
"Match_Norm_Validation_Allele1",
"Match_Norm_Validation_Allele2",
"Verification_Status",
"Validation_Status",
"Mutation_Status",
"Sequencing_Phase",
"Sequence_Source",
"Validation_Method",
"Score",
"BAM_File",
"Sequencer",
"n_ref_count",
"n_alt_count",
"HGVSc",
"HGVSp",
"HGVSp_Short",
"Transcript_ID",
"RefSeq",
"Protein_position",
"Codons",
"Exon_Number",
"AA_AF",
"AF",
"AFR_AF",
"ALLELE_NUM",
"AMR_AF",
"ASN_AF",
"Allele",
"Amino_acids",
"BIOTYPE",
"CANONICAL",
"CCDS",
"CDS_position",
"CLIN_SIG",
"DISTANCE",
"DOMAINS",
"EAS_AF",
"EA_AF",
"ENSP",
"EUR_AF",
"EXON",
"Existing_variation",
"FILTER",
"Feature",
"Feature_type",
"GENE_PHENO",
"Gene",
"HGNC_ID",
"HGVS_OFFSET",
"HIGH_INF_POS",
"IMPACT",
"INTRON",
"MINIMISED",
"MOTIF_NAME",
"MOTIF_POS",
"MOTIF_SCORE_CHANGE",
"PHENO",
"PICK",
"PUBMED",
"PolyPhen",
"SAS_AF",
"SIFT",
"SOMATIC",
"STRAND_VEP",
"SWISSPROT",
"SYMBOL",
"SYMBOL_SOURCE",
"TREMBL",
"TSL",
"UNIPARC",
"VARIANT_CLASS",
"all_effects",
"cDNA_position",
"flanking_bps",
"genomic_location_explanation",
"gnomADe_AF",
"gnomADe_AFR_AF",
"gnomADe_AMR_AF",
"gnomADe_ASJ_AF",
"gnomADe_EAS_AF",
"gnomADe_FIN_AF",
"gnomADe_NFE_AF",
"gnomADe_OTH_AF",
"gnomADe_SAS_AF",
"n_depth",
"t_depth",
"t_ref_count",
"t_alt_count",
"vcf_id",
"vcf_pos",
"vcf_qual",
"Annotation_Status",
]

syn = utils.synapse_login()

def read_and_merge_maf_files(input_folder_synid: str) -> pd.DataFrame:
"""Read in and merge MAF files from a specified folder
Expand Down Expand Up @@ -383,20 +263,6 @@ def validate_that_allele_freq_are_not_na(
)


def validate_that_required_columns_are_present(
    input_df: pd.DataFrame, **kwargs
) -> None:
    """Log an error if any required maf column is missing from the input.

    Only columns listed in ``REQUIRED_MAF_COLS`` that are absent from
    ``input_df`` are reported. Extra columns in ``input_df`` are not treated
    as an error.

    Args:
        input_df (pd.DataFrame): maf data whose columns are checked
        **kwargs: optional ``logger`` used for reporting. Defaults to the
            module-level logger.
    """
    logger = kwargs.get("logger", logging.getLogger(__name__))
    missing_cols = set(REQUIRED_MAF_COLS) - set(input_df.columns)
    # Report only when something is actually missing. Comparing the two sets
    # for equality would also fire on extra columns and log an empty list.
    if missing_cols:
        # Sorted so the log message is deterministic across runs.
        logger.error(f"Missing required columns in maf: {sorted(missing_cols)}")


def main():
parser = argparse.ArgumentParser()
parser.add_argument(
Expand Down Expand Up @@ -464,9 +330,6 @@ def main():
validate_export_files(
input_df=maf_df, output_df=mafs["annotated_maf"], logger=dataset_logger
)
validate_that_required_columns_are_present(
mafs["annotated_maf"], logger=dataset_logger
)
validate_that_allele_freq_are_not_na(mafs["annotated_maf"], logger=dataset_logger)
generate_meta_files(
dataset_name=args.dataset, datahub_tools_path=args.datahub_tools_path
Expand Down
1 change: 0 additions & 1 deletion src/iatlascbioportalexport/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"meta_gene_signatures.txt",
"data_rna_seq_mrna.txt",
"meta_rna_seq_mrna.txt",
"cbioportal_validator_output.txt",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you track why this file is removed?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think my comment got lost in the old orca-recipes repo, but this is removed because it's not a required file that we need to run validation on haha

]


Expand Down
Loading