|
| 1 | +#!/usr/bin/env python |
| 2 | +""" |
| 3 | +Create a SQL script to insert single_cell_expression data into the database |
| 4 | +""" |
| 5 | + |
| 6 | +import json |
| 7 | +import pandas as pd |
| 8 | + |
# Value written to genetic_profile.CANCER_STUDY_ID.
# NOTE(review): presumably the internal id of study "msk_spectrum_tme_2022"
# in the target database -- confirm before loading elsewhere.
cancer_study_id = 40
# Primary key used for the new genetic_profile row and for every
# single_cell_expression row that references it.
# NOTE(review): the original author's note suggests any unused, sufficiently
# high number works -- confirm it cannot collide with an existing profile.
genetic_profile_id = 10000
# STABLE_ID of the genetic_profile row. Shared by the DELETE and the INSERT
# below so a re-run removes exactly the row it is about to re-create, even
# under a case-sensitive collation.
genetic_profile_stable_id = "SINGLE_CELL_EXPRESSION"


# (Re)creates the destination table from scratch on every run.
create_table_statement = """
DROP TABLE IF EXISTS single_cell_expression;
CREATE TABLE IF NOT EXISTS single_cell_expression (
    GENETIC_PROFILE_ID int NOT NULL,
    SAMPLE_ID int NOT NULL,
    TISSUE varchar(255) NOT NULL,
    CELL_TYPE varchar(255) NOT NULL,
    ENTREZ_GENE_ID int NOT NULL,
    EXPRESSION_VALUE float,
    FOREIGN KEY(GENETIC_PROFILE_ID) REFERENCES genetic_profile(GENETIC_PROFILE_ID),
    FOREIGN KEY(SAMPLE_ID) REFERENCES sample(INTERNAL_ID),
    FOREIGN KEY(ENTREZ_GENE_ID) REFERENCES gene(ENTREZ_GENE_ID)
);
"""

# Replaces the genetic_profile row that the expression rows point at.
add_genetic_profile = f"""
DELETE FROM genetic_profile WHERE STABLE_ID = "{genetic_profile_stable_id}";
INSERT INTO genetic_profile (
    GENETIC_PROFILE_ID, STABLE_ID, CANCER_STUDY_ID, GENETIC_ALTERATION_TYPE,
    DATATYPE, NAME, DESCRIPTION, SHOW_PROFILE_IN_ANALYSIS_TAB
) VALUES (
    {genetic_profile_id}, "{genetic_profile_stable_id}", {cancer_study_id}, "single_cell_expression",
    "single_cell_expression", "Single Cell Expression", "Single Cell Expression", 1
);
"""

# Opening of the multi-row INSERT; the caller appends the row tuples and the
# closing ");" after this text.
insert_data_start = """
INSERT INTO single_cell_expression (
    GENETIC_PROFILE_ID, SAMPLE_ID, TISSUE, CELL_TYPE, ENTREZ_GENE_ID, EXPRESSION_VALUE
) VALUES (
"""

# Separator placed between two consecutive row tuples of the INSERT.
value_split = "\n), (\n"
| 48 | + |
def create_sample_map():
    """Build a lookup from sample STABLE_ID to the database INTERNAL_ID.

    Reads ``sample_map.tsv`` from the current working directory. The first
    line of the file is skipped and the second line is used as the header,
    which must contain the ``INTERNAL_ID`` and ``STABLE_ID`` columns. The
    file is meant to hold the result of this query::

        SELECT sample.INTERNAL_ID, sample.STABLE_ID FROM sample
        INNER JOIN patient ON sample.PATIENT_ID = patient.INTERNAL_ID
        INNER JOIN cancer_study ON patient.CANCER_STUDY_ID = cancer_study.CANCER_STUDY_ID
        WHERE cancer_study.CANCER_STUDY_IDENTIFIER = "msk_spectrum_tme_2022";

    Rows with any missing value are dropped.

    Returns:
        dict mapping each STABLE_ID (``str``) to its INTERNAL_ID (``int``).
    """
    # Stable ids are looked up with string keys, so never let pandas turn a
    # numeric-looking id into a number.
    data = pd.read_csv(
        "sample_map.tsv", skiprows=1, sep="\t", dtype={"STABLE_ID": str}
    )
    data = data.dropna(how="any")
    # Key by STABLE_ID: callers hold the stable id and need the internal one.
    # int() undoes the float dtype pandas assigns to a column that contained
    # a missing value, so the id renders as "12" rather than "12.0".
    return {
        stable_id: int(internal_id)
        for internal_id, stable_id in zip(data["INTERNAL_ID"], data["STABLE_ID"])
    }
| 59 | + |
| 60 | + |
def create_gene_map():
    """Build a lookup from HUGO gene symbol to ENTREZ gene id.

    Reads ``gene_map.tsv`` from the current working directory. The first
    line of the file is skipped and the second line is used as the header,
    which must contain the ``ENTREZ_GENE_ID`` and ``HUGO_GENE_SYMBOL``
    columns. The file is meant to hold the result of this query::

        select ENTREZ_GENE_ID, HUGO_GENE_SYMBOL from gene;

    Rows with any missing value are dropped.

    Returns:
        dict mapping each HUGO_GENE_SYMBOL (``str``) to its ENTREZ_GENE_ID
        (``int``). If a symbol occurs more than once, the last row wins.
    """
    # Symbols are looked up with string keys; keep them as text.
    data = pd.read_csv(
        "gene_map.tsv", skiprows=1, sep="\t", dtype={"HUGO_GENE_SYMBOL": str}
    )
    data = data.dropna(how="any")
    # Key by symbol: callers hold the symbol and need the numeric id.
    # int() undoes the float dtype pandas assigns to a column that contained
    # a missing value.
    return {
        symbol: int(entrez_id)
        for entrez_id, symbol in zip(data["ENTREZ_GENE_ID"], data["HUGO_GENE_SYMBOL"])
    }
| 68 | + |
| 69 | + |
def _sql_string(text):
    """Return *text* as a double-quoted SQL string literal.

    Backslashes and double quotes are backslash-escaped.
    NOTE(review): assumes MySQL with backslash escapes enabled (the default
    sql_mode) -- confirm against the target server.
    """
    escaped = str(text).replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def _sql_number(value):
    """Render an expression value for SQL; ``None`` and non-finite floats become NULL."""
    import math

    if value is None:
        return "NULL"
    if isinstance(value, float) and not math.isfinite(value):
        return "NULL"
    return f"{value}"


def create_data_sql() -> str:
    """Build the row tuples that complete the ``INSERT ... VALUES (`` statement.

    Reads ``sc-expression-msk-spectrum.json`` from the current working
    directory. The file is walked as nested objects:
    sample -> tissue -> cell type -> gene -> expression value. Sample keys
    are looked up in the mapping returned by ``create_sample_map()`` and gene
    keys in the mapping returned by ``create_gene_map()``; entries without a
    match are skipped.

    Returns:
        The rows joined with ``value_split`` and terminated by ``);``. The
        text deliberately starts without an opening parenthesis because
        ``insert_data_start`` already ends with one.

    Raises:
        ValueError: if no row could be produced; returning a bare ``);``
            would complete the INSERT as the invalid ``VALUES ( );``.
    """
    sample_map = create_sample_map()
    gene_map = create_gene_map()
    with open("sc-expression-msk-spectrum.json", encoding="utf-8") as f:
        data = json.load(f)

    # Collect rows in a list and join once instead of growing one string.
    rows = []
    for sample_id, tissues in data.items():
        mapped_sample_id = sample_map.get(sample_id)
        if mapped_sample_id is None:
            continue
        for tissue, cell_types in tissues.items():
            for cell_type, expression in cell_types.items():
                for gene, value in expression.items():
                    mapped_gene_id = gene_map.get(gene)
                    if mapped_gene_id is None:
                        continue
                    rows.append(
                        f"{genetic_profile_id}, {mapped_sample_id}, "
                        f"{_sql_string(tissue)}, {_sql_string(cell_type)}, "
                        f"{mapped_gene_id}, {_sql_number(value)}"
                    )

    if not rows:
        raise ValueError(
            "no expression rows matched sample_map.tsv / gene_map.tsv; "
            "refusing to emit an empty INSERT statement"
        )

    sql = value_split.join(rows) + "\n);"
    # Kept from the original implementation: echo the generated rows to stdout.
    print(sql)
    return sql
| 95 | + |
def create_sql_file():
    """Write the complete load script to ``single_cell_expression.sql``.

    The script contains, in order: the table (re)creation statement, the
    genetic_profile replacement statement, the opening of the INSERT and the
    row tuples returned by ``create_data_sql()``. The file is created in the
    current working directory and overwritten if it already exists.
    """
    # Build the rows before opening the output, so a failure while reading
    # the inputs cannot leave a truncated, half-written script behind.
    data_sql = create_data_sql()
    with open("single_cell_expression.sql", "w", encoding="utf-8") as f:
        f.write(create_table_statement + "\n")
        f.write(add_genetic_profile + "\n")
        f.write(insert_data_start)
        f.write(data_sql)


# Script entry point. There is no ``__main__`` guard, so this also runs when
# the module is imported.
create_sql_file()
0 commit comments