Skip to content

Commit 5197a4a

Browse files
committed
create sql script
1 parent c7e94ed commit 5197a4a

File tree

2 files changed

+104
-0
lines changed

2 files changed

+104
-0
lines changed

dev/create_sc_expression_sql.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#!/usr/bin/env python
2+
"""
3+
Create a SQL script to insert single_cell_expression data into the database
4+
"""
5+
6+
import json
7+
import pandas as pd
8+
9+
# Value written to genetic_profile.CANCER_STUDY_ID for the new profile.
# NOTE(review): presumably the internal ID of study "msk_spectrum_tme_2022"
# in the target database -- confirm before running against another instance.
cancer_study_id = 40
# Primary key of the genetic_profile row created below; it is also written to
# every single_cell_expression row.
# NOTE(review): the value looks arbitrary ("just a high number") -- confirm it
# cannot collide with an existing GENETIC_PROFILE_ID.
genetic_profile_id = 10000
# Stable ID of the profile. The DELETE and the INSERT below share it so the
# cleanup always removes exactly the row that is re-inserted, regardless of
# whether the database collation is case-sensitive.
genetic_profile_stable_id = "SINGLE_CELL_EXPRESSION"


# (Re)creates the target table from scratch.
create_table_statement = """
DROP TABLE IF EXISTS single_cell_expression;
CREATE TABLE IF NOT EXISTS single_cell_expression (
GENETIC_PROFILE_ID int NOT NULL,
SAMPLE_ID int NOT NULL,
TISSUE varchar(255) NOT NULL,
CELL_TYPE varchar(255) NOT NULL,
ENTREZ_GENE_ID int NOT NULL,
EXPRESSION_VALUE float,
FOREIGN KEY(GENETIC_PROFILE_ID) REFERENCES genetic_profile(GENETIC_PROFILE_ID),
FOREIGN KEY(SAMPLE_ID) REFERENCES sample(INTERNAL_ID),
FOREIGN KEY(ENTREZ_GENE_ID) REFERENCES gene(ENTREZ_GENE_ID)
);
"""

# Removes any previous profile with the same stable ID, then registers it anew.
add_genetic_profile = f"""
DELETE FROM genetic_profile WHERE STABLE_ID = "{genetic_profile_stable_id}";
INSERT INTO genetic_profile (
GENETIC_PROFILE_ID, STABLE_ID, CANCER_STUDY_ID, GENETIC_ALTERATION_TYPE,
DATATYPE, NAME, DESCRIPTION, SHOW_PROFILE_IN_ANALYSIS_TAB
) VALUES (
{genetic_profile_id}, "{genetic_profile_stable_id}", {cancer_study_id}, "single_cell_expression",
"single_cell_expression", "Single Cell Expression", "Single Cell Expression", 1
);
"""

# Opening of the multi-row INSERT; the row tuples and the closing ");" are
# appended by create_data_sql().
insert_data_start = """
INSERT INTO single_cell_expression (
GENETIC_PROFILE_ID, SAMPLE_ID, TISSUE, CELL_TYPE, ENTREZ_GENE_ID, EXPRESSION_VALUE
) VALUES (
"""

# Separator between two row tuples, one tuple per line.
# NOTE(review): not referenced by the code visible in this file.
value_split = "\n), (\n"
48+
49+
def create_sample_map():
    """Build a lookup from sample stable ID to internal sample ID.

    Reads ``sample_map.tsv`` from the current working directory. The file is
    a tab-separated export of the query below; its first line is skipped and
    the second line must be the column header.

    SQL:
        SELECT sample.INTERNAL_ID, sample.STABLE_ID FROM sample
        INNER JOIN patient ON sample.PATIENT_ID = patient.INTERNAL_ID
        INNER JOIN cancer_study ON patient.CANCER_STUDY_ID = cancer_study.CANCER_STUDY_ID
        WHERE cancer_study.CANCER_STUDY_IDENTIFIER = "msk_spectrum_tme_2022";

    Rows with a missing value in any column are dropped.

    Returns:
        dict mapping ``STABLE_ID`` (str) to ``INTERNAL_ID`` (int).
    """
    # Keep stable IDs as text so numeric-looking IDs still match JSON keys,
    # which are always strings.
    data = pd.read_csv(
        "sample_map.tsv", skiprows=1, sep="\t", dtype={"STABLE_ID": str}
    )
    data = data.dropna(how="any")
    # A column containing blanks is parsed as float; cast back so the ID is
    # rendered as "12" rather than "12.0" in the generated SQL.
    internal_ids = data["INTERNAL_ID"].astype(int).tolist()
    # Keyed by stable ID: the consumer looks samples up by the ID found in the
    # expression JSON and writes the mapped value into the int SAMPLE_ID column.
    return dict(zip(data["STABLE_ID"], internal_ids))
59+
60+
61+
def create_gene_map():
    """Build a lookup from HUGO gene symbol to Entrez gene ID.

    Reads ``gene_map.tsv`` from the current working directory. The file is a
    tab-separated export of the query below; its first line is skipped and
    the second line must be the column header.

    SQL:
        select ENTREZ_GENE_ID, HUGO_GENE_SYMBOL from gene;

    Rows with a missing value in any column are dropped.

    Returns:
        dict mapping ``HUGO_GENE_SYMBOL`` (str) to ``ENTREZ_GENE_ID`` (int).
    """
    # Keep symbols as text so they compare equal to the JSON keys.
    data = pd.read_csv(
        "gene_map.tsv", skiprows=1, sep="\t", dtype={"HUGO_GENE_SYMBOL": str}
    )
    data = data.dropna(how="any")
    # A column containing blanks is parsed as float; cast back so the ID is
    # rendered as "7157" rather than "7157.0" in the generated SQL.
    entrez_ids = data["ENTREZ_GENE_ID"].astype(int).tolist()
    # Keyed by symbol: the consumer looks genes up by the name found in the
    # expression JSON and writes the mapped value into the int ENTREZ_GENE_ID
    # column.
    return dict(zip(data["HUGO_GENE_SYMBOL"], entrez_ids))
68+
69+
70+
def _sql_string(text) -> str:
    """Return *text* as a double-quoted SQL string literal with escaping."""
    # Escape backslashes first, then double any embedded quote character.
    escaped = str(text).replace("\\", "\\\\").replace('"', '""')
    return f'"{escaped}"'


def _sql_number(value) -> str:
    """Return *value* as a SQL numeric literal, or NULL when it is unusable."""
    import math

    if value is None:
        return "NULL"
    # json accepts NaN/Infinity, but neither is a valid SQL float literal.
    if isinstance(value, float) and not math.isfinite(value):
        return "NULL"
    return f"{value}"


def create_data_sql() -> str:
    """Render the row tuples of the single_cell_expression INSERT statement.

    Reads ``sc-expression-msk-spectrum.json`` from the current working
    directory. The loops below walk it as
    ``{sample: {tissue: {cell_type: {gene: value}}}}``. Samples and genes
    that have no entry in the sample/gene lookup tables are skipped.

    Returns:
        The text that follows the opening ``VALUES (`` of the INSERT: row
        tuples separated by ``), (`` and terminated by ``);``. When no row
        survives the lookups only ``);`` is returned, which does not form a
        valid statement once appended to the INSERT header.
    """
    sample_map = create_sample_map()
    gene_map = create_gene_map()
    with open("sc-expression-msk-spectrum.json", encoding="utf-8") as f:
        data = json.load(f)

    # Collect rows and join once instead of growing one huge string.
    rows = []
    for sample_id, tissues in data.items():
        mapped_sample_id = sample_map.get(sample_id)
        if mapped_sample_id is None:
            continue
        for tissue, cell_types in tissues.items():
            tissue_sql = _sql_string(tissue)
            for cell_type, expression in cell_types.items():
                cell_type_sql = _sql_string(cell_type)
                for gene, value in expression.items():
                    mapped_gene_id = gene_map.get(gene)
                    if mapped_gene_id is None:
                        continue
                    rows.append(
                        f"{genetic_profile_id}, {mapped_sample_id}, "
                        f"{tissue_sql}, {cell_type_sql}, "
                        f"{mapped_gene_id}, {_sql_number(value)} "
                    )
    return "), (".join(rows) + ");"
95+
96+
def create_sql_file():
    """Write the complete SQL script to ``single_cell_expression.sql``.

    The file is created in the current working directory (overwriting any
    existing one) and contains, in order: the table (re)creation statement,
    the genetic profile registration, and the data INSERT.
    """
    # Generate the data before touching the output file, so a failure while
    # reading the inputs does not leave a truncated script behind.
    data_sql = create_data_sql()
    with open("single_cell_expression.sql", "w", encoding="utf-8") as f:
        f.write(create_table_statement + "\n")
        f.write(add_genetic_profile + "\n")
        f.write(insert_data_start)
        f.write(data_sql)
102+
103+
# Script entry point: generates the SQL file immediately. The call is at
# module level, so it also runs whenever this module is imported.
create_sql_file()

dev/sc-expression-msk-spectrum.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)