-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLinkProjectToSamples.py
More file actions
275 lines (246 loc) · 11.7 KB
/
LinkProjectToSamples.py
File metadata and controls
275 lines (246 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import requests
from requests.exceptions import HTTPError
import re
import socket
from os import listdir
import os.path
import subprocess
import time
import sys
import setaccess
# Service endpoints: ngs-stats lists the FASTQ files of a request, LimsRest provides QC results
# and recent deliveries.
NGS_STATS_ENDPOINT = "http://igodb.mskcc.org:8080/ngs-stats/permissions/getRequestPermissions/"
LIMS_ENDPOINT = "https://igolims.mskcc.org:8443/LimsRest"

# Default path templates and roots (overridden below when running on the SDC host).
FASTQ_ROOT = "/igo/delivery/FASTQ/%s/Project_%s/%s"  # (runID, requestID, Sample)
DELIVERY_ROOT = "/igo/delivery/share/%s/Project_%s/%s"  # (labName, requestID, trimmedRun)
DELIVERY = "/igo/delivery/"
NANOPORE_DELIVERY = "/igo/delivery/nanopore/"
SDC = False

# LimsRest credentials are read from the current working directory:
# first line is the username, second line is the password.
# The file is opened in a `with` block so the handle is closed as soon as it has been read.
with open('ConnectLimsRest.txt', 'r') as file1:
    allLines = file1.readlines()
username = allLines[0].strip()
password = allLines[1].strip()

# Hosts whose name starts with "isvigoacl01" use the /ifs/datadelivery/igo_core layout instead.
if socket.gethostname().startswith("isvigoacl01"):
    print("Setting default paths for SDC")
    SDC = True
    FASTQ_ROOT = "/ifs/datadelivery/igo_core/FASTQ/%s/Project_%s/%s"  # (runID, requestID, Sample)
    DELIVERY_ROOT = "/ifs/datadelivery/igo_core/share/%s/Project_%s/%s"  # (labName, requestID, trimmedRun)
    DELIVERY = "/ifs/datadelivery/igo_core/"
    NANOPORE_DELIVERY = "/ifs/datadelivery/igo_core/nanopore/"
# Takes a requestID and returns the JSON dictionary served by the ngs-stats endpoint.
def get_NGS_stats(reqID):
    """Query the ngs-stats permissions endpoint for `reqID` and return the decoded JSON body.

    The request is sent with certificate verification disabled (`verify=False`).
    The HTTP status is not checked before the body is decoded.
    """
    return requests.get(NGS_STATS_ENDPOINT + reqID, verify=False).json()
def get_qc_stats(reqID):
    """Fetch per-run, per-sample QC information for a request from LimsRest.

    Returns a dictionary shaped as
    ``{run: {sample_folder_name: {"recipe": ..., "qcstatus": ...}}}``,
    built from the "samples" list of the first element of the getProjectQc response.
    Returns None (after printing the error) when the HTTP status check fails.
    """
    qc_query_url = LIMS_ENDPOINT + "/getProjectQc?project=" + reqID
    try:
        response = requests.get(qc_query_url, auth=(username, password), verify=False)
        response.raise_for_status()
        qc_by_run = {}
        for entry in response.json()[0]["samples"]:
            run = entry["qc"]["run"]
            # Truncate baseId to first 3 parts (e.g., "17123_D_108" from "17123_D_108_1_2_1_1")
            # to match the FASTQ folder naming convention
            short_base_id = "_".join(entry["baseId"].split("_")[:3])
            sample_name = "Sample_" + entry["qc"]["sampleName"] + "_IGO_" + short_base_id
            samples_of_run = qc_by_run.setdefault(run, {})
            samples_of_run[sample_name] = {
                "recipe": entry["recipe"],
                "qcstatus": entry["qc"]["qcStatus"],
            }
        return qc_by_run
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
        return None
# NGS_Stats wraps the JSON returned by the ngs-stats endpoint.
class NGS_Stats:
    """Holds the fields of an ngs-stats response plus a sample -> runs lookup."""

    def __init__(self, stats_json):
        """Copy the needed fields out of `stats_json` and derive the sample/run mapping."""
        self.labName = stats_json["labName"]  # name of delivery folder
        self.fastq_list = stats_json["fastqs"]  # original fastq files that need to be linked
        # Derived from fastq_list, so it must be computed after fastq_list is set.
        self.samples = self.get_sample_run_dict()
        self.requestName = stats_json["requestName"]
        self.isDLP = stats_json["isDLP"]

    def get_sample_run_dict(self):
        """Return ``{sample: [run, ...]}`` built from the paths in ``self.fastq_list``.

        Each path is split on "/"; component 4 is taken as the run and component 6
        as the sample. Runs are listed once per sample, in first-seen order.
        """
        runs_by_sample = {}
        for fastq_path in self.fastq_list:
            parts = fastq_path.split("/")
            run = parts[4]
            sample = parts[6]
            known_runs = runs_by_sample.setdefault(sample, [])
            if run not in known_runs:
                known_runs.append(run)
        return runs_by_sample
def trimRunID(runID):
    """Return the leading ``<alphanumerics>_<digits>`` part of `runID`, printing it as well.

    Example: "RUTH_0089_AHHLYJDSX3" -> "RUTH_0089".
    """
    matched = re.match("([A-Za-z0-9]+_[0-9]+).*", runID)
    trimmedRun = matched.group(1)
    print("Trimmed Run: {}".format(trimmedRun))
    return trimmedRun
# Given reqID, sample and its list of runs: when several runs share the same trimmed run ID,
# keep only the latest one.
def updateRun(runs, reqID, sample):
    """Collapse runs that share a trimmed run ID down to the most recently modified one.

    Runs are grouped by ``trimRunID(run)``. A group with a single run is kept as is.
    For a larger group, the run whose FASTQ sample folder
    (``DELIVERY + "FASTQ/<run>/Project_<reqID>/<sample>"``) has the newest
    modification time is kept; ties keep the earlier run in the list.

    Folders that do not exist are ignored, so a missing folder for the first run no
    longer prevents an existing later run from being selected. If none of the
    folders exist, the first run of the group is returned, as before.

    Returns the list of selected run IDs, one per trimmed run ID.
    """
    runs_by_trimmed = {}
    for run in runs:
        runs_by_trimmed.setdefault(trimRunID(run), []).append(run)

    updatedRuns = []
    for candidates in runs_by_trimmed.values():
        if len(candidates) == 1:
            updatedRuns.append(candidates[0])
            continue

        latest_run = None
        latest_mtime = None
        for candidate in candidates:
            candidate_path = DELIVERY + "FASTQ/{}/Project_{}/{}".format(candidate, reqID, sample)
            # Projects may contain old samples whose folders are gone (eg: 08822); skip those.
            if not os.path.exists(candidate_path):
                continue
            candidate_mtime = os.path.getmtime(candidate_path)
            if latest_run is None or candidate_mtime > latest_mtime:
                latest_run = candidate
                latest_mtime = candidate_mtime

        updatedRuns.append(candidates[0] if latest_run is None else latest_run)
    return updatedRuns
# DLP has a different linking rule: whole FASTQ folders are linked, not individual samples
# step 1 get project ID as input, query from db to get fastq list eg :http://igodb.mskcc.org:8080/ngs-stats/permissions/getRequestPermissions/13117_B
# step 2 create symbol links eg: ln -sf /igo/delivery/FASTQ/RUTH_0089_AHHLYJDSX3/Project_13117_B/Sample_HCTWT1_IGO_13117_B_1 /igo/delivery/share/bakhoums/Project_13117_B/RUTH_0089
# step 3 call setaccess
def _run_mkdir(path):
    """Create `path` with the `mkdir` command, passing the path as a single argument (no shell)."""
    subprocess.run(["mkdir", path])


def _run_symlink(slink, dlink):
    """Print and run `ln -sf slink dlink`, passing both paths as single arguments (no shell)."""
    print("ln -sf {} {}".format(slink, dlink))
    subprocess.run(["ln", "-sf", slink, dlink])


def link_by_request(reqID):
    """Create the delivery folder for request `reqID`, link its data into it and set ACLs.

    The lab folder ``DELIVERY/share/<labName>`` is created (and made +rx) if missing,
    then the project folder ``Project_<reqID>`` is created under a 0o007 umask.
    What gets linked depends on the request:

    * DLP requests: every directory that holds one of the request's FASTQ files is
      linked into a sub-folder of the project folder.
    * Requests named "Nanopore": the matching ``Project_<reqID>`` folder under
      NANOPORE_DELIVERY is linked into the project folder.
    * Everything else: each sample folder is linked into a per-run folder, unless its
      QC status is missing or "Failed" or the source folder does not exist.

    Finally ``setaccess.set_request_acls(reqID, "")`` is called.

    Commands are run with argument lists rather than through a shell, so paths
    containing spaces or shell metacharacters are passed through literally.
    The umask that was active on entry is restored before returning.
    """
    json_info = get_NGS_stats(reqID)
    stats = NGS_Stats(json_info)
    labName = stats.labName
    request_name = stats.requestName  # ie PEDPEG, SingleCell, etc.
    isDLP = stats.isDLP
    print("RequestName: " + request_name)

    # check if lab folder exist, if not create one
    labDir = DELIVERY + "share/%s" % (labName)
    projDir = DELIVERY + "share/%s/Project_%s" % (labName, reqID)
    if not os.path.exists(labDir):
        _run_mkdir(labDir)
        # piDir is always readable by all, Project dirs are not
        subprocess.run(["chmod", "+rx", labDir])

    # then change project dirs to not world readable
    mask = 0o007
    # Set the current umask value and get the previous umask value
    umask = os.umask(mask)
    print("Current umask:", mask)
    print("Previous umask:", umask)
    try:
        if not os.path.exists(projDir):
            _run_mkdir(projDir)

        if SDC:  # replace all paths /igo/delivery/FASTQ with /ifs/datadelivery/igo_core/FASTQ
            print("Replacing all paths for SDC")
            stats.fastq_list = [
                s.replace("/igo/delivery/FASTQ", "/ifs/datadelivery/igo_core/FASTQ")
                for s in stats.fastq_list
            ]

        run_sample_qc = get_qc_stats(reqID)

        if isDLP:
            # DLP: link each FASTQ directory rather than each sample
            print("Linking DLP run")
            fastq_directories = {os.path.dirname(fastq) for fastq in stats.fastq_list}
            for fastq_dir in fastq_directories:
                # destination folder is named after the second-to-last path component
                dlink = projDir + "/" + fastq_dir.split('/')[-2]
                if not os.path.exists(dlink):
                    _run_mkdir(dlink)
                _run_symlink(fastq_dir, dlink)
        elif request_name == "Nanopore":
            # nanopore data lives under NANOPORE_DELIVERY in a folder named "Project_<reqID>"
            project_folder = "Project_" + reqID
            for project_dir in os.listdir(NANOPORE_DELIVERY):
                if project_folder == project_dir:
                    _run_symlink(NANOPORE_DELIVERY + project_dir, projDir)
        else:
            if run_sample_qc is None:
                # get_qc_stats returns None when the QC query fails; without QC data no
                # sample can be cleared for delivery, so report it instead of crashing.
                print("No QC information returned for {}; samples cannot be linked".format(reqID))
                run_sample_qc = {}
            madeDir = []
            for sample, runs in stats.samples.items():
                for run in updateRun(runs, reqID, sample):
                    dlink = DELIVERY_ROOT % (labName, reqID, trimRunID(run))
                    slink = FASTQ_ROOT % (run, reqID, sample)
                    # QC results are keyed by the first three "_"-separated parts of the run ID
                    run_key = "_".join(run.split("_")[:3])
                    # run, sample or qcstatus might not exist in the QC results
                    sample_qc = run_sample_qc.get(run_key, {}).get(sample, {})
                    qcstatus = sample_qc.get("qcstatus")
                    if qcstatus is None or qcstatus == "Failed":
                        print("{} from run {} failed, don't deliver".format(sample, run))
                        continue
                    # check if folder exists before create link for cases that project contains old samples eg: 08822
                    if not os.path.exists(slink):
                        print("{} not exits".format(slink))
                        continue
                    # Multiome recipes get the recipe appended to the run folder name
                    recipe = sample_qc["recipe"]
                    if recipe == "SC_Chromium-Multiome-GEX":
                        dlink = dlink + "_GEX"
                    if recipe == "SC_Chromium-Multiome-ATAC":
                        dlink = dlink + "_ATAC"
                    # check if lab/project/run folder exist, if not create one
                    if not os.path.exists(dlink) and dlink not in madeDir:
                        print("mkdir " + dlink)
                        madeDir.append(dlink)
                        _run_mkdir(dlink)
                    _run_symlink(slink, dlink)

        setaccess.set_request_acls(reqID, "")
    finally:
        # os.umask changes the umask of the whole process. Restore the previous value so the
        # 0o007 mask does not leak into the next call (link_by_time calls this in a loop, and
        # the next lab folder must be created and chmod'ed under the original umask).
        os.umask(umask)
# Run link_by_request for every project delivered within a recent time period; the period is passed as an argument, in minutes
# step 1 get the list of projects delivered within the given time period from LIMS eg: "https://igolims.mskcc.org:8443/LimsRest/getRecentDeliveries?time=30&units=m"
# step 2 update the project list by removing the projects without sample information (DLP)
# step 3 call link_by_request for each project in the updated list
def get_recent_delivery(time):
    """Ask LimsRest for the deliveries made during the last `time` minutes.

    Returns the decoded JSON response, or None (after printing the error) when the
    HTTP status check fails.
    """
    query = "{}/getRecentDeliveries?time={}&units=m".format(LIMS_ENDPOINT, time)
    try:
        reply = requests.get(query, auth=(username, password), verify=False)
        reply.raise_for_status()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
        return None
    else:
        return reply.json()
def link_by_time(time):
    """Link every request delivered in the last `time` minutes that carries sample information.

    Entries of the recent-deliveries list without a "samples" key are skipped.
    Prints "check input" and stops when the delivery list could not be fetched.
    """
    recent_deliveries = get_recent_delivery(time)
    if recent_deliveries is None:
        print("check input")
        return

    toDeliver = [entry['requestId'] for entry in recent_deliveries if "samples" in entry]
    if not toDeliver:
        print("No projects need to deliver during last {} mins".format(time))
        return

    for req in toDeliver:
        link_by_request(req)
    print("{} projects are delivered".format(len(toDeliver)))
# Command-line entry point: exactly one argument, either REQUEST=<request> or TIME=<minutes>.
if __name__ == '__main__':
    usage = "Usage: python3 LinkProjectToSamples.py REQUEST=<request> | TIME=<minutes>"
    if len(sys.argv) != 2:
        print(usage)
    else:
        args = sys.argv[1]
        if args.startswith("REQUEST="):
            request = args[len("REQUEST="):]
            link_by_request(request)
        elif args.startswith("TIME="):
            # Kept in a local name so the module-level `time` import is not overwritten.
            minutes = args[len("TIME="):]
            link_by_time(minutes)
        else:
            # An unrecognised argument used to be ignored silently; show the usage instead.
            print(usage)