Features/alignment files indexing on ingestion json #574

Open

wants to merge 5 commits into base: develop

In the diff below, post_to_drs now also runs samtools index for each .bam or .cram input and posts the resulting .bai/.crai index file to the DRS /ingest endpoint alongside the primary file; update_experiment_json then records the returned DRS URIs on the file entry and in its indices list. The change also switches the filter_out_vcf_files input of prepare_files_list from String to Boolean and makes the DRS response handling tolerant of blank or non-JSON lines.

Changes from 4 commits
@@ -37,7 +37,7 @@ workflow experiments_json_with_files {

call write_drs_responses_to_file {
input:
drs_responses = post_to_drs.response_message
drs_responses = flatten(post_to_drs.response_message)
}

call parse_json {
@@ -63,7 +63,7 @@ workflow experiments_json_with_files {
output {
File download_list = prepare_files_list.path_list
Array[String] consolidated_paths_for_drs = prepare_for_drs.consolidated_paths_for_drs
Array[String] drs_responses = post_to_drs.response_message
Array[String] drs_responses = flatten(post_to_drs.response_message)
File results_post_drs = write_drs_responses_to_file.results_post_drs
File processed_drs_responses = parse_json.processed_drs_responses
File final_updated_json = update_experiment_json.final_updated_json
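Note on the flatten() calls above: post_to_drs now emits Array[String] response_message per invocation (one JSON line for the file, plus one for its index when a .bai/.crai is ingested), so the gathered value the workflow sees is an array of arrays and must be flattened before it can be exposed as Array[String] drs_responses. A minimal Python sketch of the equivalent operation, with made-up shard contents (not part of the diff):

# Illustration only: Python equivalent of WDL's flatten() applied to the
# gathered post_to_drs outputs. Each inner list holds the JSON lines printed
# by one post_to_drs call; the example values are hypothetical.
from typing import List

def flatten(nested: List[List[str]]) -> List[str]:
    # Collapse one level of nesting while preserving order, like WDL's flatten().
    return [item for shard in nested for item in shard]

gathered = [
    ['{"name": "sample1.bam"}', '{"name": "sample1.bam.bai"}'],  # BAM plus its index
    ['{"name": "reads2.fastq.gz"}'],                             # non-alignment file, no index
]
assert flatten(gathered) == [
    '{"name": "sample1.bam"}',
    '{"name": "sample1.bam.bai"}',
    '{"name": "reads2.fastq.gz"}',
]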
@@ -74,7 +74,7 @@ task prepare_files_list {
input {
File json_document
String directory
String filter_out_vcf_files
Boolean filter_out_vcf_files
}
command <<<
python3 -c "
@@ -146,20 +146,47 @@ task post_to_drs {
String token
Boolean validate_ssl
}

command <<<
# Extract project_id and dataset_id
project_id=$(python3 -c 'print("~{project_dataset}".split(":")[0])')
dataset_id=$(python3 -c 'print("~{project_dataset}".split(":")[1])')
curl ~{true="" false="-k" validate_ssl} \

resp_main=$(curl ~{true="" false="-k" validate_ssl} \
-X POST \
-F "file=@~{file_path}" \
-F "project_id=$project_id" \
-F "dataset_id=$dataset_id" \
-H "Authorization: Bearer ~{token}" \
--fail-with-body \
"~{drs_url}/ingest"
"~{drs_url}/ingest")
echo "$resp_main" | jq -c

# If it's BAM or CRAM, ingest the index as well
if [[ "~{file_path}" =~ \.(bam|cram)$ ]]; then

if [[ "~{file_path}" =~ \.bam$ ]]; then
index_ext=".bai"
else
index_ext=".crai"
fi

samtools index "~{file_path}" 1>/dev/null 2>&1  # samtools writes the index next to the input, as file.bam.bai or file.cram.crai

resp_index=$(curl ~{true="" false="-k" validate_ssl} \
-X POST \
-F "file=@~{file_path}${index_ext}" \
-F "project_id=$project_id" \
-F "dataset_id=$dataset_id" \
-H "Authorization: Bearer ~{token}" \
--fail-with-body \
"~{drs_url}/ingest")
echo "$resp_index" | jq -c
fi
>>>

output {
String response_message = read_string(stdout())
Array[String] response_message = read_lines(stdout())
}
}
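What the task above now writes to stdout, and how read_lines() consumes it: each curl response is compacted onto a single line with jq, so every stdout line is one JSON document, and a .bam/.cram input yields two lines (the file, then its samtools-generated index). A sketch with hypothetical values; the field names ("name", "self_uri") are the ones read downstream (not part of the diff):

# Sketch only: parsing post_to_drs stdout line by line.
import json

stdout_lines = [
    '{"name": "sample1.bam", "self_uri": "drs://drs.example.org/objects/abc123"}',
    '{"name": "sample1.bam.bai", "self_uri": "drs://drs.example.org/objects/def456"}',
]

for line in stdout_lines:
    resp = json.loads(line)  # one compact JSON document per line
    print(resp["name"], "->", resp["self_uri"])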

@@ -171,18 +198,24 @@ task write_drs_responses_to_file {
command <<<
python3 -c "
import json
# this is a temporary file to store the responses (str) from DRS
temporary_file_drs_responses = '~{write_json(drs_responses)}'
with open(temporary_file_drs_responses, 'r') as file:
drs_responses = json.load(file)

responses = []
for response in drs_responses:
response_corrected = json.loads(response)
responses.append(response_corrected)

with open('results_post_drs.json', 'w') as output_file:
json.dump(responses, output_file, indent=2)
temporary_file_drs_responses = '~{write_json(drs_responses)}'
with open(temporary_file_drs_responses, 'r') as f:
lines = json.load(f)

valid_json_responses = []
for line in lines:
line = line.strip()
if not line:
continue
try:
parsed = json.loads(line)
valid_json_responses.append(parsed)
except json.JSONDecodeError:
pass

with open('results_post_drs.json', 'w') as outfile:
json.dump(valid_json_responses, outfile, indent=2)
"
>>>
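A standalone check of the filtering behaviour above, assuming stdout could contain blank lines or stray non-JSON output: such lines are skipped instead of failing the task. Inputs are made up (not part of the diff):

# Sketch only: mirrors the keep-valid-JSON logic from write_drs_responses_to_file.
import json

def keep_valid_json(lines):
    kept = []
    for line in lines:
        line = line.strip()
        if not line:
            continue  # skip blank lines
        try:
            kept.append(json.loads(line))
        except json.JSONDecodeError:
            continue  # skip anything that is not valid JSON
    return kept

lines = ['{"name": "a.bam", "self_uri": "drs://x/1"}', '', '[samtools] some warning']
assert keep_valid_json(lines) == [{"name": "a.bam", "self_uri": "drs://x/1"}]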

@@ -230,36 +263,58 @@ task update_experiment_json {
}
command <<<
python3 -c "
import json
import json, os

with open('~{json_document}', 'r') as file:
data = json.load(file)
with open('~{json_document}', 'r') as f:
data = json.load(f)

with open('~{processed_drs_responses}', 'r') as file:
drs_data = json.load(file)
with open('~{processed_drs_responses}', 'r') as f:
drs_data = json.load(f)

def construct_drs_name_for_index(filename, format):
return filename + '.' + format.lower()
def construct_index_basename(filename, fmt):
return f'{filename}.{fmt.lower()}'

# Update the original JSON document with DRS URIs for both files and their indices
for experiment in data.get('experiments', []):
for result in experiment.get('experiment_results', []):
# Update primary file URL
for drs_response in drs_data:
if result['filename'] == drs_response['name']:
result['url'] = drs_response['self_uri']
filename = result.get('filename', '')

if 'indices' not in result or not isinstance(result['indices'], list):
result['indices'] = []

# Primary file: match the basename of the DRS object name against this result's filename
for drs_resp in drs_data:
if os.path.basename(drs_resp.get('name', '')) == filename:
result['url'] = drs_resp.get('self_uri')
break
# Update indices URLs if present
for index in result.get('indices', []):
expected_drs_name = construct_drs_name_for_index(result['filename'], index['format'])
for drs_response in drs_data:
if expected_drs_name == drs_response['name']:
index['url'] = drs_response['self_uri']

# Derive the expected index format from the alignment file extension
new_index_format = None
if filename.endswith('.bam'):
new_index_format = 'BAI'
elif filename.endswith('.cram'):
new_index_format = 'CRAI'

if new_index_format:
index_basename = construct_index_basename(filename, new_index_format)

# Index file: update the matching indices entry for this format, or append a new one
for drs_resp in drs_data:
if os.path.basename(drs_resp.get('name', '')) == index_basename:
existing_idx = None
for idx in result['indices']:
if idx.get('format') == new_index_format:
existing_idx = idx
break

if existing_idx:
existing_idx['url'] = drs_resp.get('self_uri')
else:
result['indices'].append({
'format': new_index_format,
'url': drs_resp.get('self_uri')
})
break

with open('final_updated_json.json', 'w') as file:
json.dump(data, file, indent=4)
"
with open('final_updated_json.json', 'w') as f:
json.dump(data, f, indent=4)
"
>>>
output {
File final_updated_json = "final_updated_json.json"