Skip to content

Commit 0dcd7dc

Browse files
authored
Merge pull request #325 from broadinstitute/staging
Staging -> Master
2 parents 59e19e5 + 84a028b commit 0dcd7dc

159 files changed

Lines changed: 11418 additions & 9410 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.circleci/config.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,15 @@ jobs:
122122
./tests/skylab/trigger_test.sh ATAC
123123
no_output_timeout: 3.0h
124124

125+
test_smartseq2_single_nucleus:
126+
machine: true
127+
steps:
128+
- checkout
129+
- run:
130+
command: |
131+
./tests/skylab/trigger_test.sh smartseq2_single_nucleus
132+
no_output_timeout: 3.0h
133+
125134
workflows:
126135
version: 2
127136
test_all:
@@ -132,6 +141,7 @@ workflows:
132141
- test_optimus_mouse
133142
- test_smartseq2
134143
- test_smartseq2_single_end
144+
- test_smartseq2_single_nucleus
135145
# - test_npz2rds
136146
- test_sc_atac
137147
- test_atac

.dockstore.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,12 @@ workflows:
88
primaryDescriptorPath: /pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.wdl
99
- name: Smartseq2_Single_Sample
1010
subclass: WDL
11-
primaryDescriptorPath: /pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.wdl
11+
- name: Smartseq2_Single_Nucleus_Multisample
12+
subclass: WDL
13+
primaryDescriptorPath: /pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl
14+
- name: Smartseq2_Single_Nucleus
15+
subclass: WDL
16+
primaryDescriptorPath: /pipelines/skylab/smartseq2_single_nucleus/SmartSeq2SingleNucleus.wdl
1217
- name: IlluminaGenotypingArray
1318
subclass: WDL
1419
primaryDescriptorPath: /pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl

.github/workflows/doc_publish.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,16 @@ jobs:
2424
key: ${{ runner.os }}-modules-${{ env.cache-name }}-${{ hashFiles('**/yarn.lock') }}
2525

2626
- name: Setup NodeJS
27-
uses: actions/setup-node@v2-beta
27+
uses: actions/checkout@v2
2828
with:
29-
node-version: 12.x
29+
node-version: '14'
3030

3131
- name: Install and Build
32-
run: yarn --cwd=website/docs install && yarn --cwd=website/docs build
32+
run: yarn --cwd=website install && yarn --cwd=website build
3333

3434
- name: Deploy
3535
uses: JamesIves/github-pages-deploy-action@releases/v3
3636
with:
3737
BRANCH: gh-pages
3838
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
39-
FOLDER: website/docs/.vuepress/dist
39+
FOLDER: website/build

.github/workflows/doc_test.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ jobs:
2323
key: ${{ runner.os }}-modules-${{ env.cache-name }}-${{ hashFiles('**/yarn.lock') }}
2424

2525
- name: Setup NodeJS
26-
uses: actions/setup-node@v2-beta
26+
uses: actions/checkout@v2
2727
with:
28-
node-version: 12.x
28+
node-version: '14'
2929

3030
- name: Install and Build
31-
run: yarn --cwd=website/docs install && yarn --cwd=website/docs build
31+
run: yarn --cwd=website install && yarn --cwd=website build

.pullapprove.yml

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
---
22
version: 3
33

4-
# DO NOT EDIT THIS FILE DIRECTLY!!!
4+
# DO NOT EDIT THIS FILE DIRECTLY!!!
55
# To edit the pullapprove.yml file, edit the pullapprove_template.yml file or the scripts/process.sh script instead.
66
# Top level wdls for each pipeline are defined in the scripts/process.sh script.
77
# All other content in this file can be found in pullapprove_template.yml.
@@ -64,6 +64,57 @@ groups:
6464
users:
6565
- gbggrant # George Grant
6666

67+
scientific_owners_gdc_pipeline:
68+
conditions:
69+
- "'eng-only' not in labels"
70+
- "base.ref != 'master'"
71+
- "base.ref != 'RC'"
72+
- >
73+
'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/GDCWholeGenomeSomaticSingleSample.wdl' in files or
74+
'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.wdl' in files or
75+
'tasks/broad/CheckContaminationSomatic.wdl' in files or
76+
'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/GDC.png' in files or
77+
'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/GDCWholeGenomeSomaticSingleSample.changelog.md' in files or
78+
'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/GDCWholeGenomeSomaticSingleSample.options.json' in files or
79+
'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/GDCWholeGenomeSomaticSingleSample.wdl' in files or
80+
'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/README.md' in files or
81+
'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/biobambam2' in files or
82+
'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/bwa' in files or
83+
'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/input_files' in files or
84+
'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/samtools_nio' in files or
85+
'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/test_inputs' in files
86+
87+
reviews:
88+
required: 1
89+
author_value: 1
90+
request: 1
91+
request_order: given
92+
reviewers:
93+
users:
94+
- chipstewart # Chip Stewart
95+
96+
scientific_owners_cram_to_unmapped_bam:
97+
conditions:
98+
- "'eng-only' not in labels"
99+
- "base.ref != 'master'"
100+
- "base.ref != 'RC'"
101+
- >
102+
'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.wdl' in files or
103+
'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.changelog.md' in files or
104+
'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.options.json' in files or
105+
'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.wdl' in files or
106+
'pipelines/broad/reprocessing/cram_to_unmapped_bams/input_files' in files or
107+
'pipelines/broad/reprocessing/cram_to_unmapped_bams/test_inputs' in files
108+
109+
reviews:
110+
required: 1
111+
author_value: 1
112+
request: 1
113+
request_order: given
114+
reviewers:
115+
users:
116+
- kachulis # Chris Kachulis
117+
67118
scientific_owners_germline_single_sample:
68119
conditions:
69120
- "'eng-only' not in labels"
@@ -72,6 +123,7 @@ groups:
72123
- >
73124
'pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl' in files or
74125
'pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl' in files or
126+
'pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl' in files or
75127
'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.wdl' in files or
76128
'pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl' in files or
77129
'pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl' in files or
@@ -87,7 +139,6 @@ groups:
87139
'tasks/broad/SplitLargeReadGroup.wdl' in files or
88140
'tasks/broad/UnmappedBamToAlignedBam.wdl' in files or
89141
'tasks/broad/Utilities.wdl' in files or
90-
'tasks/broad/VariantCalling.wdl' in files or
91142
'verification/VerifyGermlineSingleSample.wdl' in files or
92143
'verification/VerifyMetrics.wdl' in files or
93144
'verification/VerifyReprocessing.wdl' in files or
@@ -106,7 +157,7 @@ groups:
106157
reviewers:
107158
users:
108159
- ldgauthier # Laura Gauthier
109-
- yfarjoun # Yossi Farjoun
160+
- kachulis # Chris Kachulis
110161

111162
scientific_owners_joint_genotyping:
112163
conditions:
@@ -177,6 +228,7 @@ groups:
177228
- "base.ref != 'RC'"
178229
- >
179230
'pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl' in files or
231+
'pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl' in files or
180232
'structs/dna_seq/DNASeqStructs.wdl' in files or
181233
'tasks/broad/AggregatedBamQC.wdl' in files or
182234
'tasks/broad/Alignment.wdl' in files or
@@ -186,8 +238,7 @@ groups:
186238
'tasks/broad/Qc.wdl' in files or
187239
'tasks/broad/SplitLargeReadGroup.wdl' in files or
188240
'tasks/broad/UnmappedBamToAlignedBam.wdl' in files or
189-
'tasks/broad/Utilities.wdl' in files or
190-
'tasks/broad/VariantCalling.wdl' in files
241+
'tasks/broad/Utilities.wdl' in files
191242
192243
reviews:
193244
required: 0
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
FROM python:3.6.2
2+
3+
LABEL maintainer="Lantern Team <lantern@broadinstitute.org>" \
4+
software="subread package" \
5+
version="2.0.1" \
6+
description="RNA-seq high-performance read alignment, quantification and mutation discovery" \
7+
website="http://subread.sourceforge.net/"
8+
9+
# Install compiler
10+
RUN apt-get update --fix-missing && apt-get install -y wget
11+
12+
COPY requirements.txt .
13+
RUN pip3 install -r requirements.txt
14+
15+
# Install subread
16+
WORKDIR /usr/local/
17+
ENV VERSION="2.0.1"
18+
RUN wget "https://downloads.sourceforge.net/project/subread/subread-${VERSION}/subread-${VERSION}-source.tar.gz" \
19+
&& tar -xzvf subread-${VERSION}-source.tar.gz
20+
WORKDIR /usr/local/subread-${VERSION}-source/src
21+
RUN make -f Makefile.Linux
22+
ENV PATH /usr/local/subread-${VERSION}-source/bin/:$PATH
23+
# Cleanup
24+
RUN apt-get clean
25+
26+
# copy the script that removes alignments spanning intron-exon junctions
27+
RUN mkdir /tools
28+
WORKDIR /tools
29+
COPY remove-reads-on-junctions.py .
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import gzip
4+
import re
5+
import sys
6+
import pysam
7+
from bisect import bisect_left, bisect_right
8+
9+
10+
def get_feature(line, feature):
11+
features = re.sub('\"', '', line.strip().split('\t')[8].strip())
12+
features_dic = {x.split()[0]:x.split()[1] for x in features.split(';') if x}
13+
14+
if feature in features_dic:
15+
return features_dic[feature]
16+
return None
17+
18+
19+
def main():
20+
""" This script subselects alignments that either crosses an intron-exon junction or
21+
the ones that are entirely contained in exons.
22+
"""
23+
parser = argparse.ArgumentParser()
24+
parser.add_argument(
25+
"--input-gtf", "-g", dest="input_gtf", required=True, help="input GTF"
26+
)
27+
parser.add_argument(
28+
"--input-bam", "-i", dest="input_bam", required=True, help="input BAM"
29+
)
30+
parser.add_argument(
31+
"--output-bam", "-o", dest="output_bam", required=True, help="output BAM without intron-exon junctions"
32+
)
33+
args = parser.parse_args()
34+
35+
intron_cands = {}
36+
37+
exons = {}
38+
exon_ids = {}
39+
gene_locs = {}
40+
gene_locations = {}
41+
# gather the location of each genes and exons; and exon_ids to avoid duplication
42+
with gzip.open(args.input_gtf, "rt") if args.input_gtf.endswith(".gz"
43+
) else open(args.input_gtf, "r") as input_file:
44+
for line in input_file:
45+
if not line.startswith("#"):
46+
fields = [x.strip() for x in line.strip().split('\t')]
47+
if fields[2] == 'exon':
48+
gene_id = get_feature(line.strip(), 'gene_id')
49+
exon_id = get_feature(line.strip(), 'exon_id')
50+
contig_id = fields[0]
51+
locpair = (int(fields[3]), int(fields[4]))
52+
if contig_id not in exons:
53+
exons[contig_id] = []
54+
if exon_id not in exon_ids:
55+
exons[contig_id].append(locpair)
56+
exon_ids[exon_id] = True
57+
elif fields[2] == 'gene':
58+
gene_id = get_feature(line.strip(), 'gene_id')
59+
contig_id = fields[0]
60+
locpair = (int(fields[3]), int(fields[4]), gene_id)
61+
if gene_id != None:
62+
if contig_id not in gene_locs:
63+
gene_locs[contig_id] = []
64+
gene_locs[contig_id].append(locpair)
65+
66+
gene_locations[gene_id] = locpair
67+
68+
69+
# sorted the gene locs by start
70+
for contig_id in gene_locs:
71+
gene_locs[contig_id].sort(key = lambda x: x[0], reverse=False)
72+
73+
# keep sort the exons by start by contig
74+
for contig_id in exons:
75+
exons[contig_id].sort(key = lambda x: x[0], reverse=False)
76+
77+
# compute the intron candidates for each contig
78+
# where any bp that is not an exon is an candidate intron whithout
79+
# worrying about the inclusiveness of that base pair within the range
80+
# of a gene
81+
for contig_id in exons:
82+
intron_cands[contig_id] = []
83+
last_exon_end = 0
84+
for exon_coor in exons[contig_id]:
85+
# add all coordinate pair that is to the right of the last exon_end
86+
if exon_coor[0] > last_exon_end:
87+
pair = (last_exon_end, exon_coor[0])
88+
intron_cands[contig_id].append(pair)
89+
90+
# select the right most one
91+
last_exon_end = max(last_exon_end, exon_coor[1])
92+
93+
#add the remaining last
94+
pair = (last_exon_end, 30000000000)
95+
intron_cands[contig_id].append(pair)
96+
97+
# Given a list of intervals that are potentially intronic regions, the following block finds intronic regions for each gene.
98+
# For each chromosome (contig_id), for each gene_id within the chromosome, find the regions that exclude any exon intervals.
99+
# The potential intron intervals start and end points are in a global ordered (ascending) array
100+
# The odd indices are start points and the even indices are end points. If an interval crosses the gene start or end, it gets restricted to the gene body.
101+
102+
introns = {}
103+
for contig_id in gene_locs:
104+
introns[contig_id] = []
105+
intronic_points = []
106+
for coor in intron_cands[contig_id]:
107+
intronic_points.append(coor[0])
108+
intronic_points.append(coor[1])
109+
110+
for gene_loc in gene_locs[contig_id]:
111+
i = bisect_right(intronic_points, gene_loc[0], 0, len(intronic_points))
112+
j = bisect_left(intronic_points, gene_loc[1], 0, len(intronic_points))
113+
114+
if i%2 == 1: # it is a start location on i
115+
intron_start = gene_loc[0]
116+
intron_end = intronic_points[i]
117+
118+
for k in range(i, j, 2):
119+
introns[contig_id].append(intronic_points[k])
120+
introns[contig_id].append(intronic_points[k+1])
121+
122+
if j%2 == 1:
123+
intron_start = intronic_points[j]
124+
intron_end = gene_loc[1]
125+
introns[contig_id].append(intron_start)
126+
introns[contig_id].append(intron_end)
127+
128+
# all the introns organize by genes
129+
with pysam.AlignmentFile(args.input_bam, "rb", check_sq=False) as input_alignments:
130+
with pysam.AlignmentFile(args.output_bam, "wb", template=input_alignments) as outbam:
131+
for a in input_alignments:
132+
if a.reference_name in introns:
133+
i = bisect_left(introns[a.reference_name], a.reference_start)
134+
j = bisect_left(introns[a.reference_name], a.reference_end)
135+
# If a read crosses only one junction, it is counted towards the introns otherwise, it is counted towards the exons.
136+
# The reads could be from a premature mRNA inside the nucleus or it could be from a splices mRNA. If it is splices, the read could align to the junction crossing from one exon to another.
137+
# Since we align reads to the entire genome (introns included) these reads have a gap in them that crosses two or more junction points.
138+
if j-i!= 1:
139+
outbam.write(a)
140+
141+
142+
if __name__ == "__main__":
143+
main()
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Cython==0.24.1
2+
pysam==0.16.0.1
3+
pytest-cov==2.10.1
4+
pytest==5.1.1
5+
black==19.3b0
6+
flake8==3.7.7

dockers/skylab/loom-output/Dockerfile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@ LABEL maintainer="Lantern Team <lantern@broadinstitute.org>"
44

55
RUN pip install --upgrade pip
66

7+
RUN apt-get update && apt-get install wget
8+
9+
RUN python -m pip install git+https://github.com/HumanCellAtlas/sctools.git#egg=sctools
10+
711
COPY requirements.txt .
8-
RUN pip3 install numpy==1.17.0
9-
RUN pip3 install cython==0.29.15
1012
RUN pip3 install -r requirements.txt
1113

1214
RUN mkdir /tools
@@ -16,3 +18,5 @@ COPY create_loom_optimus.py .
1618
COPY create_loom_ss2.py .
1719
COPY loomCompare.py .
1820
COPY ss2_loom_merge.py .
21+
COPY create_snss2_counts_csv.py .
22+
COPY create_loom_snss2.py .

0 commit comments

Comments
 (0)