broadinstitute
diff --git a/‎.circleci/config.yml‎
Lines changed: 10 additions & 0 deletions b/‎.circleci/config.yml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎.dockstore.yml‎
Lines changed: 6 additions & 1 deletion b/‎.dockstore.yml‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎.github/workflows/doc_publish.yml‎
Lines changed: 4 additions & 4 deletions b/‎.github/workflows/doc_publish.yml‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎.github/workflows/doc_test.yml‎
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/doc_test.yml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.pullapprove.yml‎
Lines changed: 56 additions & 5 deletions b/‎.pullapprove.yml‎
Lines changed: 56 additions & 5 deletions
diff --git a/‎dockers/skylab/featureCounts/Dockerfile‎
Lines changed: 29 additions & 0 deletions b/‎dockers/skylab/featureCounts/Dockerfile‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎dockers/skylab/featureCounts/remove-reads-on-junctions.py‎
Lines changed: 143 additions & 0 deletions b/‎dockers/skylab/featureCounts/remove-reads-on-junctions.py‎
Lines changed: 143 additions & 0 deletions
diff --git a/‎dockers/skylab/featureCounts/requirements.txt‎
Lines changed: 6 additions & 0 deletions b/‎dockers/skylab/featureCounts/requirements.txt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎dockers/skylab/loom-output/Dockerfile‎
Lines changed: 6 additions & 2 deletions b/‎dockers/skylab/loom-output/Dockerfile‎
Lines changed: 6 additions & 2 deletions
@@ -122,6 +122,15 @@ jobs:
                       ./tests/skylab/trigger_test.sh ATAC
                   no_output_timeout: 3.0h
 
+    test_smartseq2_single_nucleus:
+      machine: true
+      steps:
+        - checkout
+        - run:
+            command: |
+              ./tests/skylab/trigger_test.sh smartseq2_single_nucleus
+            no_output_timeout: 3.0h
+
 workflows:
     version: 2
     test_all:
@@ -132,6 +141,7 @@ workflows:
             - test_optimus_mouse
             - test_smartseq2
             - test_smartseq2_single_end
+            - test_smartseq2_single_nucleus
             #         - test_npz2rds
             - test_sc_atac
             - test_atac
 
@@ -8,7 +8,13 @@ workflows:
     primaryDescriptorPath: /pipelines/skylab/smartseq2_multisample/MultiSampleSmartSeq2.wdl
   - name: Smartseq2_Single_Sample
     subclass: WDL
     primaryDescriptorPath: /pipelines/skylab/smartseq2_single_sample/SmartSeq2SingleSample.wdl
+  - name: Smartseq2_Single_Nucleus_Multisample
+    subclass: WDL
+    primaryDescriptorPath: /pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl
+  - name: Smartseq2_Single_Nucleus
+    subclass: WDL
+    primaryDescriptorPath: /pipelines/skylab/smartseq2_single_nucleus/SmartSeq2SingleNucleus.wdl
   - name: IlluminaGenotypingArray
     subclass: WDL
     primaryDescriptorPath: /pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl
 
@@ -24,16 +24,16 @@ jobs:
           key: ${{ runner.os }}-modules-${{ env.cache-name }}-${{ hashFiles('**/yarn.lock') }}
 
       - name: Setup NodeJS
-        uses: actions/setup-node@v2-beta
+        uses: actions/setup-node@v2
         with:
-          node-version: 12.x
+          node-version: '14'
 
       - name: Install and Build
-        run: yarn --cwd=website/docs install && yarn --cwd=website/docs build
+        run: yarn --cwd=website install && yarn --cwd=website build
 
       - name: Deploy
         uses: JamesIves/github-pages-deploy-action@releases/v3
         with:
           BRANCH: gh-pages
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          FOLDER: website/docs/.vuepress/dist
+          FOLDER: website/build
@@ -23,9 +23,9 @@ jobs:
           key: ${{ runner.os }}-modules-${{ env.cache-name }}-${{ hashFiles('**/yarn.lock') }}
 
       - name: Setup NodeJS
-        uses: actions/setup-node@v2-beta
+        uses: actions/setup-node@v2
         with:
-          node-version: 12.x
+          node-version: '14'
 
       - name: Install and Build
-        run: yarn --cwd=website/docs install && yarn --cwd=website/docs build
+        run: yarn --cwd=website install && yarn --cwd=website build
@@ -1,7 +1,7 @@
 ---
 version: 3
 
-# DO NOT EDIT THIS FILE DIRECTLY!!!
+  # DO NOT EDIT THIS FILE DIRECTLY!!!
 # To edit the pullapprove.yml file, edit the pullapprove_template.yml file or the scripts/process.sh script instead.
 # Top level wdls for each pipeline are defined in the scripts/process.sh script.
 # All other content in this file can be found in pullapprove_template.yml.
@@ -64,6 +64,57 @@ groups:
       users:
         - gbggrant # George Grant
 
+  scientific_owners_gdc_pipeline:
+    conditions:
+      - "'eng-only' not in labels"
+      - "base.ref != 'master'"
+      - "base.ref != 'RC'"
+      - >
+        'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/GDCWholeGenomeSomaticSingleSample.wdl' in files or
+        'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.wdl' in files or
+        'tasks/broad/CheckContaminationSomatic.wdl' in files or
+        'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/GDC.png' in files or
+        'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/GDCWholeGenomeSomaticSingleSample.changelog.md' in files or
+        'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/GDCWholeGenomeSomaticSingleSample.options.json' in files or
+        'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/GDCWholeGenomeSomaticSingleSample.wdl' in files or
+        'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/README.md' in files or
+        'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/biobambam2' in files or
+        'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/bwa' in files or
+        'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/input_files' in files or
+        'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/samtools_nio' in files or
+        'pipelines/broad/dna_seq/somatic/single_sample/wgs/gdc_genome/test_inputs' in files
+        
+    reviews:
+      required: 1
+      author_value: 1
+      request: 1
+      request_order: given
+    reviewers:
+      users:
+        - chipstewart # Chip Stewart
+
+  scientific_owners_cram_to_unmapped_bam:
+    conditions:
+      - "'eng-only' not in labels"
+      - "base.ref != 'master'"
+      - "base.ref != 'RC'"
+      - >
+        'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.wdl' in files or
+        'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.changelog.md' in files or
+        'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.options.json' in files or
+        'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.wdl' in files or
+        'pipelines/broad/reprocessing/cram_to_unmapped_bams/input_files' in files or
+        'pipelines/broad/reprocessing/cram_to_unmapped_bams/test_inputs' in files
+        
+    reviews:
+      required: 1
+      author_value: 1
+      request: 1
+      request_order: given
+    reviewers:
+      users:
+        - kachulis # Chris Kachulis
+
   scientific_owners_germline_single_sample:
     conditions:
       - "'eng-only' not in labels"
@@ -72,6 +123,7 @@ groups:
       - >
         'pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl' in files or
         'pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl' in files or
+        'pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl' in files or
         'pipelines/broad/reprocessing/cram_to_unmapped_bams/CramToUnmappedBams.wdl' in files or
         'pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl' in files or
         'pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl' in files or
@@ -87,7 +139,6 @@ groups:
         'tasks/broad/SplitLargeReadGroup.wdl' in files or
         'tasks/broad/UnmappedBamToAlignedBam.wdl' in files or
         'tasks/broad/Utilities.wdl' in files or
-        'tasks/broad/VariantCalling.wdl' in files or
         'verification/VerifyGermlineSingleSample.wdl' in files or
         'verification/VerifyMetrics.wdl' in files or
         'verification/VerifyReprocessing.wdl' in files or
@@ -106,7 +157,7 @@ groups:
     reviewers:
       users:
         - ldgauthier # Laura Gauthier
-        - yfarjoun # Yossi Farjoun
+        - kachulis # Chris Kachulis
 
   scientific_owners_joint_genotyping:
     conditions:
@@ -177,6 +228,7 @@ groups:
       - "base.ref != 'RC'"
       - >
         'pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl' in files or
+        'pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl' in files or
         'structs/dna_seq/DNASeqStructs.wdl' in files or
         'tasks/broad/AggregatedBamQC.wdl' in files or
         'tasks/broad/Alignment.wdl' in files or
@@ -186,8 +238,7 @@ groups:
         'tasks/broad/Qc.wdl' in files or
         'tasks/broad/SplitLargeReadGroup.wdl' in files or
         'tasks/broad/UnmappedBamToAlignedBam.wdl' in files or
-        'tasks/broad/Utilities.wdl' in files or
-        'tasks/broad/VariantCalling.wdl' in files
+        'tasks/broad/Utilities.wdl' in files
         
     reviews:
       required: 0
 
@@ -0,0 +1,29 @@
+FROM python:3.6.2
+
+LABEL maintainer="Lantern Team <lantern@broadinstitute.org>" \
+      software="subread package" \
+      version="2.0.1" \
+      description="RNA-seq high-performance read alignment, quantification and mutation discovery" \
+      website="http://subread.sourceforge.net/"
+
+# Install wget (used below to download the subread sources); clean the apt caches in the same layer so they never reach the image
+RUN apt-get update --fix-missing && apt-get install -y wget && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Download and unpack the subread sources, deleting the tarball in the same layer
+WORKDIR /usr/local/
+ENV VERSION="2.0.1"
+RUN wget "https://downloads.sourceforge.net/project/subread/subread-${VERSION}/subread-${VERSION}-source.tar.gz" \
+     && tar -xzvf subread-${VERSION}-source.tar.gz \
+     && rm subread-${VERSION}-source.tar.gz
+# Build subread from source and put the resulting binaries on PATH
+WORKDIR /usr/local/subread-${VERSION}-source/src
+RUN make -f Makefile.Linux
+ENV PATH="/usr/local/subread-${VERSION}-source/bin/:${PATH}"
+
+# copy the script that removes alignments spanning intron-exon junctions
+RUN mkdir /tools
+WORKDIR /tools
+COPY remove-reads-on-junctions.py .
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+import argparse
+import gzip
+import re
+import sys
+import pysam
+from bisect import bisect_left, bisect_right
+
+
+def get_feature(line, feature):
+    features = re.sub('\"', '', line.strip().split('\t')[8].strip())
+    features_dic = {x.split()[0]:x.split()[1] for x in features.split(';') if x} 
+
+    if feature in features_dic:
+       return features_dic[feature]
+    return None
+    
+
+def main():
+    """Copy alignments from the input BAM to the output BAM, skipping alignments that span exactly one of the
+       intron boundary points derived from the GTF and alignments on contigs without a 'gene' record carrying a gene_id.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+      "--input-gtf", "-g", dest="input_gtf",  required=True, help="input GTF"
+    )
+    parser.add_argument(
+      "--input-bam", "-i", dest="input_bam",  required=True, help="input BAM"
+    )
+    parser.add_argument(
+      "--output-bam", "-o", dest="output_bam",  required=True, help="output BAM without intron-exon junctions"
+    )
+    args = parser.parse_args()
+
+    intron_cands = {}
+
+    exons = {}
+    exon_ids = {}
+    gene_locs = {}
+    gene_locations = {}
+    # gather the locations of genes and exons per contig; exon_ids records the exon IDs already seen to avoid duplicates
+    with gzip.open(args.input_gtf, "rt") if args.input_gtf.endswith(".gz"
+    ) else open(args.input_gtf, "r") as input_file:
+        for line in input_file:
+            if not line.startswith("#"):
+               fields = [x.strip() for x in line.strip().split('\t')]
+               if fields[2] == 'exon':
+                  gene_id = get_feature(line.strip(), 'gene_id')
+                  exon_id = get_feature(line.strip(), 'exon_id')
+                  contig_id = fields[0] 
+                  locpair = (int(fields[3]), int(fields[4]))
+                  if contig_id not in exons:
+                     exons[contig_id] = []
+                  if exon_id not in exon_ids:
+                      exons[contig_id].append(locpair)
+                      exon_ids[exon_id] = True
+               elif fields[2] == 'gene':
+                  gene_id = get_feature(line.strip(), 'gene_id')
+                  contig_id = fields[0] 
+                  locpair = (int(fields[3]), int(fields[4]), gene_id)
+                  if gene_id != None:    
+                     if contig_id not in gene_locs:
+                         gene_locs[contig_id] = []
+                     gene_locs[contig_id].append(locpair)
+
+                     gene_locations[gene_id] = locpair
+
+    
+    # sort the gene locations of each contig by start
+    for contig_id in gene_locs:
+        gene_locs[contig_id].sort(key = lambda x: x[0], reverse=False)
+
+    # sort the exons of each contig by start
+    for contig_id in exons:
+        exons[contig_id].sort(key = lambda x: x[0], reverse=False)
+
+    # compute the intron candidates for each contig,
+    # where any bp that is not in an exon is a candidate intron, without
+    # worrying about whether that base pair lies within the range
+    # of a gene
+    for contig_id in exons:
+        intron_cands[contig_id] = []
+        last_exon_end = 0
+        for exon_coor in exons[contig_id]:
+            # add the gap between the right-most exon end seen so far and the start of this exon
+            if exon_coor[0] > last_exon_end:
+               pair = (last_exon_end, exon_coor[0])
+               intron_cands[contig_id].append(pair)
+
+            # keep the right-most exon end seen so far
+            last_exon_end = max(last_exon_end, exon_coor[1])
+
+        # add the remaining region after the last exon; 30000000000 is presumably a sentinel beyond any contig length
+        pair = (last_exon_end, 30000000000)
+        intron_cands[contig_id].append(pair)
+
+    # Given the candidate intron intervals, the following block collects, for every gene on every contig (contig_id), the interval boundaries that fall inside the gene body.
+    # intronic_points flattens the candidates of a contig into one ascending list: (0-based) even indices hold interval starts and odd indices hold interval ends.
+    # The collected points are appended to the flat list introns[contig_id], which the BAM filter below searches with bisect.
+    # NOTE(review): intron_cands only has entries for contigs with exon records, so a contig with 'gene' but no 'exon' rows raises KeyError below -- confirm inputs.
+
+    introns = {}
+    for contig_id in gene_locs:
+        introns[contig_id] = []
+        intronic_points = []
+        for coor in intron_cands[contig_id]:
+            intronic_points.append(coor[0])
+            intronic_points.append(coor[1])
+
+        for gene_loc in gene_locs[contig_id]:
+           i =  bisect_right(intronic_points, gene_loc[0], 0, len(intronic_points))
+           j =  bisect_left(intronic_points, gene_loc[1], 0, len(intronic_points))
+
+           if i%2 == 1: # gene start falls inside a candidate interval; intronic_points[i] is that interval's end
+              intron_start = gene_loc[0]
+              intron_end = intronic_points[i]
+           # NOTE(review): intron_start/intron_end set in the branch above are never appended -- confirm whether the partial intron at the gene start was meant to be recorded.
+           for k in range(i, j, 2):
+              introns[contig_id].append(intronic_points[k])
+              introns[contig_id].append(intronic_points[k+1])
+
+           if j%2 == 1:
+              intron_start = intronic_points[j]
+              intron_end = gene_loc[1]
+              introns[contig_id].append(intron_start) 
+              introns[contig_id].append(intron_end) 
+
+    # Filter the BAM against the collected points. NOTE(review): bisect assumes introns[contig] is sorted, but points are appended gene by gene -- confirm for overlapping genes.
+    with pysam.AlignmentFile(args.input_bam, "rb", check_sq=False) as input_alignments:
+        with pysam.AlignmentFile(args.output_bam, "wb", template=input_alignments) as outbam:
+            for a in input_alignments:
+                if a.reference_name in introns:
+                    i = bisect_left(introns[a.reference_name], a.reference_start) 
+                    j = bisect_left(introns[a.reference_name], a.reference_end)
+                    # If a read crosses only one junction, it is counted towards the introns (and not written); otherwise, it is counted towards the exons and kept.
+                    # The read could come from a premature mRNA inside the nucleus or from a spliced mRNA. If it is spliced, the read could align across the junction from one exon to another.
+                    # Since we align reads to the entire genome (introns included), such reads have a gap in them that crosses two or more junction points.
+                    if j-i!= 1: 
+                       outbam.write(a)
+              
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,6 @@
+Cython==0.24.1
+pysam==0.16.0.1
+pytest-cov==2.10.1
+pytest==5.1.1
+black==19.3b0
+flake8==3.7.7
@@ -4,9 +4,11 @@ LABEL maintainer="Lantern Team <lantern@broadinstitute.org>"
 
 RUN pip install --upgrade pip
 
+RUN apt-get update && apt-get install -y wget
+
+RUN python -m pip install git+https://github.com/HumanCellAtlas/sctools.git#egg=sctools
+
 COPY requirements.txt .
-RUN pip3 install numpy==1.17.0
-RUN pip3 install cython==0.29.15
 RUN pip3 install -r requirements.txt
 
 RUN mkdir /tools
@@ -16,3 +18,5 @@ COPY create_loom_optimus.py .
 COPY create_loom_ss2.py .
 COPY loomCompare.py .
 COPY ss2_loom_merge.py .
+COPY create_snss2_counts_csv.py .
+COPY create_loom_snss2.py .