Merge pull request #13 from OpenOmics/dev

chenv3 · web-flow · commit c9a8304d8f3f · 2025-09-24T17:59:54.000-04:00
Adding in a check to the Seurat CITE QC to handle situations where no ADT or HTO features are detected in data even when assays are present.
Adding in support for 2024 reference for ATAC and Multiome pipelines
Add check of libraries.csv file to see if samples have more than one modality associated
Update documentation
Add code to generate some QC plots for RPL and RPS genes for future update
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-3.0.4
+3.0.5
diff --git a/cell-seek b/cell-seek
@@ -368,7 +368,7 @@ def parsed_arguments(name, description):
                                 genome of the samples. {2} does comes bundled with
                                 prebuilt reference files for human and mouse samples.
                                 The newest 10X reference (2024-A) is available for
-                                GEX, CITE, and Multi analysis (hg2024, mm2024).
+                                GEX, CITE, ATAC, Multi, and Multiome analysis (hg2024, mm2024).
                                 A custom reference genome can also be provided. For
                                 prebuilt references, please select one of the following
                                 options: hg38, mm10, hg2024, mm2024.
diff --git a/config/genome.json b/config/genome.json
@@ -17,12 +17,16 @@
 	"hg2024": {
 	    "gex_transcriptome": "/data/OpenOmics/references/cell-seek/human/refdata-gex-GRCh38-2024-A",
 	    "cite_transcriptome": "/data/OpenOmics/references/cell-seek/human/refdata-gex-GRCh38-2024-A",
-	    "vdj_ref": "/data/OpenOmics/references/cell-seek/human/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.1.0/"
+            "atac_ref": "/data/OpenOmics/references/cell-seek/human/refdata-cellranger-arc-GRCh38-2024-A",
+	    "vdj_ref": "/data/OpenOmics/references/cell-seek/human/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.1.0/",
+            "arc_ref": "/data/OpenOmics/references/cell-seek/human/refdata-cellranger-arc-GRCh38-2024-A"
 	},
         "mm2024": {
             "gex_transcriptome": "/data/OpenOmics/references/cell-seek/mouse/refdata-gex-GRCm39-2024-A",
             "cite_transcriptome": "/data/OpenOmics/references/cell-seek/mouse/refdata-gex-GRCm39-2024-A",
-	    "vdj_ref": "/data/OpenOmics/references/cell-seek/mouse/refdata-cellranger-vdj-GRCm38-alts-ensembl-7.0.0/"
+            "atac_ref": "/data/OpenOmics/references/cell-seek/mouse/refdata-cellranger-arc-GRCm39-2024-A",
+	    "vdj_ref": "/data/OpenOmics/references/cell-seek/mouse/refdata-cellranger-vdj-GRCm38-alts-ensembl-7.0.0/",
+            "arc_ref": "/data/OpenOmics/references/cell-seek/mouse/refdata-cellranger-arc-GRCm39-2024-A"
         }
     }
 }
diff --git a/docs/usage/genome.md b/docs/usage/genome.md
@@ -7,6 +7,9 @@ This part of the documentation describes options and concepts for <code>cell-see
 
 If a reference genome that does not come with the pipeline is needed, then a custom JSON file needs to be provided to run.
 
+This command does not create the 10x-compatible reference itself; that must be done separately. 10x documentation describing the process is available for [GEX](https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/inputs/cr-3p-references), [VDJ](https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/inputs/cr-5p-references), [ATAC](https://www.10xgenomics.com/support/software/cell-ranger-atac/latest/analysis/inputs/creating-a-reference-package-mkref), and [Multiome](https://www.10xgenomics.com/support/software/cell-ranger-arc/latest/analysis/inputs/mkref).
+
+
 Creating a custom reference genome file is fast and easy! In its most basic form, <code>cell-seek <b>genome</b></code> only has *one required input* with the optional arguments supplying the reference paths.
 
 ## 2. Synopsis
diff --git a/docs/usage/run.md b/docs/usage/run.md
@@ -741,7 +741,7 @@ Each of the following arguments are required. Failure to provide a required argu
 > ***Example:*** `--pipeline atac`
 
 ---  
-  `--genome {hg38, mm10, custom.json}`
+  `--genome {hg38, mm10, hg2024, mm2024, custom.json}`
 > **Reference genome.**   
 > *type: string*
 >   
@@ -841,7 +841,7 @@ Each of the following arguments are required. Failure to provide a required argu
 > ***Example:*** `--pipeline multiome`
 
 ---  
-  `--genome {hg38, mm10, custom.json}`
+  `--genome {hg38, mm10, hg2024, mm2024, custom.json}`
 > **Reference genome.**   
 > *type: string*
 >   
diff --git a/src/run.py b/src/run.py
@@ -810,6 +810,7 @@ def finalcheck(config, flag, delimeter=','):
 
     # Dictionary holding unique contents from files to use for comparisons
     contents = {}
+    name_type = {}
     with open(filename) as fh:
         try:
             header = next(fh).strip().split(delimeter)
@@ -827,6 +828,9 @@ def finalcheck(config, flag, delimeter=','):
                 values = contents.get(i, set())
                 values.add(linelist[indices[i]])
                 contents[i] = values
+            value = name_type.get(linelist[indices['name']], set())
+            value.add(linelist[indices['type']])
+            name_type[linelist[indices['name']]] = value
 
     # Compiles the sample names and fastq paths from the input (config)
     samples  = set([re.sub("_S[0-9]+_L00[0-9]", "", i) for i in config['samples']])
@@ -866,6 +870,12 @@ def finalcheck(config, flag, delimeter=','):
             └── Please note that the followed listed FASTQ names are not found in the input files: {{}} '.format(flag, filename, ','.join(missing_file))
                 )
 
+    names = [name for name in name_type if (len(name_type[name]) <= 1)]
+    if len(names) > 0:
+        print(f"\nWarning: Some samples only have one feature type associated with them! \nWarning: --{{}} {{}} only contains one feature type for some of the samples.\n \
+            └── Please note that only one feature type was provided for the following sample(s): {{}} \n \
+            If this is correct, these samples do not need to be run using cellranger multi.".format(flag, filename, ','.join(names)))
+
 
 def check_conditional_parameters(config):
     """Check the compiled config dictionary to ensure
@@ -909,7 +919,7 @@ def check_conditional_parameters(config):
 
     #Check reference
     if config['options']['genome'] in ['hg2024', 'mm2024']:
-        if config['options']['pipeline'] in ['atac', 'multiome']:
+        if config['options']['pipeline'] in []:
             errorMessage += [
                 "Error: The {} reference is not available for the {} pipeline\n \
                 └── Please use the --genome flag to select one of the available references: {}".format(
diff --git a/workflow/rules/gex.smk b/workflow/rules/gex.smk
@@ -173,7 +173,6 @@ rule count:
         log ="run_{sample}_10x_cellranger_count.log"
     params:
         rname = "count",
-        batch = "-l nodes=1:ppn=16,mem=96gb",
         id = "{sample}",
         sample = sample_rename,
         transcriptome = config["references"][genome]["gex_transcriptome"],
diff --git a/workflow/scripts/seuratCiteSampleQC.R b/workflow/scripts/seuratCiteSampleQC.R
@@ -46,8 +46,10 @@ if (length(grep('^HTO[-_]', grep('hashtag', rownames(rdata$`Antibody Capture`),
   adt_assay <- CreateAssayObject(counts=rdata$`Antibody Capture`[grep('^HTO[-_]', grep('hashtag', rownames(rdata$`Antibody Capture`), value=TRUE, ignore.case=TRUE, invert=TRUE), value=TRUE, ignore.case=TRUE, invert=TRUE),])
   filtered_cite[['ADT']] <- names(which(apply(GetAssayData(adt_assay, slot='counts'), 1, max) <= adt_thresh))
   adt_names <- names(which(apply(GetAssayData(adt_assay, slot='counts'), 1, max) > adt_thresh))
-  seur[['ADT']] <- CreateAssayObject(counts=GetAssayData(adt_assay, slot='counts')[adt_names,])
-  adt = TRUE
+  if (length(adt_names) > 0) {
+    seur[['ADT']] <- CreateAssayObject(counts=GetAssayData(adt_assay, slot='counts')[adt_names,])
+    adt = TRUE
+  }
 }
 
 # Add in HTO assay if features with HTO was found
@@ -56,8 +58,10 @@ if (length(as.character(c(grep('hashtag', rownames(rdata$`Antibody Capture`), va
   hto_assay <- CreateAssayObject(counts=rdata$`Antibody Capture`[unique(as.character(c(grep('hashtag', rownames(rdata$`Antibody Capture`), value=TRUE, ignore.case=TRUE), grep('^HTO[-_]', rownames(rdata$`Antibody Capture`), value=TRUE, ignore.case=TRUE)))),])
   filtered_cite[['HTO']] <- names(which(apply(GetAssayData(hto_assay, slot='counts'), 1, max) <= adt_thresh))
   hto_names <- names(which(apply(GetAssayData(hto_assay, slot='counts'), 1, max) > adt_thresh))
-  seur[['HTO']] <- CreateAssayObject(counts=GetAssayData(hto_assay, slot='counts')[hto_names,])
-  hashtag = TRUE
+  if (length(hto_names) > 0) {
+    seur[['HTO']] <- CreateAssayObject(counts=GetAssayData(hto_assay, slot='counts')[hto_names,])
+    hashtag = TRUE
+  }
 }
 
 write.table(adt_thresh, 'CITE_threshold.txt', col.names = FALSE, row.names=FALSE)
@@ -82,6 +86,8 @@ figures <- list()
 
 ## ----Pre-Filter Gene Plot----
 seur[["percent.mito"]] <- PercentageFeatureSet(seur, pattern="^[Mm][Tt]-")
+seur[["percent.rps"]] <- PercentageFeatureSet(seur, pattern="^R[Pp][Ss]")
+seur[["percent.rpl"]] <- PercentageFeatureSet(seur, pattern="^R[Pp][Ll]")
 
 plot1 <- FeatureScatter(seur, feature1 = "nCount_RNA", feature2 = "percent.mito") + NoLegend()
 plot2 <- FeatureScatter(seur, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + NoLegend()
@@ -93,6 +99,7 @@ png("PreFilter_Gene_Plot.png", height=5, width=10, units='in', res=300)
 plot1+plot3+plot2
 dev.off()
 
+
 ## ----Cell Quality Thresholds - Default----
 thresh <- list()
 defaultThreshold <- function(seur) {
@@ -199,6 +206,13 @@ dev.off()
 figures$PreFilter_VlnPlot_RNA <- do.call("grid.arrange", c(plots, nrow=1))
 
 
+plots <- sapply(c("percent.rpl", "percent.rps"), function(x) doVlnPlot(aspect=x, seur=seur, thresh=thresh))
+
+png("PreFilter_VlnPlot_Ribo.png", height=7, width=5, units='in', res=300)
+do.call("grid.arrange", c(plots, nrow=1))
+dev.off()
+
+
 if (adt) {
 ## ----Pre-Filter ADT Violin Plot
   plots <- sapply(c("nFeature_ADT", "nCount_ADT"), function(x) doVlnPlot(aspect=x, seur=seur, thresh=thresh))
@@ -265,6 +279,13 @@ dev.off()
 figures$PostFilter_VlnPlot_RNA <- VlnPlot(seur, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3)
 
 
+plots <- sapply(c("percent.rpl", "percent.rps"), function(x) doVlnPlot(aspect=x, seur=seur, thresh=thresh))
+
+png("PostFilter_VlnPlot_Ribo.png", height=7, width=5, units='in', res=300)
+do.call("grid.arrange", c(plots, nrow=1))
+dev.off()
+
+
 ## ----Post-Filter ADT Violin Plot
 if (adt) {
   png("PostFilter_VlnPlot_ADT.png", height=7, width=5, units='in', res=300)
@@ -434,7 +455,7 @@ saveRDS(seur, 'seur_cluster.rds')
 
 # ----Matrix export----
 if ( !dir.exists(file.path(opt$workdir, "cite-seq-matrix")) ) {
-  dir.create(opt$workdir, "cite-hto-adt-matrix", showWarnings = F, recursive = T, mode = "1755")
+  dir.create(file.path(opt$workdir, "cite-seq-matrix"), showWarnings = F, recursive = T, mode = "1755")
 } 
 if ( is.element("HTO", names(seur@assays)) ) {
   hto_mat <- GetAssayData(object = seur, assay = "HTO", layer = "data")
diff --git a/workflow/scripts/seuratSampleQC.R b/workflow/scripts/seuratSampleQC.R
@@ -49,6 +49,9 @@ figures <- list()
 
 ## ----Pre-Filter Gene Plot----
 seur[["percent.mito"]] <- PercentageFeatureSet(seur, pattern="^[Mm][Tt]-")
+seur[["percent.mito"]] <- PercentageFeatureSet(seur, pattern="^[Mm][Tt]-")
+seur[["percent.rps"]] <- PercentageFeatureSet(seur, pattern="^R[Pp][Ss]")
+seur[["percent.rpl"]] <- PercentageFeatureSet(seur, pattern="^R[Pp][Ll]")
 
 plot1 <- FeatureScatter(seur, group.by='Sample', feature1 = "nCount_RNA", feature2 = "percent.mito") + NoLegend()
 plot2 <- FeatureScatter(seur, group.by='Sample', feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + NoLegend()
@@ -71,12 +74,12 @@ defaultThreshold <- function(seur) {
   thresh['nCount_RNA_low'] <- expm1(median(log1p(seur$nCount_RNA)) - 3*mad(log1p(seur$nCount_RNA))) %>% round
   thresh['nCount_RNA_high'] <- expm1(median(log1p(seur$nCount_RNA)) + 3*mad(log1p(seur$nCount_RNA))) %>% round
   thresh['percent.mito_high'] = min(expm1(median(log1p(seur$percent.mito)) + 3*mad(log1p(seur$percent.mito))) %>% round, 100)
-  
+
   cellsToRemove <- colnames(seur)[which(seur$nFeature_RNA < thresh['nFeature_RNA_low'] | seur$nFeature_RNA > thresh['nFeature_RNA_high'])]
   cellsToRemove <- union(cellsToRemove, colnames(seur)[which(seur$nCount_RNA < thresh['nCount_RNA_low'] | seur$nCount_RNA > thresh['nCount_RNA_high'])])
   cellsToRemove <- union(cellsToRemove,  colnames(seur)[which(seur$percent.mito > thresh['percent.mito_high'])])
-  
-  
+
+
   thresh['numCellsRemove'] <- length(cellsToRemove)
   thresh['pctCellsRemove'] <- length(cellsToRemove) / dim(seur)[2] * 100
   return(list(threshold=thresh, filter=cellsToRemove))
@@ -88,7 +91,7 @@ if (!is.na(opt$filterfile)){
   if (sum(thresholds[,index] == opt$sample) == 1) {
     thresh_orig <- thresholds[which(thresholds[,index] == opt$sample),]
     thresh_orig[index] <- NULL
-    
+
     thresh <- list()
     cellsToRemove <- character()
     for (i in colnames(thresh_orig)) {
@@ -146,6 +149,14 @@ dev.off()
 
 figures$PreFilter_VlnPlot_RNA <- do.call("grid.arrange", c(plots, nrow=1))
 
+
+plots <- sapply(c("percent.rpl", "percent.rps"), function(x) doVlnPlot(aspect=x, seur=seur, thresh=thresh))
+
+png("PreFilter_VlnPlot_Ribo.png", height=7, width=5, units='in', res=300)
+do.call("grid.arrange", c(plots, nrow=1))
+dev.off()
+
+
 ## ----Pre-Filter UMAP Plot-------
 seur <- NormalizeData(seur, normalization.method = "LogNormalize", scale.factor = 10000)
 seur <- FindVariableFeatures(seur, selection.method = "vst", nfeatures = 2000)
@@ -189,6 +200,13 @@ dev.off()
 
 figures$PostFilter_VlnPlot_RNA <- VlnPlot(seur, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3)
 
+plots <- sapply(c("percent.rpl", "percent.rps"), function(x) doVlnPlot(aspect=x, seur=seur, thresh=thresh))
+
+png("PostFilter_VlnPlot_Ribo.png", height=7, width=5, units='in', res=300)
+do.call("grid.arrange", c(plots, nrow=1))
+dev.off()
+
+
 ## ----RNA Normalizing and Clustering----
 seur <- NormalizeData(seur, normalization.method = "LogNormalize", scale.factor = 10000)
 seur <- FindVariableFeatures(seur, selection.method = "vst", nfeatures = 2000)
@@ -203,11 +221,11 @@ coord <- Embeddings(seur, reduction='pca')[,1:30]
 d <- dist(coord, method="euclidean")
 for(resolution in c(0.1, seq(0.2,1.0,0.2), 1.5, 2.0)){
   seur <- FindClusters(seur, resolution = resolution)
-  
+
   #Calculate silhouette scores and generate plots
   try({
     clusters <- Idents(seur)
-    sil<-silhouette(as.numeric(clusters), dist=d)  
+    sil<-silhouette(as.numeric(clusters), dist=d)
     pdf(paste0("SilhouettePlot_res.",resolution,".pdf"))
     print(plot(sil, col=as.factor(clusters[order(clusters, decreasing=FALSE)]), main=paste("Silhouette plot of Seurat clustering - resolution ", resolution, sep=""), lty=2))
     print(abline(v=mean(sil[,3]), col="red4", lty=2))
@@ -236,4 +254,3 @@ saveRDS(seur, 'seur_cluster.rds')
 #saveRDS(figures, 'seur_figures.rds')
 
 writeLines(capture.output(devtools::session_info()), 'sessionInfo.txt')
-

Original file line number	Diff line number	Diff line change
`@@ -17,12 +17,16 @@`
`17`	`17`	`"hg2024": {`
`18`	`18`	`"gex_transcriptome": "/data/OpenOmics/references/cell-seek/human/refdata-gex-GRCh38-2024-A",`
`19`	`19`	`"cite_transcriptome": "/data/OpenOmics/references/cell-seek/human/refdata-gex-GRCh38-2024-A",`
`20`		`- "vdj_ref": "/data/OpenOmics/references/cell-seek/human/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.1.0/"`
	`20`	`+ "atac_ref": "/data/OpenOmics/references/cell-seek/human/refdata-cellranger-arc-GRCh38-2024-A",`
	`21`	`+ "vdj_ref": "/data/OpenOmics/references/cell-seek/human/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.1.0/",`
	`22`	`+ "arc_ref": "/data/OpenOmics/references/cell-seek/human/refdata-cellranger-arc-GRCh38-2024-A"`
`21`	`23`	`},`
`22`	`24`	`"mm2024": {`
`23`	`25`	`"gex_transcriptome": "/data/OpenOmics/references/cell-seek/mouse/refdata-gex-GRCm39-2024-A",`
`24`	`26`	`"cite_transcriptome": "/data/OpenOmics/references/cell-seek/mouse/refdata-gex-GRCm39-2024-A",`
`25`		`- "vdj_ref": "/data/OpenOmics/references/cell-seek/mouse/refdata-cellranger-vdj-GRCm38-alts-ensembl-7.0.0/"`
	`27`	`+ "atac_ref": "/data/OpenOmics/references/cell-seek/mouse/refdata-cellranger-arc-GRCm39-2024-A",`
	`28`	`+ "vdj_ref": "/data/OpenOmics/references/cell-seek/mouse/refdata-cellranger-vdj-GRCm38-alts-ensembl-7.0.0/",`
	`29`	`+ "arc_ref": "/data/OpenOmics/references/cell-seek/mouse/refdata-cellranger-arc-GRCm39-2024-A"`
`26`	`30`	`}`
`27`	`31`	`}`
`28`	`32`	`}`