11# ' Fit Logistic Regression Model of FNR against set of positive control (ubiquitously expressed) genes
22# '
3- # ' @details logit(Probability of False Negative) ~ a + b*(mean log10p1 expression) .
3+ # ' @details logit(Probability of False Negative) ~ a + b*(median log- expression) .
44# '
55# ' @param expr matrix The data matrix in transcript-proportional units (genes in rows, cells in columns).
66# ' @param pos_controls A logical vector indexing positive control genes that will be used to compute false-negative rate characteristics.
7+ # ' User must provide at least 2 positive control genes.
78# ' @param fn_tresh Inclusive threshold for negative detection. Default 0.01.
9+ # ' fn_tresh must be non-negative.
810# '
9- # ' @return A list of logistic regression coefficients corresponding to glm fits in each sample. If a fit did not converge, the result reported is NA.
11+ # ' @return A matrix of logistic regression coefficients corresponding to glm fits in each sample (a and b in columns 1 and 2 respectively). If the a & b fit does not converge, b is set to zero and only a is estimated.
12+ # '
13+ # ' @importFrom boot logit
14+ # ' @importFrom matrixStats rowMedians
1015# '
1116simple_FNR_params = function (expr , pos_controls , fn_tresh = 0.01 ){
1217
13- # Mean log10p1 expression
14- mu_obs = rowMeans(log10(expr [pos_controls ,]+ 1 ))
15-
16- # Drop-outs
17- drop_outs = 0 + (expr [pos_controls ,] < = fn_tresh )
18+ stopifnot(! any(is.na(pos_controls )))
19+
20+ if (sum(pos_controls ) < 2 ){
21+ stop(" User must provide at least 2 positive control genes" )
22+ }
23+
24+ if (fn_tresh < 0 ){
25+ stop(" fn_tresh must be non-negative" )
26+ }
27+
28+ pos_expr = expr [pos_controls ,] # Selecting positive-control genes
29+ is_drop = pos_expr < = fn_tresh # Identify false negatives
30+ pos_expr [is_drop ] = NA # Set false negatives to NA
31+ drop_outs = 0 + is_drop # Numeric drop-out state
32+ drop_rate = colMeans(drop_outs ) # Total drop-out rate per sample
33+
34+ # Median log-expression in positive observations
35+ mu_obs = log(rowMedians(pos_expr ,na.rm = TRUE ))
36+ if (any(is.na(mu_obs ))){
37+ stop(" Median log-expression in positive observations NA for some positive control gene/s" )
38+ }
1839
1940 # Logistic Regression Model of FNR
20- ref.glms = list ( )
21- for (si in 1 : dim( drop_outs )[ 2 ] ){
41+ logistic_coef = matrix ( 0 ,ncol( drop_outs ), 2 )
42+ for (si in seq_len(ncol( drop_outs )) ){
2243 fit = suppressWarnings(glm(cbind(drop_outs [,si ],1 - drop_outs [,si ]) ~ mu_obs ,family = binomial(logit )))
2344 if (fit $ converged ){
24- ref.glms [[ si ] ] = fit $ coefficients
45+ logistic_coef [ si , ] = fit $ coefficients
2546 } else {
26- ref.glms [[ si ]] = NA
47+ logistic_coef [ si , 1 ] = logit( drop_rate [ si ])
2748 }
2849 }
29- return (ref.glms )
50+ return (logistic_coef )
3051}
3152
3253# ' metric-based sample filtering: function to filter single-cell RNA-Seq libraries.
@@ -51,7 +72,7 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){
5172# ' If NULL, filtered_fnr will be returned NA.
5273# ' @param scale. logical. Will expression be scaled by total expression for FNR computation? Default = FALSE
5374# ' @param glen Gene lengths for gene-length normalization (normalized data used in FNR computation).
54- # ' @param AUC_range An array of two values, representing range over which FNR AUC will be computed (log10 (expr_units + 1 )). Default c(0,6 )
75+ # ' @param AUC_range An array of two values, representing range over which FNR AUC will be computed (log (expr_units)). Default c(0,15 )
5576# ' @param zcut A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1.
5677# ' If NULL, only hard threshold sub-criteria will be applied.
5778# ' @param mixture A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (metric).
@@ -61,11 +82,11 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){
6182# ' @param hard_nreads numeric. Hard (lower bound on) nreads threshold. Default 25000.
6283# ' @param hard_ralign numeric. Hard (lower bound on) ralign threshold. Default 15.
6384# ' @param hard_breadth numeric. Hard (lower bound on) breadth threshold. Default 0.2.
64- # ' @param hard_fnr numeric. Hard (upper bound on) fnr threshold. Default 3 .
65- # ' @param suff_nreads numeric. If not null, serves as an upper bound on nreads threshold.
66- # ' @param suff_ralign numeric. If not null, serves as an upper bound on ralign threshold. Default 65 .
67- # ' @param suff_breadth numeric. If not null, serves as an upper bound on breadth threshold. Default 0.8 .
68- # ' @param suff_fnr numeric. If not null, serves as an lower bound on fnr threshold.
85+ # ' @param hard_auc numeric. Hard (upper bound on) fnr auc threshold. Default 10 .
86+ # ' @param suff_nreads numeric. If not null, serves as an overriding upper bound on nreads threshold.
87+ # ' @param suff_ralign numeric. If not null, serves as an overriding upper bound on ralign threshold.
88+ # ' @param suff_breadth numeric. If not null, serves as an overriding upper bound on breadth threshold.
89+ # ' @param suff_auc numeric. If not null, serves as an overriding lower bound on fnr auc threshold.
6990# ' @param plot logical. Should a plot be produced?
7091# ' @param hist_breaks hist() breaks argument. Ignored if `plot=FALSE`.
7192# '
@@ -79,15 +100,16 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){
79100# '
80101# '@importFrom mixtools normalmixEM
81102# '@importFrom diptest dip.test
103+ # '@importFrom boot inv.logit
82104# '@export
83105# '
84106# '
85107metric_sample_filter = function (expr , nreads = colSums(expr ), ralign = NULL ,
86108 gene_filter = NULL , pos_controls = NULL ,scale. = FALSE ,glen = NULL ,
87- AUC_range = c(0 ,6 ), zcut = 1 ,
109+ AUC_range = c(0 ,15 ), zcut = 1 ,
88110 mixture = TRUE , dip_thresh = 0.05 ,
89- hard_nreads = 25000 , hard_ralign = 15 , hard_breadth = 0.2 , hard_fnr = 3 ,
90- suff_nreads = NULL , suff_ralign = 65 , suff_breadth = 0.8 , suff_fnr = NULL ,
111+ hard_nreads = 25000 , hard_ralign = 15 , hard_breadth = 0.2 , hard_auc = 10 ,
112+ suff_nreads = NULL , suff_ralign = NULL , suff_breadth = NULL , suff_auc = NULL ,
91113 plot = FALSE , hist_breaks = 10 ){
92114
93115 criterion_count = 0
@@ -210,17 +232,17 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
210232 }
211233
212234 # Compute FNR AUC
213- ref.glms = simple_FNR_params(expr = nexpr , pos_controls = pos_controls )
235+ logistic_coef = simple_FNR_params(expr = nexpr , pos_controls = pos_controls )
214236 AUC = NULL
215237 for (si in 1 : dim(expr )[2 ]){
216- if (! any(is.na( ref.glms [[ si ]])) ){
217- AUC [si ] = log(exp(ref.glms [[ si ]][ 1 ] + ref.glms [[ si ]][ 2 ] * AUC_range [2 ]) + 1 )/ ref.glms [[ si ]][ 2 ] - log(exp(ref.glms [[ si ]][ 1 ] + ref.glms [[ si ]][ 2 ] * AUC_range [1 ]) + 1 )/ ref.glms [[ si ]][ 2 ]
238+ if (logistic_coef [ si , 2 ] != 0 ){
239+ AUC [si ] = log(exp(logistic_coef [ si , 1 ] + logistic_coef [ si , 2 ] * AUC_range [2 ]) + 1 )/ logistic_coef [ si , 2 ] - log(exp(logistic_coef [ si , 1 ] + logistic_coef [ si , 2 ] * AUC_range [1 ]) + 1 )/ logistic_coef [ si , 2 ]
218240 } else {
219- stop( " glm fit did not converge " )
241+ AUC [ si ] = inv.logit( logistic_coef [ si , 1 ]) * ( AUC_range [ 2 ] - AUC_range [ 1 ] )
220242 }
221243 }
222244
223- AUC_CUTOFF = hard_fnr
245+ AUC_CUTOFF = hard_auc
224246
225247 if (! is.null(zcut )){
226248
@@ -239,8 +261,8 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
239261 }
240262 }
241263
242- if (! is.null(suff_fnr )){
243- AUC_CUTOFF = max(AUC_CUTOFF ,suff_fnr )
264+ if (! is.null(suff_auc )){
265+ AUC_CUTOFF = max(AUC_CUTOFF ,suff_auc )
244266 }
245267 }
246268 filtered_fnr = AUC > AUC_CUTOFF
@@ -254,7 +276,8 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
254276
255277 is_bad = rep(FALSE ,dim(expr )[2 ])
256278
257- par(mfcol = c(criterion_count ,2 ))
279+ op <- par(mfcol = c(criterion_count ,2 ))
280+ on.exit(par(op ))
258281
259282 if (! is.null(nreads )){
260283 is_bad = filtered_nreads
@@ -304,7 +327,7 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
304327 }else {
305328 hist(AUC , main = paste0(" auc: Thresh = " ,signif(AUC_CUTOFF ,3 )," , Rm = " ,sum(filtered_fnr )," , Tot_Rm = " ,sum(is_bad )), xlab = " FNR AUC" , breaks = hist_breaks )
306329 }
307- abline(v = hard_fnr , col = " yellow" , lty = 1 )
330+ abline(v = hard_auc , col = " yellow" , lty = 1 )
308331 abline(v = AUC_CUTOFF , col = " blue" , lty = 2 )
309332 }
310333
@@ -320,13 +343,6 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
320343 if (! is.null(pos_controls )){
321344 hist(AUC [! is_bad ], main = paste0(" auc: Tot = " ,sum(! is_bad )), xlab = " FNR AUC" , breaks = hist_breaks )
322345 }
323-
324- # v = rbind(filtered_nreads,filtered_ralign,filtered_breadth,filtered_fnr)
325- # rownames(v) = c("nreads","ralign","breadth","fnr")
326- # v = na.omit(v)
327- # m = v %*% t(v)
328- #
329- # barplot(m, beside = TRUE, legend.text = TRUE)
330346 }
331347
332348 return (list (filtered_nreads = filtered_nreads ,
@@ -417,8 +433,9 @@ factor_sample_filter = function(expr, qual, gene_filter = NULL, max_exp_pcs = 5,
417433 num_qual_pcs = which(csum > min_qual_variance )[1 ]
418434
419435 if (plot ){
436+ op <- par(mfrow = c(2 ,1 ))
437+ on.exit(par(op ))
420438 for (i in 1 : num_qual_pcs ){
421- par(mfrow = c(2 ,1 ))
422439 hist(qpc $ x [,i ],breaks = hist_breaks , main = paste0(" Distribution of Quality PC " ,i ), xlab = paste0(" Qual PC" ,i ))
423440 barplot(abs(qpc $ rotation [,i ]),col = c(" red" ," green" )[1 + (qpc $ rotation [,i ] > 0 )], cex.names = .25 ,horiz = T , las = 1 , main = " Loadings" )
424441 }
0 commit comments