Last updated: 2017-12-21

Code version: 6e42447

library(edgeR)
library(limma)
library(sva)
library(cate)
library(vicar)
library(ashr)
library(pROC)
source("../code/gdash.R")
mat = readRDS("../data/liver.sim.rds")
counts_to_summary = function (counts, design) {
  dgecounts = edgeR::calcNormFactors(edgeR::DGEList(counts = counts, group = design[, 2]))
  v = limma::voom(dgecounts, design, plot = FALSE)
  lim = limma::lmFit(v)
  r.ebayes = limma::eBayes(lim)
  p = r.ebayes$p.value[, 2]
  t = r.ebayes$t[, 2]
  z = sign(t) * qnorm(1 - p/2)
  betahat = lim$coefficients[,2]
  sebetahat = betahat / z
  return (list(betahat = betahat, sebetahat = sebetahat, z = z))
}
one_sim <- function (mat, ngene, nsamp, pi0, sd) {
## add simulated signals
mat.sim = seqgendiff::poisthin(t(mat), nsamp = nsamp, ngene = ngene, gselect = "random", signal_params = list(mean = 0, sd = sd), prop_null = pi0)
counts = t(mat.sim$Y) ## ngene * nsamples matrix 
design = mat.sim$X
beta = mat.sim$beta
which_signal = (beta != 0)

## methods using summary statistics only
summary = counts_to_summary(counts, design)

fit.pvalue = (1 - pnorm(abs(summary$z))) * 2
fit.BH = p.adjust(fit.pvalue, method = "BH")
fit.qvalue = qvalue::qvalue(fit.pvalue)
fit.locfdr = locfdr::locfdr(summary$z, bre = round(ngene / 20), plot = 0)
fit.ash = ashr::ash(summary$betahat, summary$sebetahat, mixcompdist = "normal", method = "fdr")
fit.gdash = gdash(summary$betahat, summary$sebetahat)
fit.gdash.ash = ashr::ash(summary$betahat, summary$sebetahat, fixg = TRUE, g = fit.gdash$fitted_g)

## methods using data matrix
Y = t(log(counts + 0.5))
X = design

num_sv <- sva::num.sv(dat = t(Y), mod = X, method = "be")

mout <- vicar::mouthwash(Y = Y, X = X, k = num_sv, cov_of_interest = 2, include_intercept = FALSE)

cate_cate <- cate::cate.fit(X.primary = X[, 2, drop = FALSE], X.nuis = X[, -2, drop = FALSE], Y = Y, r = num_sv, adj.method = "rr")

sva_sva <- sva::sva(dat = t(Y), mod = X, mod0 = X[, -2, drop = FALSE], n.sv = num_sv)
X.sva <- cbind(X, sva_sva$sv)
lmout <- limma::lmFit(object = t(Y), design = X.sva)
eout  <- limma::ebayes(lmout)
svaout           <- list()
svaout$betahat   <- lmout$coefficients[, 2]
svaout$sebetahat <- lmout$stdev.unscaled[, 2] * sqrt(eout$s2.post)
svaout$pvalues   <- eout$p.value[, 2]

## result: roc auc
roc_res = c(
  pvalue = pROC::roc(response = which_signal, predictor = fit.pvalue)$auc,
  BH = pROC::roc(response = which_signal, predictor = fit.BH)$auc,
  qvalue = pROC::roc(response = which_signal, predictor = fit.qvalue$lfdr)$auc,
  locfdr = pROC::roc(response = which_signal, predictor = fit.locfdr$fdr)$auc,
  ash = pROC::roc(response = which_signal, predictor = ashr::get_lfdr(fit.ash))$auc,
  cash = pROC::roc(response = which_signal, predictor = ashr::get_lfdr(fit.gdash.ash))$auc,
  mouthwash = pROC::roc(response = which_signal, predictor = c(mout$result$lfdr))$auc,
  cate = pROC::roc(response = which_signal, predictor = c(cate_cate$beta.p.value))$auc,
  sva = pROC::roc(response = which_signal, predictor = c(svaout$pvalues))$auc
)

## ash with summary statistics
method_list <- list()

method_list$cate           <- list()
method_list$cate$betahat   <- c(cate_cate$beta)
method_list$cate$sebetahat <- c(sqrt(cate_cate$beta.cov.row * cate_cate$beta.cov.col) / sqrt(nrow(X)))

method_list$sva             <- list()
method_list$sva$betahat     <- c(svaout$betahat)
method_list$sva$sebetahat   <- c(svaout$sebetahat)

ashfit <- lapply(method_list, FUN = function(x) {ashr::ash(x$betahat, x$sebetahat, mixcompdist = "normal", method = "fdr")})
ashfit$ash <- fit.ash
ashfit$cash <- fit.gdash.ash
ashfit$mouthwash <- mout
ashfit = ashfit[c("ash", "cash", "mouthwash", "cate", "sva")]

## pi0
pi0_res <- sapply(ashfit, FUN = ashr::get_pi0)
pi0_res <- c(
  qvalue = fit.qvalue$pi0,
  locfdr = min(1, fit.locfdr$fp0["mlest", "p0"]),
  pi0_res
  )

## mse
mse_res <- sapply(ashfit, FUN = function(x) {mean((ashr::get_pm(x) - beta)^2)})
mse_res <- c(ols = mean((summary$betahat - beta)^2), mse_res)

## pFDP calibration
pFDP_alpha = function (alpha, tail_stat, true, obs) {
  return(1 - mean(true[tail_stat <= alpha]))
}
pFSP_alpha = function (alpha, tail_stat, true, obs) {
  return(mean(sign(obs[tail_stat <= alpha]) != sign(true[tail_stat <= alpha])))
}

tail_cali_list = function (alpha_list, tail_cali_alpha, tail_stat, true, obs) {
  sapply(alpha_list, tail_cali_alpha, tail_stat, true, obs)
}
alpha_list = seq(0, 0.2, by = 0.001)
pFDP <- sapply(
  ashfit, FUN = function (x) {
    tail_cali_list(alpha_list, pFDP_alpha, ashr::get_qvalue(x), which_signal, x$data$x)
  }
)
pFDP_BH = tail_cali_list(alpha_list, pFDP_alpha, fit.BH, which_signal, summary$betahat)
pFDP_qvalue = tail_cali_list(alpha_list, pFDP_alpha, fit.qvalue$qvalues, which_signal, summary$betahat)
pFDP_res = cbind(BH = pFDP_BH, qvalue = pFDP_qvalue, pFDP)

## pFSR calibration
pFSP_res <- sapply(
  ashfit, FUN = function (x) {
  tail_cali_list(alpha_list, pFSP_alpha, ashr::get_svalue(x), beta, x$data$x)
  }
)

return(list(pi = pi0_res, mse = mse_res, auc = roc_res, alpha = alpha_list, pFDP = pFDP_res, pFSP = pFSP_res))
}
n_sim = function (n, mat, ngene, nsamp, pi0, sd) {
  pi0_list = mse_list = auc_list = pFDP_list = pFSP_list = list()
  for (i in 1 : n) {
    one_res = one_sim(mat, ngene, nsamp, pi0, sd)
    pi0_list[[i]] = one_res$pi
    mse_list[[i]] = one_res$mse
    auc_list[[i]] = one_res$auc
    pFDP_list[[i]] = one_res$pFDP
    pFSP_list[[i]] = one_res$pFSP
  }
  alpha_vec = one_res$alpha
  pi0_mat = matrix(unlist(pi0_list), nrow = n, byrow = TRUE)
  colnames(pi0_mat) = names(pi0_list[[1]])
  mse_mat = matrix(unlist(mse_list), nrow = n, byrow = TRUE)
  colnames(mse_mat) = names(mse_list[[1]])
  auc_mat = matrix(unlist(auc_list), nrow = n, byrow = TRUE)
  colnames(auc_mat) = names(auc_list[[1]])
  pFDP_mat = list()
  for (j in 1 : ncol(pFDP_list[[1]])) {
    pFDP_mat[[j]] = t(sapply(pFDP_list, FUN = function(x) {rbind(x[, j])}))
  }
  names(pFDP_mat) = colnames(pFDP_list[[1]])
  pFSP_mat = list()
  for (j in 1 : ncol(pFSP_list[[1]])) {
    pFSP_mat[[j]] = t(sapply(pFSP_list, FUN = function(x) {rbind(x[, j])}))
  }
  names(pFSP_mat) = colnames(pFSP_list[[1]])
  return(list(pi0 = pi0_mat, mse = mse_mat, auc = auc_mat, alpha = alpha_vec, pFDP = pFDP_mat, pFSP = pFSP_mat))
}
sd = 0.6
pi0 = 0.9
ngene = 1e3
nsamp = 10
nsim = 100

set.seed(777)
system.time(res <- n_sim(nsim, mat, ngene, nsamp, pi0, sd))
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Warning in log(rowSums(sweep(x = exp(ldmix - ldmax), MARGIN = 2, STATS =
pi_vals, : NaNs produced
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
Warning in locfdr::locfdr(summary$z, bre = round(ngene/20), plot = 0): CM
estimation failed, middle of histogram non-normal
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  
    user   system  elapsed 
1703.469  381.549 2135.044 

Session information

sessionInfo()
R version 3.4.3 (2017-11-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.2

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] Rmosek_8.0.69       PolynomF_1.0-1      CVXR_0.94-4        
 [4] REBayes_1.2         Matrix_1.2-12       SQUAREM_2017.10-1  
 [7] EQL_1.0-0           ttutils_1.0-1       pROC_1.10.0        
[10] ashr_2.2-2          vicar_0.1.6         cate_1.0.4         
[13] sva_3.26.0          BiocParallel_1.12.0 genefilter_1.60.0  
[16] mgcv_1.8-22         nlme_3.1-131        edgeR_3.20.2       
[19] limma_3.34.4       

loaded via a namespace (and not attached):
 [1] Biobase_2.38.0       svd_0.4.1            bit64_0.9-7         
 [4] splines_3.4.3        foreach_1.4.4        ECOSolveR_0.3-2     
 [7] R.utils_2.6.0        stats4_3.4.3         blob_1.1.0          
[10] yaml_2.1.16          RSQLite_2.0          backports_1.1.2     
[13] lattice_0.20-35      digest_0.6.13        colorspace_1.3-2    
[16] R.oo_1.21.0          htmltools_0.3.6      plyr_1.8.4          
[19] XML_3.98-1.9         esaBcv_1.2.1         xtable_1.8-2        
[22] corpcor_1.6.9        scales_0.5.0         scs_1.1-1           
[25] git2r_0.20.0         tibble_1.3.4         annotate_1.56.1     
[28] gmp_0.5-13.1         IRanges_2.12.0       ggplot2_2.2.1       
[31] BiocGenerics_0.24.0  lazyeval_0.2.1       Rmpfr_0.6-1         
[34] survival_2.41-3      magrittr_1.5         memoise_1.1.0       
[37] evaluate_0.10.1      R.methodsS3_1.7.1    doParallel_1.0.11   
[40] MASS_7.3-47          truncnorm_1.0-7      tools_3.4.3         
[43] matrixStats_0.52.2   stringr_1.2.0        S4Vectors_0.16.0    
[46] munsell_0.4.3        locfit_1.5-9.1       AnnotationDbi_1.40.0
[49] compiler_3.4.3       rlang_0.1.4          grid_3.4.3          
[52] leapp_1.2            RCurl_1.95-4.8       iterators_1.0.9     
[55] bitops_1.0-6         rmarkdown_1.8        gtable_0.2.0        
[58] codetools_0.2-15     DBI_0.7              R6_2.2.2            
[61] ruv_0.9.6            knitr_1.17           bit_1.1-12          
[64] rprojroot_1.3-1      stringi_1.1.6        pscl_1.5.2          
[67] parallel_3.4.3       Rcpp_0.12.14        

This R Markdown site was created with workflowr