From 68e88119e4946b44998bc4a1e730c60fa65c59f7 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Mon, 16 Jun 2025 17:37:00 -0500 Subject: [PATCH 01/24] adding gitignore file, adding .DS_store to that --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a4fe18b..24b1e75 100644 --- a/.gitignore +++ b/.gitignore @@ -398,3 +398,4 @@ FodyWeavers.xsd # JetBrains Rider *.sln.iml +.DS_Store From 4a09f2f41f7616a0b59f16d03ad59270259411b6 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Mon, 16 Jun 2025 17:45:47 -0500 Subject: [PATCH 02/24] adding input data contents (other than intersecting genes csv file) to gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 24b1e75..3e954a1 100644 --- a/.gitignore +++ b/.gitignore @@ -399,3 +399,7 @@ FodyWeavers.xsd # JetBrains Rider *.sln.iml .DS_Store + +# All input_data, except for intersecting genes csv file +/input_data/* +!/input_data/intersect_3d.csv From 0d803ce78528683a0631a7d951945176c60b6379 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Tue, 17 Jun 2025 10:58:33 -0600 Subject: [PATCH 03/24] moving loading renv to a new script on its own and renaming the get_data script to match the new numbering --- scripts/01_load_renv.R | 12 ++++++++++++ scripts/{1_get_data.R => 02_get_data.R} | 0 2 files changed, 12 insertions(+) create mode 100644 scripts/01_load_renv.R rename scripts/{1_get_data.R => 02_get_data.R} (100%) diff --git a/scripts/01_load_renv.R b/scripts/01_load_renv.R new file mode 100644 index 0000000..2087e2b --- /dev/null +++ b/scripts/01_load_renv.R @@ -0,0 +1,12 @@ +########################################################################################## +### 01_load_renv.R +### +### This script loads the renv from the lockfile. +########################################################################################## + + +# Load the renv library from the lockfile +if (!requireNamespace("renv", quietly = TRUE)) { + install.packages("renv", version = "1.1.0") # Install renv if not already installed +} +renv::restore() \ No newline at end of file diff --git a/scripts/1_get_data.R b/scripts/02_get_data.R similarity index 100% rename from scripts/1_get_data.R rename to scripts/02_get_data.R From ea8081a8ffafe87276bc007c7b40c5f9f3fac8f7 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Tue, 17 Jun 2025 11:06:03 -0600 Subject: [PATCH 04/24] updating numbering for scripts --- scripts/{0_unzip_input_data.py => 00_unzip_input_data.py} | 0 scripts/02_get_data.R | 2 +- scripts/{2_get_clustering.R => 03_get_clustering.R} | 2 +- .../{3_prepare_reference_data.R => 04_prepare_reference_data.R} | 2 +- scripts/{4_run_instaprism.R => 05_run_instaprism.R} | 2 +- ...e_instaprism_outputs.R => 06_visualize_instaprism_outputs.R} | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) rename scripts/{0_unzip_input_data.py => 00_unzip_input_data.py} (100%) rename scripts/{2_get_clustering.R => 03_get_clustering.R} (99%) rename scripts/{3_prepare_reference_data.R => 04_prepare_reference_data.R} (99%) rename scripts/{4_run_instaprism.R => 05_run_instaprism.R} (99%) rename scripts/{5_visualize_instaprism_outputs.R => 06_visualize_instaprism_outputs.R} (99%) diff --git a/scripts/0_unzip_input_data.py b/scripts/00_unzip_input_data.py similarity index 100% rename from scripts/0_unzip_input_data.py rename to scripts/00_unzip_input_data.py diff --git a/scripts/02_get_data.R b/scripts/02_get_data.R index 25436d6..50f2c86 100644 --- a/scripts/02_get_data.R +++ b/scripts/02_get_data.R @@ -1,5 +1,5 @@ ########################################################################################## -### 1_get_data.R +### 02_get_data.R ### ### This script reads the bulk RNA sequencing and microarray datasets and filters them ### to only include genes present in one common gene mapping list. It transforms the diff --git a/scripts/2_get_clustering.R b/scripts/03_get_clustering.R similarity index 99% rename from scripts/2_get_clustering.R rename to scripts/03_get_clustering.R index 8fa1c27..d7c4a47 100644 --- a/scripts/2_get_clustering.R +++ b/scripts/03_get_clustering.R @@ -1,5 +1,5 @@ ########################################################################################## -### 2_get_clustering.R +### 03_get_clustering.R ### ### This script performs k-means clustering (k=2,3,4), NMF clustering (k=2,3,4), and ### consensusOV subtyping for each bulk dataset. For k-means clustering, it uses log10(...+1) diff --git a/scripts/3_prepare_reference_data.R b/scripts/04_prepare_reference_data.R similarity index 99% rename from scripts/3_prepare_reference_data.R rename to scripts/04_prepare_reference_data.R index db80dfa..4f207b3 100644 --- a/scripts/3_prepare_reference_data.R +++ b/scripts/04_prepare_reference_data.R @@ -1,5 +1,5 @@ ########################################################################################## -### 3_prepare_reference_data.R +### 04_prepare_reference_data.R ### ### This script creates a single-cell/single-nucleus reference matrix for use as input into ### InstaPrism. It requires single cell RNA sequencing data of HGSOC diff --git a/scripts/4_run_instaprism.R b/scripts/05_run_instaprism.R similarity index 99% rename from scripts/4_run_instaprism.R rename to scripts/05_run_instaprism.R index 1e7eb3c..664062f 100644 --- a/scripts/4_run_instaprism.R +++ b/scripts/05_run_instaprism.R @@ -1,5 +1,5 @@ ########################################################################################## -### 4_run_instaprism.R +### 05_run_instaprism.R ### ### This script runs InstaPrism, an R package that performs deconvolution with a similar ### but faster method to BayesPrism. First, it loads in the reference single cell plus diff --git a/scripts/5_visualize_instaprism_outputs.R b/scripts/06_visualize_instaprism_outputs.R similarity index 99% rename from scripts/5_visualize_instaprism_outputs.R rename to scripts/06_visualize_instaprism_outputs.R index 078fd32..2b56b7d 100644 --- a/scripts/5_visualize_instaprism_outputs.R +++ b/scripts/06_visualize_instaprism_outputs.R @@ -1,5 +1,5 @@ ########################################################################################## -### 5_visualize_instaprism_output.R +### 06_visualize_instaprism_output.R ### ### This script creates several figures to visualize the deconvolution results generated ### in the previous script, and compare the results when run with and without adipocyte From bd1e9acdcfce20a155d34746d7518e468aca8b86 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Tue, 17 Jun 2025 11:14:10 -0600 Subject: [PATCH 05/24] Adding renv folder with activate.R and settings.json --- renv/activate.R | 1313 ++++++++++++++++++++++++++++++++++++++++++++ renv/settings.json | 19 + 2 files changed, 1332 insertions(+) create mode 100644 renv/activate.R create mode 100644 renv/settings.json diff --git a/renv/activate.R b/renv/activate.R new file mode 100644 index 0000000..2fe247d --- /dev/null +++ b/renv/activate.R @@ -0,0 +1,1313 @@ + +local({ + + # the requested version of renv + version <- "1.1.1" + attr(version, "sha") <- NULL + + # the project directory + project <- Sys.getenv("RENV_PROJECT") + if (!nzchar(project)) + project <- getwd() + + # use start-up diagnostics if enabled + diagnostics <- Sys.getenv("RENV_STARTUP_DIAGNOSTICS", unset = "FALSE") + if (diagnostics) { + start <- Sys.time() + profile <- tempfile("renv-startup-", fileext = ".Rprof") + utils::Rprof(profile) + on.exit({ + utils::Rprof(NULL) + elapsed <- signif(difftime(Sys.time(), start, units = "auto"), digits = 2L) + writeLines(sprintf("- renv took %s to run the autoloader.", format(elapsed))) + writeLines(sprintf("- Profile: %s", profile)) + print(utils::summaryRprof(profile)) + }, add = TRUE) + } + + # figure out whether the autoloader is enabled + enabled <- local({ + + # first, check config option + override <- getOption("renv.config.autoloader.enabled") + if (!is.null(override)) + return(override) + + # if we're being run in a context where R_LIBS is already set, + # don't load -- presumably we're being run as a sub-process and + # the parent process has already set up library paths for us + rcmd <- Sys.getenv("R_CMD", unset = NA) + rlibs <- Sys.getenv("R_LIBS", unset = NA) + if (!is.na(rlibs) && !is.na(rcmd)) + return(FALSE) + + # next, check environment variables + # prefer using the configuration one in the future + envvars <- c( + "RENV_CONFIG_AUTOLOADER_ENABLED", + "RENV_AUTOLOADER_ENABLED", + "RENV_ACTIVATE_PROJECT" + ) + + for (envvar in envvars) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(tolower(envval) %in% c("true", "t", "1")) + } + + # enable by default + TRUE + + }) + + # bail if we're not enabled + if (!enabled) { + + # if we're not enabled, we might still need to manually load + # the user profile here + profile <- Sys.getenv("R_PROFILE_USER", unset = "~/.Rprofile") + if (file.exists(profile)) { + cfg <- Sys.getenv("RENV_CONFIG_USER_PROFILE", unset = "TRUE") + if (tolower(cfg) %in% c("true", "t", "1")) + sys.source(profile, envir = globalenv()) + } + + return(FALSE) + + } + + # avoid recursion + if (identical(getOption("renv.autoloader.running"), TRUE)) { + warning("ignoring recursive attempt to run renv autoloader") + return(invisible(TRUE)) + } + + # signal that we're loading renv during R startup + options(renv.autoloader.running = TRUE) + on.exit(options(renv.autoloader.running = NULL), add = TRUE) + + # signal that we've consented to use renv + options(renv.consent = TRUE) + + # load the 'utils' package eagerly -- this ensures that renv shims, which + # mask 'utils' packages, will come first on the search path + library(utils, lib.loc = .Library) + + # unload renv if it's already been loaded + if ("renv" %in% loadedNamespaces()) + unloadNamespace("renv") + + # load bootstrap tools + ansify <- function(text) { + if (renv_ansify_enabled()) + renv_ansify_enhanced(text) + else + renv_ansify_default(text) + } + + renv_ansify_enabled <- function() { + + override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA) + if (!is.na(override)) + return(as.logical(override)) + + pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA) + if (identical(pane, "build")) + return(FALSE) + + testthat <- Sys.getenv("TESTTHAT", unset = "false") + if (tolower(testthat) %in% "true") + return(FALSE) + + iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false") + if (tolower(iderun) %in% "false") + return(FALSE) + + TRUE + + } + + renv_ansify_default <- function(text) { + text + } + + renv_ansify_enhanced <- function(text) { + + # R help links + pattern <- "`\\?(renv::(?:[^`])+)`" + replacement <- "`\033]8;;x-r-help:\\1\a?\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # runnable code + pattern <- "`(renv::(?:[^`])+)`" + replacement <- "`\033]8;;x-r-run:\\1\a\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # return ansified text + text + + } + + renv_ansify_init <- function() { + + envir <- renv_envir_self() + if (renv_ansify_enabled()) + assign("ansify", renv_ansify_enhanced, envir = envir) + else + assign("ansify", renv_ansify_default, envir = envir) + + } + + `%||%` <- function(x, y) { + if (is.null(x)) y else x + } + + catf <- function(fmt, ..., appendLF = TRUE) { + + quiet <- getOption("renv.bootstrap.quiet", default = FALSE) + if (quiet) + return(invisible()) + + msg <- sprintf(fmt, ...) + cat(msg, file = stdout(), sep = if (appendLF) "\n" else "") + + invisible(msg) + + } + + header <- function(label, + ..., + prefix = "#", + suffix = "-", + n = min(getOption("width"), 78)) + { + label <- sprintf(label, ...) + n <- max(n - nchar(label) - nchar(prefix) - 2L, 8L) + if (n <= 0) + return(paste(prefix, label)) + + tail <- paste(rep.int(suffix, n), collapse = "") + paste0(prefix, " ", label, " ", tail) + + } + + heredoc <- function(text, leave = 0) { + + # remove leading, trailing whitespace + trimmed <- gsub("^\\s*\\n|\\n\\s*$", "", text) + + # split into lines + lines <- strsplit(trimmed, "\n", fixed = TRUE)[[1L]] + + # compute common indent + indent <- regexpr("[^[:space:]]", lines) + common <- min(setdiff(indent, -1L)) - leave + text <- paste(substring(lines, common), collapse = "\n") + + # substitute in ANSI links for executable renv code + ansify(text) + + } + + bootstrap <- function(version, library) { + + friendly <- renv_bootstrap_version_friendly(version) + section <- header(sprintf("Bootstrapping renv %s", friendly)) + catf(section) + + # attempt to download renv + catf("- Downloading renv ... ", appendLF = FALSE) + withCallingHandlers( + tarball <- renv_bootstrap_download(version), + error = function(err) { + catf("FAILED") + stop("failed to download:\n", conditionMessage(err)) + } + ) + catf("OK") + on.exit(unlink(tarball), add = TRUE) + + # now attempt to install + catf("- Installing renv ... ", appendLF = FALSE) + withCallingHandlers( + status <- renv_bootstrap_install(version, tarball, library), + error = function(err) { + catf("FAILED") + stop("failed to install:\n", conditionMessage(err)) + } + ) + catf("OK") + + # add empty line to break up bootstrapping from normal output + catf("") + + return(invisible()) + } + + renv_bootstrap_tests_running <- function() { + getOption("renv.tests.running", default = FALSE) + } + + renv_bootstrap_repos <- function() { + + # get CRAN repository + cran <- getOption("renv.repos.cran", "https://cloud.r-project.org") + + # check for repos override + repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA) + if (!is.na(repos)) { + + # check for RSPM; if set, use a fallback repository for renv + rspm <- Sys.getenv("RSPM", unset = NA) + if (identical(rspm, repos)) + repos <- c(RSPM = rspm, CRAN = cran) + + return(repos) + + } + + # check for lockfile repositories + repos <- tryCatch(renv_bootstrap_repos_lockfile(), error = identity) + if (!inherits(repos, "error") && length(repos)) + return(repos) + + # retrieve current repos + repos <- getOption("repos") + + # ensure @CRAN@ entries are resolved + repos[repos == "@CRAN@"] <- cran + + # add in renv.bootstrap.repos if set + default <- c(FALLBACK = "https://cloud.r-project.org") + extra <- getOption("renv.bootstrap.repos", default = default) + repos <- c(repos, extra) + + # remove duplicates that might've snuck in + dupes <- duplicated(repos) | duplicated(names(repos)) + repos[!dupes] + + } + + renv_bootstrap_repos_lockfile <- function() { + + lockpath <- Sys.getenv("RENV_PATHS_LOCKFILE", unset = "renv.lock") + if (!file.exists(lockpath)) + return(NULL) + + lockfile <- tryCatch(renv_json_read(lockpath), error = identity) + if (inherits(lockfile, "error")) { + warning(lockfile) + return(NULL) + } + + repos <- lockfile$R$Repositories + if (length(repos) == 0) + return(NULL) + + keys <- vapply(repos, `[[`, "Name", FUN.VALUE = character(1)) + vals <- vapply(repos, `[[`, "URL", FUN.VALUE = character(1)) + names(vals) <- keys + + return(vals) + + } + + renv_bootstrap_download <- function(version) { + + sha <- attr(version, "sha", exact = TRUE) + + methods <- if (!is.null(sha)) { + + # attempting to bootstrap a development version of renv + c( + function() renv_bootstrap_download_tarball(sha), + function() renv_bootstrap_download_github(sha) + ) + + } else { + + # attempting to bootstrap a release version of renv + c( + function() renv_bootstrap_download_tarball(version), + function() renv_bootstrap_download_cran_latest(version), + function() renv_bootstrap_download_cran_archive(version) + ) + + } + + for (method in methods) { + path <- tryCatch(method(), error = identity) + if (is.character(path) && file.exists(path)) + return(path) + } + + stop("All download methods failed") + + } + + renv_bootstrap_download_impl <- function(url, destfile) { + + mode <- "wb" + + # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715 + fixup <- + Sys.info()[["sysname"]] == "Windows" && + substring(url, 1L, 5L) == "file:" + + if (fixup) + mode <- "w+b" + + args <- list( + url = url, + destfile = destfile, + mode = mode, + quiet = TRUE + ) + + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(url) + if (length(headers) && is.character(headers)) + args$headers <- headers + } + + do.call(utils::download.file, args) + + } + + renv_bootstrap_download_custom_headers <- function(url) { + + headers <- getOption("renv.download.headers") + if (is.null(headers)) + return(character()) + + if (!is.function(headers)) + stopf("'renv.download.headers' is not a function") + + headers <- headers(url) + if (length(headers) == 0L) + return(character()) + + if (is.list(headers)) + headers <- unlist(headers, recursive = FALSE, use.names = TRUE) + + ok <- + is.character(headers) && + is.character(names(headers)) && + all(nzchar(names(headers))) + + if (!ok) + stop("invocation of 'renv.download.headers' did not return a named character vector") + + headers + + } + + renv_bootstrap_download_cran_latest <- function(version) { + + spec <- renv_bootstrap_download_cran_latest_find(version) + type <- spec$type + repos <- spec$repos + + baseurl <- utils::contrib.url(repos = repos, type = type) + ext <- if (identical(type, "source")) + ".tar.gz" + else if (Sys.info()[["sysname"]] == "Windows") + ".zip" + else + ".tgz" + name <- sprintf("renv_%s%s", version, ext) + url <- paste(baseurl, name, sep = "/") + + destfile <- file.path(tempdir(), name) + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (inherits(status, "condition")) + return(FALSE) + + # report success and return + destfile + + } + + renv_bootstrap_download_cran_latest_find <- function(version) { + + # check whether binaries are supported on this system + binary <- + getOption("renv.bootstrap.binary", default = TRUE) && + !identical(.Platform$pkgType, "source") && + !identical(getOption("pkgType"), "source") && + Sys.info()[["sysname"]] %in% c("Darwin", "Windows") + + types <- c(if (binary) "binary", "source") + + # iterate over types + repositories + for (type in types) { + for (repos in renv_bootstrap_repos()) { + + # build arguments for utils::available.packages() call + args <- list(type = type, repos = repos) + + # add custom headers if available -- note that + # utils::available.packages() will pass this to download.file() + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(repos) + if (length(headers) && is.character(headers)) + args$headers <- headers + } + + # retrieve package database + db <- tryCatch( + as.data.frame( + do.call(utils::available.packages, args), + stringsAsFactors = FALSE + ), + error = identity + ) + + if (inherits(db, "error")) + next + + # check for compatible entry + entry <- db[db$Package %in% "renv" & db$Version %in% version, ] + if (nrow(entry) == 0) + next + + # found it; return spec to caller + spec <- list(entry = entry, type = type, repos = repos) + return(spec) + + } + } + + # if we got here, we failed to find renv + fmt <- "renv %s is not available from your declared package repositories" + stop(sprintf(fmt, version)) + + } + + renv_bootstrap_download_cran_archive <- function(version) { + + name <- sprintf("renv_%s.tar.gz", version) + repos <- renv_bootstrap_repos() + urls <- file.path(repos, "src/contrib/Archive/renv", name) + destfile <- file.path(tempdir(), name) + + for (url in urls) { + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (identical(status, 0L)) + return(destfile) + + } + + return(FALSE) + + } + + renv_bootstrap_download_tarball <- function(version) { + + # if the user has provided the path to a tarball via + # an environment variable, then use it + tarball <- Sys.getenv("RENV_BOOTSTRAP_TARBALL", unset = NA) + if (is.na(tarball)) + return() + + # allow directories + if (dir.exists(tarball)) { + name <- sprintf("renv_%s.tar.gz", version) + tarball <- file.path(tarball, name) + } + + # bail if it doesn't exist + if (!file.exists(tarball)) { + + # let the user know we weren't able to honour their request + fmt <- "- RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist." + msg <- sprintf(fmt, tarball) + warning(msg) + + # bail + return() + + } + + catf("- Using local tarball '%s'.", tarball) + tarball + + } + + renv_bootstrap_github_token <- function() { + for (envvar in c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(envval) + } + } + + renv_bootstrap_download_github <- function(version) { + + enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") + if (!identical(enabled, "TRUE")) + return(FALSE) + + # prepare download options + token <- renv_bootstrap_github_token() + if (is.null(token)) + token <- "" + + if (nzchar(Sys.which("curl")) && nzchar(token)) { + fmt <- "--location --fail --header \"Authorization: token %s\"" + extra <- sprintf(fmt, token) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "curl", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } else if (nzchar(Sys.which("wget")) && nzchar(token)) { + fmt <- "--header=\"Authorization: token %s\"" + extra <- sprintf(fmt, token) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "wget", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } + + url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version) + name <- sprintf("renv_%s.tar.gz", version) + destfile <- file.path(tempdir(), name) + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (!identical(status, 0L)) + return(FALSE) + + renv_bootstrap_download_augment(destfile) + + return(destfile) + + } + + # Add Sha to DESCRIPTION. This is stop gap until #890, after which we + # can use renv::install() to fully capture metadata. + renv_bootstrap_download_augment <- function(destfile) { + sha <- renv_bootstrap_git_extract_sha1_tar(destfile) + if (is.null(sha)) { + return() + } + + # Untar + tempdir <- tempfile("renv-github-") + on.exit(unlink(tempdir, recursive = TRUE), add = TRUE) + untar(destfile, exdir = tempdir) + pkgdir <- dir(tempdir, full.names = TRUE)[[1]] + + # Modify description + desc_path <- file.path(pkgdir, "DESCRIPTION") + desc_lines <- readLines(desc_path) + remotes_fields <- c( + "RemoteType: github", + "RemoteHost: api.github.com", + "RemoteRepo: renv", + "RemoteUsername: rstudio", + "RemotePkgRef: rstudio/renv", + paste("RemoteRef: ", sha), + paste("RemoteSha: ", sha) + ) + writeLines(c(desc_lines[desc_lines != ""], remotes_fields), con = desc_path) + + # Re-tar + local({ + old <- setwd(tempdir) + on.exit(setwd(old), add = TRUE) + + tar(destfile, compression = "gzip") + }) + invisible() + } + + # Extract the commit hash from a git archive. Git archives include the SHA1 + # hash as the comment field of the tarball pax extended header + # (see https://www.kernel.org/pub/software/scm/git/docs/git-archive.html) + # For GitHub archives this should be the first header after the default one + # (512 byte) header. + renv_bootstrap_git_extract_sha1_tar <- function(bundle) { + + # open the bundle for reading + # We use gzcon for everything because (from ?gzcon) + # > Reading from a connection which does not supply a 'gzip' magic + # > header is equivalent to reading from the original connection + conn <- gzcon(file(bundle, open = "rb", raw = TRUE)) + on.exit(close(conn)) + + # The default pax header is 512 bytes long and the first pax extended header + # with the comment should be 51 bytes long + # `52 comment=` (11 chars) + 40 byte SHA1 hash + len <- 0x200 + 0x33 + res <- rawToChar(readBin(conn, "raw", n = len)[0x201:len]) + + if (grepl("^52 comment=", res)) { + sub("52 comment=", "", res) + } else { + NULL + } + } + + renv_bootstrap_install <- function(version, tarball, library) { + + # attempt to install it into project library + dir.create(library, showWarnings = FALSE, recursive = TRUE) + output <- renv_bootstrap_install_impl(library, tarball) + + # check for successful install + status <- attr(output, "status") + if (is.null(status) || identical(status, 0L)) + return(status) + + # an error occurred; report it + header <- "installation of renv failed" + lines <- paste(rep.int("=", nchar(header)), collapse = "") + text <- paste(c(header, lines, output), collapse = "\n") + stop(text) + + } + + renv_bootstrap_install_impl <- function(library, tarball) { + + # invoke using system2 so we can capture and report output + bin <- R.home("bin") + exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" + R <- file.path(bin, exe) + + args <- c( + "--vanilla", "CMD", "INSTALL", "--no-multiarch", + "-l", shQuote(path.expand(library)), + shQuote(path.expand(tarball)) + ) + + system2(R, args, stdout = TRUE, stderr = TRUE) + + } + + renv_bootstrap_platform_prefix <- function() { + + # construct version prefix + version <- paste(R.version$major, R.version$minor, sep = ".") + prefix <- paste("R", numeric_version(version)[1, 1:2], sep = "-") + + # include SVN revision for development versions of R + # (to avoid sharing platform-specific artefacts with released versions of R) + devel <- + identical(R.version[["status"]], "Under development (unstable)") || + identical(R.version[["nickname"]], "Unsuffered Consequences") + + if (devel) + prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") + + # build list of path components + components <- c(prefix, R.version$platform) + + # include prefix if provided by user + prefix <- renv_bootstrap_platform_prefix_impl() + if (!is.na(prefix) && nzchar(prefix)) + components <- c(prefix, components) + + # build prefix + paste(components, collapse = "/") + + } + + renv_bootstrap_platform_prefix_impl <- function() { + + # if an explicit prefix has been supplied, use it + prefix <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA) + if (!is.na(prefix)) + return(prefix) + + # if the user has requested an automatic prefix, generate it + auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) + if (is.na(auto) && getRversion() >= "4.4.0") + auto <- "TRUE" + + if (auto %in% c("TRUE", "True", "true", "1")) + return(renv_bootstrap_platform_prefix_auto()) + + # empty string on failure + "" + + } + + renv_bootstrap_platform_prefix_auto <- function() { + + prefix <- tryCatch(renv_bootstrap_platform_os(), error = identity) + if (inherits(prefix, "error") || prefix %in% "unknown") { + + msg <- paste( + "failed to infer current operating system", + "please file a bug report at https://github.com/rstudio/renv/issues", + sep = "; " + ) + + warning(msg) + + } + + prefix + + } + + renv_bootstrap_platform_os <- function() { + + sysinfo <- Sys.info() + sysname <- sysinfo[["sysname"]] + + # handle Windows + macOS up front + if (sysname == "Windows") + return("windows") + else if (sysname == "Darwin") + return("macos") + + # check for os-release files + for (file in c("/etc/os-release", "/usr/lib/os-release")) + if (file.exists(file)) + return(renv_bootstrap_platform_os_via_os_release(file, sysinfo)) + + # check for redhat-release files + if (file.exists("/etc/redhat-release")) + return(renv_bootstrap_platform_os_via_redhat_release()) + + "unknown" + + } + + renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) { + + # read /etc/os-release + release <- utils::read.table( + file = file, + sep = "=", + quote = c("\"", "'"), + col.names = c("Key", "Value"), + comment.char = "#", + stringsAsFactors = FALSE + ) + + vars <- as.list(release$Value) + names(vars) <- release$Key + + # get os name + os <- tolower(sysinfo[["sysname"]]) + + # read id + id <- "unknown" + for (field in c("ID", "ID_LIKE")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + id <- vars[[field]] + break + } + } + + # read version + version <- "unknown" + for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + version <- vars[[field]] + break + } + } + + # join together + paste(c(os, id, version), collapse = "-") + + } + + renv_bootstrap_platform_os_via_redhat_release <- function() { + + # read /etc/redhat-release + contents <- readLines("/etc/redhat-release", warn = FALSE) + + # infer id + id <- if (grepl("centos", contents, ignore.case = TRUE)) + "centos" + else if (grepl("redhat", contents, ignore.case = TRUE)) + "redhat" + else + "unknown" + + # try to find a version component (very hacky) + version <- "unknown" + + parts <- strsplit(contents, "[[:space:]]")[[1L]] + for (part in parts) { + + nv <- tryCatch(numeric_version(part), error = identity) + if (inherits(nv, "error")) + next + + version <- nv[1, 1] + break + + } + + paste(c("linux", id, version), collapse = "-") + + } + + renv_bootstrap_library_root_name <- function(project) { + + # use project name as-is if requested + asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE") + if (asis) + return(basename(project)) + + # otherwise, disambiguate based on project's path + id <- substring(renv_bootstrap_hash_text(project), 1L, 8L) + paste(basename(project), id, sep = "-") + + } + + renv_bootstrap_library_root <- function(project) { + + prefix <- renv_bootstrap_profile_prefix() + + path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA) + if (!is.na(path)) + return(paste(c(path, prefix), collapse = "/")) + + path <- renv_bootstrap_library_root_impl(project) + if (!is.null(path)) { + name <- renv_bootstrap_library_root_name(project) + return(paste(c(path, prefix, name), collapse = "/")) + } + + renv_bootstrap_paths_renv("library", project = project) + + } + + renv_bootstrap_library_root_impl <- function(project) { + + root <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA) + if (!is.na(root)) + return(root) + + type <- renv_bootstrap_project_type(project) + if (identical(type, "package")) { + userdir <- renv_bootstrap_user_dir() + return(file.path(userdir, "library")) + } + + } + + renv_bootstrap_validate_version <- function(version, description = NULL) { + + # resolve description file + # + # avoid passing lib.loc to `packageDescription()` below, since R will + # use the loaded version of the package by default anyhow. note that + # this function should only be called after 'renv' is loaded + # https://github.com/rstudio/renv/issues/1625 + description <- description %||% packageDescription("renv") + + # check whether requested version 'version' matches loaded version of renv + sha <- attr(version, "sha", exact = TRUE) + valid <- if (!is.null(sha)) + renv_bootstrap_validate_version_dev(sha, description) + else + renv_bootstrap_validate_version_release(version, description) + + if (valid) + return(TRUE) + + # the loaded version of renv doesn't match the requested version; + # give the user instructions on how to proceed + dev <- identical(description[["RemoteType"]], "github") + remote <- if (dev) + paste("rstudio/renv", description[["RemoteSha"]], sep = "@") + else + paste("renv", description[["Version"]], sep = "@") + + # display both loaded version + sha if available + friendly <- renv_bootstrap_version_friendly( + version = description[["Version"]], + sha = if (dev) description[["RemoteSha"]] + ) + + fmt <- heredoc(" + renv %1$s was loaded from project library, but this project is configured to use renv %2$s. + - Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile. + - Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library. + ") + catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote) + + FALSE + + } + + renv_bootstrap_validate_version_dev <- function(version, description) { + + expected <- description[["RemoteSha"]] + if (!is.character(expected)) + return(FALSE) + + pattern <- sprintf("^\\Q%s\\E", version) + grepl(pattern, expected, perl = TRUE) + + } + + renv_bootstrap_validate_version_release <- function(version, description) { + expected <- description[["Version"]] + is.character(expected) && identical(expected, version) + } + + renv_bootstrap_hash_text <- function(text) { + + hashfile <- tempfile("renv-hash-") + on.exit(unlink(hashfile), add = TRUE) + + writeLines(text, con = hashfile) + tools::md5sum(hashfile) + + } + + renv_bootstrap_load <- function(project, libpath, version) { + + # try to load renv from the project library + if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) + return(FALSE) + + # warn if the version of renv loaded does not match + renv_bootstrap_validate_version(version) + + # execute renv load hooks, if any + hooks <- getHook("renv::autoload") + for (hook in hooks) + if (is.function(hook)) + tryCatch(hook(), error = warnify) + + # load the project + renv::load(project) + + TRUE + + } + + renv_bootstrap_profile_load <- function(project) { + + # if RENV_PROFILE is already set, just use that + profile <- Sys.getenv("RENV_PROFILE", unset = NA) + if (!is.na(profile) && nzchar(profile)) + return(profile) + + # check for a profile file (nothing to do if it doesn't exist) + path <- renv_bootstrap_paths_renv("profile", profile = FALSE, project = project) + if (!file.exists(path)) + return(NULL) + + # read the profile, and set it if it exists + contents <- readLines(path, warn = FALSE) + if (length(contents) == 0L) + return(NULL) + + # set RENV_PROFILE + profile <- contents[[1L]] + if (!profile %in% c("", "default")) + Sys.setenv(RENV_PROFILE = profile) + + profile + + } + + renv_bootstrap_profile_prefix <- function() { + profile <- renv_bootstrap_profile_get() + if (!is.null(profile)) + return(file.path("profiles", profile, "renv")) + } + + renv_bootstrap_profile_get <- function() { + profile <- Sys.getenv("RENV_PROFILE", unset = "") + renv_bootstrap_profile_normalize(profile) + } + + renv_bootstrap_profile_set <- function(profile) { + profile <- renv_bootstrap_profile_normalize(profile) + if (is.null(profile)) + Sys.unsetenv("RENV_PROFILE") + else + Sys.setenv(RENV_PROFILE = profile) + } + + renv_bootstrap_profile_normalize <- function(profile) { + + if (is.null(profile) || profile %in% c("", "default")) + return(NULL) + + profile + + } + + renv_bootstrap_path_absolute <- function(path) { + + substr(path, 1L, 1L) %in% c("~", "/", "\\") || ( + substr(path, 1L, 1L) %in% c(letters, LETTERS) && + substr(path, 2L, 3L) %in% c(":/", ":\\") + ) + + } + + renv_bootstrap_paths_renv <- function(..., profile = TRUE, project = NULL) { + renv <- Sys.getenv("RENV_PATHS_RENV", unset = "renv") + root <- if (renv_bootstrap_path_absolute(renv)) NULL else project + prefix <- if (profile) renv_bootstrap_profile_prefix() + components <- c(root, renv, prefix, ...) + paste(components, collapse = "/") + } + + renv_bootstrap_project_type <- function(path) { + + descpath <- file.path(path, "DESCRIPTION") + if (!file.exists(descpath)) + return("unknown") + + desc <- tryCatch( + read.dcf(descpath, all = TRUE), + error = identity + ) + + if (inherits(desc, "error")) + return("unknown") + + type <- desc$Type + if (!is.null(type)) + return(tolower(type)) + + package <- desc$Package + if (!is.null(package)) + return("package") + + "unknown" + + } + + renv_bootstrap_user_dir <- function() { + dir <- renv_bootstrap_user_dir_impl() + path.expand(chartr("\\", "/", dir)) + } + + renv_bootstrap_user_dir_impl <- function() { + + # use local override if set + override <- getOption("renv.userdir.override") + if (!is.null(override)) + return(override) + + # use R_user_dir if available + tools <- asNamespace("tools") + if (is.function(tools$R_user_dir)) + return(tools$R_user_dir("renv", "cache")) + + # try using our own backfill for older versions of R + envvars <- c("R_USER_CACHE_DIR", "XDG_CACHE_HOME") + for (envvar in envvars) { + root <- Sys.getenv(envvar, unset = NA) + if (!is.na(root)) + return(file.path(root, "R/renv")) + } + + # use platform-specific default fallbacks + if (Sys.info()[["sysname"]] == "Windows") + file.path(Sys.getenv("LOCALAPPDATA"), "R/cache/R/renv") + else if (Sys.info()[["sysname"]] == "Darwin") + "~/Library/Caches/org.R-project.R/R/renv" + else + "~/.cache/R/renv" + + } + + renv_bootstrap_version_friendly <- function(version, shafmt = NULL, sha = NULL) { + sha <- sha %||% attr(version, "sha", exact = TRUE) + parts <- c(version, sprintf(shafmt %||% " [sha: %s]", substring(sha, 1L, 7L))) + paste(parts, collapse = "") + } + + renv_bootstrap_exec <- function(project, libpath, version) { + if (!renv_bootstrap_load(project, libpath, version)) + renv_bootstrap_run(project, libpath, version) + } + + renv_bootstrap_run <- function(project, libpath, version) { + + # perform bootstrap + bootstrap(version, libpath) + + # exit early if we're just testing bootstrap + if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) + return(TRUE) + + # try again to load + if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { + return(renv::load(project = project)) + } + + # failed to download or load renv; warn the user + msg <- c( + "Failed to find an renv installation: the project will not be loaded.", + "Use `renv::activate()` to re-initialize the project." + ) + + warning(paste(msg, collapse = "\n"), call. = FALSE) + + } + + renv_json_read <- function(file = NULL, text = NULL) { + + jlerr <- NULL + + # if jsonlite is loaded, use that instead + if ("jsonlite" %in% loadedNamespaces()) { + + json <- tryCatch(renv_json_read_jsonlite(file, text), error = identity) + if (!inherits(json, "error")) + return(json) + + jlerr <- json + + } + + # otherwise, fall back to the default JSON reader + json <- tryCatch(renv_json_read_default(file, text), error = identity) + if (!inherits(json, "error")) + return(json) + + # report an error + if (!is.null(jlerr)) + stop(jlerr) + else + stop(json) + + } + + renv_json_read_jsonlite <- function(file = NULL, text = NULL) { + text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") + jsonlite::fromJSON(txt = text, simplifyVector = FALSE) + } + + renv_json_read_patterns <- function() { + + list( + + # objects + list("{", "\t\n\tobject(\t\n\t"), + list("}", "\t\n\t)\t\n\t"), + + # arrays + list("[", "\t\n\tarray(\t\n\t"), + list("]", "\n\t\n)\n\t\n"), + + # maps + list(":", "\t\n\t=\t\n\t") + + ) + + } + + renv_json_read_envir <- function() { + + envir <- new.env(parent = emptyenv()) + + envir[["+"]] <- `+` + envir[["-"]] <- `-` + + envir[["object"]] <- function(...) { + result <- list(...) + names(result) <- as.character(names(result)) + result + } + + envir[["array"]] <- list + + envir[["true"]] <- TRUE + envir[["false"]] <- FALSE + envir[["null"]] <- NULL + + envir + + } + + renv_json_read_remap <- function(object, patterns) { + + # repair names if necessary + if (!is.null(names(object))) { + + nms <- names(object) + for (pattern in patterns) + nms <- gsub(pattern[[2L]], pattern[[1L]], nms, fixed = TRUE) + names(object) <- nms + + } + + # repair strings if necessary + if (is.character(object)) { + for (pattern in patterns) + object <- gsub(pattern[[2L]], pattern[[1L]], object, fixed = TRUE) + } + + # recurse for other objects + if (is.recursive(object)) + for (i in seq_along(object)) + object[i] <- list(renv_json_read_remap(object[[i]], patterns)) + + # return remapped object + object + + } + + renv_json_read_default <- function(file = NULL, text = NULL) { + + # read json text + text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") + + # convert into something the R parser will understand + patterns <- renv_json_read_patterns() + transformed <- text + for (pattern in patterns) + transformed <- gsub(pattern[[1L]], pattern[[2L]], transformed, fixed = TRUE) + + # parse it + rfile <- tempfile("renv-json-", fileext = ".R") + on.exit(unlink(rfile), add = TRUE) + writeLines(transformed, con = rfile) + json <- parse(rfile, keep.source = FALSE, srcfile = NULL)[[1L]] + + # evaluate in safe environment + result <- eval(json, envir = renv_json_read_envir()) + + # fix up strings if necessary + renv_json_read_remap(result, patterns) + + } + + + # load the renv profile, if any + renv_bootstrap_profile_load(project) + + # construct path to library root + root <- renv_bootstrap_library_root(project) + + # construct library prefix for platform + prefix <- renv_bootstrap_platform_prefix() + + # construct full libpath + libpath <- file.path(root, prefix) + + # run bootstrap code + renv_bootstrap_exec(project, libpath, version) + + invisible() + +}) diff --git a/renv/settings.json b/renv/settings.json new file mode 100644 index 0000000..0715690 --- /dev/null +++ b/renv/settings.json @@ -0,0 +1,19 @@ +{ + "bioconductor.version": "3.20", + "external.libraries": [], + "ignored.packages": [], + "package.dependency.fields": [ + "Imports", + "Depends", + "LinkingTo" + ], + "ppm.enabled": null, + "ppm.ignored.urls": [], + "r.version": null, + "snapshot.type": "implicit", + "use.cache": true, + "vcs.ignore.cellar": true, + "vcs.ignore.library": true, + "vcs.ignore.local": true, + "vcs.manage.ignores": true +} From 664bdf091fc142f587cf44eea48aecd8c09927cf Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 09:52:00 -0600 Subject: [PATCH 06/24] updating nextflow file and config. also editing run_pipeline to remove loading conda environment and ensuring download of nextflow --- main.nf | 214 +++++++++++++++++++++++++++++++----------------- nextflow.config | 37 +++++++-- run_pipeline.sh | 46 +++-------- 3 files changed, 184 insertions(+), 113 deletions(-) diff --git a/main.nf b/main.nf index 636cf38..ca9522f 100644 --- a/main.nf +++ b/main.nf @@ -1,87 +1,155 @@ #!/usr/bin/env nextflow /* - * main.nf — run 5 R scripts in strict order + * main.nf — run 6 R scripts in strict order * * You already version-lock packages with renv.lock. - * Each script therefore starts with `renv::load()` (or restore), + * Each script therefore starts with `renv::load()`, * so the only thing we need is the R interpreter on PATH. */ -workflow { +nextflow.enable.dsl = 2 + +/* + * 0) Decompress all input data + */ +process UNZIP { + tag 'unzip_data' + + input: + val dummy_input // Receive the trigger + + output: + val true, emit: unzip_complete // Use emit instead of into + + script: + """ + cd ${params.projectDir} + python ${params.scriptDir}/00_unzip_input_data.py ${params.projectDir} + """ +} - /* - * 0) Decompress all input data. - */ - process UNZIP { - tag 'unzip_data' - script: - """ - cd ${params.projectDir} - python ${params.scriptDir}/0_unzip_input_data.py ${params.projectDir} - """ - } +/* + * 1) Load the renv + */ +process LOAD_RENV { + tag '01_load_renv' + + input: + val dummy_input // Receive the trigger from UNZIP + + output: + val true, emit: renv_loaded + + script: + """ + cd ${params.projectDir} + Rscript --vanilla ${params.scriptDir}/01_load_renv.R + """ +} - /* - * 1) Download / tidy raw data - */ - process GET_DATA { - tag '1_get_data' - script: - """ - cd ${params.projectDir} - Rscript --vanilla ${params.scriptDir}/1_get_data.R - """ - } +/* + * 2) Download / tidy raw data + */ +process GET_DATA { + tag '02_get_data' + + input: + val dummy_input + + output: + val true, emit: data_ready + + script: + """ + cd ${params.projectDir} + Rscript --vanilla ${params.scriptDir}/02_get_data.R + """ +} - /* - * 2) Cluster single-cell data - */ - process GET_CLUSTERING { - tag '2_get_clustering' - script: - """ - cd ${params.projectDir} - Rscript --vanilla ${params.scriptDir}/2_get_clustering.R - """ - } - GET_CLUSTERING.after GET_DATA // enforce order +/* + * 3) Cluster single-cell data + */ +process GET_CLUSTERING { + tag '03_get_clustering' + + input: + val dummy_input + + output: + val true, emit: clustering_done + + script: + """ + cd ${params.projectDir} + Rscript --vanilla ${params.scriptDir}/03_get_clustering.R + """ +} - /* - * 3) Build reference matrices - */ - process PREP_REF_DATA { - tag '3_prepare_reference_data' - script: - """ - cd ${params.projectDir} - Rscript --vanilla ${params.scriptDir}/3_prepare_reference_data.R - """ - } - PREP_REF_DATA.after GET_CLUSTERING +/* + * 4) Build reference matrices + */ +process PREP_REF_DATA { + tag '04_prepare_reference_data' + + input: + val dummy_input + + output: + val true, emit: ref_data_ready + + script: + """ + cd ${params.projectDir} + Rscript --vanilla ${params.scriptDir}/04_prepare_reference_data.R + """ +} - /* - * 4) Deconvolution with InstaPrism - */ - process RUN_INSTAPRISM { - tag '4_run_instaprism' - script: - """ - cd ${params.projectDir} - Rscript --vanilla ${params.scriptDir}/4_run_instaprism.R - """ - } - RUN_INSTAPRISM.after PREP_REF_DATA +/* + * 5) Deconvolution with InstaPrism + */ +process RUN_INSTAPRISM { + tag '05_run_instaprism' + + input: + val dummy_input + + output: + val true, emit: instaprism_complete + + script: + """ + cd ${params.projectDir} + Rscript --vanilla ${params.scriptDir}/05_run_instaprism.R + """ +} - /* - * 5) Visualisation step - */ - process VISUALISE { - tag '5_visualize' - script: - """ - cd ${params.projectDir} - Rscript --vanilla ${params.scriptDir}/5_visualize_instaprism_outputs.R - """ - } - VISUALISE.after RUN_INSTAPRISM +/* + * 6) Visualisation step + */ +process VISUALISE { + tag '06_visualize_instaprism_outputs' + + input: + val dummy_input + + script: + """ + cd ${params.projectDir} + Rscript --vanilla ${params.scriptDir}/06_visualize_instaprism_outputs.R + """ + // No output needed as this is the last step } + +workflow { + // Create initial trigger channel + trigger_channel = Channel.value(true) + + // Run processes in strict order using the output of the previous as input for the next + UNZIP(trigger_channel) + LOAD_RENV(UNZIP.out.unzip_complete) + GET_DATA(LOAD_RENV.out.renv_loaded) + GET_CLUSTERING(GET_DATA.out.data_ready) + PREP_REF_DATA(GET_CLUSTERING.out.clustering_done) + RUN_INSTAPRISM(PREP_REF_DATA.out.ref_data_ready) + VISUALISE(RUN_INSTAPRISM.out.instaprism_complete) +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 41ffb54..aba1311 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,14 +1,39 @@ /* * nextflow.config — cluster defaults + paths */ + +// Define parameters params.projectDir = "${baseDir}" params.scriptDir = "${params.projectDir}/scripts" params.outDir = "${params.projectDir}/output_data" -process { - executor = 'slurm' - queue = 'compute' - cpus = 4 - memory = '16 GB' - time = '10h' +// --- +// Define profiles for different execution environments +// --- +profiles { + + // Profile for running locally on your machine (default if no profile is specified) + local { + process { + executor = 'local' // Use the local executor + cpus = 2 // Adjust based on your local machine's cores + memory = '8 GB' // Adjust based on your local machine's RAM + time = '4h' // Adjust if local runs are typically shorter + } + // If you have Docker or Singularity installed locally and want to use them: + // docker.enabled = true + // singularity.enabled = true + } + + // Profile for running on a SLURM HPC cluster + slurm { + process { + executor = 'slurm' // Use the SLURM executor + queue = 'compute' // Your specific SLURM queue + cpus = 4 // Default CPUs for processes on SLURM + memory = '16 GB' // Default memory for processes on SLURM + time = '10h' // Default time limit for processes on SLURM + } + } } + diff --git a/run_pipeline.sh b/run_pipeline.sh index a60aca5..8b37b05 100644 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -18,44 +18,22 @@ PRJ_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" cd "${PRJ_DIR}" # -------------------------------------------------- -# 1) Load Conda and activate env_hgsoc -# -------------------------------------------------- -source "$(conda info --base)/etc/profile.d/conda.sh" - -ENV_YML="${PRJ_DIR}/environments/env_hgsoc.yml" - -# create the env once; reuse afterwards -if ! conda env list | grep -q '^env_hgsoc '; then - echo "••• Creating Conda environment env_hgsoc" - conda env create -f "${ENV_YML}" +# 1) Ensure that Nextflow is installed +# -------------------------------------------------- +if ! command -v nextflow &> /dev/null +then + echo "Error: Nextflow is not installed or not found in your PATH." + echo "Please install Nextflow and ensure it's accessible in your system's PATH." + echo "You can typically install it with: curl -s https://get.nextflow.io | bash" + echo "Then move the 'nextflow' executable to a directory in your PATH (e.g., ~/bin or /usr/local/bin)." + exit 1 # Exit the script with an error code fi -conda activate env_hgsoc # puts R, Nextflow, compilers on PATH - -# -------------------------------------------------- -# 2) Install R packages that Conda cannot provide -# -------------------------------------------------- -Rscript - <<'RSCRIPT' -pkgs_cran <- c("InstaPrism") # GitHub-only -pkgs_bioc <- c() # add names if Bioc, but absent on bioconda - -if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager", repos = "https://cloud.r-project.org") -if (!requireNamespace("remotes", quietly = TRUE)) - install.packages("remotes", repos = "https://cloud.r-project.org") - -for (p in pkgs_bioc) - if (!requireNamespace(p, quietly = TRUE)) - BiocManager::install(p, ask = FALSE, update = FALSE) - -# GitHub install InstaPrism -if (!requireNamespace("InstaPrism", quietly = TRUE)) - devtools::install_github("humengying0907/InstaPrism") -RSCRIPT # -------------------------------------------------- -# 3) Run the pipeline +# 2) Run the pipeline # -------------------------------------------------- echo "••• Launching Nextflow" -nextflow run main.nf -profile slurm -resume +nextflow run main.nf -profile local -resume +# or nextflow run main.nf -profile slurm -resume, if on HPC echo "••• Pipeline finished 🎉" From eac1f9ab2ef7c890c110375f52120f59c49ac711 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 10:10:12 -0600 Subject: [PATCH 07/24] replace python unzipping file with R to simplify file types and also to message out how many .zip and .gz files and unzipped if any --- scripts/00_unzip_input_data.R | 94 ++++++++++++++++++++++++++++++++++ scripts/00_unzip_input_data.py | 62 ---------------------- 2 files changed, 94 insertions(+), 62 deletions(-) create mode 100644 scripts/00_unzip_input_data.R delete mode 100644 scripts/00_unzip_input_data.py diff --git a/scripts/00_unzip_input_data.R b/scripts/00_unzip_input_data.R new file mode 100644 index 0000000..bc6f8e7 --- /dev/null +++ b/scripts/00_unzip_input_data.R @@ -0,0 +1,94 @@ +########################################################################################## +### 00_unzip_input_data.R +### +### This script unzips any gzip and zip files that may exist in the input_data directory. +########################################################################################## + +# Load here library +library(here) + +# Ensure the R.utils package is installed for gunzip functionality +if (!requireNamespace("R.utils", quietly = TRUE)) { + install.packages("R.utils") +} +library(R.utils) + +# Set base directory for location of .zip and .gz files +# We will also be saving the unzipped files here +input_data <- file.path(here(), "input_data") + +########################################################## +# 1) Function to check for and unzip .zip and .gz files in directory +########################################################## + +unzip_compressed_files <- function(directory_path) { + + # Check if the provided directory exists + if (!dir.exists(directory_path)) { + stop("Error: The specified directory does not exist: ", directory_path) + } + + message(paste("Checking directory:", directory_path, "for compressed files...")) + + # List all files in the directory + all_files <- list.files(directory_path, full.names = TRUE) + + # Initialize counters for unzipped files + zip_unzipped_count <- 0 + gz_unzipped_count <- 0 + + # --- Handle .zip files --- + zip_files <- all_files[grep("\\.zip$", all_files, ignore.case = TRUE)] + + if (length(zip_files) > 0) { + message(paste("Found", length(zip_files), ".zip file(s). Attempting to unzip...")) + for (zip_file in zip_files) { + tryCatch({ + # Unzip the file into the same directory + unzip(zip_file, exdir = directory_path) + message(paste("Successfully unzipped:", basename(zip_file))) + zip_unzipped_count <- zip_unzipped_count + 1 + }, error = function(e) { + warning(paste("Failed to unzip", basename(zip_file), ":", e$message)) + }) + } + } else { + message("No .zip files found.") + } + + # --- Handle .gz files --- + gz_files <- all_files[grep("\\.gz$", all_files, ignore.case = TRUE)] + + if (length(gz_files) > 0) { + message(paste("Found", length(gz_files), ".gz file(s). Attempting to gunzip...")) + for (gz_file in gz_files) { + tryCatch({ + # gunzip the file. The output file name will be the original name without .gz + # e.g., "data.txt.gz" will become "data.txt" + R.utils::gunzip(gz_file, remove = TRUE, overwrite = TRUE) + message(paste("Successfully gunzipped:", basename(gz_file))) + gz_unzipped_count <- gz_unzipped_count + 1 + }, error = function(e) { + warning(paste("Failed to gunzip", basename(gz_file), ":", e$message)) + }) + } + } else { + message("No .gz files found.") + } + + message(paste("\nSummary of unzipping operations:")) + message(paste("- .zip files unzipped:", zip_unzipped_count)) + message(paste("- .gz files unzipped:", gz_unzipped_count)) + + if (zip_unzipped_count == 0 && gz_unzipped_count == 0) { + message("No compressed files were found or unzipped.") + } else { + message("Unzipping process complete.") + } +} + +########################################################## +# 2) Run function +########################################################## + +unzip_compressed_files(input_data) \ No newline at end of file diff --git a/scripts/00_unzip_input_data.py b/scripts/00_unzip_input_data.py deleted file mode 100644 index a7c87f1..0000000 --- a/scripts/00_unzip_input_data.py +++ /dev/null @@ -1,62 +0,0 @@ -''' The following scipt takes in a directory path as input, and - unzips all the gzip and zip files in that directory. - It will not overwrite existing files. - Usage: python unzip_if_needed.py ''' -import os -import sys -import gzip -import zipfile -import pandas as pd - -def ungzip_file(gzip_path, dest_path): - with gzip.open(gzip_path, 'rb') as f_in, open(dest_path, 'wb') as f_out: - f_out.write(f_in.read()) - print(f"Extracted gzip: {gzip_path} -> {dest_path}") - -def unzip_file(zip_path, dest_dir): - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - for member in zip_ref.namelist(): - target_path = os.path.join(dest_dir, member) - if not os.path.exists(target_path): - zip_ref.extract(member, dest_dir) - print(f"Extracted zip: {member} -> {target_path}") - else: - print(f"Skipped existing file: {target_path}") - -def main(path): - if not os.path.isdir(path): - print(f"Invalid path: {path}") - return - - for root, _, files in os.walk(path): - for filename in files: - full_path = os.path.join(root, filename) - - if filename.endswith('.gzip'): - dest_path = full_path[:-5] # remove .gzip - if not os.path.exists(dest_path): - ungzip_file(full_path, dest_path) - else: - print(f"Skipped existing file: {dest_path}") - - elif filename.endswith('.zip'): - unzip_file(full_path, root) - - if filename.endswith('.gz'): - dest_path = full_path[:-3] # remove .gz - if not os.path.exists(dest_path): - ungzip_file(full_path, dest_path) - else: - print(f"Skipped existing file: {dest_path}") - - # finally formatting the data for R: - W = pd.read_csv(f"{path}salmon_raw_counts_for_way_pipeline_whites.tsv", index_col=0, sep='\t') - W.to_csv(f"{path}salmon_raw_counts_for_way_pipeline_whites.tsv", sep='\t') - B = pd.read_csv(f"{path}salmon_raw_counts_for_way_pipeline.tsv", index_col=0, sep='\t') - B.to_csv(f"{path}salmon_raw_counts_for_way_pipeline.tsv", sep='\t') - -if __name__ == "__main__": - if len(sys.argv) != 2: - print("Usage: python unzip_if_needed.py ") - else: - main(sys.argv[1]) \ No newline at end of file From b51c4d08ef112a2fdb90fcaa603577e50203a24d Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 10:15:35 -0600 Subject: [PATCH 08/24] updating git ignore with nextflow and R hidden files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 3e954a1..4ac9c35 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ *.user *.userosscache *.sln.docstates +*.nextflow/ +*scripts/.Rhistory + # User-specific files (MonoDevelop/Xamarin Studio) *.userprefs From 11c61cac19f6f57c935fe97e665d6fee0491144c Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 10:38:39 -0600 Subject: [PATCH 09/24] updating readme, removing conda environment, updating main.nf to include new R unzipping script, adding .Rprofile and renv/.gitignore --- .Rprofile | 1 + README.md | 27 +++++++++++---------- environments/env_hgsoc.yml | 49 -------------------------------------- main.nf | 2 +- renv/.gitignore | 7 ++++++ 5 files changed, 24 insertions(+), 62 deletions(-) create mode 100644 .Rprofile delete mode 100644 environments/env_hgsoc.yml create mode 100644 renv/.gitignore diff --git a/.Rprofile b/.Rprofile new file mode 100644 index 0000000..81b960f --- /dev/null +++ b/.Rprofile @@ -0,0 +1 @@ +source("renv/activate.R") diff --git a/README.md b/README.md index 1bed35f..e4370dd 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ 1) Introduction 2) Scripts 3) Input and output data preparation and organization +4) Running the pipeline # 1) Introduction: The purpose of the code in this repository is to use the InstaPrism R package (https://github.com/humengying0907/InstaPrism/tree/master) to run Bayes Prism deconvolution on the following bulk RNA seq and microarray datasets of high grade serous ovarian carcinoma (HGSOC) samples: - “SchildkrautB” – bulk RNA sequencing of HGSOC from Black patients. @@ -13,23 +14,23 @@ The purpose of the code in this repository is to use the InstaPrism R package (h Bayes Prism requires a single cell reference dataset to perform deconvolution. However, it has been previously shown that certain cell types, notably adipocytes, are present in bulk tumor samples but largely absent from single cell RNA sequencing results (https://www.biorxiv.org/content/10.1101/2024.04.25.590992v1). However, adipocytes can be captured using single nucleus RNA sequencing. Here, we incorporate single nucleus RNA sequencing data from adipocytes, in addition to single cell RNA sequencing data of HGSOC, in deconvolution of bulk HGSOC RNA sequencing and microarray data. # 2) Scripts: -Prior to running these scripts, please download the required data as outlined below in “Input and output data preparation and organization.” Please also ensure that the renv folder has been downloaded and is in the same directory as the scripts. These scripts are intended to be run in the following order: -### (Optional) unzip_input_data.py -To uncompress all files in the input data requiered, if not done manually. Exmaple: python unzip_input_data.py path/to/input_data/ -## 1_get_data.R +Prior to running this pipeline, please download the required data as outlined below in “Input and output data preparation and organization.” +## 00_unzip_input_data.R +This script checks for any zipped .gz or .zip files present in input_data, and will unzip anything it finds. +## 01_load_renv.R +This script loads the environment using the renv lockfile. +## 02_get_data.R This script reads the bulk RNA sequencing and microarray datasets and filters them to only include genes present in one common gene mapping list. It transforms the microarray data using 2^(...) to match the scale of the bulk RNA sequencing data values – this is used for InstaPrism deconvolution. It also transforms the bulk RNA sequencing data using log10(...+1) to match the scale of the microarray data – this is used for clustering. All of these matrices are saved in a uniform format containing on sample ID (rows) and genes (columns); file names are appended with either “asImported” or “transformed.” It also saves any metadata information (ex. prior clustering) about the samples in a separate file for reference. -## 2_get_clustering.R +## 03_get_clustering.R This script performs k-means clustering (k=2,3,4), NMF clustering (k=2,3,4), and consensusOV subtyping (https://github.com/bhklab/consensusOV) for each bulk dataset. For k-means clustering, it uses log10(...+1) transformed data (for RNAseq), and raw log2 data (for microarray). For NMF clustering and consensusOV subtyping, it uses raw counts (for RNAseq), and 2^(...) scaled "pseudocounts" data (for microarray). It saves the results in one csv file per dataset, where each row corresponds to one sample. It also saves a csv file containing the results for all datasets. -## 3_prepare_reference_data.R +## 04_prepare_reference_data.R This script creates a single-cell/single-nucleus reference matrix for use as input into InstaPrism. It requires single cell RNA sequencing data of HGSOC (from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE217517) and cell type labels for those cells (from https://github.com/greenelab/deconvolution_pilot/tree/main/data/cell_labels). It also requires single nucleus RNA sequencing data of adipocytes (from http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE176171). Please refer to the “Input and output data preparation and organization” subsection of README for specific files necessary. It reads in the HGSOC single cell and adipocyte single nucleus RNA sequencing data and performs some pre-processing on the adipocyte data (removing duplicate samples, removing non-adipocyte samples, removing samples with many mitochondrial gene reads, and using Seurat to remove low-quality nuclei, empty droplets, and nuclei doublets/multiplets). It then combines the single cell and single nucleus data to generate an expression matrix where each row corresponds to a gene (GeneCards symbols) and each column corresponds to a sample (each assigned a unique numerical ID). It also generates a cell type file which serves as a key, and associates each sample ID to its cell type. -## 4_run_instaprism.R -This script runs InstaPrism, an R package that performs deconvolution with a similar but faster method to BayesPrism. First, it loads in the reference single cell plus single nucleus RNA sequencing reference dataset created in the previous script. Since the reference dataset is so large, it randomly selects 500 of each cell type to use. It then generates and saves two reference objects for input into InstaPrism, one with adipocytes and one without adipocytes. It runs InstaPrism twice on each of the six bulk datasets, both with and without the adipocytes in the reference data. Of note, Instaprism requires non-log-transformed bulk data, so it is performed on the original, non-transformed data for the bulk RNA sequencing datasets and on the 2^(…) transformed data for the microarray datasets. -## 5_visualize_instaprism_outputs.R +## 05_run_instaprism.R +This script runs InstaPrism, an R package that performs deconvolution with a similar but faster method to BayesPrism. First, it loads in the reference single cell plus single nucleus RNA sequencing reference dataset created in the previous script. It also removes a set of previously identified genes to mitigate technical factors unique to single nucleus RNA sequencing. Since the reference dataset is so large, it also randomly selects 500 of each cell type to use. It then generates and saves two reference objects for input into InstaPrism, one with adipocytes and one without adipocytes. It runs InstaPrism twice on each of the six bulk datasets, both with and without the adipocytes in the reference data. Of note, Instaprism requires non-log-transformed bulk data, so it is performed on the original, non-transformed data for the bulk RNA sequencing datasets and on the 2^(…) transformed data for the microarray datasets. +## 06_visualize_instaprism_outputs.R This script creates several figures to visualize the deconvolution results generated in the previous script, and compare the results when run with and without adipocyte single nucleus RNA sequencing data in the reference data. It creates 100% stacked bar charts to visualize the total cell proportions per bulk dataset, 100% stacked bar charts showing cell proportions per sample in each dataset, and bar charts showing the absolute change of cell type proportions in total per dataset. # 3) Input and output data preparation and organization: -Prior to running these scripts, please ensure that the below raw data files have been downloaded and are present in a folder entitled “input_data” inside the same project and directory as the scripts. (The results will be created and stored in another folder inside the same directory entitled “output_data”.) Sample directory contents: - -![Screenshot 2025-02-25 at 2 28 25 PM](https://github.com/user-attachments/assets/b44cfc2a-8242-4f5e-ba2b-f75ff6712de9) +Prior to running these scripts, please ensure that the below raw data files (.gz and .zip zipped files okay) have been downloaded and are present in a folder entitled “input_data” inside the same project and directory as the scripts. (The results will be created and stored in another folder inside the same directory entitled “output_data”.) ## SchildkrautB, SchildkrautW, and reference gene list: ### From https://github.com/greenelab/hgsc_characterization: - /reference_data/ensembl_hgnc_entrez.tsv @@ -129,3 +130,5 @@ Prior to running these scripts, please ensure that the below raw data files have - GSM5820686_Hs_SAT_11-1.dge.tsv.gz #### Metadata - GSE176171_cell_metadata.tsv.gz +# 4) Running the pipeline: +The pipeline can be run by executing run_pipeline.sh, which will in turn run main.nf, a Nextflow pipeline to execute the scripts in order. run_pipeline.sh has configuration profile options to run either locally or on high performance computing (HPC) using Slurm. Be sure to open the file before running and ensure that the appropriate profile is active (both available in comments of script). \ No newline at end of file diff --git a/environments/env_hgsoc.yml b/environments/env_hgsoc.yml deleted file mode 100644 index f9b9f8f..0000000 --- a/environments/env_hgsoc.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: env_hgsoc -channels: - - conda-forge - - bioconda -dependencies: - # ---- core language runtimes ---- - - r-base=4.4.1 - - r-irkernel - - python=3.10.16 - - nextflow # workflow engine - - openjdk # Nextflow runtime - # ---- build tool-chain for renv ---- - - gcc # C compiler - - gxx # C++ compiler - - gfortran # Fortran (some BioC pkgs) - - make - - cmake - - pkg-config - - icu - # ---- CRAN/BioC packages that have Conda builds ---- - - r-aims - - bioconductor-annotationdbi - - r-bh - - bioconductor-biobase - - bioconductor-biocfilecache - - bioconductor-biocgenerics - - r-biocmanager - - bioconductor-biocparallel - - bioconductor-biocsingular - - bioconductor-biocversion - - bioconductor-biostrings - - r-dbi - - bioconductor-delayedarray - - bioconductor-delayedmatrixstats - - r-deriv - - r-fnn - - r-formula - - bioconductor-gseabase - - bioconductor-gsva - - bioconductor-genomeinfodb - - bioconductor-genomeinfodbdata - - bioconductor-genomicranges - - bioconductor-hdf5array - - bioconductor-iranges - # ---- Python packages ---- - - pandas - - numpy - - jupyterlab - - matplotlib diff --git a/main.nf b/main.nf index ca9522f..05390b2 100644 --- a/main.nf +++ b/main.nf @@ -24,7 +24,7 @@ process UNZIP { script: """ cd ${params.projectDir} - python ${params.scriptDir}/00_unzip_input_data.py ${params.projectDir} + Rscript --vanilla ${params.scriptDir}/00_unzip_input_data.R ${params.projectDir} """ } diff --git a/renv/.gitignore b/renv/.gitignore new file mode 100644 index 0000000..0ec0cbb --- /dev/null +++ b/renv/.gitignore @@ -0,0 +1,7 @@ +library/ +local/ +cellar/ +lock/ +python/ +sandbox/ +staging/ From d78e4de91f8320f3ea35a63a079464ec42fdf441 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 10:41:16 -0600 Subject: [PATCH 10/24] updating .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4ac9c35..3efe630 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,8 @@ *.sln.docstates *.nextflow/ *scripts/.Rhistory - +*.Rhistory +*.nextflow.log.* # User-specific files (MonoDevelop/Xamarin Studio) *.userprefs From 768131a04dcba0f2710f023db01bbb31d67b712c Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 11:19:55 -0600 Subject: [PATCH 11/24] replacing R unzipping file with python one --- scripts/00_unzip_input_data.R | 94 ---------------------------------- scripts/00_unzip_input_data.py | 62 ++++++++++++++++++++++ 2 files changed, 62 insertions(+), 94 deletions(-) delete mode 100644 scripts/00_unzip_input_data.R create mode 100644 scripts/00_unzip_input_data.py diff --git a/scripts/00_unzip_input_data.R b/scripts/00_unzip_input_data.R deleted file mode 100644 index bc6f8e7..0000000 --- a/scripts/00_unzip_input_data.R +++ /dev/null @@ -1,94 +0,0 @@ -########################################################################################## -### 00_unzip_input_data.R -### -### This script unzips any gzip and zip files that may exist in the input_data directory. -########################################################################################## - -# Load here library -library(here) - -# Ensure the R.utils package is installed for gunzip functionality -if (!requireNamespace("R.utils", quietly = TRUE)) { - install.packages("R.utils") -} -library(R.utils) - -# Set base directory for location of .zip and .gz files -# We will also be saving the unzipped files here -input_data <- file.path(here(), "input_data") - -########################################################## -# 1) Function to check for and unzip .zip and .gz files in directory -########################################################## - -unzip_compressed_files <- function(directory_path) { - - # Check if the provided directory exists - if (!dir.exists(directory_path)) { - stop("Error: The specified directory does not exist: ", directory_path) - } - - message(paste("Checking directory:", directory_path, "for compressed files...")) - - # List all files in the directory - all_files <- list.files(directory_path, full.names = TRUE) - - # Initialize counters for unzipped files - zip_unzipped_count <- 0 - gz_unzipped_count <- 0 - - # --- Handle .zip files --- - zip_files <- all_files[grep("\\.zip$", all_files, ignore.case = TRUE)] - - if (length(zip_files) > 0) { - message(paste("Found", length(zip_files), ".zip file(s). Attempting to unzip...")) - for (zip_file in zip_files) { - tryCatch({ - # Unzip the file into the same directory - unzip(zip_file, exdir = directory_path) - message(paste("Successfully unzipped:", basename(zip_file))) - zip_unzipped_count <- zip_unzipped_count + 1 - }, error = function(e) { - warning(paste("Failed to unzip", basename(zip_file), ":", e$message)) - }) - } - } else { - message("No .zip files found.") - } - - # --- Handle .gz files --- - gz_files <- all_files[grep("\\.gz$", all_files, ignore.case = TRUE)] - - if (length(gz_files) > 0) { - message(paste("Found", length(gz_files), ".gz file(s). Attempting to gunzip...")) - for (gz_file in gz_files) { - tryCatch({ - # gunzip the file. The output file name will be the original name without .gz - # e.g., "data.txt.gz" will become "data.txt" - R.utils::gunzip(gz_file, remove = TRUE, overwrite = TRUE) - message(paste("Successfully gunzipped:", basename(gz_file))) - gz_unzipped_count <- gz_unzipped_count + 1 - }, error = function(e) { - warning(paste("Failed to gunzip", basename(gz_file), ":", e$message)) - }) - } - } else { - message("No .gz files found.") - } - - message(paste("\nSummary of unzipping operations:")) - message(paste("- .zip files unzipped:", zip_unzipped_count)) - message(paste("- .gz files unzipped:", gz_unzipped_count)) - - if (zip_unzipped_count == 0 && gz_unzipped_count == 0) { - message("No compressed files were found or unzipped.") - } else { - message("Unzipping process complete.") - } -} - -########################################################## -# 2) Run function -########################################################## - -unzip_compressed_files(input_data) \ No newline at end of file diff --git a/scripts/00_unzip_input_data.py b/scripts/00_unzip_input_data.py new file mode 100644 index 0000000..a7c87f1 --- /dev/null +++ b/scripts/00_unzip_input_data.py @@ -0,0 +1,62 @@ +''' The following scipt takes in a directory path as input, and + unzips all the gzip and zip files in that directory. + It will not overwrite existing files. + Usage: python unzip_if_needed.py ''' +import os +import sys +import gzip +import zipfile +import pandas as pd + +def ungzip_file(gzip_path, dest_path): + with gzip.open(gzip_path, 'rb') as f_in, open(dest_path, 'wb') as f_out: + f_out.write(f_in.read()) + print(f"Extracted gzip: {gzip_path} -> {dest_path}") + +def unzip_file(zip_path, dest_dir): + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + for member in zip_ref.namelist(): + target_path = os.path.join(dest_dir, member) + if not os.path.exists(target_path): + zip_ref.extract(member, dest_dir) + print(f"Extracted zip: {member} -> {target_path}") + else: + print(f"Skipped existing file: {target_path}") + +def main(path): + if not os.path.isdir(path): + print(f"Invalid path: {path}") + return + + for root, _, files in os.walk(path): + for filename in files: + full_path = os.path.join(root, filename) + + if filename.endswith('.gzip'): + dest_path = full_path[:-5] # remove .gzip + if not os.path.exists(dest_path): + ungzip_file(full_path, dest_path) + else: + print(f"Skipped existing file: {dest_path}") + + elif filename.endswith('.zip'): + unzip_file(full_path, root) + + if filename.endswith('.gz'): + dest_path = full_path[:-3] # remove .gz + if not os.path.exists(dest_path): + ungzip_file(full_path, dest_path) + else: + print(f"Skipped existing file: {dest_path}") + + # finally formatting the data for R: + W = pd.read_csv(f"{path}salmon_raw_counts_for_way_pipeline_whites.tsv", index_col=0, sep='\t') + W.to_csv(f"{path}salmon_raw_counts_for_way_pipeline_whites.tsv", sep='\t') + B = pd.read_csv(f"{path}salmon_raw_counts_for_way_pipeline.tsv", index_col=0, sep='\t') + B.to_csv(f"{path}salmon_raw_counts_for_way_pipeline.tsv", sep='\t') + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python unzip_if_needed.py ") + else: + main(sys.argv[1]) \ No newline at end of file From 8d8ca52de9375edbc6ec173454eecfb363f92f15 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 11:21:06 -0600 Subject: [PATCH 12/24] adding back in conda environment with python packages only --- env_hgsoc.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 env_hgsoc.yml diff --git a/env_hgsoc.yml b/env_hgsoc.yml new file mode 100644 index 0000000..8ebef87 --- /dev/null +++ b/env_hgsoc.yml @@ -0,0 +1,16 @@ +name: env_hgsoc +channels: + - conda-forge + - bioconda +dependencies: + # ---- core language runtimes ---- + - r-base=4.4.1 + - r-irkernel + - python=3.10.16 + - nextflow # workflow engine + - openjdk # Nextflow runtime + # ---- Python packages ---- + - pandas + - numpy + - jupyterlab + - matplotlib From 42031d1ab4aea9d61d9140565fd2c5bbc81a3e43 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 11:30:33 -0600 Subject: [PATCH 13/24] making it so that the pipeline shell script activates conda and removing the optional unzipping script from the nextflow --- main.nf | 24 ++---------------------- run_pipeline.sh | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/main.nf b/main.nf index 05390b2..8306fa9 100644 --- a/main.nf +++ b/main.nf @@ -9,25 +9,6 @@ nextflow.enable.dsl = 2 -/* - * 0) Decompress all input data - */ -process UNZIP { - tag 'unzip_data' - - input: - val dummy_input // Receive the trigger - - output: - val true, emit: unzip_complete // Use emit instead of into - - script: - """ - cd ${params.projectDir} - Rscript --vanilla ${params.scriptDir}/00_unzip_input_data.R ${params.projectDir} - """ -} - /* * 1) Load the renv */ @@ -35,7 +16,7 @@ process LOAD_RENV { tag '01_load_renv' input: - val dummy_input // Receive the trigger from UNZIP + val dummy_input // Receive the trigger output: val true, emit: renv_loaded @@ -145,8 +126,7 @@ workflow { trigger_channel = Channel.value(true) // Run processes in strict order using the output of the previous as input for the next - UNZIP(trigger_channel) - LOAD_RENV(UNZIP.out.unzip_complete) + LOAD_RENV(trigger_channel) GET_DATA(LOAD_RENV.out.renv_loaded) GET_CLUSTERING(GET_DATA.out.data_ready) PREP_REF_DATA(GET_CLUSTERING.out.clustering_done) diff --git a/run_pipeline.sh b/run_pipeline.sh index 8b37b05..ca39ba1 100644 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -9,7 +9,7 @@ #SBATCH --ntasks-per-node=64 #SBATCH --nodes=1 -set -euo pipefail +set -eo pipefail # -------------------------------------------------- # 0) Resolve project root (directory of this script) @@ -30,10 +30,24 @@ then fi # -------------------------------------------------- -# 2) Run the pipeline +# 2) Load Conda and activate env_hgsoc +# -------------------------------------------------- +source "$(conda info --base)/etc/profile.d/conda.sh" + +ENV_YML="${PRJ_DIR}/env_hgsoc.yml" + +# create the env once; reuse afterwards +if ! conda env list | grep -q '^env_hgsoc '; then + echo "••• Creating Conda environment env_hgsoc" + conda env create -f "${ENV_YML}" +fi +conda activate env_hgsoc + +# -------------------------------------------------- +# 3) Run the pipeline # -------------------------------------------------- echo "••• Launching Nextflow" nextflow run main.nf -profile local -resume # or nextflow run main.nf -profile slurm -resume, if on HPC -echo "••• Pipeline finished 🎉" +echo "••• Pipeline finished 🎉" \ No newline at end of file From 8055b58819ebbabbcdc2dae8174d406f27ba944f Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 11:33:04 -0600 Subject: [PATCH 14/24] updating readme with info about optional unzipping script --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e4370dd..c4f3b32 100644 --- a/README.md +++ b/README.md @@ -15,8 +15,8 @@ The purpose of the code in this repository is to use the InstaPrism R package (h Bayes Prism requires a single cell reference dataset to perform deconvolution. However, it has been previously shown that certain cell types, notably adipocytes, are present in bulk tumor samples but largely absent from single cell RNA sequencing results (https://www.biorxiv.org/content/10.1101/2024.04.25.590992v1). However, adipocytes can be captured using single nucleus RNA sequencing. Here, we incorporate single nucleus RNA sequencing data from adipocytes, in addition to single cell RNA sequencing data of HGSOC, in deconvolution of bulk HGSOC RNA sequencing and microarray data. # 2) Scripts: Prior to running this pipeline, please download the required data as outlined below in “Input and output data preparation and organization.” -## 00_unzip_input_data.R -This script checks for any zipped .gz or .zip files present in input_data, and will unzip anything it finds. +## (OPTIONAL) 00_unzip_input_data.R +This script checks for any zipped .gz or .zip files present in input_data, and will unzip and format anything it finds. It will not run automatically, and is commented out in the shell script. Uncomment to use if needed. ## 01_load_renv.R This script loads the environment using the renv lockfile. ## 02_get_data.R @@ -131,4 +131,4 @@ Prior to running these scripts, please ensure that the below raw data files (.gz #### Metadata - GSE176171_cell_metadata.tsv.gz # 4) Running the pipeline: -The pipeline can be run by executing run_pipeline.sh, which will in turn run main.nf, a Nextflow pipeline to execute the scripts in order. run_pipeline.sh has configuration profile options to run either locally or on high performance computing (HPC) using Slurm. Be sure to open the file before running and ensure that the appropriate profile is active (both available in comments of script). \ No newline at end of file +The pipeline can be run by executing run_pipeline.sh, which will in turn run main.nf, a Nextflow pipeline to execute the scripts in order (except for the optional unzipping script). run_pipeline.sh has configuration profile options to run either locally or on high performance computing (HPC) using Slurm. Be sure to open the file before running and ensure that the appropriate profile is active (both available in comments of script). Please also uncomment running the unzipping script if needed. \ No newline at end of file From b9e3eaf4549181a584c9ff44cd97bb5b7d8ae704 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 11:36:30 -0600 Subject: [PATCH 15/24] adding optional commented-out line to run the unzipping script --- run_pipeline.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/run_pipeline.sh b/run_pipeline.sh index ca39ba1..b42bd83 100644 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -43,6 +43,11 @@ if ! conda env list | grep -q '^env_hgsoc '; then fi conda activate env_hgsoc +# -------------------------------------------------- +# OPTIONAL: unzip .gz and .zip files in input_data +# -------------------------------------------------- +# python scripts/00_unzip_input_data.py + # -------------------------------------------------- # 3) Run the pipeline # -------------------------------------------------- From 25e2499fbfc9c4d511a1541c18a4ede94e318ccd Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 12:39:13 -0600 Subject: [PATCH 16/24] minimally modifying run_pipeline to be suitable for HPC --- run_pipeline.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/run_pipeline.sh b/run_pipeline.sh index b42bd83..03e7041 100644 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -52,7 +52,8 @@ conda activate env_hgsoc # 3) Run the pipeline # -------------------------------------------------- echo "••• Launching Nextflow" -nextflow run main.nf -profile local -resume -# or nextflow run main.nf -profile slurm -resume, if on HPC +nextflow run main.nf -profile slurm -resume +# nextflow run main.nf -profile slurm -resume, if on HPC +# nextflow run main.nf -profile local -resume, if running locally echo "••• Pipeline finished 🎉" \ No newline at end of file From 6462b01feb5184d5e93a883ff026237189d02206 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 13:00:04 -0600 Subject: [PATCH 17/24] specifying CRAN mirror --- scripts/01_load_renv.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/01_load_renv.R b/scripts/01_load_renv.R index 2087e2b..8a4a5af 100644 --- a/scripts/01_load_renv.R +++ b/scripts/01_load_renv.R @@ -9,4 +9,7 @@ if (!requireNamespace("renv", quietly = TRUE)) { install.packages("renv", version = "1.1.0") # Install renv if not already installed } + +options(repos = c(CRAN = "https://cloud.r-project.org")) + renv::restore() \ No newline at end of file From 9f0253f94c305604788397e73d25dc30326a2bae Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 13:08:20 -0600 Subject: [PATCH 18/24] adding amc partition name to nextflow.config --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index aba1311..2757c54 100644 --- a/nextflow.config +++ b/nextflow.config @@ -29,7 +29,7 @@ profiles { slurm { process { executor = 'slurm' // Use the SLURM executor - queue = 'compute' // Your specific SLURM queue + queue = 'amc' // Your specific SLURM queue cpus = 4 // Default CPUs for processes on SLURM memory = '16 GB' // Default memory for processes on SLURM time = '10h' // Default time limit for processes on SLURM From f9af7be3ede84544d68377e846f822cc639b9710 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 13:10:46 -0600 Subject: [PATCH 19/24] adding more info needed to nextflow config --- nextflow.config | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/nextflow.config b/nextflow.config index 2757c54..38175bc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,12 +28,21 @@ profiles { // Profile for running on a SLURM HPC cluster slurm { process { - executor = 'slurm' // Use the SLURM executor - queue = 'amc' // Your specific SLURM queue - cpus = 4 // Default CPUs for processes on SLURM - memory = '16 GB' // Default memory for processes on SLURM - time = '10h' // Default time limit for processes on SLURM + executor = 'slurm' + queue = 'amilan' + clusterOptions = '--account=amc-general --qos=normal' + + // Default resources + cpus = 4 + memory = '16 GB' + time = '10h' } + + executor { + queueSize = 50 // Max concurrent jobs + submitRateLimit = '10 sec' // Rate limiting + } + } } } From 9239777fa85a8dd79d2c129301c904c60138e5f0 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 13:11:34 -0600 Subject: [PATCH 20/24] fixing extra bracket --- nextflow.config | 1 - 1 file changed, 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 38175bc..01af6cb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,6 +43,5 @@ profiles { submitRateLimit = '10 sec' // Rate limiting } } - } } From d56aa276448ea9c55ca7fd37da4a2173c9ac0198 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 18 Jun 2025 13:14:51 -0600 Subject: [PATCH 21/24] fixing order of CRAN mirror specification --- scripts/01_load_renv.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/01_load_renv.R b/scripts/01_load_renv.R index 8a4a5af..192e746 100644 --- a/scripts/01_load_renv.R +++ b/scripts/01_load_renv.R @@ -4,12 +4,12 @@ ### This script loads the renv from the lockfile. ########################################################################################## +# Set CRAN mirror option +options(repos = c(CRAN = "https://cloud.r-project.org")) # Load the renv library from the lockfile if (!requireNamespace("renv", quietly = TRUE)) { install.packages("renv", version = "1.1.0") # Install renv if not already installed } -options(repos = c(CRAN = "https://cloud.r-project.org")) - renv::restore() \ No newline at end of file From 4b2e56e09a5f3581568e9c694e20ae0db3c34b45 Mon Sep 17 00:00:00 2001 From: Grace Akatsu Date: Wed, 25 Jun 2025 08:24:36 -0600 Subject: [PATCH 22/24] editing .sh file to add some file paths specific to alpine, per convo with kevin --- run_pipeline.sh | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/run_pipeline.sh b/run_pipeline.sh index 03e7041..31c3627 100644 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -17,8 +17,29 @@ set -eo pipefail PRJ_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" cd "${PRJ_DIR}" +# export paths specific to Alpine +export PATH=/usr/include:$PATH +export CPATH=/usr/include/:$CPATH +export C_INCLUDE_PATH=/usr/include/:$C_INCLUDE_PATH + +# -------------------------------------------------- +# 1) Load miniforge, activate Conda environment env_hgsoc +# -------------------------------------------------- +module load miniforge + +# create the env once; reuse afterwards + if ! conda env list | grep -q '^env_hgsoc '; then + echo "••• Creating Conda environment env_hgsoc" + conda env create -f "${ENV_YML}" + fi + +echo "••• Activating Conda environment env_hgsoc" +conda activate env_hgsoc + +ENV_YML="${PRJ_DIR}/env_hgsoc.yml" + # -------------------------------------------------- -# 1) Ensure that Nextflow is installed +# 2) Ensure that Nextflow is installed # -------------------------------------------------- if ! command -v nextflow &> /dev/null then @@ -29,20 +50,6 @@ then exit 1 # Exit the script with an error code fi -# -------------------------------------------------- -# 2) Load Conda and activate env_hgsoc -# -------------------------------------------------- -source "$(conda info --base)/etc/profile.d/conda.sh" - -ENV_YML="${PRJ_DIR}/env_hgsoc.yml" - -# create the env once; reuse afterwards -if ! conda env list | grep -q '^env_hgsoc '; then - echo "••• Creating Conda environment env_hgsoc" - conda env create -f "${ENV_YML}" -fi -conda activate env_hgsoc - # -------------------------------------------------- # OPTIONAL: unzip .gz and .zip files in input_data # -------------------------------------------------- From cb5781b6e731f52552491cf173b512df18729dee Mon Sep 17 00:00:00 2001 From: Grace Akatsu Date: Wed, 25 Jun 2025 09:21:48 -0600 Subject: [PATCH 23/24] fixing the paths so that it is executable locally and on HPC --- run_pipeline.sh | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/run_pipeline.sh b/run_pipeline.sh index 31c3627..1620181 100644 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -14,19 +14,41 @@ set -eo pipefail # -------------------------------------------------- # 0) Resolve project root (directory of this script) # -------------------------------------------------- -PRJ_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +# Check if SLURM_SUBMIT_DIR is set (indicates Slurm environment) +if [ -n "${SLURM_SUBMIT_DIR}" ]; then + # Running on HPC via Slurm + PRJ_DIR="${SLURM_SUBMIT_DIR}" + RUN_MODE="HPC" + echo "Running on HPC (Slurm). Project directory: ${PRJ_DIR}" +else + # Running locally + # Get the directory where the script itself is located when run locally + PRJ_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + RUN_MODE="LOCAL" + echo "Running locally. Project directory: ${PRJ_DIR}" +fi + +# Change to the project directory cd "${PRJ_DIR}" +echo "Current working directory: $(pwd)" +# -------------------------------------------------- +# 0.5) Specifying paths specific to Alpine +# -------------------------------------------------- # export paths specific to Alpine -export PATH=/usr/include:$PATH -export CPATH=/usr/include/:$CPATH -export C_INCLUDE_PATH=/usr/include/:$C_INCLUDE_PATH +if [ "${RUN_MODE}" == "HPC" ]; then + export PATH=/usr/include:$PATH + export CPATH=/usr/include/:$CPATH + export C_INCLUDE_PATH=/usr/include/:$C_INCLUDE_PATH +fi # -------------------------------------------------- # 1) Load miniforge, activate Conda environment env_hgsoc # -------------------------------------------------- module load miniforge +ENV_YML="${PRJ_DIR}/env_hgsoc.yml" + # create the env once; reuse afterwards if ! conda env list | grep -q '^env_hgsoc '; then echo "••• Creating Conda environment env_hgsoc" @@ -36,8 +58,6 @@ module load miniforge echo "••• Activating Conda environment env_hgsoc" conda activate env_hgsoc -ENV_YML="${PRJ_DIR}/env_hgsoc.yml" - # -------------------------------------------------- # 2) Ensure that Nextflow is installed # -------------------------------------------------- @@ -59,8 +79,15 @@ fi # 3) Run the pipeline # -------------------------------------------------- echo "••• Launching Nextflow" -nextflow run main.nf -profile slurm -resume -# nextflow run main.nf -profile slurm -resume, if on HPC -# nextflow run main.nf -profile local -resume, if running locally + +# Define Nextflow's work directory based on run mode +NEXTFLOW_WORK_DIR="${PRJ_DIR}/nextflow_work" + + +if [ "${RUN_MODE}" == "HPC" ]; then + nextflow run main.nf -profile slurm -resume -w "${NEXTFLOW_WORK_DIR}" +else + nextflow run main.nf -profile local -resume -w "${NEXTFLOW_WORK_DIR}" +fi echo "••• Pipeline finished 🎉" \ No newline at end of file From cb26931e77009c27231dd86f2733784c13fa12a1 Mon Sep 17 00:00:00 2001 From: graceakatsu <140424231+graceakatsu@users.noreply.github.com> Date: Wed, 25 Jun 2025 09:24:16 -0600 Subject: [PATCH 24/24] Update README.md updating readme slightly to reflect new format of run_pipeline.sh --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c4f3b32..fe678bb 100644 --- a/README.md +++ b/README.md @@ -131,4 +131,4 @@ Prior to running these scripts, please ensure that the below raw data files (.gz #### Metadata - GSE176171_cell_metadata.tsv.gz # 4) Running the pipeline: -The pipeline can be run by executing run_pipeline.sh, which will in turn run main.nf, a Nextflow pipeline to execute the scripts in order (except for the optional unzipping script). run_pipeline.sh has configuration profile options to run either locally or on high performance computing (HPC) using Slurm. Be sure to open the file before running and ensure that the appropriate profile is active (both available in comments of script). Please also uncomment running the unzipping script if needed. \ No newline at end of file +The pipeline can be run by executing run_pipeline.sh, which will in turn run main.nf, a Nextflow pipeline to execute the scripts in order (except for the optional unzipping script). run_pipeline.sh has configuration profile options to run either locally or on high performance computing (HPC) using Slurm. Be sure to open the file before running and uncomment running the unzipping script if needed.