diff --git a/DESCRIPTION b/DESCRIPTION index 0e38cd4..3751f7f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,3 +10,4 @@ Suggests: testthat, BSgenome.Scerevisiae.UCSC.sacCer2, RSQLite Imports: dplyr, plyr, BSgenome, RMySQL, GenomicRanges, GenomeInfoDb +RoxygenNote: 6.0.1.9000 diff --git a/NAMESPACE b/NAMESPACE index fadff7a..9986ab2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,4 @@ -# Generated by roxygen2 (4.1.1): do not edit by hand +# Generated by roxygen2: do not edit by hand export(getMRCs) export(getMultihitLengths) diff --git a/R/intSiteRetriever.R b/R/intSiteRetriever.R index 8f0d3c5..96fe23f 100644 --- a/R/intSiteRetriever.R +++ b/R/intSiteRetriever.R @@ -17,8 +17,8 @@ .get_unique_sites <- function(sample_ref, conn) { sample_ref_in_db <- .get_sample_ref_in_db(sample_ref, conn) - sites <- tbl(conn, "sites") - inner_join(sites, sample_ref_in_db) + sites <- dplyr::tbl(conn, "sites") + dplyr::inner_join(sites, sample_ref_in_db) } #' for a given samples get sites positions. @@ -29,16 +29,16 @@ getUniqueSites <- function(sample_ref, conn) { stopifnot(.check_has_sample_ref_cols(sample_ref)) sites <- .get_unique_sites(sample_ref, conn) - collect( select(sites, - siteID, chr, strand, position, sampleName, refGenome), - n = Inf - ) + dplyr::collect( + dplyr::select( + sites, siteID, chr, strand, position, sampleName, refGenome), + n = Inf) } .get_multihitpositions <- function(sample_ref, conn) { sample_ref_in_db <- .get_sample_ref_in_db(sample_ref, conn) - multihitpositions <- tbl(conn, "multihitpositions") - inner_join(multihitpositions, sample_ref_in_db, by="sampleID") + multihitpositions <- dplyr::tbl(conn, "multihitpositions") + dplyr::inner_join(multihitpositions, sample_ref_in_db, by = "sampleID") } #' lengths distributions for multihits @@ -48,16 +48,17 @@ getUniqueSites <- function(sample_ref, conn) { getMultihitLengths <- function(sample_ref, conn) { stopifnot(.check_has_sample_ref_cols(sample_ref)) samples_multihitpositions <- .get_multihitpositions(sample_ref, conn) - multihit_lengths <- tbl(conn, "multihitlengths") - collect(distinct(select(inner_join(samples_multihitpositions, multihit_lengths, by="multihitID"), - sampleName, refGenome, multihitID, length)), + multihit_lengths <- dplyr::tbl(conn, "multihitlengths") + dplyr::collect(dplyr::distinct(dplyr::select(dplyr::inner_join( + samples_multihitpositions, multihit_lengths, by = "multihitID"), + sampleName, refGenome, multihitID, length)), n = Inf) } .get_breakpoints <- function(sample_ref, conn) { sample_ref_sites <- .get_unique_sites(sample_ref, conn) - breakpoints <- tbl(conn, "pcrbreakpoints") - inner_join(sample_ref_sites, breakpoints) + breakpoints <- dplyr::tbl(conn, "pcrbreakpoints") + dplyr::inner_join(sample_ref_sites, breakpoints) } #' breakpoints @@ -65,26 +66,30 @@ getMultihitLengths <- function(sample_ref, conn) { #' @export getUniquePCRbreaks <- function(sample_ref, conn) { breakpoints <- .get_breakpoints(sample_ref, conn) - collect(select(breakpoints, - breakpoint, count, position, siteID, chr, strand, sampleName, refGenome), + dplyr::collect(dplyr::select(breakpoints, + breakpoint, count, position, siteID, chr, + strand, sampleName, refGenome), n = Inf ) # column named kept as in DB ...sites.position AS integration... } .check_has_sample_ref_cols <- function(sample_ref) { - return (all(c("sampleName", "refGenome") %in% names(sample_ref))) + return(all(c("sampleName", "refGenome") %in% names(sample_ref))) } .get_sample_table <- function(conn) { - samples_in_db <- tbl(conn, "samples") - select(samples_in_db, sampleID, sampleName, refGenome) + samples_in_db <- dplyr::tbl(conn, "samples") + dplyr::select(samples_in_db, sampleID, sampleName, refGenome) } .get_sample_ref_in_db <- function(sample_ref, conn) { - samples_in_db <- tbl(conn, "samples") - samples_in_db <- select(samples_in_db, sampleID, sampleName, refGenome, gender) - inner_join(samples_in_db, sample_ref, by=c('sampleName', 'refGenome'), copy=TRUE) + samples_in_db <- dplyr::tbl(conn, "samples") + samples_in_db <- dplyr::select( + samples_in_db, sampleID, sampleName, refGenome, gender) + dplyr::inner_join( + samples_in_db, sample_ref, + by = c('sampleName', 'refGenome'), copy = TRUE) } #' do we have samples in database @@ -94,7 +99,7 @@ getUniquePCRbreaks <- function(sample_ref, conn) { setNameExists <- function(sample_ref, conn) { stopifnot(.check_has_sample_ref_cols(sample_ref)) - sample_ref_in_db <- collect(.get_sample_table(conn), n = Inf) + sample_ref_in_db <- dplyr::collect(.get_sample_table(conn), n = Inf) if (nrow(sample_ref_in_db) == 0) { # nothing is in db return(rep(FALSE, nrow(sample_ref))) } @@ -109,9 +114,11 @@ setNameExists <- function(sample_ref, conn) { getUniqueSiteReadCounts <- function(sample_ref, conn) { stopifnot(.check_has_sample_ref_cols(sample_ref)) sample_ref_sites_breakpoints <- .get_breakpoints(sample_ref, conn) - sample_ref_sites_breakpoints_grouped <- group_by( + sample_ref_sites_breakpoints_grouped <- dplyr::group_by( sample_ref_sites_breakpoints, sampleName, refGenome) - collect(summarize(sample_ref_sites_breakpoints_grouped, readCount=sum(count)), n = Inf) + dplyr::collect(dplyr::summarize( + sample_ref_sites_breakpoints_grouped, readCount = sum(count)), + n = Inf) } #' unique counts for integration sites for a given sample(with fixed genome) @@ -120,8 +127,10 @@ getUniqueSiteReadCounts <- function(sample_ref, conn) { getUniqueSiteCounts <- function(sample_ref, conn) { stopifnot(.check_has_sample_ref_cols(sample_ref)) sample_ref_sites <- .get_unique_sites(sample_ref, conn) - sample_ref_sites_grouped <- group_by(sample_ref_sites, sampleName, refGenome) - collect(summarize(sample_ref_sites_grouped, uniqueSites=n()), n = Inf) + sample_ref_sites_grouped <- dplyr::group_by( + sample_ref_sites, sampleName, refGenome) + dplyr::collect(dplyr::summarize( + sample_ref_sites_grouped, uniqueSites = n()), n = Inf) } @@ -133,16 +142,18 @@ getUniqueSiteCounts <- function(sample_ref, conn) { getMRCs <- function(sample_ref, conn, numberOfMRCs=3) { stopifnot(.check_has_sample_ref_cols(sample_ref)) sites <- .get_unique_sites(sample_ref, conn) - sites.metadata <- collect(select(sites, - siteID, gender, sampleName, refGenome), n = Inf) + sites.metadata <- dplyr::collect(dplyr::select( + sites, siteID, gender, sampleName, refGenome), n = Inf) - sites_meta <- data.frame("siteID"=sites.metadata$siteID, - "gender"=tolower(sites.metadata$gender)) + sites_meta <- data.frame( + "siteID" = sites.metadata$siteID, + "gender" = tolower(sites.metadata$gender)) stopifnot(length(unique(sites.metadata$refGenome)) == 1) ref_genome <- sites.metadata$refGenome[1] # all the same - mrcs <- get_N_MRCs(sites_meta, get_reference_genome(ref_genome), numberOfMRCs) + mrcs <- get_N_MRCs( + sites_meta, get_reference_genome(ref_genome), numberOfMRCs) merge(mrcs, sites.metadata[c("siteID", "sampleName", "refGenome")]) } diff --git a/R/random_site.R b/R/random_site.R index 98962ec..d2837fe 100644 --- a/R/random_site.R +++ b/R/random_site.R @@ -33,13 +33,13 @@ get_reference_genome <- function(reference_genome) { call. = FALSE) } pattern <- paste0("\\.", reference_genome, "$") - match_index <- which(grepl(pattern, installed.genomes())) + match_index <- which(grepl(pattern, unique(BSgenome::installed.genomes()))) if (length(match_index) != 1) { write("Installed genomes are:", stderr()) - write(installed.genomes(), stderr()) + write(BSgenome::installed.genomes(), stderr()) stop(paste("Cannot find unique genome for", reference_genome)) } - BS_genome_full_name <- installed.genomes()[match_index] + BS_genome_full_name <- unique(BSgenome::installed.genomes())[match_index] library(BS_genome_full_name, character.only=T) get(BS_genome_full_name) } @@ -65,7 +65,7 @@ get_random_positions <- function(siteIDs, reference_genome, gender='m', chr_len <- .get_gender_specific_chr(chr_len, gender, male_chr) chr_len <- chr_len[names(chr_len) != "chrM"] #remove mitochondria - cs <- c(0,cumsum(as.numeric(chr_len))) + cs <- c(0, cumsum(as.numeric(chr_len))) genomeLength <- max(cs) seed <- .Random.seed #don't want to screw up global randomness @@ -76,10 +76,12 @@ get_random_positions <- function(siteIDs, reference_genome, gender='m', cuts <- cut(abs(rands), breaks=cs, labels=names(chr_len)) #outputs in format of "siteID", "chr", "strand", "position" - data_frame( + dplyr::data_frame( "siteID" = rep(x,number_of_positions), "chr" = as.character(cuts), - "strand" = as.character(cut(sign(rands), breaks=c(-1,0,1), labels=c("-", "+"), include.lowest=T)), + "strand" = as.character(cut( + sign(rands), breaks=c(-1,0,1), + labels=c("-", "+"), include.lowest=T)), "position" = abs(rands) - cs[match(cuts, names(chr_len))]) }) @@ -97,7 +99,8 @@ get_random_positions <- function(siteIDs, reference_genome, gender='m', #' #' @note siteID are the same as given by sites_meta df #' @export -get_N_MRCs <- function(sites_meta, reference_genome, number_mrcs_per_site=3, male_chr="chrY") { +get_N_MRCs <- function(sites_meta, reference_genome, + number_mrcs_per_site=3, male_chr="chrY") { stopifnot(setequal(names(sites_meta), c("siteID", "gender"))) stopifnot(number_mrcs_per_site > 0) stopifnot(.check_gender(sites_meta$gender)) diff --git a/man/getMRCs.Rd b/man/getMRCs.Rd index c847af2..7c4a074 100644 --- a/man/getMRCs.Rd +++ b/man/getMRCs.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/intSiteRetriever.R \name{getMRCs} \alias{getMRCs} @@ -19,4 +19,3 @@ df with cols: siteID, position, strand, chr, sampleName, refGenome \description{ creates match random controls. } - diff --git a/man/getMultihitLengths.Rd b/man/getMultihitLengths.Rd index 53d3851..ccce403 100644 --- a/man/getMultihitLengths.Rd +++ b/man/getMultihitLengths.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/intSiteRetriever.R \name{getMultihitLengths} \alias{getMultihitLengths} @@ -17,4 +17,3 @@ df with cols: sampleName, refGenome, multihitID, length \description{ lengths distributions for multihits } - diff --git a/man/getUniquePCRbreaks.Rd b/man/getUniquePCRbreaks.Rd index 4ba5a82..a64fa84 100644 --- a/man/getUniquePCRbreaks.Rd +++ b/man/getUniquePCRbreaks.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/intSiteRetriever.R \name{getUniquePCRbreaks} \alias{getUniquePCRbreaks} @@ -14,4 +14,3 @@ getUniquePCRbreaks(sample_ref, conn) \description{ breakpoints } - diff --git a/man/getUniqueSiteCounts.Rd b/man/getUniqueSiteCounts.Rd index d2a4509..3bc9ef4 100644 --- a/man/getUniqueSiteCounts.Rd +++ b/man/getUniqueSiteCounts.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/intSiteRetriever.R \name{getUniqueSiteCounts} \alias{getUniqueSiteCounts} @@ -14,4 +14,3 @@ getUniqueSiteCounts(sample_ref, conn) \description{ unique counts for integration sites for a given sample(with fixed genome) } - diff --git a/man/getUniqueSiteReadCounts.Rd b/man/getUniqueSiteReadCounts.Rd index dd7d1d8..36dc134 100644 --- a/man/getUniqueSiteReadCounts.Rd +++ b/man/getUniqueSiteReadCounts.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/intSiteRetriever.R \name{getUniqueSiteReadCounts} \alias{getUniqueSiteReadCounts} @@ -14,4 +14,3 @@ getUniqueSiteReadCounts(sample_ref, conn) \description{ counts } - diff --git a/man/getUniqueSites.Rd b/man/getUniqueSites.Rd index d757a06..eaadcb4 100644 --- a/man/getUniqueSites.Rd +++ b/man/getUniqueSites.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/intSiteRetriever.R \name{getUniqueSites} \alias{getUniqueSites} @@ -17,4 +17,3 @@ sites dataframe with cols: siteID, chr, strand, position, sampleName, refGenome \description{ for a given samples get sites positions. } - diff --git a/man/get_N_MRCs.Rd b/man/get_N_MRCs.Rd index cb838e0..5f69a2c 100644 --- a/man/get_N_MRCs.Rd +++ b/man/get_N_MRCs.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/random_site.R \name{get_N_MRCs} \alias{get_N_MRCs} @@ -23,4 +23,3 @@ generate random controls for sites \note{ siteID are the same as given by sites_meta df } - diff --git a/man/get_random_positions.Rd b/man/get_random_positions.Rd index fddbf3b..33b5731 100644 --- a/man/get_random_positions.Rd +++ b/man/get_random_positions.Rd @@ -1,16 +1,16 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/random_site.R \name{get_random_positions} \alias{get_random_positions} \title{for a given reference genome and gender generate random positions} \usage{ -get_random_positions(siteIDs, reference_genome, gender, +get_random_positions(siteIDs, reference_genome, gender = "m", number_of_positions = 3, male_chr = c("chrY")) } \arguments{ \item{siteIDs}{vector of unique siteIDs for use as random seed} -\item{reference_genome}{BS object reference genome(} +\item{reference_genome}{BS object reference genome(@seealso get_reference_genome)} \item{gender}{'m' or 'f'} @@ -19,13 +19,9 @@ get_random_positions(siteIDs, reference_genome, gender, \item{male_chr}{list of male-specific chromosomes prefixes(only 1 prefix is allowed at present)} } \value{ -dataframe with columns: +dataframe with columns: siteID(numeric), chr(character), strand(character), position(numeric) } \description{ for a given reference genome and gender generate random positions } -\seealso{ -get_reference_genome) -} - diff --git a/man/get_reference_genome.Rd b/man/get_reference_genome.Rd index dda906e..5af3fbc 100644 --- a/man/get_reference_genome.Rd +++ b/man/get_reference_genome.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/random_site.R \name{get_reference_genome} \alias{get_reference_genome} @@ -16,4 +16,3 @@ stop if cannot find unique genome from installed BSgenome \seealso{ getRefGenome } - diff --git a/man/setNameExists.Rd b/man/setNameExists.Rd index a1a708a..1841c7b 100644 --- a/man/setNameExists.Rd +++ b/man/setNameExists.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/intSiteRetriever.R \name{setNameExists} \alias{setNameExists} @@ -17,4 +17,3 @@ vector of TRUE/FALSE for each row in sample_ref df \description{ do we have samples in database } - diff --git a/tests/testthat/test_database b/tests/testthat/test_database new file mode 100644 index 0000000..9ed71bc Binary files /dev/null and b/tests/testthat/test_database differ diff --git a/tests/testthat/test_get_sample_exists.R b/tests/testthat/test_get_sample_exists.R index babce27..37a3888 100644 --- a/tests/testthat/test_get_sample_exists.R +++ b/tests/testthat/test_get_sample_exists.R @@ -1,9 +1,9 @@ context("check if sites for samples exists") source("database.R") # provide db_name -read_conn <- src_sqlite(db_name) +read_conn <- dplyr::src_sqlite(db_name) -sample_ref <- data_frame( +sample_ref <- dplyr::data_frame( sampleName=c("sample1", "sample2", "NOT_THERE", "sample2", "sample3"), refGenome=c("hg18", "hg18", "UNKNOWN_GENOME", "hgXXX", "hgYYY") ) diff --git a/tests/testthat/test_get_unique_sites.R b/tests/testthat/test_get_unique_sites.R index 1625755..aa2b0e2 100644 --- a/tests/testthat/test_get_unique_sites.R +++ b/tests/testthat/test_get_unique_sites.R @@ -1,9 +1,9 @@ context("integration sites") source("database.R") # provide db_name -read_conn <- src_sqlite(db_name) +read_conn <- dplyr::src_sqlite(db_name) -sample_ref <- data_frame( +sample_ref <- dplyr::data_frame( sampleName=c("sample1", "sample2", "sample2", "sample3"), refGenome=c("hg18", "hg18", "hgXXX", "hgYYY") ) @@ -15,7 +15,7 @@ test_that("sites element has 5 columns", { expect_named(actual, expected, ignore.order=TRUE) }) -sample_ref <- data_frame( +sample_ref <- dplyr::data_frame( sampleName=c("sample1"), refGenome=c("hg18") ) @@ -24,7 +24,7 @@ test_that("can get sites that are present in files", { expect_equal(nrow(getUniqueSites(sample_ref, read_conn)), 3) }) -sample_ref <- data_frame( +sample_ref <- dplyr::data_frame( sampleName=c("sample1", "XXX"), refGenome=c("hg18", "YYY") ) @@ -36,8 +36,8 @@ test_that("if sampleName is not found it is ignored", { context("MRC sites") source("database.R") # provide db_name -db_conn <- src_sqlite(db_name) -sample_ref <- data_frame( +db_conn <- dplyr::src_sqlite(db_name) +sample_ref <- dplyr::data_frame( sampleName=c("sample1", "sample2"), refGenome=c("hg18", "hg18") ) diff --git a/tests/testthat/test_multihit_length.R b/tests/testthat/test_multihit_length.R index ea191df..089cc12 100644 --- a/tests/testthat/test_multihit_length.R +++ b/tests/testthat/test_multihit_length.R @@ -1,11 +1,11 @@ source("database.R") # provide db_name -read_conn <- src_sqlite(db_name) +read_conn <- dplyr::src_sqlite(db_name) context("setNameExists") test_that("can find sample that is in db", { - sample_ref <- data_frame( + sample_ref <- dplyr::data_frame( sampleName=c("sample1"), refGenome=c("hg18") ) @@ -13,7 +13,7 @@ test_that("can find sample that is in db", { }) test_that("can NOT find sample that is NOT in db", { - sample_ref <- data_frame( + sample_ref <- dplyr::data_frame( sampleName=c("sample1_NOT_THERE"), refGenome=c("hg18") ) @@ -21,7 +21,7 @@ test_that("can NOT find sample that is NOT in db", { }) test_that("can present/absent samples", { - sample_ref <- data_frame( + sample_ref <- dplyr::data_frame( sampleName=c("sample3", "sample1_NOT_THERE"), refGenome=c("hgYYY", "hg18") ) @@ -29,7 +29,7 @@ test_that("can present/absent samples", { }) test_that("the same sample but different genomes", { - sample_ref <- data_frame( + sample_ref <- dplyr::data_frame( sampleName=c("sample2", "sample2", "sample2"), refGenome=c("hg18", "hgXXX", "NOT_IN_DB") ) @@ -38,7 +38,7 @@ test_that("the same sample but different genomes", { context("check multihit length") -samples <- data_frame( +samples <- dplyr::data_frame( sampleName=c("sample1", "sample2"), refGenome=c("hg18", "hg18") ) @@ -80,6 +80,6 @@ test_that("has 1 lengths for sample2", { }) test_that("return nothing for non-existing sample", { - expect_equal(nrow(getMultihitLengths(data_frame(sampleName="it does not exist", + expect_equal(nrow(getMultihitLengths(dplyr::data_frame(sampleName="it does not exist", refGenome="hg18"), read_conn)), 0) }) diff --git a/tests/testthat/test_random_site.R b/tests/testthat/test_random_site.R index 578f7e2..8b38292 100644 --- a/tests/testthat/test_random_site.R +++ b/tests/testthat/test_random_site.R @@ -59,3 +59,4 @@ test_that("get_N_MRCs return the same result for the same siteIDs", { mrcs2 <- get_N_MRCs(sites_meta, reference, 9, male_chr="chrI") expect_equal(mrcs1, mrcs2) }) + diff --git a/tests/testthat/test_unique_site_counts.R b/tests/testthat/test_unique_site_counts.R index 7687654..98216e2 100644 --- a/tests/testthat/test_unique_site_counts.R +++ b/tests/testthat/test_unique_site_counts.R @@ -1,16 +1,16 @@ source("database.R") # provide db_name -read_conn <- src_sqlite(db_name) +read_conn <- dplyr::src_sqlite(db_name) context("Unique sites counts") -sample_ref <- data_frame( +sample_ref <- dplyr::data_frame( sampleName=c("sample1", "sample2", "sample2", "sample3"), refGenome=c("hg18", "hg18", "hgXXX", "hgYYY") ) test_that("for samples not in db return nothing", { - sample_ref <- data_frame( + sample_ref <- dplyr::data_frame( sampleName=c("NOT_IN_DB_sample1", "NOT_IN_DB_sample2"), refGenome=c("NOT_IN_DB_hg18", "NOT_IN_DB_hg18") ) diff --git a/tests/testthat/test_unique_site_read_counts.R b/tests/testthat/test_unique_site_read_counts.R index 8fe6fda..1c7e678 100644 --- a/tests/testthat/test_unique_site_read_counts.R +++ b/tests/testthat/test_unique_site_read_counts.R @@ -1,16 +1,16 @@ source("database.R") # provide db_name -read_conn <- src_sqlite(db_name) +read_conn <- dplyr::src_sqlite(db_name) context("Unique sites Read counts") -sample_ref <- data_frame( +sample_ref <- dplyr::data_frame( sampleName=c("sample1", "sample2", "sample2", "sample3"), refGenome=c("hg18", "hg18", "hgXXX", "hgYYY") ) test_that("for samples not in db return nothing", { - sample_ref <- data_frame( + sample_ref <- dplyr::data_frame( sampleName=c("NOT_IN_DB_sample1", "NOT_IN_DB_sample2"), refGenome=c("NOT_IN_DB_hg18", "NOT_IN_DB_hg18") )