Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ Suggests:
testthat, BSgenome.Scerevisiae.UCSC.sacCer2, RSQLite
Imports:
dplyr, plyr, BSgenome, RMySQL, GenomicRanges, GenomeInfoDb
RoxygenNote: 6.0.1.9000
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by roxygen2 (4.1.1): do not edit by hand
# Generated by roxygen2: do not edit by hand

export(getMRCs)
export(getMultihitLengths)
Expand Down
73 changes: 42 additions & 31 deletions R/intSiteRetriever.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

# Join the "sites" table to the samples requested in `sample_ref`.
#
# sample_ref: data frame with `sampleName` and `refGenome` columns.
# conn: database source that `dplyr::tbl()` can read tables from.
#
# The result is not collected here; callers select columns and call
# `dplyr::collect()` themselves.
.get_unique_sites <- function(sample_ref, conn) {
    sample_ref_in_db <- .get_sample_ref_in_db(sample_ref, conn)
    sites <- dplyr::tbl(conn, "sites")
    # No `by` is given, so the join uses every column name the two
    # tables share.
    dplyr::inner_join(sites, sample_ref_in_db)
}

#' For the given samples, get site positions.
Expand All @@ -29,16 +29,16 @@
getUniqueSites <- function(sample_ref, conn) {
    # Fail early when sample_ref lacks sampleName / refGenome.
    stopifnot(.check_has_sample_ref_cols(sample_ref))
    sites <- .get_unique_sites(sample_ref, conn)
    site_columns <- dplyr::select(
        sites, siteID, chr, strand, position, sampleName, refGenome)
    # n = Inf: fetch every matching row.
    dplyr::collect(site_columns, n = Inf)
}

# Join the "multihitpositions" table to the samples requested in
# `sample_ref`, matching rows on `sampleID`.
#
# sample_ref: data frame with `sampleName` and `refGenome` columns.
# conn: database source that `dplyr::tbl()` can read tables from.
#
# The result is not collected here.
.get_multihitpositions <- function(sample_ref, conn) {
    sample_ref_in_db <- .get_sample_ref_in_db(sample_ref, conn)
    multihitpositions <- dplyr::tbl(conn, "multihitpositions")
    dplyr::inner_join(multihitpositions, sample_ref_in_db, by = "sampleID")
}

#' Length distributions for multihits
Expand All @@ -48,43 +48,48 @@ getUniqueSites <- function(sample_ref, conn) {
getMultihitLengths <- function(sample_ref, conn) {
    # Fail early when sample_ref lacks sampleName / refGenome.
    stopifnot(.check_has_sample_ref_cols(sample_ref))
    samples_multihitpositions <- .get_multihitpositions(sample_ref, conn)
    multihit_lengths <- dplyr::tbl(conn, "multihitlengths")

    # Attach the lengths recorded for each multihit of the requested samples.
    positions_with_lengths <- dplyr::inner_join(
        samples_multihitpositions, multihit_lengths, by = "multihitID")
    length_rows <- dplyr::select(
        positions_with_lengths, sampleName, refGenome, multihitID, length)

    # distinct() drops repeated (sample, genome, multihit, length) rows;
    # n = Inf fetches every remaining row.
    dplyr::collect(dplyr::distinct(length_rows), n = Inf)
}

# Join the sites of the requested samples to the "pcrbreakpoints" table.
#
# sample_ref: data frame with `sampleName` and `refGenome` columns.
# conn: database source that `dplyr::tbl()` can read tables from.
#
# The result is not collected here.
.get_breakpoints <- function(sample_ref, conn) {
    sample_ref_sites <- .get_unique_sites(sample_ref, conn)
    breakpoints <- dplyr::tbl(conn, "pcrbreakpoints")
    # No `by` is given, so the join uses every column name the two
    # tables share.
    dplyr::inner_join(sample_ref_sites, breakpoints)
}

#' breakpoints
#'
#' Collects the PCR breakpoint rows joined to the sites of the requested
#' samples.
#' @inheritParams getUniqueSites
#' @return collected table with columns breakpoint, count, position, siteID,
#'   chr, strand, sampleName and refGenome
#' @export
getUniquePCRbreaks <- function(sample_ref, conn) {
    # Same input guard the other exported getters in this file apply.
    stopifnot(.check_has_sample_ref_cols(sample_ref))
    breakpoints <- .get_breakpoints(sample_ref, conn)
    # column named kept as in DB ...sites.position AS integration...
    breakpoint_columns <- dplyr::select(
        breakpoints, breakpoint, count, position, siteID, chr,
        strand, sampleName, refGenome)
    # n = Inf: fetch every matching row.
    dplyr::collect(breakpoint_columns, n = Inf)
}

# TRUE when `sample_ref` has both a `sampleName` and a `refGenome` name,
# FALSE otherwise. Only names are checked, not the column contents.
.check_has_sample_ref_cols <- function(sample_ref) {
    all(c("sampleName", "refGenome") %in% names(sample_ref))
}

# Reference the "samples" table restricted to its identifying columns
# (sampleID, sampleName, refGenome).
#
# conn: database source that `dplyr::tbl()` can read tables from.
#
# The result is not collected here.
.get_sample_table <- function(conn) {
    samples_in_db <- dplyr::tbl(conn, "samples")
    dplyr::select(samples_in_db, sampleID, sampleName, refGenome)
}

# Keep the rows of the "samples" table whose (sampleName, refGenome) pair
# appears in `sample_ref`.
#
# sample_ref: data frame with `sampleName` and `refGenome` columns.
# conn: database source that `dplyr::tbl()` can read tables from.
#
# The result carries sampleID, sampleName, refGenome and gender plus any
# further columns of `sample_ref`; it is not collected here.
.get_sample_ref_in_db <- function(sample_ref, conn) {
    samples_in_db <- dplyr::tbl(conn, "samples")
    samples_in_db <- dplyr::select(
        samples_in_db, sampleID, sampleName, refGenome, gender)
    # copy = TRUE lets dplyr copy the local sample_ref into the same
    # source as samples_in_db so the join can be performed there.
    dplyr::inner_join(
        samples_in_db, sample_ref,
        by = c("sampleName", "refGenome"), copy = TRUE)
}

#' do we have samples in database
Expand All @@ -94,7 +99,7 @@ getUniquePCRbreaks <- function(sample_ref, conn) {
setNameExists <- function(sample_ref, conn) {
stopifnot(.check_has_sample_ref_cols(sample_ref))

sample_ref_in_db <- collect(.get_sample_table(conn), n = Inf)
sample_ref_in_db <- dplyr::collect(.get_sample_table(conn), n = Inf)
if (nrow(sample_ref_in_db) == 0) { # nothing is in db
return(rep(FALSE, nrow(sample_ref)))
}
Expand All @@ -109,9 +114,11 @@ setNameExists <- function(sample_ref, conn) {
getUniqueSiteReadCounts <- function(sample_ref, conn) {
    # Fail early when sample_ref lacks sampleName / refGenome.
    stopifnot(.check_has_sample_ref_cols(sample_ref))
    sample_ref_sites_breakpoints <- .get_breakpoints(sample_ref, conn)
    sample_ref_sites_breakpoints_grouped <- dplyr::group_by(
        sample_ref_sites_breakpoints, sampleName, refGenome)
    # One row per (sampleName, refGenome): readCount is the sum of the
    # breakpoint `count` column within that group.
    read_counts <- dplyr::summarize(
        sample_ref_sites_breakpoints_grouped, readCount = sum(count))
    # n = Inf: fetch every summary row.
    dplyr::collect(read_counts, n = Inf)
}

#' Unique counts of integration sites for a given sample (with fixed genome)
Expand All @@ -120,8 +127,10 @@ getUniqueSiteReadCounts <- function(sample_ref, conn) {
getUniqueSiteCounts <- function(sample_ref, conn) {
    # Fail early when sample_ref lacks sampleName / refGenome.
    stopifnot(.check_has_sample_ref_cols(sample_ref))
    sample_ref_sites <- .get_unique_sites(sample_ref, conn)
    sample_ref_sites_grouped <- dplyr::group_by(
        sample_ref_sites, sampleName, refGenome)
    # One row per (sampleName, refGenome): uniqueSites is the number of
    # site rows in that group.
    site_counts <- dplyr::summarize(
        sample_ref_sites_grouped, uniqueSites = n())
    # n = Inf: fetch every summary row.
    dplyr::collect(site_counts, n = Inf)
}


Expand All @@ -133,16 +142,18 @@ getUniqueSiteCounts <- function(sample_ref, conn) {
getMRCs <- function(sample_ref, conn, numberOfMRCs=3) {
    # Fail early when sample_ref lacks sampleName / refGenome.
    stopifnot(.check_has_sample_ref_cols(sample_ref))
    sites <- .get_unique_sites(sample_ref, conn)
    sites_metadata <- dplyr::collect(dplyr::select(
        sites, siteID, gender, sampleName, refGenome), n = Inf)

    # get_N_MRCs() is handed exactly the siteID and gender columns, with
    # gender lower-cased.
    sites_meta <- data.frame(
        "siteID" = sites_metadata$siteID,
        "gender" = tolower(sites_metadata$gender))

    # All requested samples must share one reference genome; this also
    # stops when no sites were found at all.
    stopifnot(length(unique(sites_metadata$refGenome)) == 1)
    ref_genome <- sites_metadata$refGenome[1] # all the same

    mrcs <- get_N_MRCs(
        sites_meta, get_reference_genome(ref_genome), numberOfMRCs)

    # merge() without `by` joins on the column names both inputs share,
    # re-attaching sampleName and refGenome to the generated positions.
    merge(mrcs, sites_metadata[c("siteID", "sampleName", "refGenome")])
}
17 changes: 10 additions & 7 deletions R/random_site.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ get_reference_genome <- function(reference_genome) {
call. = FALSE)
}
pattern <- paste0("\\.", reference_genome, "$")
match_index <- which(grepl(pattern, installed.genomes()))
match_index <- which(grepl(pattern, unique(BSgenome::installed.genomes())))
if (length(match_index) != 1) {
write("Installed genomes are:", stderr())
write(installed.genomes(), stderr())
write(BSgenome::installed.genomes(), stderr())
stop(paste("Cannot find unique genome for", reference_genome))
}
BS_genome_full_name <- installed.genomes()[match_index]
BS_genome_full_name <- unique(BSgenome::installed.genomes())[match_index]
library(BS_genome_full_name, character.only=T)
get(BS_genome_full_name)
}
Expand All @@ -65,7 +65,7 @@ get_random_positions <- function(siteIDs, reference_genome, gender='m',
chr_len <- .get_gender_specific_chr(chr_len, gender, male_chr)
chr_len <- chr_len[names(chr_len) != "chrM"] #remove mitochondria

cs <- c(0,cumsum(as.numeric(chr_len)))
cs <- c(0, cumsum(as.numeric(chr_len)))
genomeLength <- max(cs)

seed <- .Random.seed #don't want to screw up global randomness
Expand All @@ -76,10 +76,12 @@ get_random_positions <- function(siteIDs, reference_genome, gender='m',
cuts <- cut(abs(rands), breaks=cs, labels=names(chr_len))

#outputs in format of "siteID", "chr", "strand", "position"
data_frame(
dplyr::data_frame(
"siteID" = rep(x,number_of_positions),
"chr" = as.character(cuts),
"strand" = as.character(cut(sign(rands), breaks=c(-1,0,1), labels=c("-", "+"), include.lowest=T)),
"strand" = as.character(cut(
sign(rands), breaks=c(-1,0,1),
labels=c("-", "+"), include.lowest=T)),
"position" = abs(rands) - cs[match(cuts, names(chr_len))])
})

Expand All @@ -97,7 +99,8 @@ get_random_positions <- function(siteIDs, reference_genome, gender='m',
#'
#' @note siteID are the same as given by sites_meta df
#' @export
get_N_MRCs <- function(sites_meta, reference_genome, number_mrcs_per_site=3, male_chr="chrY") {
get_N_MRCs <- function(sites_meta, reference_genome,
number_mrcs_per_site=3, male_chr="chrY") {
stopifnot(setequal(names(sites_meta), c("siteID", "gender")))
stopifnot(number_mrcs_per_site > 0)
stopifnot(.check_gender(sites_meta$gender))
Expand Down
3 changes: 1 addition & 2 deletions man/getMRCs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/getMultihitLengths.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/getUniquePCRbreaks.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/getUniqueSiteCounts.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/getUniqueSiteReadCounts.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/getUniqueSites.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/get_N_MRCs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 4 additions & 8 deletions man/get_random_positions.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/get_reference_genome.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/setNameExists.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file added tests/testthat/test_database
Binary file not shown.
4 changes: 2 additions & 2 deletions tests/testthat/test_get_sample_exists.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
context("check if sites for samples exists")
source("database.R") # provide db_name

# Open the SQLite test database; `db_name` comes from database.R.
read_conn <- dplyr::src_sqlite(db_name)

# (sampleName, refGenome) pairs to look up, built as parallel vectors so
# that row i pairs sampleName[i] with refGenome[i].
sample_ref <- dplyr::data_frame(
    sampleName=c("sample1", "sample2", "NOT_THERE", "sample2", "sample3"),
    refGenome=c("hg18", "hg18", "UNKNOWN_GENOME", "hgXXX", "hgYYY")
)
Expand Down
Loading