diff --git a/compute_image_similarities.R b/compute_image_similarities.R new file mode 100644 index 0000000..783b15b --- /dev/null +++ b/compute_image_similarities.R @@ -0,0 +1,112 @@ +library(optparse) +library(rhdf5) +library(proxy) + + +option_list = list( + make_option(c("-i", "--input"), action = "store", default = NA, type='character', + help="Path to the HDF5 file with the WND-CHRM features"), + make_option(c("-o", "--output"), action = "store", default = NA, type='character', + help="Basename to use for the output files (PCs and similarity matrix)"), + make_option(c("-n", "--nPCs"), action = "store", default = NA, type = "integer", + help = "Number of principal components to keep and use") +) + +opt_parser <- OptionParser(option_list=option_list) +opt = parse_args(opt_parser) + +if (is.na(opt$input)){ + print_help(opt_parser) + stop("Argument --input required.", call.=FALSE) +} + + +# Helper function to find elbow in a 2D curve represented by a list of ordered values +# This function finds the point with maximum distance from the line between +# the first and last points. +# Adapted from StackOverflow: +# http://stackoverflow.com/questions/2018178/finding-the-best-trade-off-point-on-a-curve + +find_elbow <- function(values) { + + n <- length(values) + + # Crude check to see if values are ordered + if (values[1]<=values[n]) { + stop("Values must be in decreasing order") + } + + coords <- cbind(seq.int(n), as.matrix(values)) + # Vector between first and last point + line <- coords[n,] - coords[1,] + # Normalize the vector + line <- line/sqrt(sum(line^2)); + # Vector between all points and first point + V1 <- sweep(coords, 2, coords[1,]) # default function in sweep is - (minus) + # To calculate the distance to the line, we split V1 into two + # components, V2 that is parallel to the line and V3 that is perpendicular. + # The distance is the norm of the part that is perpendicular to the line. + # We find V2 by projecting V1 onto the line by taking the scalar product + # of V1 with the unit vector on the line (this gives us the length of the + # projection of V1 onto the line). + # We get V2 by multiplying this scalar product by the unit vector. + # The perpendicular vector is V1 - V2 + V2 <- (V1 %*% line) %*% line + V3 <- V1 - V2 + # Distance to line is the norm of V3 + dist <- sqrt(rowSums(V3^2)) + idx <- which.max(dist) + + return(coords[idx,]) +} + + +# Input file with WND_CHRM features +filename <- opt$input +fields <- h5ls(filename) + +# Read HDF5 file +group_name <- paste(fields$group, fields$name, sep ="/") +h5data <- h5read(filename, group_name[1], compoundAsDataFrame=FALSE) +H5close() + +# Get metadata +measures <- h5data$Measurements +imageID <- measures$ImageID +wellID <- measures$WellID + +# Features are stored in a list of matrices +listOfFeatureMatrices <- measures[11:length(measures)] + +# Make matrix of images x features +featureMatrix <- t(do.call(rbind, listOfFeatureMatrices)) +row.names(featureMatrix) <- imageID + +# There are multiple rows with the same imageID because of tiling +# We average all rows from the same image +featureMatrix <- aggregate(featureMatrix, by = list(row.names(featureMatrix)), mean) +rownames(featureMatrix) <- featureMatrix[,1] + +nImages <- nrow(featureMatrix) # 388372950 images +randomSetSize <- 0.01*nImages +featureMatrix <- featureMatrix[sample(nrow(featureMatrix), randomSetSize), ] + +# Remove constant features +featureMatrix <- featureMatrix[, which(!apply(featureMatrix, 2, FUN=function(x) {sd(x)==0}))] +rm(fields, group_name, h5data, measures, imageID, wellID, listOfFeatureMatrices) + +# PCA +pca <- prcomp(featureMatrix, scale.= TRUE, center = TRUE) + +# Select PCs +k <- NULL +if (!is.na(opt$nPCs)) { + k <- opt$nPCs +} else { + k <- find_elbow(pca$sdev)[1] +} +saveRDS(pca$x[,1:k], paste0(opt$output,"-PCs.rds")) + +# Cosine similarity +S <- as.matrix(simil(pca$x[,1:k], method = "cosine", by_rows = TRUE), diag = 1) +saveRDS(S, paste0(opt$output,"-similarities.rds")) diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..65b8f4a --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,8 @@ +FROM r-base +MAINTAINER ome-devel@lists.openmicroscopy.org.uk + +ADD r-base-docker.R / +RUN Rscript /r-base-docker.R +RUN useradd -m user -d /scratch +WORKDIR /scratch +ENTRYPOINT ["/usr/bin/Rscript"] diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..d135261 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,13 @@ +# Serrano Remining R Docker + +A plain R Docker image for running R scripts in this repository. +Example: + + docker build -t serrano-remining-rbase . + + docker run -it --rm -v $PWD:/src:ro -v $PWD/scratch:/scratch \ + serrano-remining-rbase /src/test_HDF5.R /src/Primary_146_81_features.h5 + + docker run -it --rm -v $PWD:/src:ro -v $PWD/scratch:/scratch \ + serrano-remining-rbase /src/compute_image_similarities.R \ + -i /src/example_features.h5 -o /scratch/out diff --git a/docker/r-base-docker.R b/docker/r-base-docker.R new file mode 100644 index 0000000..7c59146 --- /dev/null +++ b/docker/r-base-docker.R @@ -0,0 +1,15 @@ +install.packages(c( + "ggplot2", + "knitr", + "lsa", + "mclust", + "optparse", + "plotly", + "proxy", + "pryr", + "reshape", + "sysfonts", + "viridis", + "xkcd")) +source("http://bioconductor.org/biocLite.R") +biocLite("rhdf5") diff --git a/test_HDF5.R b/test_HDF5.R new file mode 100644 index 0000000..65946ee --- /dev/null +++ b/test_HDF5.R @@ -0,0 +1,52 @@ +args = commandArgs(trailingOnly=TRUE) +if (length(args)!=1) { + stop("One argument required (h5-file)", call.=FALSE) +} + +#### 1) Install and load *rhdf5* +if (!("rhdf5" %in% rownames(installed.packages()))) +{ + source("http://bioconductor.org/biocLite.R") + biocLite("rhdf5") +} + +library(rhdf5) + +#### 2) Explore the structure +filename <- args[1] +fields <- h5ls(filename) +str(fields) + +#### 3) Read *example.h5* +group_name <- paste0(fields$group, fields$name) +data <- h5read(filename, group_name[1], compoundAsDataFrame=FALSE) +H5close() + +#### 4) Get metadata +measures <- data$Measurements +imageID <- measures$ImageID +wellID <- measures$WellID + +#### 5) Get feature values (imageID: `r imageID`; well: `r wellID`) +# Features are stored in a list of matrices +featureListOfMatrices <- measures[11:length(measures)] + +## 1) Feature values +# Feature Vector has 2919 values +featureVector <- as.vector(do.call(rbind, featureListOfMatrices)) + +## 2) Feature names: Build new ID for each feature +# Length of each feature type +feat_size <- lapply(featureListOfMatrices, length) +feat_size <- data.frame(name=names(feat_size), size=unlist(feat_size)) +rownames(feat_size) <- seq(1:nrow(feat_size)) +# Repeat "length" times the name +a <- rep(feat_size$name, feat_size$size) +# Build sequences of number to create the ids +b <- c() +for(nElem in feat_size$size) +{ + b <- c(b, seq(1:nElem)) +} + +(feature_value <- data.frame(feature=paste(a,b,sep="_"), value=featureVector))