From 950a6e6d7043bfe6008d74c896e01511baf2aaa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Karim=20H=C3=A9rich=C3=A9?= Date: Tue, 28 Mar 2017 15:57:20 +0200 Subject: [PATCH 1/7] script to compute image similarity matrix --- compute_image_similarities.R | 108 +++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 compute_image_similarities.R diff --git a/compute_image_similarities.R b/compute_image_similarities.R new file mode 100644 index 0000000..89535b8 --- /dev/null +++ b/compute_image_similarities.R @@ -0,0 +1,108 @@ +library(optparse) +library(rhdf5) +library(proxy) + + +option_list = list( + make_option(c("-i", "--input"), action = "store", default = NA, type='character', + help="Path to the HDF5 file with the WND-CHRM features"), + make_option(c("-o", "--output"), action = "store", default = NA, type='character', + help="Basename to use for the output files (PCs and similarity matrix)"), + make_option(c("-n", "--nPCs"), action = "store", default = NA, type = "integer", + help = "Number of principal components to keep and use") +) + +opt_parser <- OptionParser(option_list=option_list) +opt = parse_args(opt_parser) + +if (is.na(opt$input)){ + print_help(opt_parser) + stop("Argument --input required.", call.=FALSE) +} + + +# Helper function to find elbow in a 2D curve represented by a list of ordered values +# This function finds the point with maximum distance from the line between +# the first and last points. +# Adapted from StackOverflow: +# http://stackoverflow.com/questions/2018178/finding-the-best-trade-off-point-on-a-curve + +find_elbow <- function(values) { + + n <- length(values) + + # Crude check to see if values are ordered + if (values[1]<=values[n]) { + stop("Values must be in decreasing order") + } + + coords <- cbind(seq.int(n), as.matrix(values)) + # Vector between first and last point + line <- coords[n,] - coords[1,] + # Normalize the vector + line <- line/sqrt(sum(line^2)); + # Vector between all points and first point + V1 <- sweep(coords, 2, coords[1,]) # default function in sweep is - (minus) + # To calculate the distance to the line, we split V1 into two + # components, V2 that is parallel to the line and V3 that is perpendicular. + # The distance is the norm of the part that is perpendicular to the line. + # We find V2 by projecting V1 onto the line by taking the scalar product + # of V1 with the unit vector on the line (this gives us the length of the + # projection of V1 onto the line). + # We get V2 by multiplying this scalar product by the unit vector. + # The perpendicular vector is V1 - V2 + V2 <- (V1 %*% line) %*% line + V3 <- V1 - V2 + # Distance to line is the norm of V3 + dist <- sqrt(rowSums(V3^2)) + idx <- which.max(dist) + + return(coords[idx,]) +} + + +# Input file with WND_CHRM features +filename <- opt$input +fields <- h5ls(filename) + +# Read HDF5 file +group_name <- paste(fields$group, fields$name, sep ="/") +h5data <- h5read(filename, group_name[1], compoundAsDataFrame=FALSE) +H5close() + +# Get metadata +measures <- h5data$Measurements +imageID <- measures$ImageID +wellID <- measures$WellID + +# Features are stored in a list of matrices +listOfFeatureMatrices <- measures[11:length(measures)] + +# Make matrix of images x features +featureMatrix <- t(do.call(rbind, listOfFeatureMatrices)) +row.names(featureMatrix) <- imageID + +# There are multiple rows with the same imageID because of tiling +# We average all rows from the same image +featureMatrix <- aggregate(featureMatrix, by = list(row.names(featureMatrix)), mean) +rownames(featureMatrix) <- featureMatrix[,1] +featureMatrix <- featureMatrix[,-1] + +# Remove constant features +featureMatrix <- featureMatrix[, which(!apply(featureMatrix, 2, FUN=function(x) {sd(x)==0}))] + +# PCA +pca <- prcomp(featureMatrix, scale.= TRUE, center = TRUE) + +# Select PCs +k <- NULL +if (!is.na(opt$nPCs)) { + k <- opt$nPCs +} else { + k <- find_elbow(pca$sdev)[1] +} +saveRDS(pca$x[,1:k], paste0(opt$output,"-PCs.rds")) + +# Cosine similarity +S <- as.matrix(simil(pca$x[,1:k], method = "cosine", by_rows = TRUE), diag = 1) +saveRDS(S, paste0(opt$output,"-similarities.rds")) From f7d9891cdacea673ecab616270d0f89a7ed15087 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Tue, 28 Mar 2017 12:39:25 +0100 Subject: [PATCH 2/7] Convert test_HDF5.Rmd to test_HDF5.R --- test_HDF5.R | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 test_HDF5.R diff --git a/test_HDF5.R b/test_HDF5.R new file mode 100644 index 0000000..0184f88 --- /dev/null +++ b/test_HDF5.R @@ -0,0 +1,47 @@ +#### 1) Install and load *rhdf5* +if (!("rhdf5" %in% rownames(installed.packages()))) +{ + source("http://bioconductor.org/biocLite.R") + biocLite("rhdf5") +} + +library(rhdf5) + +#### 2) Explore the structure +filename <- file.path("..","h5files","plate1_1_013.h5") +fields <- h5ls(filename) +str(fields) + +#### 3) Read *example.h5* +group_name <- paste0(fields$group, fields$name) +data <- h5read(filename, group_name[1], compoundAsDataFrame=FALSE) +H5close() + +#### 4) Get metadata +measures <- data$Measurements +imageID <- measures$ImageID +wellID <- measures$WellID + +#### 5) Get feature values (imageID: `r imageID`; well: `r wellID`) +# Features are stored in a list of matrices +featureListOfMatrices <- measures[11:length(measures)] + +## 1) Feature values +# Feature Vector has 2919 values +featureVector <- as.vector(do.call(rbind, featureListOfMatrices)) + +## 2) Feature names: Build new ID for each feature +# Length of each feature type +feat_size <- lapply(featureListOfMatrices, length) +feat_size <- data.frame(name=names(feat_size), size=unlist(feat_size)) +rownames(feat_size) <- seq(1:nrow(feat_size)) +# Repeat "length" times the name +a <- rep(feat_size$name, feat_size$size) +# Build sequences of number to create the ids +b <- c() +for(nElem in feat_size$size) +{ + b <- c(b, seq(1:nElem)) +} + +(feature_value <- data.frame(feature=paste(a,b,sep="_"), value=featureVector)) From 7c5e361732debb36837b8f94f1ead74457250034 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Tue, 28 Mar 2017 12:56:31 +0100 Subject: [PATCH 3/7] Add Dockerfile for running scripts --- docker/Dockerfile | 8 ++++++++ docker/README.md | 9 +++++++++ docker/r-base-docker.R | 13 +++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 docker/Dockerfile create mode 100644 docker/README.md create mode 100644 docker/r-base-docker.R diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..65b8f4a --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,8 @@ +FROM r-base +MAINTAINER ome-devel@lists.openmicroscopy.org.uk + +ADD r-base-docker.R / +RUN Rscript /r-base-docker.R +RUN useradd -m user -d /scratch +WORKDIR /scratch +ENTRYPOINT ["/usr/bin/Rscript"] diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..a56c914 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,9 @@ +# Serrano Remining R Docker + +A plain R Docker image for running R scripts in this repository. +Example: + + docker build -t serrano-remining-rbase . + + docker run -it --rm -v $PWD:/src:ro -v $PWD/scratch:/scratch \ + serrano-remining-rbase /src/test_HDF5.R /src/Primary_146_81_features.h5 diff --git a/docker/r-base-docker.R b/docker/r-base-docker.R new file mode 100644 index 0000000..19c7ec9 --- /dev/null +++ b/docker/r-base-docker.R @@ -0,0 +1,13 @@ +install.packages(c( + "ggplot2", + "knitr", + "lsa", + "mclust", + "plotly", + "pryr", + "reshape", + "sysfonts", + "viridis", + "xkcd")) +source("http://bioconductor.org/biocLite.R") +biocLite("rhdf5") From 1b4b70e8b42325855031be1474e825d7e493affc Mon Sep 17 00:00:00 2001 From: Simon Li Date: Tue, 28 Mar 2017 12:56:46 +0100 Subject: [PATCH 4/7] test_HDF5.R now takes h5 file as argument --- test_HDF5.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test_HDF5.R b/test_HDF5.R index 0184f88..65946ee 100644 --- a/test_HDF5.R +++ b/test_HDF5.R @@ -1,3 +1,8 @@ +args = commandArgs(trailingOnly=TRUE) +if (length(args)!=1) { + stop("One argument required (h5-file)", call.=FALSE) +} + #### 1) Install and load *rhdf5* if (!("rhdf5" %in% rownames(installed.packages()))) { @@ -8,7 +13,7 @@ if (!("rhdf5" %in% rownames(installed.packages()))) library(rhdf5) #### 2) Explore the structure -filename <- file.path("..","h5files","plate1_1_013.h5") +filename <- args[1] fields <- h5ls(filename) str(fields) From 7d12251c2e88ac2a38801e1c44aeb20a608c2407 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Tue, 28 Mar 2017 17:32:47 +0100 Subject: [PATCH 5/7] Update docker to work with compute_image_similarities.R --- docker/README.md | 4 ++++ docker/r-base-docker.R | 2 ++ 2 files changed, 6 insertions(+) diff --git a/docker/README.md b/docker/README.md index a56c914..d135261 100644 --- a/docker/README.md +++ b/docker/README.md @@ -7,3 +7,7 @@ Example: docker run -it --rm -v $PWD:/src:ro -v $PWD/scratch:/scratch \ serrano-remining-rbase /src/test_HDF5.R /src/Primary_146_81_features.h5 + + docker run -it --rm -v $PWD:/src:ro -v $PWD/scratch:/scratch \ + serrano-remining-rbase /src/compute_image_similarities.R \ + -i /src/example_features.h5 -o /scratch/out diff --git a/docker/r-base-docker.R b/docker/r-base-docker.R index 19c7ec9..7c59146 100644 --- a/docker/r-base-docker.R +++ b/docker/r-base-docker.R @@ -3,7 +3,9 @@ install.packages(c( "knitr", "lsa", "mclust", + "optparse", "plotly", + "proxy", "pryr", "reshape", "sysfonts", From 70e0f1b1667b9f577144afc9d2311c2cfe2f7438 Mon Sep 17 00:00:00 2001 From: jmoore Date: Thu, 22 Jun 2017 06:49:15 +0100 Subject: [PATCH 6/7] Try to free memory --- compute_image_similarities.R | 1 + 1 file changed, 1 insertion(+) diff --git a/compute_image_similarities.R b/compute_image_similarities.R index 89535b8..ed4beb2 100644 --- a/compute_image_similarities.R +++ b/compute_image_similarities.R @@ -90,6 +90,7 @@ featureMatrix <- featureMatrix[,-1] # Remove constant features featureMatrix <- featureMatrix[, which(!apply(featureMatrix, 2, FUN=function(x) {sd(x)==0}))] +rm(fields, group_name, h5data, measures, imageID, wellID, listOfFeatureMatrices) # PCA pca <- prcomp(featureMatrix, scale.= TRUE, center = TRUE) From d281c2245fcfeedd8393619ed9d69cfd9f9a7096 Mon Sep 17 00:00:00 2001 From: jmoore Date: Thu, 22 Jun 2017 06:49:40 +0100 Subject: [PATCH 7/7] User only 1% of features for PCA --- compute_image_similarities.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/compute_image_similarities.R b/compute_image_similarities.R index ed4beb2..783b15b 100644 --- a/compute_image_similarities.R +++ b/compute_image_similarities.R @@ -86,7 +86,10 @@ row.names(featureMatrix) <- imageID # We average all rows from the same image featureMatrix <- aggregate(featureMatrix, by = list(row.names(featureMatrix)), mean) rownames(featureMatrix) <- featureMatrix[,1] -featureMatrix <- featureMatrix[,-1] + +nImages <- nrow(featureMatrix) # 388372950 images +randomSetSize <- 0.01*nImages +featureMatrix <- featureMatrix[sample(nrow(featureMatrix), randomSetSize), ] # Remove constant features featureMatrix <- featureMatrix[, which(!apply(featureMatrix, 2, FUN=function(x) {sd(x)==0}))]