From 950a6e6d7043bfe6008d74c896e01511baf2aaa3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-Karim=20H=C3=A9rich=C3=A9?= <heriche@embl.de>
Date: Tue, 28 Mar 2017 15:57:20 +0200
Subject: [PATCH 1/7] script to compute image similarity matrix

---
 compute_image_similarities.R | 108 +++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 compute_image_similarities.R

diff --git a/compute_image_similarities.R b/compute_image_similarities.R
new file mode 100644
index 0000000..89535b8
--- /dev/null
+++ b/compute_image_similarities.R
@@ -0,0 +1,108 @@
+library(optparse)
+library(rhdf5)
+library(proxy)
+
+
+option_list = list(
+  make_option(c("-i", "--input"), action = "store", default = NA, type='character',
+              help="Path to the HDF5 file with the WND-CHRM features"),
+  make_option(c("-o", "--output"), action = "store", default = NA, type='character',
+              help="Basename to use for the output files (PCs and similarity matrix)"),
+  make_option(c("-n", "--nPCs"), action = "store", default = NA, type = "integer",
+              help = "Number of principal components to keep and use")
+)
+
+opt_parser <- OptionParser(option_list=option_list)
+opt = parse_args(opt_parser)
+
+if (is.na(opt$input)){
+  print_help(opt_parser)
+  stop("Argument --input required.", call.=FALSE)
+}
+
+
+# Helper function to find elbow in a 2D curve represented by a list of ordered values
+# This function finds the point with maximum distance from the line between
+# the first and last points.
+# Adapted from StackOverflow:
+# http://stackoverflow.com/questions/2018178/finding-the-best-trade-off-point-on-a-curve
+
+find_elbow <- function(values) {
+  
+  n <- length(values)
+  
+  # Crude check to see if values are ordered
+  if (values[1]<=values[n]) {
+    stop("Values must be in decreasing order") 
+  }
+  
+  coords <- cbind(seq.int(n), as.matrix(values))
+  # Vector between first and last point
+  line <- coords[n,] - coords[1,]
+  # Normalize the vector
+  line <- line/sqrt(sum(line^2));
+  # Vector between all points and first point
+  V1 <- sweep(coords, 2, coords[1,]) # default function in sweep is - (minus)
+  # To calculate the distance to the line, we split V1 into two
+  # components, V2 that is parallel to the line and V3 that is perpendicular.
+  # The distance is the norm of the part that is perpendicular to the line.
+  # We find V2 by projecting V1 onto the line by taking the scalar product
+  # of V1 with the unit vector on the line (this gives us the length of the
+  # projection of V1 onto the line).
+  # We get V2 by multiplying this scalar product by the unit vector.
+  # The perpendicular vector is V1 - V2
+  V2 <- (V1 %*% line) %*% line
+  V3 <- V1 - V2
+  # Distance to line is the norm of V3
+  dist <- sqrt(rowSums(V3^2))
+  idx <- which.max(dist)
+  
+  return(coords[idx,])
+}
+
+
+# Input file with WND_CHRM features
+filename <- opt$input
+fields <- h5ls(filename)
+
+# Read HDF5 file
+group_name <- paste(fields$group, fields$name, sep ="/")
+h5data <- h5read(filename, group_name[1], compoundAsDataFrame=FALSE)
+H5close()
+
+# Get metadata
+measures <- h5data$Measurements
+imageID <- measures$ImageID
+wellID <- measures$WellID
+
+# Features are stored in a list of matrices
+listOfFeatureMatrices <- measures[11:length(measures)]
+
+# Make matrix of images x features
+featureMatrix <- t(do.call(rbind, listOfFeatureMatrices))
+row.names(featureMatrix) <- imageID
+
+# There are multiple rows with the same imageID because of tiling
+# We average all rows from the same image
+featureMatrix <- aggregate(featureMatrix, by = list(row.names(featureMatrix)), mean)
+rownames(featureMatrix) <- featureMatrix[,1]
+featureMatrix <- featureMatrix[,-1]
+
+# Remove constant features
+featureMatrix <- featureMatrix[, which(!apply(featureMatrix, 2, FUN=function(x) {sd(x)==0}))]
+
+# PCA
+pca <- prcomp(featureMatrix, scale.= TRUE, center = TRUE)
+
+# Select PCs
+k <- NULL
+if (!is.na(opt$nPCs)) {
+  k <- opt$nPCs
+} else {
+  k <- find_elbow(pca$sdev)[1]
+}
+saveRDS(pca$x[,1:k], paste0(opt$output,"-PCs.rds"))
+
+# Cosine similarity
+S <- as.matrix(simil(pca$x[,1:k], method = "cosine", by_rows = TRUE), diag = 1)
+saveRDS(S, paste0(opt$output,"-similarities.rds"))

From f7d9891cdacea673ecab616270d0f89a7ed15087 Mon Sep 17 00:00:00 2001
From: Simon Li <spli@dundee.ac.uk>
Date: Tue, 28 Mar 2017 12:39:25 +0100
Subject: [PATCH 2/7] Convert test_HDF5.Rmd to test_HDF5.R

---
 test_HDF5.R | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 test_HDF5.R

diff --git a/test_HDF5.R b/test_HDF5.R
new file mode 100644
index 0000000..0184f88
--- /dev/null
+++ b/test_HDF5.R
@@ -0,0 +1,47 @@
+#### 1) Install and load *rhdf5*
+if (!("rhdf5" %in% rownames(installed.packages())))
+{
+  source("http://bioconductor.org/biocLite.R")
+  biocLite("rhdf5")
+}
+
+library(rhdf5)
+
+#### 2) Explore the structure
+filename <- file.path("..","h5files","plate1_1_013.h5")
+fields <- h5ls(filename)
+str(fields)
+
+#### 3) Read *example.h5*
+group_name <- paste0(fields$group, fields$name)
+data <- h5read(filename, group_name[1], compoundAsDataFrame=FALSE)
+H5close()
+
+#### 4) Get metadata
+measures <- data$Measurements
+imageID <- measures$ImageID
+wellID <- measures$WellID
+
+#### 5) Get feature values (imageID: `r imageID`; well: `r wellID`)
+# Features are stored in a list of matrices
+featureListOfMatrices <- measures[11:length(measures)]
+
+## 1) Feature values
+# Feature Vector has 2919 values
+featureVector <- as.vector(do.call(rbind, featureListOfMatrices))
+
+## 2) Feature names: Build new ID for each feature
+# Length of each feature type
+feat_size <- lapply(featureListOfMatrices, length)
+feat_size <- data.frame(name=names(feat_size), size=unlist(feat_size))
+rownames(feat_size) <- seq(1:nrow(feat_size))
+# Repeat "length" times the name
+a <- rep(feat_size$name, feat_size$size)
+# Build sequences of number to create the ids
+b <- c()
+for(nElem in feat_size$size)
+{
+  b <- c(b, seq(1:nElem))
+}
+
+(feature_value <- data.frame(feature=paste(a,b,sep="_"), value=featureVector))

From 7c5e361732debb36837b8f94f1ead74457250034 Mon Sep 17 00:00:00 2001
From: Simon Li <spli@dundee.ac.uk>
Date: Tue, 28 Mar 2017 12:56:31 +0100
Subject: [PATCH 3/7] Add Dockerfile for running scripts

---
 docker/Dockerfile      |  8 ++++++++
 docker/README.md       |  9 +++++++++
 docker/r-base-docker.R | 13 +++++++++++++
 3 files changed, 30 insertions(+)
 create mode 100644 docker/Dockerfile
 create mode 100644 docker/README.md
 create mode 100644 docker/r-base-docker.R

diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000..65b8f4a
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,8 @@
+FROM r-base
+MAINTAINER ome-devel@lists.openmicroscopy.org.uk
+
+ADD r-base-docker.R /
+RUN Rscript /r-base-docker.R
+RUN useradd -m user -d /scratch
+WORKDIR /scratch
+ENTRYPOINT ["/usr/bin/Rscript"]
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000..a56c914
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,9 @@
+# Serrano Remining R Docker
+
+A plain R Docker image for running R scripts in this repository.
+Example:
+
+    docker build -t serrano-remining-rbase .
+
+    docker run -it --rm -v $PWD:/src:ro -v $PWD/scratch:/scratch \
+        serrano-remining-rbase /src/test_HDF5.R /src/Primary_146_81_features.h5
diff --git a/docker/r-base-docker.R b/docker/r-base-docker.R
new file mode 100644
index 0000000..19c7ec9
--- /dev/null
+++ b/docker/r-base-docker.R
@@ -0,0 +1,13 @@
+install.packages(c(
+  "ggplot2",
+  "knitr",
+  "lsa",
+  "mclust",
+  "plotly",
+  "pryr",
+  "reshape",
+  "sysfonts",
+  "viridis",
+  "xkcd"))
+source("http://bioconductor.org/biocLite.R")
+biocLite("rhdf5")

From 1b4b70e8b42325855031be1474e825d7e493affc Mon Sep 17 00:00:00 2001
From: Simon Li <spli@dundee.ac.uk>
Date: Tue, 28 Mar 2017 12:56:46 +0100
Subject: [PATCH 4/7] test_HDF5.R now takes h5 file as argument

---
 test_HDF5.R | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/test_HDF5.R b/test_HDF5.R
index 0184f88..65946ee 100644
--- a/test_HDF5.R
+++ b/test_HDF5.R
@@ -1,3 +1,8 @@
+args = commandArgs(trailingOnly=TRUE)
+if (length(args)!=1) {
+  stop("One argument required (h5-file)", call.=FALSE)
+}
+
 #### 1) Install and load *rhdf5*
 if (!("rhdf5" %in% rownames(installed.packages())))
 {
@@ -8,7 +13,7 @@ if (!("rhdf5" %in% rownames(installed.packages())))
 library(rhdf5)
 
 #### 2) Explore the structure
-filename <- file.path("..","h5files","plate1_1_013.h5")
+filename <- args[1]
 fields <- h5ls(filename)
 str(fields)
 

From 7d12251c2e88ac2a38801e1c44aeb20a608c2407 Mon Sep 17 00:00:00 2001
From: Simon Li <spli@dundee.ac.uk>
Date: Tue, 28 Mar 2017 17:32:47 +0100
Subject: [PATCH 5/7] Update docker to work with compute_image_similarities.R

---
 docker/README.md       | 4 ++++
 docker/r-base-docker.R | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/docker/README.md b/docker/README.md
index a56c914..d135261 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -7,3 +7,7 @@ Example:
 
     docker run -it --rm -v $PWD:/src:ro -v $PWD/scratch:/scratch \
         serrano-remining-rbase /src/test_HDF5.R /src/Primary_146_81_features.h5
+
+    docker run -it --rm -v $PWD:/src:ro -v $PWD/scratch:/scratch \
+        serrano-remining-rbase /src/compute_image_similarities.R \
+        -i /src/example_features.h5 -o /scratch/out
diff --git a/docker/r-base-docker.R b/docker/r-base-docker.R
index 19c7ec9..7c59146 100644
--- a/docker/r-base-docker.R
+++ b/docker/r-base-docker.R
@@ -3,7 +3,9 @@ install.packages(c(
   "knitr",
   "lsa",
   "mclust",
+  "optparse",
   "plotly",
+  "proxy",
   "pryr",
   "reshape",
   "sysfonts",

From 70e0f1b1667b9f577144afc9d2311c2cfe2f7438 Mon Sep 17 00:00:00 2001
From: jmoore <josh@glencoesoftware.com>
Date: Thu, 22 Jun 2017 06:49:15 +0100
Subject: [PATCH 6/7] Try to free memory

---
 compute_image_similarities.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/compute_image_similarities.R b/compute_image_similarities.R
index 89535b8..ed4beb2 100644
--- a/compute_image_similarities.R
+++ b/compute_image_similarities.R
@@ -90,6 +90,7 @@ featureMatrix <- featureMatrix[,-1]
 
 # Remove constant features
 featureMatrix <- featureMatrix[, which(!apply(featureMatrix, 2, FUN=function(x) {sd(x)==0}))]
+rm(fields, group_name, h5data, measures, imageID, wellID, listOfFeatureMatrices)
 
 # PCA
 pca <- prcomp(featureMatrix, scale.= TRUE, center = TRUE)

From d281c2245fcfeedd8393619ed9d69cfd9f9a7096 Mon Sep 17 00:00:00 2001
From: jmoore <josh@glencoesoftware.com>
Date: Thu, 22 Jun 2017 06:49:40 +0100
Subject: [PATCH 7/7] User only 1% of features for PCA

---
 compute_image_similarities.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/compute_image_similarities.R b/compute_image_similarities.R
index ed4beb2..783b15b 100644
--- a/compute_image_similarities.R
+++ b/compute_image_similarities.R
@@ -86,7 +86,10 @@ row.names(featureMatrix) <- imageID
 # We average all rows from the same image
 featureMatrix <- aggregate(featureMatrix, by = list(row.names(featureMatrix)), mean)
 rownames(featureMatrix) <- featureMatrix[,1]
-featureMatrix <- featureMatrix[,-1]
+
+nImages <- nrow(featureMatrix) # 388372950 images
+randomSetSize <- 0.01*nImages
+featureMatrix <- featureMatrix[sample(nrow(featureMatrix), randomSetSize), ]
 
 # Remove constant features
 featureMatrix <- featureMatrix[, which(!apply(featureMatrix, 2, FUN=function(x) {sd(x)==0}))]