From 1712c4ef6fd103d503af1af4c1d81de5d8821f73 Mon Sep 17 00:00:00 2001
From: kellenett <kelle-nett@uiowa.edu>
Date: Fri, 15 Mar 2019 10:43:48 -0500
Subject: [PATCH 01/11] Create 1D clustering Yadav.R

---
 1D clustering Yadav.R | 277 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100644 1D clustering Yadav.R

diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R
new file mode 100644
index 0000000..bba2b5b
--- /dev/null
+++ b/1D clustering Yadav.R	
@@ -0,0 +1,277 @@
+# modeling Yadav et al. 2012 cluster analysis; tailored towards clustering of attachment points
+# initially coded by S. Johnson- updates and 3D analysis of spine heads added by K. Nett
+
+
+library(dplyr)          # to install these , enter install.packages("package name") into the console in RStudio. replace package name with each package name in quotes, ie install.packages("dplyr")
+library(ggplot2)
+library(lattice)
+library(latticeExtra)
+library(cluster)
+library(stats)
+library(readr)
+library(bio3d) #install.packages("bio3d", dependencies = TRUE)
+library(DECIPHER)
+#how to install DECIPHER - a is enter in response to prompt for "all"
+#source("https://bioconductor.org/biocLite.R")
+#biocLite("DECIPHER")
+#a
+
+
+fileChosen <- file.choose()     # opens file dialog to open 
+filePath <- dirname(fileChosen) # gets the directory name that that file is in
+setwd(filePath)           # sets the working directory. important for saving files later
+fileList <- list.files(filePath)  # a list of files in the directory to go through
+fileList <- grep(".csv", fileList, value = TRUE) # finds only lists .csv files in case there are other types
+data_all <- data.frame()  # initializes a data.frame that will store all of the data for the experiment
+
+
+
+
+for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
+  fileName <- unlist(strsplit(fileList[1], "[.]"))[1]   # removes ".csv" from file name
+  fileLoc <- file.path(filePath, fileList[1])           # creates a path to each file (differs from FileChosen as the loop runs)
+  df <- read_csv(fileLoc, col_types = cols()) # reads the csv file, suppresses output information about columns
+  colnames(df) <- gsub("-", "_", colnames(df)) # R doesn't like dashes in column names, replaces "-" with "_" for all colnames of df
+  df <- df[complete.cases(df$SOMA_DISTANCE),] # removes cases where there is no soma distance data
+  df <- df[complete.cases(df$RAYBURST_VOLUME),] # removes cases where there is no data
+  df <- df[complete.cases(df$MAX_DTS),] # removes cases where there is no  data
+  df$file <- fileName # adds a column with repeated information about which file that spine came from
+  total_length <- df %>% group_by(SECTION_NUMBER)  %>% summarise(section_length=max(SECTION_LENGTH)) %>%ungroup() %>% summarise(total_length=sum(section_length)) %>% as.double() 
+  # ^^this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together
+  
+  total_spines <- as.numeric(nrow(df)) # number of rows = number of spines
+  density_overall <- total_spines/total_length  # calculate density of segment
+  density_mushroom <- sum(as.numeric(df$TYPE=="mushroom"))/total_length  # counts the number of "mushroom" rows and divides by length for mushroom density
+  density_thin <- sum(as.numeric(df$TYPE=="thin"))/total_length  # same as above, with "thin"
+  density_stubby <- sum(as.numeric(df$TYPE=="stubby"))/total_length # same as above, with "stubby"
+  df <- df[order(df$SOMA_DISTANCE),]
+  
+  data_coords <- data.frame(df$X, df$Y, df$Z)
+  
+  agn <- agnes(df$SOMA_DISTANCE,metric = "euclidean", method = "average") # runs the agnes, computes agglomerative hierarchical clustering, "average" = UPGMA
+  dist_ac <- as.matrix(dist(df$SOMA_DISTANCE))
+  df$nn_dist_ac <- apply(dist_ac,2, function(x) sort(x)[2])   # finds nearest neighbor for each spine
+  df$nn2_dist_ac <- apply(dist_ac,2, function(x) sort(x)[3])  # finds second nearest neighbor
+  df$nn3_dist_ac <- apply(dist_ac,2, function(x) sort(x)[4])  # finds third
+  dendrite_ac <- agn$ac # defines agglomeration coefficent of the dendrite
+  
+  ac_test <- list() #initializes list for ac's from next for loop  
+  possible_dist <- seq(1,total_length, by=0.01) # create list of possible soma distances by 0.01 increments to pull from
+  
+   for(k in 1:1000){
+    test_data <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines
+    test_dist <- as.matrix(dist(test_data)) #creates distance matrix for random sample
+    assign("test_cluster", agnes(test_data,metric = "euclidean", method = "average")) #runs UPGMA on sample and labels the agn output "test_cluster"
+    ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac
+  } 
+  
+  cScore_ac_1D <- sum(as.numeric(ac_test<dendrite_ac))/1000 # average (divide by 15000 samples) how many times random ac is smaller than dendrite_ac, the smaller the value, the more "clustering"
+  df$c_score_ac_1D <- cScore_ac_1D #add a row to df with the cScore of the dendrite
+  
+  
+#1D clustering (from spine attachment points)
+  dist_1D <- as.matrix(dist(df$SOMA_DISTANCE)) # creates a distance matrix using distance from soma
+  UPGMA_obsv_1D <- IdClusters(dist_1D, method = "UPGMA", cutoff=0.75, showPlot=TRUE)  #runs cluster analysis with cutoff from Yadav paper
+  UPGMA_obsv_1D$rn <- rownames(UPGMA_obsv_1D) # adds a column with row names to keep spine ID, not sure if this step is necessary
+  cluster_all_1D <- cbind(UPGMA_obsv_1D, df$SOMA_DISTANCE)
+  cluster_all_1D <- cluster_all_1D[order(cluster_all_1D$cluster),]
+  cluster_freq_1D <- table(cluster_all_1D$cluster)
+  cluster_freq_1D <- as.data.frame(cluster_freq_1D)
+  cluster_freq_1D$Freq <- as.numeric(cluster_freq_1D$Freq)
+  cluster_freq_1D$is_clustered <- as.numeric(cluster_freq_1D$Freq > 1) #if more than 1 spine, then it is in a cluster
+  is_clustered_1D <- sum(cluster_freq_1D$is_clustered)
+  num_clusters_1D <- is_clustered_1D
+  num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters
+  spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq))
+  spines_clustered_1D <- list()
+  spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[1,2])
+  
+  
+  spines_not_1D <- as.numeric(total_spines - spines_clustered_1D)
+  
+  
+  
+  
+  random_1D_all <- data.frame()
+  prob_density_test_all_1D <- data.frame()
+  
+# 1D random spines for loop
+  for(j in 1:5000){
+    random_spines_1D <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines
+    test_dist_1D <- as.matrix(dist(random_spines_1D)) #creates distance matrix for random sample
+    UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE)
+    UPGMA_test_1D$rn <- rownames(UPGMA_test_1D)
+    cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D)
+    cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),]
+    cluster_freq_test_1D <- table(cluster_all_test_1D$cluster)
+    cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D)
+    cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1)
+    is_clustered_test_1D <- sum(cluster_freq_test_1D$is_clustered)
+    num_clusters_test_1D <- is_clustered_test_1D
+    spines_is_clustered_test_1D <- cluster_freq_test_1D %>% group_by(is_clustered) %>% summarise(num_clusters_test_1D = sum(Freq))
+    spines_clustered_test_1D <- spines_is_clustered_test_1D[2,2]
+    spines_not_test_1D <- spines_is_clustered_test_1D[1,2]
+    spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0
+    random_1D <- spines_clustered_test_1D
+    random_1D[is.na(random_1D)] <- 0
+    random_1D_all <- rbind(random_1D_all, random_1D)
+    
+  } # end of random spines 1D for loop
+  
+  random_1D_all <- as.matrix(random_1D_all)
+  random_1D_all <- as.numeric(random_1D_all)
+  
+  std_test_1D <- sd(random_1D_all)
+  mean_test_1D <- mean(random_1D_all)
+  curve_dnorm_1D <- dnorm(random_1D_all, mean_test_1D, std_test_1D)
+  Cscore_1D <- pnorm(curve_dnorm_1D)
+
+  
+ # add all 1D data to all data 
+  cluster_data_1D <- data.frame()  
+  cluster_data_1D <- rbind(cluster_data_1D, num_clusters_1D)
+  cluster_data_1D <- cbind(cluster_data_1D, spines_clustered_1D)
+  cluster_data_1D <- cbind(cluster_data_1D, spines_not_1D)
+  cluster_data_1D <- cbind(cluster_data_1D, Cscore_1D)
+  cluster_data_1D[is.na(cluster_data_1D)] <- 0
+  colnames(cluster_data_1D) <- c("# of clusters - 1D", "spines clustered - 1D", "spines not clustered - 1D", "Cscore - 1D")
+  
+  
+# 3D clustering analysis  
+  dist_3D <- data.frame(dist.xyz(data_coords)) # creates distance matrix of spine head coordinates
+  UPGMA_obsv_3D <- IdClusters(dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper
+  #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1)
+  UPGMA_obsv_3D$rn <- rownames(UPGMA_obsv_3D)   #adds a column with rownows to keep spine ID, not sure if this step is neccessary
+  cluster_all_3D <- cbind(UPGMA_obsv_3D, data_coords)   #get the coordinates for each spine ID, if kept in row name/number order, will correctly correspond to each spine
+  cluster_all_3D <- cluster_all_3D[order(cluster_all_3D$cluster),] #sort data frame by cluster number
+  cluster_freq_3D <- table(cluster_all_3D$cluster) #create a table counting how many times each cluster Variable occurs (i.e. how many spines in each cluster)
+  cluster_freq_3D <- as.data.frame(cluster_freq_3D)
+  cluster_freq_3D$Freq <- as.numeric(cluster_freq_3D$Freq)
+  cluster_freq_3D$is_clustered <- as.numeric(cluster_freq_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1
+  is_clustered_3D <- sum(cluster_freq_3D$is_clustered)
+  num_clusters_3D <- is_clustered_3D
+  num_clusters_3D <- sum(cluster_freq_3D$is_clustered) #count how many 1s to determine how many clusters (spines > 1) in the segment
+  spines_in_cluster_3D <- cluster_freq_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_3D = sum(Freq))
+  spines_clustered_3D <- list()
+  spines_clustered_3D <- rbind(spines_clustered_3D, spines_in_cluster_3D[1,2])
+  spines_not_3D <- as.numeric(total_spines - spines_clustered_3D)
+ 
+  random_3D_all <- data.frame()
+  prob_density_test_all_1D <- data.frame()
+
+  
+# 3D random spines for loop
+  for(j in 1:5000){
+    test_data_X <- data.frame(sample(df$X), df$Y, df$Z) # randomize the X's, Y's, and Z's to make a "biologically plausible" dataframe.
+    colnames(test_data_X) <- c( "x", "Y", "Z")
+    test_data_Y <- data.frame(df$X, sample(df$Y), df$Z)
+    colnames(test_data_Y) <- c( "x", "Y", "Z")
+    test_data_Z <- data.frame(df$X, df$Y, sample(df$Z))
+    colnames(test_data_Z) <- c( "x", "Y", "Z")
+    test_data <- rbind(test_data_X, test_data_Y, test_data_Z)
+    test_data_final <-data.frame(sample_n(test_data, total_spines))
+    test_dist <- as.matrix(dist(test_data_final)) #creates distance matrix for random sample
+
+    UPGMA_test_3D <- IdClusters(test_dist, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper
+    #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1)
+    UPGMA_test_3D$rn <- rownames(UPGMA_test_3D)   #adds a column with rownows to keep spine ID, not sure if this step is neccessary
+    cluster_all_test_3D <- cbind(UPGMA_test_3D, test_data_final)   #get the coordinates for each spine ID, if kept in row name/number order, will correctly correspond to each spine
+    cluster_all_test_3D <- cluster_all_test_3D[order(cluster_all_test_3D$cluster),] #sort data frame by cluster number
+    cluster_freq_test_3D <- table(cluster_all_test_3D$cluster) #create a table counting how many times each cluster Variable occurs (i.e. how many spines in each cluster)
+    cluster_freq_test_3D <- as.data.frame(cluster_freq_test_3D)
+    cluster_freq_test_3D$Freq <- as.numeric(cluster_freq_test_3D$Freq) #convert factors to numbers
+    cluster_freq_test_3D$is_clustered_test <- as.numeric(cluster_freq_test_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1
+    is_clustered_test_3D <- sum(cluster_freq_test_3D$is_clustered_test)
+    num_clusters_test_3D <- is_clustered_test_3D #count how many 1s to determine how many clusters (spines > 1) in the segment
+    num_clusters_test_3D <- sum(cluster_freq_test_3D$is_clustered_test)
+    spines_in_cluster_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered_test) %>% summarise(num_clus_spines_test_3D = sum(Freq))
+    spines_clustered_test_3D <- list()
+    spines_clustered_test_3D <- rbind(spines_clustered_test_3D, spines_in_cluster_test_3D[1,2])
+    spines_not_test_3D <- as.numeric(total_spines - spines_in_cluster_test_3D)
+    spines_clustered_test_3D[is.na(spines_clustered_test_3D)] <- 0
+    random_3D <- spines_clustered_test_3D
+    random_3D[is.na(random_3D)] <- 0
+    random_3D_all <- rbind(random_3D_all, random_1D)
+    
+
+  } # end of random spines 3D for-loop
+  
+  random_3D_all <- as.matrix(random_3D_all)
+  random_3D_all <- as.numeric(random_3D_all)
+  
+  std_test_3D <- sd(random_3D_all)
+  mean_test_3D <- mean(random_3D_all)
+  curve_dnorm_3D <- dnorm(random_3D_all, mean_test_3D, std_test_3D)
+  Cscore_3D <- pnorm(curve_dnorm_3D)
+
+  # add 3D data to data frame
+  cluster_data_3D <- data.frame()
+  cluster_data_3D <- rbind(cluster_data_3D, num_clusters_3D)
+  cluster_data_3D <- cbind(cluster_data_3D, spines_clustered_3D)
+  cluster_data_3D <- cbind(cluster_data_3D, spines_not_3D)
+  cluster_data_3D <- cbind(cluster_data_3D, Cscore_3D)
+  cluster_data_3D[is.na(cluster_data_3D)] <- 0
+  colnames(cluster_data_3D) <- c("# of clusters-3D", "spines clustered-3D", "spines not clustered-3D", "Cscore-3D")
+  
+  
+  
+  df$density_overall <- density_overall # add these data to df
+  df$density_mushroom <- density_mushroom
+  df$density_thin <- density_thin
+  df$density_stubby <- density_stubby
+  data_all <- rbind(data_all, df) # adds df as next row in the data_all file
+  
+  
+  data_all <- cbind(data_all, cluster_data_1D)
+  data_all <- cbind(data_all, cluster_data_3D)
+
+  
+} # end of file for-loop 
+
+data_all$animal_num <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[2]) # this pulls out an animal number from the file number
+data_all$retro_label <- substring(data_all$animal_num, nchar(data_all$animal_num), nchar(data_all$animal_num))  #pulls off 'L' or 'N' from ID to indicate whether cell was retro-gradely labeled
+#data_all$PDB <- substring(data_all$animal_num, nchar(data_all$animal_num), nchar(data_all$animal_num))
+data_all$PDB <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[3]) # pulls dendrite location out of name and adds column
+data_all$PDB <- replace(data_all$PDB, data_all$PDB=="p", "prox")
+data_all$PDB <- replace(data_all$PDB, data_all$PDB=="d", "dist")
+data_all$PDB <- replace(data_all$PDB, data_all$PDB=="b", "basal")
+data_all$retro_label <- replace(data_all$retro_label, data_all$retro_label=="L", "labeled") 
+data_all$retro_label <- replace(data_all$retro_label, data_all$retro_label=="N", "not labeled")
+data_all$stack <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[4]) #gives letter ID of different imaging days
+data_all$stack <- unlist(data_all$stack)
+data_all$animal_num <- lapply(data_all$animal_num, function(x) unlist(strsplit(x, "L"))[1]) #removes letter from behind animal ID name
+data_all$animal_num <- lapply(data_all$animal_num, function(x) unlist(strsplit(x, "N"))[1])
+data_all$RAYBURST_VOLUME <- as.numeric(data_all$RAYBURST_VOLUME)
+data_all$MAX_DTS <- as.numeric(data_all$MAX_DTS)
+
+n <- readline(prompt="Enter group name:") # gives prompt on console to enter animal group
+data_all$group <- n # adds the group to the master data file
+
+data_all$animal_num <- unlist(data_all$animal_num) #changes animal number to a vector rather than a list, which is important for executing the following task
+data_all$PDB <- unlist(data_all$PDB) # same as above
+
+TYPE_ave <- data_all %>% group_by(group, animal_num, retro_label, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+PDB_TYPE_ave <- data_all %>% group_by(group, animal_num, stack, retro_label, PDB, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+
+PDB_ave <- data_all %>% group_by(group, animal_num, retro_label, PDB) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+# ^first groups by treatment group, then by animal, then by day of image (i.e. -F), then by dendrite location
+# summarise then finds averages for each group for all various pieces of data
+
+animal_ave <- data_all %>% group_by(group, animal_num) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+# groups by treatment then animal, then finds averages per animal for various data
+
+labeled_ave <- data_all %>% group_by(group, animal_num, retro_label) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+# groups by treatment then animal, then finds averages per animal for various data
+
+
+
+#dir.create("analysis")  #creates a directory to create a file on the computer
+#savePath <- paste(filePath,"/", "analysis", collapse = "/", sep="") # creates the path where files can be saved
+#setwd(savePath) # sets working directory to the path created above
+#write.csv(data_all, "data_all.csv")
+#write.csv(PDB_ave, "PDB averages.csv")
+#write.csv(animal_ave, "Animal Averages.csv")
+#write.csv(TYPE_ave, "spine_type_averages.csv")
+#write.csv(PDB_TYPE_ave, "PDB_spine_type_averages.csv")
+#write.csv(labeled_ave, "labeled_averages.csv")
+

From e65c4aafff26ede42b4433e534f8fc4d21e23e1d Mon Sep 17 00:00:00 2001
From: kellenett <kelle-nett@uiowa.edu>
Date: Fri, 15 Mar 2019 12:25:51 -0500
Subject: [PATCH 02/11] Update 1D clustering Yadav.R

---
 1D clustering Yadav.R | 164 +++++++++++++++++++++---------------------
 1 file changed, 84 insertions(+), 80 deletions(-)

diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R
index bba2b5b..3e6b770 100644
--- a/1D clustering Yadav.R	
+++ b/1D clustering Yadav.R	
@@ -44,12 +44,14 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   density_mushroom <- sum(as.numeric(df$TYPE=="mushroom"))/total_length  # counts the number of "mushroom" rows and divides by length for mushroom density
   density_thin <- sum(as.numeric(df$TYPE=="thin"))/total_length  # same as above, with "thin"
   density_stubby <- sum(as.numeric(df$TYPE=="stubby"))/total_length # same as above, with "stubby"
-  df <- df[order(df$SOMA_DISTANCE),]
-  
-  data_coords <- data.frame(df$X, df$Y, df$Z)
+  df <- df[order(df$SOMA_DISTANCE),]  #order data by column SOMA_DISTANCE so that coordinates pulled are in spine attachment order
   
+  data_coords <- data.frame(df$X, df$Y, df$Z) # creates data frame with spine head coordinates
+
+# Shane's original clustering analysis using spine attachment distance from soma  
+    
   agn <- agnes(df$SOMA_DISTANCE,metric = "euclidean", method = "average") # runs the agnes, computes agglomerative hierarchical clustering, "average" = UPGMA
-  dist_ac <- as.matrix(dist(df$SOMA_DISTANCE))
+  dist_ac <- as.matrix(dist(df$SOMA_DISTANCE))  #creates a distance matrix of soma distances
   df$nn_dist_ac <- apply(dist_ac,2, function(x) sort(x)[2])   # finds nearest neighbor for each spine
   df$nn2_dist_ac <- apply(dist_ac,2, function(x) sort(x)[3])  # finds second nearest neighbor
   df$nn3_dist_ac <- apply(dist_ac,2, function(x) sort(x)[4])  # finds third
@@ -65,67 +67,68 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
     ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac
   } 
   
-  cScore_ac_1D <- sum(as.numeric(ac_test<dendrite_ac))/1000 # average (divide by 15000 samples) how many times random ac is smaller than dendrite_ac, the smaller the value, the more "clustering"
+  cScore_ac_1D <- sum(as.numeric(ac_test<dendrite_ac))/1000 # average (divide by 1000 samples) how many times random ac is smaller than dendrite_ac, the smaller the value, the more "clustering"
   df$c_score_ac_1D <- cScore_ac_1D #add a row to df with the cScore of the dendrite
+#end Shane's original analysis
   
+
   
 #1D clustering (from spine attachment points)
-  dist_1D <- as.matrix(dist(df$SOMA_DISTANCE)) # creates a distance matrix using distance from soma
-  UPGMA_obsv_1D <- IdClusters(dist_1D, method = "UPGMA", cutoff=0.75, showPlot=TRUE)  #runs cluster analysis with cutoff from Yadav paper
+  dist_1D <- as.matrix(dist(df$SOMA_DISTANCE)) # creates a distance matrix using distance from soma, same as dist_ac, but keeping separate for different analysis for now
+  UPGMA_obsv_1D <- IdClusters(dist_1D, method = "UPGMA", cutoff=0.75, showPlot=TRUE)  #runs cluster analysis with cutoff from Yadav paper on observed spines
   UPGMA_obsv_1D$rn <- rownames(UPGMA_obsv_1D) # adds a column with row names to keep spine ID, not sure if this step is necessary
-  cluster_all_1D <- cbind(UPGMA_obsv_1D, df$SOMA_DISTANCE)
-  cluster_all_1D <- cluster_all_1D[order(cluster_all_1D$cluster),]
-  cluster_freq_1D <- table(cluster_all_1D$cluster)
-  cluster_freq_1D <- as.data.frame(cluster_freq_1D)
-  cluster_freq_1D$Freq <- as.numeric(cluster_freq_1D$Freq)
-  cluster_freq_1D$is_clustered <- as.numeric(cluster_freq_1D$Freq > 1) #if more than 1 spine, then it is in a cluster
-  is_clustered_1D <- sum(cluster_freq_1D$is_clustered)
-  num_clusters_1D <- is_clustered_1D
-  num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters
-  spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq))
-  spines_clustered_1D <- list()
-  spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[1,2])
-  
+  cluster_all_1D <- cbind(UPGMA_obsv_1D, df$SOMA_DISTANCE)  # adds cluster number, spine ID(row number) and corresponding distance from soma together
+  cluster_all_1D <- cluster_all_1D[order(cluster_all_1D$cluster),] #orders data by cluster number
+  cluster_freq_1D <- table(cluster_all_1D$cluster) #creates a table with how many spines are in each cluster
+  cluster_freq_1D <- as.data.frame(cluster_freq_1D) #converts above table to a data frame
+  cluster_freq_1D$Freq <- as.numeric(cluster_freq_1D$Freq) # turns the "cluster" column to a number
+  cluster_freq_1D$is_clustered <- as.numeric(cluster_freq_1D$Freq > 1) #returns 1 if there is >1 spine in a cluster and 0 if not-- therefore all 1s reflect a true "cluster" since it has more than 1 spine in ity 
+  num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment
+  spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq)) # count how many spines are in the true clusters (>1 spines) and how many are not
+  spines_clustered_1D <- list() #initialize list for number of spines that are clustered
+  spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[1,2]) #add number of spines clustered to list
+  spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix
+  spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form
   
-  spines_not_1D <- as.numeric(total_spines - spines_clustered_1D)
+  spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) #define and calculate number of spines not clustered (total spines minus number of spines clustered)
   
+  test_spines_clustered_all_1D <- data.frame() #initialize dataframe to contain # of spines in a cluster for all random samples
+  test_num_clusters_all_1D <- data.frame()
   
-  
-  
-  random_1D_all <- data.frame()
-  prob_density_test_all_1D <- data.frame()
-  
-# 1D random spines for loop
-  for(j in 1:5000){
+# 1D random spines for-loop
+  for(j in 1:100){
     random_spines_1D <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines
     test_dist_1D <- as.matrix(dist(random_spines_1D)) #creates distance matrix for random sample
-    UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE)
-    UPGMA_test_1D$rn <- rownames(UPGMA_test_1D)
-    cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D)
-    cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),]
-    cluster_freq_test_1D <- table(cluster_all_test_1D$cluster)
-    cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D)
-    cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1)
-    is_clustered_test_1D <- sum(cluster_freq_test_1D$is_clustered)
-    num_clusters_test_1D <- is_clustered_test_1D
-    spines_is_clustered_test_1D <- cluster_freq_test_1D %>% group_by(is_clustered) %>% summarise(num_clusters_test_1D = sum(Freq))
-    spines_clustered_test_1D <- spines_is_clustered_test_1D[2,2]
-    spines_not_test_1D <- spines_is_clustered_test_1D[1,2]
-    spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0
-    random_1D <- spines_clustered_test_1D
-    random_1D[is.na(random_1D)] <- 0
-    random_1D_all <- rbind(random_1D_all, random_1D)
+    UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE) #runs UPGMA with cutoff (same as with obsv but with random sample)
+    UPGMA_test_1D$rn <- rownames(UPGMA_test_1D) #add row name (spine ID) to cluster number  
+    cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D) #add cluster number/spine ID to randomly generated soma distances
+    cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),] #order by cluster number
+    cluster_freq_test_1D <- table(cluster_all_test_1D$cluster) #generate table with number of spines in each cluster  
+    cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D) #change table to data frame 
+    cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1) #ask whether cluster has >1 spines in it (1 for yes, 0 for no)
+    num_clusters_test_1D <- sum(cluster_freq_test_1D$is_clustered) # add how many 1s (or how many clusters have >1 spines) to get true number of clusters
+    num_clusters_test_1D[is.na(num_clusters_test_1D)] <- 0 #changes possible NA from 0 clusters to 0
+    test_num_clusters_all <- rbind(test_num_clusters_all, num_clusters_test_1D) #adds total number of clusters to running list
+    spines_in_cluster_test_1D <- cluster_freq_test_1D %>% group_by(is_clustered) %>% summarise(num_clusters_test_1D = sum(Freq)) #count how many spines that are in a true cluster or not
+    spines_clustered_test_1D <- spines_in_cluster_test_1D[2,2] # define how many spines are in a cluster
+    spines_not_test_1D <- as.numeric(total_spines - spines_clustered_test_1D) # calculate how many spines are not clustered
+    spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0  #returns 0 instead of Na if no spines are clustered in the random sample
+    test_spines_clustered_all_1D <- rbind(test_spines_clustered_all_1D, spines_clustered_test_1D) #add number of clustered spines to running list 
     
-  } # end of random spines 1D for loop
+  } # end of random spines 1D for-loop
   
-  random_1D_all <- as.matrix(random_1D_all)
-  random_1D_all <- as.numeric(random_1D_all)
+  test_spines_clustered_all_1D <- as.matrix(test_spines_clustered_all_1D) #changes data.frame to matrix (not sure if this is neccesary but it works)
+  test_spines_clustered_all_1D <- as.numeric(test_spines_clustered_all_1D) #changes all vales to numeric (again, not sure if neccessary)
   
-  std_test_1D <- sd(random_1D_all)
-  mean_test_1D <- mean(random_1D_all)
-  curve_dnorm_1D <- dnorm(random_1D_all, mean_test_1D, std_test_1D)
-  Cscore_1D <- pnorm(curve_dnorm_1D)
-
+  
+  std_test_1D <- sd(test_spines_clustered_all_1D) #generates STD of total number of spines clustered in entire random sample
+  mean_test_1D <- mean(test_spines_clustered_all_1D) #generates average number of spines in a cluster in random data
+  curve_dnorm_1D <- dnorm(test_spines_clustered_all_1D, mean_test_1D, std_test_1D) #gives probability density function, or height of probability distribution at each point(height = frequency)
+  std_curve_1D <- sd(curve_dnorm_1D)
+  mean_curve_1D <- mean(curve_dnorm_1D)
+  
+  Cscore_1D <- pnorm(spines_clustered_1D, mean_curve_1D, sd=sqrt(std_curve_1D)) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher
+  #therefore, the closer to 1, higher probability that a given number has more clustered spines than the random normal distribution and vice versa
   
  # add all 1D data to all data 
   cluster_data_1D <- data.frame()  
@@ -138,7 +141,7 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   
   
 # 3D clustering analysis  
-  dist_3D <- data.frame(dist.xyz(data_coords)) # creates distance matrix of spine head coordinates
+  dist_3D <- as.matrix(dist.xyz(data_coords)) # creates distance matrix of spine head coordinates
   UPGMA_obsv_3D <- IdClusters(dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper
   #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1)
   UPGMA_obsv_3D$rn <- rownames(UPGMA_obsv_3D)   #adds a column with rownows to keep spine ID, not sure if this step is neccessary
@@ -148,20 +151,21 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   cluster_freq_3D <- as.data.frame(cluster_freq_3D)
   cluster_freq_3D$Freq <- as.numeric(cluster_freq_3D$Freq)
   cluster_freq_3D$is_clustered <- as.numeric(cluster_freq_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1
-  is_clustered_3D <- sum(cluster_freq_3D$is_clustered)
-  num_clusters_3D <- is_clustered_3D
   num_clusters_3D <- sum(cluster_freq_3D$is_clustered) #count how many 1s to determine how many clusters (spines > 1) in the segment
   spines_in_cluster_3D <- cluster_freq_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_3D = sum(Freq))
   spines_clustered_3D <- list()
   spines_clustered_3D <- rbind(spines_clustered_3D, spines_in_cluster_3D[1,2])
+  spines_clustered_3D <- as.matrix(spines_clustered_3D) #conerts num of spines clustered to matrix
+  spines_clustered_3D <- as.numeric(spines_clustered_3D) #converts to numerical form
+  
   spines_not_3D <- as.numeric(total_spines - spines_clustered_3D)
- 
-  random_3D_all <- data.frame()
-  prob_density_test_all_1D <- data.frame()
+  
+  test_spines_clustered_all_3D <- data.frame()
+  test_num_clusters_all_3D <- data.frame()
 
   
 # 3D random spines for loop
-  for(j in 1:5000){
+  for(j in 1:100){
     test_data_X <- data.frame(sample(df$X), df$Y, df$Z) # randomize the X's, Y's, and Z's to make a "biologically plausible" dataframe.
     colnames(test_data_X) <- c( "x", "Y", "Z")
     test_data_Y <- data.frame(df$X, sample(df$Y), df$Z)
@@ -170,39 +174,39 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
     colnames(test_data_Z) <- c( "x", "Y", "Z")
     test_data <- rbind(test_data_X, test_data_Y, test_data_Z)
     test_data_final <-data.frame(sample_n(test_data, total_spines))
-    test_dist <- as.matrix(dist(test_data_final)) #creates distance matrix for random sample
+    test_dist_3D <- as.matrix(dist(test_data_final)) #creates distance matrix for random sample
 
-    UPGMA_test_3D <- IdClusters(test_dist, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper
+    UPGMA_test_3D <- IdClusters(test_dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper
     #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1)
     UPGMA_test_3D$rn <- rownames(UPGMA_test_3D)   #adds a column with rownows to keep spine ID, not sure if this step is neccessary
     cluster_all_test_3D <- cbind(UPGMA_test_3D, test_data_final)   #get the coordinates for each spine ID, if kept in row name/number order, will correctly correspond to each spine
     cluster_all_test_3D <- cluster_all_test_3D[order(cluster_all_test_3D$cluster),] #sort data frame by cluster number
     cluster_freq_test_3D <- table(cluster_all_test_3D$cluster) #create a table counting how many times each cluster Variable occurs (i.e. how many spines in each cluster)
     cluster_freq_test_3D <- as.data.frame(cluster_freq_test_3D)
-    cluster_freq_test_3D$Freq <- as.numeric(cluster_freq_test_3D$Freq) #convert factors to numbers
-    cluster_freq_test_3D$is_clustered_test <- as.numeric(cluster_freq_test_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1
-    is_clustered_test_3D <- sum(cluster_freq_test_3D$is_clustered_test)
-    num_clusters_test_3D <- is_clustered_test_3D #count how many 1s to determine how many clusters (spines > 1) in the segment
-    num_clusters_test_3D <- sum(cluster_freq_test_3D$is_clustered_test)
-    spines_in_cluster_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered_test) %>% summarise(num_clus_spines_test_3D = sum(Freq))
-    spines_clustered_test_3D <- list()
-    spines_clustered_test_3D <- rbind(spines_clustered_test_3D, spines_in_cluster_test_3D[1,2])
-    spines_not_test_3D <- as.numeric(total_spines - spines_in_cluster_test_3D)
+    cluster_freq_test_3D$is_clustered <- as.numeric(cluster_freq_test_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1
+    num_clusters_test_3D <- sum(cluster_freq_test_3D$is_clustered)
+    num_clusters_test_3D[is.na(num_clusters_test_3D)] <- 0
+    test_num_clusters_all_3D <- rbind(test_num_clusters_all_3D, num_clusters_test_3D)
+    spines_in_cluster_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_test_3D = sum(Freq))
+    spines_clustered_test_3D <- spines_in_cluster_test_3D[2,2]
+    
+    spines_not_test_3D <- as.numeric(spines_in_cluster_test_3D[1,2])
     spines_clustered_test_3D[is.na(spines_clustered_test_3D)] <- 0
-    random_3D <- spines_clustered_test_3D
-    random_3D[is.na(random_3D)] <- 0
-    random_3D_all <- rbind(random_3D_all, random_1D)
+    test_spines_clustered_all_3D <- rbind(test_spines_clustered_all_3D, spines_clustered_test_3D)
     
-
   } # end of random spines 3D for-loop
   
-  random_3D_all <- as.matrix(random_3D_all)
-  random_3D_all <- as.numeric(random_3D_all)
+  test_spines_clustered_all_3D <- as.matrix(test_spines_clustered_all_3D)
+  test_spines_clustered_all_3D <- as.numeric(test_spines_clustered_all_3D)
   
-  std_test_3D <- sd(random_3D_all)
-  mean_test_3D <- mean(random_3D_all)
-  curve_dnorm_3D <- dnorm(random_3D_all, mean_test_3D, std_test_3D)
-  Cscore_3D <- pnorm(curve_dnorm_3D)
+  std_test_3D <- sd(test_spines_clustered_all_3D)
+  mean_test_3D <- mean(test_spines_clustered_all_3D)
+  curve_dnorm_3D <- dnorm(test_spines_clustered_all_3D, mean_test_3D, std_test_3D)
+  std_curve_3D <- sd(curve_dnorm_3D)
+  mean_curve_3D <- mean(curve_dnorm_3D)
+  
+  Cscore_3D <- pnorm(spines_clustered_3D, mean_curve_3D, sd=sqrt(std_curve_3D))
+
 
   # add 3D data to data frame
   cluster_data_3D <- data.frame()

From a94a6ac34548d3e4e0c2065a47da56b8da585b45 Mon Sep 17 00:00:00 2001
From: kellenett <kelle-nett@uiowa.edu>
Date: Fri, 15 Mar 2019 14:12:39 -0500
Subject: [PATCH 03/11] Update 1D clustering Yadav.R

---
 1D clustering Yadav.R | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R
index 3e6b770..9550389 100644
--- a/1D clustering Yadav.R	
+++ b/1D clustering Yadav.R	
@@ -60,14 +60,14 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   ac_test <- list() #initializes list for ac's from next for loop  
   possible_dist <- seq(1,total_length, by=0.01) # create list of possible soma distances by 0.01 increments to pull from
   
-   for(k in 1:1000){
+   for(k in 1:5000){
     test_data <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines
     test_dist <- as.matrix(dist(test_data)) #creates distance matrix for random sample
     assign("test_cluster", agnes(test_data,metric = "euclidean", method = "average")) #runs UPGMA on sample and labels the agn output "test_cluster"
     ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac
   } 
   
-  cScore_ac_1D <- sum(as.numeric(ac_test<dendrite_ac))/1000 # average (divide by 1000 samples) how many times random ac is smaller than dendrite_ac, the smaller the value, the more "clustering"
+  cScore_ac_1D <- sum(as.numeric(ac_test<dendrite_ac))/5000 # average (divide by 1000 samples) how many times random ac is smaller than dendrite_ac, the smaller the value, the more "clustering"
   df$c_score_ac_1D <- cScore_ac_1D #add a row to df with the cScore of the dendrite
 #end Shane's original analysis
   
@@ -96,7 +96,7 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   test_num_clusters_all_1D <- data.frame()
   
 # 1D random spines for-loop
-  for(j in 1:100){
+  for(j in 1:500){
     random_spines_1D <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines
     test_dist_1D <- as.matrix(dist(random_spines_1D)) #creates distance matrix for random sample
     UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE) #runs UPGMA with cutoff (same as with obsv but with random sample)
@@ -108,7 +108,7 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
     cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1) #ask whether cluster has >1 spines in it (1 for yes, 0 for no)
     num_clusters_test_1D <- sum(cluster_freq_test_1D$is_clustered) # add how many 1s (or how many clusters have >1 spines) to get true number of clusters
     num_clusters_test_1D[is.na(num_clusters_test_1D)] <- 0 #changes possible NA from 0 clusters to 0
-    test_num_clusters_all <- rbind(test_num_clusters_all, num_clusters_test_1D) #adds total number of clusters to running list
+    test_num_clusters_all_1D <- rbind(test_num_clusters_all_1D, num_clusters_test_1D) #adds total number of clusters to running list
     spines_in_cluster_test_1D <- cluster_freq_test_1D %>% group_by(is_clustered) %>% summarise(num_clusters_test_1D = sum(Freq)) #count how many spines that are in a true cluster or not
     spines_clustered_test_1D <- spines_in_cluster_test_1D[2,2] # define how many spines are in a cluster
     spines_not_test_1D <- as.numeric(total_spines - spines_clustered_test_1D) # calculate how many spines are not clustered
@@ -196,8 +196,8 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
     
   } # end of random spines 3D for-loop
   
-  test_spines_clustered_all_3D <- as.matrix(test_spines_clustered_all_3D)
-  test_spines_clustered_all_3D <- as.numeric(test_spines_clustered_all_3D)
+  test_spines_clustered_all_3D <- as.numeric(as.matrix(test_spines_clustered_all_3D))
+
   
   std_test_3D <- sd(test_spines_clustered_all_3D)
   mean_test_3D <- mean(test_spines_clustered_all_3D)
@@ -205,7 +205,7 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   std_curve_3D <- sd(curve_dnorm_3D)
   mean_curve_3D <- mean(curve_dnorm_3D)
   
-  Cscore_3D <- pnorm(spines_clustered_3D, mean_curve_3D, sd=sqrt(std_curve_3D))
+  Cscore_3D <- pnorm(spines_clustered_3D, mean_curve_3D, std_curve_3D)
 
 
   # add 3D data to data frame

From 130ce90b03e48e77caaefc808b5124bc2131b962 Mon Sep 17 00:00:00 2001
From: kellenett <kelle-nett@uiowa.edu>
Date: Fri, 15 Mar 2019 15:20:00 -0500
Subject: [PATCH 04/11] Update 1D clustering Yadav.R

---
 1D clustering Yadav.R | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R
index 9550389..14d68ef 100644
--- a/1D clustering Yadav.R	
+++ b/1D clustering Yadav.R	
@@ -86,10 +86,10 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment
   spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq)) # count how many spines are in the true clusters (>1 spines) and how many are not
   spines_clustered_1D <- list() #initialize list for number of spines that are clustered
-  spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[1,2]) #add number of spines clustered to list
+  spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[2,2]) #add number of spines clustered to list
   spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix
   spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form
-  
+  spines_clustered_1D[is.na(spines_clustered_1D)] <- 0
   spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) #define and calculate number of spines not clustered (total spines minus number of spines clustered)
   
   test_spines_clustered_all_1D <- data.frame() #initialize dataframe to contain # of spines in a cluster for all random samples
@@ -123,11 +123,12 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   
   std_test_1D <- sd(test_spines_clustered_all_1D) #generates STD of total number of spines clustered in entire random sample
   mean_test_1D <- mean(test_spines_clustered_all_1D) #generates average number of spines in a cluster in random data
+ 
   curve_dnorm_1D <- dnorm(test_spines_clustered_all_1D, mean_test_1D, std_test_1D) #gives probability density function, or height of probability distribution at each point(height = frequency)
   std_curve_1D <- sd(curve_dnorm_1D)
   mean_curve_1D <- mean(curve_dnorm_1D)
   
-  Cscore_1D <- pnorm(spines_clustered_1D, mean_curve_1D, sd=sqrt(std_curve_1D)) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher
+  Cscore_1D <- pnorm(spines_clustered_1D, mean_test_1D, std_test_1D) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher
   #therefore, the closer to 1, higher probability that a given number has more clustered spines than the random normal distribution and vice versa
   
  # add all 1D data to all data 
@@ -154,7 +155,7 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   num_clusters_3D <- sum(cluster_freq_3D$is_clustered) #count how many 1s to determine how many clusters (spines > 1) in the segment
   spines_in_cluster_3D <- cluster_freq_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_3D = sum(Freq))
   spines_clustered_3D <- list()
-  spines_clustered_3D <- rbind(spines_clustered_3D, spines_in_cluster_3D[1,2])
+  spines_clustered_3D <- rbind(spines_clustered_3D, spines_in_cluster_3D[2,2])
   spines_clustered_3D <- as.matrix(spines_clustered_3D) #conerts num of spines clustered to matrix
   spines_clustered_3D <- as.numeric(spines_clustered_3D) #converts to numerical form
   
@@ -205,7 +206,7 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   std_curve_3D <- sd(curve_dnorm_3D)
   mean_curve_3D <- mean(curve_dnorm_3D)
   
-  Cscore_3D <- pnorm(spines_clustered_3D, mean_curve_3D, std_curve_3D)
+  Cscore_3D <- pnorm(spines_clustered_3D, mean_test_3D, std_test_3D)
 
 
   # add 3D data to data frame

From d8dcceed53b807567ea5a2fcf84a0e9295f96401 Mon Sep 17 00:00:00 2001
From: James Kent <james-kent@uiowa.edu>
Date: Mon, 18 Mar 2019 10:21:20 -0500
Subject: [PATCH 05/11] Apply suggestions from code review

Co-Authored-By: kellnett <44405714+kellnett@users.noreply.github.com>
---
 1D clustering Yadav.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R
index 14d68ef..20032d2 100644
--- a/1D clustering Yadav.R	
+++ b/1D clustering Yadav.R	
@@ -46,7 +46,7 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   density_stubby <- sum(as.numeric(df$TYPE=="stubby"))/total_length # same as above, with "stubby"
   df <- df[order(df$SOMA_DISTANCE),]  #order data by column SOMA_DISTANCE so that coordinates pulled are in spine attachment order
   
-  data_coords <- data.frame(df$X, df$Y, df$Z) # creates data frame with spine head coordinates
+data_coords <- df[c("X", "Y", "Z")] # creates data frame with spine head coordinates
 
 # Shane's original clustering analysis using spine attachment distance from soma  
     
@@ -86,7 +86,7 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment
   spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq)) # count how many spines are in the true clusters (>1 spines) and how many are not
   spines_clustered_1D <- list() #initialize list for number of spines that are clustered
-  spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[2,2]) #add number of spines clustered to list
+  spines_clustered_1D <- spines_in_cluster_1D[[2,2]] #add number of spines clustered to list
   spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix
   spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form
   spines_clustered_1D[is.na(spines_clustered_1D)] <- 0
@@ -142,7 +142,7 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   
   
 # 3D clustering analysis  
-  dist_3D <- as.matrix(dist.xyz(data_coords)) # creates distance matrix of spine head coordinates
+  dist_3D <- as.matrix(dist(data_coords, diag=TRUE, upper=TRUE)) # creates distance matrix of spine head coordinates
   UPGMA_obsv_3D <- IdClusters(dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper
   #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1)
   UPGMA_obsv_3D$rn <- rownames(UPGMA_obsv_3D)   #adds a column with rownows to keep spine ID, not sure if this step is neccessary

From 68e60c2ea0d0f458e25ef15062566d4d79cc905b Mon Sep 17 00:00:00 2001
From: kellenett <kelle-nett@uiowa.edu>
Date: Mon, 25 Mar 2019 13:10:07 -0500
Subject: [PATCH 06/11] Update 1D clustering Yadav.R

---
 1D clustering Yadav.R | 175 ++++++++++++++++++++----------------------
 1 file changed, 84 insertions(+), 91 deletions(-)

diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R
index 14d68ef..780217b 100644
--- a/1D clustering Yadav.R	
+++ b/1D clustering Yadav.R	
@@ -9,6 +9,7 @@ library(latticeExtra)
 library(cluster)
 library(stats)
 library(readr)
+library(tidyr)
 library(bio3d) #install.packages("bio3d", dependencies = TRUE)
 library(DECIPHER)
 #how to install DECIPHER - a is enter in response to prompt for "all"
@@ -28,17 +29,15 @@ data_all <- data.frame()  # initializes a data.frame that will store all of the
 
 
 for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
-  fileName <- unlist(strsplit(fileList[1], "[.]"))[1]   # removes ".csv" from file name
-  fileLoc <- file.path(filePath, fileList[1])           # creates a path to each file (differs from FileChosen as the loop runs)
+  fileName <- unlist(strsplit(fileList[i], "[.]"))[1]   # removes ".csv" from file name
+  fileLoc <- file.path(filePath, fileList[i])           # creates a path to each file (differs from FileChosen as the loop runs)
   df <- read_csv(fileLoc, col_types = cols()) # reads the csv file, suppresses output information about columns
   colnames(df) <- gsub("-", "_", colnames(df)) # R doesn't like dashes in column names, replaces "-" with "_" for all colnames of df
   df <- df[complete.cases(df$SOMA_DISTANCE),] # removes cases where there is no soma distance data
   df <- df[complete.cases(df$RAYBURST_VOLUME),] # removes cases where there is no data
   df <- df[complete.cases(df$MAX_DTS),] # removes cases where there is no  data
   df$file <- fileName # adds a column with repeated information about which file that spine came from
-  total_length <- df %>% group_by(SECTION_NUMBER)  %>% summarise(section_length=max(SECTION_LENGTH)) %>%ungroup() %>% summarise(total_length=sum(section_length)) %>% as.double() 
-  # ^^this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together
-  
+  total_length <- df %>% group_by(SECTION_NUMBER)  %>% summarise(section_length=max(SECTION_LENGTH)) %>%ungroup() %>% summarise(total_length=sum(section_length)) %>% as.double() #this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together
   total_spines <- as.numeric(nrow(df)) # number of rows = number of spines
   density_overall <- total_spines/total_length  # calculate density of segment
   density_mushroom <- sum(as.numeric(df$TYPE=="mushroom"))/total_length  # counts the number of "mushroom" rows and divides by length for mushroom density
@@ -60,85 +59,85 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   ac_test <- list() #initializes list for ac's from next for loop  
   possible_dist <- seq(1,total_length, by=0.01) # create list of possible soma distances by 0.01 increments to pull from
   
-   for(k in 1:5000){
-    test_data <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines
-    test_dist <- as.matrix(dist(test_data)) #creates distance matrix for random sample
-    assign("test_cluster", agnes(test_data,metric = "euclidean", method = "average")) #runs UPGMA on sample and labels the agn output "test_cluster"
-    ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac
-  } 
+ #  for(k in 1:5000){
+  #  test_data <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines
+  #  test_dist <- as.matrix(dist(test_data)) #creates distance matrix for random sample
+  #  assign("test_cluster", agnes(test_data,metric = "euclidean", method = "average")) #runs UPGMA on sample and labels the agn output "test_cluster"
+   # ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac
+ # } 
   
-  cScore_ac_1D <- sum(as.numeric(ac_test<dendrite_ac))/5000 # average (divide by 1000 samples) how many times random ac is smaller than dendrite_ac, the smaller the value, the more "clustering"
-  df$c_score_ac_1D <- cScore_ac_1D #add a row to df with the cScore of the dendrite
+#  cScore_ac_1D <- sum(as.numeric(ac_test<dendrite_ac))/5000 # average (divide by 1000 samples) how many times random ac is smaller than dendrite_ac, the smaller the value, the more "clustering"
+ # df$c_score_ac_1D <- cScore_ac_1D #add a row to df with the cScore of the dendrite
 #end Shane's original analysis
   
 
   
 #1D clustering (from spine attachment points)
-  dist_1D <- as.matrix(dist(df$SOMA_DISTANCE)) # creates a distance matrix using distance from soma, same as dist_ac, but keeping separate for different analysis for now
-  UPGMA_obsv_1D <- IdClusters(dist_1D, method = "UPGMA", cutoff=0.75, showPlot=TRUE)  #runs cluster analysis with cutoff from Yadav paper on observed spines
-  UPGMA_obsv_1D$rn <- rownames(UPGMA_obsv_1D) # adds a column with row names to keep spine ID, not sure if this step is necessary
-  cluster_all_1D <- cbind(UPGMA_obsv_1D, df$SOMA_DISTANCE)  # adds cluster number, spine ID(row number) and corresponding distance from soma together
-  cluster_all_1D <- cluster_all_1D[order(cluster_all_1D$cluster),] #orders data by cluster number
-  cluster_freq_1D <- table(cluster_all_1D$cluster) #creates a table with how many spines are in each cluster
-  cluster_freq_1D <- as.data.frame(cluster_freq_1D) #converts above table to a data frame
-  cluster_freq_1D$Freq <- as.numeric(cluster_freq_1D$Freq) # turns the "cluster" column to a number
-  cluster_freq_1D$is_clustered <- as.numeric(cluster_freq_1D$Freq > 1) #returns 1 if there is >1 spine in a cluster and 0 if not-- therefore all 1s reflect a true "cluster" since it has more than 1 spine in ity 
-  num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment
-  spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq)) # count how many spines are in the true clusters (>1 spines) and how many are not
-  spines_clustered_1D <- list() #initialize list for number of spines that are clustered
-  spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[2,2]) #add number of spines clustered to list
-  spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix
-  spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form
-  spines_clustered_1D[is.na(spines_clustered_1D)] <- 0
-  spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) #define and calculate number of spines not clustered (total spines minus number of spines clustered)
+#  dist_1D <- as.matrix(dist(df$SOMA_DISTANCE)) # creates a distance matrix using distance from soma, same as dist_ac, but keeping separate for different analysis for now
+#  UPGMA_obsv_1D <- IdClusters(dist_1D, method = "UPGMA", cutoff=0.75, showPlot=TRUE)  #runs cluster analysis with cutoff from Yadav paper on observed spines
+#  UPGMA_obsv_1D$rn <- rownames(UPGMA_obsv_1D) # adds a column with row names to keep spine ID, not sure if this step is necessary
+#  cluster_all_1D <- cbind(UPGMA_obsv_1D, df$SOMA_DISTANCE)  # adds cluster number, spine ID(row number) and corresponding distance from soma together
+#  cluster_all_1D <- cluster_all_1D[order(cluster_all_1D$cluster),] #orders data by cluster number
+#  cluster_freq_1D <- table(cluster_all_1D$cluster) #creates a table with how many spines are in each cluster
+#  cluster_freq_1D <- as.data.frame(cluster_freq_1D) #converts above table to a data frame
+#  cluster_freq_1D$Freq <- as.numeric(cluster_freq_1D$Freq) # turns the "cluster" column to a number
+#  cluster_freq_1D$is_clustered <- as.numeric(cluster_freq_1D$Freq > 1) #returns 1 if there is >1 spine in a cluster and 0 if not-- therefore all 1s reflect a true "cluster" since it has more than 1 spine in ity 
+#  num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment
   
-  test_spines_clustered_all_1D <- data.frame() #initialize dataframe to contain # of spines in a cluster for all random samples
-  test_num_clusters_all_1D <- data.frame()
+#  spines_clustered_1D <- sum(cluster_freq_1D$Freq>1)
+
+#  spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix
+#  spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form
+#  spines_clustered_1D[is.na(spines_clustered_1D)] <- 0
+#  spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) #define and calculate number of spines not clustered (total spines minus number of spines clustered)
+#  
+#  test_spines_clustered_all_1D <- data.frame() #initialize dataframe to contain # of spines in a cluster for all random samples
+#  test_num_clusters_all_1D <- data.frame()
   
 # 1D random spines for-loop
-  for(j in 1:500){
-    random_spines_1D <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines
-    test_dist_1D <- as.matrix(dist(random_spines_1D)) #creates distance matrix for random sample
-    UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE) #runs UPGMA with cutoff (same as with obsv but with random sample)
-    UPGMA_test_1D$rn <- rownames(UPGMA_test_1D) #add row name (spine ID) to cluster number  
-    cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D) #add cluster number/spine ID to randomly generated soma distances
-    cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),] #order by cluster number
-    cluster_freq_test_1D <- table(cluster_all_test_1D$cluster) #generate table with number of spines in each cluster  
-    cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D) #change table to data frame 
-    cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1) #ask whether cluster has >1 spines in it (1 for yes, 0 for no)
-    num_clusters_test_1D <- sum(cluster_freq_test_1D$is_clustered) # add how many 1s (or how many clusters have >1 spines) to get true number of clusters
-    num_clusters_test_1D[is.na(num_clusters_test_1D)] <- 0 #changes possible NA from 0 clusters to 0
-    test_num_clusters_all_1D <- rbind(test_num_clusters_all_1D, num_clusters_test_1D) #adds total number of clusters to running list
-    spines_in_cluster_test_1D <- cluster_freq_test_1D %>% group_by(is_clustered) %>% summarise(num_clusters_test_1D = sum(Freq)) #count how many spines that are in a true cluster or not
-    spines_clustered_test_1D <- spines_in_cluster_test_1D[2,2] # define how many spines are in a cluster
-    spines_not_test_1D <- as.numeric(total_spines - spines_clustered_test_1D) # calculate how many spines are not clustered
-    spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0  #returns 0 instead of Na if no spines are clustered in the random sample
-    test_spines_clustered_all_1D <- rbind(test_spines_clustered_all_1D, spines_clustered_test_1D) #add number of clustered spines to running list 
-    
-  } # end of random spines 1D for-loop
+#  for(j in 1:10000){
+#    random_spines_1D <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines
+#    test_dist_1D <- as.matrix(dist(random_spines_1D)) #creates distance matrix for random sample
+#    UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE) #runs UPGMA with cutoff (same as with obsv but with random sample)
+#   UPGMA_test_1D$rn <- rownames(UPGMA_test_1D) #add row name (spine ID) to cluster number  
+#    cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D) #add cluster number/spine ID to randomly generated soma distances
+#   cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),] #order by cluster number
+#    cluster_freq_test_1D <- table(cluster_all_test_1D$cluster) #generate table with number of spines in each cluster  
+#    cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D) #change table to data frame 
+#    cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1) #ask whether cluster has >1 spines in it (1 for yes, 0 for no)
+#   num_clusters_test_1D <- sum(cluster_freq_test_1D$is_clustered) # add how many 1s (or how many clusters have >1 spines) to get true number of clusters
+#    num_clusters_test_1D[is.na(num_clusters_test_1D)] <- 0 #changes possible NA from 0 clusters to 0
+#    test_num_clusters_all_1D <- rbind(test_num_clusters_all_1D, num_clusters_test_1D) #adds total number of clusters to running list
+#    spines_clustered_test_1D <- sum(cluster_freq_test_1D$Freq>1)
+#
+ #   spines_not_test_1D <- as.numeric(total_spines - spines_clustered_test_1D) # calculate how many spines are not clustered
+#    spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0  #returns 0 instead of Na if no spines are clustered in the random sample
+#    test_spines_clustered_all_1D <- rbind(test_spines_clustered_all_1D, spines_clustered_test_1D) #add number of clustered spines to running list 
+#    
+#  } # end of random spines 1D for-loop
   
-  test_spines_clustered_all_1D <- as.matrix(test_spines_clustered_all_1D) #changes data.frame to matrix (not sure if this is neccesary but it works)
-  test_spines_clustered_all_1D <- as.numeric(test_spines_clustered_all_1D) #changes all vales to numeric (again, not sure if neccessary)
+#  test_spines_clustered_all_1D <- as.matrix(test_spines_clustered_all_1D) #changes data.frame to matrix (not sure if this is neccesary but it works)
+#  test_spines_clustered_all_1D <- as.numeric(test_spines_clustered_all_1D) #changes all vales to numeric (again, not sure if neccessary)
+#  
   
+#  std_test_1D <- sd(test_spines_clustered_all_1D) #generates STD of total number of spines clustered in entire random sample
+#  mean_test_1D <- mean(test_spines_clustered_all_1D) #generates average number of spines in a cluster in random data
+# 
+#  curve_dnorm_1D <- dnorm(test_spines_clustered_all_1D, mean_test_1D, std_test_1D) #gives probability density function, or height of probability distribution at each point(height = frequency)
+#  std_curve_1D <- sd(curve_dnorm_1D)
+#  mean_curve_1D <- mean(curve_dnorm_1D)
   
-  std_test_1D <- sd(test_spines_clustered_all_1D) #generates STD of total number of spines clustered in entire random sample
-  mean_test_1D <- mean(test_spines_clustered_all_1D) #generates average number of spines in a cluster in random data
- 
-  curve_dnorm_1D <- dnorm(test_spines_clustered_all_1D, mean_test_1D, std_test_1D) #gives probability density function, or height of probability distribution at each point(height = frequency)
-  std_curve_1D <- sd(curve_dnorm_1D)
-  mean_curve_1D <- mean(curve_dnorm_1D)
-  
-  Cscore_1D <- pnorm(spines_clustered_1D, mean_test_1D, std_test_1D) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher
+#  Cscore_1D <- pnorm(spines_clustered_1D, mean_test_1D, std_test_1D) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher
   #therefore, the closer to 1, higher probability that a given number has more clustered spines than the random normal distribution and vice versa
   
  # add all 1D data to all data 
-  cluster_data_1D <- data.frame()  
-  cluster_data_1D <- rbind(cluster_data_1D, num_clusters_1D)
-  cluster_data_1D <- cbind(cluster_data_1D, spines_clustered_1D)
-  cluster_data_1D <- cbind(cluster_data_1D, spines_not_1D)
-  cluster_data_1D <- cbind(cluster_data_1D, Cscore_1D)
-  cluster_data_1D[is.na(cluster_data_1D)] <- 0
-  colnames(cluster_data_1D) <- c("# of clusters - 1D", "spines clustered - 1D", "spines not clustered - 1D", "Cscore - 1D")
+#  cluster_data_1D <- data.frame()  
+#  cluster_data_1D <- rbind(cluster_data_1D, num_clusters_1D)
+#  cluster_data_1D <- cbind(cluster_data_1D, spines_clustered_1D)
+#  cluster_data_1D <- cbind(cluster_data_1D, spines_not_1D)
+#  cluster_data_1D <- cbind(cluster_data_1D, Cscore_1D)
+#  cluster_data_1D[is.na(cluster_data_1D)] <- 0
+#  colnames(cluster_data_1D) <- c("# of clusters - 1D", "spines clustered - 1D", "spines not clustered - 1D", "Cscore - 1D")
   
   
 # 3D clustering analysis  
@@ -153,12 +152,12 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   cluster_freq_3D$Freq <- as.numeric(cluster_freq_3D$Freq)
   cluster_freq_3D$is_clustered <- as.numeric(cluster_freq_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1
   num_clusters_3D <- sum(cluster_freq_3D$is_clustered) #count how many 1s to determine how many clusters (spines > 1) in the segment
-  spines_in_cluster_3D <- cluster_freq_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_3D = sum(Freq))
-  spines_clustered_3D <- list()
-  spines_clustered_3D <- rbind(spines_clustered_3D, spines_in_cluster_3D[2,2])
-  spines_clustered_3D <- as.matrix(spines_clustered_3D) #conerts num of spines clustered to matrix
+  spines_clustered_3D <- cluster_freq_3D %>% 
+    filter(is_clustered == 1) %>% 
+    summarise(spines_clustered_3D = sum(Freq))
+  null_length <- nrow(spines_clustered_3D)
+  ifelse(null_length==0, spines_clustered_3D <- 0, NA)
   spines_clustered_3D <- as.numeric(spines_clustered_3D) #converts to numerical form
-  
   spines_not_3D <- as.numeric(total_spines - spines_clustered_3D)
   
   test_spines_clustered_all_3D <- data.frame()
@@ -166,7 +165,7 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
 
   
 # 3D random spines for loop
-  for(j in 1:100){
+  for(j in 1:10){
     test_data_X <- data.frame(sample(df$X), df$Y, df$Z) # randomize the X's, Y's, and Z's to make a "biologically plausible" dataframe.
     colnames(test_data_X) <- c( "x", "Y", "Z")
     test_data_Y <- data.frame(df$X, sample(df$Y), df$Z)
@@ -188,15 +187,16 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
     num_clusters_test_3D <- sum(cluster_freq_test_3D$is_clustered)
     num_clusters_test_3D[is.na(num_clusters_test_3D)] <- 0
     test_num_clusters_all_3D <- rbind(test_num_clusters_all_3D, num_clusters_test_3D)
-    spines_in_cluster_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_test_3D = sum(Freq))
-    spines_clustered_test_3D <- spines_in_cluster_test_3D[2,2]
+    spines_clustered_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered) %>% filter(is_clustered == 1) %>% summarise(spines_clustered_test_3D = sum(Freq))
+    spines_not_test_3D <- as.numeric(total_spines - spines_clustered_test_3D)
+    
     
-    spines_not_test_3D <- as.numeric(spines_in_cluster_test_3D[1,2])
     spines_clustered_test_3D[is.na(spines_clustered_test_3D)] <- 0
     test_spines_clustered_all_3D <- rbind(test_spines_clustered_all_3D, spines_clustered_test_3D)
     
   } # end of random spines 3D for-loop
   
+  
   test_spines_clustered_all_3D <- as.numeric(as.matrix(test_spines_clustered_all_3D))
 
   
@@ -207,19 +207,12 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   mean_curve_3D <- mean(curve_dnorm_3D)
   
   Cscore_3D <- pnorm(spines_clustered_3D, mean_test_3D, std_test_3D)
-
-
-  # add 3D data to data frame
-  cluster_data_3D <- data.frame()
-  cluster_data_3D <- rbind(cluster_data_3D, num_clusters_3D)
-  cluster_data_3D <- cbind(cluster_data_3D, spines_clustered_3D)
-  cluster_data_3D <- cbind(cluster_data_3D, spines_not_3D)
-  cluster_data_3D <- cbind(cluster_data_3D, Cscore_3D)
-  cluster_data_3D[is.na(cluster_data_3D)] <- 0
-  colnames(cluster_data_3D) <- c("# of clusters-3D", "spines clustered-3D", "spines not clustered-3D", "Cscore-3D")
-  
   
   
+  df$num_clusters_3D <- num_clusters_3D
+  df$spines_clustered_3D <- spines_clustered_3D
+  df$spines_not_3D <- spines_not_3D
+  df$Cscore_3D <- Cscore_3D
   df$density_overall <- density_overall # add these data to df
   df$density_mushroom <- density_mushroom
   df$density_thin <- density_thin
@@ -227,8 +220,8 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   data_all <- rbind(data_all, df) # adds df as next row in the data_all file
   
   
-  data_all <- cbind(data_all, cluster_data_1D)
-  data_all <- cbind(data_all, cluster_data_3D)
+#  data_all <- cbind(data_all, cluster_data_1D)
+#  data_all <- cbind(data_all, cluster_data_3D)
 
   
 } # end of file for-loop 

From 595d51dfcfcc6c1c2ba1e35a190a1f45f177a60d Mon Sep 17 00:00:00 2001
From: kellenett <kelle-nett@uiowa.edu>
Date: Mon, 25 Mar 2019 13:20:20 -0500
Subject: [PATCH 07/11] Create 3D clustering.R

---
 3D clustering.R | 275 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 275 insertions(+)
 create mode 100644 3D clustering.R

diff --git a/3D clustering.R b/3D clustering.R
new file mode 100644
index 0000000..7785ed8
--- /dev/null
+++ b/3D clustering.R	
@@ -0,0 +1,275 @@
+# modeling Yadav et al. 2012 cluster analysis; tailored towards clustering of attachment points
+# initially coded by S. Johnson- updates and 3D analysis of spine heads added by K. Nett
+
+
+library(dplyr)          # to install these , enter install.packages("package name") into the console in RStudio. replace package name with each package name in quotes, ie install.packages("dplyr")
+library(ggplot2)
+library(lattice)
+library(latticeExtra)
+library(cluster)
+library(stats)
+library(readr)
+library(tidyr)
+library(bio3d) #install.packages("bio3d", dependencies = TRUE)
+library(DECIPHER)
+#how to install DECIPHER - a is enter in response to prompt for "all"
+#source("https://bioconductor.org/biocLite.R")
+#biocLite("DECIPHER")
+#a
+
+
+fileChosen <- file.choose()     # opens file dialog to open 
+filePath <- dirname(fileChosen) # gets the directory name that that file is in
+setwd(filePath)           # sets the working directory. important for saving files later
+fileList <- list.files(filePath)  # a list of files in the directory to go through
+fileList <- grep(".csv", fileList, value = TRUE) # finds only lists .csv files in case there are other types
+data_all <- data.frame()  # initializes a data.frame that will store all of the data for the experiment
+
+
+
+
+for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
+  fileName <- unlist(strsplit(fileList[i], "[.]"))[1]   # removes ".csv" from file name
+  fileLoc <- file.path(filePath, fileList[i])           # creates a path to each file (differs from FileChosen as the loop runs)
+  df <- read_csv(fileLoc, col_types = cols()) # reads the csv file, suppresses output information about columns
+  colnames(df) <- gsub("-", "_", colnames(df)) # R doesn't like dashes in column names, replaces "-" with "_" for all colnames of df
+  df <- df[complete.cases(df$SOMA_DISTANCE),] # removes cases where there is no soma distance data
+  df <- df[complete.cases(df$RAYBURST_VOLUME),] # removes cases where there is no data
+  df <- df[complete.cases(df$MAX_DTS),] # removes cases where there is no  data
+  df$file <- fileName # adds a column with repeated information about which file that spine came from
+  total_length <- df %>% group_by(SECTION_NUMBER)  %>% summarise(section_length=max(SECTION_LENGTH)) %>%ungroup() %>% summarise(total_length=sum(section_length)) %>% as.double() #this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together
+  total_spines <- as.numeric(nrow(df)) # number of rows = number of spines
+  density_overall <- total_spines/total_length  # calculate density of segment
+  density_mushroom <- sum(as.numeric(df$TYPE=="mushroom"))/total_length  # counts the number of "mushroom" rows and divides by length for mushroom density
+  density_thin <- sum(as.numeric(df$TYPE=="thin"))/total_length  # same as above, with "thin"
+  density_stubby <- sum(as.numeric(df$TYPE=="stubby"))/total_length # same as above, with "stubby"
+  df <- df[order(df$SOMA_DISTANCE),]  #order data by column SOMA_DISTANCE so that coordinates pulled are in spine attachment order
+  
+  data_coords <- data.frame(df$X, df$Y, df$Z) # creates data frame with spine head coordinates
+
+# Shane's original clustering analysis using spine attachment distance from soma  
+    
+  agn <- agnes(df$SOMA_DISTANCE,metric = "euclidean", method = "average") # runs the agnes, computes agglomerative hierarchical clustering, "average" = UPGMA
+  dist_ac <- as.matrix(dist(df$SOMA_DISTANCE))  #creates a distance matrix of soma distances
+  df$nn_dist_ac <- apply(dist_ac,2, function(x) sort(x)[2])   # finds nearest neighbor for each spine
+  df$nn2_dist_ac <- apply(dist_ac,2, function(x) sort(x)[3])  # finds second nearest neighbor
+  df$nn3_dist_ac <- apply(dist_ac,2, function(x) sort(x)[4])  # finds third
+  dendrite_ac <- agn$ac # defines agglomeration coefficent of the dendrite
+  
+  ac_test <- list() #initializes list for ac's from next for loop  
+  possible_dist <- seq(1,total_length, by=0.01) # create list of possible soma distances by 0.01 increments to pull from
+  
+ #  for(k in 1:5000){
+  #  test_data <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines
+  #  test_dist <- as.matrix(dist(test_data)) #creates distance matrix for random sample
+  #  assign("test_cluster", agnes(test_data,metric = "euclidean", method = "average")) #runs UPGMA on sample and labels the agn output "test_cluster"
+   # ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac
+ # } 
+  
+#  cScore_ac_1D <- sum(as.numeric(ac_test<dendrite_ac))/5000 # average (divide by 1000 samples) how many times random ac is smaller than dendrite_ac, the smaller the value, the more "clustering"
+ # df$c_score_ac_1D <- cScore_ac_1D #add a row to df with the cScore of the dendrite
+#end Shane's original analysis
+  
+
+  
+#1D clustering (from spine attachment points)
+#  dist_1D <- as.matrix(dist(df$SOMA_DISTANCE)) # creates a distance matrix using distance from soma, same as dist_ac, but keeping separate for different analysis for now
+#  UPGMA_obsv_1D <- IdClusters(dist_1D, method = "UPGMA", cutoff=0.75, showPlot=TRUE)  #runs cluster analysis with cutoff from Yadav paper on observed spines
+#  UPGMA_obsv_1D$rn <- rownames(UPGMA_obsv_1D) # adds a column with row names to keep spine ID, not sure if this step is necessary
+#  cluster_all_1D <- cbind(UPGMA_obsv_1D, df$SOMA_DISTANCE)  # adds cluster number, spine ID(row number) and corresponding distance from soma together
+#  cluster_all_1D <- cluster_all_1D[order(cluster_all_1D$cluster),] #orders data by cluster number
+#  cluster_freq_1D <- table(cluster_all_1D$cluster) #creates a table with how many spines are in each cluster
+#  cluster_freq_1D <- as.data.frame(cluster_freq_1D) #converts above table to a data frame
+#  cluster_freq_1D$Freq <- as.numeric(cluster_freq_1D$Freq) # turns the "cluster" column to a number
+#  cluster_freq_1D$is_clustered <- as.numeric(cluster_freq_1D$Freq > 1) #returns 1 if there is >1 spine in a cluster and 0 if not-- therefore all 1s reflect a true "cluster" since it has more than 1 spine in ity 
+#  num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment
+  
+#  spines_clustered_1D <- sum(cluster_freq_1D$Freq>1)
+
+#  spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix
+#  spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form
+#  spines_clustered_1D[is.na(spines_clustered_1D)] <- 0
+#  spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) #define and calculate number of spines not clustered (total spines minus number of spines clustered)
+#  
+#  test_spines_clustered_all_1D <- data.frame() #initialize dataframe to contain # of spines in a cluster for all random samples
+#  test_num_clusters_all_1D <- data.frame()
+  
+# 1D random spines for-loop
+#  for(j in 1:10000){
+#    random_spines_1D <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines
+#    test_dist_1D <- as.matrix(dist(random_spines_1D)) #creates distance matrix for random sample
+#    UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE) #runs UPGMA with cutoff (same as with obsv but with random sample)
+#   UPGMA_test_1D$rn <- rownames(UPGMA_test_1D) #add row name (spine ID) to cluster number  
+#    cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D) #add cluster number/spine ID to randomly generated soma distances
+#   cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),] #order by cluster number
+#    cluster_freq_test_1D <- table(cluster_all_test_1D$cluster) #generate table with number of spines in each cluster  
+#    cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D) #change table to data frame 
+#    cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1) #ask whether cluster has >1 spines in it (1 for yes, 0 for no)
+#   num_clusters_test_1D <- sum(cluster_freq_test_1D$is_clustered) # add how many 1s (or how many clusters have >1 spines) to get true number of clusters
+#    num_clusters_test_1D[is.na(num_clusters_test_1D)] <- 0 #changes possible NA from 0 clusters to 0
+#    test_num_clusters_all_1D <- rbind(test_num_clusters_all_1D, num_clusters_test_1D) #adds total number of clusters to running list
+#    spines_clustered_test_1D <- sum(cluster_freq_test_1D$Freq>1)
+#
+ #   spines_not_test_1D <- as.numeric(total_spines - spines_clustered_test_1D) # calculate how many spines are not clustered
+#    spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0  #returns 0 instead of Na if no spines are clustered in the random sample
+#    test_spines_clustered_all_1D <- rbind(test_spines_clustered_all_1D, spines_clustered_test_1D) #add number of clustered spines to running list 
+#    
+#  } # end of random spines 1D for-loop
+  
+#  test_spines_clustered_all_1D <- as.matrix(test_spines_clustered_all_1D) #changes data.frame to matrix (not sure if this is neccesary but it works)
+#  test_spines_clustered_all_1D <- as.numeric(test_spines_clustered_all_1D) #changes all vales to numeric (again, not sure if neccessary)
+#  
+  
+#  std_test_1D <- sd(test_spines_clustered_all_1D) #generates STD of total number of spines clustered in entire random sample
+#  mean_test_1D <- mean(test_spines_clustered_all_1D) #generates average number of spines in a cluster in random data
+# 
+#  curve_dnorm_1D <- dnorm(test_spines_clustered_all_1D, mean_test_1D, std_test_1D) #gives probability density function, or height of probability distribution at each point(height = frequency)
+#  std_curve_1D <- sd(curve_dnorm_1D)
+#  mean_curve_1D <- mean(curve_dnorm_1D)
+  
+#  Cscore_1D <- pnorm(spines_clustered_1D, mean_test_1D, std_test_1D) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher
+  #therefore, the closer to 1, higher probability that a given number has more clustered spines than the random normal distribution and vice versa
+  
+ # add all 1D data to all data 
+#  cluster_data_1D <- data.frame()  
+#  cluster_data_1D <- rbind(cluster_data_1D, num_clusters_1D)
+#  cluster_data_1D <- cbind(cluster_data_1D, spines_clustered_1D)
+#  cluster_data_1D <- cbind(cluster_data_1D, spines_not_1D)
+#  cluster_data_1D <- cbind(cluster_data_1D, Cscore_1D)
+#  cluster_data_1D[is.na(cluster_data_1D)] <- 0
+#  colnames(cluster_data_1D) <- c("# of clusters - 1D", "spines clustered - 1D", "spines not clustered - 1D", "Cscore - 1D")
+  
+  
+# 3D clustering analysis  
+  dist_3D <- as.matrix(dist.xyz(data_coords)) # creates distance matrix of spine head coordinates
+  UPGMA_obsv_3D <- IdClusters(dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper
+  #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1)
+  UPGMA_obsv_3D$rn <- rownames(UPGMA_obsv_3D)   #adds a column with rownows to keep spine ID, not sure if this step is neccessary
+  cluster_all_3D <- cbind(UPGMA_obsv_3D, data_coords)   #get the coordinates for each spine ID, if kept in row name/number order, will correctly correspond to each spine
+  cluster_all_3D <- cluster_all_3D[order(cluster_all_3D$cluster),] #sort data frame by cluster number
+  cluster_freq_3D <- table(cluster_all_3D$cluster) #create a table counting how many times each cluster Variable occurs (i.e. how many spines in each cluster)
+  cluster_freq_3D <- as.data.frame(cluster_freq_3D)
+  cluster_freq_3D$Freq <- as.numeric(cluster_freq_3D$Freq)
+  cluster_freq_3D$is_clustered <- as.numeric(cluster_freq_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1
+  num_clusters_3D <- sum(cluster_freq_3D$is_clustered) #count how many 1s to determine how many clusters (spines > 1) in the segment
+  spines_clustered_3D <- cluster_freq_3D %>% 
+    filter(is_clustered == 1) %>% 
+    summarise(spines_clustered_3D = sum(Freq))
+  null_length <- nrow(spines_clustered_3D)
+  ifelse(null_length==0, spines_clustered_3D <- 0, NA)
+  spines_clustered_3D <- as.numeric(spines_clustered_3D) #converts to numerical form
+  spines_not_3D <- as.numeric(total_spines - spines_clustered_3D)
+  
+  test_spines_clustered_all_3D <- data.frame()
+  test_num_clusters_all_3D <- data.frame()
+
+  
+# 3D random spines for loop
+  for(j in 1:10){
+    test_data_X <- data.frame(sample(df$X), df$Y, df$Z) # randomize the X's, Y's, and Z's to make a "biologically plausible" dataframe.
+    colnames(test_data_X) <- c( "x", "Y", "Z")
+    test_data_Y <- data.frame(df$X, sample(df$Y), df$Z)
+    colnames(test_data_Y) <- c( "x", "Y", "Z")
+    test_data_Z <- data.frame(df$X, df$Y, sample(df$Z))
+    colnames(test_data_Z) <- c( "x", "Y", "Z")
+    test_data <- rbind(test_data_X, test_data_Y, test_data_Z)
+    test_data_final <-data.frame(sample_n(test_data, total_spines))
+    test_dist_3D <- as.matrix(dist(test_data_final)) #creates distance matrix for random sample
+
+    UPGMA_test_3D <- IdClusters(test_dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper
+    #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1)
+    UPGMA_test_3D$rn <- rownames(UPGMA_test_3D)   #adds a column with rownows to keep spine ID, not sure if this step is neccessary
+    cluster_all_test_3D <- cbind(UPGMA_test_3D, test_data_final)   #get the coordinates for each spine ID, if kept in row name/number order, will correctly correspond to each spine
+    cluster_all_test_3D <- cluster_all_test_3D[order(cluster_all_test_3D$cluster),] #sort data frame by cluster number
+    cluster_freq_test_3D <- table(cluster_all_test_3D$cluster) #create a table counting how many times each cluster Variable occurs (i.e. how many spines in each cluster)
+    cluster_freq_test_3D <- as.data.frame(cluster_freq_test_3D)
+    cluster_freq_test_3D$is_clustered <- as.numeric(cluster_freq_test_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1
+    num_clusters_test_3D <- sum(cluster_freq_test_3D$is_clustered)
+    num_clusters_test_3D[is.na(num_clusters_test_3D)] <- 0
+    test_num_clusters_all_3D <- rbind(test_num_clusters_all_3D, num_clusters_test_3D)
+    spines_clustered_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered) %>% filter(is_clustered == 1) %>% summarise(spines_clustered_test_3D = sum(Freq))
+    spines_not_test_3D <- as.numeric(total_spines - spines_clustered_test_3D)
+    
+    
+    spines_clustered_test_3D[is.na(spines_clustered_test_3D)] <- 0
+    test_spines_clustered_all_3D <- rbind(test_spines_clustered_all_3D, spines_clustered_test_3D)
+    
+  } # end of random spines 3D for-loop
+  
+  
+  test_spines_clustered_all_3D <- as.numeric(as.matrix(test_spines_clustered_all_3D))
+
+  
+  std_test_3D <- sd(test_spines_clustered_all_3D)
+  mean_test_3D <- mean(test_spines_clustered_all_3D)
+  curve_dnorm_3D <- dnorm(test_spines_clustered_all_3D, mean_test_3D, std_test_3D)
+  std_curve_3D <- sd(curve_dnorm_3D)
+  mean_curve_3D <- mean(curve_dnorm_3D)
+  
+  Cscore_3D <- pnorm(spines_clustered_3D, mean_test_3D, std_test_3D)
+  
+  
+  df$num_clusters_3D <- num_clusters_3D
+  df$spines_clustered_3D <- spines_clustered_3D
+  df$spines_not_3D <- spines_not_3D
+  df$Cscore_3D <- Cscore_3D
+  df$density_overall <- density_overall # add these data to df
+  df$density_mushroom <- density_mushroom
+  df$density_thin <- density_thin
+  df$density_stubby <- density_stubby
+  data_all <- rbind(data_all, df) # adds df as next row in the data_all file
+  
+  
+#  data_all <- cbind(data_all, cluster_data_1D)
+#  data_all <- cbind(data_all, cluster_data_3D)
+
+  
+} # end of file for-loop 
+
+data_all$animal_num <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[2]) # this pulls out an animal number from the file number
+data_all$retro_label <- substring(data_all$animal_num, nchar(data_all$animal_num), nchar(data_all$animal_num))  #pulls off 'L' or 'N' from ID to indicate whether cell was retro-gradely labeled
+#data_all$PDB <- substring(data_all$animal_num, nchar(data_all$animal_num), nchar(data_all$animal_num))
+data_all$PDB <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[3]) # pulls dendrite location out of name and adds column
+data_all$PDB <- replace(data_all$PDB, data_all$PDB=="p", "prox")
+data_all$PDB <- replace(data_all$PDB, data_all$PDB=="d", "dist")
+data_all$PDB <- replace(data_all$PDB, data_all$PDB=="b", "basal")
+data_all$retro_label <- replace(data_all$retro_label, data_all$retro_label=="L", "labeled") 
+data_all$retro_label <- replace(data_all$retro_label, data_all$retro_label=="N", "not labeled")
+data_all$stack <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[4]) #gives letter ID of different imaging days
+data_all$stack <- unlist(data_all$stack)
+data_all$animal_num <- lapply(data_all$animal_num, function(x) unlist(strsplit(x, "L"))[1]) #removes letter from behind animal ID name
+data_all$animal_num <- lapply(data_all$animal_num, function(x) unlist(strsplit(x, "N"))[1])
+data_all$RAYBURST_VOLUME <- as.numeric(data_all$RAYBURST_VOLUME)
+data_all$MAX_DTS <- as.numeric(data_all$MAX_DTS)
+
+n <- readline(prompt="Enter group name:") # gives prompt on console to enter animal group
+data_all$group <- n # adds the group to the master data file
+
+data_all$animal_num <- unlist(data_all$animal_num) #changes animal number to a vector rather than a list, which is important for executing the following task
+data_all$PDB <- unlist(data_all$PDB) # same as above
+
+TYPE_ave <- data_all %>% group_by(group, animal_num, retro_label, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+PDB_TYPE_ave <- data_all %>% group_by(group, animal_num, stack, retro_label, PDB, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+
+PDB_ave <- data_all %>% group_by(group, animal_num, retro_label, PDB) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+# ^first groups by treatment group, then by animal, then by day of image (i.e. -F), then by dendrite location
+# summarise then finds averages for each group for all various pieces of data
+
+animal_ave <- data_all %>% group_by(group, animal_num) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+# groups by treatment then animal, then finds averages per animal for various data
+
+labeled_ave <- data_all %>% group_by(group, animal_num, retro_label) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+# groups by treatment then animal, then finds averages per animal for various data
+
+Cscore3D_avg <- data_all %>% group_by(animal_num, retro_label) %>% summarise(Cscore_3D = mean(Cscore_3D))
+
+#dir.create("analysis")  #creates a directory to create a file on the computer
+#savePath <- paste(filePath,"/", "analysis", collapse = "/", sep="") # creates the path where files can be saved
+#setwd(savePath) # sets working directory to the path created above
+#write.csv(data_all, "data_all.csv")
+#write.csv(PDB_ave, "PDB averages.csv")
+#write.csv(animal_ave, "Animal Averages.csv")
+#write.csv(TYPE_ave, "spine_type_averages.csv")
+#write.csv(PDB_TYPE_ave, "PDB_spine_type_averages.csv")
+#write.csv(labeled_ave, "labeled_averages.csv")
+

From 1e8a84e2ed374be5f1fcb330d5e517b394baa2c9 Mon Sep 17 00:00:00 2001
From: kellenett <kelle-nett@uiowa.edu>
Date: Mon, 25 Mar 2019 14:49:16 -0500
Subject: [PATCH 08/11] Update 1D clustering Yadav.R

---
 1D clustering Yadav.R | 117 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 106 insertions(+), 11 deletions(-)

diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R
index 780217b..96fa6b2 100644
--- a/1D clustering Yadav.R	
+++ b/1D clustering Yadav.R	
@@ -37,7 +37,12 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
   df <- df[complete.cases(df$RAYBURST_VOLUME),] # removes cases where there is no data
   df <- df[complete.cases(df$MAX_DTS),] # removes cases where there is no  data
   df$file <- fileName # adds a column with repeated information about which file that spine came from
-  total_length <- df %>% group_by(SECTION_NUMBER)  %>% summarise(section_length=max(SECTION_LENGTH)) %>%ungroup() %>% summarise(total_length=sum(section_length)) %>% as.double() #this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together
+  total_length <- df %>% 
+    group_by(SECTION_NUMBER)  %>% 
+    summarise(section_length=max(SECTION_LENGTH)) %>%
+    ungroup() %>% 
+    summarise(total_length=sum(section_length)) %>% 
+    as.double() #this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together
   total_spines <- as.numeric(nrow(df)) # number of rows = number of spines
   density_overall <- total_spines/total_length  # calculate density of segment
   density_mushroom <- sum(as.numeric(df$TYPE=="mushroom"))/total_length  # counts the number of "mushroom" rows and divides by length for mushroom density
@@ -49,7 +54,7 @@ for(i in 1:length(fileList)){  ##add back in 'i' when adding for loop back in
 
 # Shane's original clustering analysis using spine attachment distance from soma  
     
-  agn <- agnes(df$SOMA_DISTANCE,metric = "euclidean", method = "average") # runs the agnes, computes agglomerative hierarchical clustering, "average" = UPGMA
+ # agn <- agnes(df$SOMA_DISTANCE,metric = "euclidean", method = "average") # runs the agnes, computes agglomerative hierarchical clustering, "average" = UPGMA
   dist_ac <- as.matrix(dist(df$SOMA_DISTANCE))  #creates a distance matrix of soma distances
   df$nn_dist_ac <- apply(dist_ac,2, function(x) sort(x)[2])   # finds nearest neighbor for each spine
   df$nn2_dist_ac <- apply(dist_ac,2, function(x) sort(x)[3])  # finds second nearest neighbor
@@ -242,24 +247,114 @@ data_all$animal_num <- lapply(data_all$animal_num, function(x) unlist(strsplit(x
 data_all$RAYBURST_VOLUME <- as.numeric(data_all$RAYBURST_VOLUME)
 data_all$MAX_DTS <- as.numeric(data_all$MAX_DTS)
 
+
+
 n <- readline(prompt="Enter group name:") # gives prompt on console to enter animal group
 data_all$group <- n # adds the group to the master data file
 
 data_all$animal_num <- unlist(data_all$animal_num) #changes animal number to a vector rather than a list, which is important for executing the following task
 data_all$PDB <- unlist(data_all$PDB) # same as above
 
-TYPE_ave <- data_all %>% group_by(group, animal_num, retro_label, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
-PDB_TYPE_ave <- data_all %>% group_by(group, animal_num, stack, retro_label, PDB, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+data_all_by_file <- data_all %>% 
+  group_by(file) %>% 
+  summarise(num_clusters_3D = mean(num_clusters_3D), 
+            spines_clustered_3D = mean(spines_clustered_3D), 
+            spines_not_3D = mean(spines_not_3D), 
+            Cscore_3D = mean(Cscore_3D), 
+            density_overall = mean(density_overall), 
+            density_mushroom = mean(density_mushroom), 
+            density_thin = mean(density_thin), 
+            density_stubby = mean(density_stubby))
+
+
+data_all_by_file$animal_num <- lapply(data_all_by_file$file, function(x) unlist(strsplit(x, "-"))[2]) # this pulls out an animal number from the file number
+data_all_by_file$retro_label <- substring(data_all_by_file$animal_num, nchar(data_all_by_file$animal_num), nchar(data_all_by_file$animal_num))  #pulls off 'L' or 'N' from ID to indicate whether cell was retro-gradely labeled
+data_all_by_file$PDB <- lapply(data_all_by_file$file, function(x) unlist(strsplit(x, "-"))[3]) # pulls dendrite location out of name and adds column
+data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="b", "basal")
+data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="p", "prox")
+data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="d", "dist")
+data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="b", "basal")
+data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="p", "prox")
+data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="d", "dist")
+data_all_by_file$retro_label <- replace(data_all_by_file$retro_label, data_all_by_file$retro_label=="L", "labeled") 
+data_all_by_file$retro_label <- replace(data_all_by_file$retro_label, data_all_by_file$retro_label=="N", "not labeled")
+data_all_by_file$stack <- lapply(data_all_by_file$file, function(x) unlist(strsplit(x, "-"))[4]) #gives letter ID of different imaging days
+data_all_by_file$stack <- unlist(data_all_by_file$stack)
+data_all_by_file$animal_num <- lapply(data_all_by_file$animal_num, function(x) unlist(strsplit(x, "L"))[1]) #removes letter from behind animal ID name
+data_all_by_file$animal_num <- lapply(data_all_by_file$animal_num, function(x) unlist(strsplit(x, "N"))[1])
+data_all_by_file$group <- n
+data_all_by_file$animal_num <- unlist(data_all_by_file$animal_num) #changes animal number to a vector rather than a list, which is important for executing the following task
+data_all_by_file$PDB <- unlist(data_all_by_file$PDB)
+
+
+#averages from data_all_by_file (when data is repeated by row for an individual file)
+location_avg_file <- data_all_by_file %>%
+  group_by(group, animal_num, retro_label, PDB) %>% 
+  summarise(num_clusters_3D = mean(num_clusters_3D),
+            spines_clustered_3D = mean(spines_clustered_3D),
+            spines_not_3D = mean(spines_not_3D),
+            Cscore_3D = mean(Cscore_3D),
+            density_overall = mean(density_overall),
+            density_mushroom = mean(density_mushroom),
+            density_thin = mean(density_thin),
+            density_stubby = mean(density_stubby))         
+
+animal_labeled_avg_file <- data_all_by_file %>% 
+  group_by(group, animal_num, retro_label) %>% 
+  summarise(num_clusters_3D = mean(num_clusters_3D),
+            spines_clustered_3D = mean(spines_clustered_3D),
+            spines_not_3D = mean(spines_not_3D),
+            Cscore_3D = mean(Cscore_3D),
+            density_overall = mean(density_overall),
+            density_mushroom = mean(density_mushroom),
+            density_thin = mean(density_thin),
+            density_stubby = mean(density_stubby)) 
+
+#averages from data_all (when each spine/row has a different data point)
+spine_type_avg <- data_all %>% 
+  group_by(group, animal_num, retro_label, TYPE) %>% 
+  summarise(nn_1D = mean(nn_dist_1D), 
+            nn2_1D = mean(nn2_dist_1D), 
+            nn3_1D = mean(nn3_dist_1D), 
+            nn_3D = mean(nn_dist_3D), 
+            nn2_3D = mean(nn2_dist_3D), 
+            nn3_3D = mean(nn3_dist_3D),
+            spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), 
+            spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
+
+spine_type_by_location_avg <- data_all %>% 
+  group_by(group, animal_num, stack, retro_label, PDB, TYPE) %>% 
+  summarise(nn_1D = mean(nn_dist_1D), 
+            nn2_1D = mean(nn2_dist_1D), 
+            nn3_1D = mean(nn3_dist_1D), 
+            nn_3D = mean(nn_dist_3D), 
+            nn2_3D = mean(nn2_dist_3D), 
+            nn3_3D = mean(nn3_dist_3D), 
+            spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), 
+            spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
 
-PDB_ave <- data_all %>% group_by(group, animal_num, retro_label, PDB) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
-# ^first groups by treatment group, then by animal, then by day of image (i.e. -F), then by dendrite location
-# summarise then finds averages for each group for all various pieces of data
+location_avg <- data_all %>% 
+  group_by(group, animal_num, retro_label, PDB) %>% 
+  summarise(nn_1D = mean(nn_dist_1D), 
+            nn2_1D = mean(nn2_dist_1D), 
+            nn3_1D = mean(nn3_dist_1D),
+            nn_3D = mean(nn_dist_3D), 
+            nn2_3D = mean(nn2_dist_3D), 
+            nn3_3D = mean(nn3_dist_3D), 
+            spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), 
+            spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
 
-animal_ave <- data_all %>% group_by(group, animal_num) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
-# groups by treatment then animal, then finds averages per animal for various data
+animal_labeled_avg <- data_all %>% 
+  group_by(group, animal_num, retro_label) %>% 
+  summarise(nn_1D = mean(nn_dist_1D), 
+            nn2_1D = mean(nn2_dist_1D), 
+            nn3_1D = mean(nn3_dist_1D),
+            nn_3D = mean(nn_dist_3D),
+            nn2_3D = mean(nn2_dist_3D), 
+            nn3_3D = mean(nn3_dist_3D),
+            spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), 
+            spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
 
-labeled_ave <- data_all %>% group_by(group, animal_num, retro_label) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE))
-# groups by treatment then animal, then finds averages per animal for various data
 
 
 

From 7cc875da3cc7635ba3e691b28dd38e9c85b0e252 Mon Sep 17 00:00:00 2001
From: kellnett <44405714+kellnett@users.noreply.github.com>
Date: Mon, 28 Oct 2019 15:06:47 -0500
Subject: [PATCH 09/11] Add files via upload

---
 folder_loop.R | 683 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 683 insertions(+)
 create mode 100644 folder_loop.R

diff --git a/folder_loop.R b/folder_loop.R
new file mode 100644
index 0000000..c300763
--- /dev/null
+++ b/folder_loop.R
@@ -0,0 +1,683 @@
+#loop attempt with raw_data_cleanup.R
+
+library(dplyr)           
+library(ggplot2)        
+library(lattice)
+library(latticeExtra)
+library(cluster)
+library(stats)
+library(readr)
+library(tidyr)
+library(bio3d)
+library(DECIPHER)
+library(svDialogs)
+library(rowr)
+library(tcltk)
+
+
+parent.folder <- tk_choose.dir(default = "", caption = "select folder")
+sub.folders <- list.dirs(parent.folder, recursive = TRUE)[-1]
+setwd(parent.folder)
+saveFolder <- dirname(parent.folder)
+
+for(i in 1:length(sub.folders)) {
+setwd(parent.folder)
+  fileChosen <- sub.folders[i]
+
+
+
+
+#for(i in 1:length(sub.folders)) {
+#fileChosen <-  sub.folders[i]
+
+
+# Initialize Code: Select file(s) and define group
+
+#fileChosen <- file.choose()                                             # opens a folder dialog box to select first file in folder containing data from 1 treatment group 
+filePath   <- dirname(fileChosen)                                       # gets the directory name that that file is in
+setwd(filePath)                                                         # sets the working directory. important for saving files later
+fileList   <- list.files(fileChosen)                                      # list the files in the directory to go through
+fileList   <- grep(".csv", fileList, value = TRUE)                      # finds only lists .csv files in case there are other types
+data_all   <- data.frame()                                              # initializes data frame to store data throughout the for-loop
+#rat_ID     <- dlgInput("Enter Rat ID #", Sys.info()["user"])$res       # prompt given in console to enter the animal treatment group (i.e. Sal-D0)
+
+
+
+# File list for-loop: Apply code to each file in fileList
+
+for(i in 1:length(fileList))  {                                         # peform code inside {} for each .csv file in the file list
+  fileName     <- unlist(strsplit(fileList[i], "[.]"))[1]               # remove '.csv' from the file name
+  fileLoc      <- file.path(fileChosen, fileList[i])                      # create path to file (will differ from initial FileChosen as loop progresses)
+  df           <- read_csv(fileLoc, col_types = cols())                 # read the csv file, uses column names from NeuronStudio file
+  colnames(df) <- gsub("-", "_", colnames(df))                          # replaces dashes ('-') w/ underscores ('_') for column names (R doesn't like dashes)
+  df           <- df[complete.cases(df$SOMA_DISTANCE) ,]                # removes cases where there is no soma distance data
+  df           <- df[complete.cases(df$RAYBURST_VOLUME), ]              # " " spine volume data
+  df           <- df[complete.cases(df$MAX_DTS), ]                      # " " length data
+  df$file      <- fileName                                              # add a column identifying the file name for each spine (row)
+  ## Current NeuronStudio data should only have 1 section/dendrite; however, it is possible
+  ## to have multiple if they are not linked together. The next 6 lines calculate total
+  ## length when there is more than 1 section
+  
+  total_length <- df %>%                                                # dplyr package: define total length of dendritic segment
+    group_by(SECTION_NUMBER)  %>%                                       # group data frame by section number
+    summarise(section_length = max(SECTION_LENGTH)) %>%                 # find the length of each section
+    ungroup() %>%                                                       # ungroup previous grouping by section number
+    summarise(total_length = sum(section_length)) %>%                   # sum section length of each section
+    as.double()                                                         # allows for 64 bit storage (increase precision with more significant digits)
+  
+
+  df$RAYBURST_VOLUME[df$RAYBURST_VOLUME == 0]      <- NA       #if no data, make NA
+  df$HEAD_DIAMETER[df$HEAD_DIAMETER == 0]          <- NA
+  df$MAX_DTS[df$MAX_DTS == 0]                      <- NA
+  
+  df$HEAD_DIAMETER[df$HEAD_DIAMETER > 1.5]                        <- NA      #mushroom HD cutoff
+  df$MAX_DTS[df$MAX_DTS > 3.00]                                   <- NA      #mushroom/thin length cutoff
+  df$MAX_DTS[df$TYPE == "stubby"       & df$MAX_DTS > 0.80]       <- NA      #stubby length cutoff
+  df$HEAD_DIAMETER[df$TYPE == "stubby" & df$HEAD_DIAMETER > 0.97] <- NA      #stubby HD cutoff
+  df$HEAD_DIAMETER[df$TYPE == "thin"   & df$HEAD_DIAMETER > 1.22] <- NA      #thin HD cutoff
+
+  df$RAYBURST_VOLUME[df$TYPE == "mushroom" & df$RAYBURST_VOLUME > 0.60] <- NA   #cut-offs from 2SD about mean in Harris et al. 1992 paper, pyramidal CA1 neurons
+  df$RAYBURST_VOLUME[df$TYPE == "thin"     & df$RAYBURST_VOLUME > 0.10] <- NA   
+  df$RAYBURST_VOLUME[df$TYPE == "stubby"   & df$RAYBURST_VOLUME > 0.05] <- NA
+
+  total_spines     <- count(df$ID[!is.na(df$ID)])                       # number of rows = number of spines
+  density_overall  <- total_spines/total_length                         # calculate density of dendritic segment
+  density_mushroom <- sum(as.numeric(df$TYPE == "mushroom"),            # find total number of mushroom spines and
+                          na.rm = TRUE)/total_length                    # divide by total length to find mushroom density
+  density_thin     <- sum(as.numeric(df$TYPE == "thin"),                # as above for thin spines
+                          na.rm = TRUE)/total_length                    # ""
+  density_stubby   <- sum(as.numeric(df$TYPE == "stubby"),              # as above for stubby spines
+                          na.rm = TRUE)/total_length                    # ""
+  
+  
+  #add data to running list in data_master file
+  df$density_overall  <- density_overall # add these data to df
+  df$density_mushroom <- density_mushroom
+  df$density_thin     <- density_thin
+  df$density_stubby   <- density_stubby
+  df$animal_num       <- lapply(df$file, function(x) unlist(strsplit(x, "-"))[2])             # pulls animal ID out of file name, with retro label attached
+  df$retro_label      <- substring(df$animal_num, nchar(df$animal_num), nchar(df$animal_num)) # adds column reported retro-label
+  df$retro_label      <- replace(df$retro_label, df$retro_label == "L", 1)                    # adds labeled for L and not labeled for N
+  df$retro_label      <- replace(df$retro_label, df$retro_label == "N", 0)
+  df$retro_label      <- as.numeric(df$retro_label)
+  df$location         <- lapply(df$file, function(x) unlist(strsplit(x, "-"))[3])
+  df$location         <- as.character(df$location)
+  df$animal_num       <- lapply(df$animal_num, function(x) unlist(strsplit(x, "L"))[1])       # removes L or N from animal number
+  df$animal_num       <- lapply(df$animal_num, function(x) unlist(strsplit(x, "N"))[1])       # removes L or N from animal number
+  df$animal_num       <- as.numeric(df$animal_num)
+  
+  
+  
+  df$density_overall[df$density_overall > 4.00]   <- NA
+  df$density_mushroom[df$density_mushroom > 1.00] <- NA
+  df$density_thin[df$density_thin > 4.00]         <- NA
+  df$density_stubby[df$density_stubby > 1.00]     <- NA
+  
+  
+
+  
+  
+  #df[!complete.cases(df),]                         <- NA
+  
+    data_all            <- rbind(data_all, df)                                                  # adds df as next row in the data_master file
+  
+}                                                                                             # end of file for-loop 
+
+
+rat_ID <- data_all$animal_num[1]
+data_all$MAX_DTS<- as.numeric(data_all$MAX_DTS)
+data_all$RAYBURST_VOLUME <- as.numeric(data_all$RAYBURST_VOLUME)
+data_all$HEAD_DIAMETER <- as.numeric(data_all$HEAD_DIAMETER)
+#A_A
+
+data_mast_A_A <- data_all %>%
+  group_by(file) %>%
+  summarise(den_ov_A_A   = mean(density_overall, na.rm = TRUE),
+            den_mush_A_A = mean(density_mushroom, na.rm = TRUE),
+            den_thin_A_A = mean(density_thin, na.rm = TRUE),
+            den_stub_A_A = mean(density_stubby, na.rm = TRUE),
+            vol_ov_A_A   = mean(RAYBURST_VOLUME, na.rm = TRUE),
+            len_ov_A_A   = mean(MAX_DTS, na.rm = TRUE),
+            hd_ov_A_A    = mean(HEAD_DIAMETER, na.rm = TRUE)) %>%
+  ungroup()
+
+data_mast_A_A <- data_mast_A_A %>%
+  summarise(den_ov   = mean(den_ov_A_A, na.rm = TRUE),
+            den_mush = mean(den_mush_A_A, na.rm = TRUE),
+            den_thin = mean(den_thin_A_A, na.rm = TRUE),
+            den_stub = mean(den_stub_A_A, na.rm = TRUE),
+            vol_ov   = mean(vol_ov_A_A, na.rm = TRUE),
+            len_ov   = mean(len_ov_A_A, na.rm = TRUE),
+            hd_ov    = mean(hd_ov_A_A, na.rm = TRUE)) %>%
+  ungroup()
+
+data_TYPE_A_A <- data_all %>%
+  group_by(file, TYPE) %>%
+  summarise(vol_TYPE_A_A = mean(RAYBURST_VOLUME, na.rm = TRUE),
+            len_TYPE_A_A = mean(MAX_DTS, na.rm = TRUE),
+            hd_TYPE_A_A  = mean(HEAD_DIAMETER, na.rm = TRUE)) %>%
+  ungroup()
+
+data_TYPE_A_A <- data_TYPE_A_A %>%
+  group_by(TYPE)               %>%
+  summarise(vol_TYPE = mean(vol_TYPE_A_A, na.rm = TRUE),
+            len_TYPE = mean(len_TYPE_A_A, na.rm = TRUE),
+            hd_TYPE  = mean(hd_TYPE_A_A, na.rm = TRUE))  %>%
+  ungroup()
+
+data_mush_A_A <- subset(data_TYPE_A_A, data_TYPE_A_A$TYPE == 'mushroom', select = 2:4)
+data_thin_A_A <- subset(data_TYPE_A_A, data_TYPE_A_A$TYPE == 'thin',     select = 2:4)
+data_stub_A_A <- subset(data_TYPE_A_A, data_TYPE_A_A$TYPE == 'stubby',   select = 2:4)
+
+label_location <- c('A_A')
+
+data_mast_A_A <- cbind.fill(label_location, 
+                            data_mast_A_A, 
+                            data_mush_A_A, 
+                            data_thin_A_A, 
+                            data_stub_A_A, 
+                            fill = 0)
+
+
+#N_A
+
+data_mast_NL_A <- data_all %>%
+  group_by(file, retro_label)%>%
+  summarise(den_ov_NL_A   = mean(density_overall, na.rm = TRUE),
+            den_mush_NL_A = mean(density_mushroom, na.rm = TRUE),
+            den_thin_NL_A = mean(density_thin, na.rm = TRUE),
+            den_stub_NL_A = mean(density_stubby, na.rm = TRUE),
+            vol_ov_NL_A   = mean(RAYBURST_VOLUME, na.rm = TRUE),
+            len_ov_NL_A   = mean(MAX_DTS, na.rm = TRUE),
+            hd_ov_NL_A    = mean(HEAD_DIAMETER, na.rm = TRUE))   %>%
+  ungroup()
+
+data_mast_NL_A <- data_mast_NL_A %>%
+  group_by(retro_label) %>%
+  summarise(den_ov   = mean(den_ov_NL_A, na.rm = TRUE),
+            den_mush = mean(den_mush_NL_A, na.rm = TRUE),
+            den_thin = mean(den_thin_NL_A, na.rm = TRUE),
+            den_stub = mean(den_stub_NL_A, na.rm = TRUE),
+            vol_ov   = mean(vol_ov_NL_A, na.rm = TRUE),
+            len_ov   = mean(len_ov_NL_A, na.rm = TRUE),
+            hd_ov    = mean(hd_ov_NL_A, na.rm = TRUE))    %>%
+  ungroup()
+
+data_mast_N_A <- subset(data_mast_NL_A, data_mast_NL_A$retro_label == 0, select = 2:8)
+
+data_TYPE_NL_A <- data_all %>%
+  group_by(file, retro_label, TYPE) %>%
+  summarise(vol_TYPE_NL_A = mean(RAYBURST_VOLUME, na.rm = TRUE),
+            len_TYPE_NL_A = mean(MAX_DTS, na.rm = TRUE),
+            hd_TYPE_NL_A  = mean(HEAD_DIAMETER, na.rm = TRUE))   %>%
+  ungroup()
+
+data_TYPE_NL_A <- data_TYPE_NL_A %>%
+  group_by(retro_label, TYPE) %>%
+  summarise(vol_TYPE = mean(vol_TYPE_NL_A, na.rm = TRUE),
+            len_TYPE = mean(len_TYPE_NL_A, na.rm = TRUE),
+            hd_TYPE = mean(hd_TYPE_NL_A, na.rm = TRUE))  %>%
+  ungroup()
+
+
+data_mush_N_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 0 & data_TYPE_NL_A$TYPE == 'mushroom', select= 3:5)
+data_thin_N_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 0 & data_TYPE_NL_A$TYPE == 'thin',     select= 3:5)
+data_stub_N_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 0 & data_TYPE_NL_A$TYPE == 'stubby',   select= 3:5)
+
+label_location <- c('N_A')
+
+data_mast_N_A <- cbind.fill(label_location, 
+                            data_mast_N_A, 
+                            data_mush_N_A, 
+                            data_thin_N_A, 
+                            data_stub_N_A, 
+                            fill = 0)
+
+
+#L_A
+
+data_mast_L_A <- subset(data_mast_NL_A, data_mast_NL_A$retro_label == 1,  select = 2:8)
+data_mush_L_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 1 & data_TYPE_NL_A$TYPE == 'mushroom', select = 3:5)
+data_thin_L_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 1 & data_TYPE_NL_A$TYPE == 'thin',     select = 3:5)
+data_stub_L_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 1 & data_TYPE_NL_A$TYPE == 'stubby',   select = 3:5)
+
+
+label_location <- c('L_A')
+
+data_mast_L_A <- cbind.fill(label_location, 
+                            data_mast_L_A, 
+                            data_mush_L_A, 
+                            data_thin_L_A, 
+                            data_stub_L_A, 
+                            fill = 0)
+
+# for A_b
+
+data_mast_A_bpd <- data_all %>%
+  group_by(file, location) %>%
+  summarise(den_ov_A_b   = mean(density_overall, na.rm = TRUE),
+            den_mush_A_b = mean(density_mushroom, na.rm = TRUE),
+            den_thin_A_b = mean(density_thin, na.rm = TRUE),
+            den_stub_A_b = mean(density_stubby, na.rm = TRUE),
+            vol_ov_A_b   = mean(RAYBURST_VOLUME, na.rm = TRUE),
+            len_ov_A_b   = mean(MAX_DTS, na.rm = TRUE),
+            hd_ov_A_b    = mean(HEAD_DIAMETER, na.rm = TRUE)) %>%
+  ungroup()
+
+data_mast_A_bpd <- data_mast_A_bpd %>%
+  group_by(location) %>%
+  summarise(den_ov   = mean(den_ov_A_b, na.rm = TRUE),
+            den_mush = mean(den_mush_A_b, na.rm = TRUE),
+            den_thin = mean(den_thin_A_b, na.rm = TRUE),
+            den_stub = mean(den_stub_A_b, na.rm = TRUE),
+            vol_ov   = mean(vol_ov_A_b, na.rm = TRUE),
+            len_ov   = mean(len_ov_A_b, na.rm = TRUE),
+            hd_ov    = mean(hd_ov_A_b, na.rm = TRUE)) %>%
+  ungroup()
+
+data_mast_A_b <- subset(data_mast_A_bpd, data_mast_A_bpd$location == 'b', select = 2:8)
+
+
+data_TYPE_A_bpd <- data_all %>%
+  group_by(file, location, TYPE) %>%
+  summarise(vol_TYPE_A_b = mean(RAYBURST_VOLUME, na.rm = TRUE),
+            len_TYPE_A_b = mean(MAX_DTS, na.rm = TRUE),
+            hd_TYPE_A_b  = mean(HEAD_DIAMETER, na.rm = TRUE)) %>%
+  ungroup()
+
+data_TYPE_A_bpd <- data_TYPE_A_bpd %>%
+  group_by(location, TYPE) %>%
+  summarise(vol_TYPE = mean(vol_TYPE_A_b, na.rm = TRUE),
+            len_TYPE = mean(len_TYPE_A_b, na.rm = TRUE),
+            hd_TYPE  = mean(hd_TYPE_A_b, na.rm = TRUE)) %>%
+  ungroup()
+
+data_mush_A_b <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'b' & data_TYPE_A_bpd$TYPE == 'mushroom', select = 3:5)
+data_thin_A_b <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'b' & data_TYPE_A_bpd$TYPE == 'thin',     select = 3:5)
+data_stub_A_b <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'b' & data_TYPE_A_bpd$TYPE == 'stubby',   select = 3:5)
+
+label_location <- c('A_b')
+
+data_mast_A_b <- cbind.fill(label_location, 
+                            data_mast_A_b, 
+                            data_mush_A_b, 
+                            data_thin_A_b, 
+                            data_stub_A_b,
+                            fill = 0)
+
+#A_p
+
+data_mast_A_p <- subset(data_mast_A_bpd, data_mast_A_bpd$location == 'p', select = 2:8)
+data_mush_A_p <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'p' & data_TYPE_A_bpd$TYPE == 'mushroom', select = 3:5)
+data_thin_A_p <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'p' & data_TYPE_A_bpd$TYPE == 'thin',     select = 3:5)
+data_stub_A_p <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'p' & data_TYPE_A_bpd$TYPE == 'stubby',   select = 3:5)
+
+label_location <- c('A_p')
+
+data_mast_A_p <- cbind.fill(label_location, 
+                            data_mast_A_p, 
+                            data_mush_A_p, 
+                            data_thin_A_p, 
+                            data_stub_A_p, 
+                            fill = 0)
+
+#A_d
+
+data_mast_A_d <- subset(data_mast_A_bpd, data_mast_A_bpd$location == 'd',  select = 2:8)
+data_mush_A_d <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'd' & data_TYPE_A_bpd$TYPE == 'mushroom', select = 3:5)
+data_thin_A_d <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'd' & data_TYPE_A_bpd$TYPE == 'thin',     select = 3:5)
+data_stub_A_d <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'd' & data_TYPE_A_bpd$TYPE == 'stubby',   select = 3:5)
+
+label_location <- c('A_d')
+
+data_mast_A_d <- cbind.fill(label_location, 
+                            data_mast_A_d, 
+                            data_mush_A_d, 
+                            data_thin_A_d, 
+                            data_stub_A_d, 
+                            fill = 0)
+
+#for N_b
+
+data_mast_NL_bpd <- data_all %>%
+  group_by(file, retro_label, location) %>%
+  summarise(den_ov_NL_bpd   = mean(density_overall, na.rm = TRUE),
+            den_mush_NL_bpd = mean(density_mushroom, na.rm = TRUE),
+            den_thin_NL_bpd = mean(density_thin, na.rm = TRUE),
+            den_stub_NL_bpd = mean(density_stubby, na.rm = TRUE),
+            vol_ov_NL_bpd   = mean(RAYBURST_VOLUME, na.rm = TRUE),
+            len_ov_NL_bpd   = mean(MAX_DTS, na.rm = TRUE),
+            hd_ov_NL_bpd    = mean(HEAD_DIAMETER, na.rm = TRUE)) %>%
+  ungroup()
+
+data_count_master <- data_mast_NL_bpd[,1:3]
+
+count_N_b <- data.frame(nrow(subset(data_count_master, retro_label == 0 & location =='b')))
+count_N_p <- data.frame(nrow(subset(data_count_master, retro_label == 0 & location =='p')))
+count_N_d <- data.frame(nrow(subset(data_count_master, retro_label == 0 & location =='d')))
+count_N_A <- sum(count_N_b, count_N_p, count_N_d)
+
+count_L_b <- data.frame(nrow(subset(data_count_master, retro_label == 1 & location =='b')))
+count_L_p <- data.frame(nrow(subset(data_count_master, retro_label == 1 & location =='p')))
+count_L_d <- data.frame(nrow(subset(data_count_master, retro_label == 1 & location =='d')))
+count_L_A <- sum(count_L_b, count_L_p, count_L_d)
+
+count_A_A <- sum(count_N_A, count_L_A)
+count_A_b <- sum(count_N_b, count_L_b)
+count_A_p <- sum(count_N_p, count_L_p)
+count_A_d <- sum(count_N_d, count_L_d)
+
+
+
+data_count_master <- cbind.fill(count_A_A,
+                                count_A_b,
+                                count_A_p,
+                                count_A_d,
+                                count_N_A,
+                                count_N_b, 
+                                count_N_p, 
+                                count_N_d,
+                                count_L_A,
+                                count_L_b, 
+                                count_L_p, 
+                                count_L_d, 
+                                fill = 0)
+
+data_count_master            <- t(data_count_master)
+row.names(data_count_master) <- NULL
+
+
+data_mast_NL_bpd <- data_mast_NL_bpd %>%
+  group_by(retro_label, location) %>%
+  summarise(den_ov   = mean(den_ov_NL_bpd, na.rm = TRUE),
+            den_mush = mean(den_mush_NL_bpd, na.rm = TRUE),
+            den_thin = mean(den_thin_NL_bpd, na.rm = TRUE),
+            den_stub = mean(den_stub_NL_bpd, na.rm = TRUE),
+            vol_ov   = mean(vol_ov_NL_bpd, na.rm = TRUE),
+            len_ov   = mean(len_ov_NL_bpd, na.rm = TRUE),
+            hd_ov   = mean(hd_ov_NL_bpd, na.rm = TRUE)) %>%
+  ungroup()
+
+data_mast_N_b <- subset(data_mast_NL_bpd, data_mast_NL_bpd$retro_label == 0 & data_mast_NL_bpd$location == 'b', select = 3:9)
+
+data_TYPE_NL_bpd <- data_all %>%
+  group_by(file, retro_label, location, TYPE) %>%
+  summarise(vol_TYPE_NL_bpd = mean(RAYBURST_VOLUME, na.rm = TRUE),
+            len_TYPE_NL_bpd = mean(MAX_DTS, na.rm = TRUE),
+            hd_TYPE_NL_bpd  = mean(HEAD_DIAMETER, na.rm = TRUE)) %>%
+  ungroup()
+
+data_TYPE_NL_bpd <- data_TYPE_NL_bpd %>%
+  group_by(retro_label, location, TYPE) %>%
+  summarise(vol_TYPE = mean(vol_TYPE_NL_bpd, na.rm = TRUE),
+            len_TYPE = mean(len_TYPE_NL_bpd, na.rm = TRUE),
+            hd_TYPE  = mean(hd_TYPE_NL_bpd, na.rm = TRUE)) %>%
+  ungroup()
+
+data_mush_N_b <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 0 & 
+                          data_TYPE_NL_bpd$location    == 'b' & 
+                          data_TYPE_NL_bpd$TYPE        == 'mushroom', 
+                        select = 4:6)
+
+data_thin_N_b <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 0 & 
+                          data_TYPE_NL_bpd$location    == 'b' & 
+                          data_TYPE_NL_bpd$TYPE        == 'thin', 
+                        select = 4:6)
+
+data_stub_N_b <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 0 & 
+                          data_TYPE_NL_bpd$location    == 'b' & 
+                          data_TYPE_NL_bpd$TYPE        == 'stubby', 
+                        select = 4:6)
+
+
+label_location <- c('N_b')
+
+data_mast_N_b <- cbind.fill(label_location, 
+                            data_mast_N_b, 
+                            data_mush_N_b,
+                            data_thin_N_b,
+                            data_stub_N_b, 
+                            fill = 0)
+
+#N_p
+
+data_mast_N_p <- subset(data_mast_NL_bpd, data_mast_NL_bpd$retro_label == 0 & data_mast_NL_bpd$location == 'p', select = 3:9)
+
+
+data_mush_N_p <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 0 & 
+                          data_TYPE_NL_bpd$location    == 'p' & 
+                          data_TYPE_NL_bpd$TYPE        == 'mushroom', 
+                        select = 4:6)
+
+
+data_thin_N_p <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 0 & 
+                          data_TYPE_NL_bpd$location    == 'p' & 
+                          data_TYPE_NL_bpd$TYPE        == 'thin', 
+                        select = 4:6)
+
+
+data_stub_N_p <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 0 & 
+                          data_TYPE_NL_bpd$location    == 'p' & 
+                          data_TYPE_NL_bpd$TYPE        == 'stubby', 
+                        select = 4:6)
+
+
+label_location <- c('N_p')
+
+data_mast_N_p <- cbind.fill(label_location, 
+                            data_mast_N_p, 
+                            data_mush_N_p, 
+                            data_thin_N_p, 
+                            data_stub_N_p, 
+                            fill = 0)
+
+#N_d
+
+data_mast_N_d <- subset(data_mast_NL_bpd, 
+                        data_mast_NL_bpd$retro_label == 0 & 
+                          data_mast_NL_bpd$location    == 'd', 
+                        select = 3:9)
+
+data_mush_N_d <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 0 & 
+                          data_TYPE_NL_bpd$location    == 'd' & 
+                          data_TYPE_NL_bpd$TYPE        == 'mushroom', 
+                        select = 4:6)
+
+data_thin_N_d <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 0 & 
+                          data_TYPE_NL_bpd$location    == 'd' & 
+                          data_TYPE_NL_bpd$TYPE        == 'thin', 
+                        select = 4:6)
+
+data_stub_N_d <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 0 & 
+                          data_TYPE_NL_bpd$location    == 'd' & 
+                          data_TYPE_NL_bpd$TYPE        == 'stubby', 
+                        select = 4:6)
+
+label_location <- c('N_d')
+
+data_mast_N_d <- cbind.fill(label_location, 
+                            data_mast_N_d, 
+                            data_mush_N_d, 
+                            data_thin_N_d, 
+                            data_stub_N_d, 
+                            fill = 0)
+
+#L_b
+
+data_mast_L_b <- subset(data_mast_NL_bpd, 
+                        data_mast_NL_bpd$retro_label == 1 & 
+                          data_mast_NL_bpd$location    == 'b', 
+                        select = 3:9)
+
+
+data_mush_L_b <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 1 & 
+                          data_TYPE_NL_bpd$location    == 'b' & 
+                          data_TYPE_NL_bpd$TYPE        == 'mushroom', 
+                        select = 4:6)
+
+
+data_thin_L_b <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 1 & 
+                          data_TYPE_NL_bpd$location    == 'b' & 
+                          data_TYPE_NL_bpd$TYPE        == 'thin', 
+                        select = 4:6)
+
+
+data_stub_L_b <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 1 & 
+                          data_TYPE_NL_bpd$location    == 'b' & 
+                          data_TYPE_NL_bpd$TYPE        == 'stubby', 
+                        select = 4:6)
+
+
+label_location <- c('L_b')
+data_mast_L_b <- cbind.fill(label_location, 
+                            data_mast_L_b, 
+                            data_mush_L_b, 
+                            data_thin_L_b, 
+                            data_stub_L_b, 
+                            fill = 0)
+
+#L_p
+
+data_mast_L_p <- subset(data_mast_NL_bpd, 
+                        data_mast_NL_bpd$retro_label == 1 & 
+                          data_mast_NL_bpd$location    == 'p', 
+                        select = 3:9)
+
+
+data_mush_L_p <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 1 & 
+                          data_TYPE_NL_bpd$location    == 'p' & 
+                          data_TYPE_NL_bpd$TYPE        == 'mushroom', 
+                        select = 4:6)
+
+
+data_thin_L_p <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 1 & 
+                          data_TYPE_NL_bpd$location    == 'p' & 
+                          data_TYPE_NL_bpd$TYPE        == 'thin', 
+                        select = 4:6)
+
+
+data_stub_L_p <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 1 & 
+                          data_TYPE_NL_bpd$location    == 'p' & 
+                          data_TYPE_NL_bpd$TYPE        == 'stubby', 
+                        select = 4:6)
+
+
+
+label_location <- c('L_p')
+
+data_mast_L_p <- cbind.fill(label_location, 
+                            data_mast_L_p, 
+                            data_mush_L_p, 
+                            data_thin_L_p, 
+                            data_stub_L_p, 
+                            fill = 0)
+
+#L_d
+
+data_mast_L_d <- subset(data_mast_NL_bpd, 
+                        data_mast_NL_bpd$retro_label == 1 & 
+                          data_mast_NL_bpd$location    == 'd', 
+                        select = 3:9)
+
+
+data_mush_L_d <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 1 & 
+                          data_TYPE_NL_bpd$location    == 'd' & 
+                          data_TYPE_NL_bpd$TYPE        == 'mushroom', 
+                        select = 4:6)
+
+
+data_thin_L_d <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 1 & 
+                          data_TYPE_NL_bpd$location    == 'd' & 
+                          data_TYPE_NL_bpd$TYPE        == 'thin', 
+                        select = 4:6)
+
+
+data_stub_L_d <- subset(data_TYPE_NL_bpd, 
+                        data_TYPE_NL_bpd$retro_label == 1 & 
+                          data_TYPE_NL_bpd$location    == 'd' & 
+                          data_TYPE_NL_bpd$TYPE        == 'stubby', 
+                        select = 4:6)
+
+
+label_location <- c('L_d')
+
+data_mast_L_d <- cbind.fill(label_location, 
+                            data_mast_L_d, 
+                            data_mush_L_d, 
+                            data_thin_L_d, 
+                            data_stub_L_d, 
+                            fill = 0)
+
+
+
+
+data_master <- rbind(data_mast_A_A, 
+                     data_mast_A_b, 
+                     data_mast_A_p, 
+                     data_mast_A_d, 
+                     data_mast_N_A, 
+                     data_mast_N_b, 
+                     data_mast_N_p, 
+                     data_mast_N_d, 
+                     data_mast_L_A, 
+                     data_mast_L_b, 
+                     data_mast_L_p, 
+                     data_mast_L_d)
+
+data_master <- cbind.fill (rat_ID, 
+                           data_count_master, 
+                           data_master)
+
+colnames(data_master) <- c('rat_ID', 
+                           'count',
+                           'group',
+                           'den_ov', 
+                           'den_mush', 
+                           'den_thin', 
+                           'den_stub', 
+                           'vol_ov', 
+                           'len_ov', 
+                           'hd_ov', 
+                           'vol_mush', 
+                           'len_mush', 
+                           'hd_mush', 
+                           'vol_thin', 
+                           'len_thin', 
+                           'hd_thin',
+                           'vol_stub', 
+                           'len_stub', 
+                           'hd_stub')
+
+
+
+savePath_all <- paste0(saveFolder, "/data_all") # creates the path where files can be saved
+setwd(savePath_all)
+write.csv(data_all, paste0(rat_ID, "_data_all.csv"))
+#setwd(fileLoc) # sets working directory to the path created above
+#dir.create(savePath, "data_all") #create a directory to create a file inside
+setwd(saveFolder)
+savePath_master <- paste0(saveFolder, "/data_master")
+setwd(savePath_master)
+write.csv(data_master, paste0(rat_ID, "_data_master.csv"))
+
+}
+
+
+winDialog("ok", "Code complete")
\ No newline at end of file

From dfb8fe67773358b3a15949ecb62a473da4d29d5c Mon Sep 17 00:00:00 2001
From: kellnett <44405714+kellnett@users.noreply.github.com>
Date: Tue, 29 Oct 2019 10:32:32 -0500
Subject: [PATCH 10/11] Create 3D-update

---
 3D-update | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 3D-update

diff --git a/3D-update b/3D-update
new file mode 100644
index 0000000..038d718
--- /dev/null
+++ b/3D-update
@@ -0,0 +1 @@
+testing

From 839b277a1446acd1d0b9643dcd5f77d9253339bc Mon Sep 17 00:00:00 2001
From: kellnett <44405714+kellnett@users.noreply.github.com>
Date: Tue, 29 Oct 2019 10:34:42 -0500
Subject: [PATCH 11/11] updated_loop

current code I'm using
I worked on it separately outside of GitHub so a lot is different
---
 folder_loop.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/folder_loop.R b/folder_loop.R
index c300763..77f8ac4 100644
--- a/folder_loop.R
+++ b/folder_loop.R
@@ -1,4 +1,4 @@
-#loop attempt with raw_data_cleanup.R
+#loop attempt with raw_data_cleanup.R again
 
 library(dplyr)           
 library(ggplot2)        
@@ -680,4 +680,4 @@ write.csv(data_master, paste0(rat_ID, "_data_master.csv"))
 }
 
 
-winDialog("ok", "Code complete")
\ No newline at end of file
+winDialog("ok", "Code complete")