From 1712c4ef6fd103d503af1af4c1d81de5d8821f73 Mon Sep 17 00:00:00 2001 From: kellenett Date: Fri, 15 Mar 2019 10:43:48 -0500 Subject: [PATCH 01/11] Create 1D clustering Yadav.R --- 1D clustering Yadav.R | 277 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 1D clustering Yadav.R diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R new file mode 100644 index 0000000..bba2b5b --- /dev/null +++ b/1D clustering Yadav.R @@ -0,0 +1,277 @@ +# modeling Yadav et al. 2012 cluster analysis; tailored towards clustering of attachment points +# initially coded by S. Johnson- updates and 3D analysis of spine heads added by K. Nett + + +library(dplyr) # to install these , enter install.packages("package name") into the console in RStudio. replace package name with each package name in quotes, ie install.packages("dplyr") +library(ggplot2) +library(lattice) +library(latticeExtra) +library(cluster) +library(stats) +library(readr) +library(bio3d) #install.packages("bio3d", dependencies = TRUE) +library(DECIPHER) +#how to install DECIPHER - a is enter in response to prompt for "all" +#source("https://bioconductor.org/biocLite.R") +#biocLite("DECIPHER") +#a + + +fileChosen <- file.choose() # opens file dialog to open +filePath <- dirname(fileChosen) # gets the directory name that that file is in +setwd(filePath) # sets the working directory. important for saving files later +fileList <- list.files(filePath) # a list of files in the directory to go through +fileList <- grep(".csv", fileList, value = TRUE) # finds only lists .csv files in case there are other types +data_all <- data.frame() # initializes a data.frame that will store all of the data for the experiment + + + + +for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in + fileName <- unlist(strsplit(fileList[1], "[.]"))[1] # removes ".csv" from file name + fileLoc <- file.path(filePath, fileList[1]) # creates a path to each file (differs from FileChosen as the loop runs) + df <- read_csv(fileLoc, col_types = cols()) # reads the csv file, suppresses output information about columns + colnames(df) <- gsub("-", "_", colnames(df)) # R doesn't like dashes in column names, replaces "-" with "_" for all colnames of df + df <- df[complete.cases(df$SOMA_DISTANCE),] # removes cases where there is no soma distance data + df <- df[complete.cases(df$RAYBURST_VOLUME),] # removes cases where there is no data + df <- df[complete.cases(df$MAX_DTS),] # removes cases where there is no data + df$file <- fileName # adds a column with repeated information about which file that spine came from + total_length <- df %>% group_by(SECTION_NUMBER) %>% summarise(section_length=max(SECTION_LENGTH)) %>%ungroup() %>% summarise(total_length=sum(section_length)) %>% as.double() + # ^^this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together + + total_spines <- as.numeric(nrow(df)) # number of rows = number of spines + density_overall <- total_spines/total_length # calculate density of segment + density_mushroom <- sum(as.numeric(df$TYPE=="mushroom"))/total_length # counts the number of "mushroom" rows and divides by length for mushroom density + density_thin <- sum(as.numeric(df$TYPE=="thin"))/total_length # same as above, with "thin" + density_stubby <- sum(as.numeric(df$TYPE=="stubby"))/total_length # same as above, with "stubby" + df <- df[order(df$SOMA_DISTANCE),] + + data_coords <- data.frame(df$X, df$Y, df$Z) + + agn <- agnes(df$SOMA_DISTANCE,metric = "euclidean", method = "average") # runs the agnes, computes agglomerative hierarchical clustering, "average" = UPGMA + dist_ac <- as.matrix(dist(df$SOMA_DISTANCE)) + df$nn_dist_ac <- apply(dist_ac,2, function(x) sort(x)[2]) # finds nearest neighbor for each spine + df$nn2_dist_ac <- apply(dist_ac,2, function(x) sort(x)[3]) # finds second nearest neighbor + df$nn3_dist_ac <- apply(dist_ac,2, function(x) sort(x)[4]) # finds third + dendrite_ac <- agn$ac # defines agglomeration coefficent of the dendrite + + ac_test <- list() #initializes list for ac's from next for loop + possible_dist <- seq(1,total_length, by=0.01) # create list of possible soma distances by 0.01 increments to pull from + + for(k in 1:1000){ + test_data <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines + test_dist <- as.matrix(dist(test_data)) #creates distance matrix for random sample + assign("test_cluster", agnes(test_data,metric = "euclidean", method = "average")) #runs UPGMA on sample and labels the agn output "test_cluster" + ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac + } + + cScore_ac_1D <- sum(as.numeric(ac_test 1) #if more than 1 spine, then it is in a cluster + is_clustered_1D <- sum(cluster_freq_1D$is_clustered) + num_clusters_1D <- is_clustered_1D + num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters + spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq)) + spines_clustered_1D <- list() + spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[1,2]) + + + spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) + + + + + random_1D_all <- data.frame() + prob_density_test_all_1D <- data.frame() + +# 1D random spines for loop + for(j in 1:5000){ + random_spines_1D <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines + test_dist_1D <- as.matrix(dist(random_spines_1D)) #creates distance matrix for random sample + UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE) + UPGMA_test_1D$rn <- rownames(UPGMA_test_1D) + cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D) + cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),] + cluster_freq_test_1D <- table(cluster_all_test_1D$cluster) + cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D) + cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1) + is_clustered_test_1D <- sum(cluster_freq_test_1D$is_clustered) + num_clusters_test_1D <- is_clustered_test_1D + spines_is_clustered_test_1D <- cluster_freq_test_1D %>% group_by(is_clustered) %>% summarise(num_clusters_test_1D = sum(Freq)) + spines_clustered_test_1D <- spines_is_clustered_test_1D[2,2] + spines_not_test_1D <- spines_is_clustered_test_1D[1,2] + spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0 + random_1D <- spines_clustered_test_1D + random_1D[is.na(random_1D)] <- 0 + random_1D_all <- rbind(random_1D_all, random_1D) + + } # end of random spines 1D for loop + + random_1D_all <- as.matrix(random_1D_all) + random_1D_all <- as.numeric(random_1D_all) + + std_test_1D <- sd(random_1D_all) + mean_test_1D <- mean(random_1D_all) + curve_dnorm_1D <- dnorm(random_1D_all, mean_test_1D, std_test_1D) + Cscore_1D <- pnorm(curve_dnorm_1D) + + + # add all 1D data to all data + cluster_data_1D <- data.frame() + cluster_data_1D <- rbind(cluster_data_1D, num_clusters_1D) + cluster_data_1D <- cbind(cluster_data_1D, spines_clustered_1D) + cluster_data_1D <- cbind(cluster_data_1D, spines_not_1D) + cluster_data_1D <- cbind(cluster_data_1D, Cscore_1D) + cluster_data_1D[is.na(cluster_data_1D)] <- 0 + colnames(cluster_data_1D) <- c("# of clusters - 1D", "spines clustered - 1D", "spines not clustered - 1D", "Cscore - 1D") + + +# 3D clustering analysis + dist_3D <- data.frame(dist.xyz(data_coords)) # creates distance matrix of spine head coordinates + UPGMA_obsv_3D <- IdClusters(dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper + #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1) + UPGMA_obsv_3D$rn <- rownames(UPGMA_obsv_3D) #adds a column with rownows to keep spine ID, not sure if this step is neccessary + cluster_all_3D <- cbind(UPGMA_obsv_3D, data_coords) #get the coordinates for each spine ID, if kept in row name/number order, will correctly correspond to each spine + cluster_all_3D <- cluster_all_3D[order(cluster_all_3D$cluster),] #sort data frame by cluster number + cluster_freq_3D <- table(cluster_all_3D$cluster) #create a table counting how many times each cluster Variable occurs (i.e. how many spines in each cluster) + cluster_freq_3D <- as.data.frame(cluster_freq_3D) + cluster_freq_3D$Freq <- as.numeric(cluster_freq_3D$Freq) + cluster_freq_3D$is_clustered <- as.numeric(cluster_freq_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1 + is_clustered_3D <- sum(cluster_freq_3D$is_clustered) + num_clusters_3D <- is_clustered_3D + num_clusters_3D <- sum(cluster_freq_3D$is_clustered) #count how many 1s to determine how many clusters (spines > 1) in the segment + spines_in_cluster_3D <- cluster_freq_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_3D = sum(Freq)) + spines_clustered_3D <- list() + spines_clustered_3D <- rbind(spines_clustered_3D, spines_in_cluster_3D[1,2]) + spines_not_3D <- as.numeric(total_spines - spines_clustered_3D) + + random_3D_all <- data.frame() + prob_density_test_all_1D <- data.frame() + + +# 3D random spines for loop + for(j in 1:5000){ + test_data_X <- data.frame(sample(df$X), df$Y, df$Z) # randomize the X's, Y's, and Z's to make a "biologically plausible" dataframe. + colnames(test_data_X) <- c( "x", "Y", "Z") + test_data_Y <- data.frame(df$X, sample(df$Y), df$Z) + colnames(test_data_Y) <- c( "x", "Y", "Z") + test_data_Z <- data.frame(df$X, df$Y, sample(df$Z)) + colnames(test_data_Z) <- c( "x", "Y", "Z") + test_data <- rbind(test_data_X, test_data_Y, test_data_Z) + test_data_final <-data.frame(sample_n(test_data, total_spines)) + test_dist <- as.matrix(dist(test_data_final)) #creates distance matrix for random sample + + UPGMA_test_3D <- IdClusters(test_dist, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper + #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1) + UPGMA_test_3D$rn <- rownames(UPGMA_test_3D) #adds a column with rownows to keep spine ID, not sure if this step is neccessary + cluster_all_test_3D <- cbind(UPGMA_test_3D, test_data_final) #get the coordinates for each spine ID, if kept in row name/number order, will correctly correspond to each spine + cluster_all_test_3D <- cluster_all_test_3D[order(cluster_all_test_3D$cluster),] #sort data frame by cluster number + cluster_freq_test_3D <- table(cluster_all_test_3D$cluster) #create a table counting how many times each cluster Variable occurs (i.e. how many spines in each cluster) + cluster_freq_test_3D <- as.data.frame(cluster_freq_test_3D) + cluster_freq_test_3D$Freq <- as.numeric(cluster_freq_test_3D$Freq) #convert factors to numbers + cluster_freq_test_3D$is_clustered_test <- as.numeric(cluster_freq_test_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1 + is_clustered_test_3D <- sum(cluster_freq_test_3D$is_clustered_test) + num_clusters_test_3D <- is_clustered_test_3D #count how many 1s to determine how many clusters (spines > 1) in the segment + num_clusters_test_3D <- sum(cluster_freq_test_3D$is_clustered_test) + spines_in_cluster_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered_test) %>% summarise(num_clus_spines_test_3D = sum(Freq)) + spines_clustered_test_3D <- list() + spines_clustered_test_3D <- rbind(spines_clustered_test_3D, spines_in_cluster_test_3D[1,2]) + spines_not_test_3D <- as.numeric(total_spines - spines_in_cluster_test_3D) + spines_clustered_test_3D[is.na(spines_clustered_test_3D)] <- 0 + random_3D <- spines_clustered_test_3D + random_3D[is.na(random_3D)] <- 0 + random_3D_all <- rbind(random_3D_all, random_1D) + + + } # end of random spines 3D for-loop + + random_3D_all <- as.matrix(random_3D_all) + random_3D_all <- as.numeric(random_3D_all) + + std_test_3D <- sd(random_3D_all) + mean_test_3D <- mean(random_3D_all) + curve_dnorm_3D <- dnorm(random_3D_all, mean_test_3D, std_test_3D) + Cscore_3D <- pnorm(curve_dnorm_3D) + + # add 3D data to data frame + cluster_data_3D <- data.frame() + cluster_data_3D <- rbind(cluster_data_3D, num_clusters_3D) + cluster_data_3D <- cbind(cluster_data_3D, spines_clustered_3D) + cluster_data_3D <- cbind(cluster_data_3D, spines_not_3D) + cluster_data_3D <- cbind(cluster_data_3D, Cscore_3D) + cluster_data_3D[is.na(cluster_data_3D)] <- 0 + colnames(cluster_data_3D) <- c("# of clusters-3D", "spines clustered-3D", "spines not clustered-3D", "Cscore-3D") + + + + df$density_overall <- density_overall # add these data to df + df$density_mushroom <- density_mushroom + df$density_thin <- density_thin + df$density_stubby <- density_stubby + data_all <- rbind(data_all, df) # adds df as next row in the data_all file + + + data_all <- cbind(data_all, cluster_data_1D) + data_all <- cbind(data_all, cluster_data_3D) + + +} # end of file for-loop + +data_all$animal_num <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[2]) # this pulls out an animal number from the file number +data_all$retro_label <- substring(data_all$animal_num, nchar(data_all$animal_num), nchar(data_all$animal_num)) #pulls off 'L' or 'N' from ID to indicate whether cell was retro-gradely labeled +#data_all$PDB <- substring(data_all$animal_num, nchar(data_all$animal_num), nchar(data_all$animal_num)) +data_all$PDB <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[3]) # pulls dendrite location out of name and adds column +data_all$PDB <- replace(data_all$PDB, data_all$PDB=="p", "prox") +data_all$PDB <- replace(data_all$PDB, data_all$PDB=="d", "dist") +data_all$PDB <- replace(data_all$PDB, data_all$PDB=="b", "basal") +data_all$retro_label <- replace(data_all$retro_label, data_all$retro_label=="L", "labeled") +data_all$retro_label <- replace(data_all$retro_label, data_all$retro_label=="N", "not labeled") +data_all$stack <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[4]) #gives letter ID of different imaging days +data_all$stack <- unlist(data_all$stack) +data_all$animal_num <- lapply(data_all$animal_num, function(x) unlist(strsplit(x, "L"))[1]) #removes letter from behind animal ID name +data_all$animal_num <- lapply(data_all$animal_num, function(x) unlist(strsplit(x, "N"))[1]) +data_all$RAYBURST_VOLUME <- as.numeric(data_all$RAYBURST_VOLUME) +data_all$MAX_DTS <- as.numeric(data_all$MAX_DTS) + +n <- readline(prompt="Enter group name:") # gives prompt on console to enter animal group +data_all$group <- n # adds the group to the master data file + +data_all$animal_num <- unlist(data_all$animal_num) #changes animal number to a vector rather than a list, which is important for executing the following task +data_all$PDB <- unlist(data_all$PDB) # same as above + +TYPE_ave <- data_all %>% group_by(group, animal_num, retro_label, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) +PDB_TYPE_ave <- data_all %>% group_by(group, animal_num, stack, retro_label, PDB, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) + +PDB_ave <- data_all %>% group_by(group, animal_num, retro_label, PDB) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) +# ^first groups by treatment group, then by animal, then by day of image (i.e. -F), then by dendrite location +# summarise then finds averages for each group for all various pieces of data + +animal_ave <- data_all %>% group_by(group, animal_num) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) +# groups by treatment then animal, then finds averages per animal for various data + +labeled_ave <- data_all %>% group_by(group, animal_num, retro_label) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) +# groups by treatment then animal, then finds averages per animal for various data + + + +#dir.create("analysis") #creates a directory to create a file on the computer +#savePath <- paste(filePath,"/", "analysis", collapse = "/", sep="") # creates the path where files can be saved +#setwd(savePath) # sets working directory to the path created above +#write.csv(data_all, "data_all.csv") +#write.csv(PDB_ave, "PDB averages.csv") +#write.csv(animal_ave, "Animal Averages.csv") +#write.csv(TYPE_ave, "spine_type_averages.csv") +#write.csv(PDB_TYPE_ave, "PDB_spine_type_averages.csv") +#write.csv(labeled_ave, "labeled_averages.csv") + From e65c4aafff26ede42b4433e534f8fc4d21e23e1d Mon Sep 17 00:00:00 2001 From: kellenett Date: Fri, 15 Mar 2019 12:25:51 -0500 Subject: [PATCH 02/11] Update 1D clustering Yadav.R --- 1D clustering Yadav.R | 164 +++++++++++++++++++++--------------------- 1 file changed, 84 insertions(+), 80 deletions(-) diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R index bba2b5b..3e6b770 100644 --- a/1D clustering Yadav.R +++ b/1D clustering Yadav.R @@ -44,12 +44,14 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in density_mushroom <- sum(as.numeric(df$TYPE=="mushroom"))/total_length # counts the number of "mushroom" rows and divides by length for mushroom density density_thin <- sum(as.numeric(df$TYPE=="thin"))/total_length # same as above, with "thin" density_stubby <- sum(as.numeric(df$TYPE=="stubby"))/total_length # same as above, with "stubby" - df <- df[order(df$SOMA_DISTANCE),] - - data_coords <- data.frame(df$X, df$Y, df$Z) + df <- df[order(df$SOMA_DISTANCE),] #order data by column SOMA_DISTANCE so that coordinates pulled are in spine attachment order + data_coords <- data.frame(df$X, df$Y, df$Z) # creates data frame with spine head coordinates + +# Shane's original clustering analysis using spine attachment distance from soma + agn <- agnes(df$SOMA_DISTANCE,metric = "euclidean", method = "average") # runs the agnes, computes agglomerative hierarchical clustering, "average" = UPGMA - dist_ac <- as.matrix(dist(df$SOMA_DISTANCE)) + dist_ac <- as.matrix(dist(df$SOMA_DISTANCE)) #creates a distance matrix of soma distances df$nn_dist_ac <- apply(dist_ac,2, function(x) sort(x)[2]) # finds nearest neighbor for each spine df$nn2_dist_ac <- apply(dist_ac,2, function(x) sort(x)[3]) # finds second nearest neighbor df$nn3_dist_ac <- apply(dist_ac,2, function(x) sort(x)[4]) # finds third @@ -65,67 +67,68 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac } - cScore_ac_1D <- sum(as.numeric(ac_test 1) #if more than 1 spine, then it is in a cluster - is_clustered_1D <- sum(cluster_freq_1D$is_clustered) - num_clusters_1D <- is_clustered_1D - num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters - spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq)) - spines_clustered_1D <- list() - spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[1,2]) - + cluster_all_1D <- cbind(UPGMA_obsv_1D, df$SOMA_DISTANCE) # adds cluster number, spine ID(row number) and corresponding distance from soma together + cluster_all_1D <- cluster_all_1D[order(cluster_all_1D$cluster),] #orders data by cluster number + cluster_freq_1D <- table(cluster_all_1D$cluster) #creates a table with how many spines are in each cluster + cluster_freq_1D <- as.data.frame(cluster_freq_1D) #converts above table to a data frame + cluster_freq_1D$Freq <- as.numeric(cluster_freq_1D$Freq) # turns the "cluster" column to a number + cluster_freq_1D$is_clustered <- as.numeric(cluster_freq_1D$Freq > 1) #returns 1 if there is >1 spine in a cluster and 0 if not-- therefore all 1s reflect a true "cluster" since it has more than 1 spine in ity + num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment + spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq)) # count how many spines are in the true clusters (>1 spines) and how many are not + spines_clustered_1D <- list() #initialize list for number of spines that are clustered + spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[1,2]) #add number of spines clustered to list + spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix + spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form - spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) + spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) #define and calculate number of spines not clustered (total spines minus number of spines clustered) + test_spines_clustered_all_1D <- data.frame() #initialize dataframe to contain # of spines in a cluster for all random samples + test_num_clusters_all_1D <- data.frame() - - - random_1D_all <- data.frame() - prob_density_test_all_1D <- data.frame() - -# 1D random spines for loop - for(j in 1:5000){ +# 1D random spines for-loop + for(j in 1:100){ random_spines_1D <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines test_dist_1D <- as.matrix(dist(random_spines_1D)) #creates distance matrix for random sample - UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE) - UPGMA_test_1D$rn <- rownames(UPGMA_test_1D) - cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D) - cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),] - cluster_freq_test_1D <- table(cluster_all_test_1D$cluster) - cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D) - cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1) - is_clustered_test_1D <- sum(cluster_freq_test_1D$is_clustered) - num_clusters_test_1D <- is_clustered_test_1D - spines_is_clustered_test_1D <- cluster_freq_test_1D %>% group_by(is_clustered) %>% summarise(num_clusters_test_1D = sum(Freq)) - spines_clustered_test_1D <- spines_is_clustered_test_1D[2,2] - spines_not_test_1D <- spines_is_clustered_test_1D[1,2] - spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0 - random_1D <- spines_clustered_test_1D - random_1D[is.na(random_1D)] <- 0 - random_1D_all <- rbind(random_1D_all, random_1D) + UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE) #runs UPGMA with cutoff (same as with obsv but with random sample) + UPGMA_test_1D$rn <- rownames(UPGMA_test_1D) #add row name (spine ID) to cluster number + cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D) #add cluster number/spine ID to randomly generated soma distances + cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),] #order by cluster number + cluster_freq_test_1D <- table(cluster_all_test_1D$cluster) #generate table with number of spines in each cluster + cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D) #change table to data frame + cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1) #ask whether cluster has >1 spines in it (1 for yes, 0 for no) + num_clusters_test_1D <- sum(cluster_freq_test_1D$is_clustered) # add how many 1s (or how many clusters have >1 spines) to get true number of clusters + num_clusters_test_1D[is.na(num_clusters_test_1D)] <- 0 #changes possible NA from 0 clusters to 0 + test_num_clusters_all <- rbind(test_num_clusters_all, num_clusters_test_1D) #adds total number of clusters to running list + spines_in_cluster_test_1D <- cluster_freq_test_1D %>% group_by(is_clustered) %>% summarise(num_clusters_test_1D = sum(Freq)) #count how many spines that are in a true cluster or not + spines_clustered_test_1D <- spines_in_cluster_test_1D[2,2] # define how many spines are in a cluster + spines_not_test_1D <- as.numeric(total_spines - spines_clustered_test_1D) # calculate how many spines are not clustered + spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0 #returns 0 instead of Na if no spines are clustered in the random sample + test_spines_clustered_all_1D <- rbind(test_spines_clustered_all_1D, spines_clustered_test_1D) #add number of clustered spines to running list - } # end of random spines 1D for loop + } # end of random spines 1D for-loop - random_1D_all <- as.matrix(random_1D_all) - random_1D_all <- as.numeric(random_1D_all) + test_spines_clustered_all_1D <- as.matrix(test_spines_clustered_all_1D) #changes data.frame to matrix (not sure if this is neccesary but it works) + test_spines_clustered_all_1D <- as.numeric(test_spines_clustered_all_1D) #changes all vales to numeric (again, not sure if neccessary) - std_test_1D <- sd(random_1D_all) - mean_test_1D <- mean(random_1D_all) - curve_dnorm_1D <- dnorm(random_1D_all, mean_test_1D, std_test_1D) - Cscore_1D <- pnorm(curve_dnorm_1D) - + + std_test_1D <- sd(test_spines_clustered_all_1D) #generates STD of total number of spines clustered in entire random sample + mean_test_1D <- mean(test_spines_clustered_all_1D) #generates average number of spines in a cluster in random data + curve_dnorm_1D <- dnorm(test_spines_clustered_all_1D, mean_test_1D, std_test_1D) #gives probability density function, or height of probability distribution at each point(height = frequency) + std_curve_1D <- sd(curve_dnorm_1D) + mean_curve_1D <- mean(curve_dnorm_1D) + + Cscore_1D <- pnorm(spines_clustered_1D, mean_curve_1D, sd=sqrt(std_curve_1D)) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher + #therefore, the closer to 1, higher probability that a given number has more clustered spines than the random normal distribution and vice versa # add all 1D data to all data cluster_data_1D <- data.frame() @@ -138,7 +141,7 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in # 3D clustering analysis - dist_3D <- data.frame(dist.xyz(data_coords)) # creates distance matrix of spine head coordinates + dist_3D <- as.matrix(dist.xyz(data_coords)) # creates distance matrix of spine head coordinates UPGMA_obsv_3D <- IdClusters(dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1) UPGMA_obsv_3D$rn <- rownames(UPGMA_obsv_3D) #adds a column with rownows to keep spine ID, not sure if this step is neccessary @@ -148,20 +151,21 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in cluster_freq_3D <- as.data.frame(cluster_freq_3D) cluster_freq_3D$Freq <- as.numeric(cluster_freq_3D$Freq) cluster_freq_3D$is_clustered <- as.numeric(cluster_freq_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1 - is_clustered_3D <- sum(cluster_freq_3D$is_clustered) - num_clusters_3D <- is_clustered_3D num_clusters_3D <- sum(cluster_freq_3D$is_clustered) #count how many 1s to determine how many clusters (spines > 1) in the segment spines_in_cluster_3D <- cluster_freq_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_3D = sum(Freq)) spines_clustered_3D <- list() spines_clustered_3D <- rbind(spines_clustered_3D, spines_in_cluster_3D[1,2]) + spines_clustered_3D <- as.matrix(spines_clustered_3D) #conerts num of spines clustered to matrix + spines_clustered_3D <- as.numeric(spines_clustered_3D) #converts to numerical form + spines_not_3D <- as.numeric(total_spines - spines_clustered_3D) - - random_3D_all <- data.frame() - prob_density_test_all_1D <- data.frame() + + test_spines_clustered_all_3D <- data.frame() + test_num_clusters_all_3D <- data.frame() # 3D random spines for loop - for(j in 1:5000){ + for(j in 1:100){ test_data_X <- data.frame(sample(df$X), df$Y, df$Z) # randomize the X's, Y's, and Z's to make a "biologically plausible" dataframe. colnames(test_data_X) <- c( "x", "Y", "Z") test_data_Y <- data.frame(df$X, sample(df$Y), df$Z) @@ -170,39 +174,39 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in colnames(test_data_Z) <- c( "x", "Y", "Z") test_data <- rbind(test_data_X, test_data_Y, test_data_Z) test_data_final <-data.frame(sample_n(test_data, total_spines)) - test_dist <- as.matrix(dist(test_data_final)) #creates distance matrix for random sample + test_dist_3D <- as.matrix(dist(test_data_final)) #creates distance matrix for random sample - UPGMA_test_3D <- IdClusters(test_dist, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper + UPGMA_test_3D <- IdClusters(test_dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1) UPGMA_test_3D$rn <- rownames(UPGMA_test_3D) #adds a column with rownows to keep spine ID, not sure if this step is neccessary cluster_all_test_3D <- cbind(UPGMA_test_3D, test_data_final) #get the coordinates for each spine ID, if kept in row name/number order, will correctly correspond to each spine cluster_all_test_3D <- cluster_all_test_3D[order(cluster_all_test_3D$cluster),] #sort data frame by cluster number cluster_freq_test_3D <- table(cluster_all_test_3D$cluster) #create a table counting how many times each cluster Variable occurs (i.e. how many spines in each cluster) cluster_freq_test_3D <- as.data.frame(cluster_freq_test_3D) - cluster_freq_test_3D$Freq <- as.numeric(cluster_freq_test_3D$Freq) #convert factors to numbers - cluster_freq_test_3D$is_clustered_test <- as.numeric(cluster_freq_test_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1 - is_clustered_test_3D <- sum(cluster_freq_test_3D$is_clustered_test) - num_clusters_test_3D <- is_clustered_test_3D #count how many 1s to determine how many clusters (spines > 1) in the segment - num_clusters_test_3D <- sum(cluster_freq_test_3D$is_clustered_test) - spines_in_cluster_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered_test) %>% summarise(num_clus_spines_test_3D = sum(Freq)) - spines_clustered_test_3D <- list() - spines_clustered_test_3D <- rbind(spines_clustered_test_3D, spines_in_cluster_test_3D[1,2]) - spines_not_test_3D <- as.numeric(total_spines - spines_in_cluster_test_3D) + cluster_freq_test_3D$is_clustered <- as.numeric(cluster_freq_test_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1 + num_clusters_test_3D <- sum(cluster_freq_test_3D$is_clustered) + num_clusters_test_3D[is.na(num_clusters_test_3D)] <- 0 + test_num_clusters_all_3D <- rbind(test_num_clusters_all_3D, num_clusters_test_3D) + spines_in_cluster_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_test_3D = sum(Freq)) + spines_clustered_test_3D <- spines_in_cluster_test_3D[2,2] + + spines_not_test_3D <- as.numeric(spines_in_cluster_test_3D[1,2]) spines_clustered_test_3D[is.na(spines_clustered_test_3D)] <- 0 - random_3D <- spines_clustered_test_3D - random_3D[is.na(random_3D)] <- 0 - random_3D_all <- rbind(random_3D_all, random_1D) + test_spines_clustered_all_3D <- rbind(test_spines_clustered_all_3D, spines_clustered_test_3D) - } # end of random spines 3D for-loop - random_3D_all <- as.matrix(random_3D_all) - random_3D_all <- as.numeric(random_3D_all) + test_spines_clustered_all_3D <- as.matrix(test_spines_clustered_all_3D) + test_spines_clustered_all_3D <- as.numeric(test_spines_clustered_all_3D) - std_test_3D <- sd(random_3D_all) - mean_test_3D <- mean(random_3D_all) - curve_dnorm_3D <- dnorm(random_3D_all, mean_test_3D, std_test_3D) - Cscore_3D <- pnorm(curve_dnorm_3D) + std_test_3D <- sd(test_spines_clustered_all_3D) + mean_test_3D <- mean(test_spines_clustered_all_3D) + curve_dnorm_3D <- dnorm(test_spines_clustered_all_3D, mean_test_3D, std_test_3D) + std_curve_3D <- sd(curve_dnorm_3D) + mean_curve_3D <- mean(curve_dnorm_3D) + + Cscore_3D <- pnorm(spines_clustered_3D, mean_curve_3D, sd=sqrt(std_curve_3D)) + # add 3D data to data frame cluster_data_3D <- data.frame() From a94a6ac34548d3e4e0c2065a47da56b8da585b45 Mon Sep 17 00:00:00 2001 From: kellenett Date: Fri, 15 Mar 2019 14:12:39 -0500 Subject: [PATCH 03/11] Update 1D clustering Yadav.R --- 1D clustering Yadav.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R index 3e6b770..9550389 100644 --- a/1D clustering Yadav.R +++ b/1D clustering Yadav.R @@ -60,14 +60,14 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in ac_test <- list() #initializes list for ac's from next for loop possible_dist <- seq(1,total_length, by=0.01) # create list of possible soma distances by 0.01 increments to pull from - for(k in 1:1000){ + for(k in 1:5000){ test_data <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines test_dist <- as.matrix(dist(test_data)) #creates distance matrix for random sample assign("test_cluster", agnes(test_data,metric = "euclidean", method = "average")) #runs UPGMA on sample and labels the agn output "test_cluster" ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac } - cScore_ac_1D <- sum(as.numeric(ac_test 1) #ask whether cluster has >1 spines in it (1 for yes, 0 for no) num_clusters_test_1D <- sum(cluster_freq_test_1D$is_clustered) # add how many 1s (or how many clusters have >1 spines) to get true number of clusters num_clusters_test_1D[is.na(num_clusters_test_1D)] <- 0 #changes possible NA from 0 clusters to 0 - test_num_clusters_all <- rbind(test_num_clusters_all, num_clusters_test_1D) #adds total number of clusters to running list + test_num_clusters_all_1D <- rbind(test_num_clusters_all_1D, num_clusters_test_1D) #adds total number of clusters to running list spines_in_cluster_test_1D <- cluster_freq_test_1D %>% group_by(is_clustered) %>% summarise(num_clusters_test_1D = sum(Freq)) #count how many spines that are in a true cluster or not spines_clustered_test_1D <- spines_in_cluster_test_1D[2,2] # define how many spines are in a cluster spines_not_test_1D <- as.numeric(total_spines - spines_clustered_test_1D) # calculate how many spines are not clustered @@ -196,8 +196,8 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in } # end of random spines 3D for-loop - test_spines_clustered_all_3D <- as.matrix(test_spines_clustered_all_3D) - test_spines_clustered_all_3D <- as.numeric(test_spines_clustered_all_3D) + test_spines_clustered_all_3D <- as.numeric(as.matrix(test_spines_clustered_all_3D)) + std_test_3D <- sd(test_spines_clustered_all_3D) mean_test_3D <- mean(test_spines_clustered_all_3D) @@ -205,7 +205,7 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in std_curve_3D <- sd(curve_dnorm_3D) mean_curve_3D <- mean(curve_dnorm_3D) - Cscore_3D <- pnorm(spines_clustered_3D, mean_curve_3D, sd=sqrt(std_curve_3D)) + Cscore_3D <- pnorm(spines_clustered_3D, mean_curve_3D, std_curve_3D) # add 3D data to data frame From 130ce90b03e48e77caaefc808b5124bc2131b962 Mon Sep 17 00:00:00 2001 From: kellenett Date: Fri, 15 Mar 2019 15:20:00 -0500 Subject: [PATCH 04/11] Update 1D clustering Yadav.R --- 1D clustering Yadav.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R index 9550389..14d68ef 100644 --- a/1D clustering Yadav.R +++ b/1D clustering Yadav.R @@ -86,10 +86,10 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq)) # count how many spines are in the true clusters (>1 spines) and how many are not spines_clustered_1D <- list() #initialize list for number of spines that are clustered - spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[1,2]) #add number of spines clustered to list + spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[2,2]) #add number of spines clustered to list spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form - + spines_clustered_1D[is.na(spines_clustered_1D)] <- 0 spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) #define and calculate number of spines not clustered (total spines minus number of spines clustered) test_spines_clustered_all_1D <- data.frame() #initialize dataframe to contain # of spines in a cluster for all random samples @@ -123,11 +123,12 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in std_test_1D <- sd(test_spines_clustered_all_1D) #generates STD of total number of spines clustered in entire random sample mean_test_1D <- mean(test_spines_clustered_all_1D) #generates average number of spines in a cluster in random data + curve_dnorm_1D <- dnorm(test_spines_clustered_all_1D, mean_test_1D, std_test_1D) #gives probability density function, or height of probability distribution at each point(height = frequency) std_curve_1D <- sd(curve_dnorm_1D) mean_curve_1D <- mean(curve_dnorm_1D) - Cscore_1D <- pnorm(spines_clustered_1D, mean_curve_1D, sd=sqrt(std_curve_1D)) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher + Cscore_1D <- pnorm(spines_clustered_1D, mean_test_1D, std_test_1D) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher #therefore, the closer to 1, higher probability that a given number has more clustered spines than the random normal distribution and vice versa # add all 1D data to all data @@ -154,7 +155,7 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in num_clusters_3D <- sum(cluster_freq_3D$is_clustered) #count how many 1s to determine how many clusters (spines > 1) in the segment spines_in_cluster_3D <- cluster_freq_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_3D = sum(Freq)) spines_clustered_3D <- list() - spines_clustered_3D <- rbind(spines_clustered_3D, spines_in_cluster_3D[1,2]) + spines_clustered_3D <- rbind(spines_clustered_3D, spines_in_cluster_3D[2,2]) spines_clustered_3D <- as.matrix(spines_clustered_3D) #conerts num of spines clustered to matrix spines_clustered_3D <- as.numeric(spines_clustered_3D) #converts to numerical form @@ -205,7 +206,7 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in std_curve_3D <- sd(curve_dnorm_3D) mean_curve_3D <- mean(curve_dnorm_3D) - Cscore_3D <- pnorm(spines_clustered_3D, mean_curve_3D, std_curve_3D) + Cscore_3D <- pnorm(spines_clustered_3D, mean_test_3D, std_test_3D) # add 3D data to data frame From d8dcceed53b807567ea5a2fcf84a0e9295f96401 Mon Sep 17 00:00:00 2001 From: James Kent Date: Mon, 18 Mar 2019 10:21:20 -0500 Subject: [PATCH 05/11] Apply suggestions from code review Co-Authored-By: kellnett <44405714+kellnett@users.noreply.github.com> --- 1D clustering Yadav.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R index 14d68ef..20032d2 100644 --- a/1D clustering Yadav.R +++ b/1D clustering Yadav.R @@ -46,7 +46,7 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in density_stubby <- sum(as.numeric(df$TYPE=="stubby"))/total_length # same as above, with "stubby" df <- df[order(df$SOMA_DISTANCE),] #order data by column SOMA_DISTANCE so that coordinates pulled are in spine attachment order - data_coords <- data.frame(df$X, df$Y, df$Z) # creates data frame with spine head coordinates +data_coords <- df[c("X", "Y", "Z")] # creates data frame with spine head coordinates # Shane's original clustering analysis using spine attachment distance from soma @@ -86,7 +86,7 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq)) # count how many spines are in the true clusters (>1 spines) and how many are not spines_clustered_1D <- list() #initialize list for number of spines that are clustered - spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[2,2]) #add number of spines clustered to list + spines_clustered_1D <- spines_in_cluster_1D[[2,2]] #add number of spines clustered to list spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form spines_clustered_1D[is.na(spines_clustered_1D)] <- 0 @@ -142,7 +142,7 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in # 3D clustering analysis - dist_3D <- as.matrix(dist.xyz(data_coords)) # creates distance matrix of spine head coordinates + dist_3D <- as.matrix(dist(data_coords, diag=TRUE, upper=TRUE)) # creates distance matrix of spine head coordinates UPGMA_obsv_3D <- IdClusters(dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1) UPGMA_obsv_3D$rn <- rownames(UPGMA_obsv_3D) #adds a column with rownows to keep spine ID, not sure if this step is neccessary From 68e60c2ea0d0f458e25ef15062566d4d79cc905b Mon Sep 17 00:00:00 2001 From: kellenett Date: Mon, 25 Mar 2019 13:10:07 -0500 Subject: [PATCH 06/11] Update 1D clustering Yadav.R --- 1D clustering Yadav.R | 175 ++++++++++++++++++++---------------------- 1 file changed, 84 insertions(+), 91 deletions(-) diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R index 14d68ef..780217b 100644 --- a/1D clustering Yadav.R +++ b/1D clustering Yadav.R @@ -9,6 +9,7 @@ library(latticeExtra) library(cluster) library(stats) library(readr) +library(tidyr) library(bio3d) #install.packages("bio3d", dependencies = TRUE) library(DECIPHER) #how to install DECIPHER - a is enter in response to prompt for "all" @@ -28,17 +29,15 @@ data_all <- data.frame() # initializes a data.frame that will store all of the for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in - fileName <- unlist(strsplit(fileList[1], "[.]"))[1] # removes ".csv" from file name - fileLoc <- file.path(filePath, fileList[1]) # creates a path to each file (differs from FileChosen as the loop runs) + fileName <- unlist(strsplit(fileList[i], "[.]"))[1] # removes ".csv" from file name + fileLoc <- file.path(filePath, fileList[i]) # creates a path to each file (differs from FileChosen as the loop runs) df <- read_csv(fileLoc, col_types = cols()) # reads the csv file, suppresses output information about columns colnames(df) <- gsub("-", "_", colnames(df)) # R doesn't like dashes in column names, replaces "-" with "_" for all colnames of df df <- df[complete.cases(df$SOMA_DISTANCE),] # removes cases where there is no soma distance data df <- df[complete.cases(df$RAYBURST_VOLUME),] # removes cases where there is no data df <- df[complete.cases(df$MAX_DTS),] # removes cases where there is no data df$file <- fileName # adds a column with repeated information about which file that spine came from - total_length <- df %>% group_by(SECTION_NUMBER) %>% summarise(section_length=max(SECTION_LENGTH)) %>%ungroup() %>% summarise(total_length=sum(section_length)) %>% as.double() - # ^^this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together - + total_length <- df %>% group_by(SECTION_NUMBER) %>% summarise(section_length=max(SECTION_LENGTH)) %>%ungroup() %>% summarise(total_length=sum(section_length)) %>% as.double() #this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together total_spines <- as.numeric(nrow(df)) # number of rows = number of spines density_overall <- total_spines/total_length # calculate density of segment density_mushroom <- sum(as.numeric(df$TYPE=="mushroom"))/total_length # counts the number of "mushroom" rows and divides by length for mushroom density @@ -60,85 +59,85 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in ac_test <- list() #initializes list for ac's from next for loop possible_dist <- seq(1,total_length, by=0.01) # create list of possible soma distances by 0.01 increments to pull from - for(k in 1:5000){ - test_data <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines - test_dist <- as.matrix(dist(test_data)) #creates distance matrix for random sample - assign("test_cluster", agnes(test_data,metric = "euclidean", method = "average")) #runs UPGMA on sample and labels the agn output "test_cluster" - ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac - } + # for(k in 1:5000){ + # test_data <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines + # test_dist <- as.matrix(dist(test_data)) #creates distance matrix for random sample + # assign("test_cluster", agnes(test_data,metric = "euclidean", method = "average")) #runs UPGMA on sample and labels the agn output "test_cluster" + # ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac + # } - cScore_ac_1D <- sum(as.numeric(ac_test 1) #returns 1 if there is >1 spine in a cluster and 0 if not-- therefore all 1s reflect a true "cluster" since it has more than 1 spine in ity - num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment - spines_in_cluster_1D <- cluster_freq_1D %>% group_by(is_clustered) %>% summarise(num_clus_spines_1D = sum(Freq)) # count how many spines are in the true clusters (>1 spines) and how many are not - spines_clustered_1D <- list() #initialize list for number of spines that are clustered - spines_clustered_1D <- rbind(spines_clustered_1D, spines_in_cluster_1D[2,2]) #add number of spines clustered to list - spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix - spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form - spines_clustered_1D[is.na(spines_clustered_1D)] <- 0 - spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) #define and calculate number of spines not clustered (total spines minus number of spines clustered) +# dist_1D <- as.matrix(dist(df$SOMA_DISTANCE)) # creates a distance matrix using distance from soma, same as dist_ac, but keeping separate for different analysis for now +# UPGMA_obsv_1D <- IdClusters(dist_1D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #runs cluster analysis with cutoff from Yadav paper on observed spines +# UPGMA_obsv_1D$rn <- rownames(UPGMA_obsv_1D) # adds a column with row names to keep spine ID, not sure if this step is necessary +# cluster_all_1D <- cbind(UPGMA_obsv_1D, df$SOMA_DISTANCE) # adds cluster number, spine ID(row number) and corresponding distance from soma together +# cluster_all_1D <- cluster_all_1D[order(cluster_all_1D$cluster),] #orders data by cluster number +# cluster_freq_1D <- table(cluster_all_1D$cluster) #creates a table with how many spines are in each cluster +# cluster_freq_1D <- as.data.frame(cluster_freq_1D) #converts above table to a data frame +# cluster_freq_1D$Freq <- as.numeric(cluster_freq_1D$Freq) # turns the "cluster" column to a number +# cluster_freq_1D$is_clustered <- as.numeric(cluster_freq_1D$Freq > 1) #returns 1 if there is >1 spine in a cluster and 0 if not-- therefore all 1s reflect a true "cluster" since it has more than 1 spine in ity +# num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment - test_spines_clustered_all_1D <- data.frame() #initialize dataframe to contain # of spines in a cluster for all random samples - test_num_clusters_all_1D <- data.frame() +# spines_clustered_1D <- sum(cluster_freq_1D$Freq>1) + +# spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix +# spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form +# spines_clustered_1D[is.na(spines_clustered_1D)] <- 0 +# spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) #define and calculate number of spines not clustered (total spines minus number of spines clustered) +# +# test_spines_clustered_all_1D <- data.frame() #initialize dataframe to contain # of spines in a cluster for all random samples +# test_num_clusters_all_1D <- data.frame() # 1D random spines for-loop - for(j in 1:500){ - random_spines_1D <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines - test_dist_1D <- as.matrix(dist(random_spines_1D)) #creates distance matrix for random sample - UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE) #runs UPGMA with cutoff (same as with obsv but with random sample) - UPGMA_test_1D$rn <- rownames(UPGMA_test_1D) #add row name (spine ID) to cluster number - cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D) #add cluster number/spine ID to randomly generated soma distances - cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),] #order by cluster number - cluster_freq_test_1D <- table(cluster_all_test_1D$cluster) #generate table with number of spines in each cluster - cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D) #change table to data frame - cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1) #ask whether cluster has >1 spines in it (1 for yes, 0 for no) - num_clusters_test_1D <- sum(cluster_freq_test_1D$is_clustered) # add how many 1s (or how many clusters have >1 spines) to get true number of clusters - num_clusters_test_1D[is.na(num_clusters_test_1D)] <- 0 #changes possible NA from 0 clusters to 0 - test_num_clusters_all_1D <- rbind(test_num_clusters_all_1D, num_clusters_test_1D) #adds total number of clusters to running list - spines_in_cluster_test_1D <- cluster_freq_test_1D %>% group_by(is_clustered) %>% summarise(num_clusters_test_1D = sum(Freq)) #count how many spines that are in a true cluster or not - spines_clustered_test_1D <- spines_in_cluster_test_1D[2,2] # define how many spines are in a cluster - spines_not_test_1D <- as.numeric(total_spines - spines_clustered_test_1D) # calculate how many spines are not clustered - spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0 #returns 0 instead of Na if no spines are clustered in the random sample - test_spines_clustered_all_1D <- rbind(test_spines_clustered_all_1D, spines_clustered_test_1D) #add number of clustered spines to running list - - } # end of random spines 1D for-loop +# for(j in 1:10000){ +# random_spines_1D <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines +# test_dist_1D <- as.matrix(dist(random_spines_1D)) #creates distance matrix for random sample +# UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE) #runs UPGMA with cutoff (same as with obsv but with random sample) +# UPGMA_test_1D$rn <- rownames(UPGMA_test_1D) #add row name (spine ID) to cluster number +# cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D) #add cluster number/spine ID to randomly generated soma distances +# cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),] #order by cluster number +# cluster_freq_test_1D <- table(cluster_all_test_1D$cluster) #generate table with number of spines in each cluster +# cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D) #change table to data frame +# cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1) #ask whether cluster has >1 spines in it (1 for yes, 0 for no) +# num_clusters_test_1D <- sum(cluster_freq_test_1D$is_clustered) # add how many 1s (or how many clusters have >1 spines) to get true number of clusters +# num_clusters_test_1D[is.na(num_clusters_test_1D)] <- 0 #changes possible NA from 0 clusters to 0 +# test_num_clusters_all_1D <- rbind(test_num_clusters_all_1D, num_clusters_test_1D) #adds total number of clusters to running list +# spines_clustered_test_1D <- sum(cluster_freq_test_1D$Freq>1) +# + # spines_not_test_1D <- as.numeric(total_spines - spines_clustered_test_1D) # calculate how many spines are not clustered +# spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0 #returns 0 instead of Na if no spines are clustered in the random sample +# test_spines_clustered_all_1D <- rbind(test_spines_clustered_all_1D, spines_clustered_test_1D) #add number of clustered spines to running list +# +# } # end of random spines 1D for-loop - test_spines_clustered_all_1D <- as.matrix(test_spines_clustered_all_1D) #changes data.frame to matrix (not sure if this is neccesary but it works) - test_spines_clustered_all_1D <- as.numeric(test_spines_clustered_all_1D) #changes all vales to numeric (again, not sure if neccessary) +# test_spines_clustered_all_1D <- as.matrix(test_spines_clustered_all_1D) #changes data.frame to matrix (not sure if this is neccesary but it works) +# test_spines_clustered_all_1D <- as.numeric(test_spines_clustered_all_1D) #changes all vales to numeric (again, not sure if neccessary) +# +# std_test_1D <- sd(test_spines_clustered_all_1D) #generates STD of total number of spines clustered in entire random sample +# mean_test_1D <- mean(test_spines_clustered_all_1D) #generates average number of spines in a cluster in random data +# +# curve_dnorm_1D <- dnorm(test_spines_clustered_all_1D, mean_test_1D, std_test_1D) #gives probability density function, or height of probability distribution at each point(height = frequency) +# std_curve_1D <- sd(curve_dnorm_1D) +# mean_curve_1D <- mean(curve_dnorm_1D) - std_test_1D <- sd(test_spines_clustered_all_1D) #generates STD of total number of spines clustered in entire random sample - mean_test_1D <- mean(test_spines_clustered_all_1D) #generates average number of spines in a cluster in random data - - curve_dnorm_1D <- dnorm(test_spines_clustered_all_1D, mean_test_1D, std_test_1D) #gives probability density function, or height of probability distribution at each point(height = frequency) - std_curve_1D <- sd(curve_dnorm_1D) - mean_curve_1D <- mean(curve_dnorm_1D) - - Cscore_1D <- pnorm(spines_clustered_1D, mean_test_1D, std_test_1D) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher +# Cscore_1D <- pnorm(spines_clustered_1D, mean_test_1D, std_test_1D) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher #therefore, the closer to 1, higher probability that a given number has more clustered spines than the random normal distribution and vice versa # add all 1D data to all data - cluster_data_1D <- data.frame() - cluster_data_1D <- rbind(cluster_data_1D, num_clusters_1D) - cluster_data_1D <- cbind(cluster_data_1D, spines_clustered_1D) - cluster_data_1D <- cbind(cluster_data_1D, spines_not_1D) - cluster_data_1D <- cbind(cluster_data_1D, Cscore_1D) - cluster_data_1D[is.na(cluster_data_1D)] <- 0 - colnames(cluster_data_1D) <- c("# of clusters - 1D", "spines clustered - 1D", "spines not clustered - 1D", "Cscore - 1D") +# cluster_data_1D <- data.frame() +# cluster_data_1D <- rbind(cluster_data_1D, num_clusters_1D) +# cluster_data_1D <- cbind(cluster_data_1D, spines_clustered_1D) +# cluster_data_1D <- cbind(cluster_data_1D, spines_not_1D) +# cluster_data_1D <- cbind(cluster_data_1D, Cscore_1D) +# cluster_data_1D[is.na(cluster_data_1D)] <- 0 +# colnames(cluster_data_1D) <- c("# of clusters - 1D", "spines clustered - 1D", "spines not clustered - 1D", "Cscore - 1D") # 3D clustering analysis @@ -153,12 +152,12 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in cluster_freq_3D$Freq <- as.numeric(cluster_freq_3D$Freq) cluster_freq_3D$is_clustered <- as.numeric(cluster_freq_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1 num_clusters_3D <- sum(cluster_freq_3D$is_clustered) #count how many 1s to determine how many clusters (spines > 1) in the segment - spines_in_cluster_3D <- cluster_freq_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_3D = sum(Freq)) - spines_clustered_3D <- list() - spines_clustered_3D <- rbind(spines_clustered_3D, spines_in_cluster_3D[2,2]) - spines_clustered_3D <- as.matrix(spines_clustered_3D) #conerts num of spines clustered to matrix + spines_clustered_3D <- cluster_freq_3D %>% + filter(is_clustered == 1) %>% + summarise(spines_clustered_3D = sum(Freq)) + null_length <- nrow(spines_clustered_3D) + ifelse(null_length==0, spines_clustered_3D <- 0, NA) spines_clustered_3D <- as.numeric(spines_clustered_3D) #converts to numerical form - spines_not_3D <- as.numeric(total_spines - spines_clustered_3D) test_spines_clustered_all_3D <- data.frame() @@ -166,7 +165,7 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in # 3D random spines for loop - for(j in 1:100){ + for(j in 1:10){ test_data_X <- data.frame(sample(df$X), df$Y, df$Z) # randomize the X's, Y's, and Z's to make a "biologically plausible" dataframe. colnames(test_data_X) <- c( "x", "Y", "Z") test_data_Y <- data.frame(df$X, sample(df$Y), df$Z) @@ -188,15 +187,16 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in num_clusters_test_3D <- sum(cluster_freq_test_3D$is_clustered) num_clusters_test_3D[is.na(num_clusters_test_3D)] <- 0 test_num_clusters_all_3D <- rbind(test_num_clusters_all_3D, num_clusters_test_3D) - spines_in_cluster_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered) %>% summarise(num_clus_spines_test_3D = sum(Freq)) - spines_clustered_test_3D <- spines_in_cluster_test_3D[2,2] + spines_clustered_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered) %>% filter(is_clustered == 1) %>% summarise(spines_clustered_test_3D = sum(Freq)) + spines_not_test_3D <- as.numeric(total_spines - spines_clustered_test_3D) + - spines_not_test_3D <- as.numeric(spines_in_cluster_test_3D[1,2]) spines_clustered_test_3D[is.na(spines_clustered_test_3D)] <- 0 test_spines_clustered_all_3D <- rbind(test_spines_clustered_all_3D, spines_clustered_test_3D) } # end of random spines 3D for-loop + test_spines_clustered_all_3D <- as.numeric(as.matrix(test_spines_clustered_all_3D)) @@ -207,19 +207,12 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in mean_curve_3D <- mean(curve_dnorm_3D) Cscore_3D <- pnorm(spines_clustered_3D, mean_test_3D, std_test_3D) - - - # add 3D data to data frame - cluster_data_3D <- data.frame() - cluster_data_3D <- rbind(cluster_data_3D, num_clusters_3D) - cluster_data_3D <- cbind(cluster_data_3D, spines_clustered_3D) - cluster_data_3D <- cbind(cluster_data_3D, spines_not_3D) - cluster_data_3D <- cbind(cluster_data_3D, Cscore_3D) - cluster_data_3D[is.na(cluster_data_3D)] <- 0 - colnames(cluster_data_3D) <- c("# of clusters-3D", "spines clustered-3D", "spines not clustered-3D", "Cscore-3D") - + df$num_clusters_3D <- num_clusters_3D + df$spines_clustered_3D <- spines_clustered_3D + df$spines_not_3D <- spines_not_3D + df$Cscore_3D <- Cscore_3D df$density_overall <- density_overall # add these data to df df$density_mushroom <- density_mushroom df$density_thin <- density_thin @@ -227,8 +220,8 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in data_all <- rbind(data_all, df) # adds df as next row in the data_all file - data_all <- cbind(data_all, cluster_data_1D) - data_all <- cbind(data_all, cluster_data_3D) +# data_all <- cbind(data_all, cluster_data_1D) +# data_all <- cbind(data_all, cluster_data_3D) } # end of file for-loop From 595d51dfcfcc6c1c2ba1e35a190a1f45f177a60d Mon Sep 17 00:00:00 2001 From: kellenett Date: Mon, 25 Mar 2019 13:20:20 -0500 Subject: [PATCH 07/11] Create 3D clustering.R --- 3D clustering.R | 275 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 3D clustering.R diff --git a/3D clustering.R b/3D clustering.R new file mode 100644 index 0000000..7785ed8 --- /dev/null +++ b/3D clustering.R @@ -0,0 +1,275 @@ +# modeling Yadav et al. 2012 cluster analysis; tailored towards clustering of attachment points +# initially coded by S. Johnson- updates and 3D analysis of spine heads added by K. Nett + + +library(dplyr) # to install these , enter install.packages("package name") into the console in RStudio. replace package name with each package name in quotes, ie install.packages("dplyr") +library(ggplot2) +library(lattice) +library(latticeExtra) +library(cluster) +library(stats) +library(readr) +library(tidyr) +library(bio3d) #install.packages("bio3d", dependencies = TRUE) +library(DECIPHER) +#how to install DECIPHER - a is enter in response to prompt for "all" +#source("https://bioconductor.org/biocLite.R") +#biocLite("DECIPHER") +#a + + +fileChosen <- file.choose() # opens file dialog to open +filePath <- dirname(fileChosen) # gets the directory name that that file is in +setwd(filePath) # sets the working directory. important for saving files later +fileList <- list.files(filePath) # a list of files in the directory to go through +fileList <- grep(".csv", fileList, value = TRUE) # finds only lists .csv files in case there are other types +data_all <- data.frame() # initializes a data.frame that will store all of the data for the experiment + + + + +for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in + fileName <- unlist(strsplit(fileList[i], "[.]"))[1] # removes ".csv" from file name + fileLoc <- file.path(filePath, fileList[i]) # creates a path to each file (differs from FileChosen as the loop runs) + df <- read_csv(fileLoc, col_types = cols()) # reads the csv file, suppresses output information about columns + colnames(df) <- gsub("-", "_", colnames(df)) # R doesn't like dashes in column names, replaces "-" with "_" for all colnames of df + df <- df[complete.cases(df$SOMA_DISTANCE),] # removes cases where there is no soma distance data + df <- df[complete.cases(df$RAYBURST_VOLUME),] # removes cases where there is no data + df <- df[complete.cases(df$MAX_DTS),] # removes cases where there is no data + df$file <- fileName # adds a column with repeated information about which file that spine came from + total_length <- df %>% group_by(SECTION_NUMBER) %>% summarise(section_length=max(SECTION_LENGTH)) %>%ungroup() %>% summarise(total_length=sum(section_length)) %>% as.double() #this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together + total_spines <- as.numeric(nrow(df)) # number of rows = number of spines + density_overall <- total_spines/total_length # calculate density of segment + density_mushroom <- sum(as.numeric(df$TYPE=="mushroom"))/total_length # counts the number of "mushroom" rows and divides by length for mushroom density + density_thin <- sum(as.numeric(df$TYPE=="thin"))/total_length # same as above, with "thin" + density_stubby <- sum(as.numeric(df$TYPE=="stubby"))/total_length # same as above, with "stubby" + df <- df[order(df$SOMA_DISTANCE),] #order data by column SOMA_DISTANCE so that coordinates pulled are in spine attachment order + + data_coords <- data.frame(df$X, df$Y, df$Z) # creates data frame with spine head coordinates + +# Shane's original clustering analysis using spine attachment distance from soma + + agn <- agnes(df$SOMA_DISTANCE,metric = "euclidean", method = "average") # runs the agnes, computes agglomerative hierarchical clustering, "average" = UPGMA + dist_ac <- as.matrix(dist(df$SOMA_DISTANCE)) #creates a distance matrix of soma distances + df$nn_dist_ac <- apply(dist_ac,2, function(x) sort(x)[2]) # finds nearest neighbor for each spine + df$nn2_dist_ac <- apply(dist_ac,2, function(x) sort(x)[3]) # finds second nearest neighbor + df$nn3_dist_ac <- apply(dist_ac,2, function(x) sort(x)[4]) # finds third + dendrite_ac <- agn$ac # defines agglomeration coefficent of the dendrite + + ac_test <- list() #initializes list for ac's from next for loop + possible_dist <- seq(1,total_length, by=0.01) # create list of possible soma distances by 0.01 increments to pull from + + # for(k in 1:5000){ + # test_data <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines + # test_dist <- as.matrix(dist(test_data)) #creates distance matrix for random sample + # assign("test_cluster", agnes(test_data,metric = "euclidean", method = "average")) #runs UPGMA on sample and labels the agn output "test_cluster" + # ac_test <- rbind(ac_test,test_cluster$ac) # add row to ac using the 'test_cluster' ac + # } + +# cScore_ac_1D <- sum(as.numeric(ac_test 1) #returns 1 if there is >1 spine in a cluster and 0 if not-- therefore all 1s reflect a true "cluster" since it has more than 1 spine in ity +# num_clusters_1D <- sum(cluster_freq_1D$is_clustered) #count how many clusters are on this segment + +# spines_clustered_1D <- sum(cluster_freq_1D$Freq>1) + +# spines_clustered_1D <- as.matrix(spines_clustered_1D) #converts # of spines clustered to matrix +# spines_clustered_1D <- as.numeric(spines_clustered_1D) #converts to numerical form +# spines_clustered_1D[is.na(spines_clustered_1D)] <- 0 +# spines_not_1D <- as.numeric(total_spines - spines_clustered_1D) #define and calculate number of spines not clustered (total spines minus number of spines clustered) +# +# test_spines_clustered_all_1D <- data.frame() #initialize dataframe to contain # of spines in a cluster for all random samples +# test_num_clusters_all_1D <- data.frame() + +# 1D random spines for-loop +# for(j in 1:10000){ +# random_spines_1D <- sample(possible_dist, total_spines) # take a sample from all possible locations on dendrite to match total number of spines +# test_dist_1D <- as.matrix(dist(random_spines_1D)) #creates distance matrix for random sample +# UPGMA_test_1D <- IdClusters(test_dist_1D, method="UPGMA", cutoff=0.75, showPlot=FALSE) #runs UPGMA with cutoff (same as with obsv but with random sample) +# UPGMA_test_1D$rn <- rownames(UPGMA_test_1D) #add row name (spine ID) to cluster number +# cluster_all_test_1D <- cbind(UPGMA_test_1D, random_spines_1D) #add cluster number/spine ID to randomly generated soma distances +# cluster_all_test_1D <- cluster_all_test_1D[order(cluster_all_test_1D$cluster),] #order by cluster number +# cluster_freq_test_1D <- table(cluster_all_test_1D$cluster) #generate table with number of spines in each cluster +# cluster_freq_test_1D <- as.data.frame(cluster_freq_test_1D) #change table to data frame +# cluster_freq_test_1D$is_clustered <- as.numeric(cluster_freq_test_1D$Freq > 1) #ask whether cluster has >1 spines in it (1 for yes, 0 for no) +# num_clusters_test_1D <- sum(cluster_freq_test_1D$is_clustered) # add how many 1s (or how many clusters have >1 spines) to get true number of clusters +# num_clusters_test_1D[is.na(num_clusters_test_1D)] <- 0 #changes possible NA from 0 clusters to 0 +# test_num_clusters_all_1D <- rbind(test_num_clusters_all_1D, num_clusters_test_1D) #adds total number of clusters to running list +# spines_clustered_test_1D <- sum(cluster_freq_test_1D$Freq>1) +# + # spines_not_test_1D <- as.numeric(total_spines - spines_clustered_test_1D) # calculate how many spines are not clustered +# spines_clustered_test_1D[is.na(spines_clustered_test_1D)] <- 0 #returns 0 instead of Na if no spines are clustered in the random sample +# test_spines_clustered_all_1D <- rbind(test_spines_clustered_all_1D, spines_clustered_test_1D) #add number of clustered spines to running list +# +# } # end of random spines 1D for-loop + +# test_spines_clustered_all_1D <- as.matrix(test_spines_clustered_all_1D) #changes data.frame to matrix (not sure if this is neccesary but it works) +# test_spines_clustered_all_1D <- as.numeric(test_spines_clustered_all_1D) #changes all vales to numeric (again, not sure if neccessary) +# + +# std_test_1D <- sd(test_spines_clustered_all_1D) #generates STD of total number of spines clustered in entire random sample +# mean_test_1D <- mean(test_spines_clustered_all_1D) #generates average number of spines in a cluster in random data +# +# curve_dnorm_1D <- dnorm(test_spines_clustered_all_1D, mean_test_1D, std_test_1D) #gives probability density function, or height of probability distribution at each point(height = frequency) +# std_curve_1D <- sd(curve_dnorm_1D) +# mean_curve_1D <- mean(curve_dnorm_1D) + +# Cscore_1D <- pnorm(spines_clustered_1D, mean_test_1D, std_test_1D) #calculates the probability that given the random sample distribution (curve mean+SD) the total number of spines observed is higher + #therefore, the closer to 1, higher probability that a given number has more clustered spines than the random normal distribution and vice versa + + # add all 1D data to all data +# cluster_data_1D <- data.frame() +# cluster_data_1D <- rbind(cluster_data_1D, num_clusters_1D) +# cluster_data_1D <- cbind(cluster_data_1D, spines_clustered_1D) +# cluster_data_1D <- cbind(cluster_data_1D, spines_not_1D) +# cluster_data_1D <- cbind(cluster_data_1D, Cscore_1D) +# cluster_data_1D[is.na(cluster_data_1D)] <- 0 +# colnames(cluster_data_1D) <- c("# of clusters - 1D", "spines clustered - 1D", "spines not clustered - 1D", "Cscore - 1D") + + +# 3D clustering analysis + dist_3D <- as.matrix(dist.xyz(data_coords)) # creates distance matrix of spine head coordinates + UPGMA_obsv_3D <- IdClusters(dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper + #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1) + UPGMA_obsv_3D$rn <- rownames(UPGMA_obsv_3D) #adds a column with rownows to keep spine ID, not sure if this step is neccessary + cluster_all_3D <- cbind(UPGMA_obsv_3D, data_coords) #get the coordinates for each spine ID, if kept in row name/number order, will correctly correspond to each spine + cluster_all_3D <- cluster_all_3D[order(cluster_all_3D$cluster),] #sort data frame by cluster number + cluster_freq_3D <- table(cluster_all_3D$cluster) #create a table counting how many times each cluster Variable occurs (i.e. how many spines in each cluster) + cluster_freq_3D <- as.data.frame(cluster_freq_3D) + cluster_freq_3D$Freq <- as.numeric(cluster_freq_3D$Freq) + cluster_freq_3D$is_clustered <- as.numeric(cluster_freq_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1 + num_clusters_3D <- sum(cluster_freq_3D$is_clustered) #count how many 1s to determine how many clusters (spines > 1) in the segment + spines_clustered_3D <- cluster_freq_3D %>% + filter(is_clustered == 1) %>% + summarise(spines_clustered_3D = sum(Freq)) + null_length <- nrow(spines_clustered_3D) + ifelse(null_length==0, spines_clustered_3D <- 0, NA) + spines_clustered_3D <- as.numeric(spines_clustered_3D) #converts to numerical form + spines_not_3D <- as.numeric(total_spines - spines_clustered_3D) + + test_spines_clustered_all_3D <- data.frame() + test_num_clusters_all_3D <- data.frame() + + +# 3D random spines for loop + for(j in 1:10){ + test_data_X <- data.frame(sample(df$X), df$Y, df$Z) # randomize the X's, Y's, and Z's to make a "biologically plausible" dataframe. + colnames(test_data_X) <- c( "x", "Y", "Z") + test_data_Y <- data.frame(df$X, sample(df$Y), df$Z) + colnames(test_data_Y) <- c( "x", "Y", "Z") + test_data_Z <- data.frame(df$X, df$Y, sample(df$Z)) + colnames(test_data_Z) <- c( "x", "Y", "Z") + test_data <- rbind(test_data_X, test_data_Y, test_data_Z) + test_data_final <-data.frame(sample_n(test_data, total_spines)) + test_dist_3D <- as.matrix(dist(test_data_final)) #creates distance matrix for random sample + + UPGMA_test_3D <- IdClusters(test_dist_3D, method = "UPGMA", cutoff=0.75, showPlot=TRUE) #run cluster analysis with cutoff used in Yadav paper + #gives cluster number associated with which spine (i.e. spines 25 and 26 are in cluster 1) + UPGMA_test_3D$rn <- rownames(UPGMA_test_3D) #adds a column with rownows to keep spine ID, not sure if this step is neccessary + cluster_all_test_3D <- cbind(UPGMA_test_3D, test_data_final) #get the coordinates for each spine ID, if kept in row name/number order, will correctly correspond to each spine + cluster_all_test_3D <- cluster_all_test_3D[order(cluster_all_test_3D$cluster),] #sort data frame by cluster number + cluster_freq_test_3D <- table(cluster_all_test_3D$cluster) #create a table counting how many times each cluster Variable occurs (i.e. how many spines in each cluster) + cluster_freq_test_3D <- as.data.frame(cluster_freq_test_3D) + cluster_freq_test_3D$is_clustered <- as.numeric(cluster_freq_test_3D$Freq >1) # create column where 1 means there is more than one spine in a cluster or 0 if just 1 + num_clusters_test_3D <- sum(cluster_freq_test_3D$is_clustered) + num_clusters_test_3D[is.na(num_clusters_test_3D)] <- 0 + test_num_clusters_all_3D <- rbind(test_num_clusters_all_3D, num_clusters_test_3D) + spines_clustered_test_3D <- cluster_freq_test_3D %>% group_by(is_clustered) %>% filter(is_clustered == 1) %>% summarise(spines_clustered_test_3D = sum(Freq)) + spines_not_test_3D <- as.numeric(total_spines - spines_clustered_test_3D) + + + spines_clustered_test_3D[is.na(spines_clustered_test_3D)] <- 0 + test_spines_clustered_all_3D <- rbind(test_spines_clustered_all_3D, spines_clustered_test_3D) + + } # end of random spines 3D for-loop + + + test_spines_clustered_all_3D <- as.numeric(as.matrix(test_spines_clustered_all_3D)) + + + std_test_3D <- sd(test_spines_clustered_all_3D) + mean_test_3D <- mean(test_spines_clustered_all_3D) + curve_dnorm_3D <- dnorm(test_spines_clustered_all_3D, mean_test_3D, std_test_3D) + std_curve_3D <- sd(curve_dnorm_3D) + mean_curve_3D <- mean(curve_dnorm_3D) + + Cscore_3D <- pnorm(spines_clustered_3D, mean_test_3D, std_test_3D) + + + df$num_clusters_3D <- num_clusters_3D + df$spines_clustered_3D <- spines_clustered_3D + df$spines_not_3D <- spines_not_3D + df$Cscore_3D <- Cscore_3D + df$density_overall <- density_overall # add these data to df + df$density_mushroom <- density_mushroom + df$density_thin <- density_thin + df$density_stubby <- density_stubby + data_all <- rbind(data_all, df) # adds df as next row in the data_all file + + +# data_all <- cbind(data_all, cluster_data_1D) +# data_all <- cbind(data_all, cluster_data_3D) + + +} # end of file for-loop + +data_all$animal_num <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[2]) # this pulls out an animal number from the file number +data_all$retro_label <- substring(data_all$animal_num, nchar(data_all$animal_num), nchar(data_all$animal_num)) #pulls off 'L' or 'N' from ID to indicate whether cell was retro-gradely labeled +#data_all$PDB <- substring(data_all$animal_num, nchar(data_all$animal_num), nchar(data_all$animal_num)) +data_all$PDB <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[3]) # pulls dendrite location out of name and adds column +data_all$PDB <- replace(data_all$PDB, data_all$PDB=="p", "prox") +data_all$PDB <- replace(data_all$PDB, data_all$PDB=="d", "dist") +data_all$PDB <- replace(data_all$PDB, data_all$PDB=="b", "basal") +data_all$retro_label <- replace(data_all$retro_label, data_all$retro_label=="L", "labeled") +data_all$retro_label <- replace(data_all$retro_label, data_all$retro_label=="N", "not labeled") +data_all$stack <- lapply(data_all$file, function(x) unlist(strsplit(x, "-"))[4]) #gives letter ID of different imaging days +data_all$stack <- unlist(data_all$stack) +data_all$animal_num <- lapply(data_all$animal_num, function(x) unlist(strsplit(x, "L"))[1]) #removes letter from behind animal ID name +data_all$animal_num <- lapply(data_all$animal_num, function(x) unlist(strsplit(x, "N"))[1]) +data_all$RAYBURST_VOLUME <- as.numeric(data_all$RAYBURST_VOLUME) +data_all$MAX_DTS <- as.numeric(data_all$MAX_DTS) + +n <- readline(prompt="Enter group name:") # gives prompt on console to enter animal group +data_all$group <- n # adds the group to the master data file + +data_all$animal_num <- unlist(data_all$animal_num) #changes animal number to a vector rather than a list, which is important for executing the following task +data_all$PDB <- unlist(data_all$PDB) # same as above + +TYPE_ave <- data_all %>% group_by(group, animal_num, retro_label, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) +PDB_TYPE_ave <- data_all %>% group_by(group, animal_num, stack, retro_label, PDB, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) + +PDB_ave <- data_all %>% group_by(group, animal_num, retro_label, PDB) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) +# ^first groups by treatment group, then by animal, then by day of image (i.e. -F), then by dendrite location +# summarise then finds averages for each group for all various pieces of data + +animal_ave <- data_all %>% group_by(group, animal_num) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) +# groups by treatment then animal, then finds averages per animal for various data + +labeled_ave <- data_all %>% group_by(group, animal_num, retro_label) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) +# groups by treatment then animal, then finds averages per animal for various data + +Cscore3D_avg <- data_all %>% group_by(animal_num, retro_label) %>% summarise(Cscore_3D = mean(Cscore_3D)) + +#dir.create("analysis") #creates a directory to create a file on the computer +#savePath <- paste(filePath,"/", "analysis", collapse = "/", sep="") # creates the path where files can be saved +#setwd(savePath) # sets working directory to the path created above +#write.csv(data_all, "data_all.csv") +#write.csv(PDB_ave, "PDB averages.csv") +#write.csv(animal_ave, "Animal Averages.csv") +#write.csv(TYPE_ave, "spine_type_averages.csv") +#write.csv(PDB_TYPE_ave, "PDB_spine_type_averages.csv") +#write.csv(labeled_ave, "labeled_averages.csv") + From 1e8a84e2ed374be5f1fcb330d5e517b394baa2c9 Mon Sep 17 00:00:00 2001 From: kellenett Date: Mon, 25 Mar 2019 14:49:16 -0500 Subject: [PATCH 08/11] Update 1D clustering Yadav.R --- 1D clustering Yadav.R | 117 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 106 insertions(+), 11 deletions(-) diff --git a/1D clustering Yadav.R b/1D clustering Yadav.R index 780217b..96fa6b2 100644 --- a/1D clustering Yadav.R +++ b/1D clustering Yadav.R @@ -37,7 +37,12 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in df <- df[complete.cases(df$RAYBURST_VOLUME),] # removes cases where there is no data df <- df[complete.cases(df$MAX_DTS),] # removes cases where there is no data df$file <- fileName # adds a column with repeated information about which file that spine came from - total_length <- df %>% group_by(SECTION_NUMBER) %>% summarise(section_length=max(SECTION_LENGTH)) %>%ungroup() %>% summarise(total_length=sum(section_length)) %>% as.double() #this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together + total_length <- df %>% + group_by(SECTION_NUMBER) %>% + summarise(section_length=max(SECTION_LENGTH)) %>% + ungroup() %>% + summarise(total_length=sum(section_length)) %>% + as.double() #this uses the dplyr package to take the data.frame and group it by section. Then it finds the length of each section before adding them all together total_spines <- as.numeric(nrow(df)) # number of rows = number of spines density_overall <- total_spines/total_length # calculate density of segment density_mushroom <- sum(as.numeric(df$TYPE=="mushroom"))/total_length # counts the number of "mushroom" rows and divides by length for mushroom density @@ -49,7 +54,7 @@ for(i in 1:length(fileList)){ ##add back in 'i' when adding for loop back in # Shane's original clustering analysis using spine attachment distance from soma - agn <- agnes(df$SOMA_DISTANCE,metric = "euclidean", method = "average") # runs the agnes, computes agglomerative hierarchical clustering, "average" = UPGMA + # agn <- agnes(df$SOMA_DISTANCE,metric = "euclidean", method = "average") # runs the agnes, computes agglomerative hierarchical clustering, "average" = UPGMA dist_ac <- as.matrix(dist(df$SOMA_DISTANCE)) #creates a distance matrix of soma distances df$nn_dist_ac <- apply(dist_ac,2, function(x) sort(x)[2]) # finds nearest neighbor for each spine df$nn2_dist_ac <- apply(dist_ac,2, function(x) sort(x)[3]) # finds second nearest neighbor @@ -242,24 +247,114 @@ data_all$animal_num <- lapply(data_all$animal_num, function(x) unlist(strsplit(x data_all$RAYBURST_VOLUME <- as.numeric(data_all$RAYBURST_VOLUME) data_all$MAX_DTS <- as.numeric(data_all$MAX_DTS) + + n <- readline(prompt="Enter group name:") # gives prompt on console to enter animal group data_all$group <- n # adds the group to the master data file data_all$animal_num <- unlist(data_all$animal_num) #changes animal number to a vector rather than a list, which is important for executing the following task data_all$PDB <- unlist(data_all$PDB) # same as above -TYPE_ave <- data_all %>% group_by(group, animal_num, retro_label, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) -PDB_TYPE_ave <- data_all %>% group_by(group, animal_num, stack, retro_label, PDB, TYPE) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) +data_all_by_file <- data_all %>% + group_by(file) %>% + summarise(num_clusters_3D = mean(num_clusters_3D), + spines_clustered_3D = mean(spines_clustered_3D), + spines_not_3D = mean(spines_not_3D), + Cscore_3D = mean(Cscore_3D), + density_overall = mean(density_overall), + density_mushroom = mean(density_mushroom), + density_thin = mean(density_thin), + density_stubby = mean(density_stubby)) + + +data_all_by_file$animal_num <- lapply(data_all_by_file$file, function(x) unlist(strsplit(x, "-"))[2]) # this pulls out an animal number from the file number +data_all_by_file$retro_label <- substring(data_all_by_file$animal_num, nchar(data_all_by_file$animal_num), nchar(data_all_by_file$animal_num)) #pulls off 'L' or 'N' from ID to indicate whether cell was retro-gradely labeled +data_all_by_file$PDB <- lapply(data_all_by_file$file, function(x) unlist(strsplit(x, "-"))[3]) # pulls dendrite location out of name and adds column +data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="b", "basal") +data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="p", "prox") +data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="d", "dist") +data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="b", "basal") +data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="p", "prox") +data_all_by_file$PDB <- replace(data_all_by_file$PDB, data_all_by_file$PDB=="d", "dist") +data_all_by_file$retro_label <- replace(data_all_by_file$retro_label, data_all_by_file$retro_label=="L", "labeled") +data_all_by_file$retro_label <- replace(data_all_by_file$retro_label, data_all_by_file$retro_label=="N", "not labeled") +data_all_by_file$stack <- lapply(data_all_by_file$file, function(x) unlist(strsplit(x, "-"))[4]) #gives letter ID of different imaging days +data_all_by_file$stack <- unlist(data_all_by_file$stack) +data_all_by_file$animal_num <- lapply(data_all_by_file$animal_num, function(x) unlist(strsplit(x, "L"))[1]) #removes letter from behind animal ID name +data_all_by_file$animal_num <- lapply(data_all_by_file$animal_num, function(x) unlist(strsplit(x, "N"))[1]) +data_all_by_file$group <- n +data_all_by_file$animal_num <- unlist(data_all_by_file$animal_num) #changes animal number to a vector rather than a list, which is important for executing the following task +data_all_by_file$PDB <- unlist(data_all_by_file$PDB) + + +#averages from data_all_by_file (when data is repeated by row for an individual file) +location_avg_file <- data_all_by_file %>% + group_by(group, animal_num, retro_label, PDB) %>% + summarise(num_clusters_3D = mean(num_clusters_3D), + spines_clustered_3D = mean(spines_clustered_3D), + spines_not_3D = mean(spines_not_3D), + Cscore_3D = mean(Cscore_3D), + density_overall = mean(density_overall), + density_mushroom = mean(density_mushroom), + density_thin = mean(density_thin), + density_stubby = mean(density_stubby)) + +animal_labeled_avg_file <- data_all_by_file %>% + group_by(group, animal_num, retro_label) %>% + summarise(num_clusters_3D = mean(num_clusters_3D), + spines_clustered_3D = mean(spines_clustered_3D), + spines_not_3D = mean(spines_not_3D), + Cscore_3D = mean(Cscore_3D), + density_overall = mean(density_overall), + density_mushroom = mean(density_mushroom), + density_thin = mean(density_thin), + density_stubby = mean(density_stubby)) + +#averages from data_all (when each spine/row has a different data point) +spine_type_avg <- data_all %>% + group_by(group, animal_num, retro_label, TYPE) %>% + summarise(nn_1D = mean(nn_dist_1D), + nn2_1D = mean(nn2_dist_1D), + nn3_1D = mean(nn3_dist_1D), + nn_3D = mean(nn_dist_3D), + nn2_3D = mean(nn2_dist_3D), + nn3_3D = mean(nn3_dist_3D), + spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), + spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) + +spine_type_by_location_avg <- data_all %>% + group_by(group, animal_num, stack, retro_label, PDB, TYPE) %>% + summarise(nn_1D = mean(nn_dist_1D), + nn2_1D = mean(nn2_dist_1D), + nn3_1D = mean(nn3_dist_1D), + nn_3D = mean(nn_dist_3D), + nn2_3D = mean(nn2_dist_3D), + nn3_3D = mean(nn3_dist_3D), + spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), + spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) -PDB_ave <- data_all %>% group_by(group, animal_num, retro_label, PDB) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) -# ^first groups by treatment group, then by animal, then by day of image (i.e. -F), then by dendrite location -# summarise then finds averages for each group for all various pieces of data +location_avg <- data_all %>% + group_by(group, animal_num, retro_label, PDB) %>% + summarise(nn_1D = mean(nn_dist_1D), + nn2_1D = mean(nn2_dist_1D), + nn3_1D = mean(nn3_dist_1D), + nn_3D = mean(nn_dist_3D), + nn2_3D = mean(nn2_dist_3D), + nn3_3D = mean(nn3_dist_3D), + spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), + spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) -animal_ave <- data_all %>% group_by(group, animal_num) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) -# groups by treatment then animal, then finds averages per animal for various data +animal_labeled_avg <- data_all %>% + group_by(group, animal_num, retro_label) %>% + summarise(nn_1D = mean(nn_dist_1D), + nn2_1D = mean(nn2_dist_1D), + nn3_1D = mean(nn3_dist_1D), + nn_3D = mean(nn_dist_3D), + nn2_3D = mean(nn2_dist_3D), + nn3_3D = mean(nn3_dist_3D), + spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), + spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) -labeled_ave <- data_all %>% group_by(group, animal_num, retro_label) %>% summarise(Cscore_1D = mean(c_score_1D), nn_1D = mean(nn_dist_1D), nn2_1D = mean(nn2_dist_1D), nn3_1D = mean(nn3_dist_1D), Cscore_3D = mean(c_score_3D), nn_3D = mean(nn_dist_3D), nn2_3D = mean(nn2_dist_3D), nn3_3D = mean(nn3_dist_3D), density_overall = mean(density_overall), density_mushroom = mean(density_mushroom), density_thin = mean(density_thin), density_stubby = mean(density_stubby), spine_vol_overall = mean(RAYBURST_VOLUME, na.rm = TRUE), spine_length_overall = mean(MAX_DTS, na.rm = TRUE)) -# groups by treatment then animal, then finds averages per animal for various data From 7cc875da3cc7635ba3e691b28dd38e9c85b0e252 Mon Sep 17 00:00:00 2001 From: kellnett <44405714+kellnett@users.noreply.github.com> Date: Mon, 28 Oct 2019 15:06:47 -0500 Subject: [PATCH 09/11] Add files via upload --- folder_loop.R | 683 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 683 insertions(+) create mode 100644 folder_loop.R diff --git a/folder_loop.R b/folder_loop.R new file mode 100644 index 0000000..c300763 --- /dev/null +++ b/folder_loop.R @@ -0,0 +1,683 @@ +#loop attempt with raw_data_cleanup.R + +library(dplyr) +library(ggplot2) +library(lattice) +library(latticeExtra) +library(cluster) +library(stats) +library(readr) +library(tidyr) +library(bio3d) +library(DECIPHER) +library(svDialogs) +library(rowr) +library(tcltk) + + +parent.folder <- tk_choose.dir(default = "", caption = "select folder") +sub.folders <- list.dirs(parent.folder, recursive = TRUE)[-1] +setwd(parent.folder) +saveFolder <- dirname(parent.folder) + +for(i in 1:length(sub.folders)) { +setwd(parent.folder) + fileChosen <- sub.folders[i] + + + + +#for(i in 1:length(sub.folders)) { +#fileChosen <- sub.folders[i] + + +# Initialize Code: Select file(s) and define group + +#fileChosen <- file.choose() # opens a folder dialog box to select first file in folder containing data from 1 treatment group +filePath <- dirname(fileChosen) # gets the directory name that that file is in +setwd(filePath) # sets the working directory. important for saving files later +fileList <- list.files(fileChosen) # list the files in the directory to go through +fileList <- grep(".csv", fileList, value = TRUE) # finds only lists .csv files in case there are other types +data_all <- data.frame() # initializes data frame to store data throughout the for-loop +#rat_ID <- dlgInput("Enter Rat ID #", Sys.info()["user"])$res # prompt given in console to enter the animal treatment group (i.e. Sal-D0) + + + +# File list for-loop: Apply code to each file in fileList + +for(i in 1:length(fileList)) { # peform code inside {} for each .csv file in the file list + fileName <- unlist(strsplit(fileList[i], "[.]"))[1] # remove '.csv' from the file name + fileLoc <- file.path(fileChosen, fileList[i]) # create path to file (will differ from initial FileChosen as loop progresses) + df <- read_csv(fileLoc, col_types = cols()) # read the csv file, uses column names from NeuronStudio file + colnames(df) <- gsub("-", "_", colnames(df)) # replaces dashes ('-') w/ underscores ('_') for column names (R doesn't like dashes) + df <- df[complete.cases(df$SOMA_DISTANCE) ,] # removes cases where there is no soma distance data + df <- df[complete.cases(df$RAYBURST_VOLUME), ] # " " spine volume data + df <- df[complete.cases(df$MAX_DTS), ] # " " length data + df$file <- fileName # add a column identifying the file name for each spine (row) + ## Current NeuronStudio data should only have 1 section/dendrite; however, it is possible + ## to have multiple if they are not linked together. The next 6 lines calculate total + ## length when there is more than 1 section + + total_length <- df %>% # dplyr package: define total length of dendritic segment + group_by(SECTION_NUMBER) %>% # group data frame by section number + summarise(section_length = max(SECTION_LENGTH)) %>% # find the length of each section + ungroup() %>% # ungroup previous grouping by section number + summarise(total_length = sum(section_length)) %>% # sum section length of each section + as.double() # allows for 64 bit storage (increase precision with more significant digits) + + + df$RAYBURST_VOLUME[df$RAYBURST_VOLUME == 0] <- NA #if no data, make NA + df$HEAD_DIAMETER[df$HEAD_DIAMETER == 0] <- NA + df$MAX_DTS[df$MAX_DTS == 0] <- NA + + df$HEAD_DIAMETER[df$HEAD_DIAMETER > 1.5] <- NA #mushroom HD cutoff + df$MAX_DTS[df$MAX_DTS > 3.00] <- NA #mushroom/thin length cutoff + df$MAX_DTS[df$TYPE == "stubby" & df$MAX_DTS > 0.80] <- NA #stubby length cutoff + df$HEAD_DIAMETER[df$TYPE == "stubby" & df$HEAD_DIAMETER > 0.97] <- NA #stubby HD cutoff + df$HEAD_DIAMETER[df$TYPE == "thin" & df$HEAD_DIAMETER > 1.22] <- NA #thin HD cutoff + + df$RAYBURST_VOLUME[df$TYPE == "mushroom" & df$RAYBURST_VOLUME > 0.60] <- NA #cut-offs from 2SD about mean in Harris et al. 1992 paper, pyramidal CA1 neurons + df$RAYBURST_VOLUME[df$TYPE == "thin" & df$RAYBURST_VOLUME > 0.10] <- NA + df$RAYBURST_VOLUME[df$TYPE == "stubby" & df$RAYBURST_VOLUME > 0.05] <- NA + + total_spines <- count(df$ID[!is.na(df$ID)]) # number of rows = number of spines + density_overall <- total_spines/total_length # calculate density of dendritic segment + density_mushroom <- sum(as.numeric(df$TYPE == "mushroom"), # find total number of mushroom spines and + na.rm = TRUE)/total_length # divide by total length to find mushroom density + density_thin <- sum(as.numeric(df$TYPE == "thin"), # as above for thin spines + na.rm = TRUE)/total_length # "" + density_stubby <- sum(as.numeric(df$TYPE == "stubby"), # as above for stubby spines + na.rm = TRUE)/total_length # "" + + + #add data to running list in data_master file + df$density_overall <- density_overall # add these data to df + df$density_mushroom <- density_mushroom + df$density_thin <- density_thin + df$density_stubby <- density_stubby + df$animal_num <- lapply(df$file, function(x) unlist(strsplit(x, "-"))[2]) # pulls animal ID out of file name, with retro label attached + df$retro_label <- substring(df$animal_num, nchar(df$animal_num), nchar(df$animal_num)) # adds column reported retro-label + df$retro_label <- replace(df$retro_label, df$retro_label == "L", 1) # adds labeled for L and not labeled for N + df$retro_label <- replace(df$retro_label, df$retro_label == "N", 0) + df$retro_label <- as.numeric(df$retro_label) + df$location <- lapply(df$file, function(x) unlist(strsplit(x, "-"))[3]) + df$location <- as.character(df$location) + df$animal_num <- lapply(df$animal_num, function(x) unlist(strsplit(x, "L"))[1]) # removes L or N from animal number + df$animal_num <- lapply(df$animal_num, function(x) unlist(strsplit(x, "N"))[1]) # removes L or N from animal number + df$animal_num <- as.numeric(df$animal_num) + + + + df$density_overall[df$density_overall > 4.00] <- NA + df$density_mushroom[df$density_mushroom > 1.00] <- NA + df$density_thin[df$density_thin > 4.00] <- NA + df$density_stubby[df$density_stubby > 1.00] <- NA + + + + + + #df[!complete.cases(df),] <- NA + + data_all <- rbind(data_all, df) # adds df as next row in the data_master file + +} # end of file for-loop + + +rat_ID <- data_all$animal_num[1] +data_all$MAX_DTS<- as.numeric(data_all$MAX_DTS) +data_all$RAYBURST_VOLUME <- as.numeric(data_all$RAYBURST_VOLUME) +data_all$HEAD_DIAMETER <- as.numeric(data_all$HEAD_DIAMETER) +#A_A + +data_mast_A_A <- data_all %>% + group_by(file) %>% + summarise(den_ov_A_A = mean(density_overall, na.rm = TRUE), + den_mush_A_A = mean(density_mushroom, na.rm = TRUE), + den_thin_A_A = mean(density_thin, na.rm = TRUE), + den_stub_A_A = mean(density_stubby, na.rm = TRUE), + vol_ov_A_A = mean(RAYBURST_VOLUME, na.rm = TRUE), + len_ov_A_A = mean(MAX_DTS, na.rm = TRUE), + hd_ov_A_A = mean(HEAD_DIAMETER, na.rm = TRUE)) %>% + ungroup() + +data_mast_A_A <- data_mast_A_A %>% + summarise(den_ov = mean(den_ov_A_A, na.rm = TRUE), + den_mush = mean(den_mush_A_A, na.rm = TRUE), + den_thin = mean(den_thin_A_A, na.rm = TRUE), + den_stub = mean(den_stub_A_A, na.rm = TRUE), + vol_ov = mean(vol_ov_A_A, na.rm = TRUE), + len_ov = mean(len_ov_A_A, na.rm = TRUE), + hd_ov = mean(hd_ov_A_A, na.rm = TRUE)) %>% + ungroup() + +data_TYPE_A_A <- data_all %>% + group_by(file, TYPE) %>% + summarise(vol_TYPE_A_A = mean(RAYBURST_VOLUME, na.rm = TRUE), + len_TYPE_A_A = mean(MAX_DTS, na.rm = TRUE), + hd_TYPE_A_A = mean(HEAD_DIAMETER, na.rm = TRUE)) %>% + ungroup() + +data_TYPE_A_A <- data_TYPE_A_A %>% + group_by(TYPE) %>% + summarise(vol_TYPE = mean(vol_TYPE_A_A, na.rm = TRUE), + len_TYPE = mean(len_TYPE_A_A, na.rm = TRUE), + hd_TYPE = mean(hd_TYPE_A_A, na.rm = TRUE)) %>% + ungroup() + +data_mush_A_A <- subset(data_TYPE_A_A, data_TYPE_A_A$TYPE == 'mushroom', select = 2:4) +data_thin_A_A <- subset(data_TYPE_A_A, data_TYPE_A_A$TYPE == 'thin', select = 2:4) +data_stub_A_A <- subset(data_TYPE_A_A, data_TYPE_A_A$TYPE == 'stubby', select = 2:4) + +label_location <- c('A_A') + +data_mast_A_A <- cbind.fill(label_location, + data_mast_A_A, + data_mush_A_A, + data_thin_A_A, + data_stub_A_A, + fill = 0) + + +#N_A + +data_mast_NL_A <- data_all %>% + group_by(file, retro_label)%>% + summarise(den_ov_NL_A = mean(density_overall, na.rm = TRUE), + den_mush_NL_A = mean(density_mushroom, na.rm = TRUE), + den_thin_NL_A = mean(density_thin, na.rm = TRUE), + den_stub_NL_A = mean(density_stubby, na.rm = TRUE), + vol_ov_NL_A = mean(RAYBURST_VOLUME, na.rm = TRUE), + len_ov_NL_A = mean(MAX_DTS, na.rm = TRUE), + hd_ov_NL_A = mean(HEAD_DIAMETER, na.rm = TRUE)) %>% + ungroup() + +data_mast_NL_A <- data_mast_NL_A %>% + group_by(retro_label) %>% + summarise(den_ov = mean(den_ov_NL_A, na.rm = TRUE), + den_mush = mean(den_mush_NL_A, na.rm = TRUE), + den_thin = mean(den_thin_NL_A, na.rm = TRUE), + den_stub = mean(den_stub_NL_A, na.rm = TRUE), + vol_ov = mean(vol_ov_NL_A, na.rm = TRUE), + len_ov = mean(len_ov_NL_A, na.rm = TRUE), + hd_ov = mean(hd_ov_NL_A, na.rm = TRUE)) %>% + ungroup() + +data_mast_N_A <- subset(data_mast_NL_A, data_mast_NL_A$retro_label == 0, select = 2:8) + +data_TYPE_NL_A <- data_all %>% + group_by(file, retro_label, TYPE) %>% + summarise(vol_TYPE_NL_A = mean(RAYBURST_VOLUME, na.rm = TRUE), + len_TYPE_NL_A = mean(MAX_DTS, na.rm = TRUE), + hd_TYPE_NL_A = mean(HEAD_DIAMETER, na.rm = TRUE)) %>% + ungroup() + +data_TYPE_NL_A <- data_TYPE_NL_A %>% + group_by(retro_label, TYPE) %>% + summarise(vol_TYPE = mean(vol_TYPE_NL_A, na.rm = TRUE), + len_TYPE = mean(len_TYPE_NL_A, na.rm = TRUE), + hd_TYPE = mean(hd_TYPE_NL_A, na.rm = TRUE)) %>% + ungroup() + + +data_mush_N_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 0 & data_TYPE_NL_A$TYPE == 'mushroom', select= 3:5) +data_thin_N_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 0 & data_TYPE_NL_A$TYPE == 'thin', select= 3:5) +data_stub_N_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 0 & data_TYPE_NL_A$TYPE == 'stubby', select= 3:5) + +label_location <- c('N_A') + +data_mast_N_A <- cbind.fill(label_location, + data_mast_N_A, + data_mush_N_A, + data_thin_N_A, + data_stub_N_A, + fill = 0) + + +#L_A + +data_mast_L_A <- subset(data_mast_NL_A, data_mast_NL_A$retro_label == 1, select = 2:8) +data_mush_L_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 1 & data_TYPE_NL_A$TYPE == 'mushroom', select = 3:5) +data_thin_L_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 1 & data_TYPE_NL_A$TYPE == 'thin', select = 3:5) +data_stub_L_A <- subset(data_TYPE_NL_A, data_TYPE_NL_A$retro_label == 1 & data_TYPE_NL_A$TYPE == 'stubby', select = 3:5) + + +label_location <- c('L_A') + +data_mast_L_A <- cbind.fill(label_location, + data_mast_L_A, + data_mush_L_A, + data_thin_L_A, + data_stub_L_A, + fill = 0) + +# for A_b + +data_mast_A_bpd <- data_all %>% + group_by(file, location) %>% + summarise(den_ov_A_b = mean(density_overall, na.rm = TRUE), + den_mush_A_b = mean(density_mushroom, na.rm = TRUE), + den_thin_A_b = mean(density_thin, na.rm = TRUE), + den_stub_A_b = mean(density_stubby, na.rm = TRUE), + vol_ov_A_b = mean(RAYBURST_VOLUME, na.rm = TRUE), + len_ov_A_b = mean(MAX_DTS, na.rm = TRUE), + hd_ov_A_b = mean(HEAD_DIAMETER, na.rm = TRUE)) %>% + ungroup() + +data_mast_A_bpd <- data_mast_A_bpd %>% + group_by(location) %>% + summarise(den_ov = mean(den_ov_A_b, na.rm = TRUE), + den_mush = mean(den_mush_A_b, na.rm = TRUE), + den_thin = mean(den_thin_A_b, na.rm = TRUE), + den_stub = mean(den_stub_A_b, na.rm = TRUE), + vol_ov = mean(vol_ov_A_b, na.rm = TRUE), + len_ov = mean(len_ov_A_b, na.rm = TRUE), + hd_ov = mean(hd_ov_A_b, na.rm = TRUE)) %>% + ungroup() + +data_mast_A_b <- subset(data_mast_A_bpd, data_mast_A_bpd$location == 'b', select = 2:8) + + +data_TYPE_A_bpd <- data_all %>% + group_by(file, location, TYPE) %>% + summarise(vol_TYPE_A_b = mean(RAYBURST_VOLUME, na.rm = TRUE), + len_TYPE_A_b = mean(MAX_DTS, na.rm = TRUE), + hd_TYPE_A_b = mean(HEAD_DIAMETER, na.rm = TRUE)) %>% + ungroup() + +data_TYPE_A_bpd <- data_TYPE_A_bpd %>% + group_by(location, TYPE) %>% + summarise(vol_TYPE = mean(vol_TYPE_A_b, na.rm = TRUE), + len_TYPE = mean(len_TYPE_A_b, na.rm = TRUE), + hd_TYPE = mean(hd_TYPE_A_b, na.rm = TRUE)) %>% + ungroup() + +data_mush_A_b <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'b' & data_TYPE_A_bpd$TYPE == 'mushroom', select = 3:5) +data_thin_A_b <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'b' & data_TYPE_A_bpd$TYPE == 'thin', select = 3:5) +data_stub_A_b <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'b' & data_TYPE_A_bpd$TYPE == 'stubby', select = 3:5) + +label_location <- c('A_b') + +data_mast_A_b <- cbind.fill(label_location, + data_mast_A_b, + data_mush_A_b, + data_thin_A_b, + data_stub_A_b, + fill = 0) + +#A_p + +data_mast_A_p <- subset(data_mast_A_bpd, data_mast_A_bpd$location == 'p', select = 2:8) +data_mush_A_p <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'p' & data_TYPE_A_bpd$TYPE == 'mushroom', select = 3:5) +data_thin_A_p <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'p' & data_TYPE_A_bpd$TYPE == 'thin', select = 3:5) +data_stub_A_p <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'p' & data_TYPE_A_bpd$TYPE == 'stubby', select = 3:5) + +label_location <- c('A_p') + +data_mast_A_p <- cbind.fill(label_location, + data_mast_A_p, + data_mush_A_p, + data_thin_A_p, + data_stub_A_p, + fill = 0) + +#A_d + +data_mast_A_d <- subset(data_mast_A_bpd, data_mast_A_bpd$location == 'd', select = 2:8) +data_mush_A_d <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'd' & data_TYPE_A_bpd$TYPE == 'mushroom', select = 3:5) +data_thin_A_d <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'd' & data_TYPE_A_bpd$TYPE == 'thin', select = 3:5) +data_stub_A_d <- subset(data_TYPE_A_bpd, data_TYPE_A_bpd$location == 'd' & data_TYPE_A_bpd$TYPE == 'stubby', select = 3:5) + +label_location <- c('A_d') + +data_mast_A_d <- cbind.fill(label_location, + data_mast_A_d, + data_mush_A_d, + data_thin_A_d, + data_stub_A_d, + fill = 0) + +#for N_b + +data_mast_NL_bpd <- data_all %>% + group_by(file, retro_label, location) %>% + summarise(den_ov_NL_bpd = mean(density_overall, na.rm = TRUE), + den_mush_NL_bpd = mean(density_mushroom, na.rm = TRUE), + den_thin_NL_bpd = mean(density_thin, na.rm = TRUE), + den_stub_NL_bpd = mean(density_stubby, na.rm = TRUE), + vol_ov_NL_bpd = mean(RAYBURST_VOLUME, na.rm = TRUE), + len_ov_NL_bpd = mean(MAX_DTS, na.rm = TRUE), + hd_ov_NL_bpd = mean(HEAD_DIAMETER, na.rm = TRUE)) %>% + ungroup() + +data_count_master <- data_mast_NL_bpd[,1:3] + +count_N_b <- data.frame(nrow(subset(data_count_master, retro_label == 0 & location =='b'))) +count_N_p <- data.frame(nrow(subset(data_count_master, retro_label == 0 & location =='p'))) +count_N_d <- data.frame(nrow(subset(data_count_master, retro_label == 0 & location =='d'))) +count_N_A <- sum(count_N_b, count_N_p, count_N_d) + +count_L_b <- data.frame(nrow(subset(data_count_master, retro_label == 1 & location =='b'))) +count_L_p <- data.frame(nrow(subset(data_count_master, retro_label == 1 & location =='p'))) +count_L_d <- data.frame(nrow(subset(data_count_master, retro_label == 1 & location =='d'))) +count_L_A <- sum(count_L_b, count_L_p, count_L_d) + +count_A_A <- sum(count_N_A, count_L_A) +count_A_b <- sum(count_N_b, count_L_b) +count_A_p <- sum(count_N_p, count_L_p) +count_A_d <- sum(count_N_d, count_L_d) + + + +data_count_master <- cbind.fill(count_A_A, + count_A_b, + count_A_p, + count_A_d, + count_N_A, + count_N_b, + count_N_p, + count_N_d, + count_L_A, + count_L_b, + count_L_p, + count_L_d, + fill = 0) + +data_count_master <- t(data_count_master) +row.names(data_count_master) <- NULL + + +data_mast_NL_bpd <- data_mast_NL_bpd %>% + group_by(retro_label, location) %>% + summarise(den_ov = mean(den_ov_NL_bpd, na.rm = TRUE), + den_mush = mean(den_mush_NL_bpd, na.rm = TRUE), + den_thin = mean(den_thin_NL_bpd, na.rm = TRUE), + den_stub = mean(den_stub_NL_bpd, na.rm = TRUE), + vol_ov = mean(vol_ov_NL_bpd, na.rm = TRUE), + len_ov = mean(len_ov_NL_bpd, na.rm = TRUE), + hd_ov = mean(hd_ov_NL_bpd, na.rm = TRUE)) %>% + ungroup() + +data_mast_N_b <- subset(data_mast_NL_bpd, data_mast_NL_bpd$retro_label == 0 & data_mast_NL_bpd$location == 'b', select = 3:9) + +data_TYPE_NL_bpd <- data_all %>% + group_by(file, retro_label, location, TYPE) %>% + summarise(vol_TYPE_NL_bpd = mean(RAYBURST_VOLUME, na.rm = TRUE), + len_TYPE_NL_bpd = mean(MAX_DTS, na.rm = TRUE), + hd_TYPE_NL_bpd = mean(HEAD_DIAMETER, na.rm = TRUE)) %>% + ungroup() + +data_TYPE_NL_bpd <- data_TYPE_NL_bpd %>% + group_by(retro_label, location, TYPE) %>% + summarise(vol_TYPE = mean(vol_TYPE_NL_bpd, na.rm = TRUE), + len_TYPE = mean(len_TYPE_NL_bpd, na.rm = TRUE), + hd_TYPE = mean(hd_TYPE_NL_bpd, na.rm = TRUE)) %>% + ungroup() + +data_mush_N_b <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 0 & + data_TYPE_NL_bpd$location == 'b' & + data_TYPE_NL_bpd$TYPE == 'mushroom', + select = 4:6) + +data_thin_N_b <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 0 & + data_TYPE_NL_bpd$location == 'b' & + data_TYPE_NL_bpd$TYPE == 'thin', + select = 4:6) + +data_stub_N_b <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 0 & + data_TYPE_NL_bpd$location == 'b' & + data_TYPE_NL_bpd$TYPE == 'stubby', + select = 4:6) + + +label_location <- c('N_b') + +data_mast_N_b <- cbind.fill(label_location, + data_mast_N_b, + data_mush_N_b, + data_thin_N_b, + data_stub_N_b, + fill = 0) + +#N_p + +data_mast_N_p <- subset(data_mast_NL_bpd, data_mast_NL_bpd$retro_label == 0 & data_mast_NL_bpd$location == 'p', select = 3:9) + + +data_mush_N_p <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 0 & + data_TYPE_NL_bpd$location == 'p' & + data_TYPE_NL_bpd$TYPE == 'mushroom', + select = 4:6) + + +data_thin_N_p <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 0 & + data_TYPE_NL_bpd$location == 'p' & + data_TYPE_NL_bpd$TYPE == 'thin', + select = 4:6) + + +data_stub_N_p <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 0 & + data_TYPE_NL_bpd$location == 'p' & + data_TYPE_NL_bpd$TYPE == 'stubby', + select = 4:6) + + +label_location <- c('N_p') + +data_mast_N_p <- cbind.fill(label_location, + data_mast_N_p, + data_mush_N_p, + data_thin_N_p, + data_stub_N_p, + fill = 0) + +#N_d + +data_mast_N_d <- subset(data_mast_NL_bpd, + data_mast_NL_bpd$retro_label == 0 & + data_mast_NL_bpd$location == 'd', + select = 3:9) + +data_mush_N_d <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 0 & + data_TYPE_NL_bpd$location == 'd' & + data_TYPE_NL_bpd$TYPE == 'mushroom', + select = 4:6) + +data_thin_N_d <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 0 & + data_TYPE_NL_bpd$location == 'd' & + data_TYPE_NL_bpd$TYPE == 'thin', + select = 4:6) + +data_stub_N_d <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 0 & + data_TYPE_NL_bpd$location == 'd' & + data_TYPE_NL_bpd$TYPE == 'stubby', + select = 4:6) + +label_location <- c('N_d') + +data_mast_N_d <- cbind.fill(label_location, + data_mast_N_d, + data_mush_N_d, + data_thin_N_d, + data_stub_N_d, + fill = 0) + +#L_b + +data_mast_L_b <- subset(data_mast_NL_bpd, + data_mast_NL_bpd$retro_label == 1 & + data_mast_NL_bpd$location == 'b', + select = 3:9) + + +data_mush_L_b <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 1 & + data_TYPE_NL_bpd$location == 'b' & + data_TYPE_NL_bpd$TYPE == 'mushroom', + select = 4:6) + + +data_thin_L_b <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 1 & + data_TYPE_NL_bpd$location == 'b' & + data_TYPE_NL_bpd$TYPE == 'thin', + select = 4:6) + + +data_stub_L_b <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 1 & + data_TYPE_NL_bpd$location == 'b' & + data_TYPE_NL_bpd$TYPE == 'stubby', + select = 4:6) + + +label_location <- c('L_b') +data_mast_L_b <- cbind.fill(label_location, + data_mast_L_b, + data_mush_L_b, + data_thin_L_b, + data_stub_L_b, + fill = 0) + +#L_p + +data_mast_L_p <- subset(data_mast_NL_bpd, + data_mast_NL_bpd$retro_label == 1 & + data_mast_NL_bpd$location == 'p', + select = 3:9) + + +data_mush_L_p <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 1 & + data_TYPE_NL_bpd$location == 'p' & + data_TYPE_NL_bpd$TYPE == 'mushroom', + select = 4:6) + + +data_thin_L_p <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 1 & + data_TYPE_NL_bpd$location == 'p' & + data_TYPE_NL_bpd$TYPE == 'thin', + select = 4:6) + + +data_stub_L_p <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 1 & + data_TYPE_NL_bpd$location == 'p' & + data_TYPE_NL_bpd$TYPE == 'stubby', + select = 4:6) + + + +label_location <- c('L_p') + +data_mast_L_p <- cbind.fill(label_location, + data_mast_L_p, + data_mush_L_p, + data_thin_L_p, + data_stub_L_p, + fill = 0) + +#L_d + +data_mast_L_d <- subset(data_mast_NL_bpd, + data_mast_NL_bpd$retro_label == 1 & + data_mast_NL_bpd$location == 'd', + select = 3:9) + + +data_mush_L_d <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 1 & + data_TYPE_NL_bpd$location == 'd' & + data_TYPE_NL_bpd$TYPE == 'mushroom', + select = 4:6) + + +data_thin_L_d <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 1 & + data_TYPE_NL_bpd$location == 'd' & + data_TYPE_NL_bpd$TYPE == 'thin', + select = 4:6) + + +data_stub_L_d <- subset(data_TYPE_NL_bpd, + data_TYPE_NL_bpd$retro_label == 1 & + data_TYPE_NL_bpd$location == 'd' & + data_TYPE_NL_bpd$TYPE == 'stubby', + select = 4:6) + + +label_location <- c('L_d') + +data_mast_L_d <- cbind.fill(label_location, + data_mast_L_d, + data_mush_L_d, + data_thin_L_d, + data_stub_L_d, + fill = 0) + + + + +data_master <- rbind(data_mast_A_A, + data_mast_A_b, + data_mast_A_p, + data_mast_A_d, + data_mast_N_A, + data_mast_N_b, + data_mast_N_p, + data_mast_N_d, + data_mast_L_A, + data_mast_L_b, + data_mast_L_p, + data_mast_L_d) + +data_master <- cbind.fill (rat_ID, + data_count_master, + data_master) + +colnames(data_master) <- c('rat_ID', + 'count', + 'group', + 'den_ov', + 'den_mush', + 'den_thin', + 'den_stub', + 'vol_ov', + 'len_ov', + 'hd_ov', + 'vol_mush', + 'len_mush', + 'hd_mush', + 'vol_thin', + 'len_thin', + 'hd_thin', + 'vol_stub', + 'len_stub', + 'hd_stub') + + + +savePath_all <- paste0(saveFolder, "/data_all") # creates the path where files can be saved +setwd(savePath_all) +write.csv(data_all, paste0(rat_ID, "_data_all.csv")) +#setwd(fileLoc) # sets working directory to the path created above +#dir.create(savePath, "data_all") #create a directory to create a file inside +setwd(saveFolder) +savePath_master <- paste0(saveFolder, "/data_master") +setwd(savePath_master) +write.csv(data_master, paste0(rat_ID, "_data_master.csv")) + +} + + +winDialog("ok", "Code complete") \ No newline at end of file From dfb8fe67773358b3a15949ecb62a473da4d29d5c Mon Sep 17 00:00:00 2001 From: kellnett <44405714+kellnett@users.noreply.github.com> Date: Tue, 29 Oct 2019 10:32:32 -0500 Subject: [PATCH 10/11] Create 3D-update --- 3D-update | 1 + 1 file changed, 1 insertion(+) create mode 100644 3D-update diff --git a/3D-update b/3D-update new file mode 100644 index 0000000..038d718 --- /dev/null +++ b/3D-update @@ -0,0 +1 @@ +testing From 839b277a1446acd1d0b9643dcd5f77d9253339bc Mon Sep 17 00:00:00 2001 From: kellnett <44405714+kellnett@users.noreply.github.com> Date: Tue, 29 Oct 2019 10:34:42 -0500 Subject: [PATCH 11/11] updated_loop current code I'm using I worked on it separately outside of GitHub so a lot is different --- folder_loop.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/folder_loop.R b/folder_loop.R index c300763..77f8ac4 100644 --- a/folder_loop.R +++ b/folder_loop.R @@ -1,4 +1,4 @@ -#loop attempt with raw_data_cleanup.R +#loop attempt with raw_data_cleanup.R again library(dplyr) library(ggplot2) @@ -680,4 +680,4 @@ write.csv(data_master, paste0(rat_ID, "_data_master.csv")) } -winDialog("ok", "Code complete") \ No newline at end of file +winDialog("ok", "Code complete")