diff --git a/DESCRIPTION b/DESCRIPTION
index 9f1486f..8d8fcf7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,4 +1,4 @@
-Package: SCI
-Title: Spatial Cauasal Inference
+Package: SCI2
+Title: Spatial Causal Inference
 Version: 0.0.0.1
 Authors@R: person("Dan", "Runfola", , "drunfola@aiddata.org", role = c("aut", "cre"))
@@ -6,4 +6,4 @@ Description: An alpha release of a package designed to make impact evaluation th
 Depends: R (>= 3.0.2), sp, maptools, reshape, ggplot2, grid, gridExtra, spdep,proj4, FNN, psych
 License: Creative Commons with Attribution
 LazyData: true
-Imports:sp, maptools, reshape, ggplot2, grid, gridExtra, spdep,proj4, FNN, psych
\ No newline at end of file
+Imports: sp, maptools, reshape, ggplot2, grid, gridExtra, spdep, proj4, FNN, psych
diff --git a/R/BuildTimeSeries.R b/R/BuildTimeSeries.R
index a470ad8..f0c26bc 100644
--- a/R/BuildTimeSeries.R
+++ b/R/BuildTimeSeries.R
@@ -1,147 +1,235 @@
-BuildTimeSeries <- function(dta,idField,varList_pre,startYear,endYear,colYears=NULL,interpYears=NULL)
-{
-  years <- startYear:endYear
-  #If there is a "colYears" variable, convert to binaries.
-  #Eventually could be extended to more than one column.
-  if(!is.null(colYears))
-  {
-    #For each variable, for each year, create a binary representing the treatment status.
-    for(k in 1:length(years))
-    {
-      for(j in 1:length(colYears))
-      {
-        varN <- paste("TrtMnt_",colYears[j],years[k],sep="")
-        exec <- paste("dta$",varN,"=0",sep="")
-        eval(parse(text=exec))
-
-        dta@data[varN][dta@data[colYears[j]] <= as.numeric(years[k])] <- 1
-      }
-    }
-  }
-  for (j in 1:length(colYears))
-  {
-    trt_id = paste("TrtMnt_",colYears[j],sep="")
-    interpYears <- c(interpYears,trt_id)
-  }
-  print(interpYears)
-  #If there is an "interpVars" variable, linearly interpolate values based on at least 2 known points in time.
-  if(!is.null(interpYears))
-  {
-    for(AncInt in 1:length(interpYears))
-    {
-      cur_ancVi <- interpYears[AncInt]
-      interpFrame <- dta@data[idField]
-      interpFrame[idField] <- dta@data[idField]
-      cnt = 2
-      for(k in 1:length(years))
-      {
-        #First, build a model describing the relationship between years and any data in the interp field.
-        varI <- paste(cur_ancVi,years[[k]],sep="")
-        #Check if data exists for the year - if not, ignore. If so, include in the new modeling frame.
-        if(varI %in% colnames(dta@data))
-        {
-          add_data <- paste("interpFrame[cnt] <- dta@data$",varI)
-          eval(parse(text=add_data))
-          colnames(interpFrame)[cnt] <- years[[k]]
-          cnt = cnt + 1
-        } else
-        {
-          #Exception for a single-point interpolation
-          varC <- paste(cur_ancVi,sep="")
-          if(varC %in% colnames(dta@data))
-          {
-            add_data <- paste("interpFrame[cnt] <- dta@data$",varC)
-            eval(parse(text=add_data))
-            cnt = 3
+BuildTimeSeries <- function (dta, idField, varList_pre, startYear, endYear, colYears=NULL, interpYears=NULL) {
+
+    # generate year range
+    years <- startYear:endYear
+
+
+
+    print("bts1")
+
+    # If there is a "colYears" variable, convert to binaries.
+    # Eventually could be extended to more than one column.
+    if (!is.null(colYears)) {
+        # For each variable, for each year, create a binary representing the treatment status.
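+        # e.g. with colYears = c("trt_year") (a hypothetical column holding each unit's first
+        # treatment year) and years 1990:2010, this builds TrtMnt_trt_year_1990 ...
+        # TrtMnt_trt_year_2010, each set to 1 once treatment has begun (NA treatment years stay 0)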
+ for (j in 1:length(colYears)) { + + for (k in 1:length(years)) { + + varN = paste("TrtMnt_",colYears[j],"_",years[k], sep="") + print(varN) + + dta@data[varN] <- 0 + dta@data[varN][dta@data[colYears[j]] <= years[k]] <- 1 + dta@data[varN][is.na(dta@data[colYears[j]])] <- 0 + } - } - + + # add the "TrtMnt_" + colYears[j] prefix to interpYears + trt_id = paste("TrtMnt_",colYears[j],"_....", sep="") + interpYears <- c(interpYears, trt_id) } - #Only one time point, so no interpolation is done - value is simply copied to all other columns. - if(cnt == 3) - { - for(k in 1:length(years)) - { - varI <- paste("dta@data$",cur_ancVi,years[[k]]," <- interpFrame[2]",sep="") - eval(parse(text=varI)) - } - } else { - tDframe <- dta@data[idField] - #Here, we model out everything. - #Melt the dataframe for modeling - melt_Model_dta <- melt(data.frame(interpFrame),id=idField) - melt_Model_dta["variable"] <- as.numeric(gsub("X","",melt_Model_dta$variable)) - - #Fit the model for interpolation - execstr <- paste("mdl <- lm(value ~ variable + factor(",idField,"),data=melt_Model_dta)",sep="") - eval(parse(text=execstr)) - #Apply the model to interpolate - for(u in 1:length(years)) - { - varI <- paste(cur_ancVi,years[[u]],sep="") - if(!(varI %in% colnames(dta@data))) - { - #Variable doesn't exist, so we need to interpolate. - tDframe[idField] <- dta@data[idField] - tDframe["variable"] <- years[[u]] - dta@data[varI] <- predict(mdl,newdata=tDframe) + } + + # View(dta) + # print(interpYears) + + + + + print("bts2") + timer <- proc.time() + + # If there is an "interpVars" variable, linearly interpolate values based on at least 2 known points in time. + if (!is.null(interpYears)) { + + for (AncInt in 1:length(interpYears)) { + + print(interpYears[AncInt]) + + + cur_ancVi <- interpYears[AncInt] + + # create interpolation data frame and add id field + interpFrame <- dta@data[idField] + interpFrame[idField] <- dta@data[idField] + + cnt = 2 + + + print("bts2.0.1") + + # add data to interpolation data frame + if (cur_ancVi %in% colnames(dta@data)) { + # Exception for a single-point interpolation + interpFrame[cnt] <- dta@data[,cur_ancVi] + cnt = 3 + + } else { + for (k in 1:length(years)) { + # First, build a model describing the relationship between years and any data in the interp field. + + # Check if data exists for the year - if not, ignore. If so, include in the new modeling frame. + varI <- gsub('....', years[[k]], cur_ancVi, fixed=TRUE) + if (varI %in% colnames(dta@data)) { + + interpFrame[cnt] <- dta@data[,varI] + colnames(interpFrame)[cnt] <- years[[k]] + cnt = cnt + 1 + + } + } + } + + print(cnt) + + + print("bts2.0.2") + + if (cnt == 3) { + # only one time point, so no interpolation is done - value is simply copied to all other columns. 
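+                # e.g. a hypothetical static column "elevation" (no year suffix) would be
+                # copied unchanged into elevation_1990 ... elevation_2010 below, so it can
+                # be melted alongside the truly yearly fields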
+ + print("bts2.0.2a") + for (k in 1:length(years)) { + # add _year to end of non temporal data + dta@data[,paste(cur_ancVi,years[[k]],sep="_")] <- interpFrame[2] + } + + } else if (cnt < length(years) + 2) { + # run model if data exists for at least two years but not for all years + print("bts2.0.2b0") + + # Melt the dataframe for modeling + melt_Model_dta <- melt(data.frame(interpFrame), id=idField) + melt_Model_dta["variable"] <- as.numeric(gsub("X", "", melt_Model_dta[,"variable"])) + + + print("bts2.0.2b1") + + # Fit the model for interpolation + # this is a slow part + execstr <- paste("mdl <- lm(value ~ variable + factor(",idField,"),data=melt_Model_dta)", sep="") + eval(parse(text=execstr)) + # mdl <- lm(value ~ variable + factor(idField), data=melt_Model_dta) + + + print("bts2.0.2b2") + + # Apply the model to interpolate + tDframe <- dta@data[idField] + + for (u in 1:length(years)) { + + varI <- gsub('....', years[[u]], cur_ancVi, fixed=TRUE) + if (!(varI %in% colnames(dta@data))) { + # Variable doesn't exist, so we need to interpolate. + tDframe[idField] <- dta@data[idField] + tDframe["variable"] <- years[[u]] + dta@data[varI] <- predict(mdl, newdata=tDframe) + } + } } - } + } - - } - - #Append interpolated fields to our melting lists - for(v in 1:length(interpYears)) - { - varList_pre[[length(varList_pre)+1]] <- interpYears[v] - } + + print("bts2.1") + # Append interpolated fields to our melting lists + varList_pre <- c(varList_pre, interpYears) - } - - - #Run the melts - - meltList <- list() - for (i in 1:length(varList_pre)) - { - #grep_str = paste(idField,"|",varList_pre[i],"[0-9][0-9][0-9][0-9]",sep="") - #Limit to only relevant years - grepStrYrs = idField - for(j in 1:length(years)) - { - tempGrep <- grepStrYrs - grepStrYrs <- paste(tempGrep,"|",varList_pre[[i]],years[[j]],sep="") } - tDF <- dta@data[grepl(grepStrYrs,names(dta@data))] - meltList[[i]] <- melt(tDF,id=idField) - - #Keep only years in the year column, rename columns - colnames(meltList[[i]])[2] <- "Year" - colnames(meltList[[i]])[3] <- varList_pre[[i]] - - #Clean up year column - gsub_command <- paste("^",varList_pre[[i]],sep="") - meltList[[i]][2] <- gsub(gsub_command, "", as.matrix(meltList[[i]][2])) - - - #Remove ID and year if this is at least the second variable to avoid duplications. 
- if(i > 1) - { - meltList[[i]] <- meltList[[i]][3] + + print(varList_pre) + + timer <- proc.time() - timer + print(paste("section completed in", timer[3], "seconds.")) + + + + + print("bts3") + timer <- proc.time() + + # Run the melts + meltList <- list() + for (i in 1:length(varList_pre)) { + + print("bts3.0") + print(varList_pre[[i]]) + + # Limit to only relevant years + grepStrYrs = idField + + for (j in 1:length(years)) { + tempGrep <- grepStrYrs + + if (regexpr("....", varList_pre[[i]], fixed=TRUE)[1] == -1) { + grepStrYrs <- paste(tempGrep,"|",paste(varList_pre[[i]],years[[j]], sep="_"), sep="") + } else { + grepStrYrs <- paste(tempGrep,"|",gsub('....', years[[j]], varList_pre[[i]], fixed=TRUE), sep="") + } + } + + print(grepStrYrs) + + print("bts3.1") + # print(names(dta@data)) + + tDF <- dta@data[grepl(grepStrYrs, names(dta@data))] + meltList[[i]] <- melt(tDF, id=idField) + + print("bts3.2") + # print(colnames(meltList[[i]])) + # Keep only years in the year column, rename columns + colnames(meltList[[i]])[2] <- "Year" + + print("bts3.3") + + colnames(meltList[[i]])[3] <- varList_pre[[i]] + + + print("bts3.4") + + if (i == 1) { + # set field to use for regex when formatting year field later + year_regex_field <- varList_pre[[i]] + + } else { + # remove id and year after first pass to avoid duplications + meltList[[i]] <- meltList[[i]][3] + + } + } - } + timer <- proc.time() - timer + print(paste("section completed in", timer[3], "seconds.")) - #Finish up with a cherry on top - meltListRet <- data.frame(meltList) - - return(meltListRet) + + + + print("bts4") + + # convert meltList to data frame + meltListRet <- data.frame(meltList) + + # format year + regex_test <- regexpr("....", year_regex_field, fixed=TRUE)[1] + if (regex_test > -1) { + meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { + return(as.integer(substr(z, regex_test, regex_test+nchar("....")-1))) + }) + + } else { + meltListRet['Year'] <- lapply(meltListRet['Year'], function (z) { + return(as.integer(substr(z, nchar(z)-nchar("....")+1, nchar(z)))) + }) + } + + return(meltListRet) } # dm1 <- melt(d[,c("Type","I.alt","idx06","idx07","idx08")], id=c("Type","I.alt")) # dm2 <- melt(d[,c("Type","I.alt","farve1","farve2")], id=c("Type","I.alt")) # colnames(dm2) <- c("Type", "I.alt", "variable2", "value2") -# dm <- merge(dm1, dm2) \ No newline at end of file +# dm <- merge(dm1, dm2) diff --git a/R/PSM_correlogram.R b/R/PSM_correlogram.R index 2e44f33..f4f9107 100644 --- a/R/PSM_correlogram.R +++ b/R/PSM_correlogram.R @@ -1,66 +1,77 @@ -#forked from SPDEP (sp.correlogram) +# forked from SPDEP (sp.correlogram) +PSM_correlogram <- function (dta, var, order = 1, style = "W", randomisation = TRUE, zero.policy = NULL, spChk = NULL, start, end) { -PSM_correlogram <- function (dta, var, order = 1, style = "W", - randomisation = TRUE, zero.policy = NULL, spChk = NULL, start,end) -{ - stopifnot(is.vector(var)) - if (any(is.na(var))) - stop("no NAs permitted in variable") - if (is.null(zero.policy)) - zero.policy <- get("zeroPolicy", envir = .spdepOptions) - stopifnot(is.logical(zero.policy)) - if (is.null(spChk)) - spChk <- get.spChkOption() - if (order < 1) - stop("order less than 1") - - #nblags <- nblag(neighbours, maxlag = order) - #r.nb <- dnearneigh(as.matrix(coordinates(dta_prj)),d1=c1,d2=c2) - nblags <- vector(mode = "list", length = order) - end = end * 1000 - start = start * 1000 - rng_increment = (end-start) / order - cur_step = start + rng_increment - cur_start = start - nblags[[1]] <- 
dnearneigh(dta,d1=cur_start,d2=cur_step) - binname <- list() - binname <- rbind(binname,(cur_start/1000)) - for(L in 2:order) - { - cur_start = cur_step - cur_step = cur_step + rng_increment - nblags[[L]] <- dnearneigh(dta,d1=cur_start,d2=cur_step) - binname <- rbind(binname,(cur_start/1000)) - } - cardnos <- vector(mode = "list", length = order) - for (i in 1:order) cardnos[[i]] <- table(card(nblags[[i]])) - nobs <- sapply(cardnos, function(x) sum(x[names(x) > "0"])) - #if (any(nobs < 3)) - # stop("sp.correlogram: too few included observations in higher lags:\n\treduce order.") - res <- matrix(NA, nrow = 0, ncol = 3) - cnt = 0 + stopifnot(is.vector(var)) + + if (any(is.na(var))) + stop("no NAs permitted in variable") + + if (is.null(zero.policy)) + zero.policy <- get("zeroPolicy", envir = .spdepOptions) + + stopifnot(is.logical(zero.policy)) + + if (is.null(spChk)) + spChk <- get.spChkOption() + + if (order < 1) + stop("order less than 1") + + #nblags <- nblag(neighbours, maxlag = order) + #r.nb <- dnearneigh(as.matrix(coordinates(dta_prj)),d1=c1,d2=c2) + + end = end * 1000 + start = start * 1000 + rng_increment = (end-start) / order + nblags <- vector(mode = "list", length = order) + binname <- list() + + cur_step = start + rng_increment + cur_start = start + nblags[[1]] <- dnearneigh(dta, d1=cur_start, d2=cur_step) + binname <- rbind(binname, (cur_start/1000)) + + for (L in 2:order) { + cur_start = cur_step + cur_step = cur_step + rng_increment + nblags[[L]] <- dnearneigh(dta, d1=cur_start, d2=cur_step) + binname <- rbind(binname, (cur_start/1000)) + } + + cardnos <- vector(mode = "list", length = order) + for (i in 1:order) { + cardnos[[i]] <- table(card(nblags[[i]])) + } + + nobs <- sapply(cardnos, function(x) sum(x[names(x) > "0"])) + + #if (any(nobs < 3)) + # stop("sp.correlogram: too few included observations in higher lags:\n\treduce order.") + res <- matrix(NA, nrow = 0, ncol = 3) + cnt = 0 + for (i in 1:order) { + if (nobs[[i]] == 0) { + cardnos <- cardnos[-i] + cnt = cnt + 1 + warn_str = paste("Bin h=",i,"was empty, and is excluded from the plot.") + warning(warn_str) + } else { + listw <- nb2listw(nblags[[i]], style = style, zero.policy = zero.policy) + res <- rbind(res, moran.test(var, listw, randomisation = randomisation, zero.policy = zero.policy)$estimate) + cur_rw = i - cnt + rownames(res)[cur_rw] <- binname[[i]] + } + } + order = order - cnt + #rownames(res) <- 1:order + + cardnos <- vector(mode = "list", length = order) for (i in 1:order) { - if(nobs[[i]] == 0) - { - cardnos <- cardnos[-i] - cnt = cnt + 1 - warn_str = paste("Bin h=",i,"was empty, and is excluded from the plot.") - warning(warn_str) - } else { - listw <- nb2listw(nblags[[i]], style = style, zero.policy = zero.policy) - res <- rbind(res,moran.test(var, listw, randomisation = randomisation, - zero.policy = zero.policy)$estimate) - cur_rw = i - cnt - rownames(res)[cur_rw] <- binname[[i]] - } + cardnos[[i]] <- table(card(nblags[[i]])) } - order = order - cnt - #rownames(res) <- 1:order - cardnos <- vector(mode = "list", length = order) - for (i in 1:order) cardnos[[i]] <- table(card(nblags[[i]])) - obj <- list(res = res, method = "I", cardnos = cardnos, - var = deparse(substitute(var))) - class(obj) <- "spcor" - print(obj) - obj + + obj <- list(res = res, method = "I", cardnos = cardnos, var = deparse(substitute(var))) + class(obj) <- "spcor" + print(obj) + obj } \ No newline at end of file diff --git a/R/PSMdistDecay.R b/R/PSMdistDecay.R index 20ae298..77ce202 100644 --- a/R/PSMdistDecay.R +++ 
b/R/PSMdistDecay.R
@@ -1,22 +1,22 @@
-#PSM distance decay examination - should we enforce a threshold for matches or not?
-#This is a wrapper for a heavily modified sp.correlogram from SPDEP
-#This new function (PSM_correlogram) allows for the specification of distance bands
-#Neighbors within each band are then tested for Moran's I correlation.
-PSMdistDecay = function(dta,psm_col,start,end,h)
-{
-  #Produce a corellogram using Moran's I at varying resolutions
-  #First, convert to an equal-distance projection
-
-  #Need to update this so it handles projections correctly every time.
-  #Currently hacked together.
-
-  dta_prj_coords <- project(as.matrix(coordinates(dta)),"+proj=laea")
-  dta_prj <- as(dta,"data.frame")
-  coordinates(dta_prj) <- dta_prj_coords
-
-  exec <- paste("PSM_correlogram(as.matrix(coordinates(dta_prj)),dta_prj$",psm_col,",order=",h,",zero.policy=TRUE,start=",start,",end=",end,")",sep="")
-  sp.cor <- eval(parse(text=exec))
-  plot(sp.cor)
-  return(sp.cor)
+# PSM distance decay examination - should we enforce a threshold for matches or not?
+# This is a wrapper for a heavily modified sp.correlogram from SPDEP
+# This new function (PSM_correlogram) allows for the specification of distance bands
+# Neighbors within each band are then tested for Moran's I correlation.
+PSMdistDecay = function(dta, psm_col, start, end, h) {
+    # Produce a correlogram using Moran's I at varying resolutions
+
+    # First, convert to an equal-distance projection
+    # Need to update this so it handles projections correctly every time.
+    # Currently hacked together.
+    dta_prj_coords <- project(as.matrix(coordinates(dta)),"+proj=laea")
+    dta_prj <- as(dta,"data.frame")
+    coordinates(dta_prj) <- dta_prj_coords
+
+    # run PSM_correlogram and return results
+    exec <- paste("PSM_correlogram(as.matrix(coordinates(dta_prj)),dta_prj$",psm_col,",order=",h,",zero.policy=TRUE,start=",start,",end=",end,")", sep="")
+    sp.cor <- eval(parse(text=exec))
+    plot(sp.cor)
+
+    return(sp.cor)
 }
diff --git a/R/SAT.R b/R/SAT.R
index f630f35..a3e25a6 100644
--- a/R/SAT.R
+++ b/R/SAT.R
@@ -1,227 +1,517 @@
-SAT <- function(dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinColName)
-{
-  #Initialization
-  pltObjs <- list()
-  init_dta <- dta
-
-  drop_unmatched = drop_opts["drop_unmatched"]
-  drop_method = drop_opts["drop_method"]
-  drop_thresh = as.numeric(drop_opts["drop_thresh"])
-
-  if(!is.null(constraints))
-  {
-    for(cst in 1:length(names(constraints)))
-    {
-      if(names(constraints)[cst] == "groups")
-      {
-        exec_stmnt = paste("dta$ConstraintGroupSet_Opt <- dta$",constraints["groups"],sep="")
-        eval(parse(text=exec_stmnt))
-      } else
-      {
-        dta$ConstraintGroupSet_Opt <- 1
-      }
-      if(names(constraints)[cst] == "distance")
-      {
-        dist_PSM = as.numeric(constraints["distance"][[1]])
-      } else
-      {
-        dist_PSM=NULL
-      }
-    }
-  } else {
-    dta$ConstraintGroupSet_Opt <- 1
-    #max the distance threshold by taking the diagonal of the bounding box.
-    dist_PSM = NULL
-  }
-
-  #Caclulate the number of groups to constrain by, if any. 
- group_constraints <- unique(dta$ConstraintGroupSet_Opt) - - #Make sure there are both treatment and control groups of an adequate size (>= 1 of each) - t_dta <- list() - u_dta <-list() - grp_list <- list() - cnt = 0 - for (grp in 1:length(group_constraints)) - { - cur_grp <- as.matrix(group_constraints)[grp] - grp_index = length(grp_list)+1 - t_index = length(t_dta)+1 - grp_list[[grp_index]] <- as.matrix(group_constraints)[grp] - t_dta[[t_index]] <- dta[dta$TrtBin == 1,] - u_dta[[t_index]] <- dta[dta$TrtBin == 0,] - treatment_count <- cur_grp %in% t_dta[[t_index]]$ConstraintGroupSet_Opt - untreated_count <- cur_grp %in% u_dta[[t_index]]$ConstraintGroupSet_Opt - if((untreated_count == FALSE) || (treatment_count == FALSE)) - { - dta <- dta[!dta$ConstraintGroupSet_Opt == cur_grp,] - t_dta[[t_index]] <- NULL - u_dta[[t_index]] <- NULL - grp_list[[t_index]] <- NULL - war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") - warning(war_statement) +# SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinColName) { + +# dta <- as.data.table(dta@data) + +# #Initialization +# pltObjs <- list() +# init_dta <- dta + +# drop_unmatched = drop_opts["drop_unmatched"] +# drop_method = drop_opts["drop_method"] +# drop_thresh = as.numeric(drop_opts["drop_thresh"]) + + + +# print("sat1") + +# if (!is.null(constraints) && constraints != c()) { +# for (cst in 1:length(names(constraints))) { +# if (names(constraints)[cst] == "groups") { +# dta[,"ConstraintGroupSet_Opt"] <- dta[,get(constraints["groups"])] + +# } else { +# dta[,"ConstraintGroupSet_Opt"] <- 1 +# } + +# if (names(constraints)[cst] == "distance") { +# dist_PSM = as.numeric(constraints["distance"][[1]]) +# } else { +# dist_PSM=NULL +# } +# } + +# } else { +# dta[,"ConstraintGroupSet_Opt"] <- 1 +# #max the distance threshold by taking the diagonal of the bounding box. +# dist_PSM = NULL +# } + + + +# print("sat2") + +# #Caclulate the number of groups to constrain by, if any. 
+# group_constraints <- unique(dta[,'ConstraintGroupSet_Opt'] + +# #Make sure there are both treatment and control groups of an adequate size (>= 1 of each) +# t_dta <- list() +# u_dta <-list() +# grp_list <- list() +# cnt = 0 + +# for (grp in 1:length(group_constraints)) { +# cur_grp <- as.matrix(group_constraints)[grp] +# grp_index = length(grp_list)+1 +# t_index = length(t_dta)+1 +# grp_list[[grp_index]] <- as.matrix(group_constraints)[grp] + +# t_dta[[t_index]] <- dta[TrtBin == 1] +# u_dta[[t_index]] <- dta[TrtBin == 0] + +# has_treated <- cur_grp %in% t_dta[[t_index]][,'ConstraintGroupSet_Opt'] +# has_untreated <- cur_grp %in% u_dta[[t_index]][,'ConstraintGroupSet_Opt'] + +# if ((has_untreated == FALSE) || (has_treated == FALSE)) { +# dta <- dta['ConstraintGroupSet_Opt' != cur_grp] +# t_dta[[t_index]] <- NULL +# u_dta[[t_index]] <- NULL +# grp_list[[t_index]] <- NULL +# war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") +# warning(war_statement) + +# } else { +# t_dta[[t_index]] <- t_dta[[t_index]][ConstraintGroupSet_Opt == (cur_grp)] +# u_dta[[t_index]] <- u_dta[[t_index]][ConstraintGroupSet_Opt == (cur_grp)] + +# cnt = cnt + 1 +# } +# } + + +# if (cnt == 0) { +# return('drop') +# } + + +# print("sat3") + +# temp_dta <- list() + +# for (i in 1:cnt) { +# cur_grp <- grp_list[[i]] + +# print("sat3.1") +# it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) + +# print("sat3.2") +# if (mtd == "fastNN") { +# # *** +# # this is the slow part of functions +# temp_dta[[i]] <- fastNN_binary_func(it_dta, TrtBinColName, ids, cur_grp, dist_PSM) +# } + +# if (mtd == "NN_WithReplacement") { +# print("NN with replacement is currently not available, please choose fastNN") +# # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) +# } +# } + +# print("sat4") + +# #Build the final datasets from subsets +# if (cnt > 1) { +# dta <- temp_dta[[1]] +# for(k in 2:cnt) { +# dta <- maptools::spRbind(dta, temp_dta[[k]]) +# } +# } else { +# dta <- temp_dta[[1]] +# } + +# print("sat5") + +# if (drop_unmatched == TRUE) { +# dta <- dta["PSM_match_ID" != -999] +# } + +# anc_v_int <- strsplit(psm_eq, "~")[[1]][2] +# anc_vars <- strsplit(gsub(" ","",anc_v_int), "+", fixed=TRUE) +# anc_vars <- c(anc_vars[[1]], "PSM_trtProb") + +# print("sat6") + +# #Drop observations according to the selected method +# if (drop_method == "SD") { +# #Method to drop pairs that are greater than a set threshold apart in terms of PSM Standard Deviations. +# psm_sd_thresh = sd(dta[,"PSM_trtProb"]) * drop_thresh +# if (visual == "TRUE") { +# print(psm_sd_thresh) +# } +# dta <- dta["PSM_distance" < psm_sd_thresh] +# } + + + +# #Plot the pre and post-dropping balance for PSM model... +# #Balance metrics are based on "Misunderstandings between experimentalists and +# #observationalists about causal inference", Imal, King, and Stuart. +# #Simplest suggestion of comparing means and checking if .25 SD apart used. 
+# cnt = 0 + +# print("sat7") + +# for (i in 1:length(anc_vars)) { + +# print("sat7.0") + +# #gsub to remove any factors() +# ed_v = sub("factor\\(","",anc_vars[i]) +# ed_v = sub(")","",ed_v) + +# c_type = class(init_dta[[ed_v]]) + +# print("sat7.1") +# if (c_type == "matrix") { + +# dta[,ed_v] <- as.numeric(dta[,ed_v]) + +# init_dta[,ed_v] <- as.numeric(init_dta[,ed_v]) + +# c_type = "numeric" +# } + +# print("sat7.2") +# if ((c_type == "numeric") & (visual == "TRUE")) { +# cnt = cnt + 1 +# pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) +# pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) + + +# treat_mean_pre <- round(describeBy(init_dta[[ed_v]], group=init_dta[[TrtBinColName]])[[2]][[3]], 5) +# treat_SD_pre <- round(describeBy(init_dta[[ed_v]], group=init_dta[[TrtBinColName]])[[2]][[4]], 5) +# control_mean_pre <- round(describeBy(init_dta[[ed_v]], group=init_dta[[TrtBinColName]])[[1]][[3]], 5) +# control_SD_pre <- round(describeBy(init_dta[[ed_v]], group=init_dta[[TrtBinColName]])[[1]][[4]], 5) + +# treat_mean_post <- round(describeBy(dta[[ed_v]], group=dta[[TrtBinColName]])[[2]][[3]], 5) +# treat_SD_post <- round(describeBy(dta[[ed_v]], group=dta[[TrtBinColName]])[[2]][[4]], 5) +# control_mean_post <- round(describeBy(dta[[ed_v]], group=dta[[TrtBinColName]])[[1]][[3]], 5) +# control_SD_post <- round(describeBy(dta[[ed_v]], group=dta[[TrtBinColName]])[[1]][[4]], 5) + + +# it_diff_Mean_pre <- round(abs( treat_mean_pre-control_mean_pre ),5) +# it_diff_Mean_post <- round(abs(treat_mean_post-control_mean_post),5) + +# if (!exists("bRes")) { + +# bRes <- data.frame(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, +# treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, +# it_diff_Mean_pre,it_diff_Mean_post) + +# colnames(bRes)[1] <- "Pre-Balance Treated Mean" +# colnames(bRes)[2] <- "Pre-Balance Treated SD" +# colnames(bRes)[3] <- "Pre-Balance Control Mean" +# colnames(bRes)[4] <- "Pre-Balance Control SD" + +# colnames(bRes)[5] <- "Post-Balance Treated Mean" +# colnames(bRes)[6] <- "Post-Balance Treated SD" +# colnames(bRes)[7] <- "Post-Balance Control Mean" +# colnames(bRes)[8] <- "Post-Balance Control SD" + +# colnames(bRes)[9] <- "Mean Difference Pre-Balance" +# colnames(bRes)[10] <- "Mean Difference Post-Balance" + +# } else { +# bRes <- rbind(bRes, c(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, +# treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, +# it_diff_Mean_pre,it_diff_Mean_post)) +# } + +# rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]", "", ed_v) +# } +# } + +# print("sat8") + +# if (visual=="TRUE") { +# #Output graphics +# #Remove the factor rows +# nrow_c <- length(pltObjs) +# counter <- 1 +# while (counter <= nrow_c) { +# d = counter + 3 +# if (d > nrow_c) { +# d = nrow_c +# } +# do.call(grid.arrange,c(pltObjs[counter:d],nrow=2,ncol=2)) +# counter = counter + 4 +# } +# #bTab <- stargazer(bRes,summary=FALSE,type="html") +# #print.htmlTable(bTab) +# } + + +# return (as.data.frame(dta)) +# } + + + + + +SAT <- function (dta, mtd, constraints, psm_eq, ids, drop_opts, visual, TrtBinColName) { + #Initialization + pltObjs <- list() + init_dta <- dta + + drop_unmatched = drop_opts["drop_unmatched"] + drop_method = drop_opts["drop_method"] + drop_thresh = as.numeric(drop_opts["drop_thresh"]) + + + + print("sat1") + + # if (!is.null(constraints) && constraints != c()) { + if (!is.null(constraints) ) { + + 
print("sat1a.1") + + for (cst in 1:length(names(constraints))) { + if (names(constraints)[cst] == "groups") { + dta@data[,"ConstraintGroupSet_Opt"] <- dta@data[,constraints["groups"]] + + } else { + dta$ConstraintGroupSet_Opt <- 1 + } + + if (names(constraints)[cst] == "distance") { + dist_PSM = as.numeric(constraints["distance"][[1]]) + } else { + dist_PSM=NULL + } + } + + + print("sat1a.2") + + # Caclulate the number of groups to constrain by, if any. + group_constraints <- unique(dta$ConstraintGroupSet_Opt) + + # Make sure there are both treatment and control groups of an adequate size (>= 1 of each) + t_dta <- list() + u_dta <-list() + grp_list <- list() + cnt <- 0 + + for (grp in 1:length(group_constraints)) { + cur_grp <- as.matrix(group_constraints)[grp] + grp_index = length(grp_list)+1 + t_index = length(t_dta)+1 + grp_list[[grp_index]] <- as.matrix(group_constraints)[grp] + + t_dta[[t_index]] <- dta[dta$TrtBin == 1,] + u_dta[[t_index]] <- dta[dta$TrtBin == 0,] + + treatment_count <- cur_grp %in% t_dta[[t_index]]$ConstraintGroupSet_Opt + untreated_count <- cur_grp %in% u_dta[[t_index]]$ConstraintGroupSet_Opt + + if ((untreated_count == FALSE) || (treatment_count == FALSE)) { + dta <- dta[!dta$ConstraintGroupSet_Opt == cur_grp,] + t_dta[[t_index]] <- NULL + u_dta[[t_index]] <- NULL + grp_list[[t_index]] <- NULL + war_statement = paste("Dropped group due to a lack of both treatment and control observation: '",cur_grp,"'",sep="") + warning(war_statement) + + } else { + t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] + u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] + + cnt <- cnt + 1 + } + } + + + if (cnt == 0) { + return('drop') + } + + + print("sat1a.3") + + + for (i in 1:cnt) { + cur_grp <- grp_list[[i]] + + print("sat1a.3.1") + it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) + + print("sat1a.3.2") + if (mtd == "fastNN") { + # *** + # this is the slow part of functions + temp_dta[[i]] <- fastNN_binary_func(it_dta, TrtBinColName, ids, cur_grp, dist_PSM) + } + + # if (mtd == "NN_WithReplacement") { + # print("NN with replacement is currently not available, please choose fastNN") + # # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + # } + } + + print("sat1a.4") + + #Build the final datasets from subsets + if (cnt > 1) { + dta <- temp_dta[[1]] + for(k in 2:cnt) { + dta <- maptools::spRbind(dta, temp_dta[[k]]) + } + } else { + dta <- temp_dta[[1]] + } + + } else { - - t_dta[[t_index]] <- t_dta[[t_index]][t_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] - u_dta[[t_index]] <- u_dta[[t_index]][u_dta[[t_index]]$ConstraintGroupSet_Opt == cur_grp,] - - cnt = cnt + 1 - } - } - temp_dta <- list() -for(i in 1:cnt) - { - cur_grp <- grp_list[[i]] - it_dta <- maptools::spRbind(t_dta[[i]],u_dta[[i]]) - - - if (mtd == "fastNN") - { - temp_dta[[i]] <- fastNN_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) - } - - if (mtd == "NN_WithReplacement") - { - print("NN with replacement is currently not available, please choose fastNN") - # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + + print("sat1b.1") + + cnt <- 1 + temp_dta <- list() + + if (mtd == "fastNN") { + # *** + # this is the slow part of functions + dta <- fastNN_binary_func(dta, TrtBinColName, ids, NULL, NULL) + + if (class(dta) == class('drop')) { + return('drop') + } + } + + + # if (mtd == "NN_WithReplacement") { + # print("NN with replacement is currently 
not available, please choose fastNN") + # # temp_dta[[i]] <- NN_WithReplacement_binary_func(it_dta,TrtBinColName,ids,cur_grp,dist_PSM) + # } + + } - } - -#Build the final datasets from subsets -if(cnt > 1) -{ - dta <- temp_dta[[1]] - for(k in 2:cnt) - { - dta <- maptools::spRbind(dta, temp_dta[[k]]) - } -} else { - dta <- temp_dta[[1]] -} + + vvv <<- dta - if (drop_unmatched == TRUE) - { - dta <- dta[dta@data$PSM_match_ID != -999,] - } - - anc_v_int <- strsplit(psm_eq, "~")[[1]][2] - anc_vars <- strsplit(gsub(" ","",anc_v_int), "+", fixed=TRUE) - anc_vars <- c(anc_vars[[1]], "PSM_trtProb") - - #Drop observations according to the selected method - if(drop_method == "SD") - { - #Method to drop pairs that are greater than a set threshold apart in terms of PSM Standard Deviations. - psm_sd_thresh = sd(dta$PSM_trtProb) * drop_thresh - if(visual == "TRUE") - { - print(psm_sd_thresh) + print("sat5") + + if (drop_unmatched == TRUE) { + dta <- dta[dta@data[,"PSM_match_ID"] != -999,] } - dta <- dta[dta@data$PSM_distance < psm_sd_thresh,] - } - - #Plot the pre and post-dropping balance for PSM model... - #Balance metrics are based on "Misunderstandings between experimentalists and - #observationalists about causal inference", Imal, King, and Stuart. - #Simplest suggestion of comparing means and checking if .25 SD apart used. - cnt = 0 - for (i in 1:length(anc_vars)) - { - #gsub to remove any factors() - ed_v = sub("factor\\(","",anc_vars[i]) - ed_v = sub(")","",ed_v) - treat_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[3]],5)") - treat_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[2]][[4]],5)") - - control_mean_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[3]],5)") - control_SD_pre = paste("round(describeBy(init_dta@data$",ed_v,", group=init_dta@data$",TrtBinColName,")[[1]][[4]],5)") - - treat_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[3]],5)") - treat_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[2]][[4]],5)") - - control_mean_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[3]],5)") - control_SD_post = paste("round(describeBy(dta@data$",ed_v,", group=dta@data$",TrtBinColName,")[[1]][[4]],5)") - - c_type = eval(parse(text=paste("class(init_dta@data$",ed_v,")"))) - - if(c_type == "matrix") - { - exec_str = paste("dta@data$",ed_v,"<- as.numeric(dta@data$",ed_v,")",sep="") - eval(parse(text=exec_str)) - - exec_str = paste("init_dta@data$",ed_v,"<- as.numeric(init_dta@data$",ed_v,")",sep="") - eval(parse(text=exec_str)) - c_type = "numeric" + + anc_v_int <- strsplit(psm_eq, "~")[[1]][2] + anc_vars <- strsplit(gsub(" ","",anc_v_int), "+", fixed=TRUE) + anc_vars <- c(anc_vars[[1]], "PSM_trtProb") + + print("sat6") + + #Drop observations according to the selected method + if (drop_method == "SD") { + #Method to drop pairs that are greater than a set threshold apart in terms of PSM Standard Deviations. 
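+        # e.g. with drop_opts["drop_thresh"] = 0.25 (the rule of thumb cited below), pairs
+        # whose propensity scores differ by more than 0.25 SD of PSM_trtProb are discarded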
+ psm_sd_thresh = sd(dta@data[,"PSM_trtProb"]) * drop_thresh + if (visual == "TRUE") { + print(psm_sd_thresh) + } + dta <- dta[dta@data[,"PSM_distance"] < psm_sd_thresh,] } - if((c_type == "numeric") & (visual == "TRUE")) - { - cnt = cnt + 1 - pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) - pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) - - treat_mean_pre <- eval(parse(text=treat_mean_pre)) - treat_SD_pre <- eval(parse(text=treat_SD_pre)) - control_mean_pre <- eval(parse(text=control_mean_pre)) - control_SD_pre <- eval(parse(text=control_SD_pre)) - - treat_mean_post <- eval(parse(text=treat_mean_post)) - treat_SD_post <- eval(parse(text=treat_SD_post)) - control_mean_post <- eval(parse(text=control_mean_post)) - control_SD_post <- eval(parse(text=control_SD_post)) - - it_diff_Mean_pre <- round(abs( treat_mean_pre-control_mean_pre ),5) - it_diff_Mean_post <- round(abs(treat_mean_post-control_mean_post),5) - - if(!exists("bRes")) - { - bRes <- data.frame(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, + + + + #Plot the pre and post-dropping balance for PSM model... + #Balance metrics are based on "Misunderstandings between experimentalists and + #observationalists about causal inference", Imal, King, and Stuart. + #Simplest suggestion of comparing means and checking if .25 SD apart used. + cnt = 0 + + print("sat7") + + for (i in 1:length(anc_vars)) { + + print("sat7.0") + + #gsub to remove any factors() + ed_v = sub("factor\\(","",anc_vars[i]) + ed_v = sub(")","",ed_v) + + c_type = class(init_dta@data[[ed_v]]) + + print("sat7.1") + if (c_type == "matrix") { + + dta@data[,ed_v] <- as.numeric(dta@data[,ed_v]) + + init_dta@data[,ed_v] <- as.numeric(init_dta@data[,ed_v]) + + c_type = "numeric" + } + + print("sat7.2") + if ((c_type == "numeric") & (visual == "TRUE")) { + cnt = cnt + 1 + pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(init_dta, anc_vars[i],"Pre-Balancing: ",simple_out = FALSE) + pltObjs[[length(pltObjs) + 1]] <- GroupCompHist(dta, anc_vars[i],"Post-Balancing: ",simple_out = FALSE) + + + treat_mean_pre <- round(describeBy(init_dta@data[[ed_v]], group=init_dta@data[[TrtBinColName]])[[2]][[3]], 5) + treat_SD_pre <- round(describeBy(init_dta@data[[ed_v]], group=init_dta@data[[TrtBinColName]])[[2]][[4]], 5) + control_mean_pre <- round(describeBy(init_dta@data[[ed_v]], group=init_dta@data[[TrtBinColName]])[[1]][[3]], 5) + control_SD_pre <- round(describeBy(init_dta@data[[ed_v]], group=init_dta@data[[TrtBinColName]])[[1]][[4]], 5) + + treat_mean_post <- round(describeBy(dta@data[[ed_v]], group=dta@data[[TrtBinColName]])[[2]][[3]], 5) + treat_SD_post <- round(describeBy(dta@data[[ed_v]], group=dta@data[[TrtBinColName]])[[2]][[4]], 5) + control_mean_post <- round(describeBy(dta@data[[ed_v]], group=dta@data[[TrtBinColName]])[[1]][[3]], 5) + control_SD_post <- round(describeBy(dta@data[[ed_v]], group=dta@data[[TrtBinColName]])[[1]][[4]], 5) + + + it_diff_Mean_pre <- round(abs( treat_mean_pre-control_mean_pre ),5) + it_diff_Mean_post <- round(abs(treat_mean_post-control_mean_post),5) + + if (!exists("bRes")) { + + bRes <- data.frame(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, it_diff_Mean_pre,it_diff_Mean_post) - colnames(bRes)[1] <- "Pre-Balance Treated Mean" - colnames(bRes)[2] <- "Pre-Balance Treated SD" - colnames(bRes)[3] <- "Pre-Balance Control Mean" - colnames(bRes)[4] <- 
"Pre-Balance Control SD" - - colnames(bRes)[5] <- "Post-Balance Treated Mean" - colnames(bRes)[6] <- "Post-Balance Treated SD" - colnames(bRes)[7] <- "Post-Balance Control Mean" - colnames(bRes)[8] <- "Post-Balance Control SD" - - colnames(bRes)[9] <- "Mean Difference Pre-Balance" - colnames(bRes)[10] <- "Mean Difference Post-Balance" - }else{ - bRes <- rbind(bRes, c(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, + + colnames(bRes)[1] <- "Pre-Balance Treated Mean" + colnames(bRes)[2] <- "Pre-Balance Treated SD" + colnames(bRes)[3] <- "Pre-Balance Control Mean" + colnames(bRes)[4] <- "Pre-Balance Control SD" + + colnames(bRes)[5] <- "Post-Balance Treated Mean" + colnames(bRes)[6] <- "Post-Balance Treated SD" + colnames(bRes)[7] <- "Post-Balance Control Mean" + colnames(bRes)[8] <- "Post-Balance Control SD" + + colnames(bRes)[9] <- "Mean Difference Pre-Balance" + colnames(bRes)[10] <- "Mean Difference Post-Balance" + + } else { + bRes <- rbind(bRes, c(treat_mean_pre,treat_SD_pre,control_mean_pre,control_SD_pre, treat_mean_post,treat_SD_post,control_mean_post,control_SD_post, it_diff_Mean_pre,it_diff_Mean_post)) - } - - - rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]","",ed_v) + } + + rownames(bRes)[i-(i-cnt)] <- gsub("[^a-zA-Z0-9]", "", ed_v) + } } - } - - if(visual=="TRUE") - { - #Output graphics - #Remove the factor rows - nrow_c <- length(pltObjs) - counter <- 1 - while(counter <= nrow_c) - { - d = counter + 3 - if(d > nrow_c) - { - d = nrow_c - } - do.call(grid.arrange,c(pltObjs[counter:d],nrow=2,ncol=2)) - counter = counter + 4 + + print("sat8") + + if (visual=="TRUE") { + #Output graphics + #Remove the factor rows + nrow_c <- length(pltObjs) + counter <- 1 + while (counter <= nrow_c) { + d = counter + 3 + if (d > nrow_c) { + d = nrow_c + } + do.call(grid.arrange,c(pltObjs[counter:d],nrow=2,ncol=2)) + counter = counter + 4 + } + #bTab <- stargazer(bRes,summary=FALSE,type="html") + #print.htmlTable(bTab) } - #bTab <- stargazer(bRes,summary=FALSE,type="html") - #print.htmlTable(bTab) - } - - return (dta) + + return (dta) } diff --git a/R/SpatialCausalPSM.R b/R/SpatialCausalPSM.R index 6af506a..44b6a41 100644 --- a/R/SpatialCausalPSM.R +++ b/R/SpatialCausalPSM.R @@ -1,56 +1,59 @@ -SpatialCausalPSM <- function(dta, mtd,mdl,drop, visual) -{ - #Initialization - pltObjs <- list() - - #Method - if(mtd == "logit") - { - PSMfit <- glm(mdl, dta@data, family="binomial") - retData <- dta - retData$PSM_trtProb <- predict(PSMfit, dta@data, type="response") - } - if(mtd=="lm") - { - PSMfit <- lm(mdl, dta@data) +SpatialCausalPSM <- function(dta, mtd, mdl, drop, visual) { + + # Initialization + pltObjs <- list() + + # generate model based on method + if (mtd == "logit") { + # generalized linear model + PSMfit <- glm(mdl, dta@data, family="binomial", na.action=na.omit) + } else if (mtd == "lm") { + # linear model + PSMfit <- lm(mdl, dta@data) + } + + # copy data retData <- dta - retData$PSM_trtProb <- predict(PSMfit, dta@data, type="response") - } - - if(visual == "TRUE") - { - #Show user distributions. - pltObjs[[1]] <- GroupCompHist(retData, "PSM_trtProb","Initial PSM Balance",simple_out=FALSE) - print(summary(PSMfit)) - } - - - #Second, if a drop parameter - if set to "support", remove observations - #that don't overlap in the PSM distribution. 
- if(drop == "support") - { - - #Drop - treated <- retData@data[retData@data$TrtBin == 1,] - untreated <- retData@data[retData@data$TrtBin == 0,] - min_cut <- max(min(treated$PSM_trtProb), min(untreated$PSM_trtProb)) - max_cut <- min(max(treated$PSM_trtProb), max(untreated$PSM_trtProb)) - - retData <- retData[retData@data$PSM_trtProb >= min_cut,] - retData <- retData[retData@data$PSM_trtProb <= max_cut,] - - } - if(visual == "TRUE") - { - #Post drop histograms - pltObjs[[2]] <- GroupCompHist(retData, "PSM_trtProb","Post-Extrapolation Drops (if enabled)",simple_out=FALSE) - - #Output graphics - grid.arrange(pltObjs[[1]], pltObjs[[2]],ncol=2,main="PSM Matching Stage 1 (Dropping Observations Requiring Extrapolation)") - - } - retEle <- 0 - retEle$data <- retData - retEle$model <- PSMfit - return (retEle) + + # predict values based on model + retData@data[,"PSM_trtProb"] <- predict(PSMfit, dta@data, type="response") + + + if (visual == "TRUE") { + # Show user distributions. + pltObjs[[1]] <- GroupCompHist(retData, "PSM_trtProb", "Initial PSM Balance", simple_out=FALSE) + print(summary(PSMfit)) + } + + # Second, if a drop parameter - if set to "support", remove observations + # that don't overlap in the PSM distribution. + if (drop == "support") { + + # Drop + treated <- retData@data[retData@data[,"TrtBin"] == 1,] + untreated <- retData@data[retData@data[,"TrtBin"] == 0,] + min_cut <- max(min(treated[,"PSM_trtProb"], na.rm = TRUE), min(untreated[,"PSM_trtProb"], na.rm = TRUE)) + max_cut <- min(max(treated[,"PSM_trtProb"], na.rm = TRUE), max(untreated[,"PSM_trtProb"], na.rm = TRUE)) + + retData <- retData[!is.na(retData@data[,"PSM_trtProb"]),] + retData <- retData[retData@data[,"PSM_trtProb"] >= min_cut,] + retData <- retData[retData@data[,"PSM_trtProb"] <= max_cut,] + + } + + if (visual == "TRUE") { + # Post drop histograms + pltObjs[[2]] <- GroupCompHist(retData, "PSM_trtProb", "Post-Extrapolation Drops (if enabled)", simple_out=FALSE) + + # Output graphics + grid.arrange(pltObjs[[1]], pltObjs[[2]], ncol=2, main="PSM Matching Stage 1 (Dropping Observations Requiring Extrapolation)") + + } + + # return original and predicted data along with model + retEle <- c() + retEle$data <- retData + retEle$model <- PSMfit + + return(retEle) } diff --git a/R/Stage2PSM.R b/R/Stage2PSM.R index a0054b4..8936173 100644 --- a/R/Stage2PSM.R +++ b/R/Stage2PSM.R @@ -3,82 +3,111 @@ #These functions are to make common modeling strategies easier to specify for users #That do not write their own models. 
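+#A minimal call sketch (column and data names here are hypothetical examples, not package API):
+#  res <- Stage2PSM("outcome ~ TrtBin + PSM_trtProb", dta = matched, type = "cmreg",
+#                   table_out = TRUE, opts = c("PSM_match_ID"))
+#  res$cmreg then holds the cluster-adjusted coefficient tests from coeftest().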
-Stage2PSM <- function(model, dta, type, table_out = NULL, opts = NULL) -{ +Stage2PSM <- function (model, dta, type, table_out = NULL, opts = NULL, force_posdef = TRUE) { - ret_var <- list() + ret_var <- list() - if(type == "lm") - { - m_fit <- lm(model,dta) - print("==========================") - print("UNSTANDARDIZED MODEL RESULTS") - print("==========================") - #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") - print(summary(m_fit)) - ret_var$unstandardized <- lm(model,dta) - texreg::plotreg(m_fit,omit.coef="(match)|(Intercept)",custom.model.names="Unstandardized Model",custom.note=model) - - if(!is.null(table_out)) - { - dta_tmp <- dta - - if(class(dta) == "data.frame") - { - d_index <- sapply(dta_tmp, is.numeric) - dta_tmp[d_index] <- lapply(dta_tmp[d_index],scale) - } else { - d_index <- sapply(dta_tmp@data, is.numeric) - dta_tmp@data[d_index] <- lapply(dta_tmp@data[d_index],scale) - } - dta_fit_std <- lm(model,dta_tmp) - ret_var$standardized <- lm(model,dta_tmp) - print("==========================") - print("STANDARDIZED MODEL RESULTS") - print("==========================") - print(summary(dta_fit_std)) - texreg::plotreg(dta_fit_std,omit.coef="(match)|(Intercept)",custom.model.names="Standardized Model", custom.note=model) + if (type == "lm") { + m_fit <- lm(model, dta) + print("==========================") + print("UNSTANDARDIZED MODEL RESULTS") + print("==========================") + #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") + print(summary(m_fit)) + ret_var$unstandardized <- m_fit + texreg::plotreg(m_fit, omit.coef="(match)|(Intercept)", custom.model.names="Unstandardized Model", custom.note=model) + + if (!is.null(table_out)) { + dta_tmp <- dta - } + if (class(dta) == "data.frame") { + d_index <- sapply(dta_tmp, is.numeric) + dta_tmp[d_index] <- lapply(dta_tmp[d_index],scale) + } else { + d_index <- sapply(dta_tmp@data, is.numeric) + dta_tmp@data[d_index] <- lapply(dta_tmp@data[d_index],scale) + } + + dta_fit_std <- lm(model,dta_tmp) + ret_var$standardized <- dta_fit_std + print("==========================") + print("STANDARDIZED MODEL RESULTS") + print("==========================") + print(summary(dta_fit_std)) + texreg::plotreg(dta_fit_std,omit.coef="(match)|(Intercept)",custom.model.names="Standardized Model", custom.note=model) + + } - } + } - if(type == "cmreg") - { - m_fit <- lm(model,dta) - ret_var$unstandardized <- m_fit - #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") - #print.htmlTable(mTab) - print(summary(m_fit)) - texreg::plotreg(m_fit,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Unstandardized Model",custom.note=model) + if (type == "cmreg") { + + # make sure cluster options provided are valid + print(opts == NULL) + print(length(opts) == 0) + + if (is.null(opts) || length(opts) == 0) { + print("Must have at least 1 clustering option") + return("Invalid opts given.") + + } else if (length(opts) > 2) { + print("Cannot have more than 2 clustering options") + return("Invalid opts given.") + } + + + print("c1a") + m_fit <- lm(model, dta) + + print("c1b") + ret_var$unstandardized <- m_fit + + print("c1c") + #mTab <- stargazer(m_fit,type="html",title="Unstandardized Model Results") + #print.htmlTable(mTab) + print(summary(m_fit)) + texreg::plotreg(m_fit,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Unstandardized Model",custom.note=model) + + print("c2") + + if (!is.null(table_out)) { + + dta_tmp <- dta + + if( class(dta) == "data.frame") { + 
d_index <- sapply(dta_tmp, is.numeric) + dta_tmp[d_index] <- lapply(dta_tmp[d_index],scale) + } else { + d_index <- sapply(dta_tmp@data, is.numeric) + dta_tmp@data[d_index] <- lapply(dta_tmp@data[d_index],scale) + } + dta_fit_std <- lm(model,dta_tmp) + ret_var$standardized <- dta_fit_std + print(summary(dta_fit_std)) + texreg::plotreg(dta_fit_std,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Standardized Model", custom.note=model) + + } + + print("c3") - if(!is.null(table_out)) - { - dta_tmp <- dta - - if(class(dta) == "data.frame") - { - d_index <- sapply(dta_tmp, is.numeric) - dta_tmp[d_index] <- lapply(dta_tmp[d_index],scale) - } else { - d_index <- sapply(dta_tmp@data, is.numeric) - dta_tmp@data[d_index] <- lapply(dta_tmp@data[d_index],scale) - } - dta_fit_std <- lm(model,dta_tmp) - ret_var$standardized <- dta_fit_std - print(summary(dta_fit_std)) - texreg::plotreg(dta_fit_std,omit.coef="(match)|(Intercept)|(factor)",custom.model.names="Standardized Model", custom.note=model) - + print(opts) + + # exec = paste("cluster.vcov(m_fit,cbind(dta$",opts[1],",dta$",opts[2],"))",sep="") + # m_fit[["var"]] <- eval(parse(text=exec)) + + if (length(opts) == 1) { + m_fit$var <- cluster.vcov(m_fit,dta[opts[1]], force_posdef=force_posdef) + + } else if (length(opts) == 2) { + m_fit$var <- cluster.vcov(m_fit,cbind(dta[opts[1]], dta[opts[2]]), force_posdef=force_posdef) + + } + + CMREG <- coeftest(m_fit,m_fit$var) + print("cmReg:") + print(CMREG) + ret_var$cmreg <- CMREG + } - - print(opts) - exec = paste("cluster.vcov(m_fit,cbind(dta$",opts[1],",dta$",opts[2],"))",sep="") - m_fit$var <- eval(parse(text=exec)) - CMREG <- coeftest(m_fit,m_fit$var) - print("cmReg:") - print(CMREG) - ret_var$cmreg <- CMREG - - } - return(ret_var) -} \ No newline at end of file + return(ret_var) +} diff --git a/R/fastNN_binary_func.R b/R/fastNN_binary_func.R index a9e1419..b4917d2 100644 --- a/R/fastNN_binary_func.R +++ b/R/fastNN_binary_func.R @@ -1,3 +1,160 @@ +# #FastNN +# #Algorithm to find a hopefully near-optimal match of pairs +# #In a treatment and control group +# #Works by first ordering by the propensity score matching value, +# #and then working through this list in order from highest to lowest. +# #Matches are removed each step. + +# fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { + +# print("nn1.0") + +# #Fast nearest neighbors search - will not arrive at optimum, +# #but this may not be an issue for many analysis. +# #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + + +# sorted_dta <- dta@data[order(dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + +# #Conduct the matching +# treated <- as.data.table(sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")]) +# untreated <- as.data.table(sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")]) + + + +# it_cnt = min(nrow(treated), nrow(untreated)) +# # dta@data[["match"]] <- -999 +# # dta@data[["PSM_distance"]] <- -999 +# dta@data[["PSM_match_ID"]] <- -999 + +# print("nn2") + +# # Calculate a distance decay function +# # to perturb pairs based on their distances. +# for (j in 1:it_cnt) { + +# print("nn2.1") + +# k <- get.knnx(treated[, 'PSM_trtProb'], untreated[, 'PSM_trtProb'], 1) + +# # print("nn2.2") + +# # # Perturb the values based on the distance decay function, if selected. 
+# # if (!is.null(dist_PSM)) { +# # for (mC in 1:length(k[[1]])) { + +# # print("nn2.2.0") + +# # # Calculate the Euclidean Distance between pairs +# # Control_ID = toString(untreated[mC, get(ids), with=FALSE]) + +# # mT = k[["nn.index"]][mC] + +# # Treatment_ID = toString(treated[mT, get(ids), with=FALSE]) + +# # #Find the control x,y location +# # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + + +# # #Find the treatment x,y location +# # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) + +# # y_dist = abs(cCoord[1] - cCoord[2]) +# # x_dist = abs(tCoord[1] - tCoord[2]) +# # euc_dist = sqrt(y_dist^2 + x_dist^2) + +# # print("nn2.2.1") + +# # PSM_score = k[["nn.dist"]][mC] +# # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + +# # print("nn2.2.2") + + +# # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + +# # } + +# # } + +# print("nn2.3") + +# # Add the matched treatment and control values to the recording data frame +# # best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. +# best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) + +# # This will give us the matched index in the "treated" dataset. +# best_m_treated = k[["nn.index"]][best_m_control] + + +# # Control and Treatment PSM ID +# # cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="") +# # Control_ID = toString(eval(parse(text=cid_txt))) + +# # tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="") +# # Treatment_ID = toString(eval(parse(text=tid_txt))) + +# print(class(untreated)) +# print(colnames(untreated)) + +# # Control PSM ID and Treatment PSM ID +# Control_ID = toString(untreated[best_m_control, (ids), with=FALSE]) +# Treatment_ID = toString(treated[best_m_control, (ids), with=FALSE]) + + +# # Create a unique pair ID for each group (will simply append a "1" if only 1 group) +# if (is.null(curgrp)) { +# pair_id <- paste('pair',j, sep="") +# } else { +# pair_id <- paste(curgrp,j, sep="") + +# } + + +# print("nn2.4x") + +# #Add the Treatment ID to the Control Row and Add the Control ID to the Treatment Row +# # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID +# # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID + + +# # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control] +# dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id + + + +# # Drop the paired match out of the iteration matrix +# # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,] +# # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,] + +# # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1 + +# qt = quote(ids != Treatment_ID) +# qu = quote(ids != Treatment_ID) + +# treated <- treated[eval(qt)] +# untreated <- untreated[eval(qu)] + + + +# } + +# return(dta) + +# } + + + + + + + + + + + + + #FastNN #Algorithm to find a hopefully near-optimal match of pairs #In a treatment and control group @@ -5,120 +162,197 @@ #and then working through this list in order from highest to lowest. #Matches are removed each step. -fastNN_binary_func <- function(dta,trtMntVar,ids,curgrp,dist_PSM) -{ - #Fast nearest neighbors search - will not arrive at optimum, - #but this may not be an issue for many analysis. 
- #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. - sorted_dta <- dta@data[order(dta@data$PSM_trtProb),] - #Conduct the matching - - str_trted <- paste("treated <- sorted_dta[sorted_dta$",trtMntVar, "== 1,]",sep="") - str_untrted <- paste("untreated <- sorted_dta[sorted_dta$",trtMntVar,"==0,]",sep="") - eval(parse(text=str_trted)) - eval(parse(text=str_untrted)) - - it_cnt = min(length(treated[[1]]), length(untreated[[1]])) - dta@data$match <- -999 - dta@data$PSM_distance <- -999 - dta@data$PSM_match_ID <- -999 - - #Calculate a distance decay function - #to perturb pairs based on their distances. - for (j in 1:it_cnt) - { - str_trted <- paste("treated <- sorted_dta[sorted_dta$",trtMntVar, "== 1,]",sep="") - str_untrted <- paste("untreated <- sorted_dta[sorted_dta$",trtMntVar,"==0,]",sep="") - eval(parse(text=str_trted)) - eval(parse(text=str_untrted)) - - - - #Run the KNN for all neighbors. - k <- get.knnx(treated$PSM_trtProb, untreated$PSM_trtProb, 1) - - #Perturb the values based on the distance decay function, if selected. - if(!is.null(dist_PSM)) - { - for(mC in 1:length(k[[1]])) - { - #Calculate the Euclidean Distance between pairs - cid_txt = paste("untreated$",ids,"[",mC,"]",sep="") - Control_ID = toString(eval(parse(text=cid_txt))) - - mT = k$nn.index[mC] +fastNN_binary_func <- function(dta, trtMntVar, ids, curgrp, dist_PSM) { + + + print('nn') + + # make sure there are at least x rows in both treated and control + row_min <- 30 + trt_rows <- dta@data$TrtBin == 1 + if ( sum(trt_rows) < row_min | sum(! trt_rows) < row_min ) { + return('drop') + } + + + + match_data <- dta@data[,c('PSM_trtProb', ids, 'TrtBin')] + + rownames(match_data) <- match_data[[ids]] + + zzz <<- match_data + + m <- matchit(TrtBin ~ PSM_trtProb, data=match_data, method="nearest", ratio=1) + + dta@data[["PSM_match_ID"]] <- -999 + + for ( i in rownames(m$match.matrix) ) { + trt_id <- i + cnt_id <- m$match.matrix[i,] - tid_txt = paste("treated$",ids,"[",mT,"]",sep="") - Treatment_ID = toString(eval(parse(text=tid_txt))) + if ( !is.na(cnt_id) ) { + pair_id <- paste(trt_id,cnt_id, sep='__') + dta@data$PSM_match_ID[which(dta@data[[ids]] == cnt_id | dta@data[[ids]] == trt_id)] <- pair_id + } + } + + + # =========================================================================== + + + # print("nn1.0") + + + # #Fast nearest neighbors search - will not arrive at optimum, + # #but this may not be an issue for many analysis. + # #Effectively loops through all observations in the treatment group, ordered by PSM score - higher scores go first. + + # sorted_dta <- dta@data[order(-dta@data[["PSM_trtProb"]]), c(ids, trtMntVar, "PSM_trtProb")] + + + # #Conduct the matching + # treated <- sorted_dta[sorted_dta[[trtMntVar]] == 1, c(ids, "PSM_trtProb")] + # untreated <- sorted_dta[sorted_dta[[trtMntVar]] == 0, c(ids, "PSM_trtProb")] + + # it_cnt = min(length(treated[[1]]), length(untreated[[1]])) + + # if (it_cnt < 30) { + # return('drop') + # } + + # # dta@data[["match"]] <- -999 + # # dta@data[["PSM_distance"]] <- -999 + # dta@data[["PSM_match_ID"]] <- -999 + + # # print("nn2") + + # #Calculate a distance decay function + # #to perturb pairs based on their distances. 
+ # for (j in 1:it_cnt) { + # # time_list <- c() + + # print(paste("nn cnt:",j)) + # # timer <- proc.time() + + # # treated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 1 & sorted_dta[['nn_matched']] == 0),] + # # untreated <- sorted_dta[which(sorted_dta[[trtMntVar]] == 0 & sorted_dta[['nn_matched']] == 0),] - #Find the control x,y location - cCoord_e = paste("coordinates(dta[which(dta@data$",ids," == Control_ID),])", sep="") - cCoord = eval(parse(text=cCoord_e)) + # # print(nrow(sorted_dta[which(sorted_dta[['nn_matched']] == 0),])) + # # time_list[1] <- round((proc.time() - timer)[3],5) + # # print("nn2.1") + # # timer <- proc.time() + + # #Run the KNN for all neighbors. + # # print(length(treated[[1]])) + # # summary(treated[["PSM_trtProb"]]) + # # print(length(untreated[[1]])) + # # summary(untreated[["PSM_trtProb"]]) + + # k <- get.knnx(treated[["PSM_trtProb"]], untreated[["PSM_trtProb"]], 1) - #Find the treatment x,y location - tCoord_e = paste("coordinates(dta[which(dta@data$",ids," == Treatment_ID),])", sep="") - tCoord = eval(parse(text=tCoord_e)) + # # time_list[2] <- round((proc.time() - timer)[3],5) + # # print("nn2.2") + # # timer <- proc.time() + + # # #Perturb the values based on the distance decay function, if selected. + # # if (!is.null(dist_PSM)) { + # # for (mC in 1:length(k[[1]])) { + + # # print("nn2.2.0") + + # # #Calculate the Euclidean Distance between pairs + # # Control_ID = toString(untreated[[ids]][[mC]]) + + # # mT = k[["nn.index"]][mC] + + # # Treatment_ID = toString(treated[[ids]][[mT]]) + + # # #Find the control x,y location + # # cCoord = coordinates(dta[which(dta@data[[ids]] == Control_ID),]) + + + # # #Find the treatment x,y location + # # tCoord = coordinates(dta[which(dta@data[[ids]] == Treatment_ID),]) - y_dist = abs(cCoord[1] - cCoord[2]) - x_dist = abs(tCoord[1] - tCoord[2]) - euc_dist = sqrt(y_dist^2 + x_dist^2) + # # y_dist = abs(cCoord[1] - cCoord[2]) + # # x_dist = abs(tCoord[1] - tCoord[2]) + # # euc_dist = sqrt(y_dist^2 + x_dist^2) + + # # print("nn2.2.1") + + # # PSM_score = k[["nn.dist"]][mC] + # # geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + + # # print("nn2.2.2") + + + # # k[["nn.dist"]][mC] <- ((1-geog_Weight) * PSM_score) + + # # } + + # # } + + # # time_list[3] <- round((proc.time() - timer)[3],5) + # print("nn2.3") + # # timer <- proc.time() + + # #Add the matched treatment and control values to the recording data frame + # #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index. + # best_m_control = which(k[["nn.dist"]] %in% sort(k[["nn.dist"]])[1]) - PSM_score = k$nn.dist[mC] - geog_Weight = pairDistWeight(dist=euc_dist,threshold=dist_PSM,type="Spherical") + # #This will give us the matched index in the "treated" dataset. 
+    #     best_m_treated = k[["nn.index"]][best_m_control]
+
+    #     # #Control PSM ID
+    #     cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="")
+    #     Control_ID = toString(eval(parse(text=cid_txt)))
+
+    #     # #Treatment PSM ID
+    #     tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="")
+    #     Treatment_ID = toString(eval(parse(text=tid_txt)))
+
+    #     #Create a unique pair ID for each group (will simply append a "1" if there is only 1 group)
+    #     # pair_id = paste(curgrp,j, sep="")
+    #     pair_id <- paste(Treatment_ID,Control_ID, sep='__')
+    #     print(pair_id)
+
+    #     # time_list[4] <- round((proc.time() - timer)[3],5)
+    #     print("nn2.4x")
+    #     # timer <- proc.time()
+
+    #     #Add the Treatment ID to the Control Row and the Control ID to the Treatment Row
+    #     # dta@data$match[which(dta@data[[ids]] == Control_ID)] <- Treatment_ID
+    #     # dta@data$match[which(dta@data[[ids]] == Treatment_ID)] <- Control_ID
+
+    #     # dta@data$PSM_distance[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- k[["nn.dist"]][,1][best_m_control]
+    #     dta@data$PSM_match_ID[which(dta@data[[ids]] == Control_ID | dta@data[[ids]] == Treatment_ID)] <- pair_id
 
-                k$nn.dist[mC] <- ((1-geog_Weight) * PSM_score)
-
-            }
-
-        }
+    #     # time_list[5] <- round((proc.time() - timer)[3],5)
+    #     # timer <- proc.time()
 
-        #Add the matched treatment and control values to the recording data frame
-        #best_m_control is the row in the "distance" matrix with the lowest value. This is the same row as in the index.
-        best_m_control = which(k$nn.dist %in% sort(k$nn.dist)[1])
-
-        #This will give us the matched index in the "treated" dataset.
+    #     #Drop the paired match out of the iteration matrix
+    #     # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Treatment_ID ,]
+    #     # sorted_dta <- sorted_dta[sorted_dta[[ids]] != Control_ID ,]
-        best_m_treated = k$nn.index[best_m_control]
-
-        #Control PSM ID
-        cid_txt = paste("untreated$",ids,"[",best_m_control,"]",sep="")
-        Control_ID = toString(eval(parse(text=cid_txt)))
+    #     # sorted_dta[which(sorted_dta[[ids]] == Control_ID | sorted_dta[[ids]] == Treatment_ID),][['nn_matched']] <- 1
+
+    #     treated <- treated[which(treated[[ids]] != Treatment_ID),]
+    #     untreated <- untreated[which(untreated[[ids]] != Control_ID),]
-        #Treatment PSM ID
-        tid_txt = paste("treated$",ids,"[",best_m_treated,"]",sep="")
-        Treatment_ID = toString(eval(parse(text=tid_txt)))
-
-        #Create a unique pair ID for each group (will simply append a "1" if only 1 group)
-        pair_id = paste(curgrp,j,sep="")
-
-        #Add the Treatment ID to the Control Row
-        tid_a_1 = paste("dta@data$match[which(dta@data$",ids," == Control_ID)] = Treatment_ID", sep="")
-        tid_a_2 = paste("dta@data$PSM_distance[which(dta@data$",ids," == Control_ID)] = k$nn.dist[,1][best_m_control]",sep="")
-        tid_a_3 = paste("dta@data$PSM_match_ID[which(dta@data$",ids," == Control_ID)] = pair_id", sep="")
-        eval(parse(text=tid_a_1))
-        eval(parse(text=tid_a_2))
-        eval(parse(text=tid_a_3))
-
-
-
-        #Add the Control ID to the Treatment Row
-        cid_a_1 = paste("dta@data$match[which(dta@data$",ids," == Treatment_ID)] = Control_ID", sep="")
-        cid_a_2 = paste("dta@data$PSM_distance[which(dta@data$",ids," == Treatment_ID)] = k$nn.dist[,1][best_m_control]", sep="")
-        cid_a_3 = paste("dta@data$PSM_match_ID[which(dta@data$",ids," == Treatment_ID)] = pair_id", sep="")
-        eval(parse(text=cid_a_1))
-        eval(parse(text=cid_a_2))
-        eval(parse(text=cid_a_3))
-
-
-        #Drop the paired match out of the iteration matrix
-        did_a_1 = paste("sorted_dta <- sorted_dta[sorted_dta$",ids,"!= Treatment_ID ,]",sep="")
-        did_a_2 = paste("sorted_dta <- sorted_dta[sorted_dta$",ids,"!= Control_ID ,]",sep="")
-        eval(parse(text=did_a_1))
-        eval(parse(text=did_a_2))
-
-
-    }
-    return(dta)
-}
\ No newline at end of file
+    #     # time_list[6] <- round((proc.time() - timer)[3],5)
+    #     # print(paste(time_list))
+
+    # }
+
+    return(dta)
+
+}
+
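+
+# Hypothetical usage sketch (not exercised anywhere in the package): assumes
+# `spdf` is a SpatialPointsDataFrame whose @data contains PSM_trtProb
+# propensity scores, a 0/1 TrtBin column, and a unique "id" field.
+#
+#   matched <- fastNN_binary_func(spdf, trtMntVar="TrtBin", ids="id",
+#                                 curgrp=1, dist_PSM=NULL)
+#   if (!identical(matched, 'drop')) {
+#       table(matched@data$PSM_match_ID != -999)   # rows that found a pair
+#   }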
diff --git a/R/loadLibs.R b/R/loadLibs.R
index 8cf97ca..6facdfc 100644
--- a/R/loadLibs.R
+++ b/R/loadLibs.R
@@ -1,6 +1,5 @@
 #Library loading script in case dependencies are not loading correctly.
-loadLibs <- function (x=1)
-  {
+loadLibs <- function (x=1) {
   require(sp)
   #require(GISTools) # sudo apt-get install libgeos-dev
   require(maptools)
@@ -20,4 +19,8 @@ loadLibs <- function (x=1)
   library(stargazer)
   library(lmtest)
   library(multiwayvcov)
-  }
\ No newline at end of file
+
+  # library(data.table)
+  library(MatchIt)
+  # library(optmatch)
+}
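+
+# Hypothetical usage note: source this file and call loadLibs() once per
+# session if the packages listed in DESCRIPTION fail to attach automatically.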
diff --git a/R/timeRangeAvg.R b/R/timeRangeAvg.R
index 1af3cfd..c378173 100644
--- a/R/timeRangeAvg.R
+++ b/R/timeRangeAvg.R
@@ -1,9 +1,13 @@
-timeRangeAvg <- function(dta,prefix,startyr,endyr)
-{
-  searchS = paste("^",prefix,startyr,sep="")
-  searchE = paste("^",prefix,endyr,sep="")
-  strt_id <- grep(searchS,colnames(dta))
-  end_id <- grep(searchE,colnames(dta))
-  rmean <- rowMeans(dta[strt_id:end_id])
-  return(rmean)
-}
\ No newline at end of file
+timeRangeAvg <- function(dta, prefix, startyr, endyr) {
+
+    searchS = paste("^", prefix, startyr, sep="")
+    searchE = paste("^", prefix, endyr, sep="")
+
+    # take the first column matching each boundary year, in case grep hits more than one
+    start_id <- grep(searchS, colnames(dta))[1]
+    end_id <- grep(searchE, colnames(dta))[1]
+
+    rmean <- rowMeans(dta[start_id:end_id])
+
+    return(rmean)
+
+}
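+
+# Hypothetical example, assuming year columns named like MeanL_1992 ... MeanL_1995:
+#   dta$avg_nl <- timeRangeAvg(dta@data, prefix="MeanL_", startyr=1992, endyr=1995)
+# Note the result relies on the matched columns being contiguous and in
+# ascending year order, since rows are averaged over the positional range
+# start_id:end_id.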
field="start_actual_isodate")) + +# } + +# }) + + +# } else if (type == "post") { + +# output <- lapply(dta, function (row) { + +# tmp_startyr <- as.integer(row['start_actual_isodate']) + +# if (is.na(tmp_startyr) || tmp_startyr >= endyr) { +# return(as.integer("NA")) + +# } else { +# return(timeRangeTrend_calc(row, prefix, tmp_startyr, endyr, IDfield, thresh=0.5, field="start_actual_isodate")) + +# } + +# }) + + +# } else if (type == "invalid") { +# output <- 1 + +# } else { +# output <- 2 + +# } + +# return(output) + +# } + + + +# timeRangeTrend_calc <- function (dta, prefix, startyr, endyr, IDfield, thresh=0.5) { + + +# # create new dataframe from all columns in dta dataframe that +# # are either the ID or a year which is indicated by the prefix +# grep_str = paste(IDfield, prefix, sep="|") +# tDF <- dta@data[grepl(grep_str, names(dta@data))] + +# # melt all years columns in new dataframe +# analysisDF <- melt(tDF, id=c(IDfield)) + +# # cleaned GREP - remove year digit placeholders +# # new_pre <- gsub("[0-9]", "", prefix, fixed=TRUE) + +# # get location of year in prefix +# yIndex <- regexpr("[0-9]", prefix, fixed=TRUE) + +# # generate new year field by removing prefix from variable (original column names) +# # analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) as.numeric(gsub(new_pre, "", x))) +# analysisDF["Year"] <- lapply(analysisDF["variable"], FUN=function(x) { +# as.numeric(substr(x, yIndex[1], yIndex[1]+3)) +# }) + +# # keep years in range specified +# analysisDF <- analysisDF[analysisDF["Year"] >= startyr ,] +# analysisDF <- analysisDF[analysisDF["Year"] <= endyr ,] + +# # create empty field +# dta@data["newfieldID"] <- 0 + +# # iterate over original dataframe +# for (i in 1:length(dta)) { +# # get id for row (in original data) +# ID <- as.character(dta@data[IDfield][i,]) + +# # get all data corresponding to id from analysis dataframe +# ID_dat <- analysisDF[analysisDF[IDfield] == ID,] + +# dat_length <-length(ID_dat) +# count_na <-sum(is.na(ID_dat[['value']])) +# count_non_na <- dat_length - count_na +# percent_na <- count_na / dat_length + +# # if number of NAs is over threshold or if less than 2 points of data are not NA, return NA +# if (percent_na > thresh || count_non_na < 2) { + +# dta@data["newfieldID"][i,] <- NA + +# } else { +# # fit trend model +# trend_mod <- lm(value ~ Year, data=ID_dat, na.action = na.omit) + +# # add trend coefficients to new field +# dta@data["newfieldID"][i,] <- summary(trend_mod)$coefficients[2] +# } + +# } + +# # return new field with trend coefficients +# return(dta[["newfieldID"]]) + +# } + + + +# timeRangeAvg <- function (dta, prefix, startyr, endyr, field=NA) { + +# check <- timeRangeType(colnames(dta), prefix, startyr, endyr, field) + +# type = check[1] +# startyr = check[2] +# endyr = check[3] + + +# if (type == "range") { + +# output <- timeRangeAvg_calc(dta, prefix, startyr, endyr) + +# } else if (type == "pre") { + +# output <- apply(dta, 1, function (row) { + +# tmp_endyr <- as.integer(row['start_actual_isodate']) + +# if (is.na(tmp_endyr) || start_yr >= tmp_endyr) { +# return(as.integer("NA")) + +# } else { +# return(timeRangeAvg_calc(row, prefix, startyr, tmp_endyr)) + +# } + +# }) + + +# } else if (type == "post") { + +# output <- lapply(dta, function (row) { + +# tmp_startyr <- as.integer(row['start_actual_isodate']) + +# if (is.na(tmp_startyr) || tmp_startyr >= endyr) { +# return(as.integer("NA")) + +# } else { +# return(timeRangeAvg_calc(row, prefix, tmp_startyr, endyr)) + +# } + +# }) + +# } else if (type 
== "invalid") { +# output <- 1 + +# } else { +# output <- 2 + +# } + +# return(output) + +# } + + + +# timeRangeAvg_calc <- function (dta, prefix, startyr, endyr) { + +# range <- c(startyr:endyr) +# search <- paste("^",prefix,"(",paste(range, collapse="|"),")", sep="") +# matches <- grepl(search, colnames(dta)) +# rmean <- rowMeans(dta[matches], na.rm=FALSE) + +# return(rmean) + +# } diff --git a/SCI_PanelFix.Rproj b/SCI_PanelFix.Rproj new file mode 100644 index 0000000..a4dce49 --- /dev/null +++ b/SCI_PanelFix.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 4 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source