Merged
17 changes: 6 additions & 11 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,17 @@ Version: 0.1
Authors@R:
person("E. F. Haghish",
role = c("aut", "cre", "cph"),
email = "[email protected]")
email = "[email protected]"
)
Depends:
R (>= 3.5.0)
Description: This R package introduces Holistic Multimodel Domain Analysis (HMDA),
an new paradigm for exploring domains related to an outcome. HMDA trains a
grid of supervised learning models, and instead of selecting the best-performing
model, it takes the inconsistencies between the good models into account to
explore domains of items and indicators related to an outcome. For each of the
models, Weighted Mean SHAP (WMSHAP) is computed, taking the performance of the
model into account.
Description: Holistic Multimodel Domain Analysis (HMDA) is a robust and transparent framework designed for exploratory machine learning research, aiming to enhance the process of feature assessment and selection. HMDA addresses key limitations of traditional machine learning methods by evaluating the consistency across multiple high-performing models within a fine-tuned modeling grid, thereby improving the interpretability and reliability of feature importance assessments. Specifically, it computes Weighted Mean SHapley Additive exPlanations (WMSHAP), which aggregate feature contributions from multiple models based on weighted performance metrics. HMDA also provides confidence intervals to demonstrate the stability of these feature importance estimates. This framework is particularly beneficial for analyzing complex, multidimensional datasets common in health research, supporting reliable exploration of mental health outcomes such as suicidal ideation, suicide attempts, and other psychological conditions. Additionally, HMDA includes automated procedures for feature selection based on WMSHAP ratios and performs dimension reduction analyses to identify underlying structures among features. For more details see Haghish (2025) <doi:10.13140/RG.2.2.32473.63846>.
Imports:
curl (>= 4.3.0),
h2o (>= 3.34.0.0),
shapley (>= 0.4),
shapley (>= 0.5),
autoEnsemble (>= 0.3),
h2otools (>= 0.5),
h2otools (>= 0.4),
splitTools (>= 1.0.1),
psych (>= 2.4.6),
dplyr (>= 1.1.4),
Expand All @@ -29,5 +24,5 @@ License: MIT + file LICENSE
Encoding: UTF-8
RoxygenNote: 7.3.2
LazyData: true
URL: https://github.com/haghish/HMDA, https://www.sv.uio.no/psi/english/people/academic/haghish/
URL: http://dx.doi.org/10.13140/RG.2.2.32473.63846, https://github.com/haghish/HMDA, https://www.sv.uio.no/psi/english/people/academic/haghish/
BugReports: https://github.com/haghish/HMDA/issues
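The Weighted Mean SHAP (WMSHAP) idea described in the new DESCRIPTION text can be illustrated with a minimal sketch in base R. The numbers, model names, and the choice of performance metric below are hypothetical; the actual computation is done by the shapley package:

```r
# Hypothetical sketch of the WMSHAP idea: average each feature's SHAP
# contribution across models, weighting by each model's performance.
shap <- rbind(model1 = c(x1 = 0.40, x2 = 0.10),
              model2 = c(x1 = 0.30, x2 = 0.20))
perf <- c(model1 = 0.8, model2 = 0.6)  # assumed performance metric per model

w <- perf / sum(perf)          # normalize weights to sum to 1
wmshap <- colSums(shap * w)    # performance-weighted mean SHAP per feature
print(wmshap)
```

Features whose weighted contribution is consistently high across the well-performing models are the ones HMDA flags as belonging to the domain of interest.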
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Generated by roxygen2: do not edit by hand

export(capture)
export(check_efa)
export(dictionary)
export(hmda.adjust.params)
Expand Down Expand Up @@ -64,6 +63,7 @@ importFrom(h2o,h2o.shap_summary_plot)
importFrom(h2o,h2o.shutdown)
importFrom(h2o,h2o.stackedEnsemble)
importFrom(h2otools,automlModelParam)
importFrom(h2otools,capture)
importFrom(psych,fa.diagram)
importFrom(psych,fa.sort)
importFrom(psych,factor.scores)
Expand Down
15 changes: 8 additions & 7 deletions R/best_of_family.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,16 @@
#' the \code{model_id} from that row. The final result is a unique
#' set of model IDs that represent the best models across all metrics.
#'
#' @examples
#' \dontrun{
#' # Example: Given a data frame 'merged' with performance metrics,
#' # select the best models per metric.
#' best_ids <- best_of_family(merged)
#' print(best_ids)
#' }
# @examples
# \dontrun{
# # Example: Given a data frame 'merged' with performance metrics,
# # select the best models per metric.
# best_ids <- best_of_family(merged)
# print(best_ids)
# }
#'
#' @author E. F. Haghish

best_of_family <- function(df) {
# Exclude the "model_id" column and keep numeric columns only.
metric_cols <- setdiff(names(df), "model_id")
Expand Down
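The selection rule documented above (one winner per metric, then a unique set of IDs) can be sketched in base R. The leaderboard below is hypothetical, and this sketch assumes "higher is better" metrics; error metrics such as logloss would use `which.min()` instead:

```r
# Hypothetical merged leaderboard: one row per model, one column per metric
merged <- data.frame(
  model_id = c("gbm_1", "gbm_2", "drf_1"),
  auc      = c(0.81, 0.85, 0.83),
  aucpr    = c(0.64, 0.62, 0.66)
)

# For each metric column, pick the model_id with the best value,
# then keep the unique set of winners across all metrics.
metric_cols <- setdiff(names(merged), "model_id")
best_ids <- unique(vapply(
  metric_cols,
  function(m) merged$model_id[which.max(merged[[m]])],
  character(1)
))
print(best_ids)  # "gbm_2" "drf_1"
```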
106 changes: 0 additions & 106 deletions R/capture.R

This file was deleted.

11 changes: 5 additions & 6 deletions R/check_efa.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#' values required for a feature. Default is 5.
#' @param min_intercorrelation A numeric threshold for the minimum acceptable
#' intercorrelation among features. (Note: this parameter is not used explicitly in the current implementation.) Default is 0.3.
#' @param verbatim Logical; if \code{TRUE}, a confirmation message is printed when all
#' @param verbose Logical; if \code{TRUE}, a confirmation message is printed when all
#' features appear suitable. Default is \code{FALSE}.
#'
#' @return \code{TRUE} if all features are deemed suitable for EFA, and \code{FALSE}
Expand All @@ -38,7 +38,6 @@
#' @importFrom stats na.omit cor
#'
#' @examples
#' \dontrun{
#' # Example: assess feature suitability for EFA using the USJudgeRatings dataset.
#' # this dataset contains ratings on several aspects of U.S. federal judges' performance.
#' # Here, we check whether these rating variables are suitable for EFA.
Expand All @@ -48,19 +47,19 @@
#' df = USJudgeRatings,
#' features = features_to_check,
#' min_unique = 3,
#' verbatim = TRUE
#' verbose = TRUE
#' )
#'
#' # TRUE indicates the features are suitable.
#' print(result)
#' }
#'
#' @export

check_efa <- function(df,
features,
min_unique = 5,
min_intercorrelation = .3,
verbatim = FALSE) {
verbose = FALSE) {

# Vector to store messages for unsuitable features
unsuitable_messages <- c()
Expand Down Expand Up @@ -155,7 +154,7 @@ check_efa <- function(df,
}
return(FALSE)
} else {
if (verbatim) message("All features appear suitable for exploratory factor analysis with the minrank algorithm.")
if (verbose) cat("All features appear suitable for exploratory factor analysis with the minrank algorithm.\n")
return(TRUE)
}
}
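The kind of screening `check_efa()` performs can be sketched as follows. This is assumed logic, not the function body: it checks that each feature has enough distinct non-missing values and at least one absolute intercorrelation above the threshold (the docs note that `min_intercorrelation` is not used explicitly in the current implementation):

```r
data("USJudgeRatings")
features <- c("CONT", "INTG", "DMNR")
df <- USJudgeRatings

min_unique <- 5
min_intercorrelation <- 0.3

# Pairwise absolute correlations, ignoring each feature's self-correlation
R <- abs(cor(df[features], use = "pairwise.complete.obs"))
diag(R) <- 0

# Enough distinct non-missing values per feature?
ok_unique <- vapply(df[features],
                    function(x) length(unique(na.omit(x))) >= min_unique,
                    logical(1))
# At least one sufficiently strong intercorrelation per feature?
ok_cor <- apply(R, 1, max) >= min_intercorrelation

all(ok_unique & ok_cor)  # TRUE if every feature passes both checks
```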
2 changes: 0 additions & 2 deletions R/dictionary.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
#' @details The function iterates over each column in the input data frame \code{df} and retrieves the specified attribute using \code{attr()}. If the attribute is not found for a column, \code{NA} is returned as its description. The resulting data frame acts as a dictionary for the variables, which is particularly useful for documenting datasets during exploratory data analysis.
#'
#' @examples
#' \dontrun{
#' # Example: Generate a dictionary of variable labels using the USJudgeRatings dataset.
#' # This dataset contains ratings on various performance measures for U.S. federal judges.
#' data("USJudgeRatings")
Expand All @@ -29,7 +28,6 @@
#' # Generate the dictionary of labels
#' dict <- dictionary(USJudgeRatings, "label")
#' print(dict)
#' }
#'
#' @export
#' @author E. F. Haghish
Expand Down
4 changes: 1 addition & 3 deletions R/hmda.adjust.params.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,13 @@
#' loop.
#'
#' @examples
#' \dontrun{
#' # Example 1: Adjust a hyperparameter grid for 100 models.
#' params <- list(
#' alpha = c(0.1, 0.2, 0.3, 0.4),
#' beta = c(1, 2, 3, 4, 5),
#' gamma = c(10, 20, 30)
#' )
#' new_params <- HMDA:::adjust.params(params, n_models = 1000)
#' new_params <- hmda.adjust.params(params, n_models = 100)
#' print(new_params)
#'
#' # Example 2: The generated hyperparameters range between min and max of each
Expand All @@ -47,7 +46,6 @@
#' )
#' new_params <- hmda.adjust.params(params, n_models = 1000)
#' print(new_params)
#' }
#'
#' @export
#' @author E. F. Haghish
Expand Down
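The grid-trimming idea behind `hmda.adjust.params()` can be sketched in base R. The loop below is an assumed illustration of the principle, not the package's implementation: shrink the longest hyperparameter list, preserving its min and max, until the grid size (the product of the list lengths) fits the model budget:

```r
params <- list(alpha = c(0.1, 0.2, 0.3, 0.4),
               beta  = c(1, 2, 3, 4, 5),
               gamma = c(10, 20, 30))
n_models <- 30

grid_size <- function(p) prod(vapply(p, length, integer(1)))

while (grid_size(params) > n_models) {
  # Thin the longest list, keeping its endpoints and spacing the rest evenly
  longest <- which.max(vapply(params, length, integer(1)))
  v <- params[[longest]]
  params[[longest]] <- v[round(seq(1, length(v), length.out = length(v) - 1))]
}
grid_size(params)  # now at most n_models
```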
106 changes: 43 additions & 63 deletions R/hmda.autoEnsemble.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
#' leverages the \pkg{autoEnsemble} package to stack a set of trained models
#' (e.g., from HMDA grid) into a stronger meta-learner. For more
#' details on autoEnsemble, please see the GitHub repository at
#' \url{https://github.com/haghish/autoEnsemble} and the CRAN package page at
#' \url{https://CRAN.R-project.org/package=autoEnsemble}.
#' \url{https://github.com/haghish/autoEnsemble} and the CRAN package of
#' autoEnsemble R package.
#'
#' @importFrom utils setTxtProgressBar txtProgressBar
#' @importFrom h2o h2o.stackedEnsemble h2o.getModel h2o.auc h2o.aucpr h2o.mcc
Expand Down Expand Up @@ -78,67 +78,47 @@
#'
#' @examples
#' \dontrun{
#' # load the required libraries for building the base-learners and the ensemble models
#' library(h2o)
# library(h2otools)
#' library(autoEnsemble)
#'
#' # initiate the h2o server
#' h2o.init(ignore_config = TRUE, nthreads = 2, bind_to_localhost = FALSE, insecure = TRUE)
#'
#' # upload data to h2o cloud
#' prostate_path <- system.file("extdata", "prostate.csv", package = "h2o")
#' prostate <- h2o.importFile(path = prostate_path, header = TRUE)
#'
#' ### H2O provides 2 types of grid search for tuning the models, which are
#' ### AutoML and Grid. Below, I tune 2 set of model grids and use them both
#' ### for building the ensemble, just to set an example ...
#'
#' #######################################################
#' ### PREPARE AutoML Grid (takes a couple of minutes)
#' #######################################################
#' # run AutoML to tune various models (GLM, GBM, XGBoost, DRF, DeepLearning) for 120 seconds
#' y <- "CAPSULE"
#' prostate[,y] <- as.factor(prostate[,y]) #convert to factor for classification
#' aml <- h2o.automl(y = y, training_frame = prostate, max_runtime_secs = 120,
#' include_algos=c("DRF","GLM", "XGBoost", "GBM", "DeepLearning"),
#'
#' # this setting ensures the models are comparable for building a meta learner
#' seed = 2023, nfolds = 10,
#' keep_cross_validation_predictions = TRUE)
#'
#' #######################################################
#' ### PREPARE H2O Grid (takes a couple of minutes)
#' #######################################################
#' # make sure equal number of "nfolds" is specified for different grids
#' grid <- h2o.grid(algorithm = "gbm", y = y, training_frame = prostate,
#' hyper_params = list(ntrees = seq(1,50,1)),
#' grid_id = "ensemble_grid",
#'
#' # this setting ensures the models are comparable for building a meta learner
#' seed = 2023, fold_assignment = "Modulo", nfolds = 10,
#' keep_cross_validation_predictions = TRUE)
#'
#' #######################################################
#' ### PREPARE ENSEMBLE MODEL
#' #######################################################
#'
#' ### get the models' IDs from the AutoML and grid searches.
#' ### this is all that is needed before building the ensemble,
#' ### i.e., to specify the model IDs that should be evaluated.
#'
#' ids <- c(h2o.get_ids(aml), h2o.get_ids(grid))
#' top <- ensemble(models = ids, training_frame = prostate, strategy = "top")
#' search <- ensemble(models = ids, training_frame = prostate, strategy = "search")
#'
#' #######################################################
#' ### EVALUATE THE MODELS
#' #######################################################
#' h2o.auc(aml@leader) # best model identified by h2o.automl
#' h2o.auc(h2o.getModel(grid@model_ids[[1]])) # best model identified by grid search
#' h2o.auc(top$model). # ensemble model with 'top' search strategy
#' h2o.auc(search$model). # ensemble model with 'search' search strategy
#'
#' library(HMDA)
#' library(h2o)
#' hmda.init()
#'
#' # Import a sample binary outcome dataset into H2O
#' train <- h2o.importFile(
#' "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
#' test <- h2o.importFile(
#' "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
#'
#' # Identify predictors and response
#' y <- "response"
#' x <- setdiff(names(train), y)
#'
#' # For binary classification, response should be a factor
#' train[, y] <- as.factor(train[, y])
#' test[, y] <- as.factor(test[, y])
#'
#' params <- list(learn_rate = c(0.01, 0.1),
#' max_depth = c(3, 5, 9),
#' sample_rate = c(0.8, 1.0)
#' )
#'
#' # Train and validate a cartesian grid of GBMs
#' hmda_grid1 <- hmda.grid(algorithm = "gbm", x = x, y = y,
#' grid_id = "hmda_grid1",
#' training_frame = train,
#' nfolds = 10,
#' ntrees = 100,
#' seed = 1,
#'                         hyper_params = params)
#'
#' # Assess the performances of the models
#' grid_performance <- hmda.grid.analysis(hmda_grid1)
#'
#' # Return the best 2 models according to each metric
#' hmda.best.models(grid_performance, n_models = 2)
#'
#' # build an autoEnsemble model & test it with the testing dataset
#' meta <- hmda.autoEnsemble(models = hmda_grid1, training_frame = train)
#' print(h2o.performance(model = meta$model, newdata = test))
#' }
#' @export
#' @author E. F. Haghish
Expand Down