h2oai · manh4wk · Mar 4, 2025 · Mar 4, 2025 · Mar 4, 2025 · Mar 6, 2025
diff --git a/h2o-algos/src/main/java/hex/schemas/GLMModelV3.java b/h2o-algos/src/main/java/hex/schemas/GLMModelV3.java
@@ -35,6 +35,9 @@ public static final class GLMModelOutputV3 extends ModelOutputSchemaV3<GLMOutput
     @API(help = "Variable Importances", direction = API.Direction.OUTPUT, level = API.Level.secondary)
     TwoDimTableV3 variable_importances;
 
+    @API(help = "Variance-Covariance Matrix")
+    TwoDimTableV3 vcov_table;
+
     @API(help="Lambda minimizing the objective value, only applicable with lambda search or when arrays of alpha and " +
             "lambdas are provided")
     double lambda_best;
@@ -286,6 +289,36 @@ public GLMModelOutputV3 fillFromImpl(GLMModel.GLMOutput impl) {
         }
       }
       coefficients_table.fillFromImpl(tdt);
+
+      if(impl.hasPValues()) { //vcov is only populated if p values are calculated
+        double[][] vcov;
+        vcov = impl.vcov();
+        vcov_table = new TwoDimTableV3();
+        colTypes = new String[ns.length];
+        colFormats = new String[ns.length];
+        colnames = ns.clone();
+        for(int i = 0; i < ns.length; ++i) {
+          colTypes[i] = "double";
+          colFormats[i] = "%5f";
+        }
+        tdt = new TwoDimTable("Variance-Covariance Matrix","variance-covariance matrix", ns, colnames, colTypes, colFormats, "names");
+
+        // move intercept from last row and column to first row and column to match the reordering done with the coefficients
+        tdt.set(0, 0, vcov[ns.length - 1][ns.length - 1]);
+        for(int i = 1; i < ns.length; ++i) {
+          tdt.set(0, i, vcov[ns.length-1][i-1]);
+        }
+        for(int i = 1; i < ns.length; ++i) {
+          tdt.set(i, 0, vcov[i-1][ns.length-1]);
+        }
+        for(int i = 1; i < ns.length; ++i) {
+          for(int j = 1; j < ns.length; ++j) {
+            tdt.set(i, j, vcov[i - 1][j - 1]);
+          }
+        }
+        vcov_table.fillFromImpl(tdt);
+      }
+
       if(impl.beta() != null) { // get varImp
         calculateVarimpBase(magnitudes, indices, impl.getNormBeta());
 

diff --git a/h2o-py/h2o/model/model_base.py b/h2o-py/h2o/model/model_base.py
@@ -887,7 +887,16 @@ def coef_with_p_values(self):
                                  "compute_p_values=True.")
         else:
             raise ValueError("p-values, z-values and std_error are only found in GLM.")
-
+
+    def vcov(self):
+        if self.algo == 'glm':
+            if self.parms["compute_p_values"]["actual_value"]:
+                return self._model_json["output"]["vcov_table"]
+            else:
+                raise ValueError("The variance-covariance matrix is only calculated when compute_p_values=True.")
+        else:
+            raise ValueError("The variance-covariance matrix is only found in GLM.")
+
     def _fillMultinomialDict(self, standardize=False):
         if self.algo == 'gam':
             tbl = self._model_json["output"]["coefficients_table"]

diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_16583_vcov.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_16583_vcov.py
@@ -0,0 +1,97 @@
+import sys
+
+sys.path.insert(1, "../../../")
+import h2o
+from tests import pyunit_utils
+from h2o.estimators.gbm import H2OGradientBoostingEstimator
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+
+# make sure error is thrown when non glm model try to call vcov
+def test_glm_vcov():
+    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
+    y = "economy_20mpg"
+    predictors = ["displacement", "power", "weight", "acceleration", "year"]
+    cars[y] = cars[y].asfactor()
+
+    gbm_model = H2OGradientBoostingEstimator(distribution="bernoulli", seed=1234)
+    gbm_model.train(x=predictors, y=y, training_frame=cars)
+
+    try:
+        gbm_model.vcov()
+    except ValueError as e:
+        assert "The variance-covariance matrix is only found in GLM." in e.args[0], "Wrong error messages received."
+
+# test to make sure we have error when compute_p_value=False and vcov is called
+def test_wrong_model_vcov():
+    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
+    y = "economy_20mpg"
+    predictors = ["displacement", "power", "weight", "acceleration", "year"]
+    cars[y] = cars[y].asfactor()
+
+    h2oglm_vcov = H2OGeneralizedLinearEstimator(family="binomial", seed=1234)
+    h2oglm_vcov.train(x=predictors, y=y, training_frame=cars)
+
+    try:
+        h2oglm_vcov.vcov()
+    except ValueError as e:
+        assert "The variance-covariance matrix is only calculated when compute_p_values=True" in e.args[0], "Wrong error messages received."
+
+# make sure correct covariances are returned when vcov is called
+def test_glm_vcov_values():
+    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
+    y = "economy_20mpg"
+    predictors = ["displacement","power","weight","acceleration","year"]
+    cars[y] = cars[y].asfactor()
+
+    h2oglm_compute_vcov =  H2OGeneralizedLinearEstimator(family = "binomial", lambda_=0.0, compute_p_values=True, 
+                                                          seed = 1234)
+
+    h2oglm_compute_vcov.train(x = predictors, y = y, training_frame  = cars)
+    vcov = h2oglm_compute_vcov.vcov()
+    vcov_intercept = vcov['intercept']
+    vcov_displacement = vcov['displacement']
+    vcov_power = vcov['power']
+    vcov_weight = vcov['weight']
+    vcov_acceleration = vcov['acceleration']
+    vcov_year = vcov['year']
+
+    print("variance-covariance table: {0}".format(vcov))
+
+    # manually obtain covariances and compare with ones using functions
+    names = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["names"]
+    manual_intercept = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["intercept"]
+    manual_displacement = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["displacement"]
+    manual_power = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["power"]
+    manual_weight = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["weight"]
+    manual_acceleration = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["acceleration"]
+    manual_year = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["year"]
+
+    # check both sets of values are equal
+    for i, el in enumerate(names):
+        assert abs(vcov_intercept[i]-manual_intercept[i]) < 1e-12, "Expected covariance between intercept and {el} : {0} " \
+                                                                  ", actual covariance: {1}.  They are different".format(
+                                                                  vcov_intercept[count], manual_intercept[count])
+        assert abs(vcov_displacement[i]-manual_displacement[i]) < 1e-12, "Expected covariance between displacement and {el} : {0} " \
+                                                                  ", actual covariance: {1}.  They are different".format(
+                                                                  vcov_displacement[count], manual_displacement[count])
+        assert abs(vcov_power[i]-manual_power[i]) < 1e-12, "Expected covariance between power and {el} : {0} " \
+                                                                  ", actual covariance: {1}.  They are different".format(
+                                                                  vcov_power[count], manual_power[count])
+        assert abs(vcov_weight[i]-manual_weight[i]) < 1e-12, "Expected covariance between weight and {el} : {0} " \
+                                                                  ", actual covariance: {1}.  They are different".format(
+                                                                  vcov_weight[count], manual_weight[count])
+        assert abs(vcov_acceleration[i]-manual_acceleration[i]) < 1e-12, "Expected covariance between acceleration and {el} : {0} " \
+                                                                  ", actual covariance: {1}.  They are different".format(
+                                                                  vcov_acceleration[count], manual_acceleration[count])
+        assert abs(vcov_year[i]-manual_year[i]) < 1e-12, "Expected covariance between year and {el} : {0} " \
+                                                                  ", actual covariance: {1}.  They are different".format(
+                                                                  vcov_year[count], manual_year[count])
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_glm_vcov)
+    pyunit_utils.standalone_test(test_wrong_model_vcov)
+    pyunit_utils.standalone_test(test_glm_vcov_values)
+else:
+    test_glm_vcov()
+    test_wrong_model_vcov()
+    test_glm_vcov_values()
diff --git a/h2o-r/h2o-package/R/models.R b/h2o-r/h2o-package/R/models.R
@@ -2663,6 +2663,44 @@ h2o.coef_with_p_values <- function(object) {
   }
 }
 
+#'
+#' Return the variance-covariance matrix for GLM models
+#'
+#' @param object An \linkS4class{H2OModel} object.
+#' @examples 
+#' \dontrun{
+#' library(h2o)
+#' h2o.init()
+#' 
+#' f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv"
+#' cars <- h2o.importFile(f)
+#' predictors <- c("displacement", "power", "weight", "acceleration", "year")
+#' response <- "cylinders"
+#' cars_split <- h2o.splitFrame(data = cars, ratios = 0.8, seed = 1234)
+#' train <- cars_split[[1]]
+#' valid <- cars_split[[2]]
+#' cars_glm <- h2o.glm(seed = 1234, 
+#'                     lambda=0.0,
+#'                     compute_p_values=TRUE,
+#'                     x = predictors, 
+#'                     y = response, 
+#'                     training_frame = train, 
+#'                     validation_frame = valid)
+#' h2o.vcov(cars_glm)
+#' }
+#' @export
+h2o.vcov <- function(object) {
+  if (is(object, "H2OModel") && object@algorithm %in% c("glm")) {
+    if (object@parameters$compute_p_values) {
+      object@model$vcov_table
+    } else {
+      stop("variance-covariance matrix not found in model.  Make sure to set compute_p_values=TRUE.")
+    }
+  } else {
+    stop("variance-covariance matrix is only found in GLM.")
+  }
+}
+
 #'
 #' Return the GLM linear constraints descriptions, constraints values, constraints bounds and whether the constraints
 #' satisfied the bounds (true) or not (false)

diff --git a/h2o-r/tests/testdir_algos/glm/runit_GH_16583_vcov.R b/h2o-r/tests/testdir_algos/glm/runit_GH_16583_vcov.R
@@ -0,0 +1,66 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../../scripts/h2o-r-test-setup.R")
+
+# call glm functions by a gbm model.  Should generate an error
+testGBMvcov <- function() {
+	bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv"))
+	maxX <- 11
+	Y <- 4
+	X   <- 3:maxX
+	X   <- X[ X != Y ]
+	mFV <- h2o.gbm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV)
+
+	assertError(vcovValues <- h2o.vcov(mFV))
+}
+
+# should throw an error when compute_p_value=FALSE
+testGLMvcovcomputePValueFALSE <- function() {
+	bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv"))
+	maxX <- 11
+	Y <- 4
+	X   <- 3:maxX
+	X   <- X[ X != Y ]
+	mFV <- h2o.glm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV, lambda=1.0)
+
+	assertError(vcovValues <- h2o.vcov(mFV))
+}
+
+testGLMPValZValStdError <- function() {
+	bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv"))
+	maxX <- 11
+	Y <- 4
+	X   <- 3:maxX
+	X   <- X[ X != Y ]
+	bhexFV$FNDX <- h2o.asfactor(bhexFV$FNDX)
+	mFV <- h2o.glm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV, family="binomial", lambda=0.0, compute_p_values=TRUE)
+
+	vcovValues <- h2o.coef_with_p_values(mFV)
+	print("variance-covariance table")
+	print(vcovValues)
+	vcovIntercept <- vcovValues$intercept
+	vcovDisplacement <- vcovValues$displacement
+	vcovPower <- vcovValues$power
+	vcovWeight <- vcovValues$weight
+	vcovAcceleration <- vcovValues$acceleration
+	vcovYear <- vcovValues$year
+
+	manualIntercept <- mFV@model$coefficients_table$intercept
+	manualDisplacement <- mFV@model$coefficients_table$displacement
+	manualPower <- mFV@model$coefficients_table$power
+	manualWeight <- mFV@model$coefficients_table$weight
+	manualAcceleration <- mFV@model$coefficients_table$acceleration
+	manualYear <- mFV@model$coefficients_table$year
+
+  # compare values from model and obtained manually
+  for (ind in c(1:length(manuelPValues)))
+    expect_equal(manualIntercept[ind], vcovIntercept[ind])
+    expect_equal(manualDisplacement[ind], vcovDisplacement[ind])
+    expect_equal(manualPower[ind], vcovPower[ind])
+    expect_equal(manualWeight[ind], vcovWeight[ind])
+    expect_equal(manualAcceleration[ind], vcovAcceleration[ind])
+    expect_equal(manualYear[ind], vcovYear[ind])
+}
+
+doTest("GLM: make sure error is generated when a gbm model calls glm functions", testGBMvcov)
+doTest("GLM: make sure error is generated when compute_p_values=FALSE", testGLMvcovcomputePValueFALSE)
+doTest("GLM: test variance-covariance values", testGLMPValZValStdError)