Skip to content

GH-16583: Access GLM Variance-Covariance Matrix with vcov #16586

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions h2o-algos/src/main/java/hex/schemas/GLMModelV3.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ public static final class GLMModelOutputV3 extends ModelOutputSchemaV3<GLMOutput
@API(help = "Variable Importances", direction = API.Direction.OUTPUT, level = API.Level.secondary)
TwoDimTableV3 variable_importances;

@API(help = "Variance-Covariance Matrix")
TwoDimTableV3 vcov_table;

@API(help="Lambda minimizing the objective value, only applicable with lambda search or when arrays of alpha and " +
"lambdas are provided")
double lambda_best;
Expand Down Expand Up @@ -286,6 +289,36 @@ public GLMModelOutputV3 fillFromImpl(GLMModel.GLMOutput impl) {
}
}
coefficients_table.fillFromImpl(tdt);

if(impl.hasPValues()) { //vcov is only populated if p values are calculated
double[][] vcov;
vcov = impl.vcov();
vcov_table = new TwoDimTableV3();
colTypes = new String[ns.length];
colFormats = new String[ns.length];
colnames = ns.clone();
for(int i = 0; i < ns.length; ++i) {
colTypes[i] = "double";
colFormats[i] = "%5f";
}
tdt = new TwoDimTable("Variance-Covariance Matrix","variance-covariance matrix", ns, colnames, colTypes, colFormats, "names");

// move intercept from last row and column to first row and column to match the reordering done with the coefficients
tdt.set(0, 0, vcov[ns.length - 1][ns.length - 1]);
for(int i = 1; i < ns.length; ++i) {
tdt.set(0, i, vcov[ns.length-1][i-1]);
}
for(int i = 1; i < ns.length; ++i) {
tdt.set(i, 0, vcov[i-1][ns.length-1]);
}
for(int i = 1; i < ns.length; ++i) {
for(int j = 1; j < ns.length; ++j) {
tdt.set(i, j, vcov[i - 1][j - 1]);
}
}
vcov_table.fillFromImpl(tdt);
}

if(impl.beta() != null) { // get varImp
calculateVarimpBase(magnitudes, indices, impl.getNormBeta());

Expand Down
11 changes: 10 additions & 1 deletion h2o-py/h2o/model/model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -887,7 +887,16 @@ def coef_with_p_values(self):
"compute_p_values=True.")
else:
raise ValueError("p-values, z-values and std_error are only found in GLM.")


def vcov(self):
if self.algo == 'glm':
if self.parms["compute_p_values"]["actual_value"]:
return self._model_json["output"]["vcov_table"]
else:
raise ValueError("The variance-covariance matrix is only calculated when compute_p_values=True.")
else:
raise ValueError("The variance-covariance matrix is only found in GLM.")

def _fillMultinomialDict(self, standardize=False):
if self.algo == 'gam':
tbl = self._model_json["output"]["coefficients_table"]
Expand Down
97 changes: 97 additions & 0 deletions h2o-py/tests/testdir_algos/glm/pyunit_GH_16583_vcov.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import sys

sys.path.insert(1, "../../../")
import h2o
from tests import pyunit_utils
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# make sure error is thrown when non glm model try to call vcov
def test_glm_vcov():
cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
y = "economy_20mpg"
predictors = ["displacement", "power", "weight", "acceleration", "year"]
cars[y] = cars[y].asfactor()

gbm_model = H2OGradientBoostingEstimator(distribution="bernoulli", seed=1234)
gbm_model.train(x=predictors, y=y, training_frame=cars)

try:
gbm_model.vcov()
except ValueError as e:
assert "The variance-covariance matrix is only found in GLM." in e.args[0], "Wrong error messages received."

# test to make sure we have error when compute_p_value=False and vcov is called
def test_wrong_model_vcov():
cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
y = "economy_20mpg"
predictors = ["displacement", "power", "weight", "acceleration", "year"]
cars[y] = cars[y].asfactor()

h2oglm_vcov = H2OGeneralizedLinearEstimator(family="binomial", seed=1234)
h2oglm_vcov.train(x=predictors, y=y, training_frame=cars)

try:
h2oglm_vcov.vcov()
except ValueError as e:
assert "The variance-covariance matrix is only calculated when compute_p_values=True" in e.args[0], "Wrong error messages received."

# make sure correct covariances are returned when vcov is called
def test_glm_vcov_values():
cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
y = "economy_20mpg"
predictors = ["displacement","power","weight","acceleration","year"]
cars[y] = cars[y].asfactor()

h2oglm_compute_vcov = H2OGeneralizedLinearEstimator(family = "binomial", lambda_=0.0, compute_p_values=True,
seed = 1234)

h2oglm_compute_vcov.train(x = predictors, y = y, training_frame = cars)
vcov = h2oglm_compute_vcov.vcov()
vcov_intercept = vcov['intercept']
vcov_displacement = vcov['displacement']
vcov_power = vcov['power']
vcov_weight = vcov['weight']
vcov_acceleration = vcov['acceleration']
vcov_year = vcov['year']

print("variance-covariance table: {0}".format(vcov))

# manually obtain covariances and compare with ones using functions
names = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["names"]
manual_intercept = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["intercept"]
manual_displacement = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["displacement"]
manual_power = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["power"]
manual_weight = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["weight"]
manual_acceleration = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["acceleration"]
manual_year = h2oglm_compute_vcov._model_json["output"]["vcov_table"]["year"]

# check both sets of values are equal
for i, el in enumerate(names):
assert abs(vcov_intercept[i]-manual_intercept[i]) < 1e-12, "Expected covariance between intercept and {el} : {0} " \
", actual covariance: {1}. They are different".format(
vcov_intercept[count], manual_intercept[count])
assert abs(vcov_displacement[i]-manual_displacement[i]) < 1e-12, "Expected covariance between displacement and {el} : {0} " \
", actual covariance: {1}. They are different".format(
vcov_displacement[count], manual_displacement[count])
assert abs(vcov_power[i]-manual_power[i]) < 1e-12, "Expected covariance between power and {el} : {0} " \
", actual covariance: {1}. They are different".format(
vcov_power[count], manual_power[count])
assert abs(vcov_weight[i]-manual_weight[i]) < 1e-12, "Expected covariance between weight and {el} : {0} " \
", actual covariance: {1}. They are different".format(
vcov_weight[count], manual_weight[count])
assert abs(vcov_acceleration[i]-manual_acceleration[i]) < 1e-12, "Expected covariance between acceleration and {el} : {0} " \
", actual covariance: {1}. They are different".format(
vcov_acceleration[count], manual_acceleration[count])
assert abs(vcov_year[i]-manual_year[i]) < 1e-12, "Expected covariance between year and {el} : {0} " \
", actual covariance: {1}. They are different".format(
vcov_year[count], manual_year[count])

if __name__ == "__main__":
pyunit_utils.standalone_test(test_glm_vcov)
pyunit_utils.standalone_test(test_wrong_model_vcov)
pyunit_utils.standalone_test(test_glm_vcov_values)
else:
test_glm_vcov()
test_wrong_model_vcov()
test_glm_vcov_values()
38 changes: 38 additions & 0 deletions h2o-r/h2o-package/R/models.R
Original file line number Diff line number Diff line change
Expand Up @@ -2663,6 +2663,44 @@ h2o.coef_with_p_values <- function(object) {
}
}

#'
#' Return the variance-covariance matrix for GLM models
#'
#' @param object An \linkS4class{H2OModel} object.
#' @examples
#' \dontrun{
#' library(h2o)
#' h2o.init()
#'
#' f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv"
#' cars <- h2o.importFile(f)
#' predictors <- c("displacement", "power", "weight", "acceleration", "year")
#' response <- "cylinders"
#' cars_split <- h2o.splitFrame(data = cars, ratios = 0.8, seed = 1234)
#' train <- cars_split[[1]]
#' valid <- cars_split[[2]]
#' cars_glm <- h2o.glm(seed = 1234,
#' lambda=0.0,
#' compute_p_values=TRUE,
#' x = predictors,
#' y = response,
#' training_frame = train,
#' validation_frame = valid)
#' h2o.vcov(cars_glm)
#' }
#' @export
h2o.vcov <- function(object) {
if (is(object, "H2OModel") && object@algorithm %in% c("glm")) {
if (object@parameters$compute_p_values) {
object@model$vcov_table
} else {
stop("variance-covariance matrix not found in model. Make sure to set compute_p_values=TRUE.")
}
} else {
stop("variance-covariance matrix is only found in GLM.")
}
}

#'
#' Return the GLM linear constraints descriptions, constraints values, constraints bounds and whether the constraints
#' satisfied the bounds (true) or not (false)
Expand Down
66 changes: 66 additions & 0 deletions h2o-r/tests/testdir_algos/glm/runit_GH_16583_vcov.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source("../../../scripts/h2o-r-test-setup.R")

# call glm functions by a gbm model. Should generate an error
testGBMvcov <- function() {
bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv"))
maxX <- 11
Y <- 4
X <- 3:maxX
X <- X[ X != Y ]
mFV <- h2o.gbm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV)

assertError(vcovValues <- h2o.vcov(mFV))
}

# should throw an error when compute_p_value=FALSE
testGLMvcovcomputePValueFALSE <- function() {
bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv"))
maxX <- 11
Y <- 4
X <- 3:maxX
X <- X[ X != Y ]
mFV <- h2o.glm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV, lambda=1.0)

assertError(vcovValues <- h2o.vcov(mFV))
}

testGLMPValZValStdError <- function() {
bhexFV <- h2o.importFile(locate("smalldata/logreg/benign.csv"))
maxX <- 11
Y <- 4
X <- 3:maxX
X <- X[ X != Y ]
bhexFV$FNDX <- h2o.asfactor(bhexFV$FNDX)
mFV <- h2o.glm(y=Y, x=colnames(bhexFV)[X], training_frame=bhexFV, family="binomial", lambda=0.0, compute_p_values=TRUE)

vcovValues <- h2o.coef_with_p_values(mFV)
print("variance-covariance table")
print(vcovValues)
vcovIntercept <- vcovValues$intercept
vcovDisplacement <- vcovValues$displacement
vcovPower <- vcovValues$power
vcovWeight <- vcovValues$weight
vcovAcceleration <- vcovValues$acceleration
vcovYear <- vcovValues$year

manualIntercept <- mFV@model$coefficients_table$intercept
manualDisplacement <- mFV@model$coefficients_table$displacement
manualPower <- mFV@model$coefficients_table$power
manualWeight <- mFV@model$coefficients_table$weight
manualAcceleration <- mFV@model$coefficients_table$acceleration
manualYear <- mFV@model$coefficients_table$year

# compare values from model and obtained manually
for (ind in c(1:length(manuelPValues)))
expect_equal(manualIntercept[ind], vcovIntercept[ind])
expect_equal(manualDisplacement[ind], vcovDisplacement[ind])
expect_equal(manualPower[ind], vcovPower[ind])
expect_equal(manualWeight[ind], vcovWeight[ind])
expect_equal(manualAcceleration[ind], vcovAcceleration[ind])
expect_equal(manualYear[ind], vcovYear[ind])
}

doTest("GLM: make sure error is generated when a gbm model calls glm functions", testGBMvcov)
doTest("GLM: make sure error is generated when compute_p_values=FALSE", testGLMvcovcomputePValueFALSE)
doTest("GLM: test variance-covariance values", testGLMPValZValStdError)