-
Notifications
You must be signed in to change notification settings - Fork 17
Implement configurable options for comps algorithm methodology #449
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
5bf04c5
22e38e5
5b655cf
6f0ece5
363b108
123dd81
4c0d8cc
b52fbd8
1923e19
972772c
1f6bf19
365dee4
fa8ce67
b433e60
d32eb89
441c4cd
5ca1ed7
1e32a95
899450c
4e269fc
3396b13
f0d3540
7136808
82605f8
1200ea3
39d4d03
51c582c
1cfb285
6bb28ec
dc09147
2215b3d
0d55dc1
a775f6e
ffa4f1b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -199,13 +199,76 @@ extract_num_iterations <- function(x) { | |
| # model: Lightgbm model | ||
| # leaf_idx: integer matrix [training data x trees] of leaf indices | ||
| # init_score: mean value of sale prices in the training data | ||
| # algorithm: type of algorithm to use. Set in params.yaml. Possible types are | ||
| # unweighted, unweighted_with_error_reduction, proportional_error_reduction, | ||
| # and prediction_variance | ||
| # outcome: Observed sale prices for each observation in the training data, | ||
| # used as the ground truth when computing per-tree error reductions | ||
| # Returns: | ||
| # weights: numeric matrix [n_obs x n_trees] where each row sums to 1 | ||
| extract_tree_weights <- function(model, leaf_idx, init_score, outcome) { | ||
| extract_tree_weights <- function(model, | ||
| leaf_idx, | ||
| init_score = NULL, | ||
| algorithm = "unweighted", | ||
| outcome = NULL) { | ||
| n_obs <- nrow(leaf_idx) | ||
| n_trees <- ncol(leaf_idx) | ||
|
|
||
| # Validate algorithm arg | ||
| valid_algorithms <- c( | ||
| "unweighted", | ||
| "prediction_variance", | ||
| "unweighted_with_error_reduction", | ||
| "proportional_error_reduction" | ||
| ) | ||
|
|
||
| algorithm <- rlang::arg_match( | ||
| algorithm, | ||
| values = valid_algorithms | ||
| ) | ||
wagnerlmichael marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| # --------------------------------------------------------- | ||
| # unweighted (vector with 1/n_trees for each tree) | ||
|
||
| # --------------------------------------------------------- | ||
| if (algorithm == "unweighted") { | ||
| weights <- rep(1 / n_trees, n_trees) | ||
|
|
||
| return(weights) | ||
| } | ||
|
|
||
| # --------------------------------------------------------- | ||
| # prediction_variance: | ||
| # vector of tree weights proportional to the variance of | ||
| # each tree's leaf values across the training observations | ||
| # --------------------------------------------------------- | ||
| if (algorithm == "prediction_variance") { | ||
| tree_dt <- lgb.model.dt.tree(model) | ||
| leaf_lookup <- tree_dt[ | ||
| !is.na(leaf_index), | ||
| c("tree_index", "leaf_index", "leaf_value") | ||
| ] | ||
|
|
||
| var_per_tree <- numeric(n_trees) | ||
|
|
||
| for (t in seq_len(n_trees)) { | ||
| # LightGBM is 0-indexed | ||
| this_tree <- subset(leaf_lookup, tree_index == (t - 1L)) | ||
| m <- match(leaf_idx[, t], this_tree$leaf_index) | ||
| # incremental outputs for this tree across training rows | ||
| incr <- this_tree$leaf_value[m] | ||
| var_per_tree[t] <- stats::var(incr, na.rm = TRUE) | ||
| } | ||
|
|
||
| var_per_tree[is.na(var_per_tree)] <- 0 | ||
| summed_variance <- sum(var_per_tree) | ||
| weights <- var_per_tree / summed_variance | ||
|
|
||
| return(weights) | ||
| } | ||
|
|
||
| # --------------------------------------------------------- | ||
| # Remaining algorithms require computing the per-tree reduction in error of the predicted values | ||
| # --------------------------------------------------------- | ||
|
|
||
| init_vec <- rep_len(as.numeric(init_score), n_obs) | ||
|
|
||
| # Lookup: leaf_index -> leaf_value for each tree | ||
|
|
@@ -236,11 +299,31 @@ extract_tree_weights <- function(model, leaf_idx, init_score, outcome) { | |
| colnames(tree_predictions) <- NULL | ||
|
|
||
| # Absolute errors vs outcome for each prefix | ||
|
|
||
wagnerlmichael marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| tree_errors <- abs(outcome - tree_predictions) | ||
|
|
||
| # Improvement per tree = previous error - next error | ||
| prev_err <- tree_errors[, 1:n_trees, drop = FALSE] | ||
| next_err <- tree_errors[, 2:(n_trees + 1L), drop = FALSE] | ||
|
|
||
| # --------------------------------------------------------- | ||
| # unweighted_with_error_reduction | ||
| # (each tree that reduces error gets weight 1 / n_improving, all others get 0) | ||
| # --------------------------------------------------------- | ||
| if (algorithm == "unweighted_with_error_reduction") { | ||
| improving <- prev_err > next_err | ||
| n_improving <- rowSums(improving) | ||
|
|
||
| weights <- improving / n_improving | ||
|
|
||
| return(weights) | ||
| } | ||
|
|
||
| # --------------------------------------------------------- | ||
| # proportional error reduction: | ||
| # weights are proportional to the reduction in error (prev_err - next_err) for | ||
| # improving trees, 0 otherwise | ||
| # --------------------------------------------------------- | ||
| diff_in_errors <- pmax(0, prev_err - next_err) | ||
| dim(diff_in_errors) <- dim(prev_err) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,10 +24,8 @@ | |
| purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source) | ||
|
|
||
|
|
||
|
|
||
|
|
||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| ? #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
|
Check warning on line 27 in pipeline/04-interpret.R
|
||
| # 2. Load Data ----------------------------------------------------------------- | ||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| message("Loading model fit and recipe") | ||
|
|
||
|
|
@@ -84,8 +82,6 @@ | |
| } | ||
|
|
||
|
|
||
|
|
||
|
|
||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| # 3. Calculate SHAP Values ----------------------------------------------------- | ||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
|
|
@@ -151,8 +147,6 @@ | |
| } | ||
|
|
||
|
|
||
|
|
||
|
|
||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| # 4. Calculate Feature Importance ---------------------------------------------- | ||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
|
|
@@ -172,8 +166,6 @@ | |
| write_parquet(paths$output$feature_importance$local) | ||
|
|
||
|
|
||
|
|
||
|
|
||
jeancochrane marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| # 5. Find Comparables --------------------------------------------------------- | ||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
|
|
@@ -267,19 +259,26 @@ | |
| model = lgbm_final_full_fit$fit, | ||
| leaf_idx = as.matrix(training_leaf_nodes), | ||
| init_score = mean(training_data$meta_sale_price, na.rm = TRUE), | ||
| algorithm = params$comp$algorithm, | ||
| outcome = training_data$meta_sale_price | ||
| ) | ||
|
|
||
| if (length(tree_weights) == 0) { | ||
| message("Warning: tree_weights are empty") | ||
| } | ||
| if (all(rowSums(tree_weights) %in% c(0, 1))) { | ||
| message("Warning: tree_weights do not sum to 1 or 0 for each row") | ||
| message("First 5 weights:") | ||
| print(head(tree_weights, 5)) | ||
| if (is.matrix(tree_weights)) { | ||
| if (!all(rowSums(tree_weights) %in% c(0, 1))) { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added the negation here because unless I'm reading incorrectly, I think that we had this backwards? |
||
| message("Warning: tree_weights do not sum to 1 or 0 for each row") | ||
| message("First 5 weights:") | ||
| print(head(tree_weights, 5)) | ||
| } | ||
| } else { | ||
| message( | ||
| "Tree weights are a vector of length ", length(tree_weights), | ||
| " (same weights for all training observations)" | ||
| ) | ||
jeancochrane marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
|
|
||
| # Make sure that the leaf node tibbles are all integers, which is what | ||
| # the comps algorithm expects | ||
| leaf_nodes <- leaf_nodes %>% | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.