diff --git a/R/helpers.R b/R/helpers.R index 8aefbb277..7766ec237 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -184,28 +184,90 @@ extract_num_iterations <- function(x) { } # Helper function to return weights for comps -# Computes per-tree weights from cumulative leaf node values. -# Basic Steps -# For every observation, map its assigned leaf index in -# each tree to the corresponding leaf value. -# Compute the row-wise cumulative sums of these -# leaf values (stand-in for training data predictions). -# Calculate the absolute prediction error. -# Compute the reduction in error. -# Normalize these improvements so that row-weights sum to 1. +# The `extract_tree_weights` function allows the user to return weights +# for each tree in a LightGBM model. Based on internal testing, we currently +# default to an unweighted value of 1 / n_trees for each tree. This returns +# a single vector with a length of the number of trees. # Inputs: # model: Lightgbm model # leaf_idx: integer matrix [training data x trees] of leaf indices # init_score: mean value of sale prices in the training data +# algorithm: type of algorithm to use. Set in params.yaml. Possible types +# unweighted, unweighted_with_error_reduction, error_reduction, +# and prediction_variance # outcome: Predicted FMV values for each observation in the training data # Returns: # weights: numeric matrix [n_obs x n_trees] where each row sums to 1 -extract_tree_weights <- function(model, leaf_idx, init_score, outcome) { +extract_tree_weights <- function(model, + leaf_idx, + init_score = NULL, + algorithm = "unweighted", + outcome = NULL) { n_obs <- nrow(leaf_idx) n_trees <- ncol(leaf_idx) + # Validate algorithm arg + valid_algorithms <- c( + "unweighted", + "prediction_variance", + "unweighted_with_error_reduction", + "error_reduction" + ) + + if (!algorithm %in% valid_algorithms) { + stop( + "Invalid algorithm '", algorithm, "'. Must be one of: ", + paste0(valid_algorithms, collapse = ", ") + ) + } + + # --------------------------------------------------------- + # Unweighted: + # Vector with 1/n_trees for each tree. This is the default input. + # This returns a vector with the length of the number of trees. + # --------------------------------------------------------- + if (algorithm == "unweighted") { + weights <- rep(1 / n_trees, n_trees) + + return(weights) + } + + # --------------------------------------------------------- + # Prediction_variance: + # Vector for tree weights based on variance of leaf values across data. + # This returns a vector with the length of the number of trees. + # --------------------------------------------------------- + if (algorithm == "prediction_variance") { + tree_dt <- lgb.model.dt.tree(model) + leaf_lookup <- tree_dt[ + !is.na(leaf_index), + c("tree_index", "leaf_index", "leaf_value") + ] + + var_per_tree <- numeric(n_trees) + + for (t in seq_len(n_trees)) { + # LightGBM is 0-indexed + this_tree <- subset(leaf_lookup, tree_index == (t - 1L)) + m <- match(leaf_idx[, t], this_tree$leaf_index) + # incremental outputs for this tree across training rows + incr <- this_tree$leaf_value[m] + var_per_tree[t] <- stats::var(incr, na.rm = TRUE) + } + + var_per_tree[is.na(var_per_tree)] <- 0 + summed_variance <- sum(var_per_tree) + weights <- var_per_tree / summed_variance + + return(weights) + } + + # --------------------------------------------------------- + # Remaining algorithms require tree-based improvements to the predicted values + # --------------------------------------------------------- + init_vec <- rep_len(as.numeric(init_score), n_obs) # Lookup: leaf_index -> leaf_value for each tree @@ -241,6 +303,27 @@ extract_tree_weights <- function(model, leaf_idx, init_score, outcome) { # Improvement per tree = previous error - next error prev_err <- tree_errors[, 1:n_trees, drop = FALSE] next_err <- tree_errors[, 2:(n_trees + 1L), drop = FALSE] + + # --------------------------------------------------------- + # Unweighted_with_error_reduction: + # Weights are 1/n_improving trees for trees which reduce errors, 0 otherwise. + # This returns a matrix with dimensions of observations x trees. + # --------------------------------------------------------- + if (algorithm == "unweighted_with_error_reduction") { + improving <- prev_err > next_err + n_improving <- rowSums(improving) + + weights <- improving / n_improving + + return(weights) + } + + # --------------------------------------------------------- + # Proportional_error_reduction: + # Weights are proportional to the reduction in error (prev_err - next_err) for + # improving trees, 0 otherwise. This returns a matrix with dimensions of + # observations x trees. + # --------------------------------------------------------- diff_in_errors <- pmax(0, prev_err - next_err) dim(diff_in_errors) <- dim(prev_err) diff --git a/params.yaml b/params.yaml index 50d88b78f..92e93ae73 100644 --- a/params.yaml +++ b/params.yaml @@ -412,6 +412,18 @@ ratio_study: comp: # Number of comps to generate for each PIN/card num_comps: 5 + # Algorithm used to weight trees for the comps similarity score. + # Valid options: + # - "unweighted": Equal weight (1/n_trees) for every tree. No + # observation-level variation. Returns a vector of weights. + # - "unweighted_with_error_reduction": Binary 1/0 per tree per observation + # (1 if tree reduces the training sale's prediction error, 0 otherwise), + # then row-normalized. Returns a matrix of weights. + # - "error_reduction": Proportional error reduction for training data sale per + # tree, row-normalized. Returns a matrix of weights. + # - "prediction_variance": Variance of each tree's leaf values across + # training observations, normalized to sum to 1. Returns a vector of weights. + algorithm: unweighted # Export ----------------------------------------------------------------------- diff --git a/pipeline/04-interpret.R b/pipeline/04-interpret.R index cc1036295..d9cac2a5c 100644 --- a/pipeline/04-interpret.R +++ b/pipeline/04-interpret.R @@ -28,8 +28,6 @@ reticulate::py_require( purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source) - - #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 2. Load Data ----------------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -155,8 +153,6 @@ if (shap_enable) { } - - #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 4. Calculate Feature Importance ---------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -176,8 +172,6 @@ lightgbm::lgb.importance(lgbm_final_full_fit$fit) %>% write_parquet(paths$output$feature_importance$local) - - #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 5. Find Comparables --------------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -271,19 +265,33 @@ if (comp_enable) { model = lgbm_final_full_fit$fit, leaf_idx = as.matrix(training_leaf_nodes), init_score = mean(training_data$meta_sale_price, na.rm = TRUE), + algorithm = params$comp$algorithm, outcome = training_data$meta_sale_price ) if (length(tree_weights) == 0) { message("Warning: tree_weights are empty") } - if (all(rowSums(tree_weights) %in% c(0, 1))) { - message("Warning: tree_weights do not sum to 1 or 0 for each row") - message("First 5 weights:") - print(head(tree_weights, 5)) + if (is.matrix(tree_weights)) { + if (!all(rowSums(tree_weights) %in% c(0, 1))) { + message("Warning: tree_weights do not sum to 1 or 0 for each row") + message("First 5 weights:") + print(head(tree_weights, 5)) + } + } else { + tree_weights_sum <- sum(tree_weights) + if (!isTRUE(all.equal(tree_weights_sum, 1))) { + stop( + "Tree weights vector does not sum to 1 (got ", tree_weights_sum, "). ", + "All sales would have a score of 0 if weights sum to 0." + ) + } + message( + "Tree weights are a vector of length ", length(tree_weights), + " (same weights for all training observations)" + ) } - # Make sure that the leaf node tibbles are all integers, which is what # the comps algorithm expects leaf_nodes <- leaf_nodes %>% diff --git a/pipeline/05-finalize.R b/pipeline/05-finalize.R index 369434cd1..1341b004f 100644 --- a/pipeline/05-finalize.R +++ b/pipeline/05-finalize.R @@ -91,6 +91,7 @@ metadata <- tibble::tibble( ratio_study_near_column = params$ratio_study$near_column, ratio_study_num_quantile = list(params$ratio_study$num_quantile), shap_enable = shap_enable, + comp_algorithm = params$comp$algorithm, comp_enable = comp_enable, comp_num_comps = params$comp$num_comps, cv_enable = cv_enable, diff --git a/python/comps.py b/python/comps.py index aa05148d8..9cc93e1a9 100644 --- a/python/comps.py +++ b/python/comps.py @@ -17,11 +17,19 @@ def get_comps( lightgbm leaf node assignments (`observation_df`) compared to a second dataframe of leaf node assignments (`comparison_df`). - Leaf nodes are weighted according to a tree importance matrix `weights` - and used to generate a similarity score. The function returns two - dataframes: One containing the indices of the most similar compararables + Leaf nodes are weighted according to a tree importance vector or matrix + `weights` and used to generate a similarity score. The function returns two + dataframes: One containing the indices of the most similar comparables and the other containing their corresponding similarity scores. + Weights can be: + - A 1-D array of shape (n_trees,) for algorithms that produce a single + weight per tree (e.g. "unweighted", "prediction_variance"). Will be + reshaped to (1, n_trees) before being passed to numba + - A 2-D matrix of shape (n_training_obs, n_trees) for algorithms that + produce per-observation weights (e.g. "error_reduction", + "unweighted_with_error_reduction") + More details on the underlying algorithm can be found here: https://ccao-data.github.io/lightsnip/articles/finding-comps.html @@ -33,7 +41,7 @@ def get_comps( comparables. weights (numpy.ndarray): Importance weights for leaf nodes, used to compute similarity - scores. + scores. Either 1-D (n_trees,) or 2-D (n_comparisons, n_trees). num_comps (int, optional): Number of top comparables to return for each observation. Default is 5. @@ -59,15 +67,36 @@ def get_comps( f"({observation_df.shape[1]}) " f"must match `comparison_df` ({comparison_df.shape[1]})" ) - if comparison_df.shape != weights.shape: + + if not isinstance(weights, np.ndarray): + weights = np.asarray(weights) + + # Normalize weights to a 2-D matrix so that we can use a single numba + # kernel for both vector and matrix weights. A 1-D vector of per-tree + # weights is reshaped to (1, n_trees); the numba kernel detects this + # shape and broadcasts the single row to all comparison observations. + if weights.ndim == 1: + if weights.shape[0] != comparison_df.shape[1]: + raise ValueError( + f"`weights` length {weights.shape[0]} must equal number of " + f"trees {comparison_df.shape[1]}" + ) + weights_matrix = weights.reshape(1, -1).astype(np.float32, copy=False) + elif weights.ndim == 2: + if comparison_df.shape != weights.shape: + raise ValueError( + f"`comparison_df.shape` {comparison_df.shape} must match " + f"`weights.shape` {weights.shape}" + ) + weights_matrix = weights.astype(np.float32, copy=False) + else: raise ValueError( - f"`comparison_df.shape` {comparison_df.shape} must match " - f"`weights.shape` {weights.shape}" + "`weights` must be a 1-D vector (n_trees,) or 2-D matrix " + f"(n_comparisons, n_trees), got {weights.ndim}-D" ) - # Convert the weights to a numpy array so that we can take advantage of - # numba acceleration later on - weights_matrix = np.asarray(weights, dtype=np.float32) + # Avoid editing the df in-place + observation_df = observation_df.copy() # Chunk the observations so that the script can periodically report progress observation_df["chunk"] = pd.cut( @@ -137,12 +166,20 @@ def _get_top_n_comps( observations in a tree model, a matrix of weights for each obs/tree, and an integer `num_comps`, and returns a matrix where each observation is scored by similarity to observations in the comparison matrix and the top N scores - are returned along with the indexes of the comparison observations.""" + are returned along with the indexes of the comparison observations. + + The weights_matrix is always 2-D. If its first dimension is 1, the single + row of weights is broadcast to all comparison observations (i.e. tree-level + weights shared across all comparisons). Otherwise, each comparison + observation y_i uses its own row of weights.""" num_observations = len(leaf_node_matrix) num_possible_comparisons = len(comparison_leaf_node_matrix) idx_dtype = np.int32 score_dtype = np.float32 + # Detect whether we have shared (vector-style) or per-observation weights + shared_weights = weights_matrix.shape[0] == 1 + # Store scores and indexes in two separate arrays rather than a 3d matrix # for simplicity (array of tuples does not convert to pandas properly). # Indexes default to -1, which is an impossible index and so is a signal @@ -156,12 +193,14 @@ def _get_top_n_comps( # low memory footprint for y_i in range(num_possible_comparisons): similarity_score = 0.0 - for tree_idx in range(len(leaf_node_matrix[x_i])): + # Use row 0 for shared weights, row y_i for per-obs weights + w_i = 0 if shared_weights else y_i + for tree_idx in range(leaf_node_matrix.shape[1]): if ( - leaf_node_matrix[x_i][tree_idx] - == comparison_leaf_node_matrix[y_i][tree_idx] + leaf_node_matrix[x_i, tree_idx] + == comparison_leaf_node_matrix[y_i, tree_idx] ): - similarity_score += weights_matrix[y_i][tree_idx] + similarity_score += weights_matrix[w_i, tree_idx] # See if the score is higher than any of the top N # comps, and store it in the sorted comps array if it is. @@ -198,18 +237,24 @@ def insert_at_idx_and_shift( num_trees = 500 num_obs = 20001 num_comparisons = 10000 - mean_sale_price = 350000 - std_deviation = 110000 leaf_nodes = pd.DataFrame(np.random.randint(0, num_obs, size=[num_obs, num_trees])) training_leaf_nodes = pd.DataFrame( np.random.randint(0, num_comparisons, size=[num_comparisons, num_trees]) ) - tree_weights = np.asarray( + + # Test with matrix weights (error_reduction style) + tree_weights_matrix = np.asarray( [np.random.dirichlet(np.ones(num_trees)) for _ in range(num_comparisons)] ) + start = time.time() + get_comps(leaf_nodes, training_leaf_nodes, tree_weights_matrix) + end = time.time() + print(f"get_comps (matrix weights) runtime: {end - start}s") + # Test with vector weights (unweighted / prediction_variance style) + tree_weights_vector = np.random.dirichlet(np.ones(num_trees)) start = time.time() - get_comps(leaf_nodes, training_leaf_nodes, tree_weights) + get_comps(leaf_nodes, training_leaf_nodes, tree_weights_vector) end = time.time() - print(f"get_comps runtime: {end - start}s") + print(f"get_comps (vector weights) runtime: {end - start}s") diff --git a/python/tests/test_comps.py b/python/tests/test_comps.py index fc61e1ef1..ab4427e75 100644 --- a/python/tests/test_comps.py +++ b/python/tests/test_comps.py @@ -8,6 +8,8 @@ @pt.mark.parametrize( "leaf_nodes,training_leaf_nodes,tree_weights,num_comps,num_chunks,expected_comps,expected_scores", [ + # Matrix weights tests: + # - - - - - - - - - - - # Simple example that tests one input observation with uniform weights # to make sure that the algorithm correctly prioritizes comparison # observations with the highest number of leaf node matches @@ -203,6 +205,86 @@ ), id="more_possible_comps_than_num_comps", ), + # Vector weights tests: + # - - - - - - - - - - - + # Test one observation with uniform 1-D weights (vector). With a 1-D + # weight vector, every tree gets the same weight and that weight is + # broadcast identically to every comparison observation. This should + # produce the same results as the matrix case with identical rows. + pt.param( + pd.DataFrame([[1, 1, 1]]), + pd.DataFrame([[1, 1, 0], [1, 0, 0], [0, 0, 0]]), + np.array([0.333, 0.333, 0.333]), + 3, + 1, + pd.DataFrame( + { + "comp_idx_1": [0], + "comp_idx_2": [1], + "comp_idx_3": [-1], + }, + dtype=np.int32, + ), + pd.DataFrame( + { + "comp_score_1": [0.333 * 2], + "comp_score_2": [0.333 * 1], + "comp_score_3": [0.333 * 0], + }, + dtype=np.float32, + ), + id="one_observation_1d_weights", + ), + # Test two observations with uniform 1-D weights. The 1-D vector is + # used for all comparison rows, so every comparison uses the same + # per-tree weight. Results should mirror the 2-D uniform-weight case. + pt.param( + pd.DataFrame([[1, 1, 1], [2, 2, 2]]), + pd.DataFrame([[1, 1, 1], [1, 1, 2], [1, 2, 2], [2, 2, 2]]), + np.array([0.333, 0.333, 0.333]), + 4, + 1, + pd.DataFrame( + { + "comp_idx_1": [0, 3], + "comp_idx_2": [1, 2], + "comp_idx_3": [2, 1], + "comp_idx_4": [-1, -1], + }, + dtype=np.int32, + ), + pd.DataFrame( + { + "comp_score_1": [0.333 * 3] * 2, + "comp_score_2": [0.333 * 2] * 2, + "comp_score_3": [0.333 * 1] * 2, + "comp_score_4": [0.333 * 0] * 2, + }, + dtype=np.float32, + ), + id="two_observations_1d_weights", + ), + # Test 1-D weights with non-uniform per-tree weights. + pt.param( + pd.DataFrame([[1, 1, 1]]), + pd.DataFrame([[1, 1, 0], [1, 0, 0], [0, 0, 0]]), + np.array([0.5, 0.3, 0.2]), + 3, + 1, + pd.DataFrame( + {"comp_idx_1": [0], "comp_idx_2": [1], "comp_idx_3": [-1]}, + dtype=np.int32, + ), + pd.DataFrame( + { + "comp_score_1": [0.5 + 0.3], + "comp_score_2": [0.5], + "comp_score_3": [0.0], + }, + dtype=np.float32, + ), + id="non_uniform_1d_weights", + ), ], ) def test_get_comps( @@ -244,6 +326,16 @@ def test_get_comps( "`comparison_df.shape` (3, 3) must match `weights.shape` (2, 2)", id="comparison_weights_shapes_match", ), + # Vector weights validation tests: + # 1-D weights with wrong length: 2 weights for 3 trees + pt.param( + pd.DataFrame([[1, 1, 1]]), + pd.DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]]), + np.array([0.333, 0.333]), + ValueError, + "`weights` length 2 must equal number of trees 3", + id="1d_weights_wrong_length", + ), ], ) def test_get_comps_raises_on_invalid_inputs(