Skip to content
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
5bf04c5
4 working functions
Damonamajor Feb 18, 2026
22e38e5
Update comps docs
wagnerlmichael Feb 18, 2026
5b655cf
Add algorithm param for extract_tree_weights
wagnerlmichael Feb 18, 2026
6f0ece5
Add input checking
wagnerlmichael Feb 18, 2026
363b108
Adjust tree_weights shape checking
wagnerlmichael Feb 18, 2026
123dd81
Test vector inclusion
wagnerlmichael Feb 18, 2026
4c0d8cc
Attempt using only 1 get_top_comps function
wagnerlmichael Feb 18, 2026
b52fbd8
Fix message
wagnerlmichael Feb 20, 2026
1923e19
Switch boolean check
wagnerlmichael Feb 20, 2026
972772c
Style
wagnerlmichael Feb 20, 2026
1f6bf19
Format
wagnerlmichael Feb 20, 2026
365dee4
Lint
wagnerlmichael Feb 20, 2026
fa8ce67
Lint
wagnerlmichael Feb 20, 2026
b433e60
Merge branch 'master' into 405-persist-all-possible-significant-sales…
wagnerlmichael Feb 20, 2026
d32eb89
Update R/helpers.R
wagnerlmichael Feb 23, 2026
441c4cd
Update R/helpers.R
wagnerlmichael Feb 23, 2026
5ca1ed7
Update R/helpers.R
wagnerlmichael Feb 23, 2026
1e32a95
Update tests to accommodate vector weight support
wagnerlmichael Feb 23, 2026
899450c
Remove redundant test
wagnerlmichael Feb 23, 2026
4e269fc
Fix failing Python test infrastructure on CI
jeancochrane Feb 23, 2026
3396b13
improve commenting
Damonamajor Feb 23, 2026
f0d3540
Merge branch 'master' into 405-persist-all-possible-significant-sales…
wagnerlmichael Feb 23, 2026
7136808
Make sure to install test dependencies in `test` workflow
jeancochrane Feb 23, 2026
82605f8
Fix incorrect path reference in interpret stage
jeancochrane Feb 23, 2026
1200ea3
Check to make sure weights sum to 1
wagnerlmichael Feb 23, 2026
39d4d03
Merge branch '405-persist-all-possible-significant-sales-algorithms-t…
wagnerlmichael Feb 23, 2026
51c582c
Merge branch 'jeancochrane/fix-failing-test-workflow' into 405-persis…
wagnerlmichael Feb 23, 2026
1cfb285
Update helpers.R
Damonamajor Feb 23, 2026
6bb28ec
Merge branch 'master' into 405-persist-all-possible-significant-sales…
jeancochrane Feb 24, 2026
dc09147
Add param to finalize.R
Damonamajor Feb 24, 2026
2215b3d
alphabetize
Damonamajor Feb 24, 2026
0d55dc1
Add in-place adjustment
wagnerlmichael Feb 25, 2026
a775f6e
Merge branch '405-persist-all-possible-significant-sales-algorithms-t…
wagnerlmichael Feb 25, 2026
ffa4f1b
Remove space
wagnerlmichael Feb 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 93 additions & 10 deletions R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -184,28 +184,90 @@ extract_num_iterations <- function(x) {
}

# Helper function to return weights for comps
# Computes per-tree weights from cumulative leaf node values.

# Basic Steps
# For every observation, map its assigned leaf index in
# each tree to the corresponding leaf value.
# Compute the row-wise cumulative sums of these
# leaf values (stand-in for training data predictions).
# Calculate the absolute prediction error.
# Compute the reduction in error.
# Normalize these improvements so that row-weights sum to 1.
# The `extract_tree_weights` function allows the user to return weights
# for each tree in a LightGBM model. Based on internal testing, we currently
# default to an unweighted value of 1 / n_trees for each tree. This returns
# a single vector with a length of the number of trees.

# Inputs:
# model: Lightgbm model
# leaf_idx: integer matrix [training data x trees] of leaf indices
# init_score: mean value of sale prices in the training data
# algorithm: type of algorithm to use. Set in params.yaml. Possible types
# unweighted, unweighted_with_error_reduction, error_reduction,
# and prediction_variance
# outcome: Predicted FMV values for each observation in the training data
# Returns:
# weights: numeric matrix [n_obs x n_trees] where each row sums to 1
extract_tree_weights <- function(model, leaf_idx, init_score, outcome) {
extract_tree_weights <- function(model,
leaf_idx,
init_score = NULL,
algorithm = "unweighted",
outcome = NULL) {
n_obs <- nrow(leaf_idx)
n_trees <- ncol(leaf_idx)

# Validate algorithm arg
valid_algorithms <- c(
"unweighted",
"prediction_variance",
"unweighted_with_error_reduction",
"error_reduction"
)

if (!algorithm %in% valid_algorithms) {
stop(
"Invalid algorithm '", algorithm, "'. Must be one of: ",
paste0(valid_algorithms, collapse = ", ")
)
}

# ---------------------------------------------------------
# Unweighted:
# Vector with 1/n_trees for each tree. This is the default input.
# This returns a vector with the length of the number of trees.
# ---------------------------------------------------------
if (algorithm == "unweighted") {
weights <- rep(1 / n_trees, n_trees)

return(weights)
}

# ---------------------------------------------------------
# Prediction_variance:
# Vector for tree weights based on variance of leaf values across data.
# This returns a vector with the length of the number of trees.
# ---------------------------------------------------------
if (algorithm == "prediction_variance") {
tree_dt <- lgb.model.dt.tree(model)
leaf_lookup <- tree_dt[
!is.na(leaf_index),
c("tree_index", "leaf_index", "leaf_value")
]

var_per_tree <- numeric(n_trees)

for (t in seq_len(n_trees)) {
# LightGBM is 0-indexed
this_tree <- subset(leaf_lookup, tree_index == (t - 1L))
m <- match(leaf_idx[, t], this_tree$leaf_index)
# incremental outputs for this tree across training rows
incr <- this_tree$leaf_value[m]
var_per_tree[t] <- stats::var(incr, na.rm = TRUE)
}

var_per_tree[is.na(var_per_tree)] <- 0
summed_variance <- sum(var_per_tree)
weights <- var_per_tree / summed_variance

return(weights)
}

# ---------------------------------------------------------
# Remaining algorithms require tree-based improvements to the predicted values
# ---------------------------------------------------------

init_vec <- rep_len(as.numeric(init_score), n_obs)

# Lookup: leaf_index -> leaf_value for each tree
Expand Down Expand Up @@ -241,6 +303,27 @@ extract_tree_weights <- function(model, leaf_idx, init_score, outcome) {
# Improvement per tree = previous error - next error
prev_err <- tree_errors[, 1:n_trees, drop = FALSE]
next_err <- tree_errors[, 2:(n_trees + 1L), drop = FALSE]

# ---------------------------------------------------------
# Unweighted_with_error_reduction:
# Weights are 1/n_improving trees for trees which reduce errors, 0 otherwise.
# This returns a matrix with dimensions of observations x trees.
# ---------------------------------------------------------
if (algorithm == "unweighted_with_error_reduction") {
improving <- prev_err > next_err
n_improving <- rowSums(improving)

weights <- improving / n_improving

return(weights)
}

# ---------------------------------------------------------
# Error_reduction:
# Weights are proportional to the reduction in error (prev_err - next_err) for
# improving trees, 0 otherwise. This returns a matrix with dimensions of
# observations x trees.
# ---------------------------------------------------------
diff_in_errors <- pmax(0, prev_err - next_err)
dim(diff_in_errors) <- dim(prev_err)

Expand Down
12 changes: 12 additions & 0 deletions params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,18 @@ ratio_study:
comp:
# Number of comps to generate for each PIN/card
num_comps: 5
# Algorithm used to weight trees for the comps similarity score.
# Valid options:
# - "unweighted": Equal weight (1/n_trees) for every tree. No
# observation-level variation. Returns a vector of weights.
# - "unweighted_with_error_reduction": Binary 1/0 per tree per observation
# (1 if tree reduces the training sale's prediction error, 0 otherwise),
# then row-normalized. Returns a matrix of weights.
# - "error_reduction": Proportional error reduction for training data sale per
# tree, row-normalized. Returns a matrix of weights.
# - "prediction_variance": Variance of each tree's leaf values across
# training observations, normalized to sum to 1. Returns a vector of weights.
algorithm: unweighted

# Export -----------------------------------------------------------------------

Expand Down
30 changes: 19 additions & 11 deletions pipeline/04-interpret.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ reticulate::py_require(
purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source)




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 2. Load Data -----------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down Expand Up @@ -155,8 +153,6 @@ if (shap_enable) {
}




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 4. Calculate Feature Importance ----------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand All @@ -176,8 +172,6 @@ lightgbm::lgb.importance(lgbm_final_full_fit$fit) %>%
write_parquet(paths$output$feature_importance$local)




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 5. Find Comparables ---------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down Expand Up @@ -271,19 +265,33 @@ if (comp_enable) {
model = lgbm_final_full_fit$fit,
leaf_idx = as.matrix(training_leaf_nodes),
init_score = mean(training_data$meta_sale_price, na.rm = TRUE),
algorithm = params$comp$algorithm,
outcome = training_data$meta_sale_price
)

if (length(tree_weights) == 0) {
message("Warning: tree_weights are empty")
}
if (all(rowSums(tree_weights) %in% c(0, 1))) {
message("Warning: tree_weights do not sum to 1 or 0 for each row")
message("First 5 weights:")
print(head(tree_weights, 5))
if (is.matrix(tree_weights)) {
if (!all(rowSums(tree_weights) %in% c(0, 1))) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the negation here because unless I'm reading incorrectly, I think that we had this backwards?

message("Warning: tree_weights do not sum to 1 or 0 for each row")
message("First 5 weights:")
print(head(tree_weights, 5))
}
} else {
tree_weights_sum <- sum(tree_weights)
if (!isTRUE(all.equal(tree_weights_sum, 1))) {
stop(
"Tree weights vector does not sum to 1 (got ", tree_weights_sum, "). ",
"All sales would have a score of 0 if weights sum to 0."
)
}
message(
"Tree weights are a vector of length ", length(tree_weights),
" (same weights for all training observations)"
)
}


# Make sure that the leaf node tibbles are all integers, which is what
# the comps algorithm expects
leaf_nodes <- leaf_nodes %>%
Expand Down
1 change: 1 addition & 0 deletions pipeline/05-finalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ metadata <- tibble::tibble(
ratio_study_near_column = params$ratio_study$near_column,
ratio_study_num_quantile = list(params$ratio_study$num_quantile),
shap_enable = shap_enable,
comp_algorithm = params$comp$algorithm,
comp_enable = comp_enable,
comp_num_comps = params$comp$num_comps,
cv_enable = cv_enable,
Expand Down
84 changes: 63 additions & 21 deletions python/comps.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,19 @@ def get_comps(
lightgbm leaf node assignments (`observation_df`) compared to a second
dataframe of leaf node assignments (`comparison_df`).

Leaf nodes are weighted according to a tree importance matrix `weights`
and used to generate a similarity score. The function returns two
dataframes: One containing the indices of the most similar compararables
Leaf nodes are weighted according to a tree importance vector or matrix
`weights` and used to generate a similarity score. The function returns two
dataframes: One containing the indices of the most similar comparables
and the other containing their corresponding similarity scores.

Weights can be:
- A 1-D array of shape (n_trees,) for algorithms that produce a single
weight per tree (e.g. "unweighted", "prediction_variance"). Will be
reshaped to (1, n_trees) before being passed to numba
- A 2-D matrix of shape (n_training_obs, n_trees) for algorithms that
produce per-observation weights (e.g. "error_reduction",
"unweighted_with_error_reduction")

More details on the underlying algorithm can be found here:
https://ccao-data.github.io/lightsnip/articles/finding-comps.html

Expand All @@ -33,7 +41,7 @@ def get_comps(
comparables.
weights (numpy.ndarray):
Importance weights for leaf nodes, used to compute similarity
scores.
scores. Either 1-D (n_trees,) or 2-D (n_comparisons, n_trees).
num_comps (int, optional):
Number of top comparables to return for each observation.
Default is 5.
Expand All @@ -59,16 +67,34 @@ def get_comps(
f"({observation_df.shape[1]}) "
f"must match `comparison_df` ({comparison_df.shape[1]})"
)
if comparison_df.shape != weights.shape:

if not isinstance(weights, np.ndarray):
weights = np.asarray(weights)

# Normalize weights to a 2-D matrix so that we can use a single numba
# kernel for both vector and matrix weights. A 1-D vector of per-tree
# weights is reshaped to (1, n_trees); the numba kernel detects this
# shape and broadcasts the single row to all comparison observations.
if weights.ndim == 1:
if weights.shape[0] != comparison_df.shape[1]:
raise ValueError(
f"`weights` length {weights.shape[0]} must equal number of "
f"trees {comparison_df.shape[1]}"
)
weights_matrix = weights.reshape(1, -1).astype(np.float32, copy=False)
elif weights.ndim == 2:
if comparison_df.shape != weights.shape:
raise ValueError(
f"`comparison_df.shape` {comparison_df.shape} must match "
f"`weights.shape` {weights.shape}"
)
weights_matrix = weights.astype(np.float32, copy=False)
else:
raise ValueError(
f"`comparison_df.shape` {comparison_df.shape} must match "
f"`weights.shape` {weights.shape}"
"`weights` must be a 1-D vector (n_trees,) or 2-D matrix "
f"(n_comparisons, n_trees), got {weights.ndim}-D"
)

# Convert the weights to a numpy array so that we can take advantage of
# numba acceleration later on
weights_matrix = np.asarray(weights, dtype=np.float32)

# Chunk the observations so that the script can periodically report progress
observation_df["chunk"] = pd.cut(
observation_df.index, bins=num_chunks, labels=False
Expand Down Expand Up @@ -137,12 +163,20 @@ def _get_top_n_comps(
observations in a tree model, a matrix of weights for each obs/tree, and an
integer `num_comps`, and returns a matrix where each observation is scored
by similarity to observations in the comparison matrix and the top N scores
are returned along with the indexes of the comparison observations."""
are returned along with the indexes of the comparison observations.

The weights_matrix is always 2-D. If its first dimension is 1, the single
row of weights is broadcast to all comparison observations (i.e. tree-level
weights shared across all comparisons). Otherwise, each comparison
observation y_i uses its own row of weights."""
num_observations = len(leaf_node_matrix)
num_possible_comparisons = len(comparison_leaf_node_matrix)
idx_dtype = np.int32
score_dtype = np.float32

# Detect whether we have shared (vector-style) or per-observation weights
shared_weights = weights_matrix.shape[0] == 1

# Store scores and indexes in two separate arrays rather than a 3d matrix
# for simplicity (array of tuples does not convert to pandas properly).
# Indexes default to -1, which is an impossible index and so is a signal
Expand All @@ -156,12 +190,14 @@ def _get_top_n_comps(
# low memory footprint
for y_i in range(num_possible_comparisons):
similarity_score = 0.0
for tree_idx in range(len(leaf_node_matrix[x_i])):
# Use row 0 for shared weights, row y_i for per-obs weights
w_i = 0 if shared_weights else y_i
for tree_idx in range(leaf_node_matrix.shape[1]):
if (
leaf_node_matrix[x_i][tree_idx]
== comparison_leaf_node_matrix[y_i][tree_idx]
leaf_node_matrix[x_i, tree_idx]
== comparison_leaf_node_matrix[y_i, tree_idx]
):
similarity_score += weights_matrix[y_i][tree_idx]
similarity_score += weights_matrix[w_i, tree_idx]

# See if the score is higher than any of the top N
# comps, and store it in the sorted comps array if it is.
Expand Down Expand Up @@ -198,18 +234,24 @@ def insert_at_idx_and_shift(
num_trees = 500
num_obs = 20001
num_comparisons = 10000
mean_sale_price = 350000
std_deviation = 110000

leaf_nodes = pd.DataFrame(np.random.randint(0, num_obs, size=[num_obs, num_trees]))
training_leaf_nodes = pd.DataFrame(
np.random.randint(0, num_comparisons, size=[num_comparisons, num_trees])
)
tree_weights = np.asarray(

# Test with matrix weights (error_reduction style)
tree_weights_matrix = np.asarray(
[np.random.dirichlet(np.ones(num_trees)) for _ in range(num_comparisons)]
)
start = time.time()
get_comps(leaf_nodes, training_leaf_nodes, tree_weights_matrix)
end = time.time()
print(f"get_comps (matrix weights) runtime: {end - start}s")

# Test with vector weights (unweighted / prediction_variance style)
tree_weights_vector = np.random.dirichlet(np.ones(num_trees))
start = time.time()
get_comps(leaf_nodes, training_leaf_nodes, tree_weights)
get_comps(leaf_nodes, training_leaf_nodes, tree_weights_vector)
end = time.time()
print(f"get_comps runtime: {end - start}s")
print(f"get_comps (vector weights) runtime: {end - start}s")
Loading
Loading