Skip to content
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
5bf04c5
4 working functions
Damonamajor Feb 18, 2026
22e38e5
Update comps docs
wagnerlmichael Feb 18, 2026
5b655cf
Add algorithm param for extract_tree_weights
wagnerlmichael Feb 18, 2026
6f0ece5
Add input checking
wagnerlmichael Feb 18, 2026
363b108
Adjust tree_weights shape checking
wagnerlmichael Feb 18, 2026
123dd81
Test vector inclusion
wagnerlmichael Feb 18, 2026
4c0d8cc
Attempt using only 1 get_top_comps function
wagnerlmichael Feb 18, 2026
b52fbd8
Fix message
wagnerlmichael Feb 20, 2026
1923e19
Switch boolean check
wagnerlmichael Feb 20, 2026
972772c
Style
wagnerlmichael Feb 20, 2026
1f6bf19
Format
wagnerlmichael Feb 20, 2026
365dee4
Lint
wagnerlmichael Feb 20, 2026
fa8ce67
Lint
wagnerlmichael Feb 20, 2026
b433e60
Merge branch 'master' into 405-persist-all-possible-significant-sales…
wagnerlmichael Feb 20, 2026
d32eb89
Update R/helpers.R
wagnerlmichael Feb 23, 2026
441c4cd
Update R/helpers.R
wagnerlmichael Feb 23, 2026
5ca1ed7
Update R/helpers.R
wagnerlmichael Feb 23, 2026
1e32a95
Update tests to accommodate vector weight support
wagnerlmichael Feb 23, 2026
899450c
Remove redundant test
wagnerlmichael Feb 23, 2026
4e269fc
Fix failing Python test infrastructure on CI
jeancochrane Feb 23, 2026
3396b13
improve commenting
Damonamajor Feb 23, 2026
f0d3540
Merge branch 'master' into 405-persist-all-possible-significant-sales…
wagnerlmichael Feb 23, 2026
7136808
Make sure to install test dependencies in `test` workflow
jeancochrane Feb 23, 2026
82605f8
Fix incorrect path reference in interpret stage
jeancochrane Feb 23, 2026
1200ea3
Check to make sure weights sum to 1
wagnerlmichael Feb 23, 2026
39d4d03
Merge branch '405-persist-all-possible-significant-sales-algorithms-t…
wagnerlmichael Feb 23, 2026
51c582c
Merge branch 'jeancochrane/fix-failing-test-workflow' into 405-persis…
wagnerlmichael Feb 23, 2026
1cfb285
Update helpers.R
Damonamajor Feb 23, 2026
6bb28ec
Merge branch 'master' into 405-persist-all-possible-significant-sales…
jeancochrane Feb 24, 2026
dc09147
Add param to finalize.R
Damonamajor Feb 24, 2026
2215b3d
alphabetize
Damonamajor Feb 24, 2026
0d55dc1
Add in-place adjustment
wagnerlmichael Feb 25, 2026
a775f6e
Merge branch '405-persist-all-possible-significant-sales-algorithms-t…
wagnerlmichael Feb 25, 2026
ffa4f1b
Remove space
wagnerlmichael Feb 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,20 @@ jobs:
uses: astral-sh/setup-uv@v4
with:
enable-cache: true
cache-dependency-glob: requirements.txt
cache-suffix: pytest
cache-dependency-glob: |
python/pyproject.toml
python/uv.lock

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: 3.12

- name: Install dependencies
working-directory: python
shell: bash
run: |
uv pip install -r requirements.txt
uv pip install pytest~=8.3.5
run: uv pip install .[tests]

- name: Run Python tests
shell: bash
Expand Down
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,4 @@ cache/
# Ignore scratch documents
scratch*.*

# Python files
__pycache__

/.quarto/
103 changes: 93 additions & 10 deletions R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -184,28 +184,90 @@ extract_num_iterations <- function(x) {
}

# Helper function to return weights for comps
# Computes per-tree weights from cumulative leaf node values.

# Basic Steps
# For every observation, map its assigned leaf index in
# each tree to the corresponding leaf value.
# Compute the row-wise cumulative sums of these
# leaf values (stand-in for training data predictions).
# Calculate the absolute prediction error.
# Compute the reduction in error.
# Normalize these improvements so that row-weights sum to 1.
# The `extract_tree_weights` function allows the user to return weights
# for each tree in a LightGBM model. Based on internal testing, we currently
# default to an unweighted value of 1 / n_trees for each tree. This returns
# a single vector with a length of the number of trees.

# Inputs:
# model: Lightgbm model
# leaf_idx: integer matrix [training data x trees] of leaf indices
# init_score: mean value of sale prices in the training data
# algorithm: type of algorithm to use. Set in params.yaml. Possible types are:
# unweighted, unweighted_with_error_reduction, error_reduction,
# and prediction_variance
# outcome: Predicted FMV values for each observation in the training data
# Returns:
# weights: numeric matrix [n_obs x n_trees] where each row sums to 1
extract_tree_weights <- function(model, leaf_idx, init_score, outcome) {
extract_tree_weights <- function(model,
leaf_idx,
init_score = NULL,
algorithm = "unweighted",
outcome = NULL) {
n_obs <- nrow(leaf_idx)
n_trees <- ncol(leaf_idx)

# Validate algorithm arg
valid_algorithms <- c(
"unweighted",
"prediction_variance",
"unweighted_with_error_reduction",
"error_reduction"
)

if (!algorithm %in% valid_algorithms) {
stop(
"Invalid algorithm '", algorithm, "'. Must be one of: ",
paste0(valid_algorithms, collapse = ", ")
)
}

# ---------------------------------------------------------
# Unweighted:
# Vector with 1/n_trees for each tree. This is the default input.
# This returns a vector with the length of the number of trees.
# ---------------------------------------------------------
if (algorithm == "unweighted") {
weights <- rep(1 / n_trees, n_trees)

return(weights)
}

# ---------------------------------------------------------
# Prediction_variance:
# Vector for tree weights based on variance of leaf values across data.
# This returns a vector with the length of the number of trees.
# ---------------------------------------------------------
if (algorithm == "prediction_variance") {
tree_dt <- lgb.model.dt.tree(model)
leaf_lookup <- tree_dt[
!is.na(leaf_index),
c("tree_index", "leaf_index", "leaf_value")
]

var_per_tree <- numeric(n_trees)

for (t in seq_len(n_trees)) {
# LightGBM is 0-indexed
this_tree <- subset(leaf_lookup, tree_index == (t - 1L))
m <- match(leaf_idx[, t], this_tree$leaf_index)
# incremental outputs for this tree across training rows
incr <- this_tree$leaf_value[m]
var_per_tree[t] <- stats::var(incr, na.rm = TRUE)
}

var_per_tree[is.na(var_per_tree)] <- 0
summed_variance <- sum(var_per_tree)
weights <- var_per_tree / summed_variance

return(weights)
}

# ---------------------------------------------------------
# Remaining algorithms require tree-based improvements to the predicted values
# ---------------------------------------------------------

init_vec <- rep_len(as.numeric(init_score), n_obs)

# Lookup: leaf_index -> leaf_value for each tree
Expand Down Expand Up @@ -241,6 +303,27 @@ extract_tree_weights <- function(model, leaf_idx, init_score, outcome) {
# Improvement per tree = previous error - next error
prev_err <- tree_errors[, 1:n_trees, drop = FALSE]
next_err <- tree_errors[, 2:(n_trees + 1L), drop = FALSE]

# ---------------------------------------------------------
# Unweighted_with_error_reduction:
# Weights are 1/n_improving trees for trees which reduce errors, 0 otherwise.
# This returns a matrix with dimensions of observations x trees.
# ---------------------------------------------------------
if (algorithm == "unweighted_with_error_reduction") {
improving <- prev_err > next_err
n_improving <- rowSums(improving)

weights <- improving / n_improving

return(weights)
}

# ---------------------------------------------------------
# Proportional error reduction:
# Weights are proportional to the reduction in error (prev_err - next_err) for
# improving trees, 0 otherwise. This returns a matrix with dimensions of
# observations x trees.
# ---------------------------------------------------------
diff_in_errors <- pmax(0, prev_err - next_err)
dim(diff_in_errors) <- dim(prev_err)

Expand Down
12 changes: 12 additions & 0 deletions params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,18 @@ ratio_study:
comp:
# Number of comps to generate for each PIN/card
num_comps: 5
# Algorithm used to weight trees for the comps similarity score.
# Valid options:
# - "unweighted": Equal weight (1/n_trees) for every tree. No
# observation-level variation. Returns a vector of weights.
# - "unweighted_with_error_reduction": Binary 1/0 per tree per observation
# (1 if tree reduces the training sale's prediction error, 0 otherwise),
# then row-normalized. Returns a matrix of weights.
# - "error_reduction": Proportional error reduction for training data sale per
# tree, row-normalized. Returns a matrix of weights.
# - "prediction_variance": Variance of each tree's leaf values across
# training observations, normalized to sum to 1. Returns a vector of weights.
algorithm: unweighted

# Export -----------------------------------------------------------------------

Expand Down
36 changes: 24 additions & 12 deletions pipeline/04-interpret.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ tictoc::tic("Interpret")
# configures reticulate to use uv to install those dependencies. Reticulate
# will install the dependencies when we import them. In this particular script,
# we don't import Python dependencies directly, but rather we import the
# comps module which then imports these dependencies
# comps module which then imports these dependencies.
#
# Because the reticulate uv integration is not very sophisticated, this
# dependency list is duplicated in `python/pyproject.toml`. If you add or
# change any dependencies in this list, make sure to change them there too.
reticulate::py_require(
packages = c("numpy==2.2.*", "numba==0.62.*", "pandas==2.3.*"),
python_version = "3.10"
Expand All @@ -24,8 +28,6 @@ reticulate::py_require(
purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source)




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 2. Load Data -----------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down Expand Up @@ -151,8 +153,6 @@ if (shap_enable) {
}




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 4. Calculate Feature Importance ----------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand All @@ -172,8 +172,6 @@ lightgbm::lgb.importance(lgbm_final_full_fit$fit) %>%
write_parquet(paths$output$feature_importance$local)




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 5. Find Comparables ---------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down Expand Up @@ -267,19 +265,33 @@ if (comp_enable) {
model = lgbm_final_full_fit$fit,
leaf_idx = as.matrix(training_leaf_nodes),
init_score = mean(training_data$meta_sale_price, na.rm = TRUE),
algorithm = params$comp$algorithm,
outcome = training_data$meta_sale_price
)

if (length(tree_weights) == 0) {
message("Warning: tree_weights are empty")
}
if (all(rowSums(tree_weights) %in% c(0, 1))) {
message("Warning: tree_weights do not sum to 1 or 0 for each row")
message("First 5 weights:")
print(head(tree_weights, 5))
if (is.matrix(tree_weights)) {
if (!all(rowSums(tree_weights) %in% c(0, 1))) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the negation here because unless I'm reading incorrectly, I think that we had this backwards?

message("Warning: tree_weights do not sum to 1 or 0 for each row")
message("First 5 weights:")
print(head(tree_weights, 5))
}
} else {
tree_weights_sum <- sum(tree_weights)
if (!isTRUE(all.equal(tree_weights_sum, 1))) {
stop(
"Tree weights vector does not sum to 1 (got ", tree_weights_sum, "). ",
"All sales would have a score of 0 if weights sum to 0."
)
}
message(
"Tree weights are a vector of length ", length(tree_weights),
" (same weights for all training observations)"
)
}


# Make sure that the leaf node tibbles are all integers, which is what
# the comps algorithm expects
leaf_nodes <- leaf_nodes %>%
Expand Down
7 changes: 4 additions & 3 deletions python/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Ignore uv lockfile because we use requirements.txt for this project, in order
# to make it compatible with reticulate
uv.lock
# Python artifacts
__pycache__
build/
*.egg-info
Loading