Skip to content
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
5bf04c5
4 working functions
Damonamajor Feb 18, 2026
22e38e5
Update comps docs
wagnerlmichael Feb 18, 2026
5b655cf
Add algorithm param for extract_tree_weights
wagnerlmichael Feb 18, 2026
6f0ece5
Add input checking
wagnerlmichael Feb 18, 2026
363b108
Adjust tree_weights shape checking
wagnerlmichael Feb 18, 2026
123dd81
Test vector inclusion
wagnerlmichael Feb 18, 2026
4c0d8cc
Attempt using only 1 get_top_comps function
wagnerlmichael Feb 18, 2026
b52fbd8
Fix message
wagnerlmichael Feb 20, 2026
1923e19
Switch boolean check
wagnerlmichael Feb 20, 2026
972772c
Style
wagnerlmichael Feb 20, 2026
1f6bf19
Format
wagnerlmichael Feb 20, 2026
365dee4
Lint
wagnerlmichael Feb 20, 2026
fa8ce67
Lint
wagnerlmichael Feb 20, 2026
b433e60
Merge branch 'master' into 405-persist-all-possible-significant-sales…
wagnerlmichael Feb 20, 2026
d32eb89
Update R/helpers.R
wagnerlmichael Feb 23, 2026
441c4cd
Update R/helpers.R
wagnerlmichael Feb 23, 2026
5ca1ed7
Update R/helpers.R
wagnerlmichael Feb 23, 2026
1e32a95
Update tests to accommodate vector weight support
wagnerlmichael Feb 23, 2026
899450c
Remove redundant test
wagnerlmichael Feb 23, 2026
4e269fc
Fix failing Python test infrastructure on CI
jeancochrane Feb 23, 2026
3396b13
improve commenting
Damonamajor Feb 23, 2026
f0d3540
Merge branch 'master' into 405-persist-all-possible-significant-sales…
wagnerlmichael Feb 23, 2026
7136808
Make sure to install test dependencies in `test` workflow
jeancochrane Feb 23, 2026
82605f8
Fix incorrect path reference in interpret stage
jeancochrane Feb 23, 2026
1200ea3
Check to make sure weights sum to 1
wagnerlmichael Feb 23, 2026
39d4d03
Merge branch '405-persist-all-possible-significant-sales-algorithms-t…
wagnerlmichael Feb 23, 2026
51c582c
Merge branch 'jeancochrane/fix-failing-test-workflow' into 405-persis…
wagnerlmichael Feb 23, 2026
1cfb285
Update helpers.R
Damonamajor Feb 23, 2026
6bb28ec
Merge branch 'master' into 405-persist-all-possible-significant-sales…
jeancochrane Feb 24, 2026
dc09147
Add param to finalize.R
Damonamajor Feb 24, 2026
2215b3d
alphabetize
Damonamajor Feb 24, 2026
0d55dc1
Add in-place adjustment
wagnerlmichael Feb 25, 2026
a775f6e
Merge branch '405-persist-all-possible-significant-sales-algorithms-t…
wagnerlmichael Feb 25, 2026
ffa4f1b
Remove space
wagnerlmichael Feb 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 93 additions & 10 deletions R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -184,28 +184,90 @@ extract_num_iterations <- function(x) {
}

# Helper function to return weights for comps
# Computes per-tree weights from cumulative leaf node values.

# Basic Steps
# For every observation, map its assigned leaf index in
# each tree to the corresponding leaf value.
# Compute the row-wise cumulative sums of these
# leaf values (stand-in for training data predictions).
# Calculate the absolute prediction error.
# Compute the reduction in error.
# Normalize these improvements so that row-weights sum to 1.
# The `extract_tree_weights` function allows the user to return weights
# for each tree in a LightGBM model. Based on internal testing, we currently
# default to an unweighted value of 1 / n_trees for each tree. This returns
# a single vector with a length of the number of trees.

# Inputs:
# model: Lightgbm model
# leaf_idx: integer matrix [training data x trees] of leaf indices
# init_score: mean value of sale prices in the training data
# algorithm: type of algorithm to use. Set in params.yaml. Possible types
# unweighted, unweighted_with_error_reduction, error_reduction,
# and prediction_variance
# outcome: Predicted FMV values for each observation in the training data
# Returns:
# weights: numeric matrix [n_obs x n_trees] where each row sums to 1
extract_tree_weights <- function(model, leaf_idx, init_score, outcome) {
extract_tree_weights <- function(model,
leaf_idx,
init_score = NULL,
algorithm = "unweighted",
outcome = NULL) {
n_obs <- nrow(leaf_idx)
n_trees <- ncol(leaf_idx)

# Validate algorithm arg
valid_algorithms <- c(
"unweighted",
"prediction_variance",
"unweighted_with_error_reduction",
"error_reduction"
)

if (!algorithm %in% valid_algorithms) {
stop(
"Invalid algorithm '", algorithm, "'. Must be one of: ",
paste0(valid_algorithms, collapse = ", ")
)
}

# ---------------------------------------------------------
# Unweighted:
# Vector with 1/n_trees for each tree. This is the default input.
# This returns a vector with the length of the number of trees.
# ---------------------------------------------------------
if (algorithm == "unweighted") {
weights <- rep(1 / n_trees, n_trees)

return(weights)
}

# ---------------------------------------------------------
# Prediction_variance:
# Vector for tree weights based on variance of leaf values across data.
# This returns a vector with the length of the number of trees.
# ---------------------------------------------------------
if (algorithm == "prediction_variance") {
tree_dt <- lgb.model.dt.tree(model)
leaf_lookup <- tree_dt[
!is.na(leaf_index),
c("tree_index", "leaf_index", "leaf_value")
]

var_per_tree <- numeric(n_trees)

for (t in seq_len(n_trees)) {
# LightGBM is 0-indexed
this_tree <- subset(leaf_lookup, tree_index == (t - 1L))
m <- match(leaf_idx[, t], this_tree$leaf_index)
# incremental outputs for this tree across training rows
incr <- this_tree$leaf_value[m]
var_per_tree[t] <- stats::var(incr, na.rm = TRUE)
}

var_per_tree[is.na(var_per_tree)] <- 0
summed_variance <- sum(var_per_tree)
weights <- var_per_tree / summed_variance

return(weights)
}

# ---------------------------------------------------------
# Remaining algorithms require tree-based improvements to the predicted values
# ---------------------------------------------------------

init_vec <- rep_len(as.numeric(init_score), n_obs)

# Lookup: leaf_index -> leaf_value for each tree
Expand Down Expand Up @@ -241,6 +303,27 @@ extract_tree_weights <- function(model, leaf_idx, init_score, outcome) {
# Improvement per tree = previous error - next error
prev_err <- tree_errors[, 1:n_trees, drop = FALSE]
next_err <- tree_errors[, 2:(n_trees + 1L), drop = FALSE]

# ---------------------------------------------------------
# Unweighted_with_error_reduction:
# Weights are 1/n_improving trees for trees which reduce errors, 0 otherwise.
# This returns a matrix with dimensions of observations x trees.
# ---------------------------------------------------------
if (algorithm == "unweighted_with_error_reduction") {
improving <- prev_err > next_err
n_improving <- rowSums(improving)

weights <- improving / n_improving

return(weights)
}

# ---------------------------------------------------------
# Error_reduction:
# Weights are proportional to the reduction in error (prev_err - next_err) for
# improving trees, 0 otherwise. This returns a matrix with dimensions of
# observations x trees.
# ---------------------------------------------------------
diff_in_errors <- pmax(0, prev_err - next_err)
dim(diff_in_errors) <- dim(prev_err)

Expand Down
12 changes: 12 additions & 0 deletions params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,18 @@ ratio_study:
comp:
# Number of comps to generate for each PIN/card
num_comps: 5
# Algorithm used to weight trees for the comps similarity score.
# Valid options:
# - "unweighted": Equal weight (1/n_trees) for every tree. No
# observation-level variation. Returns a vector of weights.
# - "unweighted_with_error_reduction": Binary 1/0 per tree per observation
# (1 if tree reduces the training sale's prediction error, 0 otherwise),
# then row-normalized. Returns a matrix of weights.
# - "error_reduction": Proportional error reduction for training data sale per
# tree, row-normalized. Returns a matrix of weights.
# - "prediction_variance": Variance of each tree's leaf values across
# training observations, normalized to sum to 1. Returns a vector of weights.
algorithm: unweighted

# Export -----------------------------------------------------------------------

Expand Down
30 changes: 19 additions & 11 deletions pipeline/04-interpret.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ reticulate::py_require(
purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source)




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 2. Load Data -----------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down Expand Up @@ -155,8 +153,6 @@ if (shap_enable) {
}




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 4. Calculate Feature Importance ----------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand All @@ -176,8 +172,6 @@ lightgbm::lgb.importance(lgbm_final_full_fit$fit) %>%
write_parquet(paths$output$feature_importance$local)




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 5. Find Comparables ---------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down Expand Up @@ -271,19 +265,33 @@ if (comp_enable) {
model = lgbm_final_full_fit$fit,
leaf_idx = as.matrix(training_leaf_nodes),
init_score = mean(training_data$meta_sale_price, na.rm = TRUE),
algorithm = params$comp$algorithm,
outcome = training_data$meta_sale_price
)

if (length(tree_weights) == 0) {
message("Warning: tree_weights are empty")
}
if (all(rowSums(tree_weights) %in% c(0, 1))) {
message("Warning: tree_weights do not sum to 1 or 0 for each row")
message("First 5 weights:")
print(head(tree_weights, 5))
if (is.matrix(tree_weights)) {
if (!all(rowSums(tree_weights) %in% c(0, 1))) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the negation here because unless I'm reading incorrectly, I think that we had this backwards?

message("Warning: tree_weights do not sum to 1 or 0 for each row")
message("First 5 weights:")
print(head(tree_weights, 5))
}
} else {
tree_weights_sum <- sum(tree_weights)
if (!isTRUE(all.equal(tree_weights_sum, 1))) {
stop(
"Tree weights vector does not sum to 1 (got ", tree_weights_sum, "). ",
"All sales would have a score of 0 if weights sum to 0."
)
}
message(
"Tree weights are a vector of length ", length(tree_weights),
" (same weights for all training observations)"
)
}


# Make sure that the leaf node tibbles are all integers, which is what
# the comps algorithm expects
leaf_nodes <- leaf_nodes %>%
Expand Down
1 change: 1 addition & 0 deletions pipeline/05-finalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ metadata <- tibble::tibble(
ratio_study_near_column = params$ratio_study$near_column,
ratio_study_num_quantile = list(params$ratio_study$num_quantile),
shap_enable = shap_enable,
comp_algorithm = params$comp$algorithm,
comp_enable = comp_enable,
comp_num_comps = params$comp$num_comps,
cv_enable = cv_enable,
Expand Down
84 changes: 63 additions & 21 deletions python/comps.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,19 @@ def get_comps(
lightgbm leaf node assignments (`observation_df`) compared to a second
dataframe of leaf node assignments (`comparison_df`).

Leaf nodes are weighted according to a tree importance matrix `weights`
and used to generate a similarity score. The function returns two
dataframes: One containing the indices of the most similar compararables
Leaf nodes are weighted according to a tree importance vector or matrix
`weights` and used to generate a similarity score. The function returns two
dataframes: One containing the indices of the most similar comparables
and the other containing their corresponding similarity scores.

Weights can be:
- A 1-D array of shape (n_trees,) for algorithms that produce a single
weight per tree (e.g. "unweighted", "prediction_variance"). Will be
reshaped to (1, n_trees) before being passed to numba
- A 2-D matrix of shape (n_training_obs, n_trees) for algorithms that
produce per-observation weights (e.g. "error_reduction",
"unweighted_with_error_reduction")

More details on the underlying algorithm can be found here:
https://ccao-data.github.io/lightsnip/articles/finding-comps.html

Expand All @@ -33,7 +41,7 @@ def get_comps(
comparables.
weights (numpy.ndarray):
Importance weights for leaf nodes, used to compute similarity
scores.
scores. Either 1-D (n_trees,) or 2-D (n_comparisons, n_trees).
num_comps (int, optional):
Number of top comparables to return for each observation.
Default is 5.
Expand All @@ -59,16 +67,34 @@ def get_comps(
f"({observation_df.shape[1]}) "
f"must match `comparison_df` ({comparison_df.shape[1]})"
)
if comparison_df.shape != weights.shape:

if not isinstance(weights, np.ndarray):
weights = np.asarray(weights)

# Normalize weights to a 2-D matrix so that we can use a single numba
# kernel for both vector and matrix weights. A 1-D vector of per-tree
# weights is reshaped to (1, n_trees); the numba kernel detects this
# shape and broadcasts the single row to all comparison observations.
if weights.ndim == 1:
if weights.shape[0] != comparison_df.shape[1]:
raise ValueError(
f"`weights` length {weights.shape[0]} must equal number of "
f"trees {comparison_df.shape[1]}"
)
weights_matrix = weights.reshape(1, -1).astype(np.float32, copy=False)
elif weights.ndim == 2:
if comparison_df.shape != weights.shape:
raise ValueError(
f"`comparison_df.shape` {comparison_df.shape} must match "
f"`weights.shape` {weights.shape}"
)
weights_matrix = weights.astype(np.float32, copy=False)
else:
raise ValueError(
f"`comparison_df.shape` {comparison_df.shape} must match "
f"`weights.shape` {weights.shape}"
"`weights` must be a 1-D vector (n_trees,) or 2-D matrix "
f"(n_comparisons, n_trees), got {weights.ndim}-D"
)

# Convert the weights to a numpy array so that we can take advantage of
# numba acceleration later on
weights_matrix = np.asarray(weights, dtype=np.float32)

# Chunk the observations so that the script can periodically report progress
observation_df["chunk"] = pd.cut(
observation_df.index, bins=num_chunks, labels=False
Expand Down Expand Up @@ -137,12 +163,20 @@ def _get_top_n_comps(
observations in a tree model, a matrix of weights for each obs/tree, and an
integer `num_comps`, and returns a matrix where each observation is scored
by similarity to observations in the comparison matrix and the top N scores
are returned along with the indexes of the comparison observations."""
are returned along with the indexes of the comparison observations.

The weights_matrix is always 2-D. If its first dimension is 1, the single
row of weights is broadcast to all comparison observations (i.e. tree-level
weights shared across all comparisons). Otherwise, each comparison
observation y_i uses its own row of weights."""
num_observations = len(leaf_node_matrix)
num_possible_comparisons = len(comparison_leaf_node_matrix)
idx_dtype = np.int32
score_dtype = np.float32

# Detect whether we have shared (vector-style) or per-observation weights
shared_weights = weights_matrix.shape[0] == 1

# Store scores and indexes in two separate arrays rather than a 3d matrix
# for simplicity (array of tuples does not convert to pandas properly).
# Indexes default to -1, which is an impossible index and so is a signal
Expand All @@ -156,12 +190,14 @@ def _get_top_n_comps(
# low memory footprint
for y_i in range(num_possible_comparisons):
similarity_score = 0.0
for tree_idx in range(len(leaf_node_matrix[x_i])):
# Use row 0 for shared weights, row y_i for per-obs weights
w_i = 0 if shared_weights else y_i
for tree_idx in range(leaf_node_matrix.shape[1]):
if (
leaf_node_matrix[x_i][tree_idx]
== comparison_leaf_node_matrix[y_i][tree_idx]
leaf_node_matrix[x_i, tree_idx]
== comparison_leaf_node_matrix[y_i, tree_idx]
):
similarity_score += weights_matrix[y_i][tree_idx]
similarity_score += weights_matrix[w_i, tree_idx]

# See if the score is higher than any of the top N
# comps, and store it in the sorted comps array if it is.
Expand Down Expand Up @@ -198,18 +234,24 @@ def insert_at_idx_and_shift(
num_trees = 500
num_obs = 20001
num_comparisons = 10000
mean_sale_price = 350000
std_deviation = 110000

leaf_nodes = pd.DataFrame(np.random.randint(0, num_obs, size=[num_obs, num_trees]))
training_leaf_nodes = pd.DataFrame(
np.random.randint(0, num_comparisons, size=[num_comparisons, num_trees])
)
tree_weights = np.asarray(

# Test with matrix weights (error_reduction style)
tree_weights_matrix = np.asarray(
[np.random.dirichlet(np.ones(num_trees)) for _ in range(num_comparisons)]
)
start = time.time()
get_comps(leaf_nodes, training_leaf_nodes, tree_weights_matrix)
end = time.time()
print(f"get_comps (matrix weights) runtime: {end - start}s")

# Test with vector weights (unweighted / prediction_variance style)
tree_weights_vector = np.random.dirichlet(np.ones(num_trees))
start = time.time()
get_comps(leaf_nodes, training_leaf_nodes, tree_weights)
get_comps(leaf_nodes, training_leaf_nodes, tree_weights_vector)
end = time.time()
print(f"get_comps runtime: {end - start}s")
print(f"get_comps (vector weights) runtime: {end - start}s")
Loading
Loading