Skip to content
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
5bf04c5
4 working functions
Damonamajor Feb 18, 2026
22e38e5
Update comps docs
wagnerlmichael Feb 18, 2026
5b655cf
Add algorithm param for extract_tree_weights
wagnerlmichael Feb 18, 2026
6f0ece5
Add input checking
wagnerlmichael Feb 18, 2026
363b108
Adjust tree_weights shape checking
wagnerlmichael Feb 18, 2026
123dd81
Test vector inclusion
wagnerlmichael Feb 18, 2026
4c0d8cc
Attempt using only 1 get_top_comps function
wagnerlmichael Feb 18, 2026
b52fbd8
Fix message
wagnerlmichael Feb 20, 2026
1923e19
Switch boolean check
wagnerlmichael Feb 20, 2026
972772c
Style
wagnerlmichael Feb 20, 2026
1f6bf19
Format
wagnerlmichael Feb 20, 2026
365dee4
Lint
wagnerlmichael Feb 20, 2026
fa8ce67
Lint
wagnerlmichael Feb 20, 2026
b433e60
Merge branch 'master' into 405-persist-all-possible-significant-sales…
wagnerlmichael Feb 20, 2026
d32eb89
Update R/helpers.R
wagnerlmichael Feb 23, 2026
441c4cd
Update R/helpers.R
wagnerlmichael Feb 23, 2026
5ca1ed7
Update R/helpers.R
wagnerlmichael Feb 23, 2026
1e32a95
Update tests to accommodate vector weight support
wagnerlmichael Feb 23, 2026
899450c
Remove redundant test
wagnerlmichael Feb 23, 2026
4e269fc
Fix failing Python test infrastructure on CI
jeancochrane Feb 23, 2026
3396b13
improve commenting
Damonamajor Feb 23, 2026
f0d3540
Merge branch 'master' into 405-persist-all-possible-significant-sales…
wagnerlmichael Feb 23, 2026
7136808
Make sure to install test dependencies in `test` workflow
jeancochrane Feb 23, 2026
82605f8
Fix incorrect path reference in interpret stage
jeancochrane Feb 23, 2026
1200ea3
Check to make sure weights sum to 1
wagnerlmichael Feb 23, 2026
39d4d03
Merge branch '405-persist-all-possible-significant-sales-algorithms-t…
wagnerlmichael Feb 23, 2026
51c582c
Merge branch 'jeancochrane/fix-failing-test-workflow' into 405-persis…
wagnerlmichael Feb 23, 2026
1cfb285
Update helpers.R
Damonamajor Feb 23, 2026
6bb28ec
Merge branch 'master' into 405-persist-all-possible-significant-sales…
jeancochrane Feb 24, 2026
dc09147
Add param to finalize.R
Damonamajor Feb 24, 2026
2215b3d
alphabetize
Damonamajor Feb 24, 2026
0d55dc1
Add in-place adjustment
wagnerlmichael Feb 25, 2026
a775f6e
Merge branch '405-persist-all-possible-significant-sales-algorithms-t…
wagnerlmichael Feb 25, 2026
ffa4f1b
Remove space
wagnerlmichael Feb 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,20 @@ jobs:
uses: astral-sh/setup-uv@v4
with:
enable-cache: true
cache-dependency-glob: requirements.txt
cache-suffix: pytest
cache-dependency-glob: |
python/pyproject.toml
python/uv.lock

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: 3.12

- name: Install dependencies
working-directory: python
shell: bash
run: |
uv pip install -r requirements.txt
uv pip install pytest~=8.3.5
run: uv pip install .[tests]

- name: Run Python tests
shell: bash
Expand Down
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,4 @@ cache/
# Ignore scratch documents
scratch*.*

# Python files
__pycache__

/.quarto/
103 changes: 93 additions & 10 deletions R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -184,28 +184,90 @@ extract_num_iterations <- function(x) {
}

# Helper function to return weights for comps
# Computes per-tree weights from cumulative leaf node values.

# Basic Steps
# For every observation, map its assigned leaf index in
# each tree to the corresponding leaf value.
# Compute the row-wise cumulative sums of these
# leaf values (stand-in for training data predictions).
# Calculate the absolute prediction error.
# Compute the reduction in error.
# Normalize these improvements so that row-weights sum to 1.
# The `extract_tree_weights` function allows the user to return weights
# for each tree in a LightGBM model. Based on internal testing, we currently
# default to an unweighted value of 1 / n_trees for each tree. This returns
# a single vector with a length of the number of trees.

# Inputs:
# model: Lightgbm model
# leaf_idx: integer matrix [training data x trees] of leaf indices
# init_score: mean value of sale prices in the training data
# algorithm: type of algorithm to use. Set in params.yaml. Possible types are:
# unweighted, unweighted_with_error_reduction, error_reduction,
# and prediction_variance
# outcome: Predicted FMV values for each observation in the training data
# Returns:
# weights: numeric matrix [n_obs x n_trees] where each row sums to 1
extract_tree_weights <- function(model, leaf_idx, init_score, outcome) {
extract_tree_weights <- function(model,
leaf_idx,
init_score = NULL,
algorithm = "unweighted",
outcome = NULL) {
n_obs <- nrow(leaf_idx)
n_trees <- ncol(leaf_idx)

# Validate algorithm arg
valid_algorithms <- c(
"unweighted",
"prediction_variance",
"unweighted_with_error_reduction",
"error_reduction"
)

if (!algorithm %in% valid_algorithms) {
stop(
"Invalid algorithm '", algorithm, "'. Must be one of: ",
paste0(valid_algorithms, collapse = ", ")
)
}

# ---------------------------------------------------------
# Unweighted:
# Vector with 1/n_trees for each tree. This is the default input.
# This returns a vector with the length of the number of trees.
# ---------------------------------------------------------
if (algorithm == "unweighted") {
weights <- rep(1 / n_trees, n_trees)

return(weights)
}

# ---------------------------------------------------------
# Prediction_variance:
# Vector for tree weights based on variance of leaf values across data.
# This returns a vector with the length of the number of trees.
# ---------------------------------------------------------
if (algorithm == "prediction_variance") {
tree_dt <- lgb.model.dt.tree(model)
leaf_lookup <- tree_dt[
!is.na(leaf_index),
c("tree_index", "leaf_index", "leaf_value")
]

var_per_tree <- numeric(n_trees)

for (t in seq_len(n_trees)) {
# LightGBM is 0-indexed
this_tree <- subset(leaf_lookup, tree_index == (t - 1L))
m <- match(leaf_idx[, t], this_tree$leaf_index)
# incremental outputs for this tree across training rows
incr <- this_tree$leaf_value[m]
var_per_tree[t] <- stats::var(incr, na.rm = TRUE)
}

var_per_tree[is.na(var_per_tree)] <- 0
summed_variance <- sum(var_per_tree)
weights <- var_per_tree / summed_variance

return(weights)
}

# ---------------------------------------------------------
# Remaining algorithms require tree-based improvements to the predicted values
# ---------------------------------------------------------

init_vec <- rep_len(as.numeric(init_score), n_obs)

# Lookup: leaf_index -> leaf_value for each tree
Expand Down Expand Up @@ -241,6 +303,27 @@ extract_tree_weights <- function(model, leaf_idx, init_score, outcome) {
# Improvement per tree = previous error - next error
prev_err <- tree_errors[, 1:n_trees, drop = FALSE]
next_err <- tree_errors[, 2:(n_trees + 1L), drop = FALSE]

# ---------------------------------------------------------
# Unweighted_with_error_reduction:
# Weights are 1/n_improving trees for trees which reduce errors, 0 otherwise.
# This returns a matrix with dimensions of observations x trees.
# ---------------------------------------------------------
if (algorithm == "unweighted_with_error_reduction") {
improving <- prev_err > next_err
n_improving <- rowSums(improving)

weights <- improving / n_improving

return(weights)
}

# ---------------------------------------------------------
# Proportional error reduction:
# Weights are proportional to the reduction in error (prev_err - next_err) for
# improving trees, 0 otherwise. This returns a matrix with dimensions of
# observations x trees.
# ---------------------------------------------------------
diff_in_errors <- pmax(0, prev_err - next_err)
dim(diff_in_errors) <- dim(prev_err)

Expand Down
12 changes: 12 additions & 0 deletions params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,18 @@ ratio_study:
comp:
# Number of comps to generate for each PIN/card
num_comps: 5
# Algorithm used to weight trees for the comps similarity score.
# Valid options:
# - "unweighted": Equal weight (1/n_trees) for every tree. No
# observation-level variation. Returns a vector of weights.
# - "unweighted_with_error_reduction": Binary 1/0 per tree per observation
# (1 if tree reduces the training sale's prediction error, 0 otherwise),
# then row-normalized. Returns a matrix of weights.
# - "error_reduction": Proportional error reduction for training data sale per
# tree, row-normalized. Returns a matrix of weights.
# - "prediction_variance": Variance of each tree's leaf values across
# training observations, normalized to sum to 1. Returns a vector of weights.
algorithm: unweighted

# Export -----------------------------------------------------------------------

Expand Down
36 changes: 24 additions & 12 deletions pipeline/04-interpret.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ tictoc::tic("Interpret")
# configures reticulate to use uv to install those dependencies. Reticulate
# will install the dependencies when we import them. In this particular script,
# we don't import Python dependencies directly, but rather we import the
# comps module which then imports these dependencies
# comps module which then imports these dependencies.
#
# Because the reticulate uv integration is not very sophisticated, this
# dependency list is duplicated in `python/pyproject.toml`. If you add or
# change any dependencies in this list, make sure to change them there too.
reticulate::py_require(
packages = c("numpy==2.2.*", "numba==0.62.*", "pandas==2.3.*"),
python_version = "3.10"
Expand All @@ -24,8 +28,6 @@ reticulate::py_require(
purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source)




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 2. Load Data -----------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down Expand Up @@ -151,8 +153,6 @@ if (shap_enable) {
}




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 4. Calculate Feature Importance ----------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand All @@ -172,8 +172,6 @@ lightgbm::lgb.importance(lgbm_final_full_fit$fit) %>%
write_parquet(paths$output$feature_importance$local)




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 5. Find Comparables ---------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down Expand Up @@ -267,19 +265,33 @@ if (comp_enable) {
model = lgbm_final_full_fit$fit,
leaf_idx = as.matrix(training_leaf_nodes),
init_score = mean(training_data$meta_sale_price, na.rm = TRUE),
algorithm = params$comp$algorithm,
outcome = training_data$meta_sale_price
)

if (length(tree_weights) == 0) {
message("Warning: tree_weights are empty")
}
if (all(rowSums(tree_weights) %in% c(0, 1))) {
message("Warning: tree_weights do not sum to 1 or 0 for each row")
message("First 5 weights:")
print(head(tree_weights, 5))
if (is.matrix(tree_weights)) {
if (!all(rowSums(tree_weights) %in% c(0, 1))) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the negation here because unless I'm reading incorrectly, I think that we had this backwards?

message("Warning: tree_weights do not sum to 1 or 0 for each row")
message("First 5 weights:")
print(head(tree_weights, 5))
}
} else {
tree_weights_sum <- sum(tree_weights)
if (!isTRUE(all.equal(tree_weights_sum, 1))) {
stop(
"Tree weights vector does not sum to 1 (got ", tree_weights_sum, "). ",
"All sales would have a score of 0 if weights sum to 0."
)
}
message(
"Tree weights are a vector of length ", length(tree_weights),
" (same weights for all training observations)"
)
}


# Make sure that the leaf node tibbles are all integers, which is what
# the comps algorithm expects
leaf_nodes <- leaf_nodes %>%
Expand Down
7 changes: 4 additions & 3 deletions python/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Ignore uv lockfile because we use requirements.txt for this project, in order
# to make it compatible with reticulate
uv.lock
# Python artifacts
__pycache__
build/
*.egg-info
Loading