-
Notifications
You must be signed in to change notification settings - Fork 17
Implement configurable options for comps algorithm methodology #449
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5bf04c5
22e38e5
5b655cf
6f0ece5
363b108
123dd81
4c0d8cc
b52fbd8
1923e19
972772c
1f6bf19
365dee4
fa8ce67
b433e60
d32eb89
441c4cd
5ca1ed7
1e32a95
899450c
4e269fc
3396b13
f0d3540
7136808
82605f8
1200ea3
39d4d03
51c582c
1cfb285
6bb28ec
dc09147
2215b3d
0d55dc1
a775f6e
ffa4f1b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,8 +28,6 @@ reticulate::py_require( | |
| purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source) | ||
|
|
||
|
|
||
|
|
||
|
|
||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| # 2. Load Data ----------------------------------------------------------------- | ||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
|
|
@@ -155,8 +153,6 @@ if (shap_enable) { | |
| } | ||
|
|
||
|
|
||
|
|
||
|
|
||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| # 4. Calculate Feature Importance ---------------------------------------------- | ||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
|
|
@@ -176,8 +172,6 @@ lightgbm::lgb.importance(lgbm_final_full_fit$fit) %>% | |
| write_parquet(paths$output$feature_importance$local) | ||
|
|
||
|
|
||
|
|
||
|
|
||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
| # 5. Find Comparables --------------------------------------------------------- | ||
| #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
|
|
@@ -271,19 +265,33 @@ if (comp_enable) { | |
| model = lgbm_final_full_fit$fit, | ||
| leaf_idx = as.matrix(training_leaf_nodes), | ||
| init_score = mean(training_data$meta_sale_price, na.rm = TRUE), | ||
| algorithm = params$comp$algorithm, | ||
| outcome = training_data$meta_sale_price | ||
| ) | ||
|
|
||
| if (length(tree_weights) == 0) { | ||
| message("Warning: tree_weights are empty") | ||
| } | ||
| if (all(rowSums(tree_weights) %in% c(0, 1))) { | ||
| message("Warning: tree_weights do not sum to 1 or 0 for each row") | ||
| message("First 5 weights:") | ||
| print(head(tree_weights, 5)) | ||
| if (is.matrix(tree_weights)) { | ||
| if (!all(rowSums(tree_weights) %in% c(0, 1))) { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added the negation here because unless I'm reading incorrectly, I think that we had this backwards? |
||
| message("Warning: tree_weights do not sum to 1 or 0 for each row") | ||
| message("First 5 weights:") | ||
| print(head(tree_weights, 5)) | ||
| } | ||
| } else { | ||
| tree_weights_sum <- sum(tree_weights) | ||
| if (!isTRUE(all.equal(tree_weights_sum, 1))) { | ||
| stop( | ||
| "Tree weights vector does not sum to 1 (got ", tree_weights_sum, "). ", | ||
| "All sales would have a score of 0 if weights sum to 0." | ||
| ) | ||
| } | ||
| message( | ||
| "Tree weights are a vector of length ", length(tree_weights), | ||
| " (same weights for all training observations)" | ||
| ) | ||
jeancochrane marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
|
|
||
| # Make sure that the leaf node tibbles are all integers, which is what | ||
| # the comps algorithm expects | ||
| leaf_nodes <- leaf_nodes %>% | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,11 +17,19 @@ def get_comps( | |
| lightgbm leaf node assignments (`observation_df`) compared to a second | ||
| dataframe of leaf node assignments (`comparison_df`). | ||
|
|
||
| Leaf nodes are weighted according to a tree importance matrix `weights` | ||
| and used to generate a similarity score. The function returns two | ||
| dataframes: One containing the indices of the most similar compararables | ||
| Leaf nodes are weighted according to a tree importance vector or matrix | ||
| `weights` and used to generate a similarity score. The function returns two | ||
| dataframes: One containing the indices of the most similar comparables | ||
| and the other containing their corresponding similarity scores. | ||
|
|
||
| Weights can be: | ||
| - A 1-D array of shape (n_trees,) for algorithms that produce a single | ||
| weight per tree (e.g. "unweighted", "prediction_variance"). Will be | ||
| reshaped to (1, n_trees) before being passed to numba | ||
| - A 2-D matrix of shape (n_training_obs, n_trees) for algorithms that | ||
| produce per-observation weights (e.g. "error_reduction", | ||
| "unweighted_with_error_reduction") | ||
|
|
||
| More details on the underlying algorithm can be found here: | ||
| https://ccao-data.github.io/lightsnip/articles/finding-comps.html | ||
|
|
||
|
|
@@ -33,7 +41,7 @@ def get_comps( | |
| comparables. | ||
| weights (numpy.ndarray): | ||
| Importance weights for leaf nodes, used to compute similarity | ||
| scores. | ||
| scores. Either 1-D (n_trees,) or 2-D (n_comparisons, n_trees). | ||
| num_comps (int, optional): | ||
| Number of top comparables to return for each observation. | ||
| Default is 5. | ||
|
|
@@ -59,15 +67,36 @@ def get_comps( | |
| f"({observation_df.shape[1]}) " | ||
| f"must match `comparison_df` ({comparison_df.shape[1]})" | ||
| ) | ||
| if comparison_df.shape != weights.shape: | ||
|
|
||
| if not isinstance(weights, np.ndarray): | ||
| weights = np.asarray(weights) | ||
|
|
||
| # Normalize weights to a 2-D matrix so that we can use a single numba | ||
| # kernel for both vector and matrix weights. A 1-D vector of per-tree | ||
| # weights is reshaped to (1, n_trees); the numba kernel detects this | ||
| # shape and broadcasts the single row to all comparison observations. | ||
| if weights.ndim == 1: | ||
| if weights.shape[0] != comparison_df.shape[1]: | ||
| raise ValueError( | ||
| f"`weights` length {weights.shape[0]} must equal number of " | ||
| f"trees {comparison_df.shape[1]}" | ||
| ) | ||
| weights_matrix = weights.reshape(1, -1).astype(np.float32, copy=False) | ||
| elif weights.ndim == 2: | ||
| if comparison_df.shape != weights.shape: | ||
| raise ValueError( | ||
| f"`comparison_df.shape` {comparison_df.shape} must match " | ||
| f"`weights.shape` {weights.shape}" | ||
| ) | ||
| weights_matrix = weights.astype(np.float32, copy=False) | ||
| else: | ||
| raise ValueError( | ||
| f"`comparison_df.shape` {comparison_df.shape} must match " | ||
| f"`weights.shape` {weights.shape}" | ||
| "`weights` must be a 1-D vector (n_trees,) or 2-D matrix " | ||
| f"(n_comparisons, n_trees), got {weights.ndim}-D" | ||
| ) | ||
|
|
||
| # Convert the weights to a numpy array so that we can take advantage of | ||
| # numba acceleration later on | ||
| weights_matrix = np.asarray(weights, dtype=np.float32) | ||
| # Avoid editing the df in-place | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would you add more extensive documentation about the reason for adding this here? @jeancochrane
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I think so. I actually don't even understand why we need this. Do we mutate the observation dataframe later on?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In the following chunk, when we run the matrix-weights test:

# Test with matrix weights (error_reduction style)
tree_weights_matrix = np.asarray(
[np.random.dirichlet(np.ones(num_trees)) for _ in range(num_comparisons)]
)
start = time.time()
get_comps(leaf_nodes, training_leaf_nodes, tree_weights_matrix)
end = time.time()
print(f"get_comps (matrix weights) runtime: {end - start}s")

this code that creates (in-place) a new column in the `observation_df` runs:

# Chunk the observations so that the script can periodically report progress
observation_df["chunk"] = pd.cut(
observation_df.index, bins=num_chunks, labels=False
)

such that when we finish the matrix test and move onto the vector test, the dataframe already has the extra `chunk` column:

# Test with vector weights (unweighted / prediction_variance style)
tree_weights_vector = np.random.dirichlet(np.ones(num_trees))
start = time.time()
get_comps(leaf_nodes, training_leaf_nodes, tree_weights_vector)
end = time.time()
print(f"get_comps (vector weights) runtime: {end - start}s")

Here is a reproducible isolated example that I think replicates the behaviour:

import pandas as pd
import numpy as np
# Create toy dataframes
leaf_nodes = pd.DataFrame(np.random.randint(0, 10, size=[5, 3]))
print("Before:", leaf_nodes.shape) # (5, 3)
def add_chunk_column(observation_df):
observation_df["chunk"] = [0, 0, 1, 1, 1]
add_chunk_column(leaf_nodes)
print("After:", leaf_nodes.shape) # (5, 4)
print(leaf_nodes)

Does this make sense? I feel like pandas in-place trickiness always gets me.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ahh right! Thanks for the clear explanation. I see now that the mutation happens literally on the next line lol, my bad for missing it 🤦🏻♀️ Since we perform the mutation immediately after this copy operation, I don't actually think we need to document the decision any more thoroughly than this.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No prob! Got it, sounds good |
||
| observation_df = observation_df.copy() | ||
|
|
||
| # Chunk the observations so that the script can periodically report progress | ||
| observation_df["chunk"] = pd.cut( | ||
|
|
@@ -137,12 +166,20 @@ def _get_top_n_comps( | |
| observations in a tree model, a matrix of weights for each obs/tree, and an | ||
| integer `num_comps`, and returns a matrix where each observation is scored | ||
| by similarity to observations in the comparison matrix and the top N scores | ||
| are returned along with the indexes of the comparison observations.""" | ||
| are returned along with the indexes of the comparison observations. | ||
|
|
||
| The weights_matrix is always 2-D. If its first dimension is 1, the single | ||
| row of weights is broadcast to all comparison observations (i.e. tree-level | ||
| weights shared across all comparisons). Otherwise, each comparison | ||
| observation y_i uses its own row of weights.""" | ||
| num_observations = len(leaf_node_matrix) | ||
| num_possible_comparisons = len(comparison_leaf_node_matrix) | ||
| idx_dtype = np.int32 | ||
| score_dtype = np.float32 | ||
|
|
||
| # Detect whether we have shared (vector-style) or per-observation weights | ||
| shared_weights = weights_matrix.shape[0] == 1 | ||
|
|
||
| # Store scores and indexes in two separate arrays rather than a 3d matrix | ||
| # for simplicity (array of tuples does not convert to pandas properly). | ||
| # Indexes default to -1, which is an impossible index and so is a signal | ||
|
|
@@ -156,12 +193,14 @@ def _get_top_n_comps( | |
| # low memory footprint | ||
| for y_i in range(num_possible_comparisons): | ||
| similarity_score = 0.0 | ||
| for tree_idx in range(len(leaf_node_matrix[x_i])): | ||
| # Use row 0 for shared weights, row y_i for per-obs weights | ||
| w_i = 0 if shared_weights else y_i | ||
jeancochrane marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| for tree_idx in range(leaf_node_matrix.shape[1]): | ||
| if ( | ||
| leaf_node_matrix[x_i][tree_idx] | ||
| == comparison_leaf_node_matrix[y_i][tree_idx] | ||
| leaf_node_matrix[x_i, tree_idx] | ||
| == comparison_leaf_node_matrix[y_i, tree_idx] | ||
| ): | ||
| similarity_score += weights_matrix[y_i][tree_idx] | ||
| similarity_score += weights_matrix[w_i, tree_idx] | ||
|
|
||
| # See if the score is higher than any of the top N | ||
| # comps, and store it in the sorted comps array if it is. | ||
|
|
@@ -198,18 +237,24 @@ def insert_at_idx_and_shift( | |
| num_trees = 500 | ||
| num_obs = 20001 | ||
| num_comparisons = 10000 | ||
| mean_sale_price = 350000 | ||
| std_deviation = 110000 | ||
jeancochrane marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| leaf_nodes = pd.DataFrame(np.random.randint(0, num_obs, size=[num_obs, num_trees])) | ||
| training_leaf_nodes = pd.DataFrame( | ||
| np.random.randint(0, num_comparisons, size=[num_comparisons, num_trees]) | ||
| ) | ||
| tree_weights = np.asarray( | ||
|
|
||
| # Test with matrix weights (error_reduction style) | ||
| tree_weights_matrix = np.asarray( | ||
| [np.random.dirichlet(np.ones(num_trees)) for _ in range(num_comparisons)] | ||
| ) | ||
| start = time.time() | ||
| get_comps(leaf_nodes, training_leaf_nodes, tree_weights_matrix) | ||
| end = time.time() | ||
| print(f"get_comps (matrix weights) runtime: {end - start}s") | ||
|
|
||
| # Test with vector weights (unweighted / prediction_variance style) | ||
| tree_weights_vector = np.random.dirichlet(np.ones(num_trees)) | ||
| start = time.time() | ||
| get_comps(leaf_nodes, training_leaf_nodes, tree_weights) | ||
| get_comps(leaf_nodes, training_leaf_nodes, tree_weights_vector) | ||
jeancochrane marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| end = time.time() | ||
| print(f"get_comps runtime: {end - start}s") | ||
| print(f"get_comps (vector weights) runtime: {end - start}s") | ||
Uh oh!
There was an error while loading. Please reload this page.