Skip to content

Commit 117f349

Browse files
Fix shaps and comps to work with multi-card solution (#360)
This PR addresses #358. We edit the interpret stage such that shaps and comps are generated to be in line with our recent [multi-card methodology update](6ee2f11). Within multi-card PINs, we essentially compute shaps and comps with a single-card, the card that was used for prediction in the `assess` stage. Then, we use that card's shaps and comps for each card within the multi-card PIN. We also introduce another sorting mechanism which fixes an issue of tiebreaks within multi-card PINs. We were choosing the card to predict based on highest sqft. Sometimes the sqft was tied between two pins, which led to a tiebreak that we didn't have a rule for. Now, we give preference to the lowest `meta_card_num` within the PIN. --------- Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>
1 parent 0a78d2e commit 117f349

File tree

2 files changed

+89
-8
lines changed

2 files changed

+89
-8
lines changed

pipeline/02-assess.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ assessment_card_data_mc <- assessment_card_data_pred %>%
105105
# that prediction as the PIN value. For > 3 cards, we predict each card with
106106
# its original square footage then sum the predictions to get the PIN value
107107
group_by(meta_pin) %>%
108-
arrange(meta_pin, desc(char_bldg_sf)) %>%
108+
arrange(meta_pin, desc(char_bldg_sf), meta_card_num) %>%
109109
mutate(
110110
pred_pin_card_sum = ifelse(
111111
meta_pin_num_cards > 3,

pipeline/04-interpret.R

Lines changed: 88 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,36 @@ if (shap_enable || comp_enable) {
3131
# PINs) that need values. Will use the trained model to calc SHAP values
3232
assessment_data <- as_tibble(read_parquet(paths$input$assessment$local))
3333

34+
# Aggregate square footage to the parcel level for small (2-3 card)
35+
# multi-cards. We do this to ensure consistent SHAP values for small
36+
# multi-card parcels, since we use aggregated parcel square footage when
37+
# predicting values for these parcels. More details in multi-card handling
38+
# step in the assess stage.
39+
40+
# Start by persisting card sort order for the purposes of aggregating
41+
# building square footage. We use characteristics from the largest card
42+
# ("frankencard") in order to predict value, so we save the card sort order
43+
# as a way to reference this card later on
44+
assessment_data_ordered <- assessment_data %>%
45+
group_by(meta_pin) %>%
46+
arrange(desc(char_bldg_sf), meta_card_num) %>%
47+
mutate(sqft_card_num_sort = row_number()) %>%
48+
ungroup()
49+
50+
assessment_data <- assessment_data_ordered %>%
51+
mutate(
52+
char_bldg_sf = ifelse(
53+
ind_pin_is_multicard & meta_pin_num_cards %in% c(2, 3),
54+
sum(char_bldg_sf),
55+
char_bldg_sf
56+
),
57+
.by = meta_pin
58+
)
59+
3460
# Run the saved recipe on the assessment data to format it for prediction
3561
assessment_data_prepped <- recipes::bake(
3662
object = lgbm_final_full_recipe,
37-
new_data = assessment_data,
63+
new_data = assessment_data %>% select(-sqft_card_num_sort),
3864
all_predictors()
3965
)
4066
}
@@ -77,13 +103,35 @@ if (shap_enable) {
77103
shap_values_final <- assessment_data %>%
78104
select(
79105
meta_year, meta_pin, meta_card_num,
80-
township_code = meta_township_code
106+
meta_pin_num_cards,
107+
township_code = meta_township_code,
108+
sqft_card_num_sort
81109
) %>%
82110
bind_cols(shap_values_tbl) %>%
83111
select(
84-
meta_year, meta_pin, meta_card_num, pred_card_shap_baseline_fmv,
112+
meta_year, meta_pin, meta_card_num, sqft_card_num_sort,
113+
meta_pin_num_cards, pred_card_shap_baseline_fmv,
85114
all_of(params$model$predictor$all), township_code
86115
) %>%
116+
# Adjust small (2-3 card) multi-cards to copy the SHAPs from the
117+
# "frankencard" to all of the cards in the PIN. This aligns with the way
118+
# that we handle small multi-cards in the assess stage.
119+
# Start by grouping and sorting the same way we do in the assess stage
120+
# so that we can figure out which card is the frankencard
121+
group_by(meta_pin) %>%
122+
arrange(sqft_card_num_sort) %>%
123+
group_modify(~ {
124+
shap_cols <- c("pred_card_shap_baseline_fmv", params$model$predictor$all)
125+
# If the first row indicates 2 or 3 cards,
126+
# duplicate its SHAP values across the group
127+
if (.x$meta_pin_num_cards[1] %in% c(2, 3)) {
128+
.x[shap_cols] <- .x[rep(1, nrow(.x)), shap_cols]
129+
}
130+
.x
131+
}) %>%
132+
arrange(meta_pin, meta_card_num) %>%
133+
ungroup() %>%
134+
select(-meta_pin_num_cards, -sqft_card_num_sort) %>%
87135
write_parquet(paths$output$shap$local)
88136
} else {
89137
# If SHAP creation is disabled, we still need to write an empty stub file
@@ -124,14 +172,34 @@ if (comp_enable) {
124172

125173
# Filter target properties for only the current triad, to speed up the comps
126174
# algorithm
127-
comp_assessment_data <- assessment_data %>%
175+
comp_assessment_data_preprocess <- assessment_data %>%
128176
filter(
129177
meta_township_code %in% (
130178
ccao::town_dict %>%
131179
filter(triad_name == tools::toTitleCase(params$assessment$triad)) %>%
132180
pull(township_code)
133181
)
134182
)
183+
184+
# Multi-card handling. For multi-card pins with 2-3 cards, we predict by
185+
# aggregating the bldg_sf to a single card, and using that card to predict
186+
# the value for the multi-card PIN as a whole. Since we don't predict on the
187+
# other cards, we set them aside for comp generation, to re-attach them later
188+
small_multicards <- comp_assessment_data_preprocess %>%
189+
filter(meta_pin_num_cards %in% c(2, 3))
190+
191+
frankencards <- small_multicards %>%
192+
group_by(meta_pin) %>%
193+
arrange(sqft_card_num_sort) %>%
194+
slice(1) %>%
195+
ungroup()
196+
197+
single_cards_and_large_multicards <- comp_assessment_data_preprocess %>%
198+
filter(!meta_pin %in% frankencards$meta_pin)
199+
200+
comp_assessment_data <-
201+
bind_rows(single_cards_and_large_multicards, frankencards)
202+
135203
comp_assessment_data_prepped <- recipes::bake(
136204
object = lgbm_final_full_recipe,
137205
new_data = comp_assessment_data,
@@ -259,9 +327,22 @@ if (comp_enable) {
259327
) %>%
260328
relocate(pin, card)
261329

262-
# Combine the comp indexes and scores into one dataframe and write to a file
263-
cbind(comps[[1]], comps[[2]]) %>%
264-
write_parquet(paths$output$comp$local)
330+
comp_idxs_and_scores <- cbind(comps[[1]], comps[[2]])
331+
332+
# Grab removed small multi-cards, re-add them, and assign them the comps data
333+
# that we calculated for the frankencard
334+
removed_cards <- small_multicards %>%
335+
anti_join(frankencards, by = c("meta_pin", "meta_card_num")) %>%
336+
select(meta_pin, meta_card_num)
337+
338+
removed_cards_comps <- removed_cards %>%
339+
rename(pin = meta_pin, card = meta_card_num) %>%
340+
left_join(comp_idxs_and_scores %>% select(-card), by = "pin")
341+
342+
# Save final combined comps data
343+
bind_rows(comp_idxs_and_scores, removed_cards_comps) %>%
344+
arrange(pin, card) %>%
345+
arrow::write_parquet(paths$output$comp$local)
265346
} else {
266347
# If comp creation is disabled, we still need to write an empty stub file
267348
# so DVC doesn't complain

0 commit comments

Comments
 (0)