Skip to content
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
b4c36af
Add temp checkpoint for eda
wagnerlmichael Mar 11, 2025
66a02f8
Update w tentative solution for shaps
wagnerlmichael Mar 11, 2025
73517f9
Remove comment
wagnerlmichael Mar 11, 2025
5afb7e4
Remove og_sf feature
wagnerlmichael Mar 11, 2025
c1ad821
Add comment
wagnerlmichael Mar 11, 2025
9bfbd01
Shorten lines
wagnerlmichael Mar 11, 2025
4c5d254
Add draft of comps solution
wagnerlmichael Mar 13, 2025
836493d
Linting
wagnerlmichael Mar 13, 2025
4512a1e
Linting
wagnerlmichael Mar 13, 2025
8c62d74
Linting
wagnerlmichael Mar 13, 2025
918ac98
Linting
wagnerlmichael Mar 13, 2025
9cbbd8d
Clean up code chunk
wagnerlmichael Mar 13, 2025
87f00a7
Remove unneeded comment
wagnerlmichael Mar 13, 2025
c372822
Improve comment
wagnerlmichael Mar 13, 2025
1167343
Fix sorting
wagnerlmichael Mar 17, 2025
43d25b3
Linting
wagnerlmichael Mar 17, 2025
a8dacd2
Remove intermediate column
wagnerlmichael Mar 17, 2025
fc53281
Replace variable name
wagnerlmichael Mar 17, 2025
b9a8ee9
Add small touches
wagnerlmichael Mar 17, 2025
1566e5c
Lint
wagnerlmichael Mar 17, 2025
51722a5
Improve comment
wagnerlmichael Mar 17, 2025
43f114a
Merge branch 'master' into 358-fix-shaps-and-comps-for-multicard-prop…
wagnerlmichael Mar 26, 2025
48a2090
Merge branch 'master' into 358-fix-shaps-and-comps-for-multicard-prop…
wagnerlmichael Apr 24, 2025
5b09c06
Update pipeline/04-interpret.R
wagnerlmichael May 1, 2025
5962316
Update pipeline/04-interpret.R
wagnerlmichael May 1, 2025
bf05a17
Fix lintr error
wagnerlmichael May 1, 2025
9380d99
Update pipeline/04-interpret.R
wagnerlmichael May 1, 2025
b0616f2
Fix lintr error
wagnerlmichael May 1, 2025
7cbe52c
Improve docs
wagnerlmichael May 1, 2025
d40a4e4
Improve clarity
wagnerlmichael May 1, 2025
54bb39c
Update pipeline/04-interpret.R
wagnerlmichael May 1, 2025
fedf47d
Update pipeline/04-interpret.R
wagnerlmichael May 1, 2025
dc5c473
Fix rest of variable names
wagnerlmichael May 1, 2025
f733b13
Update pipeline/04-interpret.R
wagnerlmichael May 5, 2025
fa99e29
Update pipeline/04-interpret.R
wagnerlmichael May 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pipeline/02-assess.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ assessment_card_data_mc <- assessment_card_data_pred %>%
# that prediction as the PIN value. For > 3 cards, we predict each card with
# its original square footage then sum the predictions to get the PIN value
group_by(meta_pin) %>%
arrange(meta_pin, desc(char_bldg_sf)) %>%
arrange(meta_pin, desc(char_bldg_sf), meta_card_num) %>%
mutate(
pred_pin_card_sum = ifelse(
meta_pin_num_cards > 3,
Expand Down
97 changes: 87 additions & 10 deletions pipeline/04-interpret.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,32 @@ if (shap_enable || comp_enable) {
# PINs) that need values. Will use the the trained model to calc SHAP values
assessment_data <- as_tibble(read_parquet(paths$input$assessment$local))

# Aggregate square footage to building level. We do this to ensure consistent
# shap values and comps based on the generated prediction for multi-card pins.
# More details in multi-card handling in the assess stage.
# https://github.com/ccao-data/model-res-avm/issues/358

# Persist sort order
assessment_data_ordered <- assessment_data %>%
group_by(meta_pin) %>%
arrange(desc(char_bldg_sf), meta_card_num) %>%
mutate(sqft_card_num_sort = row_number()) %>%
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This column sqft_card_num_sort is persisted throughout the shaps and comps calculations to maintain the correct ordering

ungroup()

assessment_data <- assessment_data_ordered %>%
mutate(
char_bldg_sf = ifelse(
ind_pin_is_multicard & meta_pin_num_cards %in% c(2, 3),
sum(char_bldg_sf),
char_bldg_sf
),
.by = meta_pin
)

# Run the saved recipe on the assessment data to format it for prediction
assessment_data_prepped <- recipes::bake(
object = lgbm_final_full_recipe,
new_data = assessment_data,
new_data = assessment_data %>% select(-sqft_card_num_sort),
all_predictors()
)
}
Expand Down Expand Up @@ -77,13 +99,33 @@ if (shap_enable) {
shap_values_final <- assessment_data %>%
select(
meta_year, meta_pin, meta_card_num,
township_code = meta_township_code
meta_pin_num_cards,
township_code = meta_township_code,
sqft_card_num_sort
) %>%
bind_cols(shap_values_tbl) %>%
select(
meta_year, meta_pin, meta_card_num, pred_card_shap_baseline_fmv,
meta_year, meta_pin, meta_card_num, sqft_card_num_sort,
meta_pin_num_cards, pred_card_shap_baseline_fmv,
all_of(params$model$predictor$all), township_code
) %>%
group_by(meta_pin) %>%
# Replicate sort used in the assess stage to ensure the same card's chars
# are used accross the assess stage and interpret stage
arrange(sqft_card_num_sort) %>%
group_modify(~ {
shap_cols <- c("pred_card_shap_baseline_fmv", params$model$predictor$all)
# If the first row indicates 2 or 3 cards,
# duplicate its SHAP values across the group
# https://github.com/ccao-data/model-res-avm/issues/358#issue-2897826811
if (.x$meta_pin_num_cards[1] %in% c(2, 3)) {
.x[shap_cols] <- .x[rep(1, nrow(.x)), shap_cols]
}
.x
}) %>%
arrange(meta_pin, meta_card_num) %>%
ungroup() %>%
select(-meta_pin_num_cards, -sqft_card_num_sort) %>%
write_parquet(paths$output$shap$local)
} else {
# If SHAP creation is disabled, we still need to write an empty stub file
Expand Down Expand Up @@ -124,17 +166,37 @@ if (comp_enable) {

# Filter target properties for only the current triad, to speed up the comps
# algorithm
comp_assessment_data <- assessment_data %>%
comp_assessment_data_preprocess <- assessment_data %>%
filter(
meta_township_code %in% (
ccao::town_dict %>%
filter(triad_name == tools::toTitleCase(params$assessment$triad)) %>%
pull(township_code)
)
)

# Multi-card handling. For multi-card pins with 2-3 cards, we predict by
# aggregating the bldg_sf to a single card, and using that card to predict
# which becomes the value for the mult-card pin. Since we don't predict on the
# other cards, we set them aside for comp generation, to re-attach them later
multicard_pins <- comp_assessment_data_preprocess %>%
filter(meta_pin_num_cards %in% c(2, 3))

selected_cards <- multicard_pins %>%
group_by(meta_pin) %>%
arrange(sqft_card_num_sort) %>%
slice(1) %>%
ungroup()

singlecard_pins <- comp_assessment_data_preprocess %>%
filter(!meta_pin %in% multicard_pins$meta_pin)

comp_assessment_data_intermediate <-
bind_rows(singlecard_pins, selected_cards)

comp_assessment_data_prepped <- recipes::bake(
object = lgbm_final_full_recipe,
new_data = comp_assessment_data,
new_data = comp_assessment_data_intermediate,
all_predictors()
)

Expand Down Expand Up @@ -254,14 +316,29 @@ if (comp_enable) {
) %>%
select(-starts_with("comp_idx_")) %>%
cbind(
pin = comp_assessment_data$meta_pin,
card = comp_assessment_data$meta_card_num
pin = comp_assessment_data_intermediate$meta_pin,
card = comp_assessment_data_intermediate$meta_card_num
) %>%
relocate(pin, card)

# Combine the comp indexes and scores into one dataframe and write to a file
cbind(comps[[1]], comps[[2]]) %>%
write_parquet(paths$output$comp$local)
final_comps_data <- cbind(comps[[1]], comps[[2]])

# Grab removed multi-card cards,re-add them, and assign them the comps data
# that we used for the main frankencard used for prediction
removed_cards <- multicard_pins %>%
anti_join(selected_cards, by = c("meta_pin", "meta_card_num")) %>%
select(meta_pin, meta_card_num)

removed_cards_comps <- removed_cards %>%
rename(pin = meta_pin, card = meta_card_num) %>%
left_join(final_comps_data %>% select(-card), by = "pin")

complete_comps_data <-
bind_rows(final_comps_data, removed_cards_comps) %>%
arrange(pin, card)

# Save final combined comps data
arrow::write_parquet(complete_comps_data, paths$output$comp$local)
} else {
# If comp creation is disabled, we still need to write an empty stub file
# so DVC doesn't complain
Expand Down
Loading