Skip to content
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
b4c36af
Add temp checkpoint for eda
wagnerlmichael Mar 11, 2025
66a02f8
Update w tentative solution for shaps
wagnerlmichael Mar 11, 2025
73517f9
Remove comment
wagnerlmichael Mar 11, 2025
5afb7e4
Remove og_sf feature
wagnerlmichael Mar 11, 2025
c1ad821
Add comment
wagnerlmichael Mar 11, 2025
9bfbd01
Shorten lines
wagnerlmichael Mar 11, 2025
4c5d254
Add draft of comps solution
wagnerlmichael Mar 13, 2025
836493d
Linting
wagnerlmichael Mar 13, 2025
4512a1e
Linting
wagnerlmichael Mar 13, 2025
8c62d74
Linting
wagnerlmichael Mar 13, 2025
918ac98
Linting
wagnerlmichael Mar 13, 2025
9cbbd8d
Clean up code chunk
wagnerlmichael Mar 13, 2025
87f00a7
Remove unneeded comment
wagnerlmichael Mar 13, 2025
c372822
Improve comment
wagnerlmichael Mar 13, 2025
1167343
Fix sorting
wagnerlmichael Mar 17, 2025
43d25b3
Linting
wagnerlmichael Mar 17, 2025
a8dacd2
Remove intermediate column
wagnerlmichael Mar 17, 2025
fc53281
Replace variable name
wagnerlmichael Mar 17, 2025
b9a8ee9
Add small touches
wagnerlmichael Mar 17, 2025
1566e5c
Lint
wagnerlmichael Mar 17, 2025
51722a5
Improve comment
wagnerlmichael Mar 17, 2025
43f114a
Merge branch 'master' into 358-fix-shaps-and-comps-for-multicard-prop…
wagnerlmichael Mar 26, 2025
48a2090
Merge branch 'master' into 358-fix-shaps-and-comps-for-multicard-prop…
wagnerlmichael Apr 24, 2025
5b09c06
Update pipeline/04-interpret.R
wagnerlmichael May 1, 2025
5962316
Update pipeline/04-interpret.R
wagnerlmichael May 1, 2025
bf05a17
Fix lintr error
wagnerlmichael May 1, 2025
9380d99
Update pipeline/04-interpret.R
wagnerlmichael May 1, 2025
b0616f2
Fix lintr error
wagnerlmichael May 1, 2025
7cbe52c
Improve docs
wagnerlmichael May 1, 2025
d40a4e4
Improve clarity
wagnerlmichael May 1, 2025
54bb39c
Update pipeline/04-interpret.R
wagnerlmichael May 1, 2025
fedf47d
Update pipeline/04-interpret.R
wagnerlmichael May 1, 2025
dc5c473
Fix rest of variable names
wagnerlmichael May 1, 2025
f733b13
Update pipeline/04-interpret.R
wagnerlmichael May 5, 2025
fa99e29
Update pipeline/04-interpret.R
wagnerlmichael May 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pipeline/02-assess.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ assessment_card_data_mc <- assessment_card_data_pred %>%
# that prediction as the PIN value. For > 3 cards, we predict each card with
its original square footage, then sum the predictions to get the PIN value
group_by(meta_pin) %>%
arrange(meta_pin, desc(char_bldg_sf)) %>%
arrange(meta_pin, desc(char_bldg_sf), meta_card_num) %>%
mutate(
pred_pin_card_sum = ifelse(
meta_pin_num_cards > 3,
Expand Down
95 changes: 88 additions & 7 deletions pipeline/04-interpret.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,36 @@ if (shap_enable || comp_enable) {
# PINs) that need values. Will use the trained model to calculate SHAP values
assessment_data <- as_tibble(read_parquet(paths$input$assessment$local))

# Aggregate square footage to the parcel level for small (2-3 card)
# multi-cards. We do this to ensure consistent SHAP values for small
# multi-card parcels, since we use aggregated parcel square footage when
# predicting values for these parcels. More details in multi-card handling
# step in the assess stage.

# Start by persisting card sort order for the purposes of aggregating
# building square footage. We use characteristics from the largest card
# ("frankencard") in order to predict value, so we save the card sort order
# as a way to reference this card later on
assessment_data_ordered <- assessment_data %>%
  group_by(meta_pin) %>%
  arrange(desc(char_bldg_sf), meta_card_num) %>%
  # The sqft_card_num_sort column is persisted throughout the SHAP and
  # comps calculations so the frankencard (largest card per PIN) can be
  # identified again later using the same ordering
  mutate(sqft_card_num_sort = row_number()) %>%
  ungroup()

# For small (2-3 card) multi-card parcels, overwrite each card's building
# square footage with the parcel-level total so predictions use aggregate
# sqft, matching the multi-card handling in the assess stage
assessment_data <- assessment_data_ordered %>%
  group_by(meta_pin) %>%
  mutate(
    char_bldg_sf = ifelse(
      ind_pin_is_multicard & meta_pin_num_cards %in% c(2, 3),
      sum(char_bldg_sf),
      char_bldg_sf
    )
  ) %>%
  ungroup()

# Run the saved recipe on the assessment data to format it for prediction
assessment_data_prepped <- recipes::bake(
object = lgbm_final_full_recipe,
new_data = assessment_data,
new_data = assessment_data %>% select(-sqft_card_num_sort),
all_predictors()
)
}
Expand Down Expand Up @@ -77,13 +103,35 @@ if (shap_enable) {
shap_values_final <- assessment_data %>%
select(
meta_year, meta_pin, meta_card_num,
township_code = meta_township_code
meta_pin_num_cards,
township_code = meta_township_code,
sqft_card_num_sort
) %>%
bind_cols(shap_values_tbl) %>%
select(
meta_year, meta_pin, meta_card_num, pred_card_shap_baseline_fmv,
meta_year, meta_pin, meta_card_num, sqft_card_num_sort,
meta_pin_num_cards, pred_card_shap_baseline_fmv,
all_of(params$model$predictor$all), township_code
) %>%
# Adjust small (2-3 card) multi-cards to copy the SHAPs from the
# "frankencard" to all of the cards in the PIN. This aligns with the way
# that we handle small multi-cards in the assess stage.
# Start by grouping and sorting the same way we do in the assess stage
# so that we can figure out which card is the frankencard
group_by(meta_pin) %>%
arrange(sqft_card_num_sort) %>%
group_modify(~ {
shap_cols <- c("pred_card_shap_baseline_fmv", params$model$predictor$all)
# If the first row indicates 2 or 3 cards,
# duplicate its SHAP values across the group
if (.x$meta_pin_num_cards[1] %in% c(2, 3)) {
.x[shap_cols] <- .x[rep(1, nrow(.x)), shap_cols]
}
.x
}) %>%
arrange(meta_pin, meta_card_num) %>%
ungroup() %>%
select(-meta_pin_num_cards, -sqft_card_num_sort) %>%
write_parquet(paths$output$shap$local)
} else {
# If SHAP creation is disabled, we still need to write an empty stub file
Expand Down Expand Up @@ -124,14 +172,34 @@ if (comp_enable) {

# Filter target properties for only the current triad, to speed up the comps
# algorithm
comp_assessment_data <- assessment_data %>%
comp_assessment_data_preprocess <- assessment_data %>%
filter(
meta_township_code %in% (
ccao::town_dict %>%
filter(triad_name == tools::toTitleCase(params$assessment$triad)) %>%
pull(township_code)
)
)

# Multi-card handling. For multi-card PINs with 2-3 cards, we predict by
# aggregating bldg_sf onto a single card and using that card to predict
# the value of the multi-card PIN as a whole. Since we don't predict on
# the other cards, set them aside during comp generation and re-attach
# them afterwards
small_multicards <- comp_assessment_data_preprocess %>%
  filter(meta_pin_num_cards %in% c(2, 3))

# The frankencard is the first card per PIN in the persisted sort order
# (sqft_card_num_sort is a per-PIN row_number, so values are unique)
frankencards <- small_multicards %>%
  group_by(meta_pin) %>%
  slice_min(sqft_card_num_sort, n = 1, with_ties = FALSE) %>%
  ungroup()

single_cards_and_large_multicards <- comp_assessment_data_preprocess %>%
  filter(!meta_pin %in% small_multicards$meta_pin)

comp_assessment_data <- bind_rows(
  single_cards_and_large_multicards,
  frankencards
)

comp_assessment_data_prepped <- recipes::bake(
object = lgbm_final_full_recipe,
new_data = comp_assessment_data,
Expand Down Expand Up @@ -259,9 +327,22 @@ if (comp_enable) {
) %>%
relocate(pin, card)

# Combine the comp indexes and scores into one dataframe and write to a file
cbind(comps[[1]], comps[[2]]) %>%
write_parquet(paths$output$comp$local)
# Combine the comp indexes and scores into one dataframe
comp_idxs_and_scores <- cbind(comps[[1]], comps[[2]])

# Identify the small multi-card cards that were set aside (every card in
# a small multi-card PIN except its frankencard)
removed_cards <- small_multicards %>%
  anti_join(frankencards, by = c("meta_pin", "meta_card_num")) %>%
  select(meta_pin, meta_card_num)

# Assign each removed card the comps computed for its PIN's frankencard
removed_cards_comps <- removed_cards %>%
  select(pin = meta_pin, card = meta_card_num) %>%
  left_join(select(comp_idxs_and_scores, -card), by = "pin")

# Save final combined comps data
bind_rows(comp_idxs_and_scores, removed_cards_comps) %>%
  arrange(pin, card) %>%
  arrow::write_parquet(paths$output$comp$local)
} else {
# If comp creation is disabled, we still need to write an empty stub file
# so DVC doesn't complain
Expand Down
Loading