@@ -31,10 +31,36 @@ if (shap_enable || comp_enable) {
3131   # PINs) that need values. Will use the trained model to calc SHAP values
3232   assessment_data <- as_tibble(read_parquet(paths$input$assessment$local))
3333
34+ # Aggregate square footage to the parcel level for small (2-3 card)
35+ # multi-cards. We do this to ensure consistent SHAP values for small
36+ # multi-card parcels, since we use aggregated parcel square footage when
37+ # predicting values for these parcels. More details in multi-card handling
38+ # step in the assess stage.
39+
40+ # Start by persisting card sort order for the purposes of aggregating
41+ # building square footage. We use characteristics from the largest card
42+ # ("frankencard") in order to predict value, so we save the card sort order
43+ # as a way to reference this card later on
44+   assessment_data_ordered <- assessment_data %>%
45+     group_by(meta_pin) %>%
46+     arrange(desc(char_bldg_sf), meta_card_num) %>%
47+     mutate(sqft_card_num_sort = row_number()) %>%
48+     ungroup()
49+
50+   assessment_data <- assessment_data_ordered %>%
51+     mutate(
52+       char_bldg_sf = ifelse(
53+         ind_pin_is_multicard & meta_pin_num_cards %in% c(2, 3),
54+         sum(char_bldg_sf),
55+         char_bldg_sf
56+       ),
57+       .by = meta_pin
58+     )
59+
3460 # Run the saved recipe on the assessment data to format it for prediction
3561   assessment_data_prepped <- recipes::bake(
3662     object = lgbm_final_full_recipe,
37-     new_data = assessment_data,
63+     new_data = assessment_data %>% select(-sqft_card_num_sort),
3864     all_predictors()
3965   )
4066}
@@ -77,13 +103,35 @@ if (shap_enable) {
77103   shap_values_final <- assessment_data %>%
78104     select(
79105       meta_year, meta_pin, meta_card_num,
80-       township_code = meta_township_code
106+       meta_pin_num_cards,
107+       township_code = meta_township_code,
108+       sqft_card_num_sort
81109     ) %>%
82110     bind_cols(shap_values_tbl) %>%
83111     select(
84-       meta_year, meta_pin, meta_card_num, pred_card_shap_baseline_fmv,
112+       meta_year, meta_pin, meta_card_num, sqft_card_num_sort,
113+       meta_pin_num_cards, pred_card_shap_baseline_fmv,
85114       all_of(params$model$predictor$all), township_code
86115     ) %>%
116+     # Adjust small (2-3 card) multi-cards to copy the SHAPs from the
117+     # "frankencard" to all of the cards in the PIN. This aligns with the way
118+     # that we handle small multi-cards in the assess stage.
119+     # Start by grouping and sorting the same way we do in the assess stage
120+     # so that we can figure out which card is the frankencard
121+     group_by(meta_pin) %>%
122+     arrange(sqft_card_num_sort) %>%
123+     group_modify(~ {
124+       shap_cols <- c("pred_card_shap_baseline_fmv", params$model$predictor$all)
125+       # If the first row indicates 2 or 3 cards,
126+       # duplicate its SHAP values across the group
127+       if (.x$meta_pin_num_cards[1] %in% c(2, 3)) {
128+         .x[shap_cols] <- .x[rep(1, nrow(.x)), shap_cols]
129+       }
130+       .x
131+     }) %>%
132+     arrange(meta_pin, meta_card_num) %>%
133+     ungroup() %>%
134+     select(-meta_pin_num_cards, -sqft_card_num_sort) %>%
87135     write_parquet(paths$output$shap$local)
88136} else {
89137 # If SHAP creation is disabled, we still need to write an empty stub file
@@ -124,14 +172,34 @@ if (comp_enable) {
124172
125173 # Filter target properties for only the current triad, to speed up the comps
126174 # algorithm
127-   comp_assessment_data <- assessment_data %>%
175+   comp_assessment_data_preprocess <- assessment_data %>%
128176     filter(
129177       meta_township_code %in% (
130178         ccao::town_dict %>%
131179           filter(triad_name == tools::toTitleCase(params$assessment$triad)) %>%
132180           pull(township_code)
133181       )
134182     )
183+
184+ # Multi-card handling. For multi-card pins with 2-3 cards, we predict by
185+ # aggregating the bldg_sf to a single card, and using that card to predict
186+ # the value for the multi-card PIN as a whole. Since we don't predict on the
187+ # other cards, we set them aside for comp generation, to re-attach them later
188+   small_multicards <- comp_assessment_data_preprocess %>%
189+     filter(meta_pin_num_cards %in% c(2, 3))
190+
191+   frankencards <- small_multicards %>%
192+     group_by(meta_pin) %>%
193+     arrange(sqft_card_num_sort) %>%
194+     slice(1) %>%
195+     ungroup()
196+
197+   single_cards_and_large_multicards <- comp_assessment_data_preprocess %>%
198+     filter(!meta_pin %in% frankencards$meta_pin)
199+
200+   comp_assessment_data <-
201+     bind_rows(single_cards_and_large_multicards, frankencards)
202+
135203   comp_assessment_data_prepped <- recipes::bake(
136204     object = lgbm_final_full_recipe,
137205     new_data = comp_assessment_data,
@@ -259,9 +327,22 @@ if (comp_enable) {
259327   ) %>%
260328     relocate(pin, card)
261329
262-   # Combine the comp indexes and scores into one dataframe and write to a file
263-   cbind(comps[[1]], comps[[2]]) %>%
264-     write_parquet(paths$output$comp$local)
330+   comp_idxs_and_scores <- cbind(comps[[1]], comps[[2]])
331+
332+   # Grab removed small multi-cards, re-add them, and assign them the comps data
333+   # that we calculated for the frankencard
334+   removed_cards <- small_multicards %>%
335+     anti_join(frankencards, by = c("meta_pin", "meta_card_num")) %>%
336+     select(meta_pin, meta_card_num)
337+
338+   removed_cards_comps <- removed_cards %>%
339+     rename(pin = meta_pin, card = meta_card_num) %>%
340+     left_join(comp_idxs_and_scores %>% select(-card), by = "pin")
341+
342+   # Save final combined comps data
343+   bind_rows(comp_idxs_and_scores, removed_cards_comps) %>%
344+     arrange(pin, card) %>%
345+     arrow::write_parquet(paths$output$comp$local)
265346} else {
266347 # If comp creation is disabled, we still need to write an empty stub file
267348 # so DVC doesn't complain
0 commit comments