@@ -45,23 +45,34 @@ calculate_IV <- function(data,
4545 )
4646 }
4747
48- # Compute q
49- q <- stats :: quantile(
50- pred_var ,
51- probs = c(1 : (bins - 1 ) / bins ),
52- na.rm = TRUE ,
53- type = 3
54- )
48+ # Check if predictor is categorical (character or factor)
49+ if (is.character(pred_var ) || is.factor(pred_var )){
50+
51+ # For categorical variables, use the categories themselves as intervals
52+ unique_vals <- unique(pred_var [! is.na(pred_var )])
53+ intervals <- as.numeric(as.factor(pred_var ))
54+
55+ } else {
56+
57+ # For numeric variables, use quantile-based binning (original logic)
58+ # Compute q
59+ q <- stats :: quantile(
60+ pred_var ,
61+ probs = c(1 : (bins - 1 ) / bins ),
62+ na.rm = TRUE ,
63+ type = 3
64+ )
5565
56- # Compute cuts
57- cuts <- unique(q )
66+ # Compute cuts
67+ cuts <- unique(q )
5868
59- # Compute intervals
60- intervals <-
61- findInterval(
62- pred_var ,
63- vec = cuts ,
64- rightmost.closed = FALSE )
69+ # Compute intervals
70+ intervals <-
71+ findInterval(
72+ pred_var ,
73+ vec = cuts ,
74+ rightmost.closed = FALSE )
75+ }
6576
6677 # Compute cut_table
6778 cut_table <-
@@ -70,23 +81,45 @@ calculate_IV <- function(data,
7081 outc_var ) %> %
7182 as.data.frame.matrix()
7283
73- # # get min/max
74- cut_table_2 <-
75- data.frame (
76- var = pred_var ,
77- intervals
78- ) %> %
79- group_by(intervals ) %> %
80- summarise(
81- min = min(var , na.rm = TRUE ) %> % round(digits = 1 ),
82- max = max(var , na.rm = TRUE ) %> % round(digits = 1 ),
83- n = n(),
84- .groups = " drop"
84+ # # get min/max or category labels
85+ if (is.character(pred_var ) || is.factor(pred_var )){
86+
87+ # For categorical variables, use the actual category names
88+ cut_table_2 <-
89+ data.frame (
90+ var = pred_var ,
91+ intervals
92+ ) %> %
93+ group_by(intervals ) %> %
94+ summarise(
95+ category = first(var ), # Get the actual category name
96+ n = n(),
97+ .groups = " drop"
98+ ) %> %
99+ mutate(!! sym(predictor ) : = category ) %> %
100+ mutate(percentage = n / sum(n )) %> %
101+ select(!! sym(predictor ), intervals , n , percentage )
102+
103+ } else {
104+
105+ # For numeric variables, use min/max ranges (original logic)
106+ cut_table_2 <-
107+ data.frame (
108+ var = pred_var ,
109+ intervals
85110 ) %> %
86- mutate(!! sym(predictor ) : =
87- glue :: glue(" [{round(min, digits = 1)},{round(max, digits = 1)}]" )) %> %
88- mutate(percentage = n / sum(n )) %> %
89- select(!! sym(predictor ), intervals , n , percentage )
111+ group_by(intervals ) %> %
112+ summarise(
113+ min = min(var , na.rm = TRUE ) %> % round(digits = 1 ),
114+ max = max(var , na.rm = TRUE ) %> % round(digits = 1 ),
115+ n = n(),
116+ .groups = " drop"
117+ ) %> %
118+ mutate(!! sym(predictor ) : =
119+ glue :: glue(" [{round(min, digits = 1)},{round(max, digits = 1)}]" )) %> %
120+ mutate(percentage = n / sum(n )) %> %
121+ select(!! sym(predictor ), intervals , n , percentage )
122+ }
90123
91124 # Create variables that are double
92125 cut_table_1 <- as.numeric(cut_table $ `1` )
@@ -138,8 +171,8 @@ calculate_IV <- function(data,
138171# ' @param data Data frame containing the data.
139172# ' @param outcome String containing the name of the outcome variable.
140173# ' @param predictors Character vector containing the names of the predictor
141- # ' variables. If `NULL` (default) is supplied, all numeric variables in the
142- # ' data will be used.
174+ # ' variables. If `NULL` (default) is supplied, all numeric, character, and factor
175+ # ' variables in the data will be used.
143176# ' @param bins Numeric value representing the number of bins to use. Defaults to
144177# ' 10.
145178# '
@@ -162,7 +195,7 @@ map_IV <- function(data,
162195 data %> %
163196 select(- !! sym(outcome )) %> %
164197 select(
165- where(is.numeric )
198+ where(function ( x ) is.numeric( x ) || is.character( x ) || is.factor( x ) )
166199 ) %> %
167200 names()
168201 }
0 commit comments