Skip to content

Commit 0a90757

Browse files
authored
Merge pull request #247 from microsoft/copilot/fix-230
Enable categorical variables as predictors in IV analysis
2 parents 13cdba8 + c48cb0d commit 0a90757

File tree

14 files changed (+334 -102 lines changed)

..Rcheck/00check.log

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
* using log directory ‘/home/runner/work/wpa/wpa/..Rcheck’
2+
* using R version 4.3.3 (2024-02-29)
3+
* using platform: x86_64-pc-linux-gnu (64-bit)
4+
* R was compiled by
5+
gcc (Ubuntu 13.2.0-23ubuntu3) 13.2.0
6+
GNU Fortran (Ubuntu 13.2.0-23ubuntu3) 13.2.0
7+
* running under: Ubuntu 24.04.2 LTS
8+
* using session charset: UTF-8
9+
* using options ‘--no-examples --no-manual --no-vignettes’
10+
* checking for file ‘./DESCRIPTION’ ... ERROR
11+
Required fields missing or empty:
12+
‘Author’ ‘Maintainer’
13+
* DONE
14+
Status: 1 ERROR

.Rbuildignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,6 @@ lastMiKTeXException
3131
^cran-comments\.md$
3232
^CRAN-RELEASE$
3333
^CRAN-SUBMISSION$
34+
35+
# Reverse dependency checks
36+
^revdep/

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ Meta
88
.RDataTmp
99
SQ-overview.html
1010
wpa export 20200427_131327.png
11+
revdep/

CRAN-SUBMISSION

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
Version: 1.9.2
2-
Date: 2025-05-28 14:01:14 UTC
3-
SHA: d2bbd2a998182433f85e93dc51d36460b43e2ef7
1+
Version: 1.10.0
2+
Date: 2025-08-26 11:43:05 UTC
3+
SHA: a4d4ea06c5de4ed451fcd8ab05d94a248ec8bda8

NEWS.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
# wpa 1.10.0
22

33
- Refactored codebase to use latest dplyr syntax
4-
- Added support for logical outcome variables in `create_IV()`
4+
- Added support for logical outcome and categorical predictor variables in `create_IV()`
55
- Added detection of text missing values in `validation_report()`
6+
- Enhanced flexibility in display control for `create_dt()`
7+
- Improved test coverage
68

79
# wpa 1.9.2
810

R/calculate_IV.R

Lines changed: 67 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -45,23 +45,34 @@ calculate_IV <- function(data,
4545
)
4646
}
4747

48-
# Compute q
49-
q <- stats::quantile(
50-
pred_var,
51-
probs = c(1:(bins - 1) / bins),
52-
na.rm = TRUE,
53-
type = 3
54-
)
48+
# Check if predictor is categorical (character or factor)
49+
if(is.character(pred_var) || is.factor(pred_var)){
50+
51+
# For categorical variables, use the categories themselves as intervals
52+
unique_vals <- unique(pred_var[!is.na(pred_var)])
53+
intervals <- as.numeric(as.factor(pred_var))
54+
55+
} else {
56+
57+
# For numeric variables, use quantile-based binning (original logic)
58+
# Compute q
59+
q <- stats::quantile(
60+
pred_var,
61+
probs = c(1:(bins - 1) / bins),
62+
na.rm = TRUE,
63+
type = 3
64+
)
5565

56-
# Compute cuts
57-
cuts <- unique(q)
66+
# Compute cuts
67+
cuts <- unique(q)
5868

59-
# Compute intervals
60-
intervals <-
61-
findInterval(
62-
pred_var,
63-
vec = cuts,
64-
rightmost.closed = FALSE)
69+
# Compute intervals
70+
intervals <-
71+
findInterval(
72+
pred_var,
73+
vec = cuts,
74+
rightmost.closed = FALSE)
75+
}
6576

6677
# Compute cut_table
6778
cut_table <-
@@ -70,23 +81,45 @@ calculate_IV <- function(data,
7081
outc_var) %>%
7182
as.data.frame.matrix()
7283

73-
## get min/max
74-
cut_table_2 <-
75-
data.frame(
76-
var = pred_var,
77-
intervals
78-
) %>%
79-
group_by(intervals) %>%
80-
summarise(
81-
min = min(var, na.rm = TRUE) %>% round(digits = 1),
82-
max = max(var, na.rm = TRUE) %>% round(digits = 1),
83-
n = n(),
84-
.groups = "drop"
84+
## get min/max or category labels
85+
if(is.character(pred_var) || is.factor(pred_var)){
86+
87+
# For categorical variables, use the actual category names
88+
cut_table_2 <-
89+
data.frame(
90+
var = pred_var,
91+
intervals
92+
) %>%
93+
group_by(intervals) %>%
94+
summarise(
95+
category = first(var), # Get the actual category name
96+
n = n(),
97+
.groups = "drop"
98+
) %>%
99+
mutate(!!sym(predictor) := category) %>%
100+
mutate(percentage = n / sum(n)) %>%
101+
select(!!sym(predictor), intervals, n, percentage)
102+
103+
} else {
104+
105+
# For numeric variables, use min/max ranges (original logic)
106+
cut_table_2 <-
107+
data.frame(
108+
var = pred_var,
109+
intervals
85110
) %>%
86-
mutate(!!sym(predictor) :=
87-
glue::glue("[{round(min, digits = 1)},{round(max, digits = 1)}]")) %>%
88-
mutate(percentage = n / sum(n)) %>%
89-
select(!!sym(predictor), intervals, n, percentage)
111+
group_by(intervals) %>%
112+
summarise(
113+
min = min(var, na.rm = TRUE) %>% round(digits = 1),
114+
max = max(var, na.rm = TRUE) %>% round(digits = 1),
115+
n = n(),
116+
.groups = "drop"
117+
) %>%
118+
mutate(!!sym(predictor) :=
119+
glue::glue("[{round(min, digits = 1)},{round(max, digits = 1)}]")) %>%
120+
mutate(percentage = n / sum(n)) %>%
121+
select(!!sym(predictor), intervals, n, percentage)
122+
}
90123

91124
# Create variables that are double
92125
cut_table_1 <- as.numeric(cut_table$`1`)
@@ -138,8 +171,8 @@ calculate_IV <- function(data,
138171
#' @param data Data frame containing the data.
139172
#' @param outcome String containing the name of the outcome variable.
140173
#' @param predictors Character vector containing the names of the predictor
141-
#' variables. If `NULL` (default) is supplied, all numeric variables in the
142-
#' data will be used.
174+
#' variables. If `NULL` (default) is supplied, all numeric, character, and factor
175+
#' variables in the data will be used.
143176
#' @param bins Numeric value representing the number of bins to use. Defaults to
144177
#' 10.
145178
#'
@@ -162,7 +195,7 @@ map_IV <- function(data,
162195
data %>%
163196
select(-!!sym(outcome)) %>%
164197
select(
165-
where(is.numeric)
198+
where(function(x) is.numeric(x) || is.character(x) || is.factor(x))
166199
) %>%
167200
names()
168201
}

R/create_IV.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77
#'
88
#' @description
99
#' Specify an outcome variable and return IV outputs.
10-
#' All numeric variables in the dataset are used as predictor variables.
10+
#' All numeric, character, and factor variables in the dataset are used as predictor variables.
1111
#'
1212
#' @param data A Person Query dataset in the form of a data frame.
1313
#' @param predictors A character vector specifying the columns to be used as
14-
#' predictors. Defaults to NULL, where all numeric vectors in the data will be
15-
#' used as predictors.
14+
#' predictors. Defaults to NULL, where all numeric, character, and factor vectors
15+
#' in the data will be used as predictors.
1616
#' @param outcome A string specifying a binary variable, i.e. can only contain
1717
#' the values 1 or 0, or a logical variable (TRUE/FALSE). Logical variables will
1818
#' be automatically converted to binary (TRUE to 1, FALSE to 0).
@@ -109,7 +109,7 @@ create_IV <- function(data,
109109
train <-
110110
data %>%
111111
rename(outcome = outcome) %>%
112-
select(where(is.numeric)) %>%
112+
select(where(function(x) is.numeric(x) || is.character(x) || is.factor(x))) %>%
113113
tidyr::drop_na()
114114

115115
} else {

R/globals.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,7 @@ utils::globalVariables(
266266
"degree",
267267
"eigenvector",
268268
"node_size",
269-
"pagerank"
269+
"pagerank",
270+
"category"
270271
)
271272
)

R/tm_wordcloud.R

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,12 @@
4242
#'
4343
#' @import dplyr
4444
#' @examples
45-
#' tm_wordcloud(mt_data, keep = 30)
45+
#' mt_data_mini <- mt_data[sample(1:nrow(mt_data), 500), ]
46+
#'
47+
#' tm_wordcloud(mt_data_mini, keep = 30)
4648
#'
4749
#' # Removing stopwords
48-
#' tm_wordcloud(mt_data, keep = 30, stopwords = c("weekly", "update"))
50+
#' tm_wordcloud(mt_data_mini, keep = 30, stopwords = c("weekly", "update"))
4951
#'
5052
#' @family Text-mining
5153
#'

man/create_IV.Rd

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)