-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfunctions.R
484 lines (457 loc) · 19.2 KB
/
functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
#' Motor insurance data
#'
#' Dataset from CASDataset R package
#'
#' @name COLORS
#' @details variables are
#' \itemize{
#' \item 0 #5BBCD6 light blue (from Wes Anderson color palette)
#' \item 1 #FF0000 red
#' \item A #00A08A green (from Wes Anderson color palette)
#' \item B #F2AD00 yellow (from Wes Anderson color palette)
#' \item with #046C9A dark blue
#' \item without #C93312 dark red
#' \item 2 #0B775E dark green
#' }
#'
#' @docType vector of colors
#' @usage data(frenchmotor)
#' @format vectors with 7 colors
#' @keywords colors
#' @examples
#' data(COLORS)
"COLORS"
#' Motor insurance data
#'
#' Dataset from CASDataset R package
#'
#' @name frenchmotor
#' @details See also [CASdatasets](http://cas.uqam.ca/). Variables are
#' \itemize{
#' \item LicAge : num 454 163 332 447 408 222 316 284 459 244 ...
#' \item VehAge : Factor w/ 9 levels "0","1","10+",..: 1 4 2 1 9 8 1 7 8 5 ...
#' \item sensitive : Factor w/ 2 levels "Female","Male": 2 1 2 2 1 1 1 1 2 1 ...
#' \item MariStat : Factor w/ 2 levels "Alone","Other": 2 2 2 1 2 2 2 2 2 2 ...
#' \item SocioCateg : Factor w/ 52 levels "CSP1","CSP16",..: 21 27 23 18 21 21 21 21 21 23 ...
#' \item VehUsage : Factor w/ 4 levels "Private","Private+trip to office",..: 3 1 3 3 2 2 1 2 1 2 ...
#' \item DrivAge : num 56 63 46 58 54 56 45 43 56 38 ...
#' \item HasKmLimit : num 0 0 0 0 0 0 0 0 1 0 ...
#' \item BonusMalus : num 50 85 50 50 54 50 60 50 50 50 ...
#' \item VehBody : Factor w/ 9 levels "bus","cabriolet",..: 7 6 6 6 5 6 6 6 6 6 ...
#' \item VehPrice : Factor w/ 26 levels "A","B","C","D",..: 16 7 8 11 10 8 8 5 6 8 ...
#' \item VehEngine : Factor w/ 6 levels "carburation",..: 2 5 2 2 5 5 5 5 5 5 ...
#' \item VehEnergy : Factor w/ 4 levels "diesel","eletric",..: 1 4 1 1 4 4 4 4 4 4 ...
#' \item VehMaxSpeed: Factor w/ 10 levels "1-130 km/h","130-140 km/h",..: 9 4 5 7 7 6 7 4 4 6 ...
#' \item VehClass : Factor w/ 6 levels "0","A","B","H",..: 6 3 3 5 5 5 3 3 3 5 ...
#' \item y : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
#' \item RiskVar : num 17 16 18 18 17 9 16 20 18 5 ...
#' \item Garage : Factor w/ 3 levels "Collective garage",..: 3 2 3 2 2 2 2 2 2 2 ...#' }
#' }
#'
#' @docType data
#' @usage data(frenchmotor)
#' @format data frame with 12437 obs. of 18 variables
#' @keywords dataset
#' @examples
#' data(frenchmotor)
"frenchmotor"
#' German credit data
#'
#' Dataset from CASDataset R package
#'
#' @name germancredit
#' @details See also [CASdatasets](http://cas.uqam.ca/). Variables are
#' \itemize{
#' \item Account_status : Factor w/ 4 levels "< 0 DM",">= 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
#' \item Duration : num 6 48 12 42 24 36 24 36 12 30 ...
#' \item Credit_history : Factor w/ 5 levels "all credits at this bank paid back duly",..: 2 4 2 4 3 4 4 4 4 2 ...
#' \item Purpose : Factor w/ 10 levels "business","car (new)",..: 8 8 5 6 2 5 6 3 8 2 ...
#' \item Credit_amount : num 1169 5951 2096 7882 4870 ...
#' \item Savings_bonds : Factor w/ 5 levels "< 100 DM",">= 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
#' \item Present_employment_since: Factor w/ 5 levels "< 1 year",">= 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
#' \item Installment_rate : num 4 2 2 2 3 2 3 2 2 4 ...
#' \item Other_debtors_guarantors: Factor w/ 3 levels "co-applicant",..: 3 3 3 2 3 3 3 3 3 3 ...
#' \item Resident_since : num 4 2 3 4 4 4 4 2 4 2 ...
#' \item Property : Factor w/ 4 levels "building society savings agreement / life insurance",..: 3 3 3 1 4 4 1 2 3 2 ...
#' \item Age : num 67 22 49 45 53 35 53 35 61 28 ...
#' \item Other_installment_plans : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
#' \item Housing : Factor w/ 3 levels "rent","own","for free": 2 2 2 3 3 3 2 1 2 2 ...
#' \item Existing_credits : num 2 1 1 1 2 1 1 1 1 2 ...
#' \item Job : Factor w/ 4 levels "management / self-employed / highly qualified employee / officer",..: 2 2 4 2 2 4 2 1 4 1 ...
#' \item People_maintenance_for : num 1 1 2 2 2 2 1 1 1 1 ...
#' \item Telephone : Factor w/ 2 levels "none","yes": 2 1 1 1 1 2 1 2 1 1 ...
#' \item Foreign_worker : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
#' \item y : Factor w/ 2 levels "BAD","GOOD": 2 1 2 2 1 2 2 2 2 1 ...
#' \item sensitive : Factor w/ 2 levels "Female","Male": 1 2 1 1 1 1 1 1 1 1 ...
#' }
#'
#' @docType data
#' @usage data(germancredit)
#' @format data frame
#' @keywords dataset
#' @examples
#' data(germancredit)
"germancredit"
#' Telematic data
#'
#' Telematic dataset (real data)
#'
#' @name telematic
#' @details variables are
#' \itemize{
#' \item id : chr "1" "2" "3" "4" ...
#' \item Total_Distance : num 3649 9066 27156 8558 17853 ...
#' \item Drive_Score : num 67 63 74 77 68 84 71 16 61 80 ...
#' \item Time_Day : num 109 116 73 108 106 63 108 136 117 105 ...
#' \item Style_Score : num 85 92 55 74 65 32 91 16 64 33 ...
#' \item Corner_Score : num 86 93 61 67 61 32 92 19 30 27 ...
#' \item Acceleration_Score: num 88 80 49 91 89 40 94 16 51 42 ...
#' \item Braking_Score : num 84 93 64 82 72 46 90 21 87 50 ...
#' \item Total_Night_Time : num 34 28 24 25 38 20 23 27 38 21 ...
#' \item Total_Time : num 79 339 464 437 649 437 349 24 523 500 ...
#' \item Acceleration : Factor w/ 2 levels "High","Low": 2 1 2 1 1 2 1 2 2 2 ...
#' \item Brake : Factor w/ 2 levels "High","Low": 2 1 1 1 1 2 1 2 1 1 ...
#' \item Corner : Factor w/ 2 levels "High","Low": 1 1 1 1 1 2 1 2 2 2 ...
#' \item insured_gender : Factor w/ 2 levels "F","M": 2 1 1 2 2 2 2 2 2 1 ...
#' \item age : num 21 20 30 52 21 34 22 29 22 53 ...
#' \item claim : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 2 ...
#' }
#'
#' @docType data
#' @usage data(telematic)
#' @format data frame with 1177 obs. of 16 variables
#' @keywords dataset
#' @examples
#' data(telematic)
#' table(telematic$insured_gender, telematic$claim)
"telematic"
#' Toy data no2
#'
#' Toy dataset
#'
#' @name toydata2
#' @details variables are
#' \itemize{
#' \item x1: (continuous) legitimate explanatory variable: num -0.33 -1.22 -1.28 -0.93 0.8 -0.33 0.05 1 -0.07 -1.34 ...
#' \item x2: (continuous) legitimate explanatory variable: num 7.9 3.7 9.6 4.5 9.2 5.8 2.8 1.2 1.4 3.8 ...
#' \item x3: (continuous) legitimate explanatory variable: num -0.23 -1.19 -0.41 -1.05 -0.65 -0.66 -0.89 0.25 -1.53 -0.3 ...
#' \item score: (continuous) true score used to generate outcomes: 0.46 0.09 0.5 0.13 0.84 0.27 0.14 0.22 0.07 0.09 ...
#' \item sensitive: (binary) sensistive attribute: chr "A" "A" "A" "A" ...
#' \item y: (binary) outome: int 1 0 0 0 1 0 0 0 0 0 ...
#' }
#'
#' @docType data
#' @usage data(toydata2)
#' @format data frame with 1000 obs. of 6 variables
#' @keywords dataset
#' @examples
#' data(toydata2)
#' data(toydata2_train)
#' data(toydata2_valid)
"toydata2"
#' Toy data no1
#'
#' Toy dataset
#'
#' @name toydata1
#' @details variables are
#' \itemize{
#' \item x: (continuous) legitimate explanatory variable: num -1.05838 -0.75016 1.54972 0.00442 -1.14538 ...
#' \item s: (binary) sensitive attribute: Factor w/ 2 levels "B","A": 1 2 2 2 2 2 1 1 2 1 ...
#' \item y: (binary) outome: Factor w/ 2 levels "0","1": 1 1 2 1 2 2 2 1 1 1 ...
#' }
#'
#' @docType data
#' @usage data(toydata1)
#' @format data frame with 1000 obs. of 3 variables
#' @keywords dataset
#' @examples
#' data(toydata1)
#' data(toydata1_train)
#' data(toydata1_valid)
"toydata1"
#' Optimal transport plot for scores
#'
#' Plots the optimal transport between two wamples
#'
#' @param yA vector of predictions in group A
#' @param yB vector of predictions in group B
#' @param densA kernel density estimate associated with sample yA
#' @param densB kernel density estimate associated with sample yB
#' @param limA range of values for yA
#' @param limB range of values for yB
#' @param limY magnitude of the density
#' @param lab names for colors (default A and B)
#' @param sub number of subdivisions of scores axis
#' @return A plot
#' @examples
#' NA
#' @import scales
#' @export
draw_transport = function(yA, yB,
densA, densB,
limA = c(0,1),
limB=c(0,1),
limY = c(0,12),
lab = c("A","B"),
sub = 5){
d_A = data.frame(x=seq(limA[1],limA[2],length=251),
y=densA(seq(limA[1],limA[2],length=251)))
d_B = data.frame(x=seq(limB[1],limB[2],length=251),
y=densB(seq(limB[1],limB[2],length=251)))
{
mat <- matrix(c(1,2,0,3), 2)
par(mfrow = c(2,2))
layout(mat, c(3.5,1), c(1,3))
par(mar = c(0.5, 4.5, 0.5, 0.5))
}
plot(d_A$x,d_A$y,type="l",col=COLORS[lab[1]],
lwd=2, axes=FALSE ,xlab="", ylab="",
xlim=limA, ylim=limY)
polygon(c(0,d_A$x,1),c(0,d_A$y,0),
col=scales::alpha(COLORS[lab[1]],.1),border=NA)
axis(1,at=seq(limA[1],limA[2],length=sub),label=c(NA,seq(limA[1],limA[2],length=sub)[-1]))
par(mar=c(4.5, 4.5, 0.5, 0.5))
plot(plot_transport(yA,yB),
col = COLORS["1"], lwd = 2,
type = "l",
xlab = "",
ylab = "",
xlim=limA,
ylim=limB,
axes=FALSE)
abline(a = 0, b = 1, col = COLORS["0"], lty = 2)
axis(1)
axis(2)
mtext("score distribution, S=A", side=1,line=3, col = "black")
mtext("score distribution, S=B", side=2,line=3, col = "black")
par(mar = c(4.5, 0.5, 0.5, 0.5))
plot(d_B$y,d_B$x,type="l",col=COLORS[lab[2]],
lwd=2,ylim=limB,xlim=limY, xlab="",ylab="",axes=FALSE)
polygon(c(0,d_B$y,0),c(0,d_B$x,1), col=scales::alpha(COLORS[lab[2]],.1),
border=NA)
axis(2,at=seq(limB[1],limB[2],length=sub),label=c(NA,seq(limB[1],limB[2],length=sub)[-c(1,sub)],NA))
}
#' Beta-kernel density estimation of a score
#'
#' Estimate the density of the score defined on [0,1]
#'
#' @param y The vector of scores
#' @param b The kernel bandwidth
#' @return A function that is the density estimate
#' @examples
#' d1 <- density_score(runif(100));
#' d2 <- density_score(rbeta(100,2,1));
#' @import kdensity
#' @export
density_score = function(y,b = NA){
if( is.na(b)) d = kdensity::kdensity(y ,kernel="beta")
if(!is.na(b)) d = kdensity::kdensity(y ,kernel="beta", bw = b)
return(d)
}
#' Compute the calibration function
#'
#' Compute values use to plot the calibration plot of a score defined on $[0,1]$
#'
#' @param predy The vector of predicted scores (in [0,1])
#' @param y The vector of observed values (in {0,1})
#' @param u The grid to compute values for the calibration plot (in $[0,1]$)
#' @param a Smoothing parameter for the local regression
#' @return A dataframe with $x$ and $y$ used in the calibration plot
#' @examples
#' d1 <- 1
#' @import locfit
#' @export
plot_calibration = function(predy,
y,
u=seq(0,1,by=.01),
a=.05){
L = locfit::locfit.raw(
x=predy,
y=y,
kern="rect", deg=0, alpha=a)
caly = predict(L,newdata=u)
return(data.frame(x=u,
y=caly))
}
#' Compute the transport function on a grid
#'
#' Compute values use to plot the (univariate) optimal transport
#'
#' @param y0 The vector of initial vector
#' @param y1 The vector of final vector
#' @param u The grid to compute values for the transport plot
#' @return A dataframe with $x$ and $y$ used in the transport function plot
#' @seealso [OPT()]
#' @examples
#' str(plot_transport(y0 = rnorm(100,0,1),
#' y1 = rnorm(100,1,2),
#' u = seq(-3,3,by=.1))
#' @export
plot_transport = function(y0,
y1,
u=seq(min(y0),max(y0),length=251)){
F0 = function(y) mean(y0<=y)
Q1 = function(u) quantile(y1, u)
F0u = Vectorize(F0)(u)
Q1u = Vectorize(Q1)(F0u)
return(data.frame(x=u,
y=Q1u))
}
#' Compute the transport function
#'
#' Function for (univariate) optimal transport, see also [wikipedia](https://en.wikipedia.org/wiki/Transportation_theory_(mathematics)#Optimal_transportation_on_the_real_line)
#'
#' @param y0 The vector of initial vector
#' @param y1 The vector of final vector
#' @param u Value at which transport is applied
#' @return The transported value
#' @examples
#' str(OPT(y0 = rnorm(100,0,1),
#' y1 = rnorm(100,1,2),
#' u = 0)
#' @export
OPT = function(y0,y1,u=0){
F0 = function(y) mean(y0<=y)
Q1 = function(p) quantile(y1, p)
return(Q1u(F0(u)))
}
#' Fairness metrics
#'
#' Compute different fairness metrics
#'
#' @param threshold a model
#' @param name_outcome xxxxx
#' @param outcome_ref xxxxx
#' @param name_sensitive xxxxx
#' @param sensitive_ref xxxxx
#' @param base xxxxx
#' @param pred_outcome xxxxx
#' @return a vector with 11 metrics, and details (table and graphs) on those 1. metrics
#' @examples
#' data(toydata2)
#' model_gam = mgcv::gam(y~s(x1,k=12,bs="cr")+s(x2,k=12,bs="cr")+s(x3,k=12,bs="cr"),
#' data=toydata2,family=binomial)
#' pred_y = predict(model_gam, type="response")
#' fair_metrics(pred_outcome = pred_y)$metrics
#' @import fairness
#' @import mgcv
#' @export
fair_metrics = function(threshold=.5,
name_outcome = "y",
outcome_ref = "1",
base = toydata2,
name_sensitive = "sensitive",
sensitive_ref = "B",
pred_outcome = probs){
vect = rep(NA,11)
res_prp <- fairness::pred_rate_parity(data = base,
outcome = name_outcome,
outcome_base = outcome_ref,
group = name_sensitive,
probs = pred_outcome,
cutoff = threshold,
base = sensitive_ref)
vect[1] = res_prp$Metric_plot$data[2,1]
res_dem <- fairness::dem_parity(data = base,
outcome = name_outcome,
outcome_base = outcome_ref,
group = name_sensitive,
probs = pred_outcome,
cutoff = threshold,
base = sensitive_ref)
vect[2] = res_dem$Metric_plot$data[2,1]
res_prop <- fairness::prop_parity(data = base,
outcome = name_outcome,
outcome_base = outcome_ref,
group = name_sensitive,
probs = pred_outcome,
cutoff = threshold,
base = sensitive_ref)
vect[3] = res_prop$Metric_plot$data[2,1]
res_eq <- fairness::equal_odds(data = base,
outcome = name_outcome,
outcome_base = outcome_ref,
group = name_sensitive,
probs = pred_outcome,
cutoff = threshold,
base = sensitive_ref)
vect[4] = res_eq$Metric_plot$data[2,1]
res_acc <- fairness::acc_parity(data = base,
outcome = name_outcome,
outcome_base = outcome_ref,
group = name_sensitive,
probs = pred_outcome,
cutoff = threshold,
base = sensitive_ref)
vect[5] = res_acc$Metric_plot$data[2,1]
res_fnr <- fairness::fnr_parity(data = base,
outcome = name_outcome,
outcome_base = outcome_ref,
group = name_sensitive,
probs = pred_outcome,
cutoff = threshold,
base = sensitive_ref)
vect[6] = res_fnr$Metric_plot$data[2,1]
res_fpr <- fairness::fpr_parity(data = base,
outcome = name_outcome,
outcome_base = outcome_ref,
group = name_sensitive,
probs = pred_outcome,
cutoff = threshold,
base = sensitive_ref)
vect[7] = res_fpr$Metric_plot$data[2,1]
res_npv <- fairness::npv_parity(data = base,
outcome = name_outcome,
outcome_base = outcome_ref,
group = name_sensitive,
probs = pred_outcome,
cutoff = threshold,
base = sensitive_ref)
vect[8] = res_npv$Metric_plot$data[2,1]
res_sp <- fairness::spec_parity(data = base,
outcome = name_outcome,
outcome_base = outcome_ref,
group = name_sensitive,
probs = pred_outcome,
cutoff = threshold,
base = sensitive_ref)
vect[9] = res_sp$Metric_plot$data[2,1]
res_auc <- fairness::roc_parity(data = base,
outcome = name_outcome,
group = name_sensitive,
probs = pred_outcome,
base = sensitive_ref)
vect[10] = res_auc$Metric_plot$data[2,1]
res_mcc <- fairness::mcc_parity(data = base,
outcome = name_outcome,
outcome_base = outcome_ref,
group = name_sensitive,
probs = pred_outcome,
base = sensitive_ref)
vect[11] = res_mcc$Metric_plot$data[2,1]
names(vect) = c("pred_rate_parity",
"dem_parity",
"prop_parity",
"equal_odds",
"acc_parity",
"fnr_parity",
"fpr_parity",
"npv_parity",
"spec_parity",
"roc_parity",
"mcc_parity")
L=list(metrics = vect,
pred_rate_parity = res_prp,
dem_parity = res_dem,
prop_parity = res_prop,
equal_odds = res_eq,
acc_parity = res_acc,
fnr_parity = res_fnr,
fpr_parity = res_fpr,
npv_parity = res_npv,
spec_parity = res_mcc,
roc_parity = res_auc,
mcc_parity = res_mcc)
return(L)
}