-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path4-creditcard.html
905 lines (859 loc) · 62.9 KB
/
4-creditcard.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="zh-Hans" xml:lang="zh-Hans"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-0.9.464">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<title>kaggle - 4 信用卡欺诈识别</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
<script id="quarto-search-options" type="application/json">{
"location": "sidebar",
"copy-button": false,
"collapse-after": 3,
"panel-placement": "start",
"type": "textbox",
"limit": 20,
"language": {
"search-no-results-text": "No results",
"search-matching-documents-text": "matching documents",
"search-copy-link-title": "Copy link to search",
"search-hide-matches-text": "Hide additional matches",
"search-more-match-text": "more match in this document",
"search-more-matches-text": "more matches in this document",
"search-clear-button-title": "Clear",
"search-detached-cancel-button-title": "Cancel",
"search-submit-button-title": "Submit"
}
}</script>
<script src="site_libs\quarto-nav\quarto-nav.js"></script><script src="site_libs\quarto-nav\headroom.min.js"></script><script src="site_libs\clipboard\clipboard.min.js"></script><script src="site_libs\quarto-search\autocomplete.umd.js"></script><script src="site_libs\quarto-search\fuse.min.js"></script><script src="site_libs\quarto-search\quarto-search.js"></script><meta name="quarto:offset" content="./"><link href="/5-Student-performance-level.html" rel="next"><link href="/3-HR-comma-sep.html" rel="prev"><script src="site_libs\quarto-html\quarto.js"></script><script src="site_libs\quarto-html\popper.min.js"></script><script src="site_libs\quarto-html\tippy.umd.min.js"></script><script src="site_libs\quarto-html\anchor.min.js"></script><link href="site_libs\quarto-html\tippy.css" rel="stylesheet"><link href="site_libs\quarto-html\quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles"><script src="site_libs\bootstrap\bootstrap.min.js"></script><link href="site_libs\bootstrap\bootstrap-icons.css" rel="stylesheet"><link href="site_libs\bootstrap\bootstrap.min.css" rel="stylesheet"></head>
<body class="nav-sidebar floating">
<div id="quarto-search-results"></div>
<header id="quarto-header" class="headroom fixed-top">
<nav class="quarto-secondary-nav" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
<div class="container-fluid d-flex justify-content-between">
<h1 class="quarto-secondary-nav-title"><span class="chapter-number">4</span> <span class="chapter-title">信用卡欺诈识别</span></h1>
<button type="button" class="quarto-btn-toggle btn" aria-label="Show secondary navigation">
<i class="bi bi-chevron-right"></i>
</button>
</div>
</nav>
</header>
<!-- content -->
<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article">
<!-- sidebar -->
<nav id="quarto-sidebar" class="sidebar collapse sidebar-navigation floating overflow-auto">
<div class="pt-lg-2 mt-2 text-left sidebar-header">
<div class="sidebar-title mb-0 py-0">
<a href="./">kaggle</a>
</div>
</div>
<div class="mt-2 flex-shrink-0 align-items-center">
<div class="sidebar-search">
<div id="quarto-search" class="" title="Search"></div>
</div>
</div>
<div class="sidebar-menu-container">
<ul class="list-unstyled mt-1">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="./index.html" class="sidebar-item-text sidebar-link">简介</a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="./1-vgsales.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">1</span> <span class="chapter-title">Video Games Sales</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="./2-Olympic-history.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">2</span> <span class="chapter-title">Olympic history</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="./3-HR-comma-sep.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">3</span> <span class="chapter-title">员工离职分析</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="./4-creditcard.html" class="sidebar-item-text sidebar-link active"><span class="chapter-number">4</span> <span class="chapter-title">信用卡欺诈识别</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="./5-Student-performance-level.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">5</span> <span class="chapter-title">学生成绩水平分类</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="./6-mass-shoot.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">6</span> <span class="chapter-title">美国大规模枪击案</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="./7-hotel-demond.html" class="sidebar-item-text sidebar-link"><span class="chapter-number">7</span> <span class="chapter-title">酒店房间预定预测</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="./9990-references.html" class="sidebar-item-text sidebar-link">参考文献</a>
</div>
</li>
</ul>
</div>
</nav>
<!-- margin-sidebar -->
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<nav id="TOC" role="doc-toc">
<h2 id="toc-title">Table of contents</h2>
<ul>
<li><a href="#sec:five1" id="toc-sec:five1" class="nav-link active" data-scroll-target="#sec\:five1"> <span class="header-section-number">4.1</span> 数据变量说明</a></li>
<li><a href="#sec:five2" id="toc-sec:five2" class="nav-link" data-scroll-target="#sec\:five2"> <span class="header-section-number">4.2</span> 数据预处理</a>
<ul class="collapse">
<li><a href="#sec:five22" id="toc-sec:five22" class="nav-link" data-scroll-target="#sec\:five22"> <span class="header-section-number">4.2.1</span> 分层抽样</a></li>
<li><a href="#sec:five23" id="toc-sec:five23" class="nav-link" data-scroll-target="#sec\:five23"> <span class="header-section-number">4.2.2</span> 标准化</a></li>
</ul></li>
<li><a href="#sec:five3" id="toc-sec:five3" class="nav-link" data-scroll-target="#sec\:five3"> <span class="header-section-number">4.3</span> 描述性分析</a>
<ul class="collapse">
<li><a href="#不同时间诈骗次数-条形图" id="toc-不同时间诈骗次数-条形图" class="nav-link" data-scroll-target="#不同时间诈骗次数-条形图"> <span class="header-section-number">4.3.1</span> 不同时间诈骗次数-条形图</a></li>
<li><a href="#不同时间诈骗金额-箱线图" id="toc-不同时间诈骗金额-箱线图" class="nav-link" data-scroll-target="#不同时间诈骗金额-箱线图"> <span class="header-section-number">4.3.2</span> 不同时间诈骗金额-箱线图</a></li>
<li><a href="#不同时间平均诈骗金额-条形图" id="toc-不同时间平均诈骗金额-条形图" class="nav-link" data-scroll-target="#不同时间平均诈骗金额-条形图"> <span class="header-section-number">4.3.3</span> 不同时间平均诈骗金额-条形图</a></li>
</ul></li>
<li><a href="#sec:five4" id="toc-sec:five4" class="nav-link" data-scroll-target="#sec\:five4"> <span class="header-section-number">4.4</span> 自动参数调整调参-使用<code>caret</code>包</a></li>
<li><a href="#sec:five5" id="toc-sec:five5" class="nav-link" data-scroll-target="#sec\:five5"> <span class="header-section-number">4.5</span> kNN建模</a>
<ul class="collapse">
<li><a href="#原理" id="toc-原理" class="nav-link" data-scroll-target="#原理"> <span class="header-section-number">4.5.1</span> 原理</a></li>
<li><a href="#模型建立" id="toc-模型建立" class="nav-link" data-scroll-target="#模型建立"> <span class="header-section-number">4.5.2</span> 模型建立</a></li>
</ul></li>
<li><a href="#sec:five6" id="toc-sec:five6" class="nav-link" data-scroll-target="#sec\:five6"> <span class="header-section-number">4.6</span> 模型评估</a></li>
</ul>
</nav>
</div>
<!-- main -->
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title"><span id="creditcard" class="quarto-section-identifier d-none d-lg-block"><span class="chapter-number">4</span> <span class="chapter-title">信用卡欺诈识别</span></span></h1>
</div>
<div class="quarto-title-meta">
</div>
</header>
<section id="sec:five1" class="level2" data-number="4.1">
<h2 data-number="4.1" class="anchored" data-anchor-id="sec:five1"><span class="header-section-number">4.1</span> 数据变量说明</h2>
<p><a href="https://www.kaggle.com/arockiaselciaa/creditcardcsv">变量说明</a></p>
<p><code>Class</code>变量:0表示非欺诈,1表示欺诈。</p>
</section>
<section id="sec:five2" class="level2" data-number="4.2">
<h2 data-number="4.2" class="anchored" data-anchor-id="sec:five2"><span class="header-section-number">4.2</span> 数据预处理</h2>
<div class="cell">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a>card <span class="ot"><-</span> <span class="fu">read_csv</span>(<span class="st">"data/creditcard.csv"</span>)</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a>card <span class="ot"><-</span> <span class="fu">as.data.frame</span>(card)</span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(card) <span class="co"># 查看数据基本结构和数据类型</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>'data.frame': 284807 obs. of 31 variables:
$ Time : num 0 0 1 1 2 2 4 7 7 9 ...
$ V1 : num -1.36 1.192 -1.358 -0.966 -1.158 ...
$ V2 : num -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
$ V3 : num 2.536 0.166 1.773 1.793 1.549 ...
$ V4 : num 1.378 0.448 0.38 -0.863 0.403 ...
$ V5 : num -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
$ V6 : num 0.4624 -0.0824 1.8005 1.2472 0.0959 ...
$ V7 : num 0.2396 -0.0788 0.7915 0.2376 0.5929 ...
$ V8 : num 0.0987 0.0851 0.2477 0.3774 -0.2705 ...
$ V9 : num 0.364 -0.255 -1.515 -1.387 0.818 ...
$ V10 : num 0.0908 -0.167 0.2076 -0.055 0.7531 ...
$ V11 : num -0.552 1.613 0.625 -0.226 -0.823 ...
$ V12 : num -0.6178 1.0652 0.0661 0.1782 0.5382 ...
$ V13 : num -0.991 0.489 0.717 0.508 1.346 ...
$ V14 : num -0.311 -0.144 -0.166 -0.288 -1.12 ...
$ V15 : num 1.468 0.636 2.346 -0.631 0.175 ...
$ V16 : num -0.47 0.464 -2.89 -1.06 -0.451 ...
$ V17 : num 0.208 -0.115 1.11 -0.684 -0.237 ...
$ V18 : num 0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
$ V19 : num 0.404 -0.146 -2.262 -1.233 0.803 ...
$ V20 : num 0.2514 -0.0691 0.525 -0.208 0.4085 ...
$ V21 : num -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
$ V22 : num 0.27784 -0.63867 0.77168 0.00527 0.79828 ...
$ V23 : num -0.11 0.101 0.909 -0.19 -0.137 ...
$ V24 : num 0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
$ V25 : num 0.129 0.167 -0.328 0.647 -0.206 ...
$ V26 : num -0.189 0.126 -0.139 -0.222 0.502 ...
$ V27 : num 0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
$ V28 : num -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
$ Amount: num 149.62 2.69 378.66 123.5 69.99 ...
$ Class : num 0 0 0 0 0 0 0 0 0 0 ...</code></pre>
</div>
<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">summary</span>(card) <span class="co"># 查看数据的主要描述性统计量</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> Time V1 V2 V3
Min. : 0 Min. :-56.40751 Min. :-72.71573 Min. :-48.3256
1st Qu.: 54202 1st Qu.: -0.92037 1st Qu.: -0.59855 1st Qu.: -0.8904
Median : 84692 Median : 0.01811 Median : 0.06549 Median : 0.1799
Mean : 94814 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
3rd Qu.:139321 3rd Qu.: 1.31564 3rd Qu.: 0.80372 3rd Qu.: 1.0272
Max. :172792 Max. : 2.45493 Max. : 22.05773 Max. : 9.3826
V4 V5 V6 V7
Min. :-5.68317 Min. :-113.74331 Min. :-26.1605 Min. :-43.5572
1st Qu.:-0.84864 1st Qu.: -0.69160 1st Qu.: -0.7683 1st Qu.: -0.5541
Median :-0.01985 Median : -0.05434 Median : -0.2742 Median : 0.0401
Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.74334 3rd Qu.: 0.61193 3rd Qu.: 0.3986 3rd Qu.: 0.5704
Max. :16.87534 Max. : 34.80167 Max. : 73.3016 Max. :120.5895
V8 V9 V10 V11
Min. :-73.21672 Min. :-13.43407 Min. :-24.58826 Min. :-4.79747
1st Qu.: -0.20863 1st Qu.: -0.64310 1st Qu.: -0.53543 1st Qu.:-0.76249
Median : 0.02236 Median : -0.05143 Median : -0.09292 Median :-0.03276
Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
3rd Qu.: 0.32735 3rd Qu.: 0.59714 3rd Qu.: 0.45392 3rd Qu.: 0.73959
Max. : 20.00721 Max. : 15.59500 Max. : 23.74514 Max. :12.01891
V12 V13 V14 V15
Min. :-18.6837 Min. :-5.79188 Min. :-19.2143 Min. :-4.49894
1st Qu.: -0.4056 1st Qu.:-0.64854 1st Qu.: -0.4256 1st Qu.:-0.58288
Median : 0.1400 Median :-0.01357 Median : 0.0506 Median : 0.04807
Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
3rd Qu.: 0.6182 3rd Qu.: 0.66251 3rd Qu.: 0.4931 3rd Qu.: 0.64882
Max. : 7.8484 Max. : 7.12688 Max. : 10.5268 Max. : 8.87774
V16 V17 V18
Min. :-14.12985 Min. :-25.16280 Min. :-9.498746
1st Qu.: -0.46804 1st Qu.: -0.48375 1st Qu.:-0.498850
Median : 0.06641 Median : -0.06568 Median :-0.003636
Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
3rd Qu.: 0.52330 3rd Qu.: 0.39968 3rd Qu.: 0.500807
Max. : 17.31511 Max. : 9.25353 Max. : 5.041069
V19 V20 V21
Min. :-7.213527 Min. :-54.49772 Min. :-34.83038
1st Qu.:-0.456299 1st Qu.: -0.21172 1st Qu.: -0.22839
Median : 0.003735 Median : -0.06248 Median : -0.02945
Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
3rd Qu.: 0.458949 3rd Qu.: 0.13304 3rd Qu.: 0.18638
Max. : 5.591971 Max. : 39.42090 Max. : 27.20284
V22 V23 V24
Min. :-10.933144 Min. :-44.80774 Min. :-2.83663
1st Qu.: -0.542350 1st Qu.: -0.16185 1st Qu.:-0.35459
Median : 0.006782 Median : -0.01119 Median : 0.04098
Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
3rd Qu.: 0.528554 3rd Qu.: 0.14764 3rd Qu.: 0.43953
Max. : 10.503090 Max. : 22.52841 Max. : 4.58455
V25 V26 V27
Min. :-10.29540 Min. :-2.60455 Min. :-22.565679
1st Qu.: -0.31715 1st Qu.:-0.32698 1st Qu.: -0.070840
Median : 0.01659 Median :-0.05214 Median : 0.001342
Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
3rd Qu.: 0.35072 3rd Qu.: 0.24095 3rd Qu.: 0.091045
Max. : 7.51959 Max. : 3.51735 Max. : 31.612198
V28 Amount Class
Min. :-15.43008 Min. : 0.00 Min. :0.000000
1st Qu.: -0.05296 1st Qu.: 5.60 1st Qu.:0.000000
Median : 0.01124 Median : 22.00 Median :0.000000
Mean : 0.00000 Mean : 88.35 Mean :0.001728
3rd Qu.: 0.07828 3rd Qu.: 77.17 3rd Qu.:0.000000
Max. : 33.84781 Max. :25691.16 Max. :1.000000 </code></pre>
</div>
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="fu">round</span>(<span class="fu">prop.table</span>(<span class="fu">table</span>(card<span class="sc">$</span>Class)),<span class="dv">4</span>)<span class="co"># 查看数据类别比例</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>
0 1
0.9983 0.0017 </code></pre>
</div>
</div>
<section id="sec:five22" class="level3" data-number="4.2.1">
<h3 data-number="4.2.1" class="anchored" data-anchor-id="sec:five22"><span class="header-section-number">4.2.1</span> 分层抽样</h3>
<p>处理类别不平衡的数据需要了解的几个概念点:</p>
<ol type="1">
<li><p>类别不平衡:指分类任务中不同类别的训练样本数目差别很大的情况。</p></li>
<li><p>欠抽样:指从某类(样本数占比很大)的样本中抽取出与另一类样本(样本数占比很小)个数一样的样本。即从大类别中抽取与小类别数目一样的样本。</p></li>
<li><p>过抽样:指针对样本数占比很小的类别,重新塑造一些数据,使其与另一类数据接近。</p></li>
</ol>
<p>对数据进行一些基本转化。</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="co"># 把Time列转换为小时</span></span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a>card <span class="ot"><-</span> card <span class="sc">%>%</span> </span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">Time_Hour =</span> <span class="fu">round</span>(card[, <span class="dv">1</span>]<span class="sc">/</span><span class="dv">3600</span>, <span class="dv">0</span>))</span>
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a><span class="co"># 把Class列转化为因子型</span></span>
<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a>card<span class="sc">$</span>Class <span class="ot"><-</span> <span class="fu">factor</span>(card<span class="sc">$</span>Class)</span>
<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a>card_1 <span class="ot"><-</span> card[card<span class="sc">$</span>Class <span class="sc">==</span> <span class="st">"1"</span>, ] <span class="co"># 欺诈样本</span></span>
<span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a>card_0 <span class="ot"><-</span> card[card<span class="sc">$</span>Class <span class="sc">==</span> <span class="st">"0"</span>, ] <span class="co"># 非欺诈样本</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>随机抽取与欺诈样本个数相同的非欺诈样本数据,并与原欺诈样本合并为新的数据。此处使用的是欠抽样的方法。</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb8"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">1234</span>)</span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a>index <span class="ot"><-</span> <span class="fu">sample</span>(<span class="at">x =</span> <span class="dv">1</span><span class="sc">:</span><span class="fu">nrow</span>(card_0), <span class="at">size =</span> <span class="fu">nrow</span>(card_1))</span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a>card_0_new <span class="ot"><-</span> card_0[index, ]</span>
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a>card_end <span class="ot"><-</span> <span class="fu">rbind</span>(card_0_new, card_1)</span>
<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a><span class="co"># 剔除Time列,用Time_Hour列代替。everything()选择所有的变量</span></span>
<span id="cb8-7"><a href="#cb8-7" aria-hidden="true" tabindex="-1"></a>card_end <span class="ot"><-</span> card_end[<span class="sc">-</span><span class="dv">1</span>] <span class="sc">%>%</span> </span>
<span id="cb8-8"><a href="#cb8-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">select</span>(Time_Hour, <span class="fu">everything</span>())</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>按照类别进行分层抽样,建立训练集和测试集。</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">1234</span>)</span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="co"># 按照新数据的目标变量进行8:2划分(训练集:测试集)</span></span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a>index2 <span class="ot"><-</span> <span class="fu">createDataPartition</span>(card_end<span class="sc">$</span>Class,</span>
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a> <span class="at">p =</span> <span class="fl">0.8</span>, <span class="at">list =</span> F)</span>
<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a>train_data <span class="ot"><-</span> card_end[index2, ] <span class="co"># 创建训练集</span></span>
<span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a>test_data <span class="ot"><-</span> card_end[<span class="sc">-</span>index2, ] <span class="co"># 创建测试集</span></span>
<span id="cb9-7"><a href="#cb9-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb9-8"><a href="#cb9-8" aria-hidden="true" tabindex="-1"></a><span class="co"># 验证抽样结果,统计三个数据集中正反样本比例是否一致</span></span>
<span id="cb9-9"><a href="#cb9-9" aria-hidden="true" tabindex="-1"></a><span class="fu">table</span>(card_end<span class="sc">$</span>Class)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>
0 1
492 492 </code></pre>
</div>
<div class="sourceCode cell-code" id="cb11"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="fu">table</span>(train_data<span class="sc">$</span>Class)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>
0 1
394 394 </code></pre>
</div>
<div class="sourceCode cell-code" id="cb13"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="fu">table</span>(test_data<span class="sc">$</span>Class)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>
0 1
98 98 </code></pre>
</div>
</div>
</section>
<section id="sec:five23" class="level3" data-number="4.2.2">
<h3 data-number="4.2.2" class="anchored" data-anchor-id="sec:five23"><span class="header-section-number">4.2.2</span> 标准化</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb15"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a>standard <span class="ot"><-</span> <span class="fu">preProcess</span>(card_end, <span class="at">method =</span> <span class="st">"range"</span>) </span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a>card_s <span class="ot"><-</span> <span class="fu">predict</span>(standard, card_end)</span>
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a>train_data2 <span class="ot"><-</span> card_s[index2, ]</span>
<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a>test_data2 <span class="ot"><-</span> card_s[<span class="sc">-</span>index2, ]</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
</section>
<section id="sec:five3" class="level2" data-number="4.3">
<h2 data-number="4.3" class="anchored" data-anchor-id="sec:five3"><span class="header-section-number">4.3</span> 描述性分析</h2>
<section id="不同时间诈骗次数-条形图" class="level3" data-number="4.3.1">
<h3 data-number="4.3.1" class="anchored" data-anchor-id="不同时间诈骗次数-条形图"><span class="header-section-number">4.3.1</span> 不同时间诈骗次数-条形图</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb16"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(card_1, <span class="fu">aes</span>(<span class="at">x =</span> <span class="fu">factor</span>(Time_Hour), </span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a> <span class="at">fill =</span> <span class="fu">factor</span>(Time_Hour)))<span class="sc">+</span></span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"count"</span>) <span class="sc">+</span></span>
<span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_classic</span>() <span class="sc">+</span></span>
<span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">x =</span> <span class="st">"Time_Hour"</span>, <span class="at">y =</span> <span class="st">"Count"</span>) <span class="sc">+</span></span>
<span id="cb16-6"><a href="#cb16-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.position =</span> <span class="st">"none"</span>,</span>
<span id="cb16-7"><a href="#cb16-7" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">angle =</span> <span class="dv">90</span>,</span>
<span id="cb16-8"><a href="#cb16-8" aria-hidden="true" tabindex="-1"></a> <span class="at">vjust =</span> <span class="fl">0.5</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="4-creditcard_files/figure-html/card-times-1.png" class="img-fluid figure-img" alt="不同时间诈骗次数条形图" width="672"></p>
<p></p><figcaption aria-hidden="true" class="figure-caption">不同时间诈骗次数</figcaption><p></p>
</figure>
</div>
</div>
</div>
<p>由上图(不同时间诈骗次数)可知:</p>
<ul>
<li><p>第一天(0~24h)的诈骗总次数大于第二天(25~48h)。</p></li>
<li><p>诈骗发生次数最多的三个时间段分别是:</p>
<ol type="1">
<li>第二天凌晨2点左右。</li>
<li>第一天上午11点左右。</li>
<li>第一天凌晨2点左右。</li>
</ol></li>
</ul>
</section>
<section id="不同时间诈骗金额-箱线图" class="level3" data-number="4.3.2">
<h3 data-number="4.3.2" class="anchored" data-anchor-id="不同时间诈骗金额-箱线图"><span class="header-section-number">4.3.2</span> 不同时间诈骗金额-箱线图</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb17"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(card_1, <span class="fu">aes</span>(<span class="at">x =</span> <span class="fu">factor</span>(Time_Hour),</span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a> <span class="at">y =</span> Amount, </span>
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a> <span class="at">fill =</span> <span class="fu">factor</span>(Time_Hour))) <span class="sc">+</span></span>
<span id="cb17-4"><a href="#cb17-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_boxplot</span>() <span class="sc">+</span></span>
<span id="cb17-5"><a href="#cb17-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_hline</span>(<span class="fu">aes</span>(<span class="at">yintercept =</span><span class="dv">250</span>, <span class="at">color =</span> <span class="st">"red"</span>)) <span class="sc">+</span> </span>
<span id="cb17-6"><a href="#cb17-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">annotate</span>(<span class="st">"text"</span>, <span class="at">x =</span> <span class="dv">6</span>, <span class="at">y =</span> <span class="dv">500</span>, <span class="at">label =</span> <span class="st">"Amount = 250"</span>, <span class="at">color =</span> <span class="st">"red"</span>) <span class="sc">+</span></span>
<span id="cb17-7"><a href="#cb17-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_curve</span>(<span class="at">x =</span> <span class="dv">3</span>, <span class="at">y =</span> <span class="dv">450</span>, <span class="at">xend =</span> <span class="dv">5</span>, <span class="at">yend =</span> <span class="dv">250</span>, <span class="at">angle =</span> <span class="dv">25</span>, <span class="at">color =</span> <span class="st">"red"</span>,</span>
<span id="cb17-8"><a href="#cb17-8" aria-hidden="true" tabindex="-1"></a> <span class="at">arrow =</span> <span class="fu">arrow</span>(<span class="at">length =</span> <span class="fu">unit</span>(<span class="fl">0.25</span>, <span class="st">"cm"</span>))) <span class="sc">+</span></span>
<span id="cb17-9"><a href="#cb17-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_classic</span>() <span class="sc">+</span></span>
<span id="cb17-10"><a href="#cb17-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">x =</span> <span class="st">"Time_Hour"</span>, <span class="at">y =</span> <span class="st">"Amount"</span>) <span class="sc">+</span></span>
<span id="cb17-11"><a href="#cb17-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.position =</span> <span class="st">"none"</span>, </span>
<span id="cb17-12"><a href="#cb17-12" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">angle =</span> <span class="dv">90</span>, </span>
<span id="cb17-13"><a href="#cb17-13" aria-hidden="true" tabindex="-1"></a> <span class="at">vjust =</span> <span class="fl">0.5</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="4-creditcard_files/figure-html/card-amount-1.png" class="img-fluid figure-img" alt="不同时间诈骗金额箱线图" width="672"></p>
<p></p><figcaption aria-hidden="true" class="figure-caption">不同时间诈骗金额</figcaption><p></p>
</figure>
</div>
</div>
</div>
<p>由上图(不同时间诈骗金额)可知:</p>
<ul>
<li><p>诈骗金额最多的一次发生在第二天下午1点左右(34h),诈骗金额达到2000欧元左右。</p></li>
<li><p>诈骗金额普遍在250欧元之内。</p></li>
</ul>
</section>
<section id="不同时间平均诈骗金额-条形图" class="level3" data-number="4.3.3">
<h3 data-number="4.3.3" class="anchored" data-anchor-id="不同时间平均诈骗金额-条形图"><span class="header-section-number">4.3.3</span> 不同时间平均诈骗金额-条形图</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb18"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="co"># 提取所需数据</span></span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a>card_1_mean <span class="ot"><-</span> card_1 <span class="sc">%>%</span> </span>
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">group_by</span>(Time_Hour) <span class="sc">%>%</span> </span>
<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarise</span>(<span class="at">MeanAmount =</span> <span class="fu">mean</span>(Amount))</span>
<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(card_1_mean, <span class="fu">aes</span>(<span class="at">x =</span> <span class="fu">factor</span>(Time_Hour), <span class="at">y =</span> MeanAmount, <span class="at">fill =</span> <span class="fu">factor</span>(Time_Hour))) <span class="sc">+</span></span>
<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>(<span class="at">stat =</span> <span class="st">"identity"</span>) <span class="sc">+</span></span>
<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_hline</span>(<span class="fu">aes</span>(<span class="at">yintercept =</span> <span class="dv">200</span>, <span class="at">color =</span> <span class="st">"red"</span>)) <span class="sc">+</span></span>
<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">annotate</span>(<span class="st">"text"</span>, <span class="at">x =</span> <span class="dv">26</span>, <span class="at">y =</span> <span class="dv">240</span>, <span class="at">label =</span> <span class="st">"Mean_Amount = 200"</span>, <span class="at">color =</span> <span class="st">"red"</span>) <span class="sc">+</span></span>
<span id="cb18-11"><a href="#cb18-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_curve</span>(<span class="at">x =</span> <span class="dv">23</span>, <span class="at">y =</span> <span class="dv">220</span>, <span class="at">xend =</span> <span class="dv">24</span>, <span class="at">yend =</span> <span class="dv">200</span>, </span>
<span id="cb18-12"><a href="#cb18-12" aria-hidden="true" tabindex="-1"></a> <span class="at">curvature =</span> <span class="fl">0.3</span>, <span class="at">arrow =</span> <span class="fu">arrow</span>(<span class="at">length =</span> <span class="fu">unit</span>(<span class="fl">0.2</span>, <span class="st">"cm"</span>)), <span class="at">color =</span> <span class="st">"red"</span>) <span class="sc">+</span></span>
<span id="cb18-13"><a href="#cb18-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_classic</span>() <span class="sc">+</span></span>
<span id="cb18-14"><a href="#cb18-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme</span>(<span class="at">legend.position =</span> <span class="st">"none"</span>,</span>
<span id="cb18-15"><a href="#cb18-15" aria-hidden="true" tabindex="-1"></a> <span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">angle =</span> <span class="dv">90</span>, <span class="at">vjust =</span> <span class="fl">0.5</span>)) <span class="sc">+</span></span>
<span id="cb18-16"><a href="#cb18-16" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">x =</span> <span class="st">"Time_Hour"</span>, <span class="at">y =</span> <span class="st">"Mean_Amount"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="4-creditcard_files/figure-html/card-mean-1.png" class="img-fluid figure-img" alt="不同时间平均诈骗金额条形图" width="672"></p>
<p></p><figcaption aria-hidden="true" class="figure-caption">不同时间平均诈骗金额-条形图</figcaption><p></p>
</figure>
</div>
</div>
</div>
<p>如上图(不同时间平均诈骗金额)所示:</p>
<ul>
<li>平均诈骗金额最多的时间段为第二天下午1点,此时间点包含诈骗金额最多的观测。</li>
<li>总体而言,平均诈骗金额普遍在200欧元以内。</li>
</ul>
</section>
</section>
<section id="sec:five4" class="level2" data-number="4.4">
<h2 data-number="4.4" class="anchored" data-anchor-id="sec:five4"><span class="header-section-number">4.4</span> 自动参数调整调参-使用<code>caret</code>包</h2>
<p><strong>参数调整</strong>是提升模型性能的一个重要过程,而大多数机器学习算法都可以至少调整一个参数。复杂的模型通常可以通过调节多个参数值来调整模型从而达到更好的拟合效果。</p>
<p>e.g.,寻找最合适的k值来调整k近邻模型、调节隐藏层层数和隐藏层的节点数来优化神经网络模型;又如支持向量机模型中的调节核函数以及“软边界”惩罚大小等优化。</p>
<p>值得注意的是,如果对所有可能的调参选项均进行尝试,其复杂度非常大,耗时且不科学,需要一种更系统、科学的方式对模型的参数进行调节。</p>
<p>下表列举了使用<code>caret</code>包进行自动参数调整的模型及其参数:</p>
<table class="table">
<thead>
<tr class="header">
<th style="text-align: center;">模型</th>
<th style="text-align: center;">方法名</th>
<th style="text-align: center;">参数</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: center;">k近邻</td>
<td style="text-align: center;">knn</td>
<td style="text-align: center;">k</td>
</tr>
<tr class="even">
<td style="text-align: center;">朴素贝叶斯</td>
<td style="text-align: center;">nb</td>
<td style="text-align: center;">fL、usekernel</td>
</tr>
<tr class="odd">
<td style="text-align: center;">决策树</td>
<td style="text-align: center;">C5.0</td>
<td style="text-align: center;">model、trials、winnow</td>
</tr>
<tr class="even">
<td style="text-align: center;">OneR规则学习器</td>
<td style="text-align: center;">OneR</td>
<td style="text-align: center;">无</td>
</tr>
<tr class="odd">
<td style="text-align: center;">线性回归</td>
<td style="text-align: center;">lm</td>
<td style="text-align: center;">无</td>
</tr>
<tr class="even">
<td style="text-align: center;">回归树</td>
<td style="text-align: center;">rpart</td>
<td style="text-align: center;">cp</td>
</tr>
<tr class="odd">
<td style="text-align: center;">模型树</td>
<td style="text-align: center;">M5</td>
<td style="text-align: center;">pruned、smoothed、rules</td>
</tr>
<tr class="even">
<td style="text-align: center;">支持向量机(径向基核)</td>
<td style="text-align: center;">svmRadial</td>
<td style="text-align: center;">C, sigma</td>
</tr>
<tr class="odd">
<td style="text-align: center;">随机森林</td>
<td style="text-align: center;">rf</td>
<td style="text-align: center;">mtry</td>
</tr>
</tbody>
</table>
<p><a href="http://topepo.github.io/caret/available-models.html">更多可调节参数的详细信息</a></p>
<p>本案例我们使用knn和随机森林两个模型。</p>
<p>我们用iris数据对自动调参的步骤进行演示。</p>
<ol type="1">
<li>创建简单的调参模型</li>
</ol>
<div class="cell">
<div class="sourceCode cell-code" id="cb19"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">1234</span>)</span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a>m_C50 <span class="ot"><-</span> <span class="fu">train</span>(Species<span class="sc">~</span>., <span class="at">data =</span> iris, <span class="at">method =</span> <span class="st">"C5.0"</span>)</span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a>m_C50</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>C5.0
150 samples
4 predictor
3 classes: 'setosa', 'versicolor', 'virginica'
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 150, 150, 150, 150, 150, 150, ...
Resampling results across tuning parameters:
model winnow trials Accuracy Kappa
rules FALSE 1 0.9353579 0.9019696
rules FALSE 10 0.9370844 0.9045424
rules FALSE 20 0.9325835 0.8976068
rules TRUE 1 0.9382311 0.9062975
rules TRUE 10 0.9407392 0.9099910
rules TRUE 20 0.9385430 0.9066136
tree FALSE 1 0.9347127 0.9009924
tree FALSE 10 0.9369888 0.9044013
tree FALSE 20 0.9332286 0.8985820
tree TRUE 1 0.9375860 0.9053246
tree TRUE 10 0.9399845 0.9088007
tree TRUE 20 0.9392443 0.9076915
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were trials = 10, model = rules and
winnow = TRUE.</code></pre>
</div>
</div>
<p>由上面的结果可以看出,基于<code>model</code>、<code>trials</code>和<code>winnow</code>三个参数,建立并测试了12个决策树(C5.0)模型,每个模型均给出了精度及Kappa统计量,最下方同时展示了最佳候选模型所对应的参数值。其中Kappa统计量用来衡量预测值与真实值之间的一致性(已扣除随机一致的部分),其取值通常解释为:</p>
<ul>
<li>很差的一致性: <0.2</li>
<li>尚可的一致性: 0.2~0.4</li>
<li>中等的一致性: 0.4~0.6</li>
<li>不错的一致性: 0.6~0.8</li>
<li>很好的一致性: 0.8~1</li>
</ul>
<ol start="2" type="1">
<li>定制调参</li>
</ol>
<ul>
<li><p>使用trainControl()函数创建一系列配置选项,这些选项考虑了包括重抽样策略以及用于选择最佳模型的度量这些模型评价标准的管理。主要专注于两个重要的参数:method和selectionFunction。</p>
<ul>
<li><p>method为重抽样的方法。</p></li>
<li><p>selectionFunction参数可以设定一个函数,用于在各个候选者中选取特定的模型,共3个函数:</p>
<ul>
<li><strong>best</strong>函数:默认选项,简单的选择具有最好的某特定度量值的候选者。</li>
<li><strong>oneSE</strong>函数:选择性能落在最好性能的一个标准误(standard error)之内的最简单的候选者。</li>
<li><strong>tolerance</strong>函数:选择性能落在最好性能的某个用户指定比例之内的最简单的候选者。</li>
</ul></li>
</ul></li>
</ul>
<div class="cell">
<div class="sourceCode cell-code" id="cb21"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">1234</span>)</span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a>model_rf <span class="ot"><-</span> <span class="fu">train</span>(Class<span class="sc">~</span>., <span class="at">data =</span> train_data, <span class="at">method =</span> <span class="st">"rf"</span>,</span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a> <span class="at">trControl =</span> <span class="fu">trainControl</span>(<span class="at">method =</span> <span class="st">"cv"</span>,</span>
<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a> <span class="at">number =</span> <span class="dv">5</span>,</span>
<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a> <span class="at">selectionFunction =</span> <span class="st">"oneSE"</span>))</span>
<span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a>model_rf</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>Random Forest
788 samples
30 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 631, 631, 630, 630, 630
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.9276465 0.8552977
16 0.9314521 0.8628921
30 0.9276627 0.8553120
Accuracy was used to select the optimal model using the one SE rule.
The final value used for the model was mtry = 2.</code></pre>
</div>
<div class="sourceCode cell-code" id="cb23"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="co"># 进行预测</span></span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a>pred_rf <span class="ot"><-</span> <span class="fu">predict</span>(model_rf, test_data[<span class="sc">-</span><span class="dv">31</span>]) </span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb24"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a><span class="co"># 建立混淆矩阵</span></span>
<span id="cb24-2"><a href="#cb24-2" aria-hidden="true" tabindex="-1"></a><span class="fu">confusionMatrix</span>(<span class="at">data =</span> pred_rf, <span class="at">reference =</span> test_data<span class="sc">$</span>Class,</span>
<span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a> <span class="at">positive =</span> <span class="st">"1"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>Confusion Matrix and Statistics
Reference
Prediction 0 1
0 98 7
1 0 91
Accuracy : 0.9643
95% CI : (0.9278, 0.9855)
No Information Rate : 0.5
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.9286
Mcnemar's Test P-Value : 0.02334
Sensitivity : 0.9286
Specificity : 1.0000
Pos Pred Value : 1.0000
Neg Pred Value : 0.9333
Prevalence : 0.5000
Detection Rate : 0.4643
Detection Prevalence : 0.4643
Balanced Accuracy : 0.9643
'Positive' Class : 1
</code></pre>
</div>
<div class="sourceCode cell-code" id="cb26"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(<span class="fu">varImp</span>(model_rf)) <span class="co"># 查看变量的重要性</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="4-creditcard_files/figure-html/unnamed-chunk-8-1.png" class="img-fluid" alt="随机森林模型的变量重要性图" width="672"></p>
</div>
</div>
</section>
<section id="sec:five5" class="level2" data-number="4.5">
<h2 data-number="4.5" class="anchored" data-anchor-id="sec:five5"><span class="header-section-number">4.5</span> kNN建模</h2>
<section id="原理" class="level3" data-number="4.5.1">
<h3 data-number="4.5.1" class="anchored" data-anchor-id="原理"><span class="header-section-number">4.5.1</span> 原理</h3>
<p>kNN,即k近邻分类器,就是把未标记的案例归类为与它们最相似的带有标记的案例所在的类。</p>
<p>算法流程:</p>
<ol type="1">
<li><p>依次计算测试样本与各个训练样本间的距离(常用欧氏距离);</p></li>
<li><p>将这些距离按照升序排列;</p></li>
<li><p>选取距离最小的k(3~10)个训练样本点;</p></li>
<li><p>确定这k个点中不同类别的占比;</p></li>
<li><p>返回这k个点中占比最大的类别作为测试样本的预测分类。</p></li>
</ol>
</section>
<section id="模型建立" class="level3" data-number="4.5.2">
<h3 data-number="4.5.2" class="anchored" data-anchor-id="模型建立"><span class="header-section-number">4.5.2</span> 模型建立</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb27"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" aria-hidden="true" tabindex="-1"></a><span class="co"># 创建空向量</span></span>
<span id="cb27-2"><a href="#cb27-2" aria-hidden="true" tabindex="-1"></a>results <span class="ot"><-</span> <span class="fu">c</span>()</span>
<span id="cb27-3"><a href="#cb27-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb27-4"><a href="#cb27-4" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="dv">3</span><span class="sc">:</span><span class="dv">10</span>){</span>
<span id="cb27-5"><a href="#cb27-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">set.seed</span>(<span class="dv">1234</span>)</span>
<span id="cb27-6"><a href="#cb27-6" aria-hidden="true" tabindex="-1"></a> pred_knn <span class="ot"><-</span> <span class="fu">knn</span>(train_data2[<span class="sc">-</span><span class="dv">31</span>], test_data2[<span class="sc">-</span><span class="dv">31</span>],</span>
<span id="cb27-7"><a href="#cb27-7" aria-hidden="true" tabindex="-1"></a> train_data2<span class="sc">$</span>Class, i)</span>
<span id="cb27-8"><a href="#cb27-8" aria-hidden="true" tabindex="-1"></a> Table <span class="ot"><-</span> <span class="fu">table</span>(pred_knn, test_data2<span class="sc">$</span>Class) <span class="co"># 得到混淆矩阵</span></span>
<span id="cb27-9"><a href="#cb27-9" aria-hidden="true" tabindex="-1"></a> accuracy <span class="ot"><-</span> <span class="fu">sum</span>(<span class="fu">diag</span>(Table))<span class="sc">/</span><span class="fu">sum</span>(Table) <span class="co"># diag()提取对角线的值</span></span>
<span id="cb27-10"><a href="#cb27-10" aria-hidden="true" tabindex="-1"></a> results <span class="ot"><-</span> <span class="fu">c</span>(results, accuracy)</span>
<span id="cb27-11"><a href="#cb27-11" aria-hidden="true" tabindex="-1"></a>}</span>
<span id="cb27-12"><a href="#cb27-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb27-13"><a href="#cb27-13" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(<span class="fu">as.data.frame</span>(results), <span class="fu">aes</span>(<span class="at">x =</span> <span class="dv">3</span><span class="sc">:</span><span class="dv">10</span>, <span class="at">y =</span> results)) <span class="sc">+</span></span>
<span id="cb27-14"><a href="#cb27-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_point</span>()<span class="sc">+</span></span>
<span id="cb27-15"><a href="#cb27-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_line</span>() <span class="sc">+</span></span>
<span id="cb27-16"><a href="#cb27-16" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_bw</span>() <span class="sc">+</span></span>
<span id="cb27-17"><a href="#cb27-17" aria-hidden="true" tabindex="-1"></a> <span class="fu">labs</span>(<span class="at">xlab =</span> <span class="st">" "</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="4-creditcard_files/figure-html/unnamed-chunk-9-1.png" class="img-fluid" alt="k取3~10时kNN模型的预测准确率折线图" width="672"></p>
</div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb28"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">1234</span>)</span>
<span id="cb28-2"><a href="#cb28-2" aria-hidden="true" tabindex="-1"></a>pred_knn <span class="ot"><-</span> <span class="fu">knn</span>(<span class="at">train =</span> train_data2[<span class="sc">-</span><span class="dv">31</span>], <span class="at">test =</span> test_data2[<span class="sc">-</span><span class="dv">31</span>],</span>
<span id="cb28-3"><a href="#cb28-3" aria-hidden="true" tabindex="-1"></a> <span class="at">cl =</span> train_data2<span class="sc">$</span>Class, <span class="at">k =</span> <span class="dv">4</span>)</span>
<span id="cb28-4"><a href="#cb28-4" aria-hidden="true" tabindex="-1"></a><span class="fu">confusionMatrix</span>(pred_knn,test_data2<span class="sc">$</span>Class, <span class="at">positive =</span> <span class="st">"1"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>Confusion Matrix and Statistics
Reference
Prediction 0 1
0 97 7
1 1 91
Accuracy : 0.9592
95% CI : (0.9212, 0.9822)
No Information Rate : 0.5
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9184
Mcnemar's Test P-Value : 0.0771
Sensitivity : 0.9286
Specificity : 0.9898
Pos Pred Value : 0.9891
Neg Pred Value : 0.9327
Prevalence : 0.5000
Detection Rate : 0.4643
Detection Prevalence : 0.4694
Balanced Accuracy : 0.9592
'Positive' Class : 1
</code></pre>
</div>
</div>
</section>
</section>
<section id="sec:five6" class="level2" data-number="4.6">
<h2 data-number="4.6" class="anchored" data-anchor-id="sec:five6"><span class="header-section-number">4.6</span> 模型评估</h2>
<div class="cell">
<div class="sourceCode cell-code" id="cb30"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a><span class="co"># 建立一个数据框,将两个模型预测的结果和真实值放进去。并展示不同预测值</span></span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a>pred_results <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="at">knn =</span> pred_knn, <span class="at">rf =</span> pred_rf, </span>
<span id="cb30-3"><a href="#cb30-3" aria-hidden="true" tabindex="-1"></a> <span class="at">class =</span> test_data<span class="sc">$</span>Class)</span>
<span id="cb30-4"><a href="#cb30-4" aria-hidden="true" tabindex="-1"></a>index3 <span class="ot"><-</span> <span class="fu">which</span>(pred_results<span class="sc">$</span>knn <span class="sc">!=</span> pred_rf)</span>
<span id="cb30-5"><a href="#cb30-5" aria-hidden="true" tabindex="-1"></a>pred_results[index3, ]</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> knn rf class
25 1 0 0
159 1 0 1
160 0 1 1
168 1 0 1
182 0 1 1</code></pre>
</div>
</div>
</section>
</main> <!-- /main -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
const icon = "";
// Create an AnchorJS instance from the constructor exposed on `window`.
const anchorJS = new window.AnchorJS();
// Configure it: anchors go on the right-hand side and use the `icon` string
// declared just above as their glyph.
anchorJS.options = {
placement: 'right',
icon: icon
};
// Register every element matching `.anchored` (the section headings on this
// page carry that class).
anchorJS.add('.anchored');
// Wire up the copy buttons that follow each code listing. ClipboardJS copies
// the text of whatever `target` returns: the element immediately preceding the
// clicked button in the markup.
const clipboard = new window.ClipboardJS('.code-copy-button', {
  target: function(trigger) {
    return trigger.previousElementSibling;
  }
});
// Tracks, per button, the title to restore and the pending reset timer.
// Without this, a second click during the one-second flash would read
// "Copied!" as the title to restore and leave the button stuck on it.
const pendingCopyResets = new WeakMap();
clipboard.on('success', function(e) {
  // button target
  const button = e.trigger;
  // don't keep focus
  button.blur();

  // On a repeat click while the flash is still showing, reuse the title saved
  // by the first click and cancel its timer so the flash restarts cleanly.
  const pending = pendingCopyResets.get(button);
  const originalTitle = pending ? pending.title : button.getAttribute("title");
  if (pending) {
    clearTimeout(pending.timer);
  }

  // flash "checked"
  button.classList.add('code-copy-button-checked');
  button.setAttribute("title", "Copied!");
  const timer = setTimeout(function() {
    if (originalTitle === null) {
      // The button had no title to begin with; do not write back "null".
      button.removeAttribute("title");
    } else {
      button.setAttribute("title", originalTitle);
    }
    button.classList.remove('code-copy-button-checked');
    pendingCopyResets.delete(button);
  }, 1000);
  pendingCopyResets.set(button, { title: originalTitle, timer: timer });

  // clear code selection
  e.clearSelection();
});
/**
 * Attach a hover tooltip to `el` through the global `window.tippy`.
 *
 * @param {Element} el - Element that triggers the tooltip.
 * @param {Function} contentFn - Passed to tippy as `content`; the callers in
 *   this script return an HTML string from it.
 */
function tippyHover(el, contentFn) {
  window.tippy(el, {
    allowHTML: true,
    content: contentFn,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // Mount the popup inside the trigger's parent element.
    appendTo: (trigger) => trigger.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start'
  });
}
// Footnote references: on hover, show the referenced footnote's HTML in a
// tooltip.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (const ref of noterefs) {
  tippyHover(ref, function() {
    // A missing href is treated as an empty string so the calls below cannot
    // throw on null.
    let href = ref.getAttribute('href') || "";
    // Absolute URLs are reduced to their fragment; `new URL` throws on a bare
    // "#id" (no base), in which case the href is already the fragment.
    try { href = new URL(href).hash; } catch {}
    const id = href.replace(/^#\/?/, "");
    const note = window.document.getElementById(id);
    // getElementById returns null when the target is absent; return empty
    // content instead of throwing inside the tooltip callback.
    return note ? note.innerHTML : "";
  });
}
// Bibliography references: on hover, show the cited bibliography entries in a
// tooltip.
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (const ref of bibliorefs) {
  // The citation keys are read from the parent's `data-cites` attribute as a
  // space-separated list. If the attribute is absent, skip this link rather
  // than throw and abort the remainder of the handler.
  const citeAttr = ref.parentNode.getAttribute('data-cites');
  if (!citeAttr) {
    continue;
  }
  const cites = citeAttr.split(' ');
  tippyHover(ref, function() {
    var popup = window.document.createElement('div');
    cites.forEach(function(cite) {
      var citeDiv = window.document.createElement('div');
      citeDiv.classList.add('hanging-indent');
      citeDiv.classList.add('csl-entry');
      // Bibliography entries are looked up by the id "ref-<cite key>".
      var biblioDiv = window.document.getElementById('ref-' + cite);
      if (biblioDiv) {
        citeDiv.innerHTML = biblioDiv.innerHTML;
      }
      popup.appendChild(citeDiv);
    });
    return popup.innerHTML;
  });
}
});
</script>
<nav class="page-navigation">
<div class="nav-page nav-page-previous">
<a href="./3-HR-comma-sep.html" class="pagination-link">
<i class="bi bi-arrow-left-short"></i> <span class="nav-page-text"><span class="chapter-number">3</span> <span class="chapter-title">员工离职分析</span></span>
</a>
</div>
<div class="nav-page nav-page-next">
<a href="./5-Student-performance-level.html" class="pagination-link">
<span class="nav-page-text"><span class="chapter-number">5</span> <span class="chapter-title">学生成绩水平分类</span></span> <i class="bi bi-arrow-right-short"></i>
</a>
</div>
</nav>
</div> <!-- /content -->
</body></html>