search.json
[
{
"objectID": "6-mass-shoot.html",
"href": "6-mass-shoot.html",
"title": "6 美国大规模枪击案",
"section": "",
"text": "变量说明。"
},
{
"objectID": "6-mass-shoot.html#描述性分析",
"href": "6-mass-shoot.html#描述性分析",
"title": "6 美国大规模枪击案",
"section": "6.2 描述性分析",
"text": "6.2 描述性分析\n\nshoot <- read_csv(\"data/Mass shooting/Mass Shootings Dataset Ver 3.csv\")\nshoot <- shoot %>% \n rename(ID = `S#`, Open_Close = `Open/Close Location`,\n Total = `Total victims`,\n Mental = `Mental Health Issues`)\nshoot$Date <- as.Date(shoot$Date, \"%m/%d/%Y\")\nshoot$year <- year(shoot$Date)\nshoot <- shoot %>% \n filter(year != 1966)\n\nglimpse(shoot)\n\nRows: 318\nColumns: 22\n$ ID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …\n$ Title <chr> \"Las Vegas Strip mass shooting\", \"San Francisco UPS…\n$ Location <chr> \"Las Vegas, NV\", \"San Francisco, CA\", \"Tunkhannock,…\n$ Date <date> 2017-10-01, 2017-06-14, 2017-06-07, 2017-06-05, 20…\n$ `Incident Area` <chr> NA, \"UPS facility\", \"Weis grocery\", \"manufacturer F…\n$ Open_Close <chr> NA, \"Close\", \"Close\", \"Close\", \"Close\", \"Open\", \"Cl…\n$ Target <chr> NA, \"coworkers\", \"coworkers\", \"coworkers\", \"coworke…\n$ Cause <chr> NA, NA, \"terrorism\", \"unemployement\", NA, \"racism\",…\n$ Summary <chr> NA, \"Jimmy Lam, 38, fatally shot three coworkers an…\n$ Fatalities <dbl> 58, 3, 3, 5, 3, 3, 5, 5, 3, 5, 49, 0, 1, 0, 0, 1, 4…\n$ Injured <dbl> 527, 2, 0, 0, 0, 0, 6, 0, 3, 11, 53, 4, 4, 6, 4, 4,…\n$ Total <dbl> 585, 5, 3, 5, 3, 3, 11, 5, 6, 16, 102, 4, 5, 6, 4, …\n$ `Policeman Killed` <dbl> NA, 0, NA, NA, 1, NA, NA, NA, 3, 5, 0, 0, 0, 0, 0, …\n$ Age <dbl> NA, 38, 24, 45, 43, 39, 26, 20, NA, 25, 29, 0, NA, …\n$ `Employeed (Y/N)` <dbl> NA, 1, 1, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA,…\n$ `Employed at` <chr> NA, NA, \"Weis grocery\", \"manufacturer Fiamma Inc.\",…\n$ Mental <chr> \"Unclear\", \"Yes\", \"Unclear\", \"Unclear\", \"Yes\", \"Unc…\n$ Race <chr> \"White\", \"Asian\", \"White\", NA, \"White\", \"Black\", \"L…\n$ Gender <chr> \"M\", \"M\", \"M\", \"M\", \"M\", \"M\", \"M\", \"M\", \"M\", \"M\", \"…\n$ Latitude <dbl> 36.18127, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3…\n$ Longitude <dbl> -115.13413, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…\n$ year <dbl> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2016, 201…\n\n\n\n6.2.1 枪击案中伤亡人数\n\n# 封装条形图作图函数\nfun_bar1 <- function(data, xlab, ylab, xname, yname){\n data %>% \n group_by({{xlab}}) %>% \n summarise(count = sum({{ylab}})) %>% \n ggplot(aes(x = reorder(year, -count),\n y = count)) +\n geom_bar(stat = \"identity\", fill = \"#63B8FF\") +\n labs(x = xname, y = yname) +\n theme_bw() +\n theme(axis.text.x = element_text(angle = 90, vjust = 0.5))\n}\n\np1 <- fun_bar1(shoot, year, Total, \" \", \"受害人数\") +\n geom_text(aes(label = count), size = 2, angle = 45, vjust = 0.5)\n\np2 <- fun_bar1(shoot, year, Injured, \" \", \"受伤人数\")+\n geom_text(aes(label = count), size = 2, angle = 45, vjust = 0.5)\n\np3 <- fun_bar1(shoot, year, Fatalities, \" \", \"死亡人数\")+\n geom_text(aes(label = count), size = 2, angle = 45, vjust = 0.5)\n\nshoot %>% \n select(year, Injured, Fatalities) %>% \n pivot_longer(-year,\n names_to = \"Types\",\n values_to = \"Values\") %>% \n ggplot(aes(x = year, y = Values, fill = Types)) +\n geom_col() +\n coord_flip() +\n theme_bw() \n\n\n\np1/p2/p3\n\n\n\n\n\n整体而言,总伤亡人数呈现增长趋势。\n2015-2017年间,伤亡人数突然增多。\n受害人数2017年最多,1971年最少。\n除2017年外,其余年份受伤和死亡任务相对平衡\n\n\n\n6.2.2 绘制枪击案频率、伤亡总人数和平均伤亡人数(月为单位)\n\n# 提取月份数据\nshoot$month <- month(shoot$Date)\n\n# 每月枪击频率\np4 <- month_freq <- shoot %>% \n group_by(month) %>% \n summarise(freq = n()) %>% \n ggplot(aes(x = reorder(month, freq), y = freq)) +\n geom_bar(stat = \"identity\", fill = \"#63B8FF\") +\n geom_text(aes(label = freq), hjust = -0.1, size = 2) +\n labs(x = \" \", y = \"月均枪击事件数\") +\n 
coord_flip() +\n theme_bw()\n\n# 月伤亡总人数\np5 <- month_total <- shoot %>% \n group_by(month) %>% \n summarise(total = sum(Total))%>% \n ggplot(aes(x = reorder(month, total), y = total)) +\n geom_bar(stat = \"identity\", fill = \"#63B8FF\") +\n geom_text(aes(label = total), hjust = -0.1, size = 2) +\n labs(x = \" \", y = \"月伤亡总人数\") +\n coord_flip() +\n theme_bw()\n\n# 月均伤亡人数\np6 <- month_meantotal <- shoot %>% \n group_by(month) %>% \n summarise(mean = mean(Total)) %>% \n ggplot(aes(x = reorder(month, mean), y = mean)) +\n geom_bar(stat = \"identity\", fill = \"#63B8FF\") +\n geom_text(aes(label = round(mean,2)), hjust = -0.1, size = 2) +\n labs(x = \" \", y = \"月均伤亡人数\") +\n coord_flip() +\n theme_bw()\np4/p5/p6 \n\n\n\n\n绘制枪击案频率、伤亡总人数和平均伤亡人数(月为单位)\n\n\n\n\n\n十月份至少发生过一起大型枪击案\n\n\n\n6.2.3 枪手与种族之间的关系\n\ntable(shoot$Race)\n\n\n Asian \n 6 \n Asian American \n 11 \n Asian American/Some other race \n 1 \n black \n 3 \n Black \n 4 \n Black American or African American \n 76 \n Black American or African American/Unknown \n 1 \n Latino \n 5 \n Native American or Alaska Native \n 3 \n Other \n 2 \n Some other race \n 20 \n Two or more races \n 2 \n Unknown \n 42 \n white \n 12 \n White \n 7 \n White American or European American \n 120 \nWhite American or European American/Some other Race \n 1 \n\n# 将所有重复的数据重塑为统一\nshoot <- within(shoot,{ \n Race_new <- \" \"\n Race_new[Race == \"Black\"|\n Race == \"black\"|\n Race == \"Black American or African American\"|\n Race == \"Black American or African American/Unknown\"] <- \"Black\"\n Race_new[Race == \"White\"|Race == \"white\"|\n Race == \"White American or European American\"|\n Race == \"White American or European American/Some other Race \"] <- \"White\"\n Race_new[Race == \"unclear\"|\n Race == \"\"|\n Race == \"Unknown\"] <- \"Unknow\"\n Race_new[Race == \"Asian\"|\n Race == \"Asian American/Some other race\"|\n Race == \"Asian American\"] <- \"Asian\"\n Race_new[Race == \"Latino\"|Race == \"Other\"|\n Race == \"Native American or Alaska Native\"|\n Race == \"Native American\"|\n Race == \"Some other race\"|\n Race == \"Two or more races\"] <- \"Other\"\n})\n\n# 统计不同Race_new的个数\nshoot$Race_new[is.na(shoot$Race_new)] <- \"Unknow\"\np7 <- shoot %>% \n group_by(Race_new) %>% \n summarise(freq = n()) %>% \n ggplot(aes(x = reorder(Race_new, freq), y = freq)) +\n geom_bar(stat = \"identity\", fill = \"#63B8FF\") +\n geom_text(aes(label = round(freq, 2)), hjust = 0.7, size = 3) +\n labs(x = \"种族\", y = \"频数\") +\n coord_flip() +\n theme_bw()\np8 <- shoot %>% \n group_by(Race_new) %>% \n summarise(freq = n()) %>% \n ggplot(aes(x = Race_new, y = freq, fill = Race_new)) +\n geom_bar(stat = \"identity\", width = 1) +\n labs(x = \"种族\", y = \"频数\") +\n coord_polar(theta = \"y\") + # 将角度映射到y轴\n theme_bw() +\n guides(fill = guide_legend(title = NULL))\np7|p8\n\n\n\n\n枪手与种族之间的关系\n\n\n\n\n\n枪手中白种人最多,黑种人次之,亚洲人最少。\n\n\n\n6.2.4 杀手性别和精神状况\n\n# 性别情况\ntable(shoot$Gender)\n\n\n Female M M/F Male Male/Female Unknown \n 5 17 1 270 4 21 \n\n# 对数据进行重塑统一\nshoot$Gender[shoot$Gender == \"M\"] <- \"Male\"\nshoot$Gender[shoot$Gender == \"M/F\"] <- \"Male/Female\"\n\np9 <- shoot %>% \n group_by(Gender) %>% \n summarise(count = n()) %>% \n ggplot(aes(x = reorder(Gender, count), y = count)) +\n geom_bar(stat = \"identity\", fill = \"#63B8FF\") +\n geom_text(aes(label = count), hjust = 0.7, size = 3) +\n labs(x = \"性别\", y = \" \") +\n coord_flip() +\n theme_bw()\n\n# 精神情况\ntable(shoot$Mental)\n\n\n No Unclear unknown Unknown Yes \n 90 13 1 110 104 \n\nshoot$Mental[shoot$Mental == 
\"Unclear\"|\n shoot$Mental == \"Unknown\"|\n shoot$Mental == \"unknown\"] <- \"Unknown\"\n\np10 <- shoot %>% \n group_by(Mental) %>% \n summarise(count = n()) %>% \n ggplot(aes(x = reorder(Mental, count), y = count)) +\n geom_bar(stat = \"identity\", fill = \"#63B8FF\") +\n geom_text(aes(label = count), hjust = 0.7, size = 3) +\n labs(x = \"是否有精神问题\", y = \"频数\") +\n coord_flip() +\n theme_bw()\n\np9/p10\n\n\n\n\n杀手性别和精神状况\n\n\n\n\n由图@ref(fig:gender-mental)可知:\n\n枪手中男性占绝大多数。\n并不是每个枪手都有精神问题,相反有精神问题的与没有精神问题的枪手数量并未拉开差距。\n\n\n\n6.2.5 伤亡水平和不同伤亡水平伤亡人数\n\n# 按照伤亡人数确定伤亡水平\nshoot <- within(shoot,{\n level <- \"\"\n level[Total < 5] <- \"<5\"\n level[Total>=5] <- \"5-10\"\n level[Total>10] <- \">10\"\n})\n\np11 <- shoot %>% \n group_by(level) %>% \n summarise(count = sum(Total)) %>% \n ggplot(aes(x = reorder(level, count), y = count)) +\n geom_bar(stat = \"identity\", fill = \"#63B8FF\") +\n #geom_label(aes(label = level), hjust = 0.7) +\n labs(x = \"伤亡水平\", y = \"伤亡人数\") +\n coord_flip() +\n theme_bw()\n\np12 <- shoot %>% \n group_by(level) %>% \n summarise(count = n()) %>% \n ggplot(aes(x = reorder(level, count), y = count)) +\n geom_bar(stat = \"identity\", fill = \"#63B8FF\") +\n geom_label(aes(label = count), hjust = 0.7) +\n labs(x = \"伤亡水平\", y = \"频数\") +\n coord_flip() +\n theme_bw()\np12/p11\n\n\n\n\n伤亡水平和不同伤亡水平伤亡人数\n\n\n\n\n由图@ref(fig:casualties-level)可知:\n\n大部分枪击事件为伤亡人数5-10之间的小规模枪击事件。\n伤亡水平大于10的案件中总伤亡人数远多于小伤亡水平的枪击案件。"
},
{
"objectID": "6-mass-shoot.html#枪击发生地的可视化地图",
"href": "6-mass-shoot.html#枪击发生地的可视化地图",
"title": "6 美国大规模枪击案",
"section": "6.3 枪击发生地的可视化(地图)",
"text": "6.3 枪击发生地的可视化(地图)\n\nstates_map <- map_data(\"state\") # 获取美国地图\np13 <- ggplot() +\n geom_polygon(data= states_map, \n aes(x = long, y = lat, group = group),\n color = \"black\", fill = \"white\") +\n geom_point(data = shoot[shoot$Longitude >= -140, ],\n aes(x = Longitude, y = Latitude,\n size = Total, color = Fatalities), \n alpha = 0.6) +\n scale_color_gradient(low = \"red\", high = \"black\")\nggplotly(p13)\n\n\n\n\n\n\n枪击案多发生在美国东部地区及西部边境地区。"
},
{
"objectID": "6-mass-shoot.html#枪击案发生地点是否露天",
"href": "6-mass-shoot.html#枪击案发生地点是否露天",
"title": "6 美国大规模枪击案",
"section": "6.4 枪击案发生地点(是否露天)",
"text": "6.4 枪击案发生地点(是否露天)\n\ntable(shoot$Open_Close)\n\n\n Close Open Open+Close Open+CLose \n 193 76 19 1 \n\nshoot$Open_Close[is.na(shoot$Open_Close)] <- \"Unknown\"\nshoot$Open_Close[shoot$Open_Close == \"Open+CLose\"] <- \"Open+Close\"\n\nshoot_Op_Cl <- shoot %>% \n group_by(Open_Close) %>% \n summarise(Count = n())\n\nP14 <- ggplot(shoot_Op_Cl, aes(x = reorder(Open_Close, Count), y = Count))+\n geom_col(fill = \"#63B8FF\") +\n geom_label(aes(label = Count), hjust = 0.7) +\n coord_flip()+\n labs(x = \"Open&Close\", y = \"Count\") +\n theme_bw()\nP14\n\n\n\n\n\n枪击案件大多发生在室内,室外枪击的数量不到室内数量的一半。"
},
{
"objectID": "6-mass-shoot.html#不同枪击案起因",
"href": "6-mass-shoot.html#不同枪击案起因",
"title": "6 美国大规模枪击案",
"section": "6.5 不同枪击案起因",
"text": "6.5 不同枪击案起因\n\nshoot_cause <- shoot %>% \n group_by(Cause) %>% \n summarise(Count = n())\n\nshoot_cause$Cause[is.na(shoot_cause$Cause)] <- \"Unknown\"\n\n(P15 <- ggplot(shoot_cause, aes(x = reorder(Cause, Count), y = Count))+\n geom_col(fill = \"#63B8FF\") +\n geom_text(aes(label = Count), hjust = 0.7, size = 3) +\n coord_flip()+\n labs(x = \"Cause\", y = \"Count\") +\n theme_bw())\n\n\n\n\n\n枪击案的发生大都由于枪手存在精神问题。\n恐怖主义、愤怒和挫折是除精神问题外引发枪击案最多的诱因。"
},
{
"objectID": "index.html",
"href": "index.html",
"title": "kaggle",
"section": "",
"text": "Kaggle成立于2010年,是一个进行数据发掘和预测竞赛的在线平台。从公司的角度来讲,可以提供一些数据,进而提出一个实际需要解决的问题;从参赛者的角度来讲,他们将组队参与项目,针对其中一个问题提出解决方案,最终由公司选出的最佳方案可以获得5K-10K美金的奖金。\n除此之外,Kaggle官方每年还会举办一次大规模的竞赛,奖金高达一百万美金,吸引了广大的数据科学爱好者参与其中。从某种角度来讲,大家可以把它理解为一个众包平台,类似国内的猪八戒。但是不同于传统的低层次劳动力需求,Kaggle一直致力于解决业界难题,因此也创造了一种全新的劳动力市场——不再以学历和工作经验作为唯一的人才评判标准,而是着眼于个人技能,为顶尖人才和公司之间搭建了一座桥梁。\n\n\n基于 Pandoc 自定义 block 是一件很有意思的事情,目前不想让模版过于复杂,仅给出几个最常用的例子。如何自定义可以去看谢益辉的新书 https://bookdown.org/yihui/rmarkdown-cookbook/custom-blocks.html。\n\n\n\n\n\n\nNote\n\n\n\nNote that there are five types of callouts, including: note, tip, warning, caution, and important.\n\n\n\n\n\n\n\n\nTip With Caption\n\n\n\nThis is an example of a callout with a caption.\n\n\n\n\n\n\n\n\nExpand To Learn About Collapse\n\n\n\n\n\nThis is an example of a ‘folded’ caution callout that can be expanded by the user. You can use collapse=\"true\" to collapse it by default or collapse=\"false\" to make a collapsible callout that is expanded by default."
},
{
"objectID": "7-hotel-demond.html",
"href": "7-hotel-demond.html",
"title": "7 酒店房间预定预测",
"section": "",
"text": "df <- read_csv(\"data/hotel_bookings.csv\",\n col_names = TRUE,\n show_col_types = FALSE)\n\n# Surpressing summarize info\noptions(dplyr.summarise.inform = FALSE)"
},
{
"objectID": "7-hotel-demond.html#数据总览",
"href": "7-hotel-demond.html#数据总览",
"title": "7 酒店房间预定预测",
"section": "7.2 数据总览",
"text": "7.2 数据总览\n\nglimpse(df)\n\nRows: 119,390\nColumns: 32\n$ hotel <chr> \"Resort Hotel\", \"Resort Hotel\", \"Resort…\n$ is_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …\n$ lead_time <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …\n$ arrival_date_year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201…\n$ arrival_date_month <chr> \"July\", \"July\", \"July\", \"July\", \"July\",…\n$ arrival_date_week_number <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…\n$ arrival_date_day_of_month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …\n$ stays_in_weekend_nights <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ stays_in_week_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …\n$ adults <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …\n$ children <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ babies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ meal <chr> \"BB\", \"BB\", \"BB\", \"BB\", \"BB\", \"BB\", \"BB…\n$ country <chr> \"PRT\", \"PRT\", \"GBR\", \"GBR\", \"GBR\", \"GBR…\n$ market_segment <chr> \"Direct\", \"Direct\", \"Direct\", \"Corporat…\n$ distribution_channel <chr> \"Direct\", \"Direct\", \"Direct\", \"Corporat…\n$ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ reserved_room_type <chr> \"C\", \"C\", \"A\", \"A\", \"A\", \"A\", \"C\", \"C\",…\n$ assigned_room_type <chr> \"C\", \"C\", \"C\", \"A\", \"A\", \"A\", \"C\", \"C\",…\n$ booking_changes <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ deposit_type <chr> \"No Deposit\", \"No Deposit\", \"No Deposit…\n$ agent <chr> \"NULL\", \"NULL\", \"NULL\", \"304\", \"240\", \"…\n$ company <chr> \"NULL\", \"NULL\", \"NULL\", \"NULL\", \"NULL\",…\n$ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ customer_type <chr> \"Transient\", \"Transient\", \"Transient\", …\n$ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…\n$ required_car_parking_spaces <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ total_of_special_requests <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …\n$ reservation_status <chr> \"Check-Out\", \"Check-Out\", \"Check-Out\", …\n$ reservation_status_date <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…\n\n\n\n7.2.1 数据清洗\n在快速浏览数据集以初步了解变量及其格式后,下一步是检查每列中缺失值的数量及其各自的大小。\n将”NULL”转换为NA,将”CN”转换为CHN。\n\n# 1.将数据集中的date列临时转换为字符串,便于将NULL转换为NA\ndftemp1 <- transform(\n df,\n reservation_status_date = \n as.character(reservation_status_date))\n\n# 2. 将NULL转化为NA,将CN转换为CHN\ndftemp1[dftemp1 == \"NULL\"] <- NA\ndftemp1[dftemp1 == \"CN\"] <- \"CHN\"\n\n# 3. 除 iso3c 代码外,还添加国家/地区名称、大洲和地区\ndftemp1$country_name <- countrycode(\n dftemp1$country, \"iso3c\", \"country.name\"\n)\ndftemp1$continent <- countrycode(\n dftemp1$country, \"iso3c\", \"continent\"\n)\ndftemp1$region <- countrycode(\n dftemp1$country, \"iso3c\", \"region23\"\n)\n\n# 4. 将date列变会date\ndftemp2 <- \n transform(dftemp1,\n reservation_status_date =\n as.Date(reservation_status_date))\n\n# 5. 计算缺失值数量\nmiss_var_summary(dftemp2) %>% \n filter(n_miss != 0)\n\n# A tibble: 7 × 3\n variable n_miss pct_miss\n <chr> <int> <dbl>\n1 company 112593 94.3 \n2 agent 16340 13.7 \n3 continent 495 0.415 \n4 region 495 0.415 \n5 country_name 491 0.411 \n6 country 488 0.409 \n7 children 4 0.00335\n\n\n\n我们发现,company列中有112593个缺失值,占该列总数据的95%左右,只是单纯的将该列缺失值删除显然不太合适。\n我们将country,country_name,continent,agent和company,等NAs替换为None,将children替换为0。\n\n\n\n7.2.2 缺失值处理\n\n# 1. 
将所有NA替换为0\ndftemp2[is.na(dftemp2)] <- 0\n\n# 2. 将country, agent and company 列中的0值替换为 \"None\" \ndftemp3 <- dftemp2 %>% \n mutate(country = replace(country, \n country == 0,\n \"None\"),\n agent = replace(agent,\n agent == 0, \n \"None\"),\n company = replace(company,\n company == 0,\n \"None\"),\n country_name = replace(country_name,\n country_name == \"0\",\n \"None\")\n )\nglimpse(dftemp3)\n\nRows: 119,390\nColumns: 35\n$ hotel <chr> \"Resort Hotel\", \"Resort Hotel\", \"Resort…\n$ is_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …\n$ lead_time <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …\n$ arrival_date_year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201…\n$ arrival_date_month <chr> \"July\", \"July\", \"July\", \"July\", \"July\",…\n$ arrival_date_week_number <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…\n$ arrival_date_day_of_month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …\n$ stays_in_weekend_nights <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ stays_in_week_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …\n$ adults <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …\n$ children <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ babies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ meal <chr> \"BB\", \"BB\", \"BB\", \"BB\", \"BB\", \"BB\", \"BB…\n$ country <chr> \"PRT\", \"PRT\", \"GBR\", \"GBR\", \"GBR\", \"GBR…\n$ market_segment <chr> \"Direct\", \"Direct\", \"Direct\", \"Corporat…\n$ distribution_channel <chr> \"Direct\", \"Direct\", \"Direct\", \"Corporat…\n$ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ reserved_room_type <chr> \"C\", \"C\", \"A\", \"A\", \"A\", \"A\", \"C\", \"C\",…\n$ assigned_room_type <chr> \"C\", \"C\", \"C\", \"A\", \"A\", \"A\", \"C\", \"C\",…\n$ booking_changes <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ deposit_type <chr> \"No Deposit\", \"No Deposit\", \"No Deposit…\n$ agent <chr> \"None\", \"None\", \"None\", \"304\", \"240\", \"…\n$ company <chr> \"None\", \"None\", \"None\", \"None\", \"None\",…\n$ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ customer_type <chr> \"Transient\", \"Transient\", \"Transient\", …\n$ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…\n$ required_car_parking_spaces <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ total_of_special_requests <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …\n$ reservation_status <chr> \"Check-Out\", \"Check-Out\", \"Check-Out\", …\n$ reservation_status_date <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…\n$ country_name <chr> \"Portugal\", \"Portugal\", \"United Kingdom…\n$ continent <chr> \"Europe\", \"Europe\", \"Europe\", \"Europe\",…\n$ region <chr> \"Southern Europe\", \"Southern Europe\", \"…\n\n\n\n\n7.2.3 数据整合\n\n增加四个新列:\n\ntotal_nights:总住宿天数\ntotal_rates:总房价\ntotal_guests:总房客\ntotal_nights_rates_guests\n\n\n\ndftemp3 <- dftemp2 %>% \n mutate(\n total_nights =\n (stays_in_weekend_nights +\n stays_in_week_nights),\n total_rates = \n (total_nights * adr),\n total_guests = (adults + children +\n babies),\n total_nights_rates_guests = (\n total_nights + total_guests +\n total_rates\n ))\n\n\n增加arrival_month_year列。\n\n方便可视化\ndftemp4 <- transform(\n dftemp3, \n arrival_date_year = \n as.character(arrival_date_year)) %>% \n mutate(\n month_year_temp = \n paste(\"1\", arrival_date_month,\n arrival_date_year)) %>% \n mutate(\n arrival_monty_year = \n 
make_date(month_year_temp))\ndftemp4$arrival_monty_year <-\n dmy(dftemp4$month_year_temp)\n\n\n\n7.2.4 数据完整新检查\n计算没有夜晚、房价或客人记录的观测值数量.\n\ndftemp4 %>% \n filter(total_nights_rates_guests == 0) %>% \n count()\n\n n\n1 70\n\n\n下一步是通过过滤掉没有夜晚、房价和客人记录的 70 行并删除total_nights_rate_guests变量,最终确定数据集以准备进行分析。\n\n\n7.2.5 数据定稿\n\ndfc <- dftemp4 %>% \n filter(total_nights_rates_guests != 0) %>% \n select(., -total_nights_rates_guests,\n -month_year_temp)\nglimpse(dfc)\n\nRows: 119,320\nColumns: 39\n$ hotel <chr> \"Resort Hotel\", \"Resort Hotel\", \"Resort…\n$ is_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …\n$ lead_time <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …\n$ arrival_date_year <chr> \"2015\", \"2015\", \"2015\", \"2015\", \"2015\",…\n$ arrival_date_month <chr> \"July\", \"July\", \"July\", \"July\", \"July\",…\n$ arrival_date_week_number <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…\n$ arrival_date_day_of_month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …\n$ stays_in_weekend_nights <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ stays_in_week_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …\n$ adults <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …\n$ children <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ babies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ meal <chr> \"BB\", \"BB\", \"BB\", \"BB\", \"BB\", \"BB\", \"BB…\n$ country <chr> \"PRT\", \"PRT\", \"GBR\", \"GBR\", \"GBR\", \"GBR…\n$ market_segment <chr> \"Direct\", \"Direct\", \"Direct\", \"Corporat…\n$ distribution_channel <chr> \"Direct\", \"Direct\", \"Direct\", \"Corporat…\n$ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ reserved_room_type <chr> \"C\", \"C\", \"A\", \"A\", \"A\", \"A\", \"C\", \"C\",…\n$ assigned_room_type <chr> \"C\", \"C\", \"C\", \"A\", \"A\", \"A\", \"C\", \"C\",…\n$ booking_changes <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ deposit_type <chr> \"No Deposit\", \"No Deposit\", \"No Deposit…\n$ agent <chr> \"0\", \"0\", \"0\", \"304\", \"240\", \"240\", \"0\"…\n$ company <chr> \"0\", \"0\", \"0\", \"0\", \"0\", \"0\", \"0\", \"0\",…\n$ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ customer_type <chr> \"Transient\", \"Transient\", \"Transient\", …\n$ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…\n$ required_car_parking_spaces <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …\n$ total_of_special_requests <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …\n$ reservation_status <chr> \"Check-Out\", \"Check-Out\", \"Check-Out\", …\n$ reservation_status_date <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…\n$ country_name <chr> \"Portugal\", \"Portugal\", \"United Kingdom…\n$ continent <chr> \"Europe\", \"Europe\", \"Europe\", \"Europe\",…\n$ region <chr> \"Southern Europe\", \"Southern Europe\", \"…\n$ total_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …\n$ total_rates <dbl> 0.00, 0.00, 75.00, 75.00, 196.00, 196.0…\n$ total_guests <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …\n$ arrival_monty_year <date> 2015-07-01, 2015-07-01, 2015-07-01, 20…"
},
{
"objectID": "7-hotel-demond.html#数据初步探索",
"href": "7-hotel-demond.html#数据初步探索",
"title": "7 酒店房间预定预测",
"section": "7.3 数据初步探索",
"text": "7.3 数据初步探索\n\ndescr(dfc) %>% \n round(., 2) %>% \n t()\n\nDescriptive Statistics \ndfc \nN: 119320 \n\n Mean Std.Dev Min Q1 Median Q3\n------------------------------------ -------- --------- -------- -------- -------- --------\n adr 101.89 50.49 -6.38 69.36 94.80 126.00\n adults 1.86 0.58 0.00 2.00 2.00 2.00\n arrival_date_day_of_month 15.80 8.78 1.00 8.00 16.00 23.00\n arrival_date_week_number 27.16 13.60 1.00 16.00 28.00 38.00\n babies 0.01 0.10 0.00 0.00 0.00 0.00\n booking_changes 0.22 0.65 0.00 0.00 0.00 0.00\n children 0.10 0.40 0.00 0.00 0.00 0.00\n days_in_waiting_list 2.32 17.60 0.00 0.00 0.00 0.00\n is_canceled 0.37 0.48 0.00 0.00 0.00 1.00\n is_repeated_guest 0.03 0.17 0.00 0.00 0.00 0.00\n lead_time 104.07 106.87 0.00 18.00 69.00 160.00\n previous_bookings_not_canceled 0.14 1.50 0.00 0.00 0.00 0.00\n previous_cancellations 0.09 0.84 0.00 0.00 0.00 0.00\n required_car_parking_spaces 0.06 0.25 0.00 0.00 0.00 0.00\n stays_in_week_nights 2.50 1.91 0.00 1.00 2.00 3.00\n stays_in_weekend_nights 0.93 1.00 0.00 0.00 1.00 2.00\n total_guests 1.97 0.72 0.00 2.00 2.00 2.00\n total_nights 3.43 2.56 0.00 2.00 3.00 4.00\n total_of_special_requests 0.57 0.79 0.00 0.00 0.00 1.00\n total_rates 358.06 335.90 -63.80 146.00 267.00 446.40\n\nTable: Table continues below\n\n \n\n Max MAD IQR CV Skewness SE.Skewness\n------------------------------------ --------- -------- -------- ------- ---------- -------------\n adr 5400.00 41.22 56.64 0.50 10.57 0.01\n adults 55.00 0.00 0.00 0.31 18.49 0.01\n arrival_date_day_of_month 31.00 11.86 15.00 0.56 0.00 0.01\n arrival_date_week_number 53.00 16.31 22.00 0.50 -0.01 0.01\n babies 10.00 0.00 0.00 12.25 24.64 0.01\n booking_changes 21.00 0.00 0.00 2.95 6.00 0.01\n children 10.00 0.00 0.00 3.84 4.11 0.01\n days_in_waiting_list 391.00 0.00 0.00 7.58 11.94 0.01\n is_canceled 1.00 0.00 1.00 1.30 0.54 0.01\n is_repeated_guest 1.00 0.00 0.00 5.54 5.36 0.01\n lead_time 737.00 88.96 142.00 1.03 1.35 0.01\n previous_bookings_not_canceled 72.00 0.00 0.00 10.93 23.54 0.01\n previous_cancellations 26.00 0.00 0.00 9.69 24.45 0.01\n required_car_parking_spaces 8.00 0.00 0.00 3.92 4.16 0.01\n stays_in_week_nights 50.00 1.48 2.00 0.76 2.86 0.01\n stays_in_weekend_nights 19.00 1.48 2.00 1.08 1.38 0.01\n total_guests 55.00 0.00 0.00 0.37 10.22 0.01\n total_nights 69.00 1.48 2.00 0.75 3.31 0.01\n total_of_special_requests 5.00 0.00 1.00 1.39 1.35 0.01\n total_rates 7590.00 206.82 300.40 0.94 2.98 0.01\n\nTable: Table continues below\n\n \n\n Kurtosis N.Valid Pct.Valid\n------------------------------------ ---------- ----------- -----------\n adr 1017.33 119320.00 100.00\n adults 1367.51 119320.00 100.00\n arrival_date_day_of_month -1.19 119320.00 100.00\n arrival_date_week_number -0.99 119320.00 100.00\n babies 1632.90 119320.00 100.00\n booking_changes 79.39 119320.00 100.00\n children 18.66 119320.00 100.00\n days_in_waiting_list 186.67 119320.00 100.00\n is_canceled -1.71 119320.00 100.00\n is_repeated_guest 26.76 119320.00 100.00\n lead_time 1.70 119320.00 100.00\n previous_bookings_not_canceled 767.15 119320.00 100.00\n previous_cancellations 673.76 119320.00 100.00\n required_car_parking_spaces 29.98 119320.00 100.00\n stays_in_week_nights 24.31 119320.00 100.00\n stays_in_weekend_nights 7.17 119320.00 100.00\n total_guests 559.08 119320.00 100.00\n total_nights 28.91 119320.00 100.00\n total_of_special_requests 1.49 119320.00 100.00\n total_rates 17.15 119320.00 100.00\n\n\n\n7.3.1 按大洲划分的预订量和平均每晚花费\n\n# 1. 
计算酒店及大洲数据\ndfccharts <- dfc %>% \n group_by(hotel, continent) %>% \n summarise(\n total_rate = sum(total_rates),\n bookings_count = n()\n ) %>% \n mutate(\n average_rate = \n total_rate / bookings_count) %>% \n filter(continent != 0)\n\n# 2. 仅计算大洲数据\ndfcchartscontonly <- dfc %>% \n group_by(continent) %>% \n summarise(total_rate = sum(total_rates),\n bookings_count = n()) %>% \n mutate(average_rate = \n total_rate / bookings_count) %>% \n filter(continent != 0)\n\n# 3. 核定图形主题\nTheme1 <- theme(\n legend.position = \"top\",\n legend.title = element_blank(),\n plot.title = element_text(size = 15),\n axis.title = element_blank(),\n axis.text.x = element_text(size = 5),\n axis.text.y = element_text(size = 5),\n legend.text = element_text(size = 10),\n legend.spacing.x = unit(4, \"mm\"),\n legend.justification = \"left\"\n)\n\n# 4. 作图\nplot1a <- ggplot(\n dfccharts, aes(x = reorder(continent,\n -bookings_count),\n y = bookings_count)) +\n geom_col(fill = \"steelblue\") +\n scale_y_continuous(labels = unit_format(\n suffix = \"K\", scale = 0.001\n )) +\n labs(title = \"各大洲预定总数\") +\n Theme1\n\nplot2a <- ggplot(\n dfccharts, aes(x = reorder(continent,\n -bookings_count),\n y = bookings_count,\n fill = hotel)) +\n geom_bar(position = \"dodge\", stat = \"identity\") +\n scale_fill_manual(\n values = c(\"City Hotel\" = \"steelblue3\",\n \"Resort Hotel\" = \"steelblue4\")\n ) +\n scale_y_continuous(\n labels = unit_format(suffix = \"K\", scale = 0.001)) +\n labs(title = \"按照城市酒店和度假酒店细分\") +\n Theme1\n\nplot3a <- ggplot(\n dfcchartscontonly, aes(x = reorder(continent,\n -average_rate),\n y = average_rate)) +\n geom_col(fill = \"steelblue\") +\n scale_y_continuous(labels = unit_format(\n perfix = \"$\", suffix = \"\"\n )) +\n labs(title = \"各大洲每天预定酒店平均消费\") +\n Theme1\n\nplot4a <- ggplot(dfccharts, \n aes(x = continent,\n y = average_rate, \n fill = hotel)) +\n geom_bar(position = \"dodge\", stat = \"identity\") +\n scale_fill_manual(\n values = c(\"City Hotel\" = \"steelblue3\",\n \"Resort Hotel\" = \"steelblue4\")\n ) +\n labs(title = \"按城市酒店和度假酒店细分\") +\n Theme1\n(plot1a | plot3a) /(plot2a | plot4a)\n\n\n\n\n\n欧洲似乎是迄今为止最受欢迎的目的地,并且强烈偏爱城市酒店与度假村。\n非洲是最昂贵的大陆,平均每晚住宿费用超过600美元。\n除非洲外,度假酒店的平均花费更高。而美洲城市和度假酒店之间的平均房价似乎相似。\n\n下面,我们深入到各个区域,以了解花费的具体不同。\n\n\n7.3.2 欧洲\n\n筛选欧洲大陆的数据-by region & hotel\n\n\ndfchotelregions <- dfc %>% \n filter(continent == \"Europe\") %>% \n group_by(hotel, region) %>% \n summarise(total_rate = sum(total_rates),\n bookings_count = n()) %>% \n mutate(average_rate = total_rate / bookings_count)\nhead(dfchotelregions)\n\n# A tibble: 6 × 5\n# Groups: hotel [2]\n hotel region total_rate bookings_count average_rate\n <chr> <chr> <dbl> <int> <dbl>\n1 City Hotel Eastern Europe 745533. 1866 400.\n2 City Hotel Northern Europe 3290622. 8565 384.\n3 City Hotel Southern Europe 10209285. 39229 260.\n4 City Hotel Western Europe 7750375. 20929 370.\n5 Resort Hotel Eastern Europe 492552. 820 601.\n6 Resort Hotel Northern Europe 4945907. 9745 508.\n\n\n\n筛选欧洲大陆的数据-by region & hotel\n\n\ndfcregiononly <- dfc %>% \n filter(continent == \"Europe\") %>% \n group_by(region) %>% \n summarise(total_rate = sum(total_rates),\n bookings_count = n()) %>% \n mutate(average_rate = total_rate / bookings_count)\nhead(dfchotelregions)\n\n# A tibble: 6 × 5\n# Groups: hotel [2]\n hotel region total_rate bookings_count average_rate\n <chr> <chr> <dbl> <int> <dbl>\n1 City Hotel Eastern Europe 745533. 1866 400.\n2 City Hotel Northern Europe 3290622. 8565 384.\n3 City Hotel Southern Europe 10209285. 
39229 260.\n4 City Hotel Western Europe 7750375. 20929 370.\n5 Resort Hotel Eastern Europe 492552. 820 601.\n6 Resort Hotel Northern Europe 4945907. 9745 508.\n\n\n\n作图\n\n\nplot1b <- ggplot(dfcregiononly,\n aes(\n reorder(region, -bookings_count),\n bookings_count)) +\n geom_col(fill = \"steelblue\") +\n labs(title = \"欧洲预定数量\") +\n Theme1\n\nplot2b <- ggplot(dfcregiononly,\n aes(reorder(region, -average_rate),\n average_rate)) +\n geom_col(fill = \"steelblue\") +\n labs(title = \"欧洲平均花销\") +\n Theme1\n\nplot3b <- ggplot(dfchotelregions,\n aes(reorder(region, -bookings_count),\n bookings_count,\n fill = hotel)) +\n geom_bar(position = \"dodge\", stat = \"identity\") +\n scale_fill_manual(\n values = c(\"City Hotel\" = \"steelblue3\",\n \"Resort Hotel\" = \"steelblue4\")) +\n scale_y_continuous(\n labels = unit_format(perfix = \"$\",\n suffix = \"\")) +\n labs(title = \"按照城市和度假酒店细分\") +\n Theme1\n\nplot4b <- ggplot(dfchotelregions,\n aes(reorder(region, -average_rate),\n average_rate,\n fill = hotel)) +\n geom_bar(position = \"dodge\", stat = \"identity\") +\n scale_fill_manual(\n values = c(\"City Hotel\" = \"steelblue3\",\n \"Resort Hotel\" = \"steelblue4\")) +\n scale_y_continuous(\n labels = unit_format(prefix = \"$\",\n suffix = \"\")\n ) +\n labs(title = \"按照城市和度假酒店细分\") +\n Theme1\n\n(plot1b | plot2b) / (plot3b | plot4b)\n\n\n\n\n\n\n7.3.3 美洲\n\name_hotel_regions <- dfc %>% \n filter(continent == \"Americas\") %>% \n group_by(hotel, region) %>% \n summarise(\n total_rate = sum(total_rates),\n bookings_count = n()) %>% \n mutate(\n average_rate = total_rate/bookings_count)\n\name_region <- dfc %>% \n filter(continent == \"Americas\") %>% \n group_by(region) %>% \n summarise(\n total_rate = sum(total_rates),\n bookings_count = n()\n ) %>% \n mutate(\n average_rate = total_rate/bookings_count)\n\n\nplot1c <- ggplot(ame_region,\n aes(reorder(region, -bookings_count), \n bookings_count)) +\n geom_col(fill = \"steelblue\") +\n labs(title = \"美洲预定数量\") +\n Theme1\n\nplot2c <- ggplot(ame_region,\n aes(reorder(region, -average_rate),\n average_rate)) +\n geom_col(fill = \"steelblue\") +\n scale_y_continuous(\n labels = unit_format(prefix = \"$\", \n suffix = \"\")) +\n labs(title = \"美洲平均花销\") +\n Theme1\n \nplot3c <- ggplot(ame_hotel_regions,\n aes(reorder(region, -bookings_count),\n bookings_count,\n fill = hotel)) +\n geom_col(position = \"dodge\") +\n scale_fill_manual(\n values = c(\"City Hotel\" = \"steelblue3\",\n \"Resort Hotel\" = \"steelblue4\")) +\n labs(title = \"按照城市和度假酒店细分\") +\n Theme1\n\nplot4c <- ggplot(ame_hotel_regions,\n aes(reorder(region, -average_rate),\n average_rate,\n fill = hotel)) +\n geom_col(position = \"dodge\") +\n scale_fill_manual(\n values = c(\"City Hotel\" = \"steelblue3\",\n \"Resort Hotel\" = \"steelblue4\")\n ) +\n scale_y_continuous(\n labels = unit_format(prefix = \"$\",\n suffix = \"\")\n ) +\n labs(title = \"按照城市和独家酒店细分\") +\n Theme1\n\n(plot1c | plot2c) /(plot3c | plot4c)\n\n\n\n\n\n\n7.3.4 亚洲\n\n\n7.3.5 大洋洲\n\n\n7.3.6 非洲"
},
{
"objectID": "1-vgsales.html",
"href": "1-vgsales.html",
"title": "1 Video Games Sales",
"section": "",
"text": "file <- \"D:/Tools/Rwork/0.Study R/kaggle-project/data/vgsales.csv\"\ndf <- read_csv(file)\nstr(df)\n\nspec_tbl_df [19,600 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)\n $ Rank : num [1:19600] 1 2 3 4 5 6 7 8 9 10 ...\n $ Name : chr [1:19600] \"Wii Sports\" \"Super Mario Bros.\" \"Counter-Strike: Global Offensive\" \"Mario Kart Wii\" ...\n $ Platform : chr [1:19600] \"Wii\" \"NES\" \"PC\" \"Wii\" ...\n $ Publisher : chr [1:19600] \"Nintendo\" \"Nintendo\" \"Valve\" \"Nintendo\" ...\n $ Developer : chr [1:19600] \"Nintendo EAD\" \"Nintendo EAD\" \"Valve Corporation\" \"Nintendo EAD\" ...\n $ Critic_Score : num [1:19600] 7.7 10 8 8.2 8.6 10 8 9.4 9.1 8.6 ...\n $ User_Score : num [1:19600] 8 8.2 7.5 9.1 4.7 7.8 8.8 8.8 8.1 9.2 ...\n $ Total_Shipped: num [1:19600] 82.9 40.2 40 37.3 36.6 ...\n $ Year : num [1:19600] 2006 1985 2012 2008 2017 ...\n - attr(*, \"spec\")=\n .. cols(\n .. Rank = col_double(),\n .. Name = col_character(),\n .. Platform = col_character(),\n .. Publisher = col_character(),\n .. Developer = col_character(),\n .. Critic_Score = col_double(),\n .. User_Score = col_double(),\n .. Total_Shipped = col_double(),\n .. Year = col_double()\n .. )\n - attr(*, \"problems\")=<externalptr> \n\nhead(df, 3)\n\n# A tibble: 3 × 9\n Rank Name Platform Publisher Developer Critic_Score User_Score Total_Shipped\n <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>\n1 1 Wii … Wii Nintendo Nintendo… 7.7 8 82.9\n2 2 Supe… NES Nintendo Nintendo… 10 8.2 40.2\n3 3 Coun… PC Valve Valve Co… 8 7.5 40 \n# … with 1 more variable: Year <dbl>\n\n\n数据共包括11列,包含了从1977年~2020年中的游戏销量数据,具体变量说明如下表所示:\n\ntable_var <- read_excel(\"data-intro.xlsx\", sheet = 3)\nkable(table_var, align = \"c\") %>% \n kable_classic()\n\n\n\n \n \n 变量 \n 说明 \n \n \n\n \n Rank \n 销量排名 \n \n \n Name \n 游戏名称 \n \n \n Platform \n 发型平台 \n \n \n Publisher \n 发行商 \n \n \n Develooper \n 开发商 \n \n \n Critic_Score \n 从业人评分 \n \n \n User_Score \n 用户评分 \n \n \n Total_shipped \n 总销量(百万套) \n \n \n Year \n 发型年份"
},
{
"objectID": "1-vgsales.html#缺失值处理",
"href": "1-vgsales.html#缺失值处理",
"title": "1 Video Games Sales",
"section": "1.2 缺失值处理",
"text": "1.2 缺失值处理\n\nsummary(df)\n\n Rank Name Platform Publisher \n Min. : 1 Length:19600 Length:19600 Length:19600 \n 1st Qu.: 4899 Class :character Class :character Class :character \n Median : 9798 Mode :character Mode :character Mode :character \n Mean : 9799 \n 3rd Qu.:14698 \n Max. :19598 \n \n Developer Critic_Score User_Score Total_Shipped \n Length:19600 Min. : 0.800 Min. : 1.000 Min. : 0.0100 \n Class :character 1st Qu.: 6.100 1st Qu.: 6.300 1st Qu.: 0.0500 \n Mode :character Median : 7.300 Median : 7.200 Median : 0.1600 \n Mean : 7.035 Mean : 6.995 Mean : 0.5511 \n 3rd Qu.: 8.200 3rd Qu.: 8.000 3rd Qu.: 0.4600 \n Max. :10.000 Max. :10.000 Max. :82.9000 \n NA's :9631 NA's :17377 \n Year \n Min. :1977 \n 1st Qu.:2004 \n Median :2008 \n Mean :2008 \n 3rd Qu.:2012 \n Max. :2020 \n \n\nsum(is.na(df))\n\n[1] 27010\n\nn_miss(df)\n\n[1] 27010\n\n\n数据中有27010个缺失值,而缺失值主要存在与Critic_Score和User_Score,主要原因在于并不是每个用户和从业者都会对游戏进行评分。 需要对其进行一些处理,未打分的我们认为其打分为5.0分,即使用5.0代替所有缺失值。\n\n# 采用每一列的众数替换该列的缺失值\ndf <- df %>% \n map_dfc(~replace_na(.x, rstatix::get_mode(.x)[1]))"
},
{
"objectID": "1-vgsales.html#描述性分析",
"href": "1-vgsales.html#描述性分析",
"title": "1 Video Games Sales",
"section": "1.3 描述性分析",
"text": "1.3 描述性分析\n描述性统计是一个统计范围,它应用各种技术来描述和总结任何数据集,并研究观察到的数据的一般行为,以促进问题的解决。这可以通过频率表、图形和集中趋势的度量来完成,例如平均值、中位数、众数、离散度量(例如标准偏差、百分位数和四分位数)。\n\n由于2020年只有前半段的数据,我们分析时将2020年的数据剔除,以便更好的分析对比各年份的差异。同时剔除Rank列。\n\n\ndf <- df %>% \n filter(Year != 2020) %>% \n select(-Rank)\n\ndf$Year <- factor(df$Year)\n\n\n1.3.1 常规分析\n\n1.3.1.1 哪一年的游戏总销量最高\n\ndf_shipped <- df %>% \n select(Year, Total_Shipped) %>% \n group_by(Year) %>% \n summarise(count = n()) %>% \n arrange(desc(count))\n\np1 <- ggplot(head(df_shipped,10), aes(x = Year, y = count, \n fill = Year)) + \n geom_bar(stat = \"identity\", alpha = 0.7) +\n geom_label(aes(label = count), fontface = \"bold\",\n fill = \"#006400\", \n color = \"white\",\n size = 3) +\n theme_bw() +\n labs(x = \" \", y = \" \") +\n ggtitle(\"销量排名前十的年份\") +\n theme(legend.position = \"none\", \n plot.background = element_rect(color = \"black\", size = 1.1), \n axis.text.x = element_text(face = \"bold\"),\n axis.text.y = element_text(face = \"bold\"),\n axis.title = element_text(face = \"bold\")\n )\n\np2 <- ggplot(df_shipped, aes(x = Year, y = count, group = 1)) +\n geom_point() +\n geom_line() +\n theme_bw() +\n ggtitle(\"游戏销量变化\") +\n labs(x = \" \", y = \" \") +\n theme(plot.background = element_rect(color = \"black\", size = 1.1),\n axis.text.x = element_text(face = \"bold\", angle = 90)) +\n geom_curve(x = 40, y = 450, xend = 42, yend = 500,\n angle = 35,\n arrow = arrow(length = unit(0.3, \"cm\")),\n color = \"red\") +\n annotate(\"text\", x = 42, y = 550, \n label = \"COVID-19\", color = \"red\", size = 3)\np1/p2\n\n\n\n\n\n\n\n\n由图@ref(fig:shipped)可以看出:\n- 销量排名前十的年份均在21世纪,且2009年销量最高。2009年之后,游戏销量逐渐下滑,在2011年左右趋于平稳。\n\n2018~2019年,游戏销量急剧下滑。猜测原因为新冠肺炎疫情的爆发导致的游戏产能下降、经济下滑,从而大幅影响了游戏的销量。\n\n下面我们将具体看一下各游戏平台的表现。\n\n\n1.3.1.2 游戏平台排名(销量、游戏数量)\n\ndf_platform <- df %>% \n select(Platform, Total_Shipped) %>% \n group_by(Platform) %>% \n summarize(amount = sum(Total_Shipped)) %>% \n arrange(desc(amount)) %>% \n head(10)\n\np3 <- ggplot(df_platform, aes(x = reorder(Platform, amount), y = amount, \n fill = Platform)) +\n geom_bar(stat = \"identity\", alpha = 0.7) +\n labs(x = \" \", y = \" \") +\n ggtitle(\"游戏平台排名\", subtitle = \"平台销量排名\") +\n coord_flip() +\n theme_bw() +\n theme(legend.position = \"none\",\n axis.text = element_text(face = \"bold\"), \n plot.title = element_text(face = \"bold\"),\n plot.background = element_rect(color = \"black\"))\n\ndf_platform2 <- as.data.frame(table(df$Platform)) %>% \n rename(Platform = Var1) %>% \n arrange(desc(Freq)) %>% \n head(10)\n\np4 <- ggplot(df_platform2, aes(x = reorder(Platform, Freq), y = Freq, \n fill = Platform)) +\n geom_bar(stat = \"identity\", alpha = 0.7) +\n labs(x = \" \", y = \" \") +\n ggtitle(\"游戏平台排名\", subtitle = \"平台游戏数量排名\") +\n coord_flip() +\n theme_bw() +\n theme(plot.background = element_rect(color = \"black\"), \n legend.position = \"none\",\n plot.title = element_text(face = \"bold\"),\n axis.text = element_text(face = \"bold\"))\np3|p4\n\n\n\n\n游戏平台排名\n\n\n\n\n由图@ref(fig:platform)可以看出:\n\nPS2不愧是有史以来最成功的的家用主机,发行在其上的游戏销量排名第一、游戏数量排名第二。\nPC游戏仍有一定竞争力。\n御三家统治了主机游戏。\n\n下面我们看一下游戏开发商的情况。\n\n\n1.3.1.3 开发商和发行商排名\n\ndf_developer <- df %>% \n select(Developer, Total_Shipped) %>% \n group_by(Developer) %>% \n summarise(amount = sum(Total_Shipped)) %>% \n arrange(desc(amount)) %>% \n head(10)\n\np5 <- ggplot(df_developer, aes(x = reorder(Developer, amount), y = amount,\n fill = Developer)) +\n geom_bar(stat = \"identity\", alpha = 0.7) +\n coord_flip() +\n ggtitle(\"开发商销量排名\") +\n labs(x = \" \", y = \"\") +\n theme_bw() +\n 
theme(legend.position = \"none\",\n plot.background = element_rect(color = \"black\"))\n\n\ndf_publisher <- df %>% \n select(Publisher, Total_Shipped) %>% \n group_by(Publisher) %>% \n summarise(amount = sum(Total_Shipped)) %>% \n arrange(desc(amount)) %>% \n head(10)\n\np6 <- ggplot(df_publisher, aes(x = reorder(Publisher, amount), y = amount,\n fill = Publisher)) +\n geom_bar(stat = \"identity\", alpha = 0.7) +\n coord_flip() +\n ggtitle(\"发行商销量排名\") +\n labs(x = \" \", y = \"\") +\n theme_bw() +\n theme(legend.position = \"none\",\n plot.background = element_rect(color = \"black\"))\np5|p6\n\n\n\n\n\n任天堂作为开发商和发行商均独占鳌头。\nGame Freak依靠王牌IP精灵宝可梦占据开发商销量第三名。\n大家耳熟能详的游戏开发商和发行商均有上榜。"
},
{
"objectID": "1-vgsales.html#探索性分析",
"href": "1-vgsales.html#探索性分析",
"title": "1 Video Games Sales",
"section": "1.4 探索性分析",
"text": "1.4 探索性分析\n在统计学中,探索性数据分析 (EAD) 是一种分析数据集以总结其主要特征的方法,通常使用可视化方法。\n\n1.4.1 世界最畅销游戏\n\n1.4.1.1 最畅销的5个游戏\n那么,1977年~2019年间,到底哪个游戏销量是最高的呢?\n\noptions(repr.plot.width = 20, repr.plot.height = 8)\n\ndf_games <- df %>% \n select(Name, Total_Shipped) %>% \n group_by(Name) %>% \n summarise(amount = sum(Total_Shipped)) %>% \n arrange(desc(amount)) %>% \n head(5)\n\np7 <- ggplot(df_games, aes(x = reorder(Name, amount), y = amount,\n fill = Name)) +\n geom_col(aes(alpha = 0.9)) +\n geom_label(aes(label = amount), size = 3,\n fontface = \"bold\",\n color = \"white\") +\n labs(x = \" \", y = \" \") +\n coord_flip() +\n theme_bw() +\n theme(legend.position = \"none\",\n plot.background = element_rect(color = \"black\", size = 1.1),\n axis.text.x = element_text( face = \"bold\"),\n axis.text.y = element_text(face = \"bold\"))\n\np8 <- ggplot(df_games, aes(x = Name, y = amount)) +\n geom_line(alpha = 0.7, group = 1) +\n geom_point(aes(fill = Name), shape = 2) +\n theme_bw()+ \n theme(legend.position = \"none\",\n plot.background = element_rect(color = \"black\"),\n axis.text.x = element_text(face = \"bold\")) +\n labs(x = \"\", y = \"\") +\n coord_polar()\np7|p8\n\n\n\n\n游戏总销量排名\n\n\n\n\n由图@ref(fig:games)可知:\n\n游戏销量排名前3的游戏为:Wii Sports、GTA5和我的世界。\nGTV5和我的世界在多个游戏平台均有发售,Wii Sports为任天堂平台独占。\n任天堂游戏平台发售的游戏占前十的大多数,任天堂就是世界的主宰!\n\n\n\n1.4.1.2 最畅销的5个游戏逐年分布\n\ndf_games_top5 <- df %>% \n filter(Name == \"Wii Sports\" |\n Name == \"Grand Theft Auto V\"|\n Name == \"Minecraft\"|\n Name == \"Super Mario Bros.\"|\n Name == \"Counter-Strike: Global Offensive\") %>% \n select(Name, Year, Total_Shipped)\n\nggplot(df_games_top5, aes(x = Year, y = Total_Shipped)) +\n geom_bar(stat = \"identity\", aes(fill = Name, \n color = Name),\n alpha = 8) +\n facet_wrap(~Name) +\n labs(x = \"\", y = \"总销量(百万套)\") +\n theme_bw()+\n theme(legend.position = \"none\",\n strip.text.x = element_text(\n margin = margin(7, 7, 7, 7), size = 7, \n face = \"bold\", color = \"white\"),\n strip.background = element_rect(fill = \"#B45F04\",\n color = \"black\"),\n plot.title = element_text(face = \"bold\"),\n axis.text.x = element_text(face = \"bold\",\n angle = 90,\n vjust = 0.5),\n axis.text.y = element_text(face = \"bold\"))"
},
{
"objectID": "1-vgsales.html#媒体打分与玩家打分的关系",
"href": "1-vgsales.html#媒体打分与玩家打分的关系",
"title": "1 Video Games Sales",
"section": "1.5 媒体打分与玩家打分的关系",
"text": "1.5 媒体打分与玩家打分的关系\n俗话说,“低分信媒体,高分信自己”。如果一款游戏媒体打分低,那肯定不行,但如果一个游戏媒体打高分,也不一定好玩(有可能是塞了钱)。\n下面我们就分析一下媒体打分与玩家打分的关系。\n\ndf_score <- df %>% \n select(Name, User_Score, Critic_Score) \n\ncor <- cor.test(df_score$User_Score, df_score$Critic_Score,\n method = \"pearson\")\np.value <- cor$p.value\ncoef <- cor$estimate \n\nggplot(df_score, aes(x = User_Score, y = Critic_Score)) +\n geom_smooth(method = lm)\n\n\n\n\n可以看到两个打分的相关系数为0.16,且p值小于0.05,表明两者呈现显著的正相关。看来游戏媒体和玩家对游戏的口味还是一样的,某种程度上说,高分也可以信媒体。"
},
{
"objectID": "1-vgsales.html#玩家与媒体分别最喜欢哪个发行商",
"href": "1-vgsales.html#玩家与媒体分别最喜欢哪个发行商",
"title": "1 Video Games Sales",
"section": "1.6 玩家与媒体分别最喜欢哪个发行商",
"text": "1.6 玩家与媒体分别最喜欢哪个发行商\n\n1.6.1 玩家\n那么,玩家最喜欢(打分最高)的游戏发行商是谁呢?\n\ndf_player <- df %>% \n select(Publisher, User_Score)\n\ndf_player$Publisher <- df_player$Publisher %>% \n map(~\n str_detect(.x,\"Sony\") %>% \n ifelse(\"Sony\", .x)) %>% \n unlist() \n\ndf_player <- df_player%>% \n group_by(Publisher) %>% \n summarise(count = n(),\n mean_score = mean(User_Score)) %>% \n filter(count >= 50) %>% \n select(Publisher, mean_score) %>% \n arrange(desc(mean_score))\ndf_player\n\n# A tibble: 62 × 2\n Publisher mean_score\n <chr> <dbl>\n 1 Rockstar Games 7.84\n 2 Microsoft Game Studios 7.77\n 3 Nintendo 7.77\n 4 Sierra Entertainment 7.72\n 5 DreamCatcher Interactive 7.71\n 6 5pb 7.71\n 7 Sony 7.70\n 8 Eidos Interactive 7.70\n 9 Acclaim Entertainment 7.7 \n10 Agetec 7.7 \n# … with 52 more rows\n\n\n在发行过50个以上游戏的老牌发行商中:\n\nR星凭借GTA、荒野大镖客等重量级IP以7.842的评分独占鳌头。\n所有发行商的评分均超过7分,说明现代游戏的质量还是有保障的。"
},
{
"objectID": "2-Olympic-history.html",
"href": "2-Olympic-history.html",
"title": "2 Olympic history",
"section": "",
"text": "本项目的主要目标是阐明奥运会历史的主要模式,例如运动员、运动、参与国家的数量,哪些国家的运动员最多,赢得奖牌情况,运动员的特性(eg:性别、身体特征等)。\n我将一些你可能不知道的奥运历史上特别有趣的方面进行放大化。你知道纳粹德国举办了1936年奥运会,而那届奥运会他们打败了所有人?你知道绘画和诗歌曾经是奥运会项目吗?这些历史小花絮同样是我的关注点。\n首先,我们读取数据,并对每列的数据类型进行定义。\n\nfile<- \"D:/Tools/Rwork/0.Study R/kaggle-project/data/olympics/athlete_events.csv\"\ndata <- read_csv(file,\n col_types = cols(\n ID = col_character(),\n Name = col_character(),\n Sex = col_factor(levels = c(\"M\",\"F\")),\n Age = col_integer(),\n Height = col_double(),\n Weight = col_double(),\n Team = col_character(),\n NOC = col_character(),\n Games = col_character(),\n Year = col_integer(),\n Season = col_factor(levels =\n c(\"Summer\",\"Winter\")),\n City = col_character(),\n Sport = col_character(),\n Event = col_character(),\n Medal = col_factor(levels =\n c(\"Gold\",\"Silver\",\"Bronze\"))\n ))\nhead(data)\n\n# A tibble: 6 × 15\n ID Name Sex Age Height Weight Team NOC Games Year Season City \n <chr> <chr> <fct> <int> <dbl> <dbl> <chr> <chr> <chr> <int> <fct> <chr>\n1 1 A Dijiang M 24 180 80 China CHN 1992… 1992 Summer Barc…\n2 2 A Lamusi M 23 170 60 China CHN 2012… 2012 Summer Lond…\n3 3 Gunnar N… M 24 NA NA Denm… DEN 1920… 1920 Summer Antw…\n4 4 Edgar Li… M 34 NA NA Denm… DEN 1900… 1900 Summer Paris\n5 5 Christin… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…\n6 5 Christin… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…\n# … with 3 more variables: Sport <chr>, Event <chr>, Medal <fct>"
},
{
"objectID": "2-Olympic-history.html#运动员国家和时间",
"href": "2-Olympic-history.html#运动员国家和时间",
"title": "2 Olympic history",
"section": "2.2 运动员、国家和时间",
"text": "2.2 运动员、国家和时间\n\n2.2.1 随着时间的推移,运动员、国家和赛事的数量是否发生了变化?\n\ncounts <- data %>% \n filter(Sport != \"Art Competitions\") %>% # 去掉艺术类的运动类别\n group_by(Year, Season) %>% \n summarize(\n Athletes = length(unique(ID)),\n Nations = length(unique(NOC)),\n Events = length(unique(Event))\n )\ncounts\n\n# A tibble: 51 × 5\n# Groups: Year [35]\n Year Season Athletes Nations Events\n <int> <fct> <int> <int> <int>\n 1 1896 Summer 176 12 43\n 2 1900 Summer 1224 31 90\n 3 1904 Summer 650 15 95\n 4 1906 Summer 841 21 74\n 5 1908 Summer 2024 22 109\n 6 1912 Summer 2377 27 102\n 7 1920 Summer 2665 29 153\n 8 1924 Summer 3067 44 126\n 9 1924 Winter 313 19 17\n10 1928 Summer 2877 46 109\n# … with 41 more rows\n\n# 作图\n# 运动员数量及关键时间点\np1 <- ggplot(counts, aes(x = Year, y = Athletes, \n group = Season, color = Season)) +\n geom_point(size = 2) +\n geom_line() +\n scale_color_manual(values = c(\"darkorange\", \"darkblue\")) + # 手动设置图形颜色\n xlab(\" \") +\n annotate(\"text\", x = c(1932, 1956, 1976, 1980),\n y = c(2000, 2750, 6800, 4700),\n label = c(\n \"L.A. 1932\",\n \"Melbourne 1956\",\n \"Montreal 1976\",\n \"Moscow 1980\"),\n size=3) + # 对几个临界拐点进行标记。\n # 针对两次世界大战的时间做出标记\n annotate(\"text\", x = c(1916,1942), y = c(10000,10000),\n label = c(\"WWI\", \"WWII\"), \n size = 4, color = \"red\") +\n geom_segment(aes(x = 1914, y = 8000, xend = 1918, yend = 8000),\n color = \"red\", size = 2) +\n geom_segment(aes(x = 1939,y = 8000, xend = 1945, yend = 8000),\n color = \"red\", size=2) + \n scale_x_continuous(breaks = seq(1890, 2020, 4)) +\n scale_y_continuous(breaks = seq(0, 15000, 5000)) +\n theme_bw() +\n theme(axis.text.x = element_text(size = 10, angle = 90, vjust = 0.4, face = \"bold\"))\n\n# 参与国家数量及关键时间点\np2 <- ggplot(counts, aes(x = Year, y = Nations, \n group = Season, color = Season)) +\n geom_point(size = 2) +\n geom_line() +\n scale_color_manual(values = c(\"darkorange\", \"darkblue\")) +\n xlab(\"\") +\n annotate(\"text\", x = c(1932, 1976, 1980),\n y = c(60, 105, 70),\n label = c(\"L.A. 1932\",\n \"Montreal 1976\",\n \"Moscow 1980\")) +\n scale_x_continuous(breaks = seq(1890, 2020, 4)) +\n scale_y_continuous(breaks = seq(0, 250, 50)) +\n theme_bw() + \n theme(axis.text.x = element_text(size = 10, angle = 90, vjust = 0.4, face = \"bold\"))\n\n# 参赛项目\np3 <- ggplot(counts, aes(x = Year, y = Events, group = Season, color = Season)) +\n geom_point(size = 2) +\n geom_line() +\n scale_color_manual(values = c(\"darkorange\", \"darkblue\")) +\n scale_x_continuous(breaks = seq(1890, 2020, 4)) +\n scale_y_continuous(breaks = seq(0, 350, 50)) +\n theme_bw() +\n theme(axis.text.x = element_text(size = 10, angle = 90, vjust = 0.4, face = \"bold\"))\n\ngrid.arrange(p1, p2, p3, ncol = 1)\n\n\n\n\n(ref:fig-countNO)\n\n\n\n\n(ref:fig-countNO) 随着时间的推移,运动员、国家和赛事的数量的变化情况\n从图@ref(fig:countNO) 可以看出两次世界大战期间,奥运会基本处于停滞状态(WWI:1912-1920,WWII:1936-1948)。此外,有几届值得注意的奥运会上发生了一些事件,在图形中呈现为明显的下降趋势:\n\nL.A., 1932: 1932年洛杉矶奥运会,当时的美国正处于大萧条期间,许多运动员没办法负担到奥运会的差旅费用,导致参加本届奥运会的国家机器运动员数量骤降。\nMelbourne, 1956:1956年墨尔本奥运会是事端最多的一届奥运会:1、由于战争立场不同,包括埃及、以色列、伊拉克和黎巴嫩在内的一些中东及非洲国家,西班牙、荷兰、瑞士、哥伦比亚等欧洲国家均抵制本届奥运会;2.由于台湾问题,我国也未参加本届奥运会。\nMontreal, 1976:1976年蒙特利尔奥运会,由于发生了抵制运动,25个国家(大部分为非洲国家)未参加本届奥运会。\nMoscow, 1980:1980年莫斯科奥运会,由于苏联军队对阿富汗的入侵,包括美国在内的66个国家对本届奥运会进行了抵制,本届奥运会最终仅有80个国家参加,是自1956年以来最少国家参加的一届,而许多参赛的国家也只派一名旗手,以奥运会会旗代替国旗进场。这是奥运历史上最严重的一次危机,从一定程度上威胁了奥林匹克运动的发展。\n\n从2000年开始,奥运会的增长势头(比赛项目额参赛运动员数量)趋于平缓,夏季奥运会迎来了它的饱和点。而冬季奥运会处于虽然还有一定上升空间,但冰雪运动受限于气候和场地,在很多国家并不普吉,所以其涨势并不明显。"
},
{
"objectID": "2-Olympic-history.html#sec:art-com",
"href": "2-Olympic-history.html#sec:art-com",
"title": "2 Olympic history",
"section": "2.3 艺术竞赛(The Art Competitions)",
"text": "2.3 艺术竞赛(The Art Competitions)\n\n2.3.1 运动员、时间、参赛项目\n\nart <- data %>% \n filter(Sport == \"Art Competitions\") %>% \n select(Name, Sex, Age, Team, NOC, Year, City, Event, Medal)\nhead(art)\n\n# A tibble: 6 × 9\n Name Sex Age Team NOC Year City Event Medal\n <chr> <fct> <int> <chr> <chr> <int> <chr> <chr> <fct>\n1 Win Valdemar Aaltonen M 54 Finland FIN 1948 London Art C… <NA> \n2 Adolf Gaston Abel M 45 Germany GER 1928 Amsterdam Art C… <NA> \n3 Adolf Gaston Abel M 45 Germany GER 1928 Amsterdam Art C… <NA> \n4 Georges Achille-Fould F 55 France FRA 1924 Paris Art C… <NA> \n5 Dsir Antoine Acket M 27 Belgium BEL 1932 Los Angeles Art C… <NA> \n6 Dsir Antoine Acket M 27 Belgium BEL 1932 Los Angeles Art C… <NA> \n\n# 基础计算\ncount_art <- art %>% \n group_by(Year) %>% \n summarize(\n Events = length(unique(Event)),\n Nations = length(unique(Team)),\n Artists = length(unique(Name))\n )\nhead(count_art)\n\n# A tibble: 6 × 4\n Year Events Nations Artists\n <int> <int> <int> <int>\n1 1912 5 12 33\n2 1920 5 5 11\n3 1924 5 24 189\n4 1928 13 19 370\n5 1932 13 36 588\n6 1936 19 24 527\n\n\n在最初的奥运会中,还存在艺术竞赛(Art Competition)项目的比拼,准确的说从1912年至1948年间的几届奥运会。\n1954年,国际奥委会决定艺术竞赛不再作为奥运会的比赛项目。\n虽然艺术竞赛的数据仅占本数据集的1.3%,但考虑到其在奥运会理事上的特殊意义,仍有必要对其进行探索分析。\n\n# 提取有关艺术竞赛的数据\nart <- data %>% \n filter(Sport == \"Art Competitions\") %>% \n select(Name, Sex, Age, Team, NOC, Year, City, Event, Medal)\n\n# 计算每年的Events, Nations, Artists数量\ncounts_art <- art %>% \n filter(Team != \"Unknow\") %>% # 剔除国籍不确定的运动员\n group_by(Year) %>% \n summarize(\n Events = length(unique(Event)),\n Nations = length(unique(Team)),\n Artisit = length(unique(Name))\n )\n\n\np4 <- ggplot(counts_art, aes(x = Year, y = Events)) +\n geom_point(size = 2) +\n geom_line() +\n xlab(\" \") +\n scale_x_continuous(breaks = seq(min(counts_art[\"Year\"]), max(counts_art[\"Year\"]), 4)) +\n scale_y_continuous(breaks = seq(0, 20, 2)) +\n theme_bw() +\n theme(axis.text.x = element_text(size = 10, vjust = 0.4, face = \"bold\"))\n\np5 <- ggplot(counts_art, aes(x = Year, y = Nations)) +\n geom_point(size = 2) +\n geom_line() + \n xlab(\" \") +\n scale_x_continuous(breaks = seq(min(counts_art[\"Year\"]), max(counts_art[\"Year\"]), 4)) +\n scale_y_continuous(breaks = seq(0, 40, 5)) +\n theme_bw() +\n theme(axis.text.x = element_text(size = 10, vjust = 0.4, face = \"bold\"))\n\np6 <- ggplot(counts_art, aes(x = Year, y = Artisit)) +\n geom_point(size = 2) +\n geom_line() + \n xlab(\" \") +\n scale_x_continuous(breaks = seq(min(counts_art[\"Year\"]), max(counts_art[\"Year\"]), 4)) +\n scale_y_continuous(breaks = seq(0, 600, 100)) +\n theme_bw() +\n theme(axis.text.x = element_text(size = 10, vjust = 0.4, face = \"bold\"))\n\ngrid.arrange(p4, p5, p6, ncol = 1)\n\n\n\n\n(ref:fig-art-competition)\n\n\n\n\n(ref:fig-art-competition) 历届奥运会艺术竞赛随时间变化图(参与国家、参赛运动员、比赛项目)\n如图@ref(fig:art-competition)所示,艺术竞赛的数据变化呈不规则的增长趋势。可以看到,与1920年相比,1924年奥运会参与艺术竞赛的国家和运动员数量均有相对较大幅度的增长,这或许是1928年奥运会艺术竞赛比赛项目增多的一个原因。\n\n\n2.3.2 获得奖牌情况-哪个国家获得的艺术竞赛类奖牌数量最多\n\n# 计算每个国家所获奖牌数量\nmedal_counts_arts <- art %>% \n filter(!is.na(Medal)) %>% \n group_by(Team, Medal) %>% \n summarize(Count = length(Medal))\n\n# 根据奖牌数量对国家进行排序\nlevs_art <- medal_counts_arts %>% \n group_by(Team) %>% \n summarize(Total = sum(Count)) %>% \n arrange(Total) %>% \n select(Team)\nmedal_counts_arts$Team <- factor(medal_counts_arts$Team, levels = levs_art$Team)\n\n\nggplot(medal_counts_arts, aes(x = Team, y = Count, fill = Medal)) +\n geom_col() +\n coord_flip() +\n scale_y_continuous(breaks = seq(0, 30, 5)) +\n scale_fill_manual(values = 
c(\"gold1\", \"gray70\", \"gold4\")) +\n ggtitle(\"艺术竞赛奖牌榜\") +\n theme(plot.title = element_text(hjust = 0.5))\n\n\n\n\n(ref:fig-medal-arts)\n\n\n\n\n(ref:fig-medal-arts) 各国艺术竞赛奖牌榜\n\nart_country <- nrow(unique(art[\"Team\"]))\n\nart_country_medal <- nrow(medal_counts_arts %>% \n summarize(country = length(unique(Team)))\n )\n\n共有51个国家参加了奥运会的艺术竞赛类项目,仅有不到一半数量的国家(23)获得了奖牌(如图@ref(fig:medal-arts)所示),排名前三位的为德国、法国、意大利。"
},
{
"objectID": "2-Olympic-history.html#奥林匹克中的女将们",
"href": "2-Olympic-history.html#奥林匹克中的女将们",
"title": "2 Olympic history",
"section": "2.4 奥林匹克中的女将们",
"text": "2.4 奥林匹克中的女将们\n现代奥林匹克之父顾拜旦曾今极力反对女性参加奥运会。其后的IOC主席也同样支持这个观点。尽管有多方面的阻力,从第一届奥运会(1896年)开始的每届奥运会均有女性参与其中。所以我们在这里探索一下女性参加奥运会的历史趋势:多少人?从哪里来?她们如何找到途经参加的?\n本部分中,对数据集进行如下处理:\n\n将第@sec:art-com节中讨论过的艺术竞赛数据剔除。\n将夏季和冬季奥运会合并为“Olympiads”列,即每四年期内包括一届夏季奥运会和一届冬季奥运会。\n\n让我们开始!\n\n2.4.1 男女运动员数量比较\n\n# 剔除艺术竞赛项目数据\ndata <- data %>% \n filter(Sport != \"Art Competition\")\n\n# 对Year列数据重新编码(1992年之后,与夏季奥运会匹配)\n# 处理后,冬季和夏季奥运会举办时间的已匹配\noriginal <- c(1994, 1998, 2002, 2006, 2010, 2014)\nnew <- c(1996, 2000, 2004, 2008, 2012, 2016)\nfor (i in 1:length(original)) {\n data$Year <- gsub(original[i], new[i], data$Year)\n}\ndata$Year <- as.integer(data$Year)\n\ncounts_sex <- data %>% \n group_by(Year, Sex) %>% \n summarize(Athletes = length(unique(ID)))\ncounts_sex$Year <- as.integer(counts_sex$Year)\n\n\nggplot(counts_sex, aes(x = Year, y = Athletes, group = Sex, color = Sex)) +\n geom_point(size = 2) + \n geom_line() +\n scale_color_manual(values = c(\"darkblue\", \"red\")) +\n scale_x_continuous(breaks = seq(1896, 2016, 4)) +\n scale_y_continuous(breaks = seq(0, 9000, 500)) +\n labs(title = \"Number of male and female Olympians over time\") +\n theme_bw() +\n theme(plot.title= element_text(hjust = 0.5),\n axis.text.x = element_text(angle = 90, face = \"bold\", vjust = 0.5)) \n\n\n\n\n(ref:fig-counts-sex)\n\n\n\n\n(ref:fig-counts-sex) 历届奥运会南云运动员数量\n\ncounts_sex_latest <- counts_sex %>% \n filter(Year == 2016)\ncounts_sex_latest\n\n# A tibble: 2 × 3\n# Groups: Year [1]\n Year Sex Athletes\n <int> <fct> <int>\n1 2016 M 7788\n2 2016 F 6133\n\n\n如图@ref(fig:counts-sex)所示,直到1996年,参加奥运会的女运动员数量的变化趋势都与男运动员相同,此时男运动员的数量达到8000人的最顶点,而此时女运动员的数量还在以一个较高的增长率上升。最近的一届奥运会(2014年索契冬奥会和2016年里约奥运会中),女运动的数量已经超过了0.4405574。\n\n\n2.4.2 各国参赛男女运动员间数量关系\n选择1936,1956,1976,1996,2016五个年份的数据进行分析。这五年的数据互相间隔20年,可以独立的展示独立的回归曲线。\n\n# Count M/F Total per country per Olympics\ncounts_NOC <- data %>% \n filter(Year %in% c(1936, 1956, 1976, 1996, 2016)) %>% \n group_by(Year, NOC, Sex) %>% \n summarize(Count = length(unique(ID))) %>% \n spread(Sex, Count) %>% # 按照Sex为key,Count为value的规则进行pivot_wider()操作。\n mutate(Total = sum(M, F, na.rm = T)) # 计算运动员总数\nnames(counts_NOC)[3: 4] <- c(\"Male\", \"Female\")\n\n# 将缺失值变为0\ncounts_NOC$Male[is.na(counts_NOC$Male)] <- 0\ncounts_NOC$Female[is.na(counts_NOC$Female)] <- 0\n\ncounts_NOC$Year <- as.factor(counts_NOC$Year) # 将Year列转化为因子\ncounts_NOC\n\n# A tibble: 622 × 5\n# Groups: Year, NOC [622]\n Year NOC Male Female Total\n <fct> <chr> <dbl> <dbl> <int>\n 1 1936 AFG 15 0 15\n 2 1936 ARG 50 1 51\n 3 1936 AUS 29 4 33\n 4 1936 AUT 265 27 292\n 5 1936 BEL 168 8 176\n 6 1936 BER 5 0 5\n 7 1936 BOL 2 0 2\n 8 1936 BRA 67 6 73\n 9 1936 BUL 33 0 33\n10 1936 CAN 101 25 126\n# … with 612 more rows\n\n\n\nggplot(counts_NOC, aes(x = Male, y = Female, group = Year, color = Year)) +\n geom_point(alpha = 0.5) +\n geom_abline(intercept = 0, slope = 1, linetype = \"dashed\") +\n geom_smooth(method = \"lm\", se = FALSE)\n\n\n\n\n(ref:fig-5-years)\n\n\n\n\n(ref:fig-5-years) 各国参加奥运会男女运动员比例关系\n从图@ref(fig:5-years-regression)可以看出,与1936-1956年间女运动员的数量增长数量较为缓慢相比,1956-2016年间,女运动员的参赛人数有了明显的提升。从回归曲线(虚线)可以看出,在1996年及2016年,部分国家甚至派出了女性占大多数的参赛代表团。"
},
{
"objectID": "2-Olympic-history.html#奖牌榜medal-count",
"href": "2-Olympic-history.html#奖牌榜medal-count",
"title": "2 Olympic history",
"section": "2.5 奖牌榜(Medal Count)",
"text": "2.5 奖牌榜(Medal Count)\n\n# 按奖牌多少的顺序计算各国奖牌榜\nmedalCount <- data %>% \n filter(!is.na(Medal)) %>% \n filter(Sport != \"Art Competitions\") %>% \n group_by(Team, Medal) %>% \n summarize(Count = length(Medal))\n \nmedalCountLevs <- medalCount %>% \n group_by(Team) %>% \n summarize(Total = sum(Count)) %>% \n arrange(Total) %>% \n select(Team) %>% \n tail(25)\nmedalCount$Team <- factor(medalCount$Team, levels = medalCountLevs$Team)\nmedalCount <- medalCount %>% \n filter(Team != \"NA\")\n\n\nggplot(medalCount, aes(x = Team, y = Count, fill = Medal)) +\n geom_col() +\n coord_flip() +\n scale_fill_manual(values = c(\"gold1\", \"gray70\", \"gold4\")) +\n ggtitle(\"Olympics Medal Tally\") +\n theme(plot.title = element_text(hjust = 0.5))\n\n\n\n\n(ref:fig-medal)\n\n\n\n\n(ref:fig-medal) 各国历史奖牌榜前50名的国家\n图@ref(fig:medal)显示,美国所获的奖牌数最多,而且远多于第二名的苏联。中国排在日本之后,排名第16位。进一步我们看一下中国的奖牌数变化。\n\nmedalChina <- data %>% \n filter(!is.na(Medal)&Sport != \"Art Competitions\"&Team == \"China\") %>% \n group_by(Year, Medal) %>% \n summarize(Count = length(Medal))\n\n\nggplot(medalChina, aes(x = reorder(Year, -Count), y = Count, fill = Medal)) +\n geom_col() +\n scale_fill_manual(values = c(\"gold1\", \"gray70\", \"gold4\")) +\n xlab(\" \") +\n ylab(\"奖牌数\") +\n theme_bw()\n\n\n\n\n历届奥运会中国奖牌数量变化\n\n\n\n\n图@ref(fig:china-medal)显示,2008年北京奥运会,中国获得奖牌数最多。1988年汉城奥运会,由于苏联、德国等国家重新参加比赛,竞争明显大于1984年奥运会,导致中国获得奖牌数最少,仅5枚金牌。"
},
{
"objectID": "2-Olympic-history.html#地理信息地图",
"href": "2-Olympic-history.html#地理信息地图",
"title": "2 Olympic history",
"section": "2.6 地理信息地图",
"text": "2.6 地理信息地图\n本部分我们聚焦夏季奥运会,在世界地图上呈现各国参加奥运会人数的变化情况。选取最近的一届2016年奥运会以及分别相隔44年的1972年慕尼黑奥运会和1928年阿姆斯特丹奥运会。\n\n2.6.1 1928年奥运会情况\n\n# 读取NOC数据\nnoc <- read_csv(\"D:/Tools/Rwork/0.Study R/kaggle-project/data/olympics/noc_regions.csv\",\n col_types = cols(\n NOC = col_character(),\n region = col_character()\n ))\n\n# 增加regions数据,去除缺失值\ndataRegions <- data %>% \n left_join(noc, by = \"NOC\") %>% \n filter(!is.na(region))\n\n# 将选择的三届奥运会的数据筛选出来\namsterdam <- dataRegions %>% \n filter(Games== \"1928 Summer\") %>% \n group_by(region) %>% \n summarize(Amsterdam = length(unique(ID)))\n\nmunich <- dataRegions %>% \n filter(Games == \"1972 Summer\") %>% \n group_by(region) %>% \n summarize(Munich = length(unique(ID)))\n\nrio <- dataRegions %>% \n filter(Games == \"2016 Summer\") %>% \n group_by(region) %>% \n summarize(Rio = length(unique(ID)))\n\n\n# 建立地图\nworld <- map_data(\"world\")\nmapdat <- tibble(region = unique(world$region)) # 提取国家列\nmapdat <- mapdat %>% \n left_join(amsterdam, by = \"region\") %>% \n left_join(munich, by = \"region\") %>% \n left_join(rio, by = \"region\")\nmapdat$Amsterdam[is.na(mapdat$Amsterdam)] <- 0\nmapdat$Munich[is.na(mapdat$Munich)] <- 0\nmapdat$Rio[is.na(mapdat$Rio)] <- 0\nworld <- left_join(world, mapdat, by = \"region\")\n\n# 1928\nggplot(world, aes(x = long, y = lat, group = group)) + \n geom_polygon(aes(fill = Amsterdam)) +\n labs(title = \"Amsterdam 1928\",\n x = NULL, y = NULL) +\n theme(axis.ticks = element_blank(),\n axis.text = element_blank(),\n panel.background = element_rect(fill = \"navy\"), \n plot.title = element_text(hjust = 0.5)) +\n guides(fill = guide_colorbar(title = \"Athletes\")) +\n scale_fill_gradient(low = \"white\", high = \"red\")\n\n\n\n\n\n\n2.6.2 1972年奥运会情况\n\nggplot(world, aes(x = long, y = lat, group = group)) +\n geom_polygon(aes(fill = Munich)) +\n labs(title = \"Munich 1972\",\n x = NULL, y = NULL) +\n theme(axis.ticks = element_blank(),\n axis.text = element_blank(),\n panel.background = element_rect(fill = \"navy\"),\n plot.title = element_text(hjust = 0.5)) +\n guides(fill = guide_colorbar(title = \"Athletes\")) +\n scale_fill_gradient2(low = \"white\", high = \"red\")\n\n\n\n\n\n\n2.6.3 2016年奥运会情况\n\nggplot(world, aes(x = long, y = lat, group = group)) +\n geom_polygon(aes(fill = Rio)) +\n labs(title = \"Rio 2016\", x = NULL, y = NULL) +\n theme(axis.ticks = element_blank(),\n axis.text =element_blank(),\n panel.background = element_rect(fill = \"navy\"),\n plot.title = element_text(hjust = 0.5)) +\n guides(fill = guide_colorbar(title = \"Athletes\")) +\n scale_fill_gradient2(low = \"white\", high = \"red\")"
},
{
"objectID": "2-Olympic-history.html#参赛运动员身高体重",
"href": "2-Olympic-history.html#参赛运动员身高体重",
"title": "2 Olympic history",
"section": "2.7 参赛运动员身高体重",
"text": "2.7 参赛运动员身高体重\n更高、更快、更强是奥运会的座右铭,而每届奥运会的参赛运动员似乎也比之前奥运会更快更强。要验证这个观点,我们需要通过本数据探索历届奥运会运动员身高及体重的变化趋势。\n\n2.7.1 数据完整性及可用性检测\n\ndata %>% group_by(Year, Sex) %>% \n summarize(Present = length(unique(ID[which(!is.na(Height)&!is.na(Weight))])),\n Total = length(unique(ID))) %>% \n mutate(Proportion = Present/Total) %>% \n ggplot(aes(x = Year, y = Proportion, group = Sex, color = Sex)) +\n geom_point() +\n geom_line() +\n scale_color_manual(values = c(\"darkblue\", \"red\")) +\n theme(plot.title = element_text(hjust = 0.5),\n axis.text.x = element_text(face = \"bold\", angle = 90)) +\n labs(title = \"Height/Weight data completeness from each Olympics\") +\n scale_x_continuous(breaks = seq(1896, 2016, 4))\n\n\n\n\nHeight/Weight data completeness from each Olympics\n\n\n\n\n图@ref(fig:data-completeness)所示,1960年,数据的完整性有一个巨大的飞跃,且从本届奥运会开始,数据的完整性均超过了85%(除1992年外)。鉴于此,我们选取从1960年开始的数据,共包括56年间的15届奥运会。\n\ndata <- data %>% \n filter(!is.na(Height), !is.na(Weight), Year> 1959)\n\n\n\n2.7.2 身高体重\n\npHeight <- ggplot(data, aes(x = as.factor(Year), y = Height, fill = Sex)) +\n geom_boxplot(alpha = 0.75) +\n xlab(\"Olympiad Year\") + ylab(\"Height(cm)\") +\n scale_fill_manual(values = c(\"blue\", \"red\"))\n\npWeight <- ggplot(data, aes(x = as.factor(Year), y = Weight, fill = Sex)) +\n geom_boxplot(alpha = 0.75) +\n xlab(\"Olympiad Year\") + ylab(\"Weight(kg)\") +\n scale_fill_manual(values = c(\"blue\", \"red\"))\n\ngrid.arrange(pHeight, pWeight,ncol = 1)\n\n\n\n\nAthlete height & weight over time\n\n\n\n\n图@ref(fig:WH-overtime)显示,运动员身高体重(包括男女)均呈现稳步上升的趋势。但是,由于不同运动项目要求体型不同,图@ref(fig:WH-overtime)可能隐藏了一些重要的变化规律。因此,必须进一步深入探索不同项目中身高体重数据变化的趋势。然而,在奥运会的历史上,运动项目是不断变化的,首先,必须筛选出1960~2016年间奥运会都设立的比赛项目。\n\n# 筛选1960年奥运会的项目\nevents <- data[data$Year == 1960, \"Event\"] %>% \n unique %>% \n .$Event \n\nyears <- data$Year %>% \n unique %>% \n sort %>% tail(-1)\n\nfor (i in 1:length(years)) {\n nxt <- data[data$Year == years[i], \"Event\"] %>% \n unique %>% .$Event\n events <- intersect(events, nxt)\n}\n\n# 按照1960年项目对之后的项目进行筛选\ndata <- data %>% \n filter(Event %in% events)\n\n# get list of sports matching events\nsportsEvents <- data %>% \n select(Sport, Event) %>% \n unique\n\n\n\n2.7.3 change in Weight VS change in Height over times across men’s sports\n\n# 剔除摔跤、举重、拳击、马术\nsportsEvents <- sportsEvents %>% \n filter(!Sport %in% c(\"Wrestling\", \"Weightlifting\", \"Boxing\", \"Equestrianism\")) %>% \n filter(!Event %in% c(\"Figure Skating Mixed Pairs\")) %>% \n arrange(Sport)\n\n# 增加一列 men/women/mixed 区分男女项目\nsportsEvents$Sex <- ifelse(grepl(\"Women\", sportsEvents$Event), \"Women\", \"Men\")\n\n# 创建循环,进行回归\ns.height <- s.weight <- c()\nfor (i in 1:nrow(sportsEvents)) {\n temp <- data %>% filter(Event == sportsEvents$Event[i])\n lm.height <- lm(Height ~ Year, data = temp)\n lm.weight <- lm(Weight ~ Year, data = temp)\n s.height[i] <- lm.height$coefficients[\"Year\"]\n s.weight[i] <- lm.weight$coefficients[\"Year\"]\n}\nslopes <- tibble(Sport = sportsEvents$Sport,\n Event = sportsEvents$Event,\n Sex = sportsEvents$Sex,\n Height = s.height,\n Weight = s.weight)"
},
{
"objectID": "3-HR-comma-sep.html",
"href": "3-HR-comma-sep.html",
"title": "3 员工离职分析",
"section": "",
"text": "hr <- read.csv(\"data/HR_comma_sep.csv\")\nsummary(hr)\n\n satisfaction_level last_evaluation number_project average_montly_hours\n Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 96.0 \n 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000 1st Qu.:156.0 \n Median :0.6400 Median :0.7200 Median :4.000 Median :200.0 \n Mean :0.6128 Mean :0.7161 Mean :3.803 Mean :201.1 \n 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000 3rd Qu.:245.0 \n Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0 \n time_spend_company Work_accident left promotion_last_5years\n Min. : 2.000 Min. :0.0000 Min. :0.0000 Min. :0.00000 \n 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 \n Median : 3.000 Median :0.0000 Median :0.0000 Median :0.00000 \n Mean : 3.498 Mean :0.1446 Mean :0.2381 Mean :0.02127 \n 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 \n Max. :10.000 Max. :1.0000 Max. :1.0000 Max. :1.00000 \n sales salary \n Length:14999 Length:14999 \n Class :character Class :character \n Mode :character Mode :character \n \n \n \n\n\n观察各个变量的主要描述统计量,可知:\n\n离职率(left)平均将近24%。\n对公司的满意度(satisfaction_level)仅有62%左右。\n平均每个人参加过的项目数(number_project)仅为3~4个。\n员工每月平均工作时间(average_montly_hours)达到201.1小时,按照每月工作20天(去除8天双休)计算,每个员工平均每天工作超过10小时。\n\n\n\n\n我们通过绘图观察离职员工的特点。\n\nhr$left <- factor(hr$left, levels = c(\"0\", \"1\"))\n\n# 离职率与公司满意度关系\nboxSat <- ggplot(hr, aes(x = left, y = satisfaction_level,\n fill = left)) +\n geom_boxplot() +\n theme_bw() +\n labs(x = \"离职情况\", y = \"员工满意度\") +\n guides(fill = guide_legend(title = \"离职情况\"))\n\n# 离职率与绩效评估的关系\nboxEva <- ggplot(hr, aes(x = left, y = last_evaluation,\n fill = left)) +\n geom_boxplot() +\n theme_bw() +\n labs(x = \"离职情况\", y = \"绩效评估\") +\n guides(fill = guide_legend(title = \"离职情况\"))\n\n# 离职率与月均工作时间的关系\nboxMonth <- ggplot(hr, aes(x = left, y = average_montly_hours, fill = left)) +\n geom_boxplot() +\n theme_bw()+\n labs(x = \"离职率\", y = \"月均工作时间\") +\n guides(fill = guide_legend(title = \"离职情况\"))\n\n# 离职率与工作年限的关系\nboxTime <- ggplot(hr, aes(x = left, y = time_spend_company, fill = left)) +\n geom_boxplot() +\n theme_bw() +\n labs(x = \"离职率\", y = \"在职年限\") +\n guides(fill = guide_legend(title = \"离职情况\"))\n\nboxSat/boxEva |boxMonth/boxTime\n\n\n\n\n(ref:fig-resigned)\n\n\n\n\n(ref:fig-resigned) 员工离职情况与员工满意度、月均工作时间、绩效评估和在职年限的关系。\n由图@ref(fig:boxplot-resigned)可以看出,离职员工的几个特点:\n\n左上图:离职员工的满意度明显低于未离职的满意度,大都集中于0.4左右。\n左下图:离职员工的绩效评估较高。推测离职员工倾向于寻找待遇更好的工作。\n右上图:离职员工的月均工作时长较高,大部分超过了平均水平(200小时)。\n右下图:工作年限均在4年左右。\n\n\n\n\n\nhr$number_project <- factor(hr$number_project,\n levels = c(\"2\", \"3\", \"4\", \"5\", \"6\", \"7\"))\n\n# 离职与参与项目数关系\nbarProject <- ggplot(hr, aes(x = number_project, fill = left)) +\n geom_bar(position = \"fill\") + # fill为百分比条形图\n theme_bw() +\n labs(x = \"参与项目数\", y = \"比例\") +\n guides(fill = guide_legend(title = \"离职情况\"))\n\n\n# 离职与升职情况关系\nhr$promotion_last_5years[hr$promotion_last_5years == 1] <- \"已升职\"\nhr$promotion_last_5years[hr$promotion_last_5years == 0] <- \"未升职\"\n\nbar5years <- ggplot(hr, aes(x = as.factor(promotion_last_5years), fill = left)) + \n geom_bar(position = \"fill\") +\n theme_bw() +\n labs(x = \"5年内升职情况\", y = \"比例\") +\n theme(axis.text.x = element_text(angle = 45,\n hjust = 1)) +\n guides(fill = guide_legend(title = \"离职情况\"))\n\n# 离职与薪资关系\nbarSalary <- ggplot(hr, aes(x = factor(salary, levels = c(\"low\", \"medium\", \"high\"), ordered=TRUE), fill = left)) +\n geom_bar(position = \"fill\") +\n theme_bw() +\n labs(x = \"薪资情况\", y = \"比例\") +\n theme(axis.text.x = element_text(angle = 45,\n hjust = 1)) +\n guides(fill = 
guide_legend(title = \"离职情况\")) \n\nbarProject|bar5years |barSalary\n\n\n\n\n(ref:fig-bar-resigned)\n\n\n\n\n(ref:fig-bar-resigned) 员工离职情况与项目参与个数、五年内升职情况和薪资的关系。\n由图@ref(fig:barplot-resigned)可以看出,离职员工的几个特点:\n\n参与项目过少(2个)与过多(7个)的员工离职率均比较高。且参与项目在3个及以上时,参与项目越多,离职率越高。\n5年内未升职的员工离职率较高。\n薪资越低,离职率越高。"
},
{
"objectID": "3-HR-comma-sep.html#sec:three-model1",
"href": "3-HR-comma-sep.html#sec:three-model1",
"title": "3 员工离职分析",
"section": "3.2 建模预测1-回归树+混淆矩阵",
"text": "3.2 建模预测1-回归树+混淆矩阵\n建模的思路:\n\n提取所需数据。\n定义交叉验证方法。\n进行分层抽样,提取出想要的训练集和测试集。\n实际建模。\n对数据进行预测(利用混淆矩阵的方式)。\n\n\n3.2.1 提取数据\n选择符合条件的样本。通过绩效评估、在职时间和参与项目数筛选出更有代表性的样本数据进行分析。 按照绩效评估、在职时间、参与项目数量\n\nhr_model <- hr %>% \n filter(last_evaluation >= 0.70 |\n time_spend_company>=4 |\n number_project>=5)\n\n\n\n3.2.2 确定交叉验证方法\n\n# cv为设置交叉验证方法,number = 5为5折交叉验证。\ntrain_control <- trainControl(method = \"cv\",\n number = 5)\n\n\n\n3.2.3 分层抽样 1\n\n# 设定随机种子,确保每次抽样结果一致。\nset.seed(1234)\n\n# 根据数据因变量进行7:3的分层抽样,返回行索引向量 p = 0.7为按照7:3进行抽样\n# 参数list表示返回值是否为列表\nindex <- createDataPartition(hr_model$left,\n p = 0.7, list = F)\n# 以index为索引的数据为训练集\n# 剩余的数据为测试集\ntrainData <- hr_model[index, ]\ntestData <- hr_model[-index, ]\n\n\n\n3.2.4 实际建模\n使用carte包中的train函数对训练集进行5折交叉验证建立回归树模型。\n\n# left~. 代表因变量left与所有自变量进行建模。\nrpartmodel <- train(left~., data = trainData,\n trControl = train_control,\n method = \"rpart\")\n\n利用建立好的模型rpartmodel对测试集进行预测。\n\n# testdata[-7]剔除left列。\npredRpart <- predict(rpartmodel, testData[-7])\n\n建立混淆矩阵,验证建立的模型。\n\nconPart <- table(predRpart, testData$left)\nconPart\n\n \npredRpart 0 1\n 0 2246 72\n 1 51 528\n\n\n\n混淆矩阵:混淆矩阵的每一列代表了预测类别,每一列的总数表示预测为该类别的数据的数目;每一行代表了数据的真实归属类别,每一行的数据总数表示该类别的数据实例的数目。根据查全率和查准率两个参数判断模型拟合结果是否够好。\n\n混淆矩阵的查准率和查全率是两个重要的参数,具体计算公式如下式@ref(eq:three-CM):\n\\[\\begin{align}\n 查准率=\\frac{真正例}{真正例+假正例} \\\\\n 查全率=\\frac{真正例}{真正例+假反例}\n (\\#eq:three-CM)\n\\end{align}\\]\n根据混淆矩阵结果,可以得到回归树模型的:\n\n查准率为91.19 %。\n查全率为88 %。\n\n回归模型的拟合效果不错。"
},
{
"objectID": "3-HR-comma-sep.html#sec:three-model2",
"href": "3-HR-comma-sep.html#sec:three-model2",
"title": "3 员工离职分析",
"section": "3.3 建模预测2-朴素贝叶斯",
"text": "3.3 建模预测2-朴素贝叶斯\n建模步骤与第@ref(sec:three-model1)小结基本相同,下面只列出代码及结果。\n\nnbModel <- train(left~., data = trainData,\n trControl = train_control,\n method = \"nb\")\npredNb <- predict(nbModel, testData[-7])\nconNb <- table(predNb, testData$left)\nconNb\n\n \npredNb 0 1\n 0 2248 146\n 1 49 454\n\n\n根据公式@ref(eq:three-CM),计算得到朴素贝叶斯模型的:\n\n查准率为90.26 %。\n查全率为75.67 %。\n\n通过两种模型的评估,我们发现回归树模型的拟合度比朴素贝叶斯更好,所以接下来我们采用回归数模型进行进一步分析。"
},
{
"objectID": "3-HR-comma-sep.html#sec:three-model-use",
"href": "3-HR-comma-sep.html#sec:three-model-use",
"title": "3 员工离职分析",
"section": "3.4 模型评估及应用",
"text": "3.4 模型评估及应用\n\n本部分使用ROC曲线的方法对模型进行评估。具体步骤如下:\n\n根据建模预测的结果对样例进行排序。\n按照排序逐个把样本作为正例进行预测,每次计算出两个重要的值(分别为假正例率和真正例率,具体计算公式见下式@ref(eq:three-ROC)。\n\n\\[\\begin{align}\n 假正例率 = \\frac{假正例}{假正例+真反例} \\\\\n 真正例率 = \\frac{真正例}{真正例+假反例}(查全率)\n (\\#eq:three-ROC)\n\\end{align}\\]\n\n分别以计算的两个值作为横纵坐标,就得到了ROC曲线。\n\nROC曲线的评估方法:\n\n如果一个模型的ROC曲线被另一个模型的ROC曲线完全“包住”,说明后者的性能优于前者。\n如果两个ROC曲线发生交叉,则难以一般性的断言两者的优劣。如果一定要进行比较,较为合理的判断依据是比较ROC曲线下的面积(AUC)。一般情况下,如果\n\n\n\n3.4.1 ROC曲线绘制\n绘制ROC曲线的数据必须是数值型。\n\npredRpart <- as.numeric(as.character(predRpart))\npredNb <- as.numeric(predNb)\n\n转换后绘制图形。\n\n# 获取后续绘图使用的信息\nrocPart <- roc(testData$left, predRpart)\n\n# 计算两个关键值\n# 假正例率\nspecificityRp <- rocPart$specificities\n# 查全率,即真正利率\nsensitivityRp <- rocPart$sensitivities\n\n\n# 获取后续绘图使用的信息\nrocNb <- roc(testData$left, predNb)\n\n# 计算两个关键值\n# 假正例率\nspecificityNb <- rocNb$specificities\n# 查全率,即真正利率\nsensitivityNb <- rocNb$sensitivities\n\n绘制ROC图形。\n\n# 定义data = NULL声明未用任何数据\npRpart <- ggplot(data = NULL, aes(\n x = 1 - specificityRp, \n y = sensitivityRp)) +\n geom_line(color = \"red\") +\n geom_abline() +\n annotate(\"text\", x = 0.4, y = 0.5, \n label = paste(\"AUC = \", \n round(rocPart$auc, 3))) +\n theme_bw() +\n labs(x = \"1 - Specificity\", y = \"Sensitivities\")\n\npNb <- ggplot(data = NULL, aes(\n x = 1 - specificityNb,\n y = sensitivityNb)) +\n geom_line(color = \"red\") +\n geom_abline() +\n annotate(\"text\", x = 0.4, y = 0.5,\n label = paste(\"AUC = \",\n round(rocNb$auc, 3))) +\n theme_bw() +\n labs(x = \"1 - Specificity\", y = \" \")\n\npRpart|pNb\n\n\n\n\n回归树模型和朴素贝叶斯模型ROC曲线\n\n\n\n\n(ref:fig-ROC)\n从AUC值来看,同样是回归树模型的拟合效果好于朴素贝叶斯模型。"
},
{
"objectID": "3-HR-comma-sep.html#模型应用",
"href": "3-HR-comma-sep.html#模型应用",
"title": "3 员工离职分析",
"section": "3.5 模型应用",
"text": "3.5 模型应用\n使用回归树模型预测分类的概率,绘制交互预测表\n\n# type = \"prob\"表示结果显示为概率\n# predEnd <- predict(rpartmodel, testData[-7],\n# type = \"prob\")\n\n# 合并预测结果及概率\n# dataEnd <- cbind(round(predEnd, 3), predRpart)\n\n# 重命名预测结果表列名。\n# names(dataEnd) <- c(\"pred.0\", \"pred.1\", \"pred\")\n\n# head(dataEnd)\n# 生成交互式表格\n# datatable(dataEnd)"
},
{
"objectID": "3-HR-comma-sep.html#mlr3建模",
"href": "3-HR-comma-sep.html#mlr3建模",
"title": "3 员工离职分析",
"section": "3.6 mlr3建模",
"text": "3.6 mlr3建模\n\n3.6.1 回归树模型\n\nlibrary(mlr3verse)\n\n\n建立任务\n\n\nhr <- read.csv(\"data/HR_comma_sep.csv\")\nhr$left <- factor(hr$left)\nhr$salary <- factor(hr$salary)\nhr$sales <- factor(hr$sales)\nhr_model <- hr_model <- hr %>% \n filter(last_evaluation >= 0.70 |\n time_spend_company>=4 |\n number_project>=5)\ntask_hr <- \n TaskClassif$new(id = \"left\", backend = hr_model,\n target = \"left\")\ntask_hr\n\n<TaskClassif:left> (10394 x 10)\n* Target: left\n* Properties: twoclass\n* Features (9):\n - int (5): Work_accident, average_montly_hours, number_project,\n promotion_last_5years, time_spend_company\n - dbl (2): last_evaluation, satisfaction_level\n - fct (2): salary, sales\n\n\n\n定义学习器\n\n\nlearner_rpart <- lrn(\"classif.rpart\", \n predict_type = \"prob\")\n\n\n基础训练+预测\n\n\nset.seed(1234)\n# 划分训练集和测试集\ntrain_set <- sample(task_hr$nrow, 0.7 * task_hr$nrow)\ntest_set <- setdiff(seq_len(task_hr$nrow), train_set)\n\n# 训练模型\nlearner_rpart$train(task_hr, row_ids = train_set)\n\n# 数据预测\nprediction_rpart <- learner_rpart$predict(task_hr, \n row_ids = test_set)\n# 建立混淆矩阵\nprediction_rpart$confusion\n\n truth\nresponse 0 1\n 0 2483 85\n 1 17 534\n\n# 评估模型准确性\nmeasure_rpart <- msr(\"classif.acc\") \nprediction_rpart$score(measure_rpart)\n\nclassif.acc \n 0.9672972 \n\n\n\n重采样\n\n\n# 自动重采样\n## 定义重采样方法:5折交叉\nresampling_rpart <- rsmp(\"cv\", folds = 5L)\n## 应用重采样方法\nrr_rpart <- resample(task_hr, learner_rpart, resampling_rpart)\n\nINFO [18:54:50.866] [mlr3] Applying learner 'classif.rpart' on task 'left' (iter 1/5) \nINFO [18:54:50.959] [mlr3] Applying learner 'classif.rpart' on task 'left' (iter 3/5) \nINFO [18:54:51.019] [mlr3] Applying learner 'classif.rpart' on task 'left' (iter 5/5) \nINFO [18:54:51.071] [mlr3] Applying learner 'classif.rpart' on task 'left' (iter 2/5) \nINFO [18:54:51.133] [mlr3] Applying learner 'classif.rpart' on task 'left' (iter 4/5) \n\n## 每次重采样建模评分\nrr_rpart$score(measure_rpart)\n\n task task_id learner learner_id\n1: <TaskClassif[50]> left <LearnerClassifRpart[38]> classif.rpart\n2: <TaskClassif[50]> left <LearnerClassifRpart[38]> classif.rpart\n3: <TaskClassif[50]> left <LearnerClassifRpart[38]> classif.rpart\n4: <TaskClassif[50]> left <LearnerClassifRpart[38]> classif.rpart\n5: <TaskClassif[50]> left <LearnerClassifRpart[38]> classif.rpart\n resampling resampling_id iteration prediction\n1: <ResamplingCV[20]> cv 1 <PredictionClassif[20]>\n2: <ResamplingCV[20]> cv 2 <PredictionClassif[20]>\n3: <ResamplingCV[20]> cv 3 <PredictionClassif[20]>\n4: <ResamplingCV[20]> cv 4 <PredictionClassif[20]>\n5: <ResamplingCV[20]> cv 5 <PredictionClassif[20]>\n classif.acc\n1: 0.9639250\n2: 0.9735450\n3: 0.9610390\n4: 0.9672920\n5: 0.9682387\n\n\n\n## 将重采样的模型进行聚合并评分\nrr_rpart$aggregate(measure_rpart)\n\nclassif.acc \n 0.9668079 \n\n\n得到的回归树模型最终的拟合准确率为96.75%,拟合效果不错\n\n\n3.6.2 朴素贝叶斯模型\n\n建立任务\n\n\nhr <- read.csv(\"data/HR_comma_sep.csv\")\nhr$left <- factor(hr$left)\nhr$salary <- factor(hr$salary)\nhr$sales <- factor(hr$sales)\nhr_model <- hr_model <- hr %>% \n filter(last_evaluation >= 0.70 |\n time_spend_company>=4 |\n number_project>=5)\ntask_hr_nb <- \n TaskClassif$new(id = \"left\", backend = hr_model,\n target = \"left\")\ntask_hr_nb\n\n<TaskClassif:left> (10394 x 10)\n* Target: left\n* Properties: twoclass\n* Features (9):\n - int (5): Work_accident, average_montly_hours, number_project,\n promotion_last_5years, time_spend_company\n - dbl (2): last_evaluation, satisfaction_level\n - fct (2): salary, sales\n\n\n\n选择学习器\n\n\nlearner_nb <- lrn(\"classif.naive_bayes\",\n 
predict_type = \"prob\")\n\n\n划分训练集和测试集\n\n\nset.seed(1234)\ntrain_set <- sample(task_hr_nb$nrow, task_hr_nb$nrow * 0.7)\ntest_set <- setdiff(seq_len(task_hr_nb$nrow), train_set)\n\n\n模型训练和预测\n\n\n# 模型训练\nlearner_nb$train(task_hr_nb, row_ids = train_set)\nlearner_nb$model # 查看训练好的模型\n\n\nNaive Bayes Classifier for Discrete Predictors\n\nCall:\nnaiveBayes.default(x = x, y = y)\n\nA-priori probabilities:\ny\n 0 1 \n0.8074227 0.1925773 \n\nConditional probabilities:\n Work_accident\ny [,1] [,2]\n 0 0.1813075 0.3853055\n 1 0.0442541 0.2057326\n\n average_montly_hours\ny [,1] [,2]\n 0 201.0504 45.49460\n 1 254.7744 34.45457\n\n last_evaluation\ny [,1] [,2]\n 0 0.7657457 0.1552908\n 1 0.8763241 0.1013923\n\n number_project\ny [,1] [,2]\n 0 3.959993 1.023111\n 1 5.244825 1.156665\n\n promotion_last_5years\ny [,1] [,2]\n 0 0.028770855 0.16717611\n 1 0.002141328 0.04624142\n\n salary\ny high low medium\n 0 0.10316650 0.44722506 0.44960844\n 1 0.01641685 0.60314061 0.38044254\n\n sales\ny accounting hr IT management marketing product_mng\n 0 0.04902962 0.04324140 0.08614232 0.04971059 0.05498808 0.06179775\n 1 0.05424697 0.04782298 0.07994290 0.02997859 0.04710921 0.05353319\n sales\ny RandD sales support technical\n 0 0.05720123 0.27221655 0.15253660 0.17313585\n 1 0.03783012 0.26766595 0.15488936 0.22698073\n\n satisfaction_level\ny [,1] [,2]\n 0 0.6622302 0.2274002\n 1 0.4660742 0.3465198\n\n time_spend_company\ny [,1] [,2]\n 0 3.654069 1.6892539\n 1 4.559600 0.7978871\n\n# 模型预测\nprediction_nb <- learner_nb$predict(task_hr_nb, row_ids = test_set)\nprediction_nb # 查看预测结果\n\n<PredictionClassif> for 3119 observations:\n row_ids truth response prob.0 prob.1\n 2 1 1 0.0002953563 0.9997046\n 3 1 1 0.1080414241 0.8919586\n 4 1 1 0.0031520108 0.9968480\n--- \n 10388 1 1 0.0423579034 0.9576421\n 10390 1 0 0.5807408182 0.4192592\n 10392 1 1 0.0014815943 0.9985184\n\n\n\n模型评估\n\n\nprediction_nb$confusion\n\n truth\nresponse 0 1\n 0 2243 100\n 1 257 519\n\nmeasure_nb <- msr(\"classif.acc\")\nprediction_nb$score(measure_nb) # 预测精度\n\nclassif.acc \n 0.8855402 \n\n\n\n重采样\n\n\nresampling_nb <- rsmp(\"cv\", folds = 5L)\nrr_nb <- resample(task_hr_nb, learner_nb, resampling_nb)\n\nINFO [18:54:52.391] [mlr3] Applying learner 'classif.naive_bayes' on task 'left' (iter 1/5) \nINFO [18:54:52.820] [mlr3] Applying learner 'classif.naive_bayes' on task 'left' (iter 3/5) \nINFO [18:54:53.230] [mlr3] Applying learner 'classif.naive_bayes' on task 'left' (iter 5/5) \nINFO [18:54:53.634] [mlr3] Applying learner 'classif.naive_bayes' on task 'left' (iter 2/5) \nINFO [18:54:54.031] [mlr3] Applying learner 'classif.naive_bayes' on task 'left' (iter 4/5) \n\nrr_nb$score(measure_nb)\n\n task task_id learner learner_id\n1: <TaskClassif[50]> left <LearnerClassifNaiveBayes[36]> classif.naive_bayes\n2: <TaskClassif[50]> left <LearnerClassifNaiveBayes[36]> classif.naive_bayes\n3: <TaskClassif[50]> left <LearnerClassifNaiveBayes[36]> classif.naive_bayes\n4: <TaskClassif[50]> left <LearnerClassifNaiveBayes[36]> classif.naive_bayes\n5: <TaskClassif[50]> left <LearnerClassifNaiveBayes[36]> classif.naive_bayes\n resampling resampling_id iteration prediction\n1: <ResamplingCV[20]> cv 1 <PredictionClassif[20]>\n2: <ResamplingCV[20]> cv 2 <PredictionClassif[20]>\n3: <ResamplingCV[20]> cv 3 <PredictionClassif[20]>\n4: <ResamplingCV[20]> cv 4 <PredictionClassif[20]>\n5: <ResamplingCV[20]> cv 5 <PredictionClassif[20]>\n classif.acc\n1: 0.8840789\n2: 0.8951419\n3: 0.8946609\n4: 0.8869649\n5: 0.8695861\n\nrr_nb$aggregate(measure_nb)\n\nclassif.acc 
\n 0.8860865 \n\n\n通过两种模型的评估,我们发现回归树模型的拟合度比朴素贝叶斯更好,与传统方法得出的结论一致。\n\n\n3.6.3 利用mlr3进行ROC曲线绘制\n\nlibrary(mlr3viz)\nroc_nb <- autoplot(prediction_nb, type = \"roc\")\nroc_rpart <- autoplot(prediction_rpart, type = \"roc\")\nroc_rpart|roc_nb\n\n\n\n\n\n\n3.6.4 模型应用\n使用回归树模型预测分类概率,绘制表格交互表\n\nautoplot(prediction_rpart)\n\n\n\n\n\n# type = \"prob\"表示结果显示为概率\npredEnd <- predict(rpartmodel, testData[-7],\n type = \"prob\")\n\n# 合并预测结果及概率\ndataEnd <- cbind(round(predEnd, 3), predRpart)\n\n# 重命名预测结果表列名。\nnames(dataEnd) <- c(\"pred.0\", \"pred.1\", \"pred\")\n\n# head(dataEnd)\n# 生成交互式表格\n# datatable(dataEnd)"
},
{
"objectID": "4-creditcard.html",
"href": "4-creditcard.html",
"title": "4 信用卡欺诈识别",
"section": "",
"text": "变量说明\nclass变量:0表示非欺诈,1表示非欺诈。"
},
{
"objectID": "4-creditcard.html#sec:five2",
"href": "4-creditcard.html#sec:five2",
"title": "4 信用卡欺诈识别",
"section": "4.2 数据预处理",
"text": "4.2 数据预处理\n\ncard <- read_csv(\"data/creditcard.csv\")\ncard <- as.data.frame(card)\nstr(card) # 查看数据基本结构和数据类型\n\n'data.frame': 284807 obs. of 31 variables:\n $ Time : num 0 0 1 1 2 2 4 7 7 9 ...\n $ V1 : num -1.36 1.192 -1.358 -0.966 -1.158 ...\n $ V2 : num -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...\n $ V3 : num 2.536 0.166 1.773 1.793 1.549 ...\n $ V4 : num 1.378 0.448 0.38 -0.863 0.403 ...\n $ V5 : num -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...\n $ V6 : num 0.4624 -0.0824 1.8005 1.2472 0.0959 ...\n $ V7 : num 0.2396 -0.0788 0.7915 0.2376 0.5929 ...\n $ V8 : num 0.0987 0.0851 0.2477 0.3774 -0.2705 ...\n $ V9 : num 0.364 -0.255 -1.515 -1.387 0.818 ...\n $ V10 : num 0.0908 -0.167 0.2076 -0.055 0.7531 ...\n $ V11 : num -0.552 1.613 0.625 -0.226 -0.823 ...\n $ V12 : num -0.6178 1.0652 0.0661 0.1782 0.5382 ...\n $ V13 : num -0.991 0.489 0.717 0.508 1.346 ...\n $ V14 : num -0.311 -0.144 -0.166 -0.288 -1.12 ...\n $ V15 : num 1.468 0.636 2.346 -0.631 0.175 ...\n $ V16 : num -0.47 0.464 -2.89 -1.06 -0.451 ...\n $ V17 : num 0.208 -0.115 1.11 -0.684 -0.237 ...\n $ V18 : num 0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...\n $ V19 : num 0.404 -0.146 -2.262 -1.233 0.803 ...\n $ V20 : num 0.2514 -0.0691 0.525 -0.208 0.4085 ...\n $ V21 : num -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...\n $ V22 : num 0.27784 -0.63867 0.77168 0.00527 0.79828 ...\n $ V23 : num -0.11 0.101 0.909 -0.19 -0.137 ...\n $ V24 : num 0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...\n $ V25 : num 0.129 0.167 -0.328 0.647 -0.206 ...\n $ V26 : num -0.189 0.126 -0.139 -0.222 0.502 ...\n $ V27 : num 0.13356 -0.00898 -0.05535 0.06272 0.21942 ...\n $ V28 : num -0.0211 0.0147 -0.0598 0.0615 0.2152 ...\n $ Amount: num 149.62 2.69 378.66 123.5 69.99 ...\n $ Class : num 0 0 0 0 0 0 0 0 0 0 ...\n\nsummary(card) # 查看数据的主要描述性统计量\n\n Time V1 V2 V3 \n Min. : 0 Min. :-56.40751 Min. :-72.71573 Min. :-48.3256 \n 1st Qu.: 54202 1st Qu.: -0.92037 1st Qu.: -0.59855 1st Qu.: -0.8904 \n Median : 84692 Median : 0.01811 Median : 0.06549 Median : 0.1799 \n Mean : 94814 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 \n 3rd Qu.:139321 3rd Qu.: 1.31564 3rd Qu.: 0.80372 3rd Qu.: 1.0272 \n Max. :172792 Max. : 2.45493 Max. : 22.05773 Max. : 9.3826 \n V4 V5 V6 V7 \n Min. :-5.68317 Min. :-113.74331 Min. :-26.1605 Min. :-43.5572 \n 1st Qu.:-0.84864 1st Qu.: -0.69160 1st Qu.: -0.7683 1st Qu.: -0.5541 \n Median :-0.01985 Median : -0.05434 Median : -0.2742 Median : 0.0401 \n Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 \n 3rd Qu.: 0.74334 3rd Qu.: 0.61193 3rd Qu.: 0.3986 3rd Qu.: 0.5704 \n Max. :16.87534 Max. : 34.80167 Max. : 73.3016 Max. :120.5895 \n V8 V9 V10 V11 \n Min. :-73.21672 Min. :-13.43407 Min. :-24.58826 Min. :-4.79747 \n 1st Qu.: -0.20863 1st Qu.: -0.64310 1st Qu.: -0.53543 1st Qu.:-0.76249 \n Median : 0.02236 Median : -0.05143 Median : -0.09292 Median :-0.03276 \n Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 \n 3rd Qu.: 0.32735 3rd Qu.: 0.59714 3rd Qu.: 0.45392 3rd Qu.: 0.73959 \n Max. : 20.00721 Max. : 15.59500 Max. : 23.74514 Max. :12.01891 \n V12 V13 V14 V15 \n Min. :-18.6837 Min. :-5.79188 Min. :-19.2143 Min. :-4.49894 \n 1st Qu.: -0.4056 1st Qu.:-0.64854 1st Qu.: -0.4256 1st Qu.:-0.58288 \n Median : 0.1400 Median :-0.01357 Median : 0.0506 Median : 0.04807 \n Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 \n 3rd Qu.: 0.6182 3rd Qu.: 0.66251 3rd Qu.: 0.4931 3rd Qu.: 0.64882 \n Max. : 7.8484 Max. : 7.12688 Max. : 10.5268 Max. : 8.87774 \n V16 V17 V18 \n Min. :-14.12985 Min. :-25.16280 Min. 
:-9.498746 \n 1st Qu.: -0.46804 1st Qu.: -0.48375 1st Qu.:-0.498850 \n Median : 0.06641 Median : -0.06568 Median :-0.003636 \n Mean : 0.00000 Mean : 0.00000 Mean : 0.000000 \n 3rd Qu.: 0.52330 3rd Qu.: 0.39968 3rd Qu.: 0.500807 \n Max. : 17.31511 Max. : 9.25353 Max. : 5.041069 \n V19 V20 V21 \n Min. :-7.213527 Min. :-54.49772 Min. :-34.83038 \n 1st Qu.:-0.456299 1st Qu.: -0.21172 1st Qu.: -0.22839 \n Median : 0.003735 Median : -0.06248 Median : -0.02945 \n Mean : 0.000000 Mean : 0.00000 Mean : 0.00000 \n 3rd Qu.: 0.458949 3rd Qu.: 0.13304 3rd Qu.: 0.18638 \n Max. : 5.591971 Max. : 39.42090 Max. : 27.20284 \n V22 V23 V24 \n Min. :-10.933144 Min. :-44.80774 Min. :-2.83663 \n 1st Qu.: -0.542350 1st Qu.: -0.16185 1st Qu.:-0.35459 \n Median : 0.006782 Median : -0.01119 Median : 0.04098 \n Mean : 0.000000 Mean : 0.00000 Mean : 0.00000 \n 3rd Qu.: 0.528554 3rd Qu.: 0.14764 3rd Qu.: 0.43953 \n Max. : 10.503090 Max. : 22.52841 Max. : 4.58455 \n V25 V26 V27 \n Min. :-10.29540 Min. :-2.60455 Min. :-22.565679 \n 1st Qu.: -0.31715 1st Qu.:-0.32698 1st Qu.: -0.070840 \n Median : 0.01659 Median :-0.05214 Median : 0.001342 \n Mean : 0.00000 Mean : 0.00000 Mean : 0.000000 \n 3rd Qu.: 0.35072 3rd Qu.: 0.24095 3rd Qu.: 0.091045 \n Max. : 7.51959 Max. : 3.51735 Max. : 31.612198 \n V28 Amount Class \n Min. :-15.43008 Min. : 0.00 Min. :0.000000 \n 1st Qu.: -0.05296 1st Qu.: 5.60 1st Qu.:0.000000 \n Median : 0.01124 Median : 22.00 Median :0.000000 \n Mean : 0.00000 Mean : 88.35 Mean :0.001728 \n 3rd Qu.: 0.07828 3rd Qu.: 77.17 3rd Qu.:0.000000 \n Max. : 33.84781 Max. :25691.16 Max. :1.000000 \n\nround(prop.table(table(card$Class)),4)# 查看数据类别比例\n\n\n 0 1 \n0.9983 0.0017 \n\n\n\n4.2.1 分层抽样\n处理类别不平衡的数据需要了解的几个概念点:\n\n类别不平衡:指分类任务重不同类别的训练样本树木差别很大的情况。\n欠抽样:指某类(样本数占比很大)的样本中抽取出与另一类样本(样本数占比很小)个数一样的样本。即从大类别中抽取与小类别数目一样的样本。\n过抽样:指针对样本数占比很小的类别,重新塑造一些数据,使其与另一类数据接近。\n\n对数据进行一些基本转化。\n\n# 把Time列转换为小时\ncard <- card %>% \n mutate(Time_Hour = round(card[, 1]/3600, 0))\n\n# 把Class列转化为因子型\ncard$Class <- factor(card$Class)\n\ncard_1 <- card[card$Class == \"1\", ] # 欺诈样本\ncard_0 <- card[card$Class == \"0\", ] # 非欺诈样本\n\n随机抽取与诈骗样本个数相同的非欺诈样本数据,并与元欺诈样本合并为新的数据。此处使用的欠抽样的方法。\n\nset.seed(1234)\nindex <- sample(x = 1:nrow(card_0), size = nrow(card_1))\ncard_0_new <- card_0[index, ]\ncard_end <- rbind(card_0_new, card_1)\n\n# 剔除Time列,用Time_Hour列代替。everything()选择所有的变量\ncard_end <- card_end[-1] %>% \n select(Time_Hour, everything())\n\n按照类别进行分层抽样,建立训练集和测试集。\n\nset.seed(1234)\n# 按照新数据的目标变量进行8:2\nindex2 <- createDataPartition(card_end$Class,\n p = 0.8, list = F)\ntrain_data <- card_end[index2, ] # 创建训练集\ntest_data <- card_end[-index2, ] # 创建测试集\n\n# 验证抽样结果,统计三个数据集中正反样本比例是否一致\ntable(card_end$Class)\n\n\n 0 1 \n492 492 \n\ntable(train_data$Clas)\n\n\n 0 1 \n394 394 \n\ntable(test_data$Class)\n\n\n 0 1 \n98 98 \n\n\n\n\n4.2.2 标准化\n\nstandard <- preProcess(card_end, method = \"range\") \ncard_s <- predict(standard, card_end)\ntrain_data2 <- card_s[index2, ]\ntest_data2 <- card_s[-index2, ]"
},
{
"objectID": "4-creditcard.html#sec:five3",
"href": "4-creditcard.html#sec:five3",
"title": "4 信用卡欺诈识别",
"section": "4.3 描述性分析",
"text": "4.3 描述性分析\n\n4.3.1 不同时间诈骗次数-条形图\n\nggplot(card_1, aes(x = factor(Time_Hour), \n fill = factor(Time_Hour)))+\n geom_bar(stat = \"count\") +\n theme_classic() +\n labs(x = \"Time_Hour\", y = \"Count\") +\n theme(legend.position = \"none\",\n axis.text.x = element_text(angle = 90,\n vjust = 0.5))\n\n\n\n\n不同时间诈骗次数\n\n\n\n\n由图@ref(fig:card-times)可知:\n\n第一天(024h)的诈骗总次数大于第二天(2548h)。\n诈骗发生次数最多的三个时间段分别是:\n\n第二天凌晨2点左右。\n第一天上午11点左右。\n第一天凌晨2点左右。\n\n\n\n\n4.3.2 不同时间诈骗金额-箱线图\n\nggplot(card_1, aes(x = factor(Time_Hour),\n y = Amount, \n fill = factor(Time_Hour))) +\n geom_boxplot() +\n geom_hline(aes(yintercept =250, color = \"red\")) + \n annotate(\"text\", x = 6, y = 500, label = \"Amount = 250\", color = \"red\") +\n geom_curve(x = 3, y = 450, xend = 5, yend = 250, angle = 25, color = \"red\",\n arrow = arrow(length = unit(0.25, \"cm\"))) +\n theme_classic() +\n labs(x = \"Time_Hour\", y = \"Amount\") +\n theme(legend.position = \"none\", \n axis.text.x = element_text(angle = 90, \n vjust = 0.5))\n\n\n\n\n不同时间诈骗金额\n\n\n\n\n由图@ref(fig:card-amount)可知:\n\n诈骗金额最多的一次发生在第二天下午1点作用(34h),诈骗金额达到2000欧元左右。\n诈骗金额普遍在250欧元之内。\n\n\n\n4.3.3 不同时间平均诈骗金额-条形图\n\n# 提取所需数据\ncard_1_mean <- card_1 %>% \n group_by(Time_Hour) %>% \n summarise(MeanAmount = mean(Amount))\n\n\nggplot(card_1_mean, aes(x = factor(Time_Hour), y = MeanAmount, fill = factor(Time_Hour))) +\n geom_bar(stat = \"identity\") +\n geom_hline(aes(yintercept = 200, color = \"red\")) +\n annotate(\"text\", x = 26, y = 240, label = \"Mean_Amount = 200\", color = \"red\") +\n geom_curve(x = 23, y = 220, xend = 24, yend = 200, \n curvature = 0.3, arrow = arrow(length = unit(0.2, \"cm\")), color = \"red\") +\n theme_classic() +\n theme(legend.position = \"none\",\n axis.text.x = element_text(angle = 90, vjust = 0.5)) +\n labs(x = \"Time_Hour\", y = \"Mean_Amount\")\n\n\n\n\n不同时间平均诈骗金额-条形图\n\n\n\n\n如图@ref(fig:card-mean)所示:\n\n平均诈骗金额最多的时间段为第二天下午1点,此时间点包含诈骗金额最多的观测。\n总体而言,平均诈骗金额普遍在200欧元以内。"
},
{
"objectID": "4-creditcard.html#sec:five4",
"href": "4-creditcard.html#sec:five4",
"title": "4 信用卡欺诈识别",
"section": "4.4 自动参数调整调参-使用caret包",
"text": "4.4 自动参数调整调参-使用caret包\n参数调整是提升模型性能的一个重要过程,而大多数机器学习算法都可以至少调整一个参数。复杂的模型通常可以通过调节多个参数值来调整模型从而达到更好的拟合效果。\ne.g.,寻找最合适的k值来调整k近邻模型、调节隐藏层层数和隐藏层的节点数来优化神经网络模型;又如支持向量机模型中的调节核函数以及“软边界”惩罚大小等优化。\n值得注意的是,如果对所有可能的调参选项均进行尝试,其复杂度非常大,耗时且不科学,需要一种更系统、科学的方式对模型的参数进行调节。\n下表列举了使用caret包进行自动参数调整的模型及其参数:\n\n\n\n模型\n方法名\n参数\n\n\n\n\nk近邻\nknn\nk\n\n\n朴素贝叶斯\nnb\nfL、usekernel\n\n\n决策树\nC5.0\nmodel、trials、winnow\n\n\nOneR规则学习器\nOneR\n无\n\n\n线性回归\nlm\n无\n\n\n回归树\nrpart\ncp\n\n\n模型树\nM5\npruned、smoothed、rules\n\n\n支持向量机(径向基核)\nsvmRadial\nC, sigma\n\n\n随机森林\nrf\nmtry\n\n\n\n更多可调节参数的详细信息\n本案例我们使用knn和随机森林两个模型。\n我们用iris数据对自动调参的步骤进行演示。\n\n创建简单的调整的模型\n\n\nset.seed(1234)\nm_C50 <- train(Species~., data = iris, method = \"C5.0\")\nm_C50\n\nC5.0 \n\n150 samples\n 4 predictor\n 3 classes: 'setosa', 'versicolor', 'virginica' \n\nNo pre-processing\nResampling: Bootstrapped (25 reps) \nSummary of sample sizes: 150, 150, 150, 150, 150, 150, ... \nResampling results across tuning parameters:\n\n model winnow trials Accuracy Kappa \n rules FALSE 1 0.9353579 0.9019696\n rules FALSE 10 0.9370844 0.9045424\n rules FALSE 20 0.9325835 0.8976068\n rules TRUE 1 0.9382311 0.9062975\n rules TRUE 10 0.9407392 0.9099910\n rules TRUE 20 0.9385430 0.9066136\n tree FALSE 1 0.9347127 0.9009924\n tree FALSE 10 0.9369888 0.9044013\n tree FALSE 20 0.9332286 0.8985820\n tree TRUE 1 0.9375860 0.9053246\n tree TRUE 10 0.9399845 0.9088007\n tree TRUE 20 0.9392443 0.9076915\n\nAccuracy was used to select the optimal model using the largest value.\nThe final values used for the model were trials = 10, model = rules and\n winnow = TRUE.\n\n\n由上面的结果可以看出,基于model、trials和winnow三个参数,建立并测试了12个决策树(C5.0)模型,每个模型均给出了精度及Kappa统计量,最下方同时展示了最佳候选模型所对应的参数值。其中Kappa统计量用来衡量模型的稳定性:\n\n很差的一致性: <0.2\n尚可的一致性: 0.2~0.4\n中等的一致性: 0.4~0.6\n不错的一致性: 0.6~0.8\n很好的一致性: 0.8~1\n\n\n定制调参\n\n\n使用trainCotrol()函数创建一些列配置选项,这些选项考虑了包括重抽样策略以及用于选择最佳模型的度量这些模型评价标准的管理。主要专注于两个重要的参数:method和selectionFuncio。\n\nmethod为冲抽样的方法。\nselectionFunction参数可以设定一个函数,用于在各个候选者中选取特定的模型,共3个函数:\n\nbest函数:默认选项,简单的选择具有最好的某特定度量值的候选者。\noneSE函数:选择最好性能标准差之内的最简单的候选者。\nTolerance函数:选择某个用户制定比例之内最简单的候选者。\n\n\n\n\nset.seed(1234)\nmodel_rf <- train(Class~., data = train_data, method = \"rf\",\n trControl = trainControl(method = \"cv\",\n number = 5,\n selectionFunction = \"oneSE\"))\nmodel_rf\n\nRandom Forest \n\n788 samples\n 30 predictor\n 2 classes: '0', '1' \n\nNo pre-processing\nResampling: Cross-Validated (5 fold) \nSummary of sample sizes: 631, 631, 630, 630, 630 \nResampling results across tuning parameters:\n\n mtry Accuracy Kappa \n 2 0.9276465 0.8552977\n 16 0.9314521 0.8628921\n 30 0.9276627 0.8553120\n\nAccuracy was used to select the optimal model using the one SE rule.\nThe final value used for the model was mtry = 2.\n\n# 进行预测\npred_rf <- predict(model_rf, test_data[-31]) \n\n\n# 建立混淆矩阵\nconfusionMatrix(data = pred_rf, reference = test_data$Class,\n positive = \"1\")\n\nConfusion Matrix and Statistics\n\n Reference\nPrediction 0 1\n 0 98 7\n 1 0 91\n \n Accuracy : 0.9643 \n 95% CI : (0.9278, 0.9855)\n No Information Rate : 0.5 \n P-Value [Acc > NIR] : < 2e-16 \n \n Kappa : 0.9286 \n \n Mcnemar's Test P-Value : 0.02334 \n \n Sensitivity : 0.9286 \n Specificity : 1.0000 \n Pos Pred Value : 1.0000 \n Neg Pred Value : 0.9333 \n Prevalence : 0.5000 \n Detection Rate : 0.4643 \n Detection Prevalence : 0.4643 \n Balanced Accuracy : 0.9643 \n \n 'Positive' Class : 1 \n \n\nplot(varImp(model_rf)) # 查看变量的重要性"
},
{
"objectID": "4-creditcard.html#sec:five5",
"href": "4-creditcard.html#sec:five5",
"title": "4 信用卡欺诈识别",
"section": "4.5 kNN建模",
"text": "4.5 kNN建模\n\n4.5.1 原理\nknn,即邻近分类器,就是把未标记的案例归类为与他们最相似的带有标记的案例所在的类。\n算法流程:\n\n依次计算测试样本与哥哥训练样本间的距离(常用欧式距离);\n将这些距离按照升序排列;\n选取距离最小的k(3~10)个训练样本点;\n确定这k个点中不同类别的占比;\n返回这k个点中占比最大的类别作为测试样本的预测分类。\n\n\n\n4.5.2 模型建立\n\n# 创建空向量\nresults <- c()\n\nfor (i in 3:10){\n set.seed(1234)\n pred_knn <- knn(train_data2[-31], test_data2[-31],\n train_data2$Class, i)\n Table <- table(pred_knn, test_data2$Class) # 得到混淆矩阵\n accuracy <- sum(diag(Table))/sum(Table) # diag()提取对角线的值\n results <- c(results, accuracy)\n}\n\nggplot(as.data.frame(results), aes(x = 3:10, y = results)) +\n geom_point()+\n geom_line() +\n theme_bw() +\n labs(xlab = \" \")\n\n\n\n\n\nset.seed(1234)\npred_knn <- knn(train = train_data2[-31], test = test_data2[-31],\n cl = train_data2$Class, k = 4)\nconfusionMatrix(pred_knn,test_data2$Class, positive = \"1\")\n\nConfusion Matrix and Statistics\n\n Reference\nPrediction 0 1\n 0 97 7\n 1 1 91\n \n Accuracy : 0.9592 \n 95% CI : (0.9212, 0.9822)\n No Information Rate : 0.5 \n P-Value [Acc > NIR] : <2e-16 \n \n Kappa : 0.9184 \n \n Mcnemar's Test P-Value : 0.0771 \n \n Sensitivity : 0.9286 \n Specificity : 0.9898 \n Pos Pred Value : 0.9891 \n Neg Pred Value : 0.9327 \n Prevalence : 0.5000 \n Detection Rate : 0.4643 \n Detection Prevalence : 0.4694 \n Balanced Accuracy : 0.9592 \n \n 'Positive' Class : 1"
},
{
"objectID": "4-creditcard.html#sec:five6",
"href": "4-creditcard.html#sec:five6",
"title": "4 信用卡欺诈识别",
"section": "4.6 模型评估",
"text": "4.6 模型评估\n\n# 建立一个数据框,将两个模型预测的结果和真实值放进去。并展示不同预测值\npred_results <- data.frame(knn = pred_knn, rf = pred_rf, \n class = test_data$Class)\nindex3 <- which(pred_results$knn != pred_rf)\npred_results[index3, ]\n\n knn rf class\n25 1 0 0\n159 1 0 1\n160 0 1 1\n168 1 0 1\n182 0 1 1"
},
{
"objectID": "5-Student-performance-level.html",
"href": "5-Student-performance-level.html",
"title": "5 学生成绩水平分类",
"section": "",
"text": "变量说明。\n变量中最重要的的为Class学生等级变量,是我们建模的目标变量。\n\nedudata <- read_csv(\"data/xAPI-Edu-Data.csv\")\nedudata$Class <- factor(edudata$Class, levels = c(\"H\", \"M\", \"L\"))\nedudata$gender <- factor(edudata$gender, levels = c(\"M\", \"F\"))\nstr(edudata)\n\nspec_tbl_df [480 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)\n $ gender : Factor w/ 2 levels \"M\",\"F\": 1 1 1 1 1 2 1 1 2 2 ...\n $ NationalITy : chr [1:480] \"KW\" \"KW\" \"KW\" \"KW\" ...\n $ PlaceofBirth : chr [1:480] \"KuwaIT\" \"KuwaIT\" \"KuwaIT\" \"KuwaIT\" ...\n $ StageID : chr [1:480] \"lowerlevel\" \"lowerlevel\" \"lowerlevel\" \"lowerlevel\" ...\n $ GradeID : chr [1:480] \"G-04\" \"G-04\" \"G-04\" \"G-04\" ...\n $ SectionID : chr [1:480] \"A\" \"A\" \"A\" \"A\" ...\n $ Topic : chr [1:480] \"IT\" \"IT\" \"IT\" \"IT\" ...\n $ Semester : chr [1:480] \"F\" \"F\" \"F\" \"F\" ...\n $ Relation : chr [1:480] \"Father\" \"Father\" \"Father\" \"Father\" ...\n $ raisedhands : num [1:480] 15 20 10 30 40 42 35 50 12 70 ...\n $ VisITedResources : num [1:480] 16 20 7 25 50 30 12 10 21 80 ...\n $ AnnouncementsView : num [1:480] 2 3 0 5 12 13 0 15 16 25 ...\n $ Discussion : num [1:480] 20 25 30 35 50 70 17 22 50 70 ...\n $ ParentAnsweringSurvey : chr [1:480] \"Yes\" \"Yes\" \"No\" \"No\" ...\n $ ParentschoolSatisfaction: chr [1:480] \"Good\" \"Good\" \"Bad\" \"Bad\" ...\n $ StudentAbsenceDays : chr [1:480] \"Under-7\" \"Under-7\" \"Above-7\" \"Above-7\" ...\n $ Class : Factor w/ 3 levels \"H\",\"M\",\"L\": 2 2 3 3 2 2 3 2 2 2 ...\n - attr(*, \"spec\")=\n .. cols(\n .. gender = col_character(),\n .. NationalITy = col_character(),\n .. PlaceofBirth = col_character(),\n .. StageID = col_character(),\n .. GradeID = col_character(),\n .. SectionID = col_character(),\n .. Topic = col_character(),\n .. Semester = col_character(),\n .. Relation = col_character(),\n .. raisedhands = col_double(),\n .. VisITedResources = col_double(),\n .. AnnouncementsView = col_double(),\n .. Discussion = col_double(),\n .. ParentAnsweringSurvey = col_character(),\n .. ParentschoolSatisfaction = col_character(),\n .. StudentAbsenceDays = col_character(),\n .. Class = col_character()\n .. )\n - attr(*, \"problems\")=<externalptr>"
},
{
"objectID": "5-Student-performance-level.html#sec:descriptive-analysis",
"href": "5-Student-performance-level.html#sec:descriptive-analysis",
"title": "5 学生成绩水平分类",
"section": "5.2 描述性分析",
"text": "5.2 描述性分析\n\n5.2.1 封装绘图函数\n\nfun_bar <- function(data, xlab, fillc, pos, xname, yname, legend){\n data %>% \n group_by({{xlab}}) %>% # dplyr中的自定函数参数需要使用{{}}括起来\n mutate(count = n()) %>% \n ggplot(aes(reorder({{xlab}}, count), count, fill = {{fillc}})) +\n geom_col(position = pos) + #pos = \"stack\" or \"fill\"\n labs(x = xname, y = yname) +\n coord_flip() +\n theme_bw() +\n guides(fill = guide_legend(title = legend))\n}\n\n\n\n5.2.2 不同教育程度的学生选择课程主题\n\np1 <- fun_bar(data = edudata, xlab = Topic, fillc = StageID,\n pos = \"stack\", xname = \"Topic\", yname = \"Student_Count\",\n legend = \"教育程度\")\np2 <- fun_bar(data = edudata, xlab = Topic, fillc = StageID,\n pos = \"fill\", xname = \"Topic\", yname = \"Per_Student_Count\",\n legend = \"教育程度\")\np1/p2\n\n\n\n\n不同教育程度的学生选择课程主题\n\n\n\n\n由图@ref(fig:school-level)可以看出:\n\n课程主题最多的为IT、French和Arabic,其中选择IT的课程主题的学员远高于其他课。\n无论哪种教育程度,IT、Science、Math和English四种课程都是必修的(三种颜色都有)。\n\n\n\n5.2.3 不同课程主题监护人情况\n这部分主要针对家长的情况进行分析,了解父母对学员学习的不同情况。对应在数据集中的变量为Relation。\n\np3 <- fun_bar(data = edudata, xlab = Topic, fillc = Relation,\n pos = \"stack\", xname = \"Topic\", yname = \"Student_count\",\n legend = \"监护人情况\")\np4 <- fun_bar(data = edudata, xlab = Topic, fillc = Relation,\n pos = \"fill\", xname = \"Topic\", yname = \"Student_count\",\n legend = \"监护人情况\")\np3/p4\n\n\n\n\n不同课程主题监护人情况\n\n\n\n\n由图@ref(fig:relation)可以看出:\n\n总体而言,监护人为父亲的较多。其中,IT和Math课程中,负责人为父亲的超过75%。\nFrench课程,监护人大多数为母亲,占70%左右。\n\n\n\n5.2.4 不同课程学生学习成绩\n\np5 <- fun_bar(data = edudata, xlab = Topic, fillc = Class,\n pos = \"stack\", xname = \"Topic\", yname = \"Student_count\",\n legend = \"学生成绩\")\np6 <- fun_bar(data = edudata, xlab = Topic, fillc = Class,\n pos = \"fill\", xname = \"Topic\", yname = \"Student_count\",\n legend = \"学生成绩\")\np5/p6\n\n\n\n\n不同课程学生学习成绩\n\n\n\n\n由图@ref(fig:class):\n\n所有课程中,只有Biology课程中,属于高水平的学生数超过了50%。\n在Geology课程中,没有低水平的学生。\n\n\n\n5.2.5 不同教室学生成绩水平\n\np7 <- fun_bar(data = edudata, xlab = SectionID, fillc = Class,\n pos = \"stack\", xname = \"Section_ID\", yname = \"Student_count\",\n legend = \"学生成绩\")\np8 <- fun_bar(data = edudata, xlab = SectionID, fillc = Class,\n pos = \"fill\", xname = \"Section_ID\", yname = \"Student_count\",\n legend = \"学生成绩\")\np7/p8\n\n\n\n\n不同教室学生成绩水平\n\n\n\n\n由图@ref(fig:score-section)可以看出:\n\n在A班的学生最多,C班的学生最少。\nC班的低水平成绩的学生相对较多,其它两个班级的成绩水平基本一致。\n\n\n\n5.2.6 不同学期、不同成绩水平与监护人的关系\n\n# 封装函数,去掉坐标轴翻转\nfun_bar2 <- function(data, xlab, fillc, pos, xname, yname, legend){\n data %>% \n group_by({{xlab}}) %>% # dplyr中的自定函数参数需要使用{{}}括起来\n mutate(count = n()) %>% \n ggplot(aes(reorder({{xlab}}, count), count, fill = {{fillc}})) +\n geom_col(position = pos) + #pos = \"stack\" or \"fill\"\n labs(x = xname, y = yname) +\n theme_bw() +\n guides(fill = guide_legend(title = legend))\n}\n\n\np9 <- fun_bar2(edudata, Semester, Relation, pos = \"stack\",\n xname = \"Semester\", yname = \"Student_count\",\n legend = \"监护人情况\")\n\np10 <- fun_bar2(edudata, Semester, Relation, pos = \"fill\",\n xname = \"Semester\", yname = \"per_Student_count\",\n legend = \"监护人情况\")\n\np11 <- fun_bar2(edudata, Class, Relation, pos = \"stack\",\n xname = \"Class\", yname = \"Student_count\",\n legend = \"监护人情况\")\n\np12 <- fun_bar2(edudata, Class, Relation, pos = \"fill\",\n xname = \"Class\", yname = \"per_Student_count\",\n legend = \"监护人情况\")\n(p9|p10) / (p11|p12)\n\n\n\n\n不同学期、不同成绩水平与监护人的关系\n\n\n\n\n由图@ref(fig:semester)可知:\n\n第一学期父亲作为监护人的学生数比第二学期多。\n总体看,成绩水平较高的学生中,监护人为母亲的比较多;其它水平均是父亲较多。\n\n\n\n5.2.7 家长是否回答调查问卷、成绩水平与家长对学校是否满意的关系\n\np13 <- fun_bar2(edudata, 
ParentAnsweringSurvey, ParentschoolSatisfaction, \n pos = \"stack\", xname = \"ParentAnsweringSurvey\",\n yname = \"Student_count\", legend = \"是否满意\")\np14 <- fun_bar2(edudata, ParentAnsweringSurvey, ParentschoolSatisfaction, \n pos = \"fill\", xname = \"ParentAnsweringSurvey\",\n yname = \"Per_Student_count\", legend = \"是否满意\")\n\np15 <- fun_bar2(edudata, Class, ParentschoolSatisfaction, \n pos = \"stack\", xname = \"Class\",\n yname = \"Student_count\", legend = \"是否满意\")\np16 <- fun_bar2(edudata, Class, ParentschoolSatisfaction, \n pos = \"fill\", xname = \"Class\",\n yname = \"Per_Student_count\", legend = \"是否满意\")\n(p13|p14)/(p15|p16)\n\n\n\n\n家长是否回答调查问卷、成绩水平与家长对学校是否满意的关系\n\n\n\n\n由图@ref(fig:surey-class)可以看出:\n\n有超过一半的家长回答了问卷,其中,回答问卷的家长大部分对学校满意,而未回答问卷的则大部分对学校不满。\n成绩越高,家长对学校越满意。\n\n\n\n5.2.8 性别、逃课次数与学生成绩水平的关系\n\np17 <- fun_bar2(edudata, gender, Class, \n pos = \"stack\", xname = \"Gender\",\n yname = \"Student_count\", legend = \"成绩水平\")\np18 <- fun_bar2(edudata, gender, Class, \n pos = \"fill\", xname = \"Gender\",\n yname = \"Per_Student_count\", legend = \"成绩水平\")\n\np19 <- fun_bar2(edudata, StudentAbsenceDays, Class, \n pos = \"stack\", xname = \"Class\",\n yname = \"Student_count\", legend = \"成绩水平\")\np20 <- fun_bar2(edudata, StudentAbsenceDays, Class, \n pos = \"fill\", xname = \"Class\",\n yname = \"Per_Student_count\", legend = \"成绩水平\")\n(p17|p18)/(p19|p20)\n\n\n\n\n性别、逃课次数与学生成绩水平的关系\n\n\n\n\n由图@ref(fig:gender-absence)可知:\n\n女生比男生数量少很多,但高水平成绩的人数明显比男生多;中水平成绩男女比例基本持平。\n逃课超过7天的的学生基本无法取得好的成绩。\n\n\n\n5.2.9 举手次数和参加讨论次数与成绩水平关系\n\nfun_bar3 <- function(data, xlab, ylab, fillc, xname, yname){\n data %>% \n group_by({{xlab}}) %>% \n summarise(Mcount = mean({{ylab}})) %>% \n ggplot(aes({{xlab}}, Mcount, fill = {{fillc}})) +\n geom_col() + \n labs(x = xname, y = yname) +\n theme_bw() +\n theme(legend.position = \"none\")\n}\n\n\n# edudata$Class <- factor(edudata$Class, c(\"H\", \"M\", \"L\"), ordered = TRUE)\np21 <- fun_bar3(data = edudata, xlab = Class, ylab = raisedhands, \n fillc = Class, \"成绩水平\", \"平均举手次数\" )\n\np22 <- fun_bar3(data = edudata, xlab = Class, ylab = Discussion, \n fillc = Class, \"成绩水平\", \"平均参与讨论次数\" )\np21|p22\n\n\n\n\n举手次数和参加讨论次数与成绩水平关系\n\n\n\n\n由图@ref(fig:raisedhands-discuss)可知: 举手次数和参与讨论次数越多,成绩水平越高。"
},
{
"objectID": "5-Student-performance-level.html#模型建立",
"href": "5-Student-performance-level.html#模型建立",
"title": "5 学生成绩水平分类",
"section": "5.3 模型建立",
"text": "5.3 模型建立\n\n5.3.1 回归树模型建立\n\nset.seed(1234)\n# 按照数据目标8:2进行分层抽样,返回矩阵形式的抽样索引\nindex <- createDataPartition(edudata$Class, p = 0.8, list = F)\ntrain <- edudata[index, ]\ntest <- edudata[-index, ]\n\n# 建立回归树模型\nrpart_model <- rpart(Class ~., data = train)\n# type = \"class\"指定预测结果是具体的某个类别\npred_rp <- predict(rpart_model, test[-17], type = \"class\")\nconfusionMatrix(pred_rp, test$Class)\n\nConfusion Matrix and Statistics\n\n Reference\nPrediction H M L\n H 18 3 0\n M 9 29 3\n L 1 10 22\n\nOverall Statistics\n \n Accuracy : 0.7263 \n 95% CI : (0.6252, 0.8128)\n No Information Rate : 0.4421 \n P-Value [Acc > NIR] : 1.882e-08 \n \n Kappa : 0.5806 \n \n Mcnemar's Test P-Value : 0.05103 \n\nStatistics by Class:\n\n Class: H Class: M Class: L\nSensitivity 0.6429 0.6905 0.8800\nSpecificity 0.9552 0.7736 0.8429\nPos Pred Value 0.8571 0.7073 0.6667\nNeg Pred Value 0.8649 0.7593 0.9516\nPrevalence 0.2947 0.4421 0.2632\nDetection Rate 0.1895 0.3053 0.2316\nDetection Prevalence 0.2211 0.4316 0.3474\nBalanced Accuracy 0.7990 0.7320 0.8614\n\nprp(rpart_model)\n\n\n\n\n\n\n5.3.2 随机数模型\n\nset.seed(1234)\n# importance = T:稍后对变量进行重要性的可视化\nrf_model <- randomForest(Class~., data = train, importance = T)\npred_rf <- predict(rf_model, test[-17], type = \"class\")\nconfusionMatrix(pred_rf, test$Class) # 混淆矩阵判断模型准确率\n\nConfusion Matrix and Statistics\n\n Reference\nPrediction H M L\n H 20 4 0\n M 8 36 4\n L 0 2 21\n\nOverall Statistics\n \n Accuracy : 0.8105 \n 95% CI : (0.7172, 0.8837)\n No Information Rate : 0.4421 \n P-Value [Acc > NIR] : 1.886e-13 \n \n Kappa : 0.7032 \n \n Mcnemar's Test P-Value : NA \n\nStatistics by Class:\n\n Class: H Class: M Class: L\nSensitivity 0.7143 0.8571 0.8400\nSpecificity 0.9403 0.7736 0.9714\nPos Pred Value 0.8333 0.7500 0.9130\nNeg Pred Value 0.8873 0.8723 0.9444\nPrevalence 0.2947 0.4421 0.2632\nDetection Rate 0.2105 0.3789 0.2211\nDetection Prevalence 0.2526 0.5053 0.2421\nBalanced Accuracy 0.8273 0.8154 0.9057\n\nvarImpPlot(rf_model) # 可视化变量重要性函数\n\n\n\n\n阅读上图:\n\n圆点越靠近右侧越重要。\n我们重点观察排名前五的变量。通过左右两图的对比发现,两图中前四个变量相同(交叉),可以判定这四个变量是数据中最重要的变量。\n\n\n\n5.3.3 SVM建模-支持向量机(需要再研究)\n\nset.seed(1234)\nlibrary(kernlab) # Kernel-Based Machine Learning Lab\nsvm_model <- ksvm(Class~., data = test, kernel = \"rbfdot\")\n# type = \"response\":指定预测结果是具体的某个列别\npred_svm <- predict(svm_model, test[-17], type = \"response\")\nconfusionMatrix(pred_svm, test$Class)\n\nConfusion Matrix and Statistics\n\n Reference\nPrediction H M L\n H 23 4 0\n M 5 36 1\n L 0 2 24\n\nOverall Statistics\n \n Accuracy : 0.8737 \n 95% CI : (0.7897, 0.933)\n No Information Rate : 0.4421 \n P-Value [Acc > NIR] : < 2.2e-16 \n \n Kappa : 0.8053 \n \n Mcnemar's Test P-Value : NA \n\nStatistics by Class:\n\n Class: H Class: M Class: L\nSensitivity 0.8214 0.8571 0.9600\nSpecificity 0.9403 0.8868 0.9714\nPos Pred Value 0.8519 0.8571 0.9231\nNeg Pred Value 0.9265 0.8868 0.9855\nPrevalence 0.2947 0.4421 0.2632\nDetection Rate 0.2421 0.3789 0.2526\nDetection Prevalence 0.2842 0.4421 0.2737\nBalanced Accuracy 0.8809 0.8720 0.9657\n\n\n\n\n5.3.4 模型融合\n将各个模型的结果做一个融合(合并至一个数据框)。\n\nresult <- data.frame(rpart = pred_rp,\n randomForest = pred_rf,\n svm = pred_svm,\n actual_class = test$Class, \n final_pred = rep(\"-\", nrow(test)))\nhead(result)\n\n rpart randomForest svm actual_class final_pred\n1 M M M M -\n2 L L L L -\n3 L L L L -\n4 M M L M -\n5 L L L L -\n6 M M M M -\n\n\n\n# 封装求众数函数\nfun_pred <- function(x){\n names(which.max(table(x)))\n}\n\nresult$final_pred <- factor(apply(result[1:2], 1, 
fun_pred))\nconfusionMatrix(result$actual_class, result$final_pred)\n\nConfusion Matrix and Statistics\n\n Reference\nPrediction H L M\n H 21 1 6\n L 0 23 2\n M 7 10 25\n\nOverall Statistics\n \n Accuracy : 0.7263 \n 95% CI : (0.6252, 0.8128)\n No Information Rate : 0.3579 \n P-Value [Acc > NIR] : 3.029e-13 \n \n Kappa : 0.5887 \n \n Mcnemar's Test P-Value : 0.09327 \n\nStatistics by Class:\n\n Class: H Class: L Class: M\nSensitivity 0.7500 0.6765 0.7576\nSpecificity 0.8955 0.9672 0.7258\nPos Pred Value 0.7500 0.9200 0.5952\nNeg Pred Value 0.8955 0.8429 0.8491\nPrevalence 0.2947 0.3579 0.3474\nDetection Rate 0.2211 0.2421 0.2632\nDetection Prevalence 0.2947 0.2632 0.4421\nBalanced Accuracy 0.8228 0.8218 0.7417\n\nhead(result)\n\n rpart randomForest svm actual_class final_pred\n1 M M M M M\n2 L L L L L\n3 L L L L L\n4 M M L M M\n5 L L L L L\n6 M M M M M"
},
{
"objectID": "7-hotel-demond.html#不同国家预定数量分布",
"href": "7-hotel-demond.html#不同国家预定数量分布",
"title": "7 酒店房间预定预测",
"section": "7.4 不同国家预定数量分布",
"text": "7.4 不同国家预定数量分布\n\n#1. Summarizing data by city hotels\ndfcCity <- dfc %>%\n filter(hotel == \"City Hotel\") %>%\n group_by(country) %>%\n summarize(\"total_rate\" = sum(total_rates), \"bookings_count\" = round(log(n()), 1)) %>%\n mutate(\"average_rate\" = total_rate / n())\n\n#2. Summarizing data by resort hotels\ndfcResort <- dfc %>%\n filter(hotel == \"Resort Hotel\") %>%\n group_by(country) %>%\n summarize(\"total_rate\" = sum(total_rates), \"bookings_count\" = round(log(n()), 1)) %>%\n mutate(\"average_rate\" = total_rate / n())\n\n#3. Plots\nMapCity <- highchart() %>%\n hc_add_series_map(worldgeojson, dfcCity, \n value = \"bookings_count\", \n joinBy = c(\"iso3\", \"country\")) %>%\n hc_colorAxis(minColor = \"#fde725\", \n maxColor = \"#0d0887\") %>%\n hc_legend(align = \"top\", verticalAlign = \"top\", \n layout = \"horizontal\", x = -8, y = 41) %>%\n hc_title(text = \"City Hotel Bookings Volume by Country\", align = \"left\") %>%\n hc_subtitle(text = \"Scale in Natural Log\", align = \"left\")\n\nMapResort <- highchart() %>%\n hc_add_series_map(worldgeojson, dfcResort, \n value = \"bookings_count\", \n joinBy = c(\"iso3\", \"country\")) %>%\n hc_legend(align = \"top\", verticalAlign = \"top\", \n layout = \"horizontal\", x = -8, y = 41) %>%\n hc_title(text = \"Resort Hotel Bookings Volume by Country\", align = \"left\") %>%\n hc_subtitle(text = \"Scale in Natural Log\", align = \"left\")\n\n#4. Arraging and aligning each plot\nhw_grid(MapCity, MapResort)"
}
]