-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathglms.R
117 lines (86 loc) · 4.29 KB
/
glms.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#The dataset contained in the file timss.csv is a subset of data from the 2015 Trends
#in International Mathematics and Science Study (Mullis et al., 2016).
#It contains results of a study conducted on year 5 (9-10 year old) schoolchildren in England. As
#part of this study, children were asked to complete a mathematics test and answer
#survey questions about their background and their perceptions of mathematics. Each
#row of the dataset corresponds to an individual student, and the columns are:
#• sex: The sex of the child.
#• score: The child’s percentage score in the mathematics test.
#• books: The number of books in the household where the child lives.
#• place of birth: Whether or not the child was born in the UK.
#• good at maths: Did the child agree with the statement “I am good at working
# out difficult mathematics problems”?
#Goal: How 'good_at_maths' depends on 'score' and other variables.
#importing libraries and data
install.packages('ggplot2')
library(ggplot2)
df = read.csv('timss.csv')
#analysis
head(df)
str(df)
dim(df)
summary(df)
df$sex = as.factor(as.numeric(df$sex)-1) #male = 1
df$good_at_maths = as.factor(as.numeric(df$good_at_maths)-1) #disagree = 1
df$place_of_birth = as.factor(as.numeric(df$place_of_birth)-1) # uk = 1
#plotting
#score+sex agains the class
#score agains the class
# it probably does influence general iq etc.
ggplot(df, aes(x=score, fill=books)) +
geom_histogram(binwidth=5, position="dodge")
ggplot(df, aes(books, score))+
geom_violin(aes(fill = books), trim=FALSE)+
xlim("0-10", "11-25", "26-100", "101-200", "More than 200")+
xlab('Number of books in home')+
ylab('Percentage scored in maths test')+
ggtitle('Perfomance in mathematics across differet number of book in the household')
#we see nicely that they correlate highly, more books, smarter kid
ggplot(df, aes(sex, score))+
geom_violin(aes(fill = sex), trim=FALSE)+
xlab('Gender')+
ylab('Percentage scored in maths test')+
ggtitle('Perfomance in mathematics across sex')
summary(df[df$sex == 0,]$score)
summary(df[df$sex == 1,]$score)
#we see clearly that boys tend to score negligibly better, the distribution seem to be shifted by 2%
ggplot(df, aes(good_at_maths, score))+
geom_violin(aes(fill = good_at_maths), trim=FALSE)+
ylab('Percentage scored in maths test')+
ggtitle('Perfomance in mathematics across the class variable')+
scale_x_discrete("Answer to the question (class variable)",
labels = c("0" = "Agree","1" = "Disagree"))+
theme(legend.position = 'none')
summary(df[df$good_at_maths == 0,]$score)
summary(df[df$good_at_maths == 1,]$score)
#'generally, we see that kids who Agree are generally better by around 9% on the mean, so we can say that
#'these answers are appropiate for most of the cases
#'distribution of 'disagree' is less skewed i.e students who
ggplot(df, aes(score, fill = good_at_maths))+
geom_density(position = "stack")+
xlab('Test score')+
ylab('')+
ggtitle('Distribution of test score across class variable')+
theme(legend.position = 'none')+
ylim(0,0.04)
#'the two distributions are different, distribution of those that disagreed is less skewed,
#'better students tend to agree with the statement
ggplot(df, aes(score))+
geom_density(fill = "#ff4d5d", alpha = 0.5)+
xlab('Test score')+
ylab('')+
ggtitle('Distribution of test score')+
theme(legend.position = 'none')+
ylim(0,0.02)
#people generally score high (not normal
#modelling
logr_1 <- glm(good_at_maths ~ score, family = 'binomial', data = df)
summary(logr_1) #conclusion, 1% on the test gives 0.018 to Agree, statistically significant
logr_2 <- glm(good_at_maths ~ ., family = 'binomial', data = df)
summary(logr_2) #most of parameters apart from score and sex are not significant as have large p-values
logr_3 <- glm(good_at_maths ~ sex + score, family = 'binomial', data = df)
summary(logr_3) #looks like males have higher probability of saying Agree
logr_4 <- glm(good_at_maths ~ books, family = 'binomial', data = df)
summary(logr_4) #books variables are not statistically significant
lm_1 <- lm(score ~ sex, df)
summary(lm_1) #significantly statistic: being a male increases score by about 1.8%, s.e quite large