Introduction to Basic Statistics in the Psychology module
This R script aims to generate a dataset for the purpose of 100SoM-IMEX Science Seeker programme
Description of example dataset:
- SPM Maths trials results of 3 schools (long format).
- Among them, SPM results of 2 schools are normally distributed with the same mean and sd,the 3rd school has much higher mean.
Steps taken
1. Housekeeping
Firstly we clear the workspace and load the libraries needed.
rm(list=ls()) # clear the workspace
#set.seed for reproducible "random" numbers
set.seed(1234)
#load neccesary packages
require(ggplot2)
## Loading required package: ggplot2
require(psych)
## Loading required package: psych
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
require(truncnorm)
## Loading required package: truncnorm
require(patchwork)
## Loading required package: patchwork
2. Data used on the slides
The data below were “generated” as illustrative purpose for the slides for this module
<-c(10, 10, 10, 20, 30, 20, 10, 30, 20, 30)
central_tendency_eg
mean(central_tendency_eg)
## [1] 19
median(central_tendency_eg)
## [1] 20
#function to get mode
<- function(v) {
getmode <- unique(v)
uniqv which.max(tabulate(match(v, uniqv)))]
uniqv[
}
getmode(central_tendency_eg)
## [1] 10
(central_tendency_eg)## [1] 10 10 10 20 30 20 10 30 20 30
3. Generate data for the three schools
#Generate random ID numbers
<-c(100:999)
IdenNum#generate a column for School A and School B
=c(rep("SMK Ravenclaw",300),rep("SMK Slytherin",300), rep("SMK Hufflepuff", 300)) # schools as caegorical data
School=c(rep(c("Male","Female"),450)) # sex as binary/categorical data
Sex<-as.data.frame(cbind(IdenNum,School,Sex))
datahead(data)
## IdenNum School Sex
## 1 100 SMK Ravenclaw Male
## 2 101 SMK Ravenclaw Female
## 3 102 SMK Ravenclaw Male
## 4 103 SMK Ravenclaw Female
## 5 104 SMK Ravenclaw Male
## 6 105 SMK Ravenclaw Female
#generate normally distributed scores for students from SMK Slytherin and SMK Ravenclaw separately.
<-round(rnorm(100, mean = 30, sd = 10),0)
Ravenclaw_score<-round(rnorm(100,mean = 30, sd = 10),0)
Slytherin_score<-Slytherin_score+40
Hufflepuff_score
describe(Ravenclaw_score) #min=7, max=55, mean=28
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 100 28.42 10.04 26 27.69 9.64 7 55 48 0.59 -0.07 1
describe(Slytherin_score) #min=1, max=60, mean-30
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 100 30.44 10.33 30 30.4 8.9 1 60 59 -0.02 0.53 1.03
describe(Hufflepuff_score) # min=41, max=100, mean=70
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 100 70.44 10.33 70 70.4 8.9 41 100 59 -0.02 0.53 1.03
t.test(Ravenclaw_score,Slytherin_score) #Slytherin has higher mean score with p-value >0.05
##
## Welch Two Sample t-test
##
## data: Ravenclaw_score and Slytherin_score
## t = -1.4029, df = 197.84, p-value = 0.1622
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.8594675 0.8194675
## sample estimates:
## mean of x mean of y
## 28.42 30.44
t.test(Slytherin_score,Hufflepuff_score) # Hufflepuff has higher mean score with p-value <0.05
##
## Welch Two Sample t-test
##
## data: Slytherin_score and Hufflepuff_score
## t = -27.393, df = 198, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -42.87958 -37.12042
## sample estimates:
## mean of x mean of y
## 30.44 70.44
$Score<-ifelse(data$School=="SMK Ravenclaw", Ravenclaw_score,
dataifelse(data$School=="SMK Slytherin",Slytherin_score,
ifelse(data$School=="SMK Hufflepuff", Hufflepuff_score,NA)))
$Grade<-as.factor(ifelse(data$Score<40, "G",
dataifelse(data$Score>=40&data$Score<45, "E",
ifelse(data$Score>=45&data$Score<50, "D",
ifelse(data$Score>=50&data$Score<55, "C",
ifelse(data$Score>=55&data$Score<60, "C+",
ifelse(data$Score>=60&data$Score<65, "B",
ifelse(data$Score>=65&data$Score<70, "B+",
ifelse(data$Score>=70&data$Score<80,"A-",
ifelse(data$Score>=80&data$Score<90,"A",
ifelse(data$Score>=90&data$Score<=100,"A+",NA)))))))))))
4. Plotting the figures
Histogram for SMK Ravenclaw
ggplot(data,aes(x=Score))+
geom_histogram(data=subset(data,School=="SMK Ravenclaw"), fill= "red", breaks=seq(0, 100, by=10),
col="black",
alpha = .2)+
labs(title="Distribution of scores of SMK Ravenclaw students")+
theme(plot.title = element_text(hjust = 0.5))
Histogram for SMK Slytherin
ggplot(data,aes(x=Score))+
geom_histogram(data=subset(data,School=="SMK Slytherin"), fill= "blue", breaks=seq(0, 100, by=10),
col="black",
alpha = .2)+
labs(title="Distribution of scores of SMK Slytherin students")+
theme(plot.title = element_text(hjust = 0.5))
Histogram for SMK Hufflepuff
ggplot(data,aes(x=Score))+
geom_histogram(data=subset(data,School=="SMK Hufflepuff"), fill= "green", breaks=seq(0, 100, by=10),
col="black",
alpha = .2)+
labs(title="Distribution of scores of SMK Hufflepuff students")+
theme(plot.title = element_text(hjust = 0.5))
Plotting histograms of both Slytherin and Ravenclaw together:
ggplot(data,aes(x=Score, fill=School)) +
geom_histogram(data=subset(data,School == 'SMK Ravenclaw'),col="black", breaks=seq(0, 100, by=10),alpha = 0.5) +
geom_histogram(data=subset(data,School == 'SMK Slytherin'),col="black",breaks=seq(0, 100, by=10), alpha = 0.5)+
labs(title="Distribution of scores of SMK Ravenclaw and SMK Slytherin")+
theme(plot.title = element_text(hjust = 0.5))
Plotting Hufflepuff against Ravenclaw
ggplot(data,aes(x=Score,fill=School)) +
geom_histogram(data=subset(data,School == 'SMK Ravenclaw'),col="black", breaks=seq(0, 100, by=10),alpha = 0.5) +
geom_histogram(data=subset(data,School == 'SMK Hufflepuff'),col="black",breaks=seq(0, 100, by=10), alpha = 0.5)+
labs(title="Distribution of scores of SMK Ravenclaw and SMK Hufflepuff")+
theme(plot.title = element_text(hjust = 0.5))+
scale_fill_manual(values=c("SMK Ravenclaw"="Red","SMK Hufflepuff"="Green" ))
Boxplot for SMK Hufflepuff
boxplot(subset(data, School=="SMK Hufflepuff")$Score,main="Boxplot: SMK Hufflepuff Scores", col="green")
Boxplot for all schools
ggplot(data,aes(x=Score,y=School, fill=School))+
geom_boxplot()+ylab("")+labs(title="Boxplot for all schools")
5. Identifying data type
Recode data such that grade is ordinal variable, score is continuous variable, sex is categorical variable
Reorder grade and plot bar chart
$Grade <- ordered(data$Grade, levels = c("G", "E", "D",
data"C", "C+", "B",
"B+", "A-", "A",
"A+"))
plot(data$Grade, col=c("black", "red","orange","pink", "yellow","green",
"blue", "violet", "purple", "brown"),
main="Distribution of grades of all three schools")
6. Mean, mode, median
Showing mean score for each school
<-aggregate(data$Score, list(data$School), mean)
tablecolnames(table)<-c("School", "Mean")
table## School Mean
## 1 SMK Hufflepuff 70.44
## 2 SMK Ravenclaw 28.42
## 3 SMK Slytherin 30.44
<-aggregate(data$Score, list(data$School), median)
table2colnames(table2)<-c("School", "Median")
table2## School Median
## 1 SMK Hufflepuff 70
## 2 SMK Ravenclaw 26
## 3 SMK Slytherin 30
merge(table,table2, by="School")
## School Mean Median
## 1 SMK Hufflepuff 70.44 70
## 2 SMK Ravenclaw 28.42 26
## 3 SMK Slytherin 30.44 30
Get mode
<- function(v) {
getmode <- unique(v)
uniqv which.max(tabulate(match(v, uniqv)))]
uniqv[
}
<-aggregate(as.factor(data$Grade), list(data$School), getmode)
table3colnames(table3)<-c("School", "Mode")
table3## School Mode
## 1 SMK Hufflepuff A-
## 2 SMK Ravenclaw G
## 3 SMK Slytherin G
6. Save the data
setwd("/Users/kai/OneDrive - King's College London/PhD/outside_PhD/iMEX_SoM")
::write.xlsx(data, "Maths_trial_data.xlsx") xlsx