Introduction to Basic Statistics in the Psychology module

This R script aims to generate a dataset for the purpose of 100SoM-IMEX Science Seeker programme

Description of example dataset:

  • SPM Maths trials results of 3 schools (long format).
  • Among them, SPM results of 2 schools are normally distributed with the same mean and sd,the 3rd school has much higher mean.

Steps taken

1. Housekeeping

Firstly we clear the workspace and load the libraries needed.

rm(list=ls()) # clear the workspace

#set.seed for reproducible "random" numbers 
set.seed(1234)

#load neccesary packages
require(ggplot2)
## Loading required package: ggplot2
require(psych)
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
require(truncnorm)
## Loading required package: truncnorm
require(patchwork)
## Loading required package: patchwork

2. Data used on the slides

The data below were “generated” as illustrative purpose for the slides for this module

central_tendency_eg<-c(10, 10, 10, 20, 30, 20, 10, 30, 20, 30)

mean(central_tendency_eg)
## [1] 19

median(central_tendency_eg)
## [1] 20

#function to get mode
getmode <- function(v) {
   uniqv <- unique(v)
   uniqv[which.max(tabulate(match(v, uniqv)))]
}

getmode(central_tendency_eg)
## [1] 10

(central_tendency_eg)
##  [1] 10 10 10 20 30 20 10 30 20 30

3. Generate data for the three schools

#Generate random ID numbers
IdenNum<-c(100:999)
#generate a column for School A and School B
School=c(rep("SMK Ravenclaw",300),rep("SMK Slytherin",300), rep("SMK Hufflepuff", 300)) # schools as caegorical data
Sex=c(rep(c("Male","Female"),450)) # sex as binary/categorical data
data<-as.data.frame(cbind(IdenNum,School,Sex))
head(data)
##   IdenNum        School    Sex
## 1     100 SMK Ravenclaw   Male
## 2     101 SMK Ravenclaw Female
## 3     102 SMK Ravenclaw   Male
## 4     103 SMK Ravenclaw Female
## 5     104 SMK Ravenclaw   Male
## 6     105 SMK Ravenclaw Female

#generate normally distributed scores for students from SMK Slytherin and SMK Ravenclaw separately. 

Ravenclaw_score<-round(rnorm(100, mean = 30, sd = 10),0)
Slytherin_score<-round(rnorm(100,mean = 30, sd = 10),0)
Hufflepuff_score<-Slytherin_score+40


describe(Ravenclaw_score) #min=7, max=55, mean=28
##    vars   n  mean    sd median trimmed  mad min max range skew kurtosis se
## X1    1 100 28.42 10.04     26   27.69 9.64   7  55    48 0.59    -0.07  1
describe(Slytherin_score) #min=1, max=60, mean-30
##    vars   n  mean    sd median trimmed mad min max range  skew kurtosis   se
## X1    1 100 30.44 10.33     30    30.4 8.9   1  60    59 -0.02     0.53 1.03
describe(Hufflepuff_score) # min=41, max=100, mean=70
##    vars   n  mean    sd median trimmed mad min max range  skew kurtosis   se
## X1    1 100 70.44 10.33     70    70.4 8.9  41 100    59 -0.02     0.53 1.03
t.test(Ravenclaw_score,Slytherin_score) #Slytherin has higher mean score with p-value >0.05
## 
##  Welch Two Sample t-test
## 
## data:  Ravenclaw_score and Slytherin_score
## t = -1.4029, df = 197.84, p-value = 0.1622
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.8594675  0.8194675
## sample estimates:
## mean of x mean of y 
##     28.42     30.44
t.test(Slytherin_score,Hufflepuff_score) # Hufflepuff has higher mean score with p-value <0.05
## 
##  Welch Two Sample t-test
## 
## data:  Slytherin_score and Hufflepuff_score
## t = -27.393, df = 198, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -42.87958 -37.12042
## sample estimates:
## mean of x mean of y 
##     30.44     70.44
data$Score<-ifelse(data$School=="SMK Ravenclaw", Ravenclaw_score,
                   ifelse(data$School=="SMK Slytherin",Slytherin_score,
                          ifelse(data$School=="SMK Hufflepuff", Hufflepuff_score,NA)))

data$Grade<-as.factor(ifelse(data$Score<40, "G",
                             ifelse(data$Score>=40&data$Score<45, "E",
                                    ifelse(data$Score>=45&data$Score<50, "D",
                                           ifelse(data$Score>=50&data$Score<55, "C",
                                                  ifelse(data$Score>=55&data$Score<60, "C+",
                                                         ifelse(data$Score>=60&data$Score<65, "B",
                                                                ifelse(data$Score>=65&data$Score<70, "B+",
                                                                       ifelse(data$Score>=70&data$Score<80,"A-",
                                                                              ifelse(data$Score>=80&data$Score<90,"A",
                                                                                     ifelse(data$Score>=90&data$Score<=100,"A+",NA)))))))))))

4. Plotting the figures

Histogram for SMK Ravenclaw
ggplot(data,aes(x=Score))+
  geom_histogram(data=subset(data,School=="SMK Ravenclaw"), fill= "red", breaks=seq(0, 100, by=10), 
                 col="black", 
                 alpha = .2)+
  labs(title="Distribution of scores of SMK Ravenclaw students")+
  theme(plot.title = element_text(hjust = 0.5))

Histogram for SMK Slytherin

ggplot(data,aes(x=Score))+
  geom_histogram(data=subset(data,School=="SMK Slytherin"), fill= "blue", breaks=seq(0, 100, by=10), 
                 col="black", 
                 alpha = .2)+
  labs(title="Distribution of scores of SMK Slytherin students")+
  theme(plot.title = element_text(hjust = 0.5))

Histogram for SMK Hufflepuff

ggplot(data,aes(x=Score))+
  geom_histogram(data=subset(data,School=="SMK Hufflepuff"), fill= "green", breaks=seq(0, 100, by=10), 
                 col="black", 
                 alpha = .2)+
  labs(title="Distribution of scores of SMK Hufflepuff students")+
  theme(plot.title = element_text(hjust = 0.5))

Plotting histograms of both Slytherin and Ravenclaw together:

ggplot(data,aes(x=Score, fill=School)) + 
  geom_histogram(data=subset(data,School == 'SMK Ravenclaw'),col="black", breaks=seq(0, 100, by=10),alpha = 0.5) +
  geom_histogram(data=subset(data,School == 'SMK Slytherin'),col="black",breaks=seq(0, 100, by=10), alpha = 0.5)+
  labs(title="Distribution of scores of SMK Ravenclaw and SMK Slytherin")+
  theme(plot.title = element_text(hjust = 0.5))

Plotting Hufflepuff against Ravenclaw

ggplot(data,aes(x=Score,fill=School)) +   
  geom_histogram(data=subset(data,School == 'SMK Ravenclaw'),col="black", breaks=seq(0, 100, by=10),alpha = 0.5) +
  geom_histogram(data=subset(data,School == 'SMK Hufflepuff'),col="black",breaks=seq(0, 100, by=10), alpha = 0.5)+
  labs(title="Distribution of scores of SMK Ravenclaw and SMK Hufflepuff")+
  theme(plot.title = element_text(hjust = 0.5))+
  scale_fill_manual(values=c("SMK Ravenclaw"="Red","SMK Hufflepuff"="Green" ))

Boxplot for SMK Hufflepuff


boxplot(subset(data, School=="SMK Hufflepuff")$Score,main="Boxplot: SMK Hufflepuff Scores", col="green")

Boxplot for all schools

ggplot(data,aes(x=Score,y=School, fill=School))+
  geom_boxplot()+ylab("")+labs(title="Boxplot for all schools")

5. Identifying data type

Recode data such that grade is ordinal variable, score is continuous variable, sex is categorical variable

Reorder grade and plot bar chart

data$Grade <- ordered(data$Grade, levels = c("G", "E", "D", 
                                             "C", "C+", "B",
                                             "B+", "A-", "A",
                                             "A+"))
plot(data$Grade, col=c("black", "red","orange","pink", "yellow","green",
                       "blue", "violet", "purple", "brown"), 
     main="Distribution of grades of all three schools")

6. Mean, mode, median

Showing mean score for each school

table<-aggregate(data$Score, list(data$School), mean)
colnames(table)<-c("School", "Mean")
table
##           School  Mean
## 1 SMK Hufflepuff 70.44
## 2  SMK Ravenclaw 28.42
## 3  SMK Slytherin 30.44

table2<-aggregate(data$Score, list(data$School), median)
colnames(table2)<-c("School", "Median")
table2
##           School Median
## 1 SMK Hufflepuff     70
## 2  SMK Ravenclaw     26
## 3  SMK Slytherin     30

merge(table,table2, by="School")
##           School  Mean Median
## 1 SMK Hufflepuff 70.44     70
## 2  SMK Ravenclaw 28.42     26
## 3  SMK Slytherin 30.44     30

Get mode

getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

table3<-aggregate(as.factor(data$Grade), list(data$School), getmode)
colnames(table3)<-c("School", "Mode")
table3
##           School Mode
## 1 SMK Hufflepuff   A-
## 2  SMK Ravenclaw    G
## 3  SMK Slytherin    G

6. Save the data

setwd("/Users/kai/OneDrive - King's College London/PhD/outside_PhD/iMEX_SoM")
xlsx::write.xlsx(data, "Maths_trial_data.xlsx")