Studying individual risk factors of self-harm in the UK Biobank
Introduction
This is one of my PhD projects, where I study the aetiology of self-harm by strengthening the causal inference using two genomic data science methods: Polygenic scoring and Mendelian randomisation.
The project is currently published on PloS Medicine, a high-impact medical journal. You can access the article here.
The scripts below are a combination of shell and R scripts. We will go through the steps taken and scripts used in completing the project. On hindsight, lots of the steps could be taken using Tidyverse/dplyr functions, but I was new to the Tidyverse at that time.
1: Prepare the dataset
Prepare dataset for PRS analyses
These are shell scripts. Output of this chunk is “selfharm_small_15_Feb.txt” and “selfharm_big_15_Feb.txt”. The “small” dataset contains PRS at p value < 0.3 threshold only (which were used for main analyses), whereas the “big” dataset contains PRS at all thresholds.
The “small” dataset is used heavily in main analyses, whereas the “big” dataset was used to generate the R2 plot in supplementary materials to justify the usage of p < 0.3 threshold.
Note that I have “anonymised” the file paths since the script will be shared publicly.
Extract phenotypes
#import all dataset to dataset directory
#awk sex, age, suicidal and non-suicidal self-harm data.
awk '{print $1, $1, $3, $6, $1655, $1656}' original_phenotype_file.txt > ./data/phenotypes1108.txt
Merge phenotypes and covariates
########### 1. Merging the phenotypes and covariates ##########
# Open R on shell terminal
module add bioinformatics/R/3.5.0 #ex with R then just open R by typing R
R # Open R
library(data.table)
pheno=fread("phenotypes1108.txt", verbose=T)
names(pheno)[1]<-"FID" #change column name
names(pheno)[2]<-"IID"
covariates=fread("covariates.txt", verbose=T)
pheno=as.data.frame(pheno, col.names=names(pheno))
covariates=as.data.frame(covariates, col.names=names(covariates))
# check numbers of non-suicidal self-harm and suicidal self-harm.
table(pheno$f.20480.0.0, useNA="always") #ever self-harmed
table(pheno$f.20483.0.0, useNA="always") #ever attempted suicide
#all phenotypes data count is the same as presented in UK Biobank wesbsite!!
#merge pheno and covariates into a new dataset
selfharmc <- merge(pheno,covariates,by=c("IID","FID"), all.x=T, all.y=T) #merging all NA cells too.
# change the class of some data to what we wanted e.g. factor.
selfharmc$sex=NA; selfharmc$sex=as.factor(selfharmc$f.31.0.0)
selfharmc$age=NA; selfharmc$age=2018-selfharmc$f.34.0.0
###### 2. Exporting and re-reading the data - phenotypes + covariates #######
fwrite(selfharmc,"selfharm.pheno.covariates.1108.txt",verbose=T)
selfharm2=fread("selfharm.pheno.covariates.1108.txt",verbose=T)
head(selfharm2$batch)
#problem: empty strings are regarded as a new level.....
#resolve: some tweaks here and there: 106,22,2 levels each.
selfharm2[selfharm2$batch==""] <- NA
selfharm2$AC<-as.factor(selfharm2$assessment_centre)
selfharm2$sexL<-as.factor(selfharm2$sex)
selfharm2$batchL<-as.factor(selfharm2$batch)
str(selfharm2$AC);str(selfharm2$sexL);str(selfharm2$batchL)
# file is renamed as "selfharm.pheno.covariates.1209.txt" in "publication_analyses/data" folder
#the codes above are needed to make sure the factoring is proper - NAs are not part of the levels
Merge PRS with the dataset
########### 3. Merging the PRS data with other data - covariates + pheno + PRS
# awk out the PRS into personal folders
#tips for editing multiple lines https://twitter.com/i_steves/status/995394452821721088
cd /file/path/to/PRS_folder
cp {"ADHD02_header.all.score","ADHD05_header.all.score","AGAB01_header.all.score","AGGR01_header.all.score","ALCO01_header.all.score","ANXI01_header.all.score","ANXI02_header.all.score","BIPO01_header.all.score","BODY02_header.all.score","BODY04_header.all.score","CANU01_header.all.score","CHOL20_header.all.score","COGN01_header.all.score","CONS01_header.all.score","CROS01_header.all.score","DEPR07_header.all.score","EXTR01_header.all.score","HEIG01_header.all.score","HEIG03_header.all.score","OVER01_header.all.score","SCHI02_header.all.score","SCLE02_header.all.score","SMOK03_header.all.score","WEIG01_header.all.score"} /destination/directory
## Use R for further data wrangling and further edit headers as we added new PRS at later stage of analyses
module add bioinformatics/R/3.5.0 #ex with R then just open R by typing R
R
library(data.table)
setwd("/PRS_folder")
antisoc=fread("antisocbeh.all.score", verbose=T)
alcdep=fread("alcdep2.all.score", verbose=T)
neur=fread("neur_irt.all.score",verbose=T)
ADHD02=fread("ADHD02_header.all.score",verbose=T)
#change column names
colnames(antisoc)=c("FID","IID","ANTI01_1e-05","ANTI01_0.001", "ANTI01_0.01","ANTI01_0.05","ANTI01_0.1","ANTI01_0.2","ANTI01_0.3","ANTI01_0.4","ANTI01_0.5","ANTI01_1")
colnames(alcdep)=c("FID","IID","ALCD03_1e-05","ALCD03_0.001", "ALCD03_0.01","ALCD03_0.05","ALCD03_0.1","ALCD03_0.2","ALCD03_0.3","ALCD03_0.4","ALCD03_0.5","ALCD03_1")
colnames(neur)=c("FID","IID","NEUR04_1e-05","NEUR04_0.001", "NEUR04_0.01","NEUR04_0.05","NEUR04_0.1","NEUR04_0.2","NEUR04_0.3","NEUR04_0.4","NEUR04_0.5","NEUR04_1")
fwrite(antisoc,"ANTI01_header.all.score",verbose=T)
fwrite(alcdep,"ALCD03_header.all.score2",verbose=T)
fwrite(neur,"NEUR04_header.all.score",verbose=T)
# merge all the PRS as one txt file.
#all 24 PRS
#merge the PRS files using the function multmerge
multmerge = function(mypath){
filenames=list.files(path=mypath, full.names=TRUE)
datalist = lapply(filenames, function(x){fread(file=x,header=T)})
Reduce(function(x,y) {merge(x,y, by=c("IID","FID"), all.x=T, all.y=T)}, datalist)
}
PRSdata=multmerge("/PRS_folder/")
head(PRSdata)
dim(PRSdata)
str(PRSdata,list.len=ncol(PRSdata))
fwrite(PRSdata, file="PRSdata.txt")
### check if the data is there
PRSdata2=fread("PRSdata.txt")
dim(PRSdata2)
str(PRSdata2)
table(is.na(PRSdata2[,4]))
#compare length with unmerged data
PRSdata3=fread("ADHD02_header.all.score")
dim(PRSdata3)
#comparison is OK
#extract the PRS with p-value threshold of 0.3 - best threshold for current dataset.
myvar=c("IID","FID","ADHD02_0.3","ADHD05_0.3","AGAB01_0.3","AGGR01_0.3","ALCO01_0.3","ALCD03_0.3","ANTI01_0.3","ANXI01_0.3","ANXI02_0.3","BIPO01_0.3","BODY02_0.3","BODY04_0.3","CANU01_0.3","CHOL20_0.3","COGN01_0.3","CONS01_0.3","CROS01_0.3","DEPR07_0.3","EXTR01_0.3","HEIG01_0.3","HEIG03_0.3","NEUR04_0.3","OVER01_0.3","SCHI02_0.3","SCLE02_0.3","SMOK03_0.3","WEIG01_0.3")
length(myvar)
newPRSdata=PRSdata2[,myvar,with=F]
str(newPRSdata)
# merge PRSdata2 (bigger dataset) and newPRSdata (smaller dataset)
# with "selfharm.pheno.covariates.1108.txt" separately......
setwd("path/to/data/directory")
selfharm=fread("selfharm.pheno.covariates.1209.txt",verbose=T)
selfharm.small <- merge(selfharm,newPRSdata,by=c("IID","FID"), all.x=T, all.y=T)
dim(selfharm.small);str(selfharm.small)
head(selfharm.small)
fwrite(selfharm.small, file="selfharm_small_15_Feb.txt")
selfharm.big<- merge(selfharm,PRSdata2,by=c("IID","FID"), all.x=T, all.y=T)
fwrite(selfharm.big,file="selfharm_big_15_Feb.txt")
dim(selfharm.small);dim(selfharm.big)
str(selfharm.small);str(selfharm.big)
#testing the files - identical or not?
selfharm.small.test=fread("selfharm_small_15_Feb.txt",verbose=T)
str(selfharm.small.test)
dim(selfharm.small.test);dim(selfharm.small)
Prepare dataset for MR analyses
Download GWAS sumstats
# Create a new directory, "GWAS_sumstats"
cd /data/GWAS_sumstats
# Only an example is given here:
# download sumstats for self-harm
wget https://www.dropbox.com/s/28my5g965hrwbsf/20480.gwas.imputed_v3.both_sexes.tsv.bgz?dl=0 -O 20480.gwas.imputed_v3.both_sexes.tsv.bgz
# unzip the gwas sumstats
gunzip 20480.gwas.imputed_v3.both_sexes.tsv.gz
# check the sumstats
head -10 20480.gwas.imputed_v3.both_sexes.tsv
head -10 variants.tsv
Preprocess sumstats
## Open R in bash
cd /data/GWAS_sumstats
module add bioinformatics/R/3.5.0
R
library(data.table)
library(TwoSampleMR)
library(MendelianRandomization)
library(gmodels)
# columns needed for MR are: SNP, beta (log OR), SE, effect_allele (A1/tested allele/reference allele) and
# non-effective allele, p-value
adhd=fread("ADHD05",verbose=T)
rm(adhd)
alcdraw=fread("pgc_alcdep.eur_unrel_genotyped.aug2018_release.txt",verbose=T)
bipo=fread("BIPO01",verbose=T)
rm(bipo)
canu=fread("CANU01",verbose=T)
rm(canu)
cros=fread("CROS01",verbose=T)
rm(cros)
depr=fread("DEPR07",verbose=T)
rm(depr)
schi=fread("SCHI02",verbose=T)
head(adhd);dim(adhd)
head(bipo);dim(bipo)
head(canu);dim(canu) #canu already in beta, not OR
head(cros);dim(cros)
head(depr);dim(depr)
head(schi);dim(schi)
adhd$beta=log(adhd$OR) # To get the beta estimates, we take the logarithm of the odds ratio [log(OR)]
adhd$beta_se=adhd$SE/adhd$OR # To get the standard errors of the beta estimates, we have to divide the original standard errors of the odds ratios [SE(OR)] by the odds ratios [SE(OR)/OR]
alcdraw$beta=log(alcdraw$OR)
alcdraw$beta_se=alcdraw$SE/alcdraw$OR
bipo$beta=log(bipo$OR)
bipo$beta_se=bipo$SE/bipo$OR
depr$beta=log(depr$OR)
depr$beta_se=depr$SE/depr$OR
schi$beta=log(schi$OR)
schi$beta_se=schi$SE/schi$OR
# The ADHD summary statistics containing the converted effect size can now be exported and saved as a new file in our "GWAS_data" folder
#for adhd data,
# SNP -> column 1
# beta -> column 11
# SE -> column 12
# effect_allele -> column 4
# non-effective allele -> column 5
# p-value -> column 6
# effect_allele freq -> column 7
#for bipo data,
# SNP -> column 1
# beta -> column 11
# SE -> column 12
# effect_allele -> column 4
# non-effective allele -> column 5
# p-value -> column 6
# effect_allele freq -> column 7
#for canu data,
# SNP -> column 1
# beta -> column 8
# SE -> column 9
# effect_allele -> column 4
# non-effective allele -> column 5
# p-value -> column 7
# effect_allele freq -> column 6
#for cros data,
# SNP -> column 1
# beta -> column 11
# SE -> column 12
# effect_allele -> column 4
# non-effective allele -> column 5
# p-value -> column 6
# effect_allele freq -> column 7
#for depr data,
# SNP -> column 1
# beta -> column 15
# SE -> column 16
# effect_allele -> column 4
# non-effective allele -> column 5
# p-value -> column 8
# effect_allele freq -> column 6, but also extract column 7
#for schi data,
# SNP -> column 1
# beta -> column 15
# SE -> column 16
# effect_allele -> column 4
# non-effective allele -> column 5
# p-value -> column 8
# effect_allele freq -> column 6, but also extract column 7
write.table(adhd[,c(1,4,5,6,7,11,12)], "ADHD", col.names=T, row.names=F, quote=F,sep = "\t")
write.table(alcdraw[,c(1,2,4,5,9,11,12)], "ALCD", col.names=T, row.names=F, quote=F,sep = "\t")
write.table(bipo[,c(1,4,5,6,7,11,12)], "BIPO", col.names=T, row.names=F, quote=F,sep = "\t")
write.table(canu[,c(1,4,5,6,7,8,9)], "CANU", col.names=T, row.names=F, quote=F,sep = "\t")
write.table(depr[,c(1,4,5,6,7,8,15,16)], "DEPR", col.names=T, row.names=F, quote=F,sep = "\t")
write.table(schi[,c(1,4,5,6,7,8,15,16)], "SCHI", col.names=T, row.names=F, quote=F,sep = "\t")
# For self-harm data,
# SNP -> column 6 of variants file
# beta -> column 9 of gwas file
# SE -> column 10 of gwas file
# effect_allele -> column 5 of variants file (alt allele)
# non-effective allele -> column 4 of variants file (ref allele)
# p-value -> column 12 of gwas file
# effect_allele freq -> column 13 in variants file
# gwas file: c(1,9,10,12)
# variants file: c(1,4,5,6,13)
sh_gwas=fread("20480.gwas.imputed_v3.both_sexes.tsv",verbose=T, select=c(1,9,10,12))
head(sh_gwas)
str(sh_gwas)
## added on 14 Feb 2020 for SSH
ssh_gwas=fread("20483.gwas.imputed_v3.both_sexes.tsv",verbose=T, select=c(1,9,10,12))
head(ssh_gwas)
#problem: no SNP id (rsid) for this dataset
#download variants.tsv file from Neal's lab which contains SNP id
variants=fread("variants.tsv",verbose=T, select=c(1,4,5,6,13))
tally=identical(variants$variant,sh_gwas$variant) #see if they are identical
tally2=identical(variants$variant,ssh_gwas$variant) #see if they are identical
#merge sh_gwas and variants
sh_gwas_variants=merge(variants,sh_gwas,by=c("variant"), all.x=T, all.y=T)
write.table(sh_gwas_variants, "selfharm_sumstats_v2", col.names=T, row.names=F, quote=F,sep = "\t")
#do the same thing for ssh_gwas
ssh_gwas_variants=merge(variants,ssh_gwas,by=c("variant"), all.x=T, all.y=T)
write.table(ssh_gwas_variants, "ssh_sumstats_v2", col.names=T, row.names=F, quote=F,sep = "\t")
system("head ssh_sumstats_v2") # see if the merged file is okay
Prepare dataset for complementary analyses
Pulling in data for cases. The code below shows for multiple psychiatric condition but for the project we only did analyses for SCHI and DEPR because other conditions have too little number of cases.
#for SCHI
cd data/phenotypes
grep Non.cancer.illness.code.self.reported field_finder.txt #5793-5879 non cancer illness code self-reported
grep 20544 field_finder.txt #13039-13054 for mental health problems ever diagnosed
#self-report
awk '{print $1}' <(grep -w "1289" <(cut -d$'\t' -f 1,5793-5879 ukb18177_glanville_phenotypes.txt)) > /data/SCZ_ids_self_reported_06_march.txt
#self-reported - 620
#HES/ICD
cat <(awk '{print $1}' <(grep "F20" <(cut -d$'\t' -f 1,271-650,679-1113 /phenotypes.txt))) <(awk '{print $1}' <(grep -w "2953 | 2959" <(cut -d$'\t' -f 1,651-678,1114-1143 /phenotypes.txt))) > /data/SCZ_ids_HES_06_march.txt
#ICD10 and ICD9 - 1350
#MHQ self-report
awk '{print $1}' <(grep -w "2" <(cut -d$'\t' -f 1,13039-13054 /phenotypes.txt)) > /data/SCZ_MHQ_11_march.txt
wc -l /data/SCZ_MHQ_11_march.txt
#158 cases
# for depression: F32 and F33 for ICD 10, 3119 for ICD 9, and for self-report 1286 (not including post-natal depression)
#awk out self-report
awk '{print $1}' <(grep -w "1286" <(cut -d$'\t' -f 1,5793-5879 /phenotypes.txt)) > /data/DEPR_ids_self_reported_06_march.txt
wc -l DEPR_ids_self_reported_06_march.txt
#self-reported - 30116 cases
#for ICD/HES
cat <(awk '{print $1}' <(grep -E "F32|F33" <(cut -d$'\t' -f 1,271-650,679-1113 /phenotypes.txt))) <(awk '{print $1}' <(grep -w "3119" <(cut -d$'\t' -f 1,651-678,1114-1143 /phenotypes.txt))) > /data/DEPR_ids_HES_06_march.txt
wc -l DEPR_ids_HES_06_march.txt
#ICD10 and ICD9 - 14744 cases
# now for MHQ
awk '{print $1}' <(grep -w "11" <(cut -d$'\t' -f 1,13039-13054 /phenotypes.txt)) > /data/DEPR_MHQ_11_march.txt
wc -l /data/DEPR_MHQ_11_march.txt
#for ADHD
#code is F90 in ICD10
cat <(awk '{print $1}' <(grep -E "F90" <(cut -d$'\t' -f 1,271-650,679-1113 /phenotypes.txt)))> /data/ADHD_ids_HES_25_march.txt
wc -l ADHD_ids_HES_25_march.txt
#7 lines
#code is 18 in MHQ
awk '{print $1}' <(grep -w "18" <(cut -d$'\t' -f 1,13039-13054 /phenotypes.txt)) > /data/ADHD_MHQ_25_march.txt
wc -l ADHD_MHQ_25_march.txt
#133 lines
#for bipolar disorder
#code is F31 in ICD10,
cat <(awk '{print $1}' <(grep -E "F31" <(cut -d$'\t' -f 1,271-650,679-1113 /phenotypes.txt)))> /data/BIPO_ids_HES_25_march.txt
wc -l BIPO_ids_HES_25_march.txt
#1246 lines
#code is 1291 in self-report,
awk '{print $1}' <(grep -w "1291" <(cut -d$'\t' -f 1,5793-5879 /phenotypes.txt)) > /data/BIPO_ids_self_reported_06_march.txt
wc -l BIPO_ids_self_reported_06_march.txt
#1434 lines
#code is 10 in MHQ (mania, hypomania, bipolar or manic depression)
awk '{print $1}' <(grep -w "10" <(cut -d$'\t' -f 1,13039-13054 /phenotypes.txt)) > /data/BIPO_MHQ_25_march.txt
wc -l BIPO_MHQ_25_march.txt
#838 lines
#for cannabis use
grep 20453 field_finder.txt #column 12968
awk '{print $1, $1, $12968}' /phenotypes.txt > /data/CANU_MHQ_IDS.txt
wc CANU_MHQ_IDS.txt
#for alcohol dependence
# 3039 in ICD 9, F10.2 in ICD10, 1408 in self-report, no MHQ
cat <(awk '{print $1}' <(grep -w "F102" <(cut -d$'\t' -f 1,271-650,679-1113 /phenotypes.txt))) <(awk '{print $1}' <(grep -w "3039" <(cut -d$'\t' -f 1,651-678,1114-1143 /phenotypes.txt))) > /data/ALCD_ids_HES_26_march.txt
wc -l ALCD_ids_HES_26_march.txt
#2328 lines
awk '{print $1}' <(grep -w "1408" <(cut -d$'\t' -f 1,5793-5879 /phenotypes.txt)) > /ALCD_ids_self_reported_26_march.txt
wc -l ALCD_ids_self_reported_26_march.txt
#759 lines
Pulling in data for antipsychotic and antidepressant medications.
cut -d$'\t' -f 1,5880-6023 phenotypes.txt > /data/antipsychotics_05_march_2.txt
#antipsychotics - the names were selected based on Mind's website and BNF directory
grep -w -E "1140867944|1140909800|1140867078|1140867134|1140867168|1140867218|1140867304|1140867420|1140867444|1140868120|1140868170|1140879658|1140879746|1140882100|1140909802|1140910358|1140928916|1141152848|1141153490|1141195974|1140879750|1140882320|1141152860|1141167976|1141177762|1141202024|1140863416|1140867080|1140867092|1140867122|1140867136|1140867150|1140867152|1140867180|1140867184|1140867208|1140867210|1140867244|1140867272|1140867306|1140867342|1140867398|1140867456|1140867572|1140867940|1140867952|1140868172|1140909804|1141184742|1141185130|1141200458" antipsychotics_05_march_2.txt | awk '{print $1}'> antipsychotics_ID_06_march_updated.txt
wc -l antipsychotics_ID_06_march_updated.txt
#3339 lines
#antidepressants - the names were selected based on Mind's website and BNF directory
grep -w -E "1140867944|1140867624|1140867640|1140867690|1140867712|1140867726|1140867756|1140867758|1140867818|1140867820|1140867850|1140867852|1140867856|1140867860|1140867876|1140867878|1140867884|1140867888|1140867914|1140867916|1140867920|1140867922|1140867934|1140867938|1140867948|1140867960|1140879540|1140879556|1140879616|1140879620|1140879630|1140879634|1140882236|1140882244|1140909806|1140910504|1140910704|1140910820|1140916282|1140916288|1140921600|1141146062|1141151946|1141151978|1141151982|1141152732|1141152736|1141168396|1141174756|1141180212|1141190158|1141200564|1141201834" antipsychotics_05_march_2.txt | awk '{print $1}'> antidepressants_ID_06_march_updated.txt
wc -l antidepressants_ID_06_march_updated.txt
#37998 lines
#update: added a few new lithium drugs for antidepressants
grep -w -E "1199|1140867600|1140867618|1140867632|1140867944|1140867820|1140867850|1140867852|1140867856|1140867860|1140867876|1140867878|1140867884|1140867888|1140867914|1140867916|1140867920|1140867922|1140867934|1140867938|1140867948|1140867960|1140879540|1140879556|1140879616|1140879620|1140879630|1140879634|1140882236|1140882244|1140909806|1140910504|1140910704|1140910820|1140916282|1140916288|1140921600|1141146062|1141151946|1141151978|1141151982|1141152732|1141152736|1141168396|1141174756|1141180212|1141190158|1141200564|1141201834|1140909800|1140867078|1140867134|1140867168|1140867218|1140867304|1140867420|1140867444" antipsychotics_05_march_2.txt | awk '{print $1}'> antidepressants_ID_25_march_updated.txt
wc -l antidepressants_ID_25_march_updated.txt
#39097 lines
#ADHD_medications
grep -w -E "1140867944|1140867624|1140867640|1140867690|1140867712|1140867726|1140867756|1140867758|1140867818|1140867820" antipsychotics_05_march_2.txt | awk '{print $1}'> ADHD_medication_ID_25_march.txt
wc -l ADHD_medication_ID_25_march.txt
#1006 lines
#bipolar disorder
grep -w -E "1140879746|1140928916|1141177762|1141202024|1140867136|1140867150|1140867152|1140867210|1140867398|1140867456|1199|1140867600|1140867618|1140867632|1140872160|1140872284|1140872290|1140872348|1140872360|1141172918|2038459704|1140867944|1140867820|1140867850|1140867852|1140867856|1140867860|1140867876|1140867878|1140867884|1140867888|1140867914|1140867916|1140867920|1140867922|1140867934|1140867938|1140867948|1140867960|1140879540|1140879556|1140879616|1140879620|1140879630|1140879634|1140882236|1140882244|1140909806|1140910504|1140910704|1140910820|1140916282|1140916288|1140921600|1141146062|1141151946|1141151978|1141151982|1141152732|1141152736|1141168396|1141174756|1141180212|1141190158|1141200564|1141201834|1140909800|1140867078|1140867134|1140867168|1140867218|1140867304|1140867420|1140867444" antipsychotics_05_march_2.txt | awk '{print $1}'> BIPO_medication_ID_25_march.txt
wc -l BIPO_medication_ID_25_march.txt
#40932 lines
2: Run PRS analyses
Single PRS binomial logistic regression
library(data.table)
library(fmsb)
library(MASS)
library(plyr)
=c("ADHD02_0.3","ADHD05_0.3","ALCD03_0.3",
PRSnames"ANXI01_0.3","ANXI02_0.3","BIPO01_0.3",
"DEPR07_0.3","SCHI02_0.3","CANU01_0.3",
"SMOK03_0.3","ALCO01_0.3","COGN01_0.3",
"CONS01_0.3","EXTR01_0.3","NEUR04_0.3",
"AGAB01_0.3","AGGR01_0.3","ANTI01_0.3",
"HEIG01_0.3","WEIG01_0.3","HEIG03_0.3",
"OVER01_0.3","BODY02_0.3","BODY04_0.3")
=fread("selfharm_small_15_Feb.txt",verbose=T)
selfharmdata$batch[selfharmdata$batch==""]<-NA
selfharmdata$AC<-as.factor(selfharmdata$assessment_centre)
selfharmdata$sexL<-as.factor(selfharmdata$sex)
selfharmdata$batchL<-as.factor(selfharmdata$batch)
selfharmdata$f.20480.0.0[selfharmdata$f.20480.0.0==-818]<-NA
selfharmdata$f.20483.0.0[selfharmdata$f.20483.0.0==-818]<-NA
selfharmdata
#try using glm!
=matrix(nrow = length(PRSnames),ncol=8)
ResultsLoopfor (i in seq(from=1,to=length(PRSnames),by=1))
$XD=NA;selfharmdata$XD=selfharmdata[,PRSnames[i],with=F] #with=F, refer to data.table FAQ 1.1
{ selfharmdata=glm(factor(f.20480.0.0)~scale(XD)+PC1+PC2+PC3+PC4+PC5+PC6+age+sexL+AC+batchL,family=binomial(link='logit'),data=selfharmdata)
fit1=glm(factor(f.20480.0.0)~PC1+PC2+PC3+PC4+PC5+PC6+age+sexL+AC+batchL,family=binomial(link='logit'),data=selfharmdata)
fit2=coef(summary(fit1))[2,4]
p1=coef(summary(fit1))[2,1]
b1=exp(b1)
OR=confint(fit1)
CI=CI[2,1] #get confidence intervals
LowCI=CI[2,2]
UpCI=exp(LowCI)
LowCI_OR=exp(UpCI)
UpCI_OR=NagelkerkeR2(fit1)$R2
R1=NagelkerkeR2(fit2)$R2
R2=R1-R2
R2change=c(b1,LowCI,UpCI,OR, LowCI, UpCI, p1,R2change)
Resultsglm1:8]=Resultsglm
ResultsLoop[i,
}save.image(file="glm_models.RData")
# function to rename the columns for PRS
= function(var_names) {
pheno_names = revalue(var_names,
pheno_renamed c("ADHD02_0.3"="ADHD symptoms",
"ADHD05_0.3"="ADHD",
"AGAB01_0.3"="Agreeableness",
"AGGR01_0.3"="Aggression",
"ALCO01_0.3"="Daily alcohol use",
"ALCD03_0.3"="Alcohol dependence disorder",
"ANTI01_0.3"="Antisocial behaviour",
"ANXI01_0.3"=
"Anxiety disorders meta-analysis: factor scores",
"ANXI02_0.3"=
"Anxiety disorders meta-analysis: case-control",
"BIPO01_0.3"="Bipolar disorder",
"BODY02_0.3"="Extreme BMI",
"BODY04_0.3"="BMI",
"CANU01_0.3"="Lifetime cannabis use",
"COGN01_0.3"="Education attainment",
"CONS01_0.3"="Conscientiousness",
"DEPR07_0.3"="MDD",
"EXTR01_0.3"="Extraversion",
"HEIG01_0.3"="Birth length",
"HEIG03_0.3"="Adult height",
"NEUR04_0.3"="Neuroticism IRT",
"OVER01_0.3"="Overweight",
"SCHI02_0.3"="Schizophrenia",
"SMOK03_0.3"="Cigarettes per day",
"WEIG01_0.3"="Birth weight"))
return(pheno_renamed)}
colnames(ResultsLoop)=c("beta coefficient","Lower 95% CI", "Upper 95% CI", "Odds Ratio", "OR low 95% CI","OR up 95% CI","p-value","R2")
=pheno_names(PRSnames)
PRSnames_renamedrownames(ResultsLoop)=PRSnames_renamed
#compare with lrm results, especially the zero values!
ResultsLoop =as.data.frame(ResultsLoop)
ResultsLoop= length(ResultsLoop$`p-value`)
number_models_all $pw_adjusted <- p.adjust(ResultsLoop$`p-value`, method = "fdr", n = number_models_all)
ResultsLoop
ResultsLoop
#output of results
::write.xlsx(ResultsLoop, "glm_single_all_PRS_95CI_1206.xlsx") xlsx
Mutiple PRS logistic regressions
=glm(factor(f.20480.0.0) ~
multi2scale(ADHD05_0.3)+scale(ALCD03_0.3)+
scale(BIPO01_0.3)+scale(CANU01_0.3)+
scale(DEPR07_0.3)+scale(SCHI02_0.3)+
+PC2+PC3+PC4+PC5+PC6+age+sexL+AC+batch,
PC1family=binomial(link='logit'),
data=selfharmdata)
=glm(factor(f.20480.0.0) ~
multi22+PC2+PC3+PC4+PC5+PC6+age+sexL+AC+batch,
PC1family=binomial(link='logit'),
data=selfharmdata)
# calculate Nagelkerke's R2
=NagelkerkeR2(multi2)$R2
R11=NagelkerkeR2(multi22)$R2
R22=R11-R22
R2change2=confint(multi2)
confint2# save as image for future reference
save.image(file="multiple_PRS.RData")
=cbind(coef(summary(multi2)),R2change2,confint2)
results2=results2[1:7,c(1,6,7)]
results3=exp(results3)
results2_ORcolnames(results2_OR)=c("OR","low_OR","up_OR")
=cbind(results3,results2_OR)
overall_results::write.xlsx(overall_results,"glm_multiple_all_PRS_95CI_1206.xlsx") xlsx
Plot Figure 2
R scripts to plot Figure 2 based on the results from the PRS analyses.
library(readxl)
library(plyr)
library(ggplot2)
=read_excel("glm_single_all_PRS_95CI_1206.xlsx")
simple=read_excel("glm_multiple_all_PRS_95CI_1206.xlsx")
multi=as.data.frame(simple)
simple=as.data.frame(multi)
multi= function(var_names) {
pheno_names = revalue(var_names,
pheno_renamed c("ADHD02_0.3"="ADHD symptoms",
"ADHD05_0.3"="ADHD",
"AGAB01_0.3"="Agreeableness",
"AGGR01_0.3"="Aggression",
"ALCO01_0.3"="Daily alcohol use",
"ALCD03_0.3"="Alcohol dependence disorder",
"ANTI01_0.3"="Antisocial behaviour",
"ANXI01_0.3"=
"Anxiety disorders meta-analysis: factor scores",
"ANXI02_0.3"=
"Anxiety disorders meta-analysis: case-control",
"BIPO01_0.3"="Bipolar disorder",
"BODY02_0.3"="Extreme BMI",
"BODY04_0.3"="BMI",
"CANU01_0.3"="Lifetime cannabis use",
"COGN01_0.3"="Education attainment",
"CONS01_0.3"="Conscientiousness",
"DEPR07_0.3"="MDD",
"EXTR01_0.3"="Extraversion",
"HEIG01_0.3"="Birth length",
"HEIG03_0.3"="Adult height",
"NEUR04_0.3"="Neuroticism IRT",
"OVER01_0.3"="Overweight",
"SCHI02_0.3"="Schizophrenia",
"SMOK03_0.3"="Cigarettes per day",
"WEIG01_0.3"="Birth weight"))
return(pheno_renamed)}
##### PLOT ADJUSTED AND UNADJUSTED IN ONE MODEL
$Group_multi="Multi"
multi$Group="Simple"
simple
= simple[,c("...1","Odds Ratio", "OR low 95% CI", "OR up 95% CI", "Group")]
Simple_PRS = multi[2:7,c("...1","OR", "low_OR", "up_OR", "Group_multi")]
Multi_PRS colnames(Simple_PRS)=c("Phenotype_names","OR", "lower_CI", "upper_CI", "Group")
colnames(Multi_PRS)=c("Phenotype_names","OR", "lower_CI", "upper_CI", "Group")
<- data.frame(rbind(Simple_PRS, Multi_PRS))
allModelFrame
25:30,1]=c("ADHD","Alcohol dependence disorder", "Bipolar disorder", "Lifetime cannabis use", "MDD","Schizophrenia")
allModelFrame[
## Rename variables
$Phenotype_names_renamed=allModelFrame$Phenotype_names
allModelFrame$Phenotype_names_renamed=Simple_PRS$Phenotype_names
Simple_PRS
## order variables corresponding to the size of beta // convert to factor or numerical
$OR_num=as.numeric(as.character(allModelFrame$OR))
allModelFrame$Group_fac=as.factor(allModelFrame$Group)
allModelFrame
$Phenotype_names_ordered=
allModelFramefactor(allModelFrame$Phenotype_names,
levels = Simple_PRS$Phenotype_names[order(allModelFrame$OR_num)])
$Phenotype_names_renamed_ordered=
allModelFramefactor(allModelFrame$Phenotype_names_renamed,
levels = Simple_PRS$Phenotype_names_renamed[order(allModelFrame$OR_num)])
$upper_CI_num=as.numeric(allModelFrame$upper_CI)
allModelFrame$lower_CI_num=as.numeric(allModelFrame$lower_CI)
allModelFrame# ## rename variables that are non-significant after FDR correction
$Phenotype_names_renamed_ordered= revalue(allModelFrame$Phenotype_names_renamed_ordered,
allModelFramec("Alcohol dependence disorder" = "Alcohol dependence disorder (1)",
"ADHD symptoms"="ADHD symptoms (2)",
"Neuroticism IRT"="Neuroticism IRT (2)",
"Antisocial behaviour"= "Antisocial behaviour (2)",
"Birth weight" = "Birth weight (2)",
"Extreme BMI" = "Extreme BMI (2)"))
# ================= Plot results: FIGURE 2 ====================================================
$Group_name[allModelFrame$Group=="Multi"]="Multiple PS regression"
allModelFrame$Group_name[allModelFrame$Group=="Simple"]="Single PS regression"
allModelFrame
<- ggplot(allModelFrame, aes(shape= Group_name,col=Group_name)) +
new_plot scale_color_manual(values = c("seagreen", "deepskyblue2"))
<- new_plot +
new_plot geom_hline(yintercept = 1,
colour = gray(1/2),
lty = 2)
<- new_plot +
new_plot geom_linerange(aes(
x = Phenotype_names_renamed_ordered,
ymin = lower_CI_num,
ymax = upper_CI_num),
lwd = 0.7,
position = position_dodge(width = 1/2))
<- new_plot +
new_plot geom_pointrange(aes(
x = Phenotype_names_renamed_ordered,
y = OR_num,
ymin = lower_CI_num,
ymax = upper_CI_num),
lwd = 0.45,
position = position_dodge(width = 1/2))
<- new_plot +
new_plot coord_flip() +
theme(legend.position = c(-0.75, 1),
legend.justification = c(0, 0),
legend.direction = "horizontal") +
labs(title = "",
x = "",
y = "OR",
color = c("")) +
scale_shape_discrete(name ="")
<- new_plot + labs(y="OR")
new_plot
<- new_plot +
new_plot theme(axis.text.x =
element_text(size=9,
family ="Times",
face="bold",
colour="black"),
axis.text.y =
element_text(size=9.5,
family="Times",
face="bold",
colour="black"),
axis.title =
element_text(size=9,
family="Times",
face="bold",
colour="black"),
legend.text =
element_text(size=9,
family="Times",
face="bold",
colour="black"))
# print the new plot
new_plot#ggsave(file="Figure_2_20_Feb_2020.tiff", width=13, height=13, units="cm", dpi=550)
Figure2
Scripts to produce data for predict curve
The results are shown in Table S3 and Table S4 in Supporting Information.
<-fread("selfharm_small_15_Feb.txt",verbose=T)
selfharmdata$batch[selfharmdata$batch==""]<-NA
selfharmdata$AC<-as.factor(selfharmdata$assessment_centre)
selfharmdata$sexL<-as.factor(selfharmdata$sex)
selfharmdata$batchL<-as.factor(selfharmdata$batch)
selfharmdata$f.20480.0.0[selfharmdata$f.20480.0.0==-818]<-NA
selfharmdata$f.20483.0.0[selfharmdata$f.20483.0.0==-818]<-NA
selfharmdata
table(selfharmdata$f.20480.0.0,useNA="always")
table(selfharmdata$f.20483.0.0,useNA="always")
<-selfharmdata[,c("f.20480.0.0","ADHD05_0.3","BIPO01_0.3",
PRS_merged"CANU01_0.3","DEPR07_0.3","SCHI02_0.3",
"PC1", "PC2", "PC3", "PC4", "PC5", "PC6",
"age", "sexL"),with=F]
<-as.data.frame(PRS_merged)
PRS_merged2:13]<-scale(PRS_merged[,2:13])
PRS_merged[,<-PRS_merged
PRS_merged_scaled$self_harm_dichotomized<-as.factor(PRS_merged$f.20480.0.0)
PRS_merged_scaled
<-glm(self_harm_dichotomized ~
multi.3+ADHD05_0.3+BIPO01_0.3+CANU01_0.3+
DEPR07_0.3+PC1+PC2+PC3+PC4+PC5+PC6+age,
SCHI02_0family=binomial(),
data=PRS_merged_scaled)
=glm(self_harm_dichotomized ~
multi_sex.3+ADHD05_0.3+BIPO01_0.3+CANU01_0.3+
DEPR07_0.3+PC1+PC2+PC3+PC4+PC5+PC6+age+sexL,
SCHI02_0family=binomial(),
data=PRS_merged_scaled)
# exclude sex, when do later, run the same model and include sex, when putting values into parameter, use sex =0 and sex = 1. #assessment centre, batch removed as well
summary(multi)
summary(multi_sex)
## CREATE QUINTILE
<-c("ADHD05_0.3","BIPO01_0.3","CANU01_0.3","DEPR07_0.3","SCHI02_0.3")
names_variables#names_variables="DEPR07_0.3"
<-length(names_variables)
n_loop=5
number_quantiles<-PRS_merged_scaled[,2:6] #pick all variables that you have in the mode , inc PC1, just predictor
decile_matrix1
<-as.data.frame(apply(decile_matrix1, 2, dplyr::ntile, n=number_quantiles))
quantile<-matrix(ncol=n_loop, nrow=number_quantiles) #create empty matrix
matrix_quantilecolnames(matrix_quantile)<-names_variables
######
<-as.data.frame(decile_matrix1)
decile_matrix1<-as.data.frame(quantile)
quantile
for (i in 1:n_loop)
1:5,i]<- tapply(decile_matrix1[,i],quantile[,i],mean)}
{matrix_quantile[
colnames(matrix_quantile)<-names_variables
<-as.data.frame(matrix_quantile)
dat_matrix_quantile$PC1=0
dat_matrix_quantile$PC2=0
dat_matrix_quantile$PC3=0
dat_matrix_quantile$PC4=0
dat_matrix_quantile$PC5=0
dat_matrix_quantile$PC6=0
dat_matrix_quantile$age=0
dat_matrix_quantile$quantile=c("D1","D2","D3","D4","D5")
dat_matrix_quantile
# write prediction_estimate function
= function(logistic_model, name_PGS, newdataD1, newdataD2, newdataD3, newdataD4, newdataD5) {
prediction_estimate =predict(logistic_model,newdataD1, type="response",se.fit=T)
d1_predict=predict(logistic_model,newdataD2, type="response",se.fit=T)
d2_predict=predict(logistic_model,newdataD3, type="response",se.fit=T)
d3_predict=predict(logistic_model,newdataD4, type="response",se.fit=T)
d4_predict=predict(logistic_model,newdataD5, type="response",se.fit=T)
d5_predict
<- 1.96 ## approx 95% CI
critval
<- d1_predict$fit + (critval * d1_predict$se.fit)
d1_upr <- d1_predict$fit - (critval * d1_predict$se.fit)
d1_lwr <- d1_predict$fit
d1_fit
<- d2_predict$fit + (critval * d2_predict$se.fit)
d2_upr <- d2_predict$fit - (critval * d2_predict$se.fit)
d2_lwr <- d2_predict$fit
d2_fit
<- d3_predict$fit + (critval * d3_predict$se.fit)
d3_upr <- d3_predict$fit - (critval * d3_predict$se.fit)
d3_lwr <- d3_predict$fit
d3_fit
<- d4_predict$fit + (critval * d4_predict$se.fit)
d4_upr <- d4_predict$fit - (critval * d4_predict$se.fit)
d4_lwr <- d4_predict$fit
d4_fit
<- d5_predict$fit + (critval * d5_predict$se.fit)
d5_upr <- d5_predict$fit - (critval * d5_predict$se.fit)
d5_lwr <- d5_predict$fit
d5_fit
=c(d1_upr,d2_upr,d3_upr,d4_upr,d5_upr)
upr=c(d1_lwr,d2_lwr,d3_lwr,d4_lwr,d5_lwr)
lwr=c(d1_fit,d2_fit,d3_fit,d4_fit,d5_fit)
pred=as.data.frame(rbind(d1_predict,
pred_data_outcome
d2_predict,
d3_predict,
d4_predict,
d5_predict))$upr=upr*100
pred_data_outcome$lwr=lwr*100
pred_data_outcome$pred=pred*100
pred_data_outcome
$quantile=c("D1", "D2", "D3", "D4", "D5")
pred_data_outcome$quantile_num=seq(1:5)
pred_data_outcome$Model=name_PGS
pred_data_outcomereturn(pred_data_outcome)
}
#data for combined PRS
=subset(dat_matrix_quantile, quantile=="D1")
newdataD1=subset(dat_matrix_quantile, quantile=="D2")
newdataD2=subset(dat_matrix_quantile, quantile=="D3")
newdataD3=subset(dat_matrix_quantile, quantile=="D4")
newdataD4=subset(dat_matrix_quantile, quantile=="D5")
newdataD5
#data for ADHD
=newdataD1
D1_ADHD=newdataD2
D2_ADHD=newdataD3
D3_ADHD=newdataD4
D4_ADHD=newdataD5
D5_ADHD
=0
D1_ADHD[,]=0
D2_ADHD[,]=0
D3_ADHD[,]=0
D4_ADHD[,]=0
D5_ADHD[,]
$ADHD05_0.3=newdataD1$ADHD05_0.3
D1_ADHD$ADHD05_0.3=newdataD2$ADHD05_0.3
D2_ADHD$ADHD05_0.3=newdataD3$ADHD05_0.3
D3_ADHD$ADHD05_0.3=newdataD4$ADHD05_0.3
D4_ADHD$ADHD05_0.3=newdataD5$ADHD05_0.3
D5_ADHD
#data for BIPO
=newdataD1
D1_BIPO=newdataD2
D2_BIPO=newdataD3
D3_BIPO=newdataD4
D4_BIPO=newdataD5
D5_BIPO
=0
D1_BIPO[,]=0
D2_BIPO[,]=0
D3_BIPO[,]=0
D4_BIPO[,]=0
D5_BIPO[,]
$BIPO01_0.3=newdataD1$BIPO01_0.3
D1_BIPO$BIPO01_0.3=newdataD2$BIPO01_0.3
D2_BIPO$BIPO01_0.3=newdataD3$BIPO01_0.3
D3_BIPO$BIPO01_0.3=newdataD4$BIPO01_0.3
D4_BIPO$BIPO01_0.3=newdataD5$BIPO01_0.3
D5_BIPO
#data for CANU
=newdataD1
D1_CANU=newdataD2
D2_CANU=newdataD3
D3_CANU=newdataD4
D4_CANU=newdataD5
D5_CANU
=0
D1_CANU[,]=0
D2_CANU[,]=0
D3_CANU[,]=0
D4_CANU[,]=0
D5_CANU[,]
$CANU01_0.3=newdataD1$CANU01_0.3
D1_CANU$CANU01_0.3=newdataD2$CANU01_0.3
D2_CANU$CANU01_0.3=newdataD3$CANU01_0.3
D3_CANU$CANU01_0.3=newdataD4$CANU01_0.3
D4_CANU$CANU01_0.3=newdataD5$CANU01_0.3
D5_CANU
#data for DEPR
=newdataD1
D1_DEPR=newdataD2
D2_DEPR=newdataD3
D3_DEPR=newdataD4
D4_DEPR=newdataD5
D5_DEPR
=0
D1_DEPR[,]=0
D2_DEPR[,]=0
D3_DEPR[,]=0
D4_DEPR[,]=0
D5_DEPR[,]
$DEPR07_0.3=newdataD1$DEPR07_0.3
D1_DEPR$DEPR07_0.3=newdataD2$DEPR07_0.3
D2_DEPR$DEPR07_0.3=newdataD3$DEPR07_0.3
D3_DEPR$DEPR07_0.3=newdataD4$DEPR07_0.3
D4_DEPR$DEPR07_0.3=newdataD5$DEPR07_0.3
D5_DEPR
#data for SCHI
=newdataD1
D1_SCHI=newdataD2
D2_SCHI=newdataD3
D3_SCHI=newdataD4
D4_SCHI=newdataD5
D5_SCHI
=0
D1_SCHI[,]=0
D2_SCHI[,]=0
D3_SCHI[,]=0
D4_SCHI[,]=0
D5_SCHI[,]
$SCHI02_0.3=newdataD1$SCHI02_0.3
D1_SCHI$SCHI02_0.3=newdataD2$SCHI02_0.3
D2_SCHI$SCHI02_0.3=newdataD3$SCHI02_0.3
D3_SCHI$SCHI02_0.3=newdataD4$SCHI02_0.3
D4_SCHI$SCHI02_0.3=newdataD5$SCHI02_0.3
D5_SCHI
#run the prediction model function
=prediction_estimate(multi,"combined PGS",
predicted_all
newdataD1,newdataD2,newdataD3,newdataD4,newdataD5)
=prediction_estimate(multi,"ADHD",D1_ADHD,D2_ADHD,D3_ADHD,D4_ADHD,D5_ADHD)
predicted_ADHD=prediction_estimate(multi,"BIPO",D1_BIPO,D2_BIPO,D3_BIPO,D4_BIPO,D5_BIPO)
predicted_BIPO=prediction_estimate(multi,"CANU",D1_CANU,D2_CANU,D3_CANU,D4_CANU,D5_CANU)
predicted_CANU=prediction_estimate(multi,"DEPR",D1_DEPR,D2_DEPR,D3_DEPR,D4_DEPR,D5_DEPR)
predicted_DEPR=prediction_estimate(multi,"SCHI",D1_SCHI,D2_SCHI,D3_SCHI,D4_SCHI,D5_SCHI)
predicted_SCHI
#this is what you got today (31st July, 2019). Wohoo well done Kai Xiang, you are so amazing!!!
=rbind(predicted_all,
data_for_graph
predicted_ADHD,
predicted_BIPO,
predicted_CANU,
predicted_DEPR,
predicted_SCHI)=as.data.frame(data_for_graph)
data_for_graphdata_for_graph2=unlist(data_for_graph))
(fwrite(data_for_graph, file="data_for_predict_curve_17_Feb_2020.csv")
Plot Figure 3
library(ggplot2)
library(data.table)
=fread("data_for_predict_curve_17_Feb_2020.csv")
data_for_graph
<- c("#5BC0EB","#FDE74C","#9BC53D","#E55934","#FA7921","#295D2E")
cbPalette
= ggplot(data_for_graph, aes(x = quantile_num,
bp y = pred,
shape=factor(Model,
labels=c("ADHD",
"Bipolar disorder",
"Lifetime cannabis use",
"Combined PS",
"MDD",
"Schizophrenia")),
color=factor(Model,
labels=c("ADHD",
"Bipolar disorder",
"Lifetime cannabis use",
"Combined PS",
"MDD",
"Schizophrenia"))))+
scale_colour_manual(values=cbPalette)+
geom_line(linetype = "solid",
size=1) +
geom_point(size=3.0)+
scale_x_discrete(name ="Mean polygenic score per quintile",
limits=c("Mean PS (Q1)","Mean PS (Q2)",
"Mean PS (Q3)", "Mean PS (Q4)",
"Mean PS (Q5)")) +
theme(legend.position="top",
legend.title = element_blank(),
legend.text=element_text(size=15))+
scale_y_continuous(name="Predicted risk (%) of self-harm",
limits=c(0, 7))
=bp + theme(axis.title.x = element_text(face="bold",
bpsize=11,
family ="Times",
colour="black"),
axis.text.x = element_text(vjust=0.5,
size=9,
family ="Times",
colour="black"),
axis.text.y = element_text(vjust=0.5,
size=10,
family ="Times",
colour="black"),
axis.title.y = element_text(face="bold",
size=11,
family ="Times",
colour="black"),
legend.text = element_text(size=10,
face="bold",
family="Times",
colour="black"))
bp
Figure3
ggsave("Figure_3_20_Feb_2020.tiff",width=13, height=13, units="cm", dpi=550)
PRS analyses excluding SCHI cases
# 11 March 2019
#as suggested by JB, run another PRS which excludes schiophrenia cases and see if the results are significant
=fread("scz_diagnosis_0706.txt",verbose=T)
scz=as.data.frame(scz)
scz_data
=selfharmdata[!selfharmdata$IID %in% scz_data$IID,]
selfharm_exclude_schidim(selfharmdata);dim(scz_data);dim(selfharm_exclude_schi)
#502676-1121=501555
#run PRS analysis without SCHI cases:
=glm(factor(f.20480.0.0)~
fit1scale(SCHI02_0.3)+PC1+PC2+PC3+PC4+PC5+PC6+
+sexL+AC+batchL,
agefamily=binomial(link='logit'),
data=selfharm_exclude_schi)
=glm(factor(f.20480.0.0)~
fit2+PC2+PC3+PC4+PC5+PC6+age+sexL+AC+batchL,
PC1family=binomial(link='logit'),
data=selfharm_exclude_schi)
=coef(summary(fit1))[2,4]
p1=coef(summary(fit1))[2,1]
b1=confint(fit1)
CI=CI[2,1] #get upper confidence intervals
LowCI=CI[2,2]
UpCI=NagelkerkeR2(fit1)$R2
R1=NagelkerkeR2(fit2)$R2
R2=R1-R2
R2change=exp(b1)
OR=exp(LowCI)
ORlowCI=exp(UpCI)
ORupCI=c(b1,LowCI,UpCI,OR, ORlowCI, ORupCI, p1,R2change)
Resultsglm
=matrix(nrow = 1,ncol=8)
ResultsLoop1,1:7]<-Resultsglm
ResultsLoop[colnames(ResultsLoop)=c("beta coefficient","lowCI" "upCI","OR","ORlowCI","ORupCI","p-value","R2")
ResultsLoopwrite.table(ResultsLoop,"SCZ_excluded_GLM_0107_OR.csv")
PRS analyses excluding MDD cases
# exclude DEPR cases
=fread("dep_diagnosis_0706.txt",verbose=T)
dephead(dep);dim(dep)
=selfharmdata[!selfharmdata$IID %in% dep$IID,]
selfharm_exclude_deprdim(selfharmdata);dim(dep);dim(selfharm_exclude_depr)
#502676-62645=440031
str(selfharm_exclude_depr)
#run PRS analysis without SCHI cases:
=glm(factor(f.20480.0.0)~
fit1scale(DEPR07_0.3)+PC1+PC2+PC3+PC4+PC5+PC6+
+sexL+AC+batchL,
agefamily=binomial(link='logit'),
data=selfharm_exclude_depr)
=glm(factor(f.20480.0.0)~
fit2+PC2+PC3+PC4+PC5+PC6+age+sexL+AC+batchL,
PC1family=binomial(link='logit'),
data=selfharm_exclude_depr)
=coef(summary(fit1))[2,4]
p1=coef(summary(fit1))[2,1]
b1=confint(fit1)
CI=CI[2,1] #get upper confidence intervals
LowCI=CI[2,2]
UpCI=NagelkerkeR2(fit1)$R2
R1=NagelkerkeR2(fit2)$R2
R2=R1-R2
R2change=exp(b1)
OR=exp(LowCI)
ORlowCI=exp(UpCI)
ORupCI=c(b1,LowCI,UpCI,OR, ORlowCI, ORupCI, p1,R2change)
Resultsglm
=matrix(nrow = 1,ncol=8)
ResultsLoop1,1:7]<-Resultsglm
ResultsLoop[colnames(ResultsLoop)=c("beta coefficient","lowCI" "upCI","OR","ORlowCI","ORupCI","p-value","R2")
ResultsLoopwrite.table(ResultsLoop,"DEP_excluded_GLM_0107_OR.csv")
Multinomial PRS regression
#read self-harm data
#install.packages("data.table")
library(data.table)
library(AER)
library(nnet)
library(rms)
library(DescTools) #to calculate NagelkerkeR2 for multinom (nnet)
library(fmsb)
library(MASS) #to calculate NagelkerkeR2 for glm
=fread("selfharm_small_15_Feb.txt", verbose=T)
selfharmdata$batch[selfharmdata$batch==""]<-NA
selfharmdata$AC<-as.factor(selfharmdata$assessment_centre)
selfharmdata$sexL<-as.factor(selfharmdata$sex)
selfharmdata$batchL<-as.factor(selfharmdata$batch)
selfharmdata$f.20480.0.0[selfharmdata$f.20480.0.0==-818]<-NA
selfharmdata$f.20483.0.0[selfharmdata$f.20483.0.0==-818]<-NA
selfharmdatastr(selfharmdata$AC);str(selfharmdata$sexL);str(selfharmdata$batchL)
table(selfharmdata$batch,useNA="always")
table(selfharmdata$AC,useNA="always")
#run the analysis only for PRS significant at p=0.05 threshold after FDR
=c("ADHD02_0.3","ADHD05_0.3","ALCD03_0.3",
PRS"ANXI01_0.3","ANXI02_0.3","BIPO01_0.3",
"DEPR07_0.3","SCHI02_0.3","CANU01_0.3",
"SMOK03_0.3","ALCO01_0.3","COGN01_0.3",
"CONS01_0.3","EXTR01_0.3","NEUR04_0.3",
"AGAB01_0.3","AGGR01_0.3","ANTI01_0.3",
"HEIG01_0.3","WEIG01_0.3","HEIG03_0.3",
"OVER01_0.3","BODY02_0.3","BODY04_0.3")
#create a new column with 0,1 and 2, 0 for never self-harmed, 1 for ever self-harm without suicide intention
#and 2 for self harm with suicide intention.
$SHI=NA;selfharmdata$SHI=selfharmdata$f.20483.0.0
selfharmdatahead(selfharmdata$SHI, 100)
table(selfharmdata$f.20483.0.0, useNA="always")
table(selfharmdata$f.20480.0.0, useNA="always")
#recode to delineate never self-harmed (0), NSSH (1) and SSH (2).
which(selfharmdata$f.20480.0.0==0),"SHI"] <- 0
selfharmdata[which(selfharmdata$f.20483.0.0==0),"SHI"] <- 1
selfharmdata[which(selfharmdata$f.20483.0.0==1),"SHI"] <- 2
selfharmdata[table(selfharmdata$SHI,useNA="always")
#run multinomial logistic regression
$SHI<-factor(selfharmdata$SHI)
selfharmdata$SHI<- relevel(selfharmdata$SHI, ref = "0")
selfharmdata
=matrix(nrow = length(PRS),ncol=16)
ResultsLoopfor (i in seq(from=1,to=length(PRS),by=1))
$XA=NA;selfharmdata$XA=selfharmdata[,PRS[i],with=F]
{ selfharmdata$XD=NA; selfharmdata$XD=scale(selfharmdata$XA)
selfharmdata=multinom(SHI ~ XD+PC1+PC2+PC3+PC4+PC5+PC6+age+sexL+AC+batchL,data=selfharmdata)
model1=multinom(SHI ~ PC1+PC2+PC3+PC4+PC5+PC6+age+sexL+AC+batchL,data=selfharmdata)
model2=summary(model1)
sumtest1=summary(model2)
sumtest2=confint(model1, parm="XD")
CI=sumtest1$coefficients[3]
b1=sumtest1$coefficients[4]
b2=CI[1]
lowCI1=CI[2]
upCI1=CI[3]
lowCI2=CI[4]
upCI2=exp(b1);OR2=exp(b2)
OR1=exp(lowCI1);ORupCI1=exp(upCI1)
ORlowCI1=exp(lowCI2);ORupCI2=exp(upCI2)
ORlowCI2=anova(model1,model2)[7]
a=a$Pr[2]
p.anova<- sumtest1$coefficients/sumtest1$standard.errors
z # 2-tailed z test
<- (1 - pnorm(abs(z), 0, 1)) * 2
p =p[3]
p1=p[4]
p2=PseudoR2(model1,which="Nagelkerke")
R1=PseudoR2(model2,which="Nagelkerke")
R2=R1-R2
Rchange=c(b1,lowCI1,upCI1,OR1,ORlowCI1,ORupCI1,p1,
Results
b2,lowCI2,upCI2,OR2,ORlowCI2,ORupCI2,p2,#remember to add SEs
Rchange,p.anova)
Results
#
# #when ref=0
# #1.274361e-01 1.198635e-01 4.808891e-10 4.379186e-10 0.000000e+00
# #when ref=1
#
1:16]=Results
ResultsLoop[i,#
}#
colnames(ResultsLoop)=c("beta.1","lowCI.1","upCI.1","OR.1","lowOR.1","upOR.1","p.value1",
"beta.2","lowCI.2","upCI.2","OR.2","lowOR.2","upOR.2","p.value2","R2","p.anova")
rownames(ResultsLoop)=PRS
#
ResultsLoop
=as.data.frame(ResultsLoop)
ResultsLoop= length(cbind(ResultsLoop$p.value1,ResultsLoop$p.value2))
number_models_all $p1_adjusted <- p.adjust(ResultsLoop$`p.value1`, method = "fdr", n = number_models_all)
ResultsLoop$p2_adjusted <- p.adjust(ResultsLoop$`p.value2`, method = "fdr", n = number_models_all)
ResultsLoopsetwd("/mnt/lustre/groups/ukbiobank/Edinburgh_Data/usr/Kai/publication_analyses/results")
write.csv(ResultsLoop, "multinom_FDR_0307.csv")
###now run another multinomial model with different reference group - NSSH as reference
$SHI<-factor(selfharmdata$SHI)
selfharmdata$SHI<- relevel(selfharmdata$SHI, ref = "1")
selfharmdata
=matrix(nrow = length(PRS),ncol=16)
ResultsLoop2for (i in seq(from=1,to=length(PRS),by=1))
$XA=NA;selfharmdata$XA=selfharmdata[,PRS[i],with=F]
{ selfharmdata$XD=NA; selfharmdata$XD=scale(selfharmdata$XA)
selfharmdata=multinom(SHI ~ XD+PC1+PC2+PC3+PC4+PC5+PC6+age+sexL+AC+batchL,data=selfharmdata)
model1=multinom(SHI ~ PC1+PC2+PC3+PC4+PC5+PC6+age+sexL+AC+batchL,data=selfharmdata)
model2=summary(model1)
sumtest1=summary(model2)
sumtest2
=confint(model1, parm="XD")
CI2=sumtest1$coefficients[3]
b1=sumtest1$coefficients[4]
b2=CI2[1]
lowCI1=CI2[2]
upCI1=CI2[3]
lowCI2=CI2[4]
upCI2=exp(b1);OR2=exp(b2)
OR1=exp(lowCI1);ORupCI1=exp(upCI1)
ORlowCI1=exp(lowCI2);ORupCI2=exp(upCI2)
ORlowCI2=anova(model1,model2)[7]
a=a$Pr[2]
p.anova<- sumtest1$coefficients/sumtest1$standard.errors
z # 2-tailed z test
<- (1 - pnorm(abs(z), 0, 1)) * 2 #for depression, the p-value was 0 for SSH,not desirable.
p =p[3]
p1=p[4]
p2#try using another package to obtain p-value
<-coeftest(model1) #obtain p-value
pp#shows that p=0 means p< 2.2e-16
pp #
=PseudoR2(model1,which="Nagelkerke")
R1=PseudoR2(model2,which="Nagelkerke")
R2=R1-R2
Rchange=c(b1,lowCI1,upCI1,OR1,ORlowCI1,ORupCI1,p1,
Results
b2,lowCI2,upCI2,OR2,ORlowCI2,ORupCI2,p2,#remember to add SEs
Rchange,p.anova) 1:16]=Results
ResultsLoop[i,
}colnames(ResultsLoop)=c("beta.1","lowCI.1","upCI.1",
"OR.1","lowOR.1","upOR.1","p.value1",
"beta.2","lowCI.2","upCI.2",
"OR.2","lowOR.2","upOR.2","p.value2",
"R2","p.anova")
rownames(ResultsLoop)=PRS
=as.data.frame(ResultsLoop)
ResultsLoop= length(cbind(ResultsLoop$p.value1,ResultsLoop$p.value2))
number_models_all $p1_adjusted <- p.adjust(ResultsLoop$`p.value1`, method = "fdr", n = number_models_all)
ResultsLoop$p2_adjusted <- p.adjust(ResultsLoop$`p.value2`, method = "fdr", n = number_models_all)
ResultsLoopwrite.csv(ResultsLoop, "NSSH_multinom_FDR_0307.csv")
3: Run MR
Univariate two-sample MR
Outcome: SH
library(TwoSampleMR)
library(MendelianRandomization)
library(mr.raps)
library(data.table)
library(xlsx)
# 1. read all exposure data
# ADHD
<- read_exposure_data(
exposure_ADHD_dat filename = "ADHD", # This is the filename of the summary statistics file for your exposure
sep = "\t", # Specify whether the file is a tab delimited file ("\t") or a space delimited file ("")
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P") # Insert the name of column with p-value
$exposure="ADHD" # Provide a name for the exposure variable
exposure_ADHD_datlength(exposure_ADHD_dat$SNP) # We can check the number of rows the imported dataset
# ALCD
<- read_exposure_data(
exposure_ALCD_dat filename = "ALCD", # This is the filename of the summary statistics file for your exposure
sep = "\t", # Specify whether the file is a tab delimited file ("\t") or a space delimited file ("")
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P") # Insert the name of column with p-value
$exposure="ALCD" # Provide a name for the exposure variable
exposure_ALCD_datlength(exposure_ALCD_dat$SNP) # We can check the number of rows the imported dataset
<- read_exposure_data(
exposure_BIPO_dat filename = "BIPO", # This is the filename of the summary statistics file for your exposure
sep = "\t", # Specify whether the file is a tab delimited file ("\t") or a space delimited file ("")
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P") # Insert the name of column with p-value
$exposure="BIPO" # Provide a name for the exposure variable
exposure_BIPO_datlength(exposure_BIPO_dat$SNP) # We can check the number of rows the imported dataset
<- read_exposure_data(
exposure_CANU_dat filename = "CANU", # This is the filename of the summary statistics file for your exposure
sep = "\t", # Specify whether the file is a tab delimited file ("\t") or a space delimited file ("")
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "BETA", # Insert the name of column with effect sizes
se_col = "SE", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P", # Insert the name of column with p-value
eaf_col="FREQ") # Insert the name of column with effect allele frequency
$exposure="CANU" # Provide a name for the exposure variable
exposure_CANU_datlength(exposure_CANU_dat$SNP) # We can check the number of rows the imported dataset
<- read_exposure_data(
exposure_DEPR_dat filename = "DEPR", # This is the filename of the summary statistics file for your exposure
sep = "\t", # Specify whether the file is a tab delimited file ("\t") or a space delimited file ("")
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P", # Insert the name of column with p-value
eaf_col="FREQ") # Insert the name of column with effect allele frequency
$exposure="DEPR"
exposure_DEPR_datlength(exposure_DEPR_dat$SNP) # We can check the number of rows the imported dataset
=fread("SCHI") #SCHI file is too big, hence use fread to read it first, then
schi# remove snps which are not needed.
=subset(schi,P<0.05)
schi2
<-format_data(schi2,
exposure_SCHI_dattype="exposure",
snp_col = "SNP",
beta_col = "beta",
se_col = "beta_se",
effect_allele_col = "A1",
other_allele_col = "A2",
pval_col = "P",
eaf_col="FREQ")
$exposure="SCHI" # Provide a name for the exposure variable
exposure_SCHI_datlength(exposure_SCHI_dat$SNP) # We can check the number of rows the imported dataset
# now read outcome data - self-harm
<- read_outcome_data(
outcome_SH_dat snps = NULL, # Keep 'snps=NULL' so that all SNPs are uploaded
filename = "selfharm_sumstats_v2",
sep = "\t",
snp_col = "rsid",
beta_col = "beta",
se_col = "se",
effect_allele_col = "alt",
other_allele_col = "ref",
pval_col = "pval",
eaf_col="AF")
$outcome <- "SH" # Provide a name for the outcome
outcome_SH_datlength(outcome_SH_dat$SNP)
#quality control: select p-value for the snps associated with adhd at 0.3 - does not work - too many SNPs!
#use 5e5 instead - 5e8 was too stringent.
=subset(exposure_ADHD_dat, pval.exposure < 5e-5)
exposure_ADHD_dat_5e5=subset(exposure_ALCD_dat, pval.exposure < 5e-5)
exposure_ALCD_dat_5e5=subset(exposure_BIPO_dat, pval.exposure < 5e-5)
exposure_BIPO_dat_5e5=subset(exposure_CANU_dat, pval.exposure < 5e-5)
exposure_CANU_dat_5e5=subset(exposure_DEPR_dat, pval.exposure < 5e-5)
exposure_DEPR_dat_5e5=subset(exposure_SCHI_dat, pval.exposure < 5e-5)
exposure_SCHI_dat_5e5
#clumping, note that clump_kb is not the same as default in updated package!
=clump_data(exposure_ADHD_dat_5e5,
exposure_ADHD_dat_5e5_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_ALCD_dat_5e5,
exposure_ALCD_dat_5e5_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_BIPO_dat_5e5,
exposure_BIPO_dat_5e5_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_CANU_dat_5e5,
exposure_CANU_dat_5e5_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_DEPR_dat_5e5,
exposure_DEPR_dat_5e5_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_SCHI_dat_5e5,
exposure_SCHI_dat_5e5_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
#number of SNPs passing 5e-5 threshold
print("ADHD: ");length(exposure_ADHD_dat_5e5$SNP)
print("ALCD: ");length(exposure_ALCD_dat_5e5$SNP)
print("BIPO: ");length(exposure_BIPO_dat_5e5$SNP)
print("CANU: ");length(exposure_CANU_dat_5e5$SNP)
print("DEPR: ");length(exposure_DEPR_dat_5e5$SNP)
print("SCHI: ");length(exposure_SCHI_dat_5e5$SNP)
# 5e5 clumping results
print("ADHD: ");length(exposure_ADHD_dat_5e5_clumped$SNP)
print("ALCD: ");length(exposure_ALCD_dat_5e5_clumped$SNP)
print("BIPO: ");length(exposure_BIPO_dat_5e5_clumped$SNP)
print("CANU: ");length(exposure_CANU_dat_5e5_clumped$SNP)
print("DEPR: ");length(exposure_DEPR_dat_5e5_clumped$SNP)
print("SCHI: ");length(exposure_SCHI_dat_5e5_clumped$SNP)
#choose p=5e-5 threshold and harmonise data
<- harmonise_data(exposure_dat = exposure_ADHD_dat_5e5_clumped,
DataMR_ADHDtoSH outcome_dat = outcome_SH_dat,
action=2)
<- harmonise_data(exposure_dat = exposure_ALCD_dat_5e5_clumped,
DataMR_ALCDtoSH outcome_dat = outcome_SH_dat,
action=2)
<- harmonise_data(exposure_dat = exposure_BIPO_dat_5e5_clumped,
DataMR_BIPOtoSH outcome_dat = outcome_SH_dat,
action=2)
<- harmonise_data(exposure_dat = exposure_CANU_dat_5e5_clumped,
DataMR_CANUtoSH outcome_dat = outcome_SH_dat,
action=2)
<- harmonise_data(exposure_dat = exposure_DEPR_dat_5e5_clumped,
DataMR_DEPRtoSH outcome_dat = outcome_SH_dat,
action=2)
<- harmonise_data(exposure_dat = exposure_SCHI_dat_5e5_clumped,
DataMR_SCHItoSH outcome_dat = outcome_SH_dat,
action=2)
=subset(DataMR_ADHDtoSH, mr_keep==TRUE)
DataMR_ADHDtoSH_keep=subset(DataMR_ALCDtoSH, mr_keep==TRUE)
DataMR_ALCDtoSH_keep=subset(DataMR_BIPOtoSH, mr_keep==TRUE)
DataMR_BIPOtoSH_keep=subset(DataMR_CANUtoSH, mr_keep==TRUE)
DataMR_CANUtoSH_keep=subset(DataMR_DEPRtoSH, mr_keep==TRUE)
DataMR_DEPRtoSH_keep=subset(DataMR_SCHItoSH, mr_keep==TRUE)
DataMR_SCHItoSH_keep
#use MendelianRandomization to calculate the MR estimates
detach(package:TwoSampleMR)
library(MendelianRandomization)
= DataMR_ADHDtoSH_keep$beta.exposure
bx1 = DataMR_ADHDtoSH_keep$se.exposure
bxse1 = DataMR_ADHDtoSH_keep$beta.outcome
by1 = DataMR_ADHDtoSH_keep$se.outcome
byse1
= DataMR_ALCDtoSH_keep$beta.exposure
bx22 = DataMR_ALCDtoSH_keep$se.exposure
bxse22 = DataMR_ALCDtoSH_keep$beta.outcome
by22 = DataMR_ALCDtoSH_keep$se.outcome
byse22
= DataMR_BIPOtoSH_keep$beta.exposure
bx2 = DataMR_BIPOtoSH_keep$se.exposure
bxse2 = DataMR_BIPOtoSH_keep$beta.outcome
by2 = DataMR_BIPOtoSH_keep$se.outcome
byse2
= DataMR_CANUtoSH_keep$beta.exposure
bx3 = DataMR_CANUtoSH_keep$se.exposure
bxse3 = DataMR_CANUtoSH_keep$beta.outcome
by3 = DataMR_CANUtoSH_keep$se.outcome
byse3
= DataMR_DEPRtoSH_keep$beta.exposure
bx5 = DataMR_DEPRtoSH_keep$se.exposure
bxse5 = DataMR_DEPRtoSH_keep$beta.outcome
by5 = DataMR_DEPRtoSH_keep$se.outcome
byse5
= DataMR_SCHItoSH_keep$beta.exposure
bx6 = DataMR_SCHItoSH_keep$se.exposure
bxse6 = DataMR_SCHItoSH_keep$beta.outcome
by6 = DataMR_SCHItoSH_keep$se.outcome
byse6
# Read in the data
<- mr_input(bx1, bxse1, by1, byse1, exposure="ADHD", outcome="SH")
MR_ADHDtoSH_data <- mr_input(bx22, bxse22, by22, byse22, exposure="ALCD", outcome="SH")
MR_ALCDtoSH_data <- mr_input(bx2, bxse2, by2, byse2, exposure="BIPO", outcome="SH")
MR_BIPOtoSH_data <- mr_input(bx3, bxse3, by3, byse3, exposure="CANU", outcome="SH")
MR_CANUtoSH_data <- mr_input(bx5, bxse5, by5, byse5, exposure="DEPR", outcome="SH")
MR_DEPRtoSH_data <- mr_input(bx6, bxse6, by6, byse6, exposure="SCHI", outcome="SH")
MR_SCHItoSH_data
# Run MR
<- mr_allmethods(MR_ADHDtoSH_data, method = "all")
MR_ADHDtoSH <- mr_allmethods(MR_ALCDtoSH_data, method = "all")
MR_ALCDtoSH <- mr_allmethods(MR_BIPOtoSH_data, method = "all")
MR_BIPOtoSH <- mr_allmethods(MR_CANUtoSH_data, method = "all")
MR_CANUtoSH <- mr_allmethods(MR_DEPRtoSH_data, method = "all")
MR_DEPRtoSH <- mr_allmethods(MR_SCHItoSH_data, method = "all")
MR_SCHItoSH
=MR_ADHDtoSH@Values # Saves the results in dataframe
Results_ADHDtoSH=MR_ALCDtoSH@Values
Results_ALCDtoSH=MR_BIPOtoSH@Values
Results_BIPOtoSH=MR_CANUtoSH@Values
Results_CANUtoSH=MR_DEPRtoSH@Values
Results_DEPRtoSH=MR_SCHItoSH@Values
Results_SCHItoSH
<-"MendelianRandomization_results_1005.xlsx"
mrfilenamewrite.xlsx(Results_ADHDtoSH, file=mrfilename, sheetName="ADHD")
write.xlsx(Results_ALCDtoSH, file=mrfilename, sheetName="ALCD", append=T)
write.xlsx(Results_BIPOtoSH, file=mrfilename, sheetName="BIPO", append=T)
write.xlsx(Results_CANUtoSH, file=mrfilename, sheetName="CANU", append=T)
write.xlsx(Results_DEPRtoSH, file=mrfilename, sheetName="DEPR", append=T)
write.xlsx(Results_SCHItoSH, file=mrfilename, sheetName="SCHI", append=T)
#run MR egger using MendelianRandomisation package
mr_egger(MR_ADHDtoSH_data)
mr_egger(MR_ALCDtoSH_data)
mr_egger(MR_BIPOtoSH_data)
mr_egger(MR_CANUtoSH_data)
mr_egger(MR_DEPRtoSH_data)
mr_egger(MR_SCHItoSH_data)
#use TwoSampleMR to calculate the MR estimates
library(TwoSampleMR)
<- mr(DataMR_ADHDtoSH,method_list=c("mr_raps",
MR_ADHDtoSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
<- mr(DataMR_ALCDtoSH,method_list=c("mr_raps",
MR_ALCDtoSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
<- mr(DataMR_BIPOtoSH,method_list=c("mr_raps",
MR_BIPOtoSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
<- mr(DataMR_CANUtoSH,method_list=c("mr_raps",
MR_CANUtoSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
<- mr(DataMR_DEPRtoSH,method_list=c("mr_raps",
MR_DEPRtoSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
<- mr(DataMR_SCHItoSH,method_list=c("mr_raps",
MR_SCHItoSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
=MR_ADHDtoSH2 # Saves the results in dataframe
Results_ADHDtoSH2=MR_ALCDtoSH2 # Saves the results in dataframe
Results_ALCDtoSH2=MR_BIPOtoSH2 # Saves the results in dataframe
Results_BIPOtoSH2=MR_CANUtoSH2 # Saves the results in dataframe
Results_CANUtoSH2=MR_DEPRtoSH2 # Saves the results in dataframe
Results_DEPRtoSH2=MR_SCHItoSH2 # Saves the results in dataframe
Results_SCHItoSH2
<-"twosampleMR_results_1005.xlsx"
twosample_filenamewrite.xlsx(Results_ADHDtoSH2, file=twosample_filename, sheetName="ADHD")
write.xlsx(Results_ALCDtoSH2, file=twosample_filename, sheetName="ALCD", append=T)
write.xlsx(Results_BIPOtoSH2, file=twosample_filename, sheetName="BIPO", append=T)
write.xlsx(Results_CANUtoSH2, file=twosample_filename, sheetName="CANU", append=T)
write.xlsx(Results_DEPRtoSH2, file=twosample_filename, sheetName="DEPR", append=T)
write.xlsx(Results_SCHItoSH2, file=twosample_filename, sheetName="SCHI", append=T)
=rbind(
MR_Egger_intmr_pleiotropy_test(DataMR_ADHDtoSH),
mr_pleiotropy_test(DataMR_ALCDtoSH),
mr_pleiotropy_test(DataMR_BIPOtoSH),
mr_pleiotropy_test(DataMR_CANUtoSH),
mr_pleiotropy_test(DataMR_DEPRtoSH),
mr_pleiotropy_test(DataMR_SCHItoSH))
write.xlsx(MR_Egger_int, file="MR_Egger.xlsx")
I2GX stats for SH
#CITE: Bowden et al. Assessing the suitability of summary data for 2-sample MR using MR-Egger regression,
# role of I2 stats
# MR Egger assumes NOME,i.e. No Measurement Error
# When NOME is not satifised, there is 'regression dilution',
# which means that when there is error in the SNP-exposure association,
# the MR Egger estimate will underestimate the true causal effect,
# and in some cases have low power to estimate pleiotropy
# This can be mitigated by first
# 1/Calculating the I2 , which is based on the Cochran statistics and give an indication of measurement error
# (1 no error, 0 only error)
# 2a/ Correcting crudly the MR Beta by: BMR/I2 but can be unstable
# 2b/ Simulation analyses to correct for measurement error
#Guidelines are:
#-if I2 >90% nothing to do (the causal estimate will be affected by only 10%)
#-if 60% < I2 < 90%, apply correction
#-if I2 < 60% don't use MR Egger at all because estimate very diluted and low power.
#Preliminary step: take only variants with smallest absolute effect sizes, if not (we have I2 >99%) and no correction needed
<-DataMR_ADHDtoSH_keep[abs(
DataMR_ADHDtoSH_keep2$beta.exposure)<
DataMR_ADHDtoSH_keepquantile(abs(DataMR_ADHDtoSH_keep$beta.exposure),0.20),] #select variants with the weaker betas
dim(DataMR_ADHDtoSH_keep2)# 48 variants selected. Note that a very small number for MR Egger (e.g. 8) yield completely unstable estimates.
#calculate baselines estimates on this new dataset with weaker instruments
<- mr(DataMR_ADHDtoSH_keep, method_list=c("mr_ivw","mr_egger_regression")))
(MR_ADHDtoSH_I2 #IVW much bigger
#function adapted from MendelianRandomisation:
= function(Bx,Bxse,Byse)
QQ = sum((Bxse/Byse)^-2*(Bx/Byse-weighted.mean(Bx/Byse, w=(Bxse/Byse)^-2))^2)
{Q = max(0, (Q-(length(Bx)-1))/Q)
I cat("I^2_GX statistic: ", decimals(I*100, 1),"%\n",sep="")
}
#Function to calculate I2
= function(y,s){
Isq = length(y)
k = 1/s^2; sum.w = sum(w)
w = sum(y*w)/sum.w
mu.hat = sum(w*(y-mu.hat)^2)
Q = (Q - (k-1))/Q
Isq= max(0,Isq)
Isq return(Isq)
}
#if BetaXG and seBetaXG represent the vector of SNP-exposure estimates and standar errors,
#and BetaYG and seBetaYG represent the vector of SNP-exposure estimates and standard errors. We calculate the I 2
#then do: Isq(BetaXG,seBetaXG)
#for MR regression weighted: Isq(BetaXG/seBetaYG,seBetaXG/seBetaYG)
length(DataMR_ADHDtoSH_keep$beta.exposure)
#1. Calculate I2 (I squared)
#ADHD
isqADHD=Isq(y=DataMR_ADHDtoSH_keep$beta.exposure,
(s=DataMR_ADHDtoSH_keep$se.exposure))
isqADHD_w=Isq(y=DataMR_ADHDtoSH_keep$beta.exposure/DataMR_ADHDtoSH_keep$se.outcome,
(s=DataMR_ADHDtoSH_keep$se.exposure/DataMR_ADHDtoSH_keep$se.outcome))
QQ(Bx=DataMR_ADHDtoSH_keep$beta.exposure,
Bxse=DataMR_ADHDtoSH_keep$se.exposure,
Byse=DataMR_ADHDtoSH_keep$se.outcome)
#94.9%
#ALCD
isqALCD=Isq(y=DataMR_ALCDtoSH_keep$beta.exposure,
(s=DataMR_ALCDtoSH_keep$se.exposure))
isqALCD_w=Isq(y=DataMR_ALCDtoSH_keep$beta.exposure/DataMR_ALCDtoSH_keep$se.outcome,
(s=DataMR_ALCDtoSH_keep$se.exposure/DataMR_ALCDtoSH_keep$se.outcome))
QQ(Bx=DataMR_ALCDtoSH_keep$beta.exposure,
Bxse=DataMR_ALCDtoSH_keep$se.exposure,
Byse=DataMR_ALCDtoSH_keep$se.outcome)
#93.0%
#BIPO
isqBIPO=Isq(y=DataMR_BIPOtoSH_keep$beta.exposure,
(s=DataMR_BIPOtoSH_keep$se.exposure))
isqBIPO_w=Isq(y=DataMR_BIPOtoSH_keep$beta.exposure/DataMR_BIPOtoSH_keep$se.outcome,
(s=DataMR_BIPOtoSH_keep$se.exposure/DataMR_BIPOtoSH_keep$se.outcome))
QQ(Bx=DataMR_BIPOtoSH_keep$beta.exposure,
Bxse=DataMR_BIPOtoSH_keep$se.exposure,
Byse=DataMR_BIPOtoSH_keep$se.outcome)
#94.4%
#CANU
isqCANU=Isq(y=DataMR_CANUtoSH_keep$beta.exposure,
(s=DataMR_CANUtoSH_keep$se.exposure))
isqCANU_w=Isq(y=DataMR_CANUtoSH_keep$beta.exposure/DataMR_CANUtoSH_keep$se.outcome,
(s=DataMR_CANUtoSH_keep$se.exposure/DataMR_CANUtoSH_keep$se.outcome))
QQ(Bx=DataMR_CANUtoSH_keep$beta.exposure,
Bxse=DataMR_CANUtoSH_keep$se.exposure,
Byse=DataMR_CANUtoSH_keep$se.outcome)
#94.7%
#DEPR
isqDEPR=Isq(y=DataMR_DEPRtoSH_keep$beta.exposure,
(s=DataMR_DEPRtoSH_keep$se.exposure))
isqDEPR_w=Isq(y=DataMR_DEPRtoSH_keep$beta.exposure/DataMR_DEPRtoSH_keep$se.outcome,
(s=DataMR_DEPRtoSH_keep$se.exposure/DataMR_DEPRtoSH_keep$se.outcome))
QQ(Bx=DataMR_DEPRtoSH_keep$beta.exposure,
Bxse=DataMR_DEPRtoSH_keep$se.exposure,
Byse=DataMR_DEPRtoSH_keep$se.outcome)
#94.7%
#SCHI
isqSCHI=Isq(y=DataMR_SCHItoSH_keep$beta.exposure,
(s=DataMR_SCHItoSH_keep$se.exposure))
isqSCHI_w=Isq(y=DataMR_SCHItoSH_keep$beta.exposure/DataMR_SCHItoSH_keep$se.outcome,
(s=DataMR_SCHItoSH_keep$se.exposure/DataMR_SCHItoSH_keep$se.outcome))
QQ(Bx=DataMR_SCHItoSH_keep$beta.exposure,
Bxse=DataMR_SCHItoSH_keep$se.exposure,
Byse=DataMR_SCHItoSH_keep$se.outcome)
#95.7%
#despite weaker variant, I2 is high so really no problem of weak instrument with LDL
#### now try doing this when the p-value threshold is 5e-8.
## On hindsight, these are quite similar to the codes run above, could have saved them as rds rather than rerunning them in a different script
#resubset the SNPs, starting from the exposure data.
=subset(exposure_ADHD_dat, pval.exposure < 5e-8)
exposure_ADHD_dat_5e8=subset(exposure_ALCD_dat, pval.exposure < 5e-8)
exposure_ALCD_dat_5e8=subset(exposure_BIPO_dat, pval.exposure < 5e-8)
exposure_BIPO_dat_5e8=subset(exposure_CANU_dat, pval.exposure < 5e-8)
exposure_CANU_dat_5e8=subset(exposure_DEPR_dat, pval.exposure < 5e-8)
exposure_DEPR_dat_5e8=subset(exposure_SCHI_dat, pval.exposure < 5e-8)
exposure_SCHI_dat_5e8
# Lets have a look at the number of SNPs that we would include:
#clumping
=clump_data(exposure_ADHD_dat_5e8,
exposure_ADHD_dat_5e8_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_ALCD_dat_5e8,
exposure_ALCD_dat_5e8_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_BIPO_dat_5e8,
exposure_BIPO_dat_5e8_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_CANU_dat_5e8,
exposure_CANU_dat_5e8_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_DEPR_dat_5e8,
exposure_DEPR_dat_5e8_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_SCHI_dat_5e8,
exposure_SCHI_dat_5e8_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
#choose p=5e-8 threshold.....harmonise data
<- harmonise_data(exposure_dat = exposure_ADHD_dat_5e8_clumped,
DataMR_ADHDtoSH_5e8 outcome_dat = outcome_SH_dat, action=2)
<- harmonise_data(exposure_dat = exposure_ALCD_dat_5e8_clumped,
DataMR_ALCDtoSH_5e8 outcome_dat = outcome_SH_dat, action=2)
<- harmonise_data(exposure_dat = exposure_BIPO_dat_5e8_clumped,
DataMR_BIPOtoSH_5e8 outcome_dat = outcome_SH_dat, action=2)
<- harmonise_data(exposure_dat = exposure_CANU_dat_5e8_clumped,
DataMR_CANUtoSH_5e8 outcome_dat = outcome_SH_dat, action=2)
<- harmonise_data(exposure_dat = exposure_DEPR_dat_5e8_clumped,
DataMR_DEPRtoSH_5e8 outcome_dat = outcome_SH_dat, action=2)
<- harmonise_data(exposure_dat = exposure_SCHI_dat_5e8_clumped,
DataMR_SCHItoSH_5e8 outcome_dat = outcome_SH_dat, action=2)
=subset(DataMR_ADHDtoSH_5e8, mr_keep==TRUE)
DataMR_ADHDtoSH_keep_5e8=subset(DataMR_ALCDtoSH_5e8, mr_keep==TRUE)
DataMR_ALCDtoSH_keep_5e8=subset(DataMR_BIPOtoSH_5e8, mr_keep==TRUE)
DataMR_BIPOtoSH_keep_5e8=subset(DataMR_CANUtoSH_5e8, mr_keep==TRUE)
DataMR_CANUtoSH_keep_5e8=subset(DataMR_DEPRtoSH_5e8, mr_keep==TRUE)
DataMR_DEPRtoSH_keep_5e8=subset(DataMR_SCHItoSH_5e8, mr_keep==TRUE)
DataMR_SCHItoSH_keep_5e8
###### I2GX for 5e-8 SNPs
isqADHD_5e8=Isq(y=DataMR_ADHDtoSH_keep_5e8$beta.exposure,
(s=DataMR_ADHDtoSH_keep_5e8$se.exposure))
isqADHD_w_5e8=Isq(y=DataMR_ADHDtoSH_keep_5e8$beta.exposure/DataMR_ADHDtoSH_keep_5e8$se.outcome,
(s=DataMR_ADHDtoSH_keep_5e8$se.exposure/DataMR_ADHDtoSH_keep_5e8$se.outcome))
QQ(Bx=DataMR_ADHDtoSH_keep_5e8$beta.exposure,
Bxse=DataMR_ADHDtoSH_keep_5e8$se.exposure,
Byse=DataMR_ADHDtoSH_keep_5e8$se.outcome)
#ADHD
#97.3%
#ALCD
isqALCD_5e8=Isq(y=DataMR_ALCDtoSH_keep_5e8$beta.exposure,
(s=DataMR_ALCDtoSH_keep_5e8$se.exposure))
isqALCD_w_5e8=Isq(y=DataMR_ALCDtoSH_keep_5e8$beta.exposure/DataMR_ALCDtoSH_keep_5e8$se.outcome,
(s=DataMR_ALCDtoSH_keep_5e8$se.exposure/DataMR_ALCDtoSH_keep_5e8$se.outcome))
QQ(Bx=DataMR_ALCDtoSH_keep_5e8$beta.exposure,
Bxse=DataMR_ALCDtoSH_keep_5e8$se.exposure,
Byse=DataMR_ALCDtoSH_keep_5e8$se.outcome)
#only 1 SNP, cannot do the calculation
#BIPO
isqBIPO_5e8=Isq(y=DataMR_BIPOtoSH_keep_5e8$beta.exposure,
(s=DataMR_BIPOtoSH_keep_5e8$se.exposure))
isqBIPO_w_5e8=Isq(y=DataMR_BIPOtoSH_keep_5e8$beta.exposure/DataMR_BIPOtoSH_keep_5e8$se.outcome,
(s=DataMR_BIPOtoSH_keep_5e8$se.exposure/DataMR_BIPOtoSH_keep_5e8$se.outcome))
QQ(Bx=DataMR_BIPOtoSH_keep_5e8$beta.exposure,
Bxse=DataMR_BIPOtoSH_keep_5e8$se.exposure,
Byse=DataMR_BIPOtoSH_keep_5e8$se.outcome)
#only 4 SNPs
#46.5% if not weighted, 0 if weighted.
#CANU
isqCANU_5e8=Isq(y=DataMR_CANUtoSH_keep_5e8$beta.exposure,
(s=DataMR_CANUtoSH_keep_5e8$se.exposure))
isqCANU_w_5e8=Isq(y=DataMR_CANUtoSH_keep_5e8$beta.exposure/DataMR_CANUtoSH_keep_5e8$se.outcome,
(s=DataMR_CANUtoSH_keep_5e8$se.exposure/DataMR_CANUtoSH_keep_5e8$se.outcome))
QQ(Bx=DataMR_CANUtoSH_keep_5e8$beta.exposure,
Bxse=DataMR_CANUtoSH_keep_5e8$se.exposure,
Byse=DataMR_CANUtoSH_keep_5e8$se.outcome)
#no SNP less than 5e-8
#DEPR
isqDEPR_5e8=Isq(y=DataMR_DEPRtoSH_keep_5e8$beta.exposure,
(s=DataMR_DEPRtoSH_keep_5e8$se.exposure))
isqDEPR_w_5e8=Isq(y=DataMR_DEPRtoSH_keep_5e8$beta.exposure/DataMR_DEPRtoSH_keep_5e8$se.outcome,
(s=DataMR_DEPRtoSH_keep_5e8$se.exposure/DataMR_DEPRtoSH_keep_5e8$se.outcome))
QQ(Bx=DataMR_DEPRtoSH_keep_5e8$beta.exposure,
Bxse=DataMR_DEPRtoSH_keep_5e8$se.exposure,
Byse=DataMR_DEPRtoSH_keep_5e8$se.outcome)
#only 2 SNPs
# 0% for both weighted and unweighted
#SCHI
isqSCHI_5e8=Isq(y=DataMR_SCHItoSH_keep_5e8$beta.exposure,
(s=DataMR_SCHItoSH_keep_5e8$se.exposure))
isqSCHI_w_5e8=Isq(y=DataMR_SCHItoSH_keep_5e8$beta.exposure/DataMR_SCHItoSH_keep_5e8$se.outcome,
(s=DataMR_SCHItoSH_keep_5e8$se.exposure/DataMR_SCHItoSH_keep_5e8$se.outcome))
QQ(Bx=DataMR_SCHItoSH_keep_5e8$beta.exposure,
Bxse=DataMR_SCHItoSH_keep_5e8$se.exposure,
Byse=DataMR_SCHItoSH_keep_5e8$se.outcome)
#135 SNPs
#97.8%
Outcome: SSH
######## rerun another MR for SSH ##########
# we keep the exposure data
#now load new outcome data - the SSH sumstats
<- read_outcome_data(
outcome_SSH_dat snps = NULL, # Keep 'snps=NULL' so that all SNPs are uploaded
filename = "ssh_sumstats_v2",
sep = "\t",
snp_col = "rsid",
beta_col = "beta",
se_col = "se",
effect_allele_col = "alt",
other_allele_col = "ref",
pval_col = "pval",
eaf_col="AF")
$outcome <- "SSH" # Provide a name for the outcome
outcome_SSH_datlength(outcome_SSH_dat$SNP)
<- harmonise_data(exposure_dat = exposure_ADHD_dat_5e5_clumped,
DataMR_ADHDtoSSH outcome_dat = outcome_SSH_dat, action=2)
<- harmonise_data(exposure_dat = exposure_ALCD_dat_5e5_clumped,
DataMR_ALCDtoSSH outcome_dat = outcome_SSH_dat, action=2)
<- harmonise_data(exposure_dat = exposure_BIPO_dat_5e5_clumped,
DataMR_BIPOtoSSH outcome_dat = outcome_SSH_dat, action=2)
<- harmonise_data(exposure_dat = exposure_CANU_dat_5e5_clumped,
DataMR_CANUtoSSH outcome_dat = outcome_SSH_dat, action=2)
<- harmonise_data(exposure_dat = exposure_DEPR_dat_5e5_clumped,
DataMR_DEPRtoSSH outcome_dat = outcome_SSH_dat, action=2)
<- harmonise_data(exposure_dat = exposure_SCHI_dat_5e5_clumped,
DataMR_SCHItoSSH outcome_dat = outcome_SSH_dat, action=2)
=subset(DataMR_ADHDtoSSH, mr_keep==TRUE)
DataMR_ADHDtoSSH_keep=subset(DataMR_ALCDtoSSH, mr_keep==TRUE)
DataMR_ALCDtoSSH_keep=subset(DataMR_BIPOtoSSH, mr_keep==TRUE)
DataMR_BIPOtoSSH_keep=subset(DataMR_CANUtoSSH, mr_keep==TRUE)
DataMR_CANUtoSSH_keep=subset(DataMR_DEPRtoSSH, mr_keep==TRUE)
DataMR_DEPRtoSSH_keep=subset(DataMR_SCHItoSSH, mr_keep==TRUE)
DataMR_SCHItoSSH_keep
<- mr(DataMR_ADHDtoSSH,method_list=c("mr_raps",
MR_ADHDtoSSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
<- mr(DataMR_ALCDtoSSH,method_list=c("mr_raps",
MR_ALCDtoSSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
<- mr(DataMR_BIPOtoSSH,method_list=c("mr_raps",
MR_BIPOtoSSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
<- mr(DataMR_CANUtoSSH,method_list=c("mr_raps",
MR_CANUtoSSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
<- mr(DataMR_DEPRtoSSH,method_list=c("mr_raps",
MR_DEPRtoSSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
<- mr(DataMR_SCHItoSSH,method_list=c("mr_raps",
MR_SCHItoSSH2"mr_weighted_median",
"mr_ivw",
"mr_egger_regression"))
write.xlsx(MR_ADHDtoSSH2, file="twosampleMR_SSH_14_Feb_2020.xlsx", sheetName="ADHD")
write.xlsx(MR_ALCDtoSSH2, file="twosampleMR_SSH_14_Feb_2020.xlsx", sheetName="ALCD", append=T)
write.xlsx(MR_BIPOtoSSH2, file="twosampleMR_SSH_14_Feb_2020.xlsx", sheetName="BIPO", append=T)
write.xlsx(MR_CANUtoSSH2, file="twosampleMR_SSH_14_Feb_2020.xlsx", sheetName="CANU", append=T)
write.xlsx(MR_DEPRtoSSH2, file="twosampleMR_SSH_14_Feb_2020.xlsx", sheetName="DEPR", append=T)
write.xlsx(MR_SCHItoSSH2, file="twosampleMR_SSH_14_Feb_2020.xlsx", sheetName="SCHI", append=T)
=rbind(mr_pleiotropy_test(DataMR_ADHDtoSSH),
MR_Egger_int_sshmr_pleiotropy_test(DataMR_ALCDtoSSH),
mr_pleiotropy_test(DataMR_BIPOtoSSH),
mr_pleiotropy_test(DataMR_CANUtoSSH),
mr_pleiotropy_test(DataMR_DEPRtoSSH),
mr_pleiotropy_test(DataMR_SCHItoSSH))
write.xlsx(MR_Egger_int_ssh, file="MR_Egger_14_Feb_2020.xlsx")
I2GX stats for SSH
#if BetaXG and seBetaXG represent the vector of SNP-exposure estimates and standar errors,
#and BetaYG and seBetaYG represent the vector of SNP-exposure estimates and standard errors. We calculate the I^2
#then do: Isq(BetaXG,seBetaXG)
#for MR regression weighted: Isq(BetaXG/seBetaYG,seBetaXG/seBetaYG)
#1. Calculate I2 (I squared)
#ADHD
isqADHD_SSH=Isq(y=DataMR_ADHDtoSSH_keep$beta.exposure,
(s=DataMR_ADHDtoSSH_keep$se.exposure))
isqADHD_w_SSH=Isq(y=DataMR_ADHDtoSSH_keep$beta.exposure/DataMR_ADHDtoSSH_keep$se.outcome,
(s=DataMR_ADHDtoSSH_keep$se.exposure/DataMR_ADHDtoSSH_keep$se.outcome))
QQ(Bx=DataMR_ADHDtoSSH_keep$beta.exposure,
Bxse=DataMR_ADHDtoSSH_keep$se.exposure,
Byse=DataMR_ADHDtoSSH_keep$se.outcome)
# I^2_GX statistic: 94.9%
#ALCD
isqALCD_SSH=Isq(y=DataMR_ALCDtoSSH_keep$beta.exposure,
(s=DataMR_ALCDtoSSH_keep$se.exposure))
isqALCD_w_SSH=Isq(y=DataMR_ALCDtoSSH_keep$beta.exposure/DataMR_ALCDtoSSH_keep$se.outcome,
(s=DataMR_ALCDtoSSH_keep$se.exposure/DataMR_ALCDtoSSH_keep$se.outcome))
QQ(Bx=DataMR_ALCDtoSSH_keep$beta.exposure,
Bxse=DataMR_ALCDtoSSH_keep$se.exposure,
Byse=DataMR_ALCDtoSSH_keep$se.outcome)
# I^2_GX statistic: 93.0%
#BIPO
isqBIPO_SSH=Isq(y=DataMR_BIPOtoSSH_keep$beta.exposure,
(s=DataMR_BIPOtoSSH_keep$se.exposure))
isqBIPO_w_SSH=Isq(y=DataMR_BIPOtoSSH_keep$beta.exposure/DataMR_BIPOtoSSH_keep$se.outcome,
(s=DataMR_BIPOtoSSH_keep$se.exposure/DataMR_BIPOtoSSH_keep$se.outcome))
QQ(Bx=DataMR_BIPOtoSSH_keep$beta.exposure,
Bxse=DataMR_BIPOtoSSH_keep$se.exposure,
Byse=DataMR_BIPOtoSSH_keep$se.outcome)
# I^2_GX statistic: 94.4%
#CANU
isqCANU_SSH=Isq(y=DataMR_CANUtoSSH_keep$beta.exposure,
(s=DataMR_CANUtoSSH_keep$se.exposure))
isqCANU_w_SSH=Isq(y=DataMR_CANUtoSSH_keep$beta.exposure/DataMR_CANUtoSSH_keep$se.outcome,
(s=DataMR_CANUtoSSH_keep$se.exposure/DataMR_CANUtoSSH_keep$se.outcome))
QQ(Bx=DataMR_CANUtoSSH_keep$beta.exposure,
Bxse=DataMR_CANUtoSSH_keep$se.exposure,
Byse=DataMR_CANUtoSSH_keep$se.outcome)
# I^2_GX statistic: 94.7%
#DEPR
isqDEPR_SSH=Isq(y=DataMR_DEPRtoSSH_keep$beta.exposure,
(s=DataMR_DEPRtoSSH_keep$se.exposure))
isqDEPR_w_SSH=Isq(y=DataMR_DEPRtoSSH_keep$beta.exposure/DataMR_DEPRtoSSH_keep$se.outcome,
(s=DataMR_DEPRtoSSH_keep$se.exposure/DataMR_DEPRtoSSH_keep$se.outcome))
QQ(Bx=DataMR_DEPRtoSSH_keep$beta.exposure,
Bxse=DataMR_DEPRtoSSH_keep$se.exposure,
Byse=DataMR_DEPRtoSSH_keep$se.outcome)
# I^2_GX statistic: 94.7%
#SCHI
isqSCHI_SSH=Isq(y=DataMR_SCHItoSSH_keep$beta.exposure,
(s=DataMR_SCHItoSSH_keep$se.exposure))
isqSCHI_w_SSH=Isq(y=DataMR_SCHItoSSH_keep$beta.exposure/DataMR_SCHItoSSH_keep$se.outcome,
(s=DataMR_SCHItoSSH_keep$se.exposure/DataMR_SCHItoSSH_keep$se.outcome))
QQ(Bx=DataMR_SCHItoSSH_keep$beta.exposure,
Bxse=DataMR_SCHItoSSH_keep$se.exposure,
Byse=DataMR_SCHItoSSH_keep$se.outcome)
# I^2_GX statistic: 95.7%
Multivariate MR
After conducting univariate MR, significant exposures are modelled together in multivariate MR model
Multivariate MR for SH using 5e-5 threshold
#multivariate MR for SH and other traits (ADHD, BIPO, CANU, DEPR, SCHI) - take out CROS.
# steps:
# 1. read exposure data
# 2. select 5e-5 threshold and clump the SNPs for exposure data
# 3. rbind exposure data
# 4. read outcome data
# 5. harmonise the data
# 6. conduct multivariate MR
library(data.table)
library(TwoSampleMR)
library(MendelianRandomization)
library(gmodels)
# step 1: Read exposure data for SCHI, DEPR and ADHD
# ADHD
<- read_exposure_data(
exposure_ADHD_dat filename = "ADHD",
sep = "\t",
snp_col = "SNP",
beta_col = "beta",
se_col = "beta_se",
effect_allele_col = "A1",
other_allele_col = "A2",
pval_col = "P")
$exposure="ADHD"
exposure_ADHD_datlength(exposure_ADHD_dat$SNP)
<- read_exposure_data(
exposure_DEPR_dat filename = "DEPR",
sep = "\t",
snp_col = "SNP",
beta_col = "beta",
se_col = "beta_se",
effect_allele_col = "A1",
other_allele_col = "A2",
pval_col = "P",
eaf_col="FREQ")
$exposure="DEPR"
exposure_DEPR_datlength(exposure_DEPR_dat$SNP)
<- read_exposure_data(
exposure_SCHI_dat filename = "SCHI",
sep = "\t",
snp_col = "SNP",
beta_col = "beta",
se_col = "beta_se",
effect_allele_col = "A1",
other_allele_col = "A2",
pval_col = "P",
eaf_col="FREQ")
$exposure="SCHI" # Provide a name for the exposure variable
exposure_SCHI_dat
length(exposure_SCHI_dat$SNP)
#p-value threshold
=subset(exposure_ADHD_dat, pval.exposure < 5e-5)
exposure_ADHD_dat_5e5=subset(exposure_DEPR_dat, pval.exposure < 5e-5)
exposure_DEPR_dat_5e5=subset(exposure_SCHI_dat, pval.exposure < 5e-5)
exposure_SCHI_dat_5e5
# #clumping
=clump_data(exposure_ADHD_dat_5e5,
exposure_ADHD_dat_5e5_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_DEPR_dat_5e5,
exposure_DEPR_dat_5e5_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_SCHI_dat_5e5,
exposure_SCHI_dat_5e5_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
# step 2: merge the data
=do.call("rbind", list(exposure_ADHD_dat_5e5_clumped,
exposure_dat
exposure_DEPR_dat_5e5_clumped,
exposure_SCHI_dat_5e5_clumped))
write.table(exposure_dat, file="initial_SNPS")
# read 'outcome' data based on the new list of SNPs using read_outcome data then convert into exposure
=read_outcome_data(
ADHDsnps=exposure_dat$SNP,
filename = "ADHD",
sep = "\t",
snp_col = "SNP",
beta_col = "beta",
se_col = "beta_se",
effect_allele_col = "A1",
other_allele_col = "A2",
pval_col = "P")
$outcome <- "ADHD"
ADHD<- convert_outcome_to_exposure(ADHD)
exposure_ADHD2
=read_outcome_data(
DEPRsnps=exposure_dat$SNP,
filename = "DEPR",
sep = "\t",
snp_col = "SNP",
beta_col = "beta",
se_col = "beta_se",
effect_allele_col = "A1",
other_allele_col = "A2",
pval_col = "P"
eaf_col="FREQ")
$outcome <- "DEPR"
DEPR<- convert_outcome_to_exposure(DEPR)
exposure_DEPR2
=read_outcome_data(
SCHIsnps=exposure_dat$SNP,
filename = "SCHI",
sep = "\t",
snp_col = "SNP",
beta_col = "beta",
se_col = "beta_se",
effect_allele_col = "A1",
other_allele_col = "A2",
pval_col = "P",
eaf_col="FREQ")
$outcome <- "SCHI"
SCHI<- convert_outcome_to_exposure(SCHI)
exposure_SCHI2
=do.call("rbind", list(exposure_ADHD2,
exposure_dat2
exposure_DEPR2,
exposure_SCHI2))
write.table(exposure_dat2,file="compiled_SNPs_rbind")
##### 4. Read outcome data ####
<- read_outcome_data(
outcome_SH_dat snps = exposure_dat$SNP,
filename = "selfharm_sumstats_v2",
sep = "\t",
snp_col = "rsid",
beta_col = "beta",
se_col = "se",
effect_allele_col = "alt",
other_allele_col = "ref",
pval_col = "pval",
eaf_col="AF")
$outcome <- "SH"
outcome_SH_dat
write.table(outcome_SH_dat$SNP, file="selfharm_SNPs")
# 5. Harmonise the data
<- mv_harmonise_data(exposure_dat2, outcome_SH_dat)
mvdat write.table(mvdat$exposure_beta, file="mv_mr_harmonised_snps")
# 6. Conduct multivariate MR
# #using TwoSampleMR package
<- mv_multiple(mvdat, instrument_specific=F, plots=T, pval_threshold=1)
MultiMr $result
MultiMr
#save the results
write.csv( MultiMr$result,file="TwoSampleMR_MVMR_results_28_Feb.csv")
pdf("plot_TwoSampleMR_MVMR_28_Feb.pdf")
$plot
MultiMrdev.off()
#
Multivariate MR for SH without DEPR
We have to repeat the same steps as above because this will give us a different number of SNPs.
# steps:
# 1. read exposure data
# 2. select 5e-5 threshold and clump the SNPs for exposure data
# 3. rbind exposure data
# 4. read outcome data
# 5. harmonise the data
# 6. conduct multivariate MR
library(data.table)
library(TwoSampleMR)
library(MendelianRandomization)
library(gmodels)
# step 1: Read exposure data for SCHI, DEPR and ADHD
# ADHD
<- read_exposure_data(
exposure_ADHD_dat filename = "ADHD",
sep = "\t", # Specify whether the file is a tab delimited file ("\t") or a space delimited file ("")
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P") # Insert the name of column with p-value
print("ADHD")
$exposure="ADHD" # Provide a name for the exposure variable
exposure_ADHD_dathead(exposure_ADHD_dat)
length(exposure_ADHD_dat$SNP)
<- read_exposure_data(
exposure_SCHI_dat filename = "SCHI", # This is the filename of the summary statistics file for your exposure
sep = "\t", # Specify whether the file is a tab delimited file ("\t") or a space delimited file ("")
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P", # Insert the name of column with p-value
eaf_col="FREQ") # Insert the name of column with effect allele frequency
print("SCHI")
$exposure="SCHI" # Provide a name for the exposure variable
exposure_SCHI_dathead(exposure_SCHI_dat)
length(exposure_SCHI_dat$SNP)
#p-value threshold
=subset(exposure_ADHD_dat, pval.exposure < 5e-5)
exposure_ADHD_dat_5e5#exposure_DEPR_dat_5e5=subset(exposure_DEPR_dat, pval.exposure < 5e-5)
=subset(exposure_SCHI_dat, pval.exposure < 5e-5)
exposure_SCHI_dat_5e5#
# #clumping
=clump_data(exposure_ADHD_dat_5e5,
exposure_ADHD_dat_5e5_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_SCHI_dat_5e5,
exposure_SCHI_dat_5e5_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
print("ADHD")
length(exposure_ADHD_dat_5e5_clumped$SNP)
head(exposure_ADHD_dat_5e5_clumped)
print("SCHI")
length(exposure_SCHI_dat_5e5_clumped$SNP)
head(exposure_SCHI_dat_5e5_clumped)
#just to check the order
# step 2: merge the data
#merge the data by rbind, do clumping and
=do.call("rbind", list(exposure_ADHD_dat_5e5_clumped,
exposure_dat
exposure_SCHI_dat_5e5_clumped))
print("new list of SNPs, number of SNPs:")
length(exposure_dat$SNP)
# read exposure data based on the new list of SNPs using read_outcome data then convert
=read_outcome_data(
ADHDsnps=exposure_dat$SNP,
filename = "ADHD",
sep = "\t", # Specify whether the file is a tab delimited file ("\t") or a space delimited file ("")
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P") # Insert the name of column with p-value
$outcome <- "ADHD"
ADHD<- convert_outcome_to_exposure(ADHD)
exposure_ADHD2 print("exposure_ADHD2 SNPs")
length(exposure_ADHD2$SNP)
=read_outcome_data(
SCHIsnps=exposure_dat$SNP,
filename = "SCHI",
sep = "\t", # Specify whether the file is a tab delimited file ("\t") or a space delimited file ("")
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P",
eaf_col="FREQ") # Insert the name of column with p-value
$outcome <- "SCHI"
SCHI<- convert_outcome_to_exposure(SCHI)
exposure_SCHI2 print("exposure_SCHI2 SNPs")
length(exposure_SCHI2$SNP)
=do.call("rbind", list(exposure_ADHD2,
exposure_dat2
exposure_SCHI2))
# # 4. Read outcome data
<- read_outcome_data(
outcome_SH_dat snps = exposure_dat$SNP,
filename = "selfharm_sumstats_v2",
sep = "\t",
snp_col = "rsid",
beta_col = "beta",
se_col = "se",
effect_allele_col = "alt",
other_allele_col = "ref",
pval_col = "pval",
eaf_col="AF")
print("Self Harm")
$outcome <- "SH" # Provide a name for the outcome
outcome_SH_datlength(outcome_SH_dat$SNP)
# 5. Harmonise the data
<- mv_harmonise_data(exposure_dat2, outcome_SH_dat)
mvdat
# 6. Conduct multivariate MR
# using TwoSampleMR package
<- mv_multiple(mvdat,
MultiMr instrument_specific=F,
plots=T,
pval_threshold=1)
$result
MultiMr
#save the results
write.csv( MultiMr$result,file="TwoSampleMR_MVMR_without_DEPR_results_28_Feb.csv")
pdf("plot_TwoSampleMR_MVMR__without_DEPR_28_Feb.pdf")
$plot
MultiMrdev.off()
Multivariate MR for SH using 5e-8 threshold for SCHI
This was requested by one of the reviewers
#multivariate MR for SH and other traits (ADHD, DEPR, SCHI) with a more stringent threshold.
# steps:
# 1. read exposure data
# 2. select 5e-5 threshold and clump the SNPs for exposure data
# 3. rbind exposure data
# 4. read outcome data
# 5. harmonise the data
# 6. conduct multivariate MR
library(data.table)
library(TwoSampleMR)
library(MendelianRandomization)
library(gmodels)
# step 1: Read exposure data for SCHI, DEPR and ADHD
# ADHD
<- read_exposure_data(
exposure_ADHD_dat filename = "ADHD",
sep = "\t",
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P") # Insert the name of column with p-value
print("ADHD")
$exposure="ADHD" # Provide a name for the exposure variable
exposure_ADHD_dathead(exposure_ADHD_dat)
length(exposure_ADHD_dat$SNP)
system("wc -l ADHD")
<- read_exposure_data(
exposure_DEPR_dat filename = "DEPR",
sep = "\t",
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P", # Insert the name of column with p-value
eaf_col="FREQ") # Insert the name of column with effect allele frequency
print("DEPR")
$exposure="DEPR"
exposure_DEPR_dat# Provide a name for the exposure variable
head(exposure_DEPR_dat)
length(exposure_DEPR_dat$SNP)
=fread("SCHI")
schi=subset(schi,P<5e-4)
schi2dim(schi);dim(schi2)
<-format_data(schi2,
exposure_SCHI_dattype="exposure",
snp_col = "SNP",
beta_col = "beta",
se_col = "beta_se",
effect_allele_col = "A1",
other_allele_col = "A2",
pval_col = "P",
eaf_col="FREQ")
print("SCHI")
$exposure="SCHI" # Provide a name for the exposure variable
exposure_SCHI_dathead(exposure_SCHI_dat)
length(exposure_SCHI_dat$SNP)
#p-value threshold
=subset(exposure_ADHD_dat, pval.exposure < 5e-5)
exposure_ADHD_dat_5e8=subset(exposure_DEPR_dat, pval.exposure < 5e-5)
exposure_DEPR_dat_5e8=subset(exposure_SCHI_dat, pval.exposure < 5e-8)
exposure_SCHI_dat_5e8
# #clumping
=clump_data(exposure_ADHD_dat_5e8,
exposure_ADHD_dat_5e8_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_DEPR_dat_5e8,
exposure_DEPR_dat_5e8_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
=clump_data(exposure_SCHI_dat_5e8,
exposure_SCHI_dat_5e8_clumpedclump_r2 = 0.001,
clump_p1 = 1,
clump_p2 = 1,
clump_kb = 250)
#check number of SNPs in harmonisation
<- harmonise_data(exposure_dat = exposure_ADHD_dat_5e8_clumped,
DataMR_ADHDtoSH outcome_dat = outcome_SH_dat, action=2)
<- harmonise_data(exposure_dat = exposure_DEPR_dat_5e8_clumped,
DataMR_DEPRtoSH outcome_dat = outcome_SH_dat, action=2)
table(DataMR_DEPRtoSH$mr_keep)
table(DataMR_ADHDtoSH$mr_keep)
#
print("ADHD")
length(exposure_ADHD_dat_5e8_clumped$SNP)
head(exposure_ADHD_dat_5e8_clumped)
table(exposure_ADHD_dat_5e8_clumped$mr_keep.exposure) #296 SNPs
print("DEPR")
length(exposure_DEPR_dat_5e8_clumped$SNP)
head(exposure_DEPR_dat_5e8_clumped)
table(exposure_DEPR_dat_5e8_clumped$mr_keep.exposure) # 252 SNPs
print("SCHI")
length(exposure_SCHI_dat_5e8_clumped$SNP)
head(exposure_SCHI_dat_5e8_clumped)
table(exposure_SCHI_dat_5e8_clumped$mr_keep.exposure) #145 SNPs
# step 2: merge the data
#merge the data by rbind, do clumping and bind all rows together.
=do.call("rbind", list(exposure_ADHD_dat_5e8_clumped[,1:11],
exposure_dat1:11],
exposure_DEPR_dat_5e8_clumped[,
exposure_SCHI_dat_5e8_clumped))
print("new list of SNPs, number of SNPs:")
length(exposure_dat$SNP) #693 SNPs together
# read exposure data based on the new list of SNPs using read_outcome data then convert
=read_outcome_data(
ADHDsnps=exposure_dat$SNP,
filename = "ADHD",
sep = "\t",
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P") # Insert the name of column with p-value
$outcome <- "ADHD"
ADHD<- convert_outcome_to_exposure(ADHD)
exposure_ADHD2 print("exposure_ADHD2 SNPs")
length(exposure_ADHD2$SNP) #read 159 SNPs out of 161 SNPs
=read_outcome_data(
DEPRsnps=exposure_dat$SNP,
filename = "DEPR",
sep = "\t",
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P",
eaf_col="FREQ") # Insert the name of column with p-value
$outcome <- "DEPR"
DEPR<- convert_outcome_to_exposure(DEPR)
exposure_DEPR2 print("exposure_DEPR2 SNPs")
length(exposure_DEPR2$SNP) #157 SNPs out of 161 SNPs
=read_outcome_data(
SCHIsnps=exposure_dat$SNP,
filename = "SCHI",
sep = "\t",
snp_col = "SNP", # Insert the name of column that includes the SNP ID's
beta_col = "beta", # Insert the name of column with effect sizes
se_col = "beta_se", # Insert the ame of column with standard errors
effect_allele_col = "A1", # Insert the name of column with the effect allele
other_allele_col = "A2", # Insert the name of column with the non-effect allele
pval_col = "P",
eaf_col="FREQ") # Insert the name of column with p-value
$outcome <- "SCHI"
SCHI<- convert_outcome_to_exposure(SCHI)
exposure_SCHI2 print("exposure_SCHI2 SNPs")
length(exposure_SCHI2$SNP) #158 SNPs out of 161 SNPs
=do.call("rbind", list(exposure_ADHD2,
exposure_dat2
exposure_DEPR2,
exposure_SCHI2))head(exposure_dat2);dim(exposure_dat2) #474 rows, total of 157+158+159 SNPs
# 4. Read outcome data
<- read_outcome_data(
outcome_SH_dat snps = exposure_dat$SNP,
filename = "selfharm_sumstats_v2",
sep = "\t",
snp_col = "rsid",
beta_col = "beta",
se_col = "se",
effect_allele_col = "alt",
other_allele_col = "ref",
pval_col = "pval",
eaf_col="AF")
print("Self Harm")
$outcome <- "SH" # Provide a name for the outcome
outcome_SH_datlength(outcome_SH_dat$SNP) # 154 SNPs here
system("wc -l selfharm_sumstats_v2")
# 5. Harmonise the data
<- mv_harmonise_data(exposure_dat2, outcome_SH_dat)
mvdat =as.data.frame(mvdat$exposure_beta)
mv_mr_harmonised_snpsdim(mv_mr_harmonised_snps) #128 rows
# 6. Conduct multivariate MR
# #using TwoSampleMR package
<- mv_multiple(mvdat, instrument_specific=F, plots=T, pval_threshold=1)
MultiMr $result
MultiMrwrite.csv( MultiMr$result,file="TwoSampleMR_MVMR_results_16_Feb_2020.csv")
#using MendelianRandomisation package
str(mvdat)
<- data.frame(bx=mvdat$exposure_beta,
MulitMRdata bxse=mvdat$exposure_se,
by.SH=mvdat$outcome_beta,
byse.SH=mvdat$outcome_se)
<- mr_mvinput(bx = cbind(MulitMRdata$bx.ADHD,
MRMVInputObject $bx.DEPR,
MulitMRdata$bx.SCHI),
MulitMRdatabxse = cbind(MulitMRdata$bxse.ADHD,
$bxse.DEPR,
MulitMRdata$bxse.SCHI),
MulitMRdataby = MulitMRdata$by.SH,
byse = MulitMRdata$byse.SH,
exposure=c("ADHD","DEPR","SCHI"))
mr_mvivw(MRMVInputObject)
Calculate the number of SNPs
In this code chunk I calculated the number of SNPs involved in MVMR.
##figure out the number of SNPs
=exposure_dat #"initial_SNPS"
list_A=exposure_dat2 #fread("compiled_SNPs_rbind",verbose=T)
list_B=mvdat$exposure_beta #fread("mv_mr_harmonised_snps",verbose=T)
list_C=as.data.frame(outcome_SH_dat$SNP) #fread("selfharm_SNPs",verbose=T)
list_Ddim(list_A);dim(list_B);dim(list_C);dim(list_D) #list A=161, B=474, C=128, D=154
table(list_A$exposure) #14 for ADHD, 2 for DEPR, 145 for SCHI - total= 161
table(list_B$exposure) #159 for ADHD, 157 for DEPR, 158 for SCHI, total=474
=subset(list_B[,1:2],exposure=="ADHD")
ADHD=subset(list_B[,1:2],exposure=="DEPR")
DEPR=subset(list_B[,1:2],exposure=="SCHI")
SCHI$outcome="selfharm"
list_Dcolnames(list_D)[1]<-"SNP"
head(list_D)
### draw Venn Diagram for number of snps/overlapping of snps
library(VennDiagram)
venn.diagram(x = list(ADHD$SNP , DEPR$SNP , SCHI$SNP, list_D$SNP),
category.names = c("ADHD" , "DEPR " , "SCHI", "self-harm"),
filename = 'SH_venn_diagramm_mvmr.png',
imagetype="png",
fill = c('yellow', 'purple', 'green','blue'),
cex=2, cat.cex=2)
For MVMR with different threshold across the exposures (10E-5 for DEPR and ADHD, 10E-8 for SCHI), the Venn diagram for the number of SNPs involved is shown below:
4: Supporting information
SCZ cases for quantiles
- output is a file containing the RR of self-harm in each quantile of SCZ PRS and also for the SCZ cases
library(data.table)
library(VennDiagram)
<- fread("selfharm_small_15_Feb.txt", verbose=T)
selfharmdata $batch[selfharmdata$batch==""]<-NA
selfharmdata$AC<-as.factor(selfharmdata$assessment_centre)
selfharmdata$sexL<-as.factor(selfharmdata$sex)
selfharmdata$batchL<-as.factor(selfharmdata$batch)
selfharmdata$f.20480.0.0[selfharmdata$f.20480.0.0==-818]<-NA
selfharmdata$f.20483.0.0[selfharmdata$f.20483.0.0==-818]<-NA
selfharmdata
=c("PC1","PC2","PC3","PC4","PC5","PC6","batchL","AC","sexL","age")
covariates=c("IID","PC1","PC2","PC3","PC4","PC5","PC6","batchL","AC","sexL","age")
covariates_IID=selfharmdata[,covariates_IID,with=F]
pcs
=fread("antipsychotics_ID_06_march_updated.txt",verbose=T)
antidim(anti) #3339 people take antipsychotics
<- 20
numquant <- numquant + 1
quant1 <- quant1 + 1
quantscz <- ceiling(quantscz/2)
ref
<- cbind(selfharmdata$IID,selfharmdata$SCHI02_0.3)
scores <- cbind(selfharmdata$IID,selfharmdata$f.20480.0.0)
phenofile
#creating the dataframe
colnames(phenofile)[c(1, 2)] <- c("ID", "pheno")
$anti<-1
anticolnames(anti)[1]<-"IID"
=as.data.frame(phenofile)
phenofilecolnames(scores)=c("IID","scores")
<- merge(scores, anti, by = 1, all = T)
x
<- merge(x = x,
forquant by.x = "IID",
y = phenofile,
by.y = "ID",
all.x=T,
all.y=T)
<- merge(x = forquant, y = pcs, by = "IID",all.x=T,all.y=T )
forquant
#merge with covariates
=fread("SCZ_ids_HES_06_march.txt",verbose=T)
scz=fread("SCZ_ids_self_reported_06_march.txt",verbose=T)
scz2=fread("SCZ_MHQ_11_march.txt",verbose=T)
scz3$HES_SCZ=1;scz2$Self_SCZ=1;scz3$MHQ_SCZ=1
scz=as.data.frame(scz);scz2=as.data.frame(scz2);scz3=as.data.frame(scz3) scz
Venn Diagram (Fig A in Appendix S1)
venn.diagram(x = list(scz$V1 , scz2$V1 , scz3$f.eid),
category.names = c("ICD 9/10",
"Non-cancer\nillness\nself-report ",
"MHQ self-report"),
filename = 'SCZ_cases_venn_diagramm.png',
imagetype="png",
fill = c('yellow', 'purple', 'green'),
cex=2.5,
cat.cex=2.5,
cat.dist=c(0.04,0.04,0.02),
cat.fontface="bold",
height=5500,width=5500) #75 overlapping, 1121 unique individuals in total
Derive RR for SCZ
#scz - ICD, scz2 - self-report.
=merge(scz,scz2,by=1, all.x=T,all.y=T)
scz_all#781+620>1046
=merge(scz_all, scz3, by=1,all.y=T,all.x=T)
scz_allnames(scz_all)[1]<-"IID"
=as.data.frame(scz_all$IID)
scz_data_in$SCZ=NA;scz_data_in$SCZ=1
scz_data_in
colnames(scz_data_in)=c("IID","SCZ")
=merge(scz_data_in, scz_all, by="IID",all.y=T,all.x=T)
scz_data_out
=merge(forquant,scz_data_in, by="IID",all.x=T,all.y=T)
forquant
$quantiles <- as.numeric(cut(forquant$scores,
forquantbreaks = quantile(forquant$scores,
probs = seq(0, 1, 1/numquant),
na.rm = T),
include.lowest=T))
=forquant$scores[forquant$SCZ==1]
schizophrenia_scores
=schizophrenia_scores[!is.na(schizophrenia_scores)]
schizophrenia_scores#mean score for schizophrenia patients is -0.002210675
#compare with thresholds
=as.numeric(cut(-0.002210675,
mean_positionbreaks = quantile(forquant$scores,
probs = seq(0, 1, 1/numquant),
na.rm = T),
include.lowest=T))
$anti[is.na(forquant$anti)]<-0
forquant
#recode scz_anti and scz_no_anti
$scz_anti=NA;forquant$scz_no_anti=NA
forquantwhich(forquant$SCZ==1 & forquant$anti==1),"scz_anti"] <- 1
forquant[which(forquant$SCZ==1 & forquant$anti==0),"scz_no_anti"] <- 1
forquant[
$quantiles <- with(forquant,
forquantifelse(!is.na(forquant$scz_no_anti),
quant1,
quantiles))
#if SCZ patient (unmedicated), assign a different quantile aka 21
$quantiles <- with(forquant,
forquantifelse(!is.na(forquant$scz_anti),
quantscz,
quantiles))
#if SCZ patient (medicated), assign a different quantile aka 22
$quantiles <- factor(forquant$quantiles,
forquantlevels = c(ref,
seq(1, quantscz, 1)[seq(1, quantscz, 1) != ref]))
=forquant[forquant$quantiles==quant1,] #unmedicated
atable(a$pheno,useNA="always")
table(a$SCZ,useNA="always")
#unmedicated with self-harm data: 56 never self-harmed, 32 did
=forquant[forquant$quantiles == quantscz,] #medicated
btable(b$pheno,useNA="always")
table(b$SCZ,useNA="always")
#medicated with self-harm data: 57 never self-harmed, 32 did.
=forquant[forquant$quantiles != quant1,]
c
$scz_med=NA
forquantwhich(forquant$SCZ==1 & forquant$anti==1),"scz_med"] <- 1
forquant[which(forquant$SCZ==1 & forquant$anti==0),"scz_med"] <- 0
forquant[
<- chisq.test(table(forquant$pheno, forquant$scz_med))
chisq print(chisq) #not significantly different.
<- exp(summary(glm(pheno ~ .,
RR family="poisson",
data=forquant[,c("pheno","quantiles",covariates)]
$coefficients[1:quantscz,1])
))
<- RR + (1.96*summary(glm(pheno ~ quantiles,
ConfIu family="poisson",
data =forquant))$coefficients[1:quantscz,2])
<- RR - (1.96*summary(glm(pheno ~ quantiles,
ConfIl family="poisson",
data = forquant))$coefficients[1:quantscz,2])
1] <- 1
RR[1] <- 1
ConfIu[1] <- 1
ConfIl[<- seq(1, quantscz, 1)
list <- list[list != ref]
list <- c(ref, list)
quantable <- data.frame(RR, ConfIu, ConfIl, quantable)
quantdf colnames(quantdf) <- c("RR", "CIU", "CIL", "QUANT")
<- quantdf[order(quantdf$QUANT),]
quantdf $label <- c(rep("GenPop", numquant), "SCZnomeds", "SCZmeds")
quantdf$label <- factor(quantdf$label, levels = c("GenPop", "SCZnomeds", "SCZmeds"))
quantdf
write.table(quantdf, file="RR_medications_split_quants_SCHI_self_report_or_ICD_or_MHQ.txt")
DEPR cases for quantile
#step 1: find out the DEPR cases
# find out the column numbers for the following:
# Main ICD9:
# Secondary ICD 9:
# Main ICD 10:
# Secondary ICD 10:
library(data.table)
library(VennDiagram)
library(tidyverse)
<- fread("selfharm_small_15_Feb.txt", verbose=T)
selfharmdata $batch[selfharmdata$batch==""]<-NA
selfharmdata$AC<-as.factor(selfharmdata$assessment_centre)
selfharmdata$sexL<-as.factor(selfharmdata$sex)
selfharmdata$batchL<-as.factor(selfharmdata$batch)
selfharmdata$f.20480.0.0[selfharmdata$f.20480.0.0==-818]<-NA
selfharmdata$f.20483.0.0[selfharmdata$f.20483.0.0==-818]<-NA
selfharmdata
=c("PC1","PC2","PC3","PC4","PC5","PC6","batchL","AC","sexL","age")
covariates=c("IID","PC1","PC2","PC3","PC4","PC5","PC6","batchL","AC","sexL","age")
covariates_IID=selfharmdata[,covariates_IID,with=F]
pcs=fread("antidepressants_ID_06_march_updated.txt",verbose=T)
anti
<- 20
numquant <- numquant + 1
quant1 <- quant1 + 1
quantscz <- ceiling(quantscz/2)
ref <- cbind(selfharmdata$IID,selfharmdata$DEPR07_0.3)
scores <- cbind(selfharmdata$IID,selfharmdata$f.20480.0.0)
phenofile
#creating the datframe
colnames(phenofile)[c(1, 2)] <- c("ID", "pheno")
$anti<-1
anticolnames(anti)[1]<-"IID"
=as.data.frame(phenofile)
phenofilecolnames(scores)=c("IID","scores")
<- merge(scores, anti, by = 1, all = T)
x
<- merge(x = x, by.x = "IID", y = phenofile, by.y = "ID", all.x=T, all.y=T)
forquant <- merge(x = forquant, y = pcs, by = "IID",all.x=T,all.y=T )
forquant
#merge with covariates
=fread("DEPR_ids_HES_06_march.txt",verbose=T)
dep=dep[!duplicated(dep)] #remove duplicated IDs
dep=fread("DEPR_ids_self_reported_06_march.txt",verbose=T)
dep2=fread("DEPR_MHQ_11_march.txt",verbose=T)
dep3
$HES_MDD=1;dep2$Self_MDD=1;dep3$MHQ_MDD=1
dep
=as.data.frame(dep);dep2=as.data.frame(dep2);dep3=as.data.frame(dep3) dep
Venn Diagram (Fig B in Appendix S1)
venn.diagram(x = list(dep$V1 , dep2$V1 , dep3$f.eid),
category.names = c("ICD 9/10","Non-cancer\nillness\nself-report ",
"MHQ self-report"),
filename = 'DEPR_cases_venn_diagramm_10_june.png',
imagetype="png",
fill = c('yellow', 'purple', 'green'),
cex=2.5,
cat.cex=2.5,
cat.dist=c(0.04,0.04,0.02),
cat.fontface="bold",
height=5500,
width=5500
)
Derive RR for DEPR
#dep - ICD, dep2 - self-report.
=merge(dep,dep2,by="V1",all.x=T,all.y=T)
dep_datacolnames(dep3)[1]<-"V1"
=merge(dep_data,dep3,by="V1",all.x=T,all.y=T)
dep_data
colnames(dep_data)[1]<-"IID"
#self-report or ICD = 23656 - see below
=as.data.frame(dep_data$IID)
dep_data_in$V2=1
dep_data_incolnames(dep_data_in)=c("IID","dep")
=merge(dep_data, dep_data_in, by="IID")
dep_data_out
=merge(forquant,dep_data_in, by="IID",all.x=T,all.y=T)
forquant
$quantiles <- as.numeric(cut(forquant$scores,
forquantbreaks = quantile(forquant$scores,
probs = seq(0, 1, 1/numquant),
na.rm = T),
include.lowest=T))
=forquant$scores[forquant$dep==1]
depression_scores
=depression_scores[!is.na(depression_scores)]
depression_scores
#compare with thresholds
=as.numeric(cut(-0.002278132,
mean_positionbreaks = quantile(forquant$scores,
probs = seq(0, 1, 1/numquant),
na.rm = T),
include.lowest=T))
$anti[is.na(forquant$anti)]<-0
forquant
$dep_anti=NA;forquant$dep_no_anti=NA
forquantwhich(forquant$dep==1 & forquant$anti==1),"dep_anti"] <- 1
forquant[which(forquant$dep==1 & forquant$anti==0),"dep_no_anti"] <- 1
forquant[
=forquant$scores[dep==1]
depr_prs
$quantiles <- with(forquant,
forquantifelse(!is.na(forquant$dep_no_anti),
quant1,
quantiles))
$quantiles <- with(forquant,
forquantifelse(!is.na(forquant$dep_anti),
quantscz,
quantiles))
#if dep patient (medicated), assign a different quantile aka 22
$quantiles <- factor(forquant$quantiles,
forquantlevels = c(ref,
seq(1, quantscz, 1)[seq(1, quantscz, 1) != ref]))
=forquant$pheno[forquant$quantiles == quant1] #unmedicated
a=forquant$pheno[forquant$quantiles == quantscz] #medicated
b$dep_med=NA
forquantwhich(forquant$dep==1 & forquant$anti==1),"dep_med"] <- 1
forquant[which(forquant$dep==1 & forquant$anti==0),"dep_med"] <- 0
forquant[
<- chisq.test(table(forquant$pheno, forquant$dep_med))
chisq print(chisq)
# calculate the risk ratio
<- exp(summary(glm(pheno ~ .,
RR family="poisson",
data=forquant[,c("pheno","quantiles",covariates)]
$coefficients[1:quantscz,1])
))
<- RR + (1.96*summary(glm(pheno ~ quantiles,
ConfIu family="poisson",
data = forquant))$coefficients[1:quantscz,2])
<- RR - (1.96*summary(glm(pheno ~ quantiles,
ConfIl family="poisson",
data = forquant))$coefficients[1:quantscz,2])
1] <- 1
RR[1] <- 1
ConfIu[1] <- 1
ConfIl[<- seq(1, quantscz, 1)
list <- list[list != ref]
list <- c(ref, list)
quantable <- data.frame(RR, ConfIu, ConfIl, quantable)
quantdf colnames(quantdf) <- c("RR", "CIU", "CIL", "QUANT")
<- quantdf[order(quantdf$QUANT),]
quantdf $label <- c(rep("GenPop", numquant), "DEPRnomeds", "DEPRmeds")
quantdf$label <- factor(quantdf$label, levels = c("GenPop", "DEPRnomeds", "DEPRmeds"))
quantdf
quantdf
write.table(quantdf, file="RR_medications_split_quants_DEPR_self_report_or_ICD_or_MHQ_10_june.txt")
Plot Figure 4
library(ggplot2)
library(cowplot)
<-function(myggplot){
get_legend<- ggplot_gtable(ggplot_build(myggplot))
tmp <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
leg <- tmp$grobs[[leg]]
legend return(legend)
}
=read.table("RR_medications_split_quants_DEPR_self_report_or_ICD_or_MHQ_10_june.txt")
y=as.data.frame(y)
y
<- c("GenPop"="seagreen","DEPRnomeds"="deepskyblue3","DEPRmeds"="firebrick1")
cols
<- ggplot(y) +
p geom_point(aes(x = QUANT,
y = RR,
colour = label),
size=1) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black",size=0.25)) +
geom_errorbar(aes(ymin = CIL,
ymax = CIU,
x = QUANT,
colour = label),
size = 0.25,
width=1) +
ylab("RR of self-harm for MDD")+
theme(legend.position = "none") +
xlab("Quantiles for Polygenic Score") +
scale_colour_manual(name="",
labels=c("GenPop"="General Population",
"DEPRnomeds"="Non-medicated cases",
"DEPRmeds"="Medicated cases"),
values =cols) +
theme(axis.title = element_text(size = 10,
colour="black",
face="bold"),
axis.text=element_text(size=10,
colour="black",
face="bold"))+
ylim(0,10)
=read.table("RR_medications_split_quants_SCHI_self_report_or_ICD_or_MHQ.txt")
z
=as.data.frame(z)
z
<- c("GenPop"="seagreen","SCZnomeds"="deepskyblue3","SCZmeds"="firebrick1")
cols
<- ggplot(z) +
q geom_point(aes(x = QUANT,
y = RR,
colour = label),
size=1) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black",size=0.25)) +
geom_errorbar(aes(ymin = CIL,
ymax = CIU,
x = QUANT,
colour = label),
size = 0.25,
width=1) +
ylab("RR of self-harm for schizophrenia")+
theme(legend.position = "none") +
xlab("Quantiles for Polygenic Score") +
scale_colour_manual(values = cols) +
theme(axis.title = element_text(size = 10,
colour="black",
face="bold"),
axis.text=element_text(size=10,
colour="black",
face="bold"))+
ylim(0,10)
<-get_legend(p+theme(legend.position=c(1,0.6),
legendlegend.justification="center",
legend.box.just="top",
legend.direction="horizontal",
legend.text=element_text(size=10,
colour="black",
face="bold"))+
guides(colour=guide_legend(reverse=T)))
=plot_grid(q,p,
combinedlabels=c("A","B"),
label_size = 10,
rel_heights=c(1,.2))
legend,
combined
Figure_4
Descriptive statistics
Finding out number of participants in each process, proportion of males, females, etc.
#### more demgraphic analyses for flowchart in method draft
## date: 24th April 2019
#### wanted to know the age, sex, etc demographics for the final sample.
library(data.table)
library(prettyR)
library(tidyr)
=fread("selfharm_small_15_Feb.txt")
selfharmdata<-selfharmdata[!is.na(selfharmdata$f.20480.0.0) & !is.na(selfharmdata$ADHD02_0.3) & !is.na(selfharmdata$PC1)
selfharmdatac& !is.na(selfharmdata$PC2) & !is.na(selfharmdata$PC3)
& !is.na(selfharmdata$PC4) & !is.na(selfharmdata$PC5) & !is.na(selfharmdata$PC6)
& !is.na(selfharmdata$batch) & !is.na(selfharmdata$age) & !is.na(selfharmdata$sex)
& !is.na(selfharmdata$assessment_centre),]
=fread("scz_diagnosis_0706.txt")
scz_diag=scz_diag[,-1]
scz_diagstr(scz_diag)
=fread("dep_diagnosis_0706.txt")
dep_diag=dep_diag[,-1]
dep_diaghead(dep_diag)
=merge(scz_diag,dep_diag,by="IID",all=T)
diag
dim(selfharmdata);dim(selfharmdatac)
<-selfharmdatac[selfharmdatac$f.20480.0.0!=-818,]
noNAselfharmdatadim(noNAselfharmdata)
###07 June - descriptive stats for scz and dep, age, gender of each strata
head(diag)
=merge(noNAselfharmdata, diag, by="IID",all.x=T,all.y=F)
selfharm_table$SCZ=replace_na(selfharm_table$SCZ,0)
selfharm_table$dep=replace_na(selfharm_table$dep,0)
selfharm_tabledim(noNAselfharmdata);dim(selfharm_table)
#create a loop:
=selfharm_table
whole=selfharm_table[selfharm_table$f.20480.0.0==1]
selfharmed=selfharm_table[selfharm_table$f.20483.0.0==1]
SSH=selfharm_table[selfharm_table$f.20483.0.0==0]
NSSH=selfharm_table[selfharm_table$f.20480.0.0==0]
controls
table(controls$f.20480.0.0,useNA="always")
table(NSSH$f.20483.0.0,useNA="always")
table(SSH$f.20483.0.0,useNA="always")
table(selfharmed$f.20480.0.0,useNA="always")
#write a function to analyze systematically the female%,
<- function(data) {
analyze =prop.table(table(data$sex))[1]*100
female=mean(data$age)
mean_age=sd(data$age)
sd_age=prop.table(table(data$SCZ))[2]*100
scz=prop.table(table(data$dep))[2]*100
dep=c(female,mean_age,sd_age,scz,dep)
rownames(row)=c("Female (%)","mean age","SD of age","Schizophrenia diagnosis (%)","MDD diagnosis (%)")
=round(row,2)
rowreturn(row)
}
=rbind(analyze(whole),analyze(selfharmed),analyze(SSH),analyze(NSSH),analyze(controls))
descriptivesrownames(descriptives)=c("Full analytical sample",
"Self-harmed",
"SSH",
"NSSH",
"Never self-harmed")
write.csv(descriptives, "descriptives_stats.csv")
#check age
table(selfharmdatac$f.20480.0.0)
sum(table(selfharmdata$age)) #502543
sum(table(selfharmdata$sex)) #502543
summary(selfharmdatac$age)
describe(selfharmdatac$age)
#Min. 1st Qu. Median Mean 3rd Qu. Max.
# 48.00 60.00 67.00 65.88 72.00 82.00
# mean median var sd valid.n
#x 65.88 67 59.15 7.69 126287
#sex
prop.table(table(selfharmdatac$sex))
# female 0 male 1
# 0.5622748 0.4377252
### -818 for selfharmed.
#merge with original self-harm data
setwd("/mnt/lustre/groups/ukbiobank/Edinburgh_Data/usr/Kai/phenotypes")
=fread("self_harmed_ori.txt")
oriselfharmcolnames(oriselfharm)<-c("IID","FID","selfharmori") #recode the selfharm ori column to avoid overlapping with f.20480.0.0
colnames(selfharmdata)
<-merge(selfharmdata,oriselfharm,
selfharm2by=c("IID","FID"),
all.x=T,
all.y=T)
dim(selfharm2);dim(selfharmdata)
#now see those who answered MHQ (including "prefer not to answer") and had "PRS"
table(!is.na(selfharmdata$f.20480.0.0)&!is.na(selfharmdata$ADHD02_0.3))
table(!is.na(selfharmdata$f.20480.0.0)) #number of ppl who answered MHQ
#157358 - had MHQ
#126287 - had MHQ and PRS
sum(table((selfharmdatac$f.20483.0.0)))
# found it : 126287 = number of people who answered MHQ + PRS data (including those answering "prefer not to answer")
#new data: 126287-362=125925
#number of ppl who self-harmed
table(selfharmdata$f.20480.0.0)
6872/(150011+6872) #=0.044
Figure S1
Figure S1 is a combination of 6 plots for variance accounted for by the PRS using different p-value thresholds in selecting the SNPs.
library(data.table)
library(ggplot2)
=fread("PRS_forR2_0412_names.csv")
PRSdatacolnames(PRSdata)[1]="names"
"ADHD"->PRSdata$group[1:10]
"Alcohol Dependence Disorder"->PRSdata$group[11:20]
"Bipolar Disorder"->PRSdata$group[21:30]
"Lifetime Cannabis Use"->PRSdata$group[31:40]
"Major Depressive Disorder"->PRSdata$group[41:50]
"Schizophrenia"->PRSdata$group[51:60]
$group=as.factor(PRSdata$group)
PRSdata
#change p-value threshold
$names=c("1x10^-5","0.001","0.01","0.05","0.1","0.2","0.3","0.4","0.5","1")
PRSdata
#change p-values
$print.p[round(PRSdata$`p-value`, digits = 3) != 0] <-
PRSdataround(PRSdata$`p-value`[round(PRSdata$`p-value`, digits = 3) != 0], digits = 3)
$print.p[round(PRSdata$`p-value`, digits = 3) == 0] <-
PRSdataformat(PRSdata$`p-value`[round(PRSdata$`p-value`, digits = 3) == 0], digits = 2)
$print.p <- sub("e", "*x*10^", PRSdata$print.p)
PRSdata
$print.p
PRSdata
#change R2
$print.R2[round(PRSdata$R2, digits = 3) != 0] <-
PRSdataround(PRSdata$R2[round(PRSdata$R2, digits = 3) != 0], digits = 3)
$print.R2[round(PRSdata$R2, digits = 3) == 0] <-
PRSdataformat(PRSdata$R2[round(PRSdata$R2, digits = 3) == 0], digits = 3)
$print.R2 <- sub("e", "*x*10^", PRSdata$R2)
PRSdata
$print.R2
PRSdata
=c("ADHD",
loopname"Alcohol Dependence Disorder",
"Bipolar Disorder",
"Lifetime Cannabis Use",
"Major Depressive Disorder",
"Schizophrenia")
<- theme_bw()+
theme_sam theme(axis.title=element_text(face="bold", size=18),
axis.text=element_text(size=14),
legend.title=element_text(face="bold", size=18),
legend.text=element_text(size=14),
axis.text.x=element_text(angle=45, hjust=1),
panel.grid = element_blank(),
panel.border = element_blank(),
axis.line = element_line(),
plot.title = element_text(size=18, face="bold",hjust = 0.6)
)
for (i in seq(from=1,to=length(loopname),by=1))
{=PRSdata[PRSdata$group==loopname[i],]
PRSdata2
$names <- factor(PRSdata2$names, as.character(PRSdata2$names))
PRSdata2
=ggplot(PRSdata2, aes(x=factor(names),y=R2))+
imagegeom_text(aes(label=paste(print.p)),vjust=-1.5,hjust=0,angle=45,cex=4,parse=T)+
+
theme_samscale_y_continuous(limits=c(0,0.0046))+
geom_bar(aes(fill = -log10(`p-value`)), stat = "identity")+
xlab(expression(italic(P) - value ~ threshold ~ (italic(P)[T]))) +
ylab(expression(paste("PRS model fit: ", R ^ 2)))+
scale_fill_gradient(
low = "#000099",
high = "#990010",
name = bquote(atop(-log[10] ~ model, italic(P) - value)),
limit = c(0,40))+
ggtitle(loopname[i])
ggsave(image,filename=paste("PRS_plot",loopname[i],".png",sep=""))
}