Simulation of power for local ancestry analysis
args <- commandArgs(TRUE)
scriptno <- args[1]
scriptno <- as.numeric(scriptno)
ncore=7 #Assume that node has this many cores
Ngroups=1
fineness=.005
efseq1 <- seq(1.05,1.3,by=fineness)
nprobes_job=ceiling(length(efseq1)/ncore) #Number of effect estimates per cpu
probestart=(scriptno-1) * nprobes_job + 1
probestop=(scriptno) * nprobes_job
print(probestart)
print(probestop)
#https://www.biorxiv.org/content/early/2018/09/18/014001.full.pdf
library(lmtest)
#simulate a snp with allele frequency 20%
#simulate that this SNP has an effect 1.2 in pop 1, 1.1 in pop 2.
#simualte admixture of this population
# For an individual:
# Assign ancestry based on admixture proportion (binomial distribution), with admixture proportion prior (0 ,.25,.5,.75,1) - perhaps just need .75?
# Assume equal allele frequency f in ancestral population 1,
# sample allele types based on ancestry
# generate phenotype value using a binomial distribution #I think this is the critical flaw assuming polygenicity?
#Simulated a locus for a population of admixed individuals
#Simualted overall admixture from a beta distribution with shape parameters shape1=7.76 ,shape2= 2.17, based on fitdist of our overall AAM population.
#Ancestry at the ordered haplotype of the locus was drawn from a binomial distribution, with p=the subject's overall admixture proportion.
#Genotype at each haplotype of the locus drawn from a binomial distribution with p = MAF. In this case, MAF = 20%.
#Calculated for each individual the number of copies of the risk allele coming from each ancestry background. I.e. this is the product of ancestral haplotype and indid
#Following the model design in asaMap, .
#Simulated a phenotype with 10% prevalence in the allele-less population, with a 1.3 odds ratio in African Americans log additive effect, no effect in Europeans, equal allele frequencies between groups,
#Disease status drawn from a binomial distribution with p=probability of disease
#sampled from population until found requisite N of cases and controls.
prevalence=.1 #i.e. risk of disease even without risk SNP
intercept=-log(1/prevalence - 1) #Translate this into a logistic model intercept
#split sequence into nproc parts
efseq=na.omit(efseq1[probestart:probestop])
powergivenn <- matrix(ncol=3,nrow=length(efseq)*Ngroups)
row.names(powergivenn) <- paste(c(12000),sort(rep(efseq,Ngroups)),sep="_")
j=1
for (effectsize in efseq)
{
k=1
for (N in c(12000)) # ,12000
{
nrep=1000
powersim <- matrix(ncol=3,nrow=nrep)
for (i in 1:nrep)
{
#Print for every 100 reps
if ((i %% 100) == 0)
{
print(i)
}
individuals <- as.data.frame(matrix(ncol=6,nrow=N*8))
names(individuals) <- c("Admixture","Ancestry1","Ancestry2","Allele1","Allele2","Phenotype")
#Draw admixture from a beta distribution following our sample admixture
individuals$Admixture <- rbeta(n=length(individuals$Admixture), shape1=7.75815060 ,shape2= 2.16809238) # runif(n=length(individuals$Admixture),.5,1) #Make a gradient following a uniform distribution. Can alter models later.
#system.time(for (kk in 1:100){ rbeta(n=length(individuals$Admixture), shape1=7.75815060 ,shape2= 2.16809238)})
#system.time(for (kk in 1:100){ rnorm(n=length(individuals$Admixture), mean=.78,sd=.12)})
#admixes <- rnorm(n=length(individuals$Admixture), mean=.78,sd=.12)
#But censor it so no values < .1 occur
try(individuals[which(individuals$Admixture < .1),]$Admixture <- .1,silent=TRUE)
#Assign ordered ancestry haplotypes based on probability of admixture
individuals$AncestryHap1 <- rbinom(n=length(individuals$Admixture),size=1,p=individuals$Admixture)
individuals$AncestryHap2 <- rbinom(n=length(individuals$Admixture),size=1,p=individuals$Admixture)
#Assign ordered alleles
#Code for giving them 10% prob of allele if african and 30% if european.
individuals$Allele1Freq <- as.numeric(!sign(individuals$AncestryHap1))*.2 + .1
individuals$Allele2Freq <- as.numeric(!sign(individuals$AncestryHap2))*.2 + .1
individuals$AlleleHap1 <- rbinom(n=length(individuals$Admixture),size=1,p=individuals$Allele1Freq)
individuals$AlleleHap2 <- rbinom(n=length(individuals$Admixture),size=1,p=individuals$Allele2Freq)
#Calculate number of copies of the risk allele on ancestry background A
individuals$N_copies_ancA_hap1 <- individuals$AncestryHap1 * individuals$AlleleHap1
individuals$N_copies_ancA_hap2 <- individuals$AncestryHap2 * individuals$AlleleHap2
individuals$N_copies_ancA <- individuals$N_copies_ancA_hap1 + individuals$N_copies_ancA_hap2
#Calculate number of copies of the risk allele on ancestry background B
#Since I'm using a negation (!) code, it is recoded TRUE/FALSE. With logic, FALSE*FALSE = TRUE, which I don't want, I want it to be FALSE, so I convert back to 0/1
individuals$N_copies_ancB_hap1 <- sign(!(individuals$AncestryHap1)) * individuals$AlleleHap1
individuals$N_copies_ancB_hap2 <- sign(!(individuals$AncestryHap2)) * individuals$AlleleHap2
individuals$N_copies_ancB <- individuals$N_copies_ancB_hap1 + individuals$N_copies_ancB_hap2
#Calculate total N copies of the risk allele
individuals$N_copies <- individuals$AlleleHap1 + individuals$AlleleHap2
#table(individuals$N_copies)
#table(individuals$N_copies_ancA,individuals$N_copies_ancB) #Upper triangle should be nonzeros, i.e. there should be people with 0 copies from both, etc.
#For every individual,
relative_risk_anca=log(effectsize)
relative_risk_ancb=0 #
admixture_risk=.5
eabx=intercept + admixture_risk*individuals$Admixture + individuals$N_copies_ancA*relative_risk_anca + individuals$N_copies_ancB*relative_risk_ancb # + rnorm(length(individuals$Admixture),sd=sqrt(5)) # + individuals$Admixture*admixture_risk
#Prevalence needs to be adjusted so that its the population prevalence, i.e. individuals without risk allele?
#in other words, must be set so that the even when trait are all 0, probabiltiy of disease is the prevalence, this is the intercept i calculated at -2.19
#Individual probabiltiy of disease
pdis <- 1/(1+exp(-eabx))
#Draw phenotypes
individuals$Phenotype <- rbinom(n=length(individuals$Admixture),size=1, p=pdis) #Should this even be drawn? or is risk of disease anyway here? following a liability threshold...
#resample individuals to have cases and 2.5 controls for eachcase
cases <- which(individuals$Phenotype == 1)
controls <- which(individuals$Phenotype == 0)
nmult <- N*2.5
casepick <- cases[1:N]
conpick <- controls[1:nmult]
#casepick <- sample(cases,N,replace=F)
#conpick <- sample(controls,nmult,replace=F)
#check system time of 100 rounds of resampling vs just shuffling
# system.time(for (k in 1:1000){ sample(controls,nmult,,replace=F)})
#system.time(for (k in 1:1000){ c(1:12000)}) #1/10 the time to just take the first 10000, which are arbitrary anyway...
individuals2 <- individuals[c(casepick,conpick),]
#Model risk
m1 <- glm(Phenotype ~ N_copies_ancA + N_copies_ancB + Admixture,data=individuals2,family='binomial')
m1a <- glm(Phenotype ~ N_copies + Admixture,data=individuals2,family='binomial')
m2 <- glm(Phenotype ~ Admixture,data=individuals2,family='binomial')
powersim[i,1] <- lrtest(m1,m2)[2,5]
#powersim[i,2] <- summary(m1)$coefficients[2,4]
powersim[i,3] <- lrtest(m1a,m2)[2,5]
}
powergivenn [(((k-1)*Ngroups) + j),1] <- length(which(powersim[,1] < 5e-8))
powergivenn [(((k-1)*Ngroups) + j),3] <- length(which(powersim[,3] < 5e-8))
j=j+1
}
k=k+1
}
write.table(powergivenn,file=paste('powergivennmaf13alt_',scriptno,'.txt',sep=''))
echo "asamap null standard" > header.txt
cat header.txt powergivenn_*.txt | grep -v V1 > power_givenn.txt
cat header.txt powergivennmaf13_*.txt | grep -v V1 > power_givennmaf13.txt
cat header.txt powergivennmaf24_*.txt | grep -v V1 > power_givennmaf24.txt
cat header.txt powergivennmaf13alt_*.txt | grep -v V1 > power_givennmaf13alt.txt
powergivenn <- read.table('power_givenn.txt',header=T)
powergivenn4000 <- powergivenn[seq(1,dim(powergivenn)[1],by=2),]
powergivenn12000 <- powergivenn[seq(2,dim(powergivenn)[1],by=2),]
powergivenn24 <- read.table('power_givennmaf24.txt',header=T)
powergivenn13 <- read.table('power_givennmaf13alt.txt',header=T)
#get rid of middle people, do 10% and 30% maf per group
#wesanderson::wes_palette
fineness=.005
unlist_split2 <- function(x, colnum ,...)
{
toret <- unlist(strsplit(x, ...) )[colnum]
return(toret)
}
efseq <- as.numeric(sapply(row.names(powergivenn4000),unlist_split2,colnum=2,split="_") )
# pdf('powersimg2_2020.pdf',8,4)
# plot(efseq,powergivenn4000[,1]/10,type='l',lwd=2,cex.axis=1.25,cex.lab=1.45,xlab="Odds Ratio", ylab="Power",lty=2,ylim=c(0,100))
# lines(efseq,powergivenn4000[,3]/10,type='l',lwd=2,col='darkgrey',lty=1)
# lines(efseq,powergivenn12000[,1]/10,type='l',lwd=2,col='blue',lty=2)
# lines(efseq,powergivenn12000[,3]/10,type='l',lwd=2,col='lightblue',lty=1)
# legend("bottomright",legend=c( "12000 Cases, asaMap", "12000 Cases, standard","4000 Cases, asaMap", "4000 Cases, standard"), col=rep(c("blue","lightblue",'black','darkgrey'),3),lty=c(2,1,2,1),bty='n')
# #Remove the border
# dev.off()
# pdf('powersimg2_2040.pdf',8,4)
# plot(efseq,powergivenn24[,1]/10,type='l',lwd=2,col='red',lty=2,xlab="Odds Ratio", ylab="Power",)
# lines(efseq,powergivenn24[,3]/10,type='l',lwd=2,col='pink',lty=1)
# legend("bottomright",legend=c("12000 Cases, asaMap","12000 Cases, standard" ), col=rep(c('red','pink'),3),lty=c(2,1),bty='n')
# dev.off()
#Graph w/o red line
pdf('powersimg2_scen1.pdf',5.5,4.5)
plot(efseq,powergivenn4000[,1]/10,type='l',lwd=2,cex.axis=1.15,cex.lab=1.45,xlab="Odds Ratio", ylab="Power",lty=2,ylim=c(0,110),yaxt='n',bty='l')
abline(a=80,b=0,lty=3)
axis(2,c(0,20,40,60,80,100),cex.axis=1.15) #jesus christ you fuck jkust tra
lines(efseq,powergivenn4000[,3]/10,type='l',lwd=2,col='black',lty=1)
lines(efseq,powergivenn12000[,1]/10,type='l',lwd=2,col='blue',lty=2)
lines(efseq,powergivenn12000[,3]/10,type='l',lwd=2,col='blue',lty=1)
lines(efseq,powergivenn13[,1]/10,type='l',lwd=2,col='green',lty=2)
lines(efseq,powergivenn13[,3]/10,type='l',lwd=2,col='green',lty=1)
legend("topleft",legend=c("12000 Cases, LANC, MAF20%","12000 Cases, GLOB" ,"12000 Cases, LANC, MAF10% AFR, 30% EUR" ,"12000 Cases, GLOB" ,"4000 Cases, LANC, MAF20%" ,"4000 Cases, GLOB" ), col=rep(c('blue','blue','green','green','black','black'),3),lty=rep(c(2,1),3),bty='n',cex=.5, pt.cex = .4)
dev.off()
#Graph with black and blue
pdf('powersimg2_scen2.pdf',5.5,4.5)
plot(efseq,powergivenn4000[,1]/10,type='l',lwd=2,cex.axis=1.15,cex.lab=1.45,xlab="Odds Ratio", ylab="Power",lty=2,ylim=c(0,110),yaxt='n',bty='l')
abline(a=80,b=0,lty=3)
axis(2,c(0,20,40,60,80,100),cex.axis=1.15) #jesus christ you fuck jkust tra
lines(efseq,powergivenn4000[,3]/10,type='l',lwd=2,col='black',lty=1)
lines(efseq,powergivenn12000[,1]/10,type='l',lwd=2,col='blue',lty=2)
lines(efseq,powergivenn12000[,3]/10,type='l',lwd=2,col='blue',lty=1)
legend("topleft",legend=c("12000 Cases, LANC, MAF20%","12000 Cases, GLOB" ,"4000 Cases, LANC, MAF20%" ,"4000 Cases, GLOB" ), col=rep(c('blue','blue','black','black'),3),lty=rep(c(2,1),2),bty='n',cex=.5, pt.cex = .4)
dev.off()
#Graph with red and green
pdf('powersimg2_scen3.pdf',5.5,4.5)
plot(efseq,powergivenn4000[,1]/10,type='l',lwd=2,cex.axis=1.15,cex.lab=1.45,xlab="Odds Ratio", ylab="Power",lty=2,ylim=c(0,110),yaxt='n',bty='l',col='white')
axis(2,c(0,20,40,60,80,100),cex.axis=1.15)
abline(a=80,b=0,lty=3)
lines(efseq,powergivenn24[,1]/10,type='l',lwd=2,col='red',lty=2)
lines(efseq,powergivenn24[,3]/10,type='l',lwd=2,col='red',lty=1)
lines(efseq,powergivenn13[,1]/10,type='l',lwd=2,col='green',lty=2)
lines(efseq,powergivenn13[,3]/10,type='l',lwd=2,col='green',lty=1)
legend("topleft",legend=c("12000 Cases, LANC, MAF20% AFR, 40% EUR","12000 Cases, GLOB" ,"12000 Cases, LANC, MAF10% AFR, 30% EUR" ,"12000 Cases, GLOB" ), col=rep(c('red','red','green','green'),3),lty=rep(c(2,1),2),bty='n',cex=.5, pt.cex = .4)
dev.off()
#graph all
pdf('powersimg2_202020401030.pdf',5.5,4.5)
plot(efseq,powergivenn4000[,1]/10,type='l',lwd=2,cex.axis=1.15,cex.lab=1.45,xlab="Odds Ratio", ylab="Power",lty=2,ylim=c(0,110),yaxt='n',bty='l')
axis(2,c(0,20,40,60,80,100),cex.axis=1.15) #jesus christ you fuck jkust tra
lines(efseq,powergivenn4000[,3]/10,type='l',lwd=2,col='black',lty=1)
abline(h=0)
lines(efseq,powergivenn12000[,1]/10,type='l',lwd=2,col='blue',lty=2)
lines(efseq,powergivenn12000[,3]/10,type='l',lwd=2,col='blue',lty=1)
lines(efseq,powergivenn24[,1]/10,type='l',lwd=2,col='red',lty=2)
lines(efseq,powergivenn24[,3]/10,type='l',lwd=2,col='red',lty=1)
lines(efseq,powergivenn13[,1]/10,type='l',lwd=2,col='green',lty=2)
lines(efseq,powergivenn13[,3]/10,type='l',lwd=2,col='green',lty=1)
legend("topleft",legend=c("12000 Cases, asaMap","12000 Cases, standard" ), col=rep(c('red','pink'),3),lty=c(2,1),bty='n')
dev.off()
#!/bin/bash
#wd=$(pwd)
#qsub simulate_power.sh -lwalltime=3:00:00 -d $wd -e errandout/ -o errandout/ -F "-n 7"
while getopts n: option
do
case "${option}"
in
n) nodeuse=${OPTARG};;
esac
done
module load R
#Write the start and stop points of the file
jstart=1 #changed jstart to 11 to finish last job
jstop=$nodeuse
for j in $(seq -w $jstart 1 $jstop)
do
Rscript /mnt/sdb/genetics/pgc_ptsd_grant_resub/lanc_simulation_v5_mafdif.txt $j &
done
wait