nievergeltlab
10/1/2018 - 6:11 PM

Local Ancestry Power (OLD VERSION)

Simulation of power for local ancestry analysis

args <- commandArgs(TRUE)
scriptno <- args[1]

scriptno <- as.numeric(scriptno)
ncore=7 #Assume that node has this many cores

Ngroups=1
fineness=.005
efseq1 <- seq(1.05,1.3,by=fineness)

nprobes_job=ceiling(length(efseq1)/ncore) #Number of effect estimates per cpu

probestart=(scriptno-1)  * nprobes_job + 1
probestop=(scriptno)  * nprobes_job 
print(probestart)
print(probestop)

#https://www.biorxiv.org/content/early/2018/09/18/014001.full.pdf
library(lmtest)

#simulate a snp with allele frequency 20%
#simulate that this SNP has an effect 1.2 in pop 1, 1.1 in pop 2.
#simualte admixture of this population

# For an individual:
# Assign ancestry based on admixture proportion (binomial distribution), with admixture proportion prior (0 ,.25,.5,.75,1) - perhaps just need .75?
# Assume equal allele frequency f in ancestral population 1, 
# sample allele types based on ancestry
# generate phenotype value using a binomial distribution #I think this is the critical flaw assuming polygenicity?

#Simulated a locus for a population of admixed individuals
#Simualted overall admixture  from a beta distribution with shape parameters shape1=7.76 ,shape2= 2.17, based on fitdist of our overall AAM population.
#Ancestry at the ordered haplotype of the locus was drawn from a binomial distribution, with p=the subject's overall admixture proportion.
#Genotype at each haplotype of the locus drawn from a binomial distribution with p = MAF. In this case, MAF = 20%.
#Calculated for each individual the number of copies of the risk allele coming from each ancestry background. I.e. this is the product of ancestral haplotype and indid

#Following the model design in asaMap, .
#Simulated a phenotype with 10% prevalence in the allele-less population, with a 1.3 odds ratio in African Americans log additive effect, no effect in Europeans, equal allele frequencies between groups,
#Disease status drawn from a binomial distribution with p=probability of disease
#sampled from population until found requisite N of cases and controls.

prevalence=.1 #i.e. risk of disease even without risk SNP
intercept=-log(1/prevalence - 1) #Translate this into a logistic model intercept



#split sequence into nproc parts
efseq=na.omit(efseq1[probestart:probestop])
powergivenn <- matrix(ncol=3,nrow=length(efseq)*Ngroups)

row.names(powergivenn) <- paste(c(12000),sort(rep(efseq,Ngroups)),sep="_")


j=1
for (effectsize in efseq)
{
 k=1
 for (N in c(12000)) # ,12000
 {
  nrep=1000
  powersim <- matrix(ncol=3,nrow=nrep)
  for (i in 1:nrep)
  {
   #Print for every 100 reps
   if ((i %% 100) == 0)
   {
    print(i)
   }

   individuals <- as.data.frame(matrix(ncol=6,nrow=N*8))
   names(individuals) <- c("Admixture","Ancestry1","Ancestry2","Allele1","Allele2","Phenotype")
   #Draw admixture from a beta distribution following our sample admixture
   individuals$Admixture <- rbeta(n=length(individuals$Admixture), shape1=7.75815060  ,shape2= 2.16809238) # runif(n=length(individuals$Admixture),.5,1) #Make a gradient following a uniform distribution. Can alter models later.
   #system.time(for (kk in 1:100){ rbeta(n=length(individuals$Admixture), shape1=7.75815060  ,shape2= 2.16809238)})
   #system.time(for (kk in 1:100){ rnorm(n=length(individuals$Admixture), mean=.78,sd=.12)})
   #admixes <- rnorm(n=length(individuals$Admixture), mean=.78,sd=.12)
   
   #But censor it so no values < .1 occur
   try(individuals[which(individuals$Admixture < .1),]$Admixture <- .1,silent=TRUE)
   
   #Assign ordered ancestry haplotypes based on probability of admixture
   individuals$AncestryHap1 <- rbinom(n=length(individuals$Admixture),size=1,p=individuals$Admixture) 
   individuals$AncestryHap2 <- rbinom(n=length(individuals$Admixture),size=1,p=individuals$Admixture) 

   #Assign ordered alleles 
   #Code for giving them 10% prob of allele if african and 30% if european. 
   individuals$Allele1Freq <- as.numeric(!sign(individuals$AncestryHap1))*.2 + .1
   individuals$Allele2Freq <- as.numeric(!sign(individuals$AncestryHap2))*.2 + .1
    
   individuals$AlleleHap1   <- rbinom(n=length(individuals$Admixture),size=1,p=individuals$Allele1Freq) 
   individuals$AlleleHap2   <- rbinom(n=length(individuals$Admixture),size=1,p=individuals$Allele2Freq)

   #Calculate number of copies of the risk allele on ancestry background A
   individuals$N_copies_ancA_hap1 <- individuals$AncestryHap1 * individuals$AlleleHap1 
   individuals$N_copies_ancA_hap2 <- individuals$AncestryHap2 * individuals$AlleleHap2 
   individuals$N_copies_ancA <- individuals$N_copies_ancA_hap1 + individuals$N_copies_ancA_hap2 

   #Calculate number of copies of the risk allele on ancestry background B
   #Since I'm using a negation (!) code, it is recoded TRUE/FALSE. With logic, FALSE*FALSE = TRUE, which I don't want, I want it to be FALSE, so I convert back to 0/1
   individuals$N_copies_ancB_hap1 <- sign(!(individuals$AncestryHap1)) * individuals$AlleleHap1 
   individuals$N_copies_ancB_hap2 <- sign(!(individuals$AncestryHap2)) * individuals$AlleleHap2 
   individuals$N_copies_ancB <- individuals$N_copies_ancB_hap1 + individuals$N_copies_ancB_hap2 

   #Calculate total N copies of the risk allele
   individuals$N_copies <- individuals$AlleleHap1  + individuals$AlleleHap2

   #table(individuals$N_copies)
   #table(individuals$N_copies_ancA,individuals$N_copies_ancB) #Upper triangle should be nonzeros, i.e. there should be people with 0 copies from both, etc. 

   #For every individual,

   relative_risk_anca=log(effectsize) 
   relative_risk_ancb=0 #
   admixture_risk=.5
   eabx=intercept + admixture_risk*individuals$Admixture  +  individuals$N_copies_ancA*relative_risk_anca + individuals$N_copies_ancB*relative_risk_ancb # + rnorm(length(individuals$Admixture),sd=sqrt(5)) # + individuals$Admixture*admixture_risk 
   #Prevalence needs to be adjusted so that its the population prevalence, i.e. individuals without risk allele?
   #in other words, must be set so that the even when trait are all 0, probabiltiy of disease is the prevalence, this is the intercept i calculated at -2.19
   
   #Individual probabiltiy of disease
   pdis <- 1/(1+exp(-eabx))
   
   #Draw phenotypes
   individuals$Phenotype  <- rbinom(n=length(individuals$Admixture),size=1, p=pdis) #Should this even be drawn? or is risk of disease anyway here? following a liability threshold...

   #resample individuals to have cases and 2.5 controls for eachcase
   cases <- which(individuals$Phenotype == 1)
   controls <- which(individuals$Phenotype == 0)
   nmult <- N*2.5
   casepick <- cases[1:N]
   conpick <- controls[1:nmult]
   
   #casepick <- sample(cases,N,replace=F)
   #conpick <- sample(controls,nmult,replace=F)
   #check system time of 100 rounds of resampling vs just shuffling
  # system.time(for (k in 1:1000){ sample(controls,nmult,,replace=F)}) 
   #system.time(for (k in 1:1000){ c(1:12000)}) #1/10 the time to just take the first 10000, which are arbitrary anyway...
   
   individuals2 <- individuals[c(casepick,conpick),]
   
   #Model risk
   m1  <- glm(Phenotype ~ N_copies_ancA + N_copies_ancB + Admixture,data=individuals2,family='binomial')
   m1a <- glm(Phenotype ~ N_copies + Admixture,data=individuals2,family='binomial')
   m2  <- glm(Phenotype ~ Admixture,data=individuals2,family='binomial')


   powersim[i,1] <- lrtest(m1,m2)[2,5]
   #powersim[i,2] <- summary(m1)$coefficients[2,4]
   powersim[i,3] <- lrtest(m1a,m2)[2,5]
  }

  powergivenn [(((k-1)*Ngroups) + j),1] <- length(which(powersim[,1] < 5e-8))
  powergivenn [(((k-1)*Ngroups) + j),3] <- length(which(powersim[,3] < 5e-8))

  j=j+1
 }
 k=k+1
}

write.table(powergivenn,file=paste('powergivennmaf13alt_',scriptno,'.txt',sep=''))
echo "asamap null standard" > header.txt
cat header.txt powergivenn_*.txt | grep -v V1 > power_givenn.txt
cat header.txt powergivennmaf13_*.txt | grep -v V1 > power_givennmaf13.txt
cat header.txt powergivennmaf24_*.txt | grep -v V1 > power_givennmaf24.txt

cat header.txt powergivennmaf13alt_*.txt | grep -v V1 > power_givennmaf13alt.txt


powergivenn <- read.table('power_givenn.txt',header=T)
powergivenn4000 <- powergivenn[seq(1,dim(powergivenn)[1],by=2),]
powergivenn12000 <- powergivenn[seq(2,dim(powergivenn)[1],by=2),]


powergivenn24 <- read.table('power_givennmaf24.txt',header=T)

powergivenn13 <- read.table('power_givennmaf13alt.txt',header=T)


#get rid of middle people, do 10% and 30% maf per group

#wesanderson::wes_palette
fineness=.005

 unlist_split2 <- function(x, colnum ,...)
 {
     toret <- unlist(strsplit(x, ...) )[colnum]
     return(toret)
 }

efseq <- as.numeric(sapply(row.names(powergivenn4000),unlist_split2,colnum=2,split="_") )


# pdf('powersimg2_2020.pdf',8,4)

# plot(efseq,powergivenn4000[,1]/10,type='l',lwd=2,cex.axis=1.25,cex.lab=1.45,xlab="Odds Ratio", ylab="Power",lty=2,ylim=c(0,100))
# lines(efseq,powergivenn4000[,3]/10,type='l',lwd=2,col='darkgrey',lty=1)



# lines(efseq,powergivenn12000[,1]/10,type='l',lwd=2,col='blue',lty=2)
# lines(efseq,powergivenn12000[,3]/10,type='l',lwd=2,col='lightblue',lty=1)

# legend("bottomright",legend=c( "12000 Cases, asaMap", "12000 Cases, standard","4000 Cases, asaMap", "4000 Cases, standard"), col=rep(c("blue","lightblue",'black','darkgrey'),3),lty=c(2,1,2,1),bty='n')
# #Remove the border
# dev.off()



# pdf('powersimg2_2040.pdf',8,4)

# plot(efseq,powergivenn24[,1]/10,type='l',lwd=2,col='red',lty=2,xlab="Odds Ratio", ylab="Power",)
# lines(efseq,powergivenn24[,3]/10,type='l',lwd=2,col='pink',lty=1)
# legend("bottomright",legend=c("12000 Cases, asaMap","12000 Cases, standard" ), col=rep(c('red','pink'),3),lty=c(2,1),bty='n')

# dev.off()



#Graph w/o red line
pdf('powersimg2_scen1.pdf',5.5,4.5)

plot(efseq,powergivenn4000[,1]/10,type='l',lwd=2,cex.axis=1.15,cex.lab=1.45,xlab="Odds Ratio", ylab="Power",lty=2,ylim=c(0,110),yaxt='n',bty='l')

abline(a=80,b=0,lty=3)
axis(2,c(0,20,40,60,80,100),cex.axis=1.15) #jesus christ you fuck jkust tra
lines(efseq,powergivenn4000[,3]/10,type='l',lwd=2,col='black',lty=1)

lines(efseq,powergivenn12000[,1]/10,type='l',lwd=2,col='blue',lty=2)
lines(efseq,powergivenn12000[,3]/10,type='l',lwd=2,col='blue',lty=1)

lines(efseq,powergivenn13[,1]/10,type='l',lwd=2,col='green',lty=2)
lines(efseq,powergivenn13[,3]/10,type='l',lwd=2,col='green',lty=1)

legend("topleft",legend=c("12000 Cases, LANC, MAF20%","12000 Cases, GLOB" ,"12000 Cases, LANC, MAF10% AFR, 30% EUR" ,"12000 Cases, GLOB" ,"4000 Cases, LANC, MAF20%" ,"4000 Cases, GLOB" ), col=rep(c('blue','blue','green','green','black','black'),3),lty=rep(c(2,1),3),bty='n',cex=.5, pt.cex = .4)

dev.off()



#Graph with black and blue
pdf('powersimg2_scen2.pdf',5.5,4.5)

plot(efseq,powergivenn4000[,1]/10,type='l',lwd=2,cex.axis=1.15,cex.lab=1.45,xlab="Odds Ratio", ylab="Power",lty=2,ylim=c(0,110),yaxt='n',bty='l')

abline(a=80,b=0,lty=3)
axis(2,c(0,20,40,60,80,100),cex.axis=1.15) #jesus christ you fuck jkust tra
lines(efseq,powergivenn4000[,3]/10,type='l',lwd=2,col='black',lty=1)

lines(efseq,powergivenn12000[,1]/10,type='l',lwd=2,col='blue',lty=2)
lines(efseq,powergivenn12000[,3]/10,type='l',lwd=2,col='blue',lty=1)


legend("topleft",legend=c("12000 Cases, LANC, MAF20%","12000 Cases, GLOB" ,"4000 Cases, LANC, MAF20%" ,"4000 Cases, GLOB" ), col=rep(c('blue','blue','black','black'),3),lty=rep(c(2,1),2),bty='n',cex=.5, pt.cex = .4)

dev.off()

#Graph with red and green

pdf('powersimg2_scen3.pdf',5.5,4.5)

plot(efseq,powergivenn4000[,1]/10,type='l',lwd=2,cex.axis=1.15,cex.lab=1.45,xlab="Odds Ratio", ylab="Power",lty=2,ylim=c(0,110),yaxt='n',bty='l',col='white')
axis(2,c(0,20,40,60,80,100),cex.axis=1.15) 

abline(a=80,b=0,lty=3)
lines(efseq,powergivenn24[,1]/10,type='l',lwd=2,col='red',lty=2)
lines(efseq,powergivenn24[,3]/10,type='l',lwd=2,col='red',lty=1)

lines(efseq,powergivenn13[,1]/10,type='l',lwd=2,col='green',lty=2)
lines(efseq,powergivenn13[,3]/10,type='l',lwd=2,col='green',lty=1)

legend("topleft",legend=c("12000 Cases, LANC, MAF20% AFR, 40% EUR","12000 Cases, GLOB" ,"12000 Cases, LANC, MAF10% AFR, 30% EUR" ,"12000 Cases, GLOB" ), col=rep(c('red','red','green','green'),3),lty=rep(c(2,1),2),bty='n',cex=.5, pt.cex = .4)

dev.off()




#graph all 
pdf('powersimg2_202020401030.pdf',5.5,4.5)

plot(efseq,powergivenn4000[,1]/10,type='l',lwd=2,cex.axis=1.15,cex.lab=1.45,xlab="Odds Ratio", ylab="Power",lty=2,ylim=c(0,110),yaxt='n',bty='l')
axis(2,c(0,20,40,60,80,100),cex.axis=1.15) #jesus christ you fuck jkust tra
lines(efseq,powergivenn4000[,3]/10,type='l',lwd=2,col='black',lty=1)
abline(h=0)

lines(efseq,powergivenn12000[,1]/10,type='l',lwd=2,col='blue',lty=2)
lines(efseq,powergivenn12000[,3]/10,type='l',lwd=2,col='blue',lty=1)

lines(efseq,powergivenn24[,1]/10,type='l',lwd=2,col='red',lty=2)
lines(efseq,powergivenn24[,3]/10,type='l',lwd=2,col='red',lty=1)

lines(efseq,powergivenn13[,1]/10,type='l',lwd=2,col='green',lty=2)
lines(efseq,powergivenn13[,3]/10,type='l',lwd=2,col='green',lty=1)
legend("topleft",legend=c("12000 Cases, asaMap","12000 Cases, standard" ), col=rep(c('red','pink'),3),lty=c(2,1),bty='n')


dev.off()

#!/bin/bash

#wd=$(pwd)
#qsub simulate_power.sh -lwalltime=3:00:00 -d $wd -e errandout/ -o errandout/ -F "-n 7"

while getopts n: option
do
  case "${option}"
    in
      n) nodeuse=${OPTARG};;
    esac
done

 module load R
 
 #Write the start and stop points of the file
 jstart=1 #changed jstart to 11 to finish last job
 jstop=$nodeuse

 for j in $(seq -w $jstart 1 $jstop) 
 do
  Rscript /mnt/sdb/genetics/pgc_ptsd_grant_resub/lanc_simulation_v5_mafdif.txt $j &
 done

wait