nievergeltlab of Simulations
7/11/2017 - 6:25 PM

Simulate best guess genotype cutoff thresholds to determine how well correlated they are to true genotypes, and if they will return comparab

Simulate best guess genotype cutoff thresholds to determine how well correlated they are to true genotypes, and if they will return comparably powered test statistics.

Results show that at a cutoff of .8, analytic results are nearly the same as with dosages

#info test
#From Marchini 2010


#Simulate an allele, then simulate a penalty factor. 
#E.g. someone with 2 copies has a penalized genotype (worst case scenario) of 
#(1-threshold)*0+ 0*1 + threshold*2 = 1.6
#Someone with 1 copies has a penalized genotype of 
# threshold*0 + 0.8*1 + (1-threshold)*2 #can be either 1.2 or .8, either is .2 off from the expectation of 1.
#Someone with 0 copies has a maximum penalized genotype of .4 
# threshold*0 + 0*1 + (1-threshold)*2

threshlist=seq(.5,1,.01)
threshout <- rep(NA,length(threshlist))
ts1 <-  rep(NA,length(threshlist))
ts2 <-  rep(NA,length(threshlist))

i=1
for (threshes in threshlist)
{
thresh=threshes
nsubs=10000
truemaf=.4
truegeno <- rbinom(nsubs, 2,p=truemaf) # True genotypes


#Penalize true genotypes (penalizing randomly)
truegeno2 <- truegeno
truegeno2[which(truegeno ==2)] <- sample( c((1-thresh)/2 + thresh*2,(1-thresh) + thresh*2, thresh*2),size=length(truegeno2[which(truegeno ==2)]),replace=TRUE)
truegeno2[which(truegeno ==1)] <- sample( c(thresh, thresh + (1-thresh)*2),size=length(truegeno2[which(truegeno ==1)]),replace=TRUE)
truegeno2[which(truegeno ==0)] <- sample( c((1-thresh), (1-thresh)/2 + (1-thresh),  (1-thresh)*2),size=length(truegeno2[which(truegeno ==0)]),replace=TRUE)

newpheno <- truegeno + rnorm(nsubs,sd=5)

 ts1[i] <- summary(lm(newpheno ~ truegeno))$coefficients[2,3]

ts2[i] <-  summary(lm(newpheno ~ truegeno2))$coefficients[2,3]

 
expectation <- truegeno2
mafe <- mean(expectation)/2

r2 <- var(expectation) / (2*mafe*(1-mafe))
r2

threshout[i] <- cor.test(truegeno,truegeno2)$estimate #info scores, if that calculation is right, dont matter -- the measure is 95% identical if the threshold is .8

i=i+1
}

plot(threshlist,threshout,type='l',lwd=2,xlab="Probability Threshold",ylab="Pearson Correlation of Imputed and Genotyped")

plot(threshlist,ts1,type='l',lwd=2,xlab="Probability Threshold",ylab="Test statistic",ylim=c(0,15))
lines(threshlist,ts2,type='l',lwd=2,col="blue")