Simulate best guess genotype cutoff thresholds to determine how well correlated they are to true genotypes, and if they will return comparably powered test statistics.
Results show that at a cutoff of .8, analytic results are nearly the same as with dosages
#info test
#From Marchini 2010
#Simulate an allele, then simulate a penalty factor.
#E.g. someone with 2 copies has a penalized genotype (worst case scenario) of
#(1-threshold)*0+ 0*1 + threshold*2 = 1.6
#Someone with 1 copies has a penalized genotype of
# threshold*0 + 0.8*1 + (1-threshold)*2 #can be either 1.2 or .8, either is .2 off from the expectation of 1.
#Someone with 0 copies has a maximum penalized genotype of .4
# threshold*0 + 0*1 + (1-threshold)*2
threshlist=seq(.5,1,.01)
threshout <- rep(NA,length(threshlist))
ts1 <- rep(NA,length(threshlist))
ts2 <- rep(NA,length(threshlist))
i=1
for (threshes in threshlist)
{
thresh=threshes
nsubs=10000
truemaf=.4
truegeno <- rbinom(nsubs, 2,p=truemaf) # True genotypes
#Penalize true genotypes (penalizing randomly)
truegeno2 <- truegeno
truegeno2[which(truegeno ==2)] <- sample( c((1-thresh)/2 + thresh*2,(1-thresh) + thresh*2, thresh*2),size=length(truegeno2[which(truegeno ==2)]),replace=TRUE)
truegeno2[which(truegeno ==1)] <- sample( c(thresh, thresh + (1-thresh)*2),size=length(truegeno2[which(truegeno ==1)]),replace=TRUE)
truegeno2[which(truegeno ==0)] <- sample( c((1-thresh), (1-thresh)/2 + (1-thresh), (1-thresh)*2),size=length(truegeno2[which(truegeno ==0)]),replace=TRUE)
newpheno <- truegeno + rnorm(nsubs,sd=5)
ts1[i] <- summary(lm(newpheno ~ truegeno))$coefficients[2,3]
ts2[i] <- summary(lm(newpheno ~ truegeno2))$coefficients[2,3]
expectation <- truegeno2
mafe <- mean(expectation)/2
r2 <- var(expectation) / (2*mafe*(1-mafe))
r2
threshout[i] <- cor.test(truegeno,truegeno2)$estimate #info scores, if that calculation is right, dont matter -- the measure is 95% identical if the threshold is .8
i=i+1
}
plot(threshlist,threshout,type='l',lwd=2,xlab="Probability Threshold",ylab="Pearson Correlation of Imputed and Genotyped")
plot(threshlist,ts1,type='l',lwd=2,xlab="Probability Threshold",ylab="Test statistic",ylim=c(0,15))
lines(threshlist,ts2,type='l',lwd=2,col="blue")