Evaluate the efficiency of meta analysis with small study sizes and rare variants
###Simulate whether or not we'll get valid test stats doing a meta analysis of small samples
nsim=100
ncases=5000
ncontrols=ncases
set.seed=18
nsubs=ncases+ncontrols
case_af <- .1
control_af <- .07
setsize=50
library(metafor)
#Sample same population st cases are enumerated in batches of 50
#till we get 10000 cases -- assuming 5% of the population is cases, we need a pop of 200k people. maybe i should just do this in plink?? # http://cnsgenomics.com/software/gcta/Simu.html
#Save the meta analysis test stat in the first column, the whole data regression analysis in the second
results <- matrix(ncol=2,nrow=nsim)
for (sim in 1:nsim)
{
genotype <- rbinom(nsubs,2,p=rep(c(case_af,control_af),nsubs))
dat <- data.frame(cbind(rep(c(1,0),ncases),genotype))
#Split into abritrary sets , do assoc analysis
pres <- ncases/setsize
stat <- matrix(nrow=pres*2,ncol=2)
for (rep in 1:(2*pres))
{
ds <- dat[((rep-1)*setsize + 1):(rep*setsize),]
stat[rep,] <- summary(glm(V1~ genotype,family="binomial",data=ds))$coefficients[2,1:2]
}
weight=1/stat[,2]^2
t_top=sum(weight*stat[,1])
T=t_top/sum(weight)
sem=sqrt(1/sum(weight))
zscore <- T/sem
#results[sim,1] <- rma(yi=stat[,1],sei=stat[,2],method="FE")$zval
results[sim,1] <- zscore
results[sim,2] <- summary(glm(V1~genotype,data=dat,family="binomial"))$coefficients[2,3]
}
#Test whether or not the analyses produce different test stats
median(results[,1]/results[,2])
#When the AF is common, having 50 samples per study results in a perhaps 5% loss in efficiency. This represents a worst case scenario, as most studies have > 50
#However, at low MAF, un-estimatable logistic estimates become common - e.g. with af 10% and 7% in cases and controls respectively, 3% of results had huge SEs
#Conclusion: For common variation, At lower mafs, efficiency reduces exponentially due to misestiamtion due to sparse cell counts
#The realistic loss of efficiency could be a an interpolated number between the losses at n=500 and n=50
#With AF 10%, approx 1/25 of results will have at least one study with a mis-estimated parameter