Implementation of formula converting beta from a linear regression onto the log odds scale
EFFICIENT COMPUTATION WITH A LINEAR MIXED MODEL ON LARGE-SCALE DATA SETSWITH APPLICATIONS TO GENETIC STUDIES Author(s): Matti Pirinen, Peter Donnelly and Chris C. A. Spencer
##quick version just based on prevalence, see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5237383/
awk -v phi=$phi '{if(NR == 1) print $0,"beta_logscale","se_logscale"; else print $0,$8/(phi*(1-phi)),$9/(phi*(1-phi))}' "$study"_gemma_"$outname_append" > "$study"_gemma_"$outname_append".logscale2
#Reformat GEMMA outputs to be on the log scale. Filter files down to meta-analysis ready files. Meta analyse, make QQ and Manhattan plots, do forest plots, LDSC
#Set working directoy=ry
workingdir=$(pwd)
awk '{if(NR==1) print "rs","SNP", "ngt", "Info"; else print $1,$18,$9,$37}' Metadata_allchr.txt | LC_ALL=C sort -k1,1b > Metadata_allchr_qimr.txt.sorted
LC_ALL=C sort -k2,2b <(zcat PGC_PTSD_allchr_lmm_with5PCs.assoc.txt.gz) > PGC_PTSD_allchr_lmm_with5PCs.assoc.txt.sorted
LC_ALL=C sort -k2,2b <(zcat PGC_PTSD_allchrfemale_lmm_with5PCs.assoc.txt.gz) > PGC_PTSD_allchrfemale_lmm_with5PCs.assoc.txt.sorted
LC_ALL=C sort -k2,2b <(zcat PGC_PTSD_allchrmale_lmm_with5PCs.assoc.txt.gz) > PGC_PTSD_allchrmale_lmm_with5PCs.assoc.txt.sorted
LC_ALL=C join -1 1 -2 2 Metadata_allchr_qimr.txt.sorted PGC_PTSD_allchr_lmm_with5PCs.assoc.txt.sorted | cut -d " " -f 2- | awk '{if ($1 != ".") print}' | sort -g -k 11 > qimr_all_rp_v1_oct6_2016.fixed
LC_ALL=C join -1 1 -2 2 Metadata_allchr_qimr.txt.sorted PGC_PTSD_allchrfemale_lmm_with5PCs.assoc.txt.sorted | cut -d " " -f 2- | awk '{if ($1 != ".") print}' | sort -g -k 11 > qimr_females_rp_v1_oct6_2016.fixed
LC_ALL=C join -1 1 -2 2 Metadata_allchr_qimr.txt.sorted PGC_PTSD_allchrmale_lmm_with5PCs.assoc.txt.sorted | cut -d " " -f 2- | awk '{if ($1 != ".") print}' | sort -g -k 11 > qimr_males_rp_v1_oct6_2016.fixed
#Calculate prevalence
# 325 cases (69% female), 1797 controls (48% female) with density and concordance within families as per the table.
# 325/(1797+325) = .153 overall prevalence
# 325*.69/(1797*.48+325*.69) = .206 fem prevalence
# 325*.31/(1797*.52+325*.31) = 0.097 male prevalence
# 224 cases 863 controls for women
# 101 cases 934 controls for men
ls | grep qimr_* > study_files.txt
#Convert all data into the log scale
file=qimr_all_rp_v1_oct6_2016.fixed
phi=.153
qsub -lwalltime=00:25:00 beta_to_logscale.pbs -e errandout/ -o errandout -d $workingdir -F "-s beta_to_logscale.rscript -i $file -v $phi -b beta -m af -p p_wald -o qimr_all.logscale"
file=qimr_females_rp_v1_oct6_2016.fixed
phi=.206
qsub -lwalltime=00:25:00 beta_to_logscale.pbs -e errandout/ -o errandout -d $workingdir -F "-s beta_to_logscale.rscript -i $file -v $phi -b beta -m af -p p_wald -o qimr_fem.logscale"
file=qimr_males_rp_v1_oct6_2016.fixed
phi=.098
qsub -lwalltime=00:25:00 beta_to_logscale.pbs -e errandout/ -o errandout -d $workingdir -F "-s beta_to_logscale.rscript -i $file -v $phi -b beta -m af -p p_wald -o qimr_mal.logscale"
columns are currently:
snp chr ps n_miss allele1 allele0 af beta se l_remle l_mle p_wald p_lrt p_score beta_logit se_logit
FRQ_A_0 FRQ_U_#samplesizehere#
#Set AF in cases and controls to same value -- not exactly right but to hell with it, it gives me what I want in the end - the overall maf
echo "CHR SNP BP A1 A2 FRQ_A_325 FRQ_U_1797 INFO OR SE P ngt" > header1.txt
cat header1.txt <( awk '{print $2,$1, $3 ,$5 ,$6 ,$6,$7, 0, 1,$15,$16,$12, 1}' qimr_all.logscale) > pts_qimr_mix_nm.logscale.results
#!/bin/bash
while getopts s:i:v:b:m:p:o: option
do
case "${option}"
in
s) scriptname=${OPTARG};;
i) infile=${OPTARG};;
v) phi=${OPTARG};;
b) beta_colname=${OPTARG};;
m) maf_colname=${OPTARG};;
p) p_colname=${OPTARG};;
o) outfile=${OPTARG};;
esac
done
echo scriptname : $scriptname
echo infile : $infile
echo prevalence: $phi
echo beta column name : $beta_colname
echo maf column name : $maf_colname
echo p column name : $p_colname
echo outfile : $outfile
module load R
Rscript $scriptname $infile $phi $beta_colname $maf_colname $p_colname $outfile
args <- commandArgs(trailingOnly = TRUE)
infile <- args[1]
phi <- args[2]
beta_colname <- args[3]
maf_colname <- args[4]
p_colname <- args[5]
outfile <- args[6]
print(c(infile,phi,beta_colname,maf_colname,p_colname,outfile))
phi <- as.numeric(phi)
new_beta_se <- function(x ,phi=0.153)
{
#Input should be a vector consisting of beta, p, maf. phi should be a value for the proportion of cases
B=x[1]
P=x[2]
theta=x[3]
B2=B/(phi*(1 - phi) + .5*(1-2*phi)*(1-2*theta)*B - (0.084 + 0.9*phi*(1 - 2*phi)*theta*(1-theta))/(phi*(1-phi))*B^2)
Zsq=qchisq(P,1,lower.tail=F)
se=sqrt(B2^2/Zsq)
return(c(B2,se))
}
library(data.table)
#dat <- fread(paste('zcat', infile), data.table=F)
dat <- fread(paste(infile), data.table=F)
newbetas <- t(apply(dat[,c(beta_colname,p_colname,maf_colname)],1,new_beta_se,phi=phi))
colnames(newbetas) <- c("BETA_LOGIT","SE_LOGIT")
dat_exp <- cbind(dat,newbetas)
#gz1 <- gzfile(paste(outfile,'.gz',sep=''),'w')
write.table(dat_exp,outfile, row.names=F,quote=F)
#close(gz1)
B=-0.01540708 #Beta from linear model
phi= .1 #Proportion of cases in data
theta=.4 #Minor allele Frequency
P= 0.4119744 #pvalue of data
#Calculate the beta value on the log odds scale
B2=B/(phi*(1 - phi) + .5*(1-2*phi)*(1-2*theta)*B - (0.084 + 0.9*phi*(1 - 2*phi)*theta*(1-theta))/(phi*(1-phi))*B^2)
#The associated odds ratio
exp(B2)
#For meta-analysis, still need to know SE of beta. Since p-value is known, SE can be derived e.g.
Zsq=qchisq(P,1,lower.tail=F)
se=sqrt(B2^2/Zsq)
R
new_beta_se <- function(x ,phi=0.153)
{
#Input should be a vector consisting of beta, p, maf. phi should be a value for the proportion of cases
B=x[1]
P=x[2]
theta=x[3]
B2=B/(phi*(1 - phi) + .5*(1-2*phi)*(1-2*theta)*B - (0.084 + 0.9*phi*(1 - 2*phi)*theta*(1-theta))/(phi*(1-phi))*B^2)
Zsq=qchisq(P,1,lower.tail=F)
se=sqrt(B2^2/Zsq)
print(B2)
print(se)
return(c(B2,se))
}
#has about 830000 markers
dat <- read.table('qimr_rp_v1_oct6_2016.fixed.gz', nr=8300000,stringsAsFactors=F)
names(dat) <- c("chr" , "rs" , "ps" , "n_miss" , "allele1" ,"allele0" ,"af" , "beta" , "se" , "l_remle" ,"l_mle" , "p_wald" , "p_lrt" , "p_score")
newbetas <- t(apply(dat[,c("beta","p_wald","af")],1,new_beta_se,phi=0.153))
names(newbetas) <- c("b_logit","se_logit")
dat_exp <- cbind(dat,newbetas)
gz1 <- gzfile('qimr_rp_v1_oct6_2016.fixed.logit.gz','w')
write.table(dat_exp,gz1, row.names=F,quote=F)
close(gz1)
q()
n