nievergeltlab
4/3/2017 - 4:37 PM

Implementation of formula converting beta from a linear regression onto the log odds scale EFFICIENT COMPUTATION WITH A LINEAR MIXED MODEL

Implementation of formula converting beta from a linear regression onto the log odds scale

EFFICIENT COMPUTATION WITH A LINEAR MIXED MODEL ON LARGE-SCALE DATA SETSWITH APPLICATIONS TO GENETIC STUDIES Author(s): Matti Pirinen, Peter Donnelly and Chris C. A. Spencer


##quick version just based on prevalence, see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5237383/
 awk -v phi=$phi '{if(NR == 1) print $0,"beta_logscale","se_logscale"; else print $0,$8/(phi*(1-phi)),$9/(phi*(1-phi))}' "$study"_gemma_"$outname_append" > "$study"_gemma_"$outname_append".logscale2
#Reformat GEMMA outputs to be on the log scale. Filter files down to meta-analysis ready files. Meta analyse, make QQ and Manhattan plots, do forest plots, LDSC

#Set working directoy=ry
workingdir=$(pwd)

awk '{if(NR==1) print "rs","SNP", "ngt", "Info"; else print $1,$18,$9,$37}' Metadata_allchr.txt | LC_ALL=C sort -k1,1b > Metadata_allchr_qimr.txt.sorted

LC_ALL=C sort -k2,2b <(zcat PGC_PTSD_allchr_lmm_with5PCs.assoc.txt.gz) > PGC_PTSD_allchr_lmm_with5PCs.assoc.txt.sorted
LC_ALL=C sort -k2,2b <(zcat PGC_PTSD_allchrfemale_lmm_with5PCs.assoc.txt.gz) > PGC_PTSD_allchrfemale_lmm_with5PCs.assoc.txt.sorted
LC_ALL=C sort -k2,2b <(zcat PGC_PTSD_allchrmale_lmm_with5PCs.assoc.txt.gz) > PGC_PTSD_allchrmale_lmm_with5PCs.assoc.txt.sorted


LC_ALL=C join -1 1 -2 2 Metadata_allchr_qimr.txt.sorted PGC_PTSD_allchr_lmm_with5PCs.assoc.txt.sorted | cut -d " " -f 2- | awk '{if ($1 != ".") print}' | sort -g -k 11 >  qimr_all_rp_v1_oct6_2016.fixed
LC_ALL=C join -1 1 -2 2 Metadata_allchr_qimr.txt.sorted  PGC_PTSD_allchrfemale_lmm_with5PCs.assoc.txt.sorted | cut -d " " -f 2- | awk '{if ($1 != ".") print}' | sort -g -k 11 > qimr_females_rp_v1_oct6_2016.fixed
LC_ALL=C join -1 1 -2 2 Metadata_allchr_qimr.txt.sorted  PGC_PTSD_allchrmale_lmm_with5PCs.assoc.txt.sorted | cut -d " " -f 2- | awk '{if ($1 != ".") print}' | sort -g -k 11  > qimr_males_rp_v1_oct6_2016.fixed


#Calculate prevalence
# 325 cases (69% female), 1797 controls (48% female) with density and concordance within families as per the table.

# 325/(1797+325) = .153 overall prevalence
# 325*.69/(1797*.48+325*.69) = .206 fem prevalence
# 325*.31/(1797*.52+325*.31) = 0.097 male prevalence

# 224 cases 863 controls for women
# 101 cases 934 controls for men

ls | grep qimr_* > study_files.txt

#Convert all data into the log scale

 file=qimr_all_rp_v1_oct6_2016.fixed
 phi=.153
 qsub -lwalltime=00:25:00 beta_to_logscale.pbs -e errandout/ -o errandout -d $workingdir -F "-s beta_to_logscale.rscript -i $file -v $phi -b beta -m af -p p_wald -o qimr_all.logscale"
 
 file=qimr_females_rp_v1_oct6_2016.fixed
 phi=.206
 qsub -lwalltime=00:25:00 beta_to_logscale.pbs -e errandout/ -o errandout -d $workingdir -F "-s beta_to_logscale.rscript -i $file -v $phi -b beta -m af -p p_wald -o qimr_fem.logscale"
 
  file=qimr_males_rp_v1_oct6_2016.fixed
 phi=.098
 qsub -lwalltime=00:25:00 beta_to_logscale.pbs -e errandout/ -o errandout -d $workingdir -F "-s beta_to_logscale.rscript -i $file -v $phi -b beta -m af -p p_wald -o qimr_mal.logscale"
 
columns are currently:
snp chr       ps      n_miss  allele1 allele0 af      beta    se      l_remle l_mle   p_wald  p_lrt   p_score beta_logit se_logit

FRQ_A_0 FRQ_U_#samplesizehere#

#Set AF in cases and controls to same value -- not exactly right but to hell with it, it gives me what I want in the end - the overall maf

 echo "CHR         SNP          BP  A1  A2   FRQ_A_325 FRQ_U_1797  INFO      OR      SE       P   ngt" > header1.txt
 
 cat header1.txt <( awk '{print $2,$1, $3 ,$5 ,$6 ,$6,$7, 0, 1,$15,$16,$12, 1}' qimr_all.logscale) > pts_qimr_mix_nm.logscale.results

 
 
#!/bin/bash

while getopts s:i:v:b:m:p:o: option
do
  case "${option}"
    in
      s) scriptname=${OPTARG};;
      i) infile=${OPTARG};;
      v) phi=${OPTARG};;
      b) beta_colname=${OPTARG};;
      m) maf_colname=${OPTARG};;
      p) p_colname=${OPTARG};;
      o) outfile=${OPTARG};;
    esac
done

echo scriptname : $scriptname 
echo infile : $infile 
echo prevalence: $phi 
echo beta column name : $beta_colname 
echo maf column name  : $maf_colname 
echo p column name : $p_colname 
echo outfile : $outfile
 
module load R

Rscript $scriptname $infile $phi $beta_colname $maf_colname $p_colname $outfile

args <- commandArgs(trailingOnly = TRUE)
 infile <- args[1]
 phi <- args[2]
 beta_colname <- args[3]
 maf_colname <- args[4]
 p_colname <- args[5]
 outfile <- args[6]

print(c(infile,phi,beta_colname,maf_colname,p_colname,outfile))
 
 phi <- as.numeric(phi)
 
new_beta_se <- function(x ,phi=0.153)
{
 #Input should be a vector consisting of beta, p, maf. phi should be a value for the proportion of cases
 B=x[1]
 P=x[2]
 theta=x[3]
 
 B2=B/(phi*(1 - phi) + .5*(1-2*phi)*(1-2*theta)*B - (0.084 + 0.9*phi*(1 - 2*phi)*theta*(1-theta))/(phi*(1-phi))*B^2)
 Zsq=qchisq(P,1,lower.tail=F)
 se=sqrt(B2^2/Zsq)
 return(c(B2,se))
}

library(data.table)

#dat <- fread(paste('zcat', infile), data.table=F)
dat <- fread(paste(infile), data.table=F)


newbetas <- t(apply(dat[,c(beta_colname,p_colname,maf_colname)],1,new_beta_se,phi=phi))

colnames(newbetas) <- c("BETA_LOGIT","SE_LOGIT")
dat_exp <- cbind(dat,newbetas)
#gz1 <- gzfile(paste(outfile,'.gz',sep=''),'w')
write.table(dat_exp,outfile, row.names=F,quote=F)
#close(gz1)
B=-0.01540708 #Beta from linear model
phi= .1 #Proportion of cases in data 
theta=.4 #Minor allele Frequency
P= 0.4119744 #pvalue of data 

#Calculate the beta value on the log odds scale
B2=B/(phi*(1 - phi) + .5*(1-2*phi)*(1-2*theta)*B - (0.084 + 0.9*phi*(1 - 2*phi)*theta*(1-theta))/(phi*(1-phi))*B^2)

#The associated odds ratio
exp(B2)

#For meta-analysis, still need to know SE of beta. Since p-value is known, SE can be derived e.g.
Zsq=qchisq(P,1,lower.tail=F)
se=sqrt(B2^2/Zsq)
R 

new_beta_se <- function(x ,phi=0.153)
{
 #Input should be a vector consisting of beta, p, maf. phi should be a value for the proportion of cases
 B=x[1]
 P=x[2]
 theta=x[3]
 
 B2=B/(phi*(1 - phi) + .5*(1-2*phi)*(1-2*theta)*B - (0.084 + 0.9*phi*(1 - 2*phi)*theta*(1-theta))/(phi*(1-phi))*B^2)
 Zsq=qchisq(P,1,lower.tail=F)
 se=sqrt(B2^2/Zsq)
 print(B2)
print(se)
 return(c(B2,se))
}

#has about 830000 markers
dat <- read.table('qimr_rp_v1_oct6_2016.fixed.gz', nr=8300000,stringsAsFactors=F)
names(dat) <- c("chr"    , "rs"   ,   "ps"   ,   "n_miss" , "allele1" ,"allele0" ,"af"  ,    "beta" ,   "se"  ,    "l_remle" ,"l_mle"  , "p_wald" , "p_lrt" ,  "p_score")
newbetas <- t(apply(dat[,c("beta","p_wald","af")],1,new_beta_se,phi=0.153))
names(newbetas) <- c("b_logit","se_logit")
dat_exp <- cbind(dat,newbetas)
gz1 <- gzfile('qimr_rp_v1_oct6_2016.fixed.logit.gz','w')
write.table(dat_exp,gz1, row.names=F,quote=F)
close(gz1)
q()
n