[LD Score Regression]

3/20/2018 - 8:33 PM




GWAS=/hp_shares/mgandal/datasets/GWAS/BP.PGC.2012/BD.PGC.2012.sumstats.gz
GWAS=/hp_shares/mgandal/datasets/GWAS/SCZ.Clozuk.2018/SCZ.Clozuk.2018.sumstats.gz
GWAS=/hp_shares/mgandal/datasets/GWAS/ASD.iPSYCH+PGC.2017/ASD.iPSYCHPGC.2018.sumstats.gz
GWAS=/hp_shares/mgandal/datasets/GWAS/ASD.PGC.2017/ASD.PGC.2017.sumstats.gz
GWAS=/hp_shares/mgandal/datasets/GWAS/IBD.2015/IBD.2015.sumstats.gz

GWAS=/hp_shares/mgandal/datasets/GWAS/Neuroticism.SSGAP.2016/Neuroticism.SSGAP.2016.sumstats.gz
GWAS=/hp_shares/mgandal/datasets/GWAS/EduYears.SSGAC.2016/EduYears.SSGAC.2016.sumstats.gz
GWAS=/hp_shares/mgandal/datasets/GWAS/SubjWellBeing.SSGAC.2016/SubjWellBeing.SSGAC.2016.sumstats.gz





screen
qrsh -l h_rt=04:00:00,h_vmem=64G
annotDir=/hp_shares/mgandal/datasets/LDscore/1000G_Phase3/makeCustomAnnotation/output/
outdir=/hp_shares/mgandal/datasets/LDscore/PsychEncode/results/
cd $annotDir

GWAS=/hp_shares/mgandal/datasets/GWAS/EduYears.SSGAC.2016/EduYears.SSGAC.2016.sumstats.gz
GWAS_name=`basename $GWAS .sumstats.gz`

for annot in ${annotDir}/*iso*.1.l2.ldscore.gz; do
	set=`basename $annot .1.l2.ldscore.gz`
    if [ ! -e ${outdir}/${GWAS_name}_${set}.results ] 
	then
python ~/bin/ldsc/ldsc.py \
--h2  $GWAS \
--out ${outdir}/${GWAS_name}_${set} \
--ref-ld-chr ${annotDir}/${set}.,${annotDir}/baseline. \
--w-ld-chr /hp_shares/mgandal/datasets/LDscore/1000G_Phase3/weights_hm3_no_hla/weights. \
--overlap-annot \
--frqfile-chr /hp_shares/mgandal/datasets/LDscore/1000G_Phase3/1000G_Phase3_frq/1000G.EUR.QC. \
--print-coefficients &

fi

done

CreateNewAnnotation_02.sh

setdir=/hp_shares/mgandal/datasets/LDscore/1000G_Phase3/makeCustomAnnotation/output/

for set in output/*.1.annot; do
	set=`basename $set .1.annot`

	for chr in {1..22}; do
		if [ ! -e ${setdir}/${set}.${chr}.l2.ldscore.gz ] 
		then
        	qsub -b y -cwd -e ./log -o ./log -V -N sLDSC -l h_vmem=8G,h_rt=4:00:00 /hp_shares/mgandal/datasets/LDscore/1000G_Phase3/makeCustomAnnotation/qsubWrapper.sh $set $chr
		fi
	done
done

#===========#===========#===========#===========#===========#===========#===========
#!/bin/bash
#qsubWrapper.sh

setdir=/hp_shares/mgandal/datasets/LDscore/1000G_Phase3/makeCustomAnnotation/output/

set=$1
chr=$2

echo $set, $chr
        
python ~/bin/ldsc/ldsc.py \
	--l2 \
	--bfile /hp_shares/mgandal/datasets/LDscore/1000G_Phase3/1000G_EUR_Phase3_plink/1000G.EUR.QC.${chr} \
        --ld-wind-cm 1 \
        --annot /hp_shares/mgandal/datasets/LDscore/1000G_Phase3/makeCustomAnnotation/output/${set}.${chr}.annot \
        --out ${setdir}/${set}.${chr} \
        --print-snps /hp_shares/mgandal/datasets/LDscore/1000G_Phase3/hapmap3_snps/hm.${chr}.snp

CreateNewAnnotation_02.R

#!/share/apps/R-3.3.1/bin/Rscript
#Usage: Rscript CreateNewAnnotation_02.R $CHR_NUM $GENE_ANNOTATION $SET_ANNOTATION
#SET_ANNOTATION FORMAT: TSV ensembl_gene_id set_name 

options(stringsAsFactors = F)
args <- commandArgs(trailingOnly = TRUE)
##args=list("1", "annotation.gene.gencodeV19.csv", "customAnnotation.txt")
print(args)

OFFSET = 10000

outdir="./output/"

chr = args[[1]]
geneAnno_file = args[[2]]
setAnno_file = args[[3]]

geneAnnotation = read.csv(geneAnno_file, head=T)
setAnnotation = read.table(setAnno_file, head=T)

geneAnnotation = geneAnnotation[match(setAnnotation[,1], geneAnnotation$gene_id),]
setAnnotation = cbind(setAnnotation, geneAnnotation)
setAnnotation$chr = as.numeric(gsub("chr", "", setAnnotation$chr))
template = read.table(gzfile(paste0("/hp_shares/mgandal/datasets/LDscore/1000G_Phase3/1000G_Phase3_cell_type_groups/cell_type_group.1.", chr, ".annot.gz")), head=T)[,1:4]
print(paste("Loading",chr,"there are", nrow(template), "SNPs"))

for(set in unique(setAnnotation[,2])) {
  print(paste("Working on chr", chr, ": ",set))
  setGenes = intersect(setAnnotation[,1][setAnnotation[,2] == set], geneAnnotation$gene_id[geneAnnotation$chr == paste0("chr",chr)])
  
  binaryVec = rep(0, times=nrow(template))
  
  for(g in setGenes) {
	  idx = match(g, geneAnnotation$gene_id)
	  snp_match = which(( template$BP > geneAnnotation$start[idx] - OFFSET) & (template$BP < geneAnnotation$stop[idx] + OFFSET))
      if(length(snp_match) > 0) {
        binaryVec[snp_match] = 1
      }
   }
  annotOut = cbind(template, binaryVec)
  colnames(annotOut)[5] = set
  print(paste0("Found ", sum(binaryVec), " SNPs in the set"))
  write.table(annotOut, file = paste0(outdir, set, ".", chr, ".annot"), quote = F, row.names = F)
}
    
print('done')

CreateNewAnnotation_01.sh

#!/bin/bash

# https://github.com/bulik/ldsc/wiki/LD-Score-Estimation-Tutorial

# Download the baseline model and weights
wget https://data.broadinstitute.org/alkesgroup/LDSCORE/1000G_Phase3_baseline_ldscores.tgz
wget https://data.broadinstitute.org/alkesgroup/LDSCORE/weights_hm3_no_hla.tgz
wget https://data.broadinstitute.org/alkesgroup/LDSCORE/1000G_Phase3_plinkfiles.tgz
wget https://data.broadinstitute.org/alkesgroup/LDSCORE/hapmap3_snps.tgz
wget https://data.broadinstitute.org/alkesgroup/LDSCORE/1000G_Phase3_frq.tgz
tar -xzvf 1000G_Phase3_plinkfiles.tgz
tar -xvzf 1000G_Phase3_baseline_ldscores.tgz
tar -xvzf weights_hm3_no_hla.tgz
tar -xzvf 1000G_Phase3_frq.tgz
tar -xzvf hapmap3_snps.tgz


geneAnnotation=annotation.gene.gencodeV19.csv
setAnnotation=customAnnotation.txt

screen

qrsh -l h_vmem=16G -pe shared 4

for i in {1..22}; do
	echo $i
    /share/apps/R-3.3.1/bin/Rscript CreateNewAnnotation_02.R $i $geneAnnotation $setAnnotation
done



for set in *_*.1.annot; do
	set=`basename $set .1.annot`
    
	for i in {1..22}; do
		echo $set, $i
        
		python ~/bin/ldsc/ldsc.py \
        	--l2 \
            --bfile /hp_shares/mgandal/datasets/LDscore/1000G_Phase3/1000G_EUR_Phase3_plink/1000G.EUR.QC.$i \
            --ld-wind-cm 1 \
            --annot /hp_shares/mgandal/datasets/LDscore/1000G_Phase3/makeCustomAnnotation/output/${set}.${i}.annot \
            --out ${set}.${i} \
            --print-snps /hp_shares/mgandal/datasets/LDscore/1000G_Phase3/hapmap3_snps/hm.${i}.snp
    done
 done

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

[LD Score Regression]