genomewalker
2/17/2016 - 9:07 AM

MED rhodo

MED rhodo

library(ape)
library(reshape)
library(ggplot2)
library(data.table)
library(grid)
library(scales)
library(spider)
#change the seed to have different results

dat<-read.dna(file="~/Downloads/ALL.MED.unique.cd-hit.fasta", format = "fasta", as.character=FALSE, skip=0)
dat1<-read.dna(file="~/Downloads/ALL.MED.unique.3st", format = "fasta", as.character=FALSE, skip=0)

dat<-read.dna(file="~/Downloads/MED/R_baltica_all_codons_node-representatives-tree.txt", format = "fasta", as.character=FALSE, skip=0)

dat1<-read.dna(file="~/Downloads/MED/R_baltica_all_codons_node-representatives-tree.only3rd.fasta", format = "fasta", as.character=FALSE, skip=0)

# 
# lm_eqn = function(m) {
#   
#   l <- list(a = format(coef(m)[1], digits = 2),
#             b = format(abs(coef(m)[2]), digits = 2),
#             r2 = format(summary(m)$r.squared, digits = 3));
#   
#   if (coef(m)[2] >= 0)  {
#     eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
#   } else {
#     eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)    
#   }
#   
#   as.character(as.expression(eq));                 
# }


###Input data: dat--an object of class 'DNAbin'###

set.seed(3399322)
Nseqs<-1000
rand.seqs.ids<-sample(dim(dat)[1], Nseqs)

rand.seqs<-dat[rand.seqs.ids,]
rand.seqs1<-dat1[rand.seqs.ids,]

#Remove 3rd codon position
#rand.seqs<-rand.seqs[, -seq(0, ncol(rand.seqs), by=3) ]

# ###Convert to genetic distances###
# dist.raw<-dist.dna(rand.seqs, model="raw", pairwise.deletion=TRUE)
# #dist.jc69<-dist.dna(rand.seqs, model="JC69", pairwise.deletion=TRUE)
# #dist.k80<-dist.dna(rand.seqs, model="K80", pairwise.deletion=TRUE)
# dist.tn93<-dist.dna(rand.seqs, model="TN93", pairwise.deletion=TRUE)
# dist<-dist.raw
# dist.corrected<-dist.tn93
# 
# l<-lm(dist~dist.tn93)
# lm_coef<-coef(l)
# 
# dist.raw<-as.matrix(dist.raw)
# dist.corrected<-as.matrix(dist.corrected)
# 
# # We 
# dist.raw[lower.tri(dist.raw)] <- NA
# diag(dist.raw)<-NA
# dist.raw<-melt(dist.raw)
# dist.raw<-dist.raw[!is.na(dist.raw$value),]
# #y<-subset(y,value != 'NA')
# 
# 
# 
# 
# # We 
# dist.corrected[lower.tri(dist.corrected)] <- NA
# diag(dist.corrected)<-NA
# dist.corrected<-melt(dist.corrected)
# dist.corrected<-dist.corrected[!is.na(dist.corrected$value),]
# #x<-subset(x,value != 'NA')
# 
# 
# 
# v<-cbind(dist.corrected$value,dist.raw$value)
# colnames(v)<-c("x","y")
# v<-as.data.table(v)
# 
# setkey(v,NULL)
# v.unique<-unique(v)
# 
# label <- lm_eqn(l)
# 
# png(file="~/Downloads/test-99991.png")
# ggplot(v.unique,aes(x=x, y=y, group = 1)) +
#   geom_abline(linetype="dotted") +
#   geom_point(color="red", size=1) + 
#  # stat_smooth(method = "lm", se=F, color="black", formula = y ~ x, size = 1) +
#   geom_abline(intercept = lm_coef[1], slope = lm_coef[2], size = 1) +
#   #geom_text(aes(x = 0.3, y = 0.1, label = label, size=5), parse = T) +
#   annotate("text", label = label, x = 0.4, y = 0.1, size = 4, colour = "red", parse =T) +
#   theme_bw() + 
#   xlab("TN93 model distance") +
#   ylab("Uncorrected genetic distance") +
#   theme(legend.position="none",
#         legend.title=element_blank(), 
#         legend.key=element_blank(),
#         legend.background = element_rect(fill=alpha('white', 0.4)),
#         axis.title=element_text(size=12),
#         legend.text=element_text(size=8),
#         legend.text.align=0,
#         legend.margin=unit(-1, "cm"),
#         legend.key.width=unit(0.5,"line"),
#         legend.key.size = unit(1, "cm"),
#         axis.text=element_text(size=10) 
#   )
# dev.off()
# 


subs <- titv(dat)
#Saturation plot
doloDist <- dist.dna(dat,"K80")
#Transversions
tv <- t(subs)
tv <- tv[lower.tri(tv)]

#Transitions
ti <- subs[lower.tri(subs)]

ti<-ti/483
tv<-tv/483
c<-data.frame(x=as.vector(doloDist),y=ti, m="s")
c1<-data.frame(x=as.vector(doloDist),y=tv, m="v")
c<-rbind(c,c1)
c<-as.data.table(c)
setkey(c,NULL)
c.unique<-unique(c)

pdf("~/Downloads/ti-tv-3rd-5.pdf", height=8.27, width=11.69)
ggplot(aes(x,y,color=m),data=c.unique ) +
  geom_hline(yintercept = mean(ti), size=0.5,color="#ff800e",) +
  geom_hline(yintercept = c(mean(ti)+(sd(ti)),mean(ti)-((sd(ti)))) , size=0.5, color="#ff800e",linetype=2) +
  geom_hline(yintercept = mean(tv) ,size=0.5, color="#0b86cf") +
  geom_hline(yintercept = c(mean(tv)+(sd(tv)),mean(tv)-((sd(tv)))) , size=0.5, color="#0b86cf",linetype=2)+
  geom_point(shape=15, alpha=0.5, size=3) + theme_bw() +
  scale_color_manual(values=c("#ff800e","#0b86cf"), labels=c("Transitions", "Transversions"), name="") +
  theme(legend.position="top",legend.key = element_blank(), 
        legend.margin = unit(-0.4, "cm"),legend.text = element_text( size = 11),
        axis.text=element_text(size=10),axis.title=element_text(size=12)) + 
  xlab("Genetic distance (K80)") + ylab("Transitions and Transversions") +
  guides(colour=guide_legend(override.aes=list(size=4))) 
dev.off()

mean(ti)
mean(tv)