MED rhodo
library(ape)
library(reshape)
library(ggplot2)
library(data.table)
library(grid)
library(scales)
library(spider)
#change the seed to have different results
dat<-read.dna(file="~/Downloads/ALL.MED.unique.cd-hit.fasta", format = "fasta", as.character=FALSE, skip=0)
dat1<-read.dna(file="~/Downloads/ALL.MED.unique.3st", format = "fasta", as.character=FALSE, skip=0)
dat<-read.dna(file="~/Downloads/MED/R_baltica_all_codons_node-representatives-tree.txt", format = "fasta", as.character=FALSE, skip=0)
dat1<-read.dna(file="~/Downloads/MED/R_baltica_all_codons_node-representatives-tree.only3rd.fasta", format = "fasta", as.character=FALSE, skip=0)
#
# lm_eqn = function(m) {
#
# l <- list(a = format(coef(m)[1], digits = 2),
# b = format(abs(coef(m)[2]), digits = 2),
# r2 = format(summary(m)$r.squared, digits = 3));
#
# if (coef(m)[2] >= 0) {
# eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
# } else {
# eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
# }
#
# as.character(as.expression(eq));
# }
###Input data: dat--an object of class 'DNAbin'###
set.seed(3399322)
Nseqs<-1000
rand.seqs.ids<-sample(dim(dat)[1], Nseqs)
rand.seqs<-dat[rand.seqs.ids,]
rand.seqs1<-dat1[rand.seqs.ids,]
#Remove 3rd codon position
#rand.seqs<-rand.seqs[, -seq(0, ncol(rand.seqs), by=3) ]
# ###Convert to genetic distances###
# dist.raw<-dist.dna(rand.seqs, model="raw", pairwise.deletion=TRUE)
# #dist.jc69<-dist.dna(rand.seqs, model="JC69", pairwise.deletion=TRUE)
# #dist.k80<-dist.dna(rand.seqs, model="K80", pairwise.deletion=TRUE)
# dist.tn93<-dist.dna(rand.seqs, model="TN93", pairwise.deletion=TRUE)
# dist<-dist.raw
# dist.corrected<-dist.tn93
#
# l<-lm(dist~dist.tn93)
# lm_coef<-coef(l)
#
# dist.raw<-as.matrix(dist.raw)
# dist.corrected<-as.matrix(dist.corrected)
#
# # We
# dist.raw[lower.tri(dist.raw)] <- NA
# diag(dist.raw)<-NA
# dist.raw<-melt(dist.raw)
# dist.raw<-dist.raw[!is.na(dist.raw$value),]
# #y<-subset(y,value != 'NA')
#
#
#
#
# # We
# dist.corrected[lower.tri(dist.corrected)] <- NA
# diag(dist.corrected)<-NA
# dist.corrected<-melt(dist.corrected)
# dist.corrected<-dist.corrected[!is.na(dist.corrected$value),]
# #x<-subset(x,value != 'NA')
#
#
#
# v<-cbind(dist.corrected$value,dist.raw$value)
# colnames(v)<-c("x","y")
# v<-as.data.table(v)
#
# setkey(v,NULL)
# v.unique<-unique(v)
#
# label <- lm_eqn(l)
#
# png(file="~/Downloads/test-99991.png")
# ggplot(v.unique,aes(x=x, y=y, group = 1)) +
# geom_abline(linetype="dotted") +
# geom_point(color="red", size=1) +
# # stat_smooth(method = "lm", se=F, color="black", formula = y ~ x, size = 1) +
# geom_abline(intercept = lm_coef[1], slope = lm_coef[2], size = 1) +
# #geom_text(aes(x = 0.3, y = 0.1, label = label, size=5), parse = T) +
# annotate("text", label = label, x = 0.4, y = 0.1, size = 4, colour = "red", parse =T) +
# theme_bw() +
# xlab("TN93 model distance") +
# ylab("Uncorrected genetic distance") +
# theme(legend.position="none",
# legend.title=element_blank(),
# legend.key=element_blank(),
# legend.background = element_rect(fill=alpha('white', 0.4)),
# axis.title=element_text(size=12),
# legend.text=element_text(size=8),
# legend.text.align=0,
# legend.margin=unit(-1, "cm"),
# legend.key.width=unit(0.5,"line"),
# legend.key.size = unit(1, "cm"),
# axis.text=element_text(size=10)
# )
# dev.off()
#
subs <- titv(dat)
#Saturation plot
doloDist <- dist.dna(dat,"K80")
#Transversions
tv <- t(subs)
tv <- tv[lower.tri(tv)]
#Transitions
ti <- subs[lower.tri(subs)]
ti<-ti/483
tv<-tv/483
c<-data.frame(x=as.vector(doloDist),y=ti, m="s")
c1<-data.frame(x=as.vector(doloDist),y=tv, m="v")
c<-rbind(c,c1)
c<-as.data.table(c)
setkey(c,NULL)
c.unique<-unique(c)
pdf("~/Downloads/ti-tv-3rd-5.pdf", height=8.27, width=11.69)
ggplot(aes(x,y,color=m),data=c.unique ) +
geom_hline(yintercept = mean(ti), size=0.5,color="#ff800e",) +
geom_hline(yintercept = c(mean(ti)+(sd(ti)),mean(ti)-((sd(ti)))) , size=0.5, color="#ff800e",linetype=2) +
geom_hline(yintercept = mean(tv) ,size=0.5, color="#0b86cf") +
geom_hline(yintercept = c(mean(tv)+(sd(tv)),mean(tv)-((sd(tv)))) , size=0.5, color="#0b86cf",linetype=2)+
geom_point(shape=15, alpha=0.5, size=3) + theme_bw() +
scale_color_manual(values=c("#ff800e","#0b86cf"), labels=c("Transitions", "Transversions"), name="") +
theme(legend.position="top",legend.key = element_blank(),
legend.margin = unit(-0.4, "cm"),legend.text = element_text( size = 11),
axis.text=element_text(size=10),axis.title=element_text(size=12)) +
xlab("Genetic distance (K80)") + ylab("Transitions and Transversions") +
guides(colour=guide_legend(override.aes=list(size=4)))
dev.off()
mean(ti)
mean(tv)