danielecook
3/14/2018 - 3:13 AM

rarefaction.R

rarefaction.R

library(tidyverse)


# bcftools query -f "[%GT\t]\n" WI.20170531.impute.vcf.gz  | awk '{ gsub(":GT", "", $0); gsub("(# )?\[[0-9]+\]","",$0); print $0 }' | sed 's/0|0/0/g' | sed 's/1|1/1/g' | sed 's/0|1/NA/g' | sed 's/1|0/NA/g' | head -n 1000000 | gzip > ~/Desktop/impute_gts.test.tsv.gz


df_use <- tseries::read.matrix(gzfile("~/Desktop/impute_gts.tsv.gz"))
          
storage.mode(df_use) <- "logical"
# cols


results <- sapply(1:10, function(x) {
  cumsum(
    
  (tidyr::complete(
                      (plyr::count(
                          apply(
                                df_use[,sample(n_samples, n_samples)],
                                1,
                                function(x) {
                                    min(which(x))
                                }
                          )
                          )
                       ),
                       x = 1:249,
                       fill = list(freq = as.integer(0))
  ))$freq
  )
})


colnames(results) <- 1:ncol(results)
results <- tbl_df(results) %>% 
              tidyr::gather(rep, sites) %>%
              dplyr::group_by(rep) %>%
              dplyr::mutate(isotypes = dplyr::row_number(rep)) %>%
              dplyr::ungroup() %>%
              dplyr::group_by(isotypes)
            
summarized_results <- results %>% 
                        dplyr::group_by(isotypes) %>%
                        dplyr::summarize(mean_sites = mean(sites))

ggplot(results) +
  geom_step(aes(y = sites, x = isotypes, color=rep), size = 0.5, alpha=0.5) +
  geom_step(aes(y = mean_sites, x = isotypes), size=2, data=summarized_results) +
  scale_y_continuous(sec.axis = sec_axis(~./nrow(df_use)*100)) +
  theme(legend.position='none')