carjed
8/28/2015 - 9:53 PM

From http://jedidiahcarlson.com/

require(plyr)

# read data and drop all rows with duplicate coordinates
# also drop the exon ID and strand information
exons<-read.table("exon_locations.bed", header=F, sep="\t", stringsAsFactors=F)
exons<-unique(exons[,c(1,2,3)])

# rename columns
names(exons)<-c("chr", "start", "end")

# define bin width
binw<-100000

# get length of each exon
exons$length<-exons$end-exons$start+1

# get bin number of the start and end positions
exons$bin.a<-ceiling(exons$start/binw)
exons$bin.b<-ceiling(exons$end/binw)

# get number of exonic sites for each bin, 
# excluding exons that overlap bin boundaries
subeq<-exons[exons$bin.a==exons$bin.b,]
z<-ddply(subeq, c("bin.a"), summarize, tot=sum(length))

# get number of exonic sites for each bin, only considering 
# those that overlap bin boundaries
subneq<-exons[exons$bin.a!=exons$bin.b,]
subneq$length.a<-(subneq$bin.a*binw)-subneq$start
subneq$length.b<-subneq$end-(subneq$bin.a*binw)+1

# Calculate number of exonic sites for each of the two bins 
# covered by the exon
za<-ddply(subneq, c("bin.a"), summarize, tot=sum(length.a))
zb<-ddply(subneq, c("bin.b"), summarize, tot=sum(length.b))

# merge the overlap data to a single dataframe
names(zb)<-c("bin.a", "tot")
zc<-rbind(za, zb)

# merge overlap data with non-overlapping data to get final dataset
zd<-ddply(merge(z, zc, all=T), .(bin.a), summarize, tot=sum(tot))