Import Homer ChIP-Seq Motif Data
# Daniel Cook 2014
# Danielecook.com
#
# Use this function to import ChIP Seq Data generated by Homer. This data is generated using homers findMotifsGenome.pl
# command with the '-find <motif file>' argument. Generate output
# looks like this:
#
# 1. Peak/Region ID
# 2. Chromosome
# 3. Start
# 4. End
# 5. Strand of Peaks
# 6-18: annotation information
# 19. CpG%
# 20. GC%
# 21. Motif Instances <distance from center of region>(<sequence>,<strand>,<conservation>)
#
# Getting the information from column 21 for plotting, for example, frequency can be tricky. This funciton aims to help that
# by converting the data format from wide to long. Currently, annotation columns are ignored but you can modify the function to
# fix.
library(stringr)
library(splitstackshape)
library(reshape2)
library(dplyr)
suppressMessages(library(data.table))
import_peak_file <- function(filename, peak_type) {
df <- fread(filename)
setnames(df, names(df)[1],"Peak")
setnames(df, names(df)[9:length(names(df))] , str_extract(names(df)[9:length(names(df))], "[A-Za-z0-9]+"))
motifs <- names(df)[9:length(names(df))]
# Reshape latter columns
# Remove extra columns if present
df$Annotation <- NULL
df$"Focus Ratio/Region Size" <- NULL
df$Detailed <- NULL
df$Distance <- NULL
df$Nearest <- NULL
df$Entrez <- NULL
df$Nearest <- NULL
df$Nearest <- NULL
df$Nearest <- NULL
df$Gene <- NULL
df$Gene <- NULL
df$Gene <- NULL
df$Gene <- NULL
# Melt Again
df <- melt(df, id.vars = 1:8)
df$value <- str_replace_all(df$value, "),", "|")
df <- filter(df, value != "")
df <- concat.split(as.data.frame(df), split.col = c("value"), sep="|")
df$value <- NULL
df <- rename(df, c("variable"="motif_name"))
df <- melt(df, id.vars = 1:9)
df$value_1 <- NULL
df$variable <- NULL
df <- filter(df, value != "")
df$MOTIF <- str_extract(df$value,"[ATCG]+")
df$POS <- as.integer(str_extract(df$value,"[-0-9]+"))
df$STRAND <- str_extract(df$value,"(,[+|-],)")
df$STRAND <- str_replace_all(df$STRAND,",","")
df$value <- NULL
df <- filter(df, !is.na(POS))
df$peak_type <- peak_type
df
}