
Sentiment Analyser Shiny app

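# ---- ui.R (user interface) ----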
library(shiny)

shinyUI(fluidPage(
    
    # Application title
    headerPanel("Text Sentiment Analyser"),
    
    sidebarLayout(
        # the control panel
        sidebarPanel(
            fileInput('file1', 'Choose text file',
                      accept = c('text/tsv',
                                 'text/tab-separated-values',
                                 'text/plain',
                                 '.tsv')),
            tags$hr(),
            sliderInput("threshold",
                        "Positive sentiment threshold",
                        min = .1,
                        max = .99,
                        value = .5),
            tags$hr(),
            sliderInput("sparsity",
                        "Max. term sparsity",
                        min = .1,
                        max = .99,
                        value = .95)
        ),
        
        # Show the sentiment distribution plot
        mainPanel(
            plotOutput('distribution')
        )
    ),
    tags$hr(),
    fluidRow(
        # the results detail panel
        column(12,
            tableOutput('contents')
        )
    )
))
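
# ---- server.R (server logic) ----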
library(shiny)
library(tm)
library(SnowballC)
library(randomForest)

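# Force single-core processing; presumably a workaround for problems with tm's
# parallel (mclapply) back end on some platforms.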
options(mc.cores=1)

build_model <- function(new_data_df, sparsity) {
    # Build a corpus from the new data and apply the same preprocessing as the training corpus
    new_corpus <- Corpus(VectorSource(new_data_df$Text))
    new_corpus <- tm_map(new_corpus, content_transformer(tolower))
    new_corpus <- tm_map(new_corpus, removePunctuation)
    new_corpus <- tm_map(new_corpus, removeWords, stopwords("english"))
    new_corpus <- tm_map(new_corpus, stripWhitespace)
    new_corpus <- tm_map(new_corpus, stemDocument)
    
    # create document-term matrix
    new_dtm <- DocumentTermMatrix(new_corpus)
    new_dtm <- removeSparseTerms(new_dtm, sparsity)
    new_dtm_df <- as.data.frame(as.matrix(new_dtm))
    colnames(new_dtm_df) <- make.names(colnames(new_dtm_df))
    
    # keep only the terms shared by the training and new document-term matrices
    common_names <- intersect(colnames(train_dtm_df), colnames(new_dtm_df))
    new_dtm_df <- subset(new_dtm_df, select = names(new_dtm_df) %in% common_names)
    
    # assemble the final training data: sentiment labels plus the shared term counts
    model_train_data_df <- cbind(train_data_df, subset(train_dtm_df, select = names(train_dtm_df) %in% common_names))
    model_train_data_df$Text <- NULL
    
    # train the random forest classifier (re-trained on every upload and sparsity change)
    model <- randomForest(Sentiment ~ ., data = model_train_data_df, ntree = 50)

    # return the fitted model and the new document-term matrix as a list
    list(model, new_dtm_df)
}
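
## Illustrative sanity check for build_model() outside Shiny (not part of the
## app). Assumes the training objects defined at the bottom of this file exist;
## the sample texts below are made up.
# sample_df <- data.frame(Text = c("great film, loved every minute",
#                                  "terrible plot, complete waste of time"),
#                         stringsAsFactors = FALSE)
# fit <- build_model(sample_df, sparsity = 0.95)
# predict(fit[[1]], newdata = fit[[2]], type = "prob")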


shinyServer(function(input, output) {
    
    output$contents <- renderTable({
        results()
    })
    
    output$distribution <- renderPlot({
        if (is.null(results()))
            return(NULL)
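        # density of the 0/1 indicator: 1 where the predicted probability of the
        # positive class exceeds the chosen threshold, 0 otherwise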
        d <- density(
            as.numeric(results()$Prob > input$threshold)
        )
        plot(
            d, 
            xlim = c(0, 1),
            main=paste0("Sentiment Distribution (Prob > ", input$threshold, ")")
        )
        polygon(d, col="lightgrey", border="lightgrey")
        abline(v = input$threshold, col = "blue")
    })
    
    results <- reactive({
        inFile <- input$file1
            
        if (is.null(inFile))
            return(NULL)
        
        # load input data
        new_data_df <- read.csv(
            inFile$datapath,
            sep = '\t',
            header = FALSE,
            quote = "",
            stringsAsFactors = FALSE,
            col.names = c("Text")
        )
        
        model_and_data <- build_model(new_data_df, input$sparsity)
        
        pred <- predict(model_and_data[[1]], newdata=model_and_data[[2]], type="prob")
        
        # probability of the positive class (second column of the class-probability matrix)
        new_data_df$Prob <- pred[, 2]

        # return data frame
        new_data_df
    })
})

# Initialisation code: runs once when the app is loaded. Although it sits after
# shinyServer(), the file is sourced top to bottom at start-up, so these objects
# exist before build_model() is first called from a session.

# Load train and test data
train_data_df <- read.csv(
    file = 'train_data.tsv',
    sep = '\t',
    quote = "",
    header = FALSE,
    stringsAsFactors = FALSE,
    col.names = c("Sentiment", "Text")
)
train_data_df$Sentiment <- as.factor(train_data_df$Sentiment)

# Create training corpus for later re-use
train_corpus <- Corpus(VectorSource(train_data_df$Text))
train_corpus <- tm_map(train_corpus, content_transformer(tolower))
train_corpus <- tm_map(train_corpus, removePunctuation)
train_corpus <- tm_map(train_corpus, removeWords, stopwords("english"))
train_corpus <- tm_map(train_corpus, stripWhitespace)
train_corpus <- tm_map(train_corpus, stemDocument)

# create document-term matrix
train_dtm <- DocumentTermMatrix(train_corpus)
train_dtm <- removeSparseTerms(train_dtm, 0.995)
train_dtm_df <- as.data.frame(as.matrix(train_dtm))
colnames(train_dtm_df) <- make.names(colnames(train_dtm_df))
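
## How to run (illustrative, not part of the original code): save the UI block
## above as ui.R and everything from the second library(shiny) onwards as
## server.R in one app directory next to 'train_data.tsv', then launch with:
# shiny::runApp("path/to/app")

## 'train_data.tsv' is expected to be headerless and tab-separated, with the
## sentiment label (e.g. 0/1) in the first column and the raw text in the
## second; the uploaded file should be a headerless single column of texts.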