# rongmu
# 12/2/2015 - 3:32 PM
#
# LC-2015-prepare_data.R

library(tidyr)
library(dplyr)
library(stringr)

# preparations ----
dirs    <- list.files('NICE/NICE_3.0.1b', full.names = TRUE)

files   <- dirs %>% list.files(full.names = TRUE)
writers <- dirs %>% list.files() %>% str_replace('\\.txt$', '')
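
# quick sanity check -- 'writers' should hold bare writer ids; the layout
# assumed here is one folder per group with one .txt file per writer
# (e.g. NICE/NICE_3.0.1b/JPN/JPN001.txt; the sample ids are illustrative,
# following the speaker-code pattern used in the regex below)
head(writers)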

data_raw <- lapply(files, function(f) {
    f %>%
        # read in the file (the corpus files are CP932 / Shift-JIS encoded)
        scan(what = 'char', sep = '\n', fileEncoding = 'cp932') %>%
        # extract the essay sentences
        str_extract('(?<=^\\*(JPN|NS)?\\d{3}:\\t).*$') %>%
        na.omit()
})
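
# what the extraction does, shown on a made-up corpus line: the lookbehind
# requires a speaker code like '*JPN001:' followed by a tab, and keeps only
# the sentence after it
str_extract('*JPN001:\tI like to study English.',
            '(?<=^\\*(JPN|NS)?\\d{3}:\\t).*$')
#> [1] "I like to study English."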

# the number of times each writer id should repeat in the data frame below
writer_times <- sapply(data_raw, length)
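
# rep() with a 'times' vector is what aligns writers with their sentences
# in the essays step below -- each id is repeated once per sentence,
# e.g. (made-up ids):
rep(c('JPN001', 'NS001'), times = c(2, 1))
#> [1] "JPN001" "JPN001" "NS001"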


# essays ----
essays <- data_raw %>%
    unlist() %>%
    data.frame(text = ., stringsAsFactors = FALSE) %>%
    # the "id" vector: each of the writers x writer_times
    mutate(
        id = rep(writers, writer_times)
    ) %>%
    # separate into native language and id
    separate(id,
        into = c('native', 'id'),
        # split 3 characters from the right: 'JPN001' -> 'JPN' + '001'
        sep  = -3
    ) %>%
    mutate(
        native = plyr::revalue(native, c(JPN = 'ja', NS = 'en'))
    ) %>%
    # number each sentence
    group_by(native, id) %>%
    mutate(
        sen_no = 1:n()
    ) %>%
    ungroup() %>%
    # reorder columns
    select(native, id, sen_no, text)
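
# a quick check of the sep = -3 split on made-up ids: separate() counts
# negative positions from the right-hand end of the string
data.frame(id = c('JPN001', 'NS001'), stringsAsFactors = FALSE) %>%
    separate(id, into = c('native', 'id'), sep = -3)
#>   native  id
#> 1    JPN 001
#> 2     NS 001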


# raw tokens ----
tokens_raw <- essays %>%
    # one row per token; data.frame() recycles the scalar id columns
    # to the length of the token vector
    rowwise() %>%
    do(data.frame(
        native  = .$native,
        id      = .$id,
        sen_no  = .$sen_no,
        token   = .$text %>% str_split(' +') %>% unlist(),
        stringsAsFactors = FALSE
    )) %>%
    ungroup() %>%
    # number each token in each sentence
    group_by(native, id, sen_no) %>%
    mutate(
        token_no = 1:n()
    ) %>%
    ungroup() %>%
    # reorder columns
    select(native, id, sen_no, token_no, token)
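
# at this stage tokens are only whitespace-split, so punctuation and case
# are still attached (the example sentence is made up):
str_split('I like English.', ' +') %>% unlist()
#> [1] "I"        "like"     "English."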


# cleaned tokens ----
tokens <- tokens_raw %>%
    mutate(
        token = token %>%
                str_replace_all('\\W', ' ') %>%
                str_replace_all(' +', ' ') %>%
                str_trim() %>%
                tolower()
    ) %>%
    # filter out empty tokens
    filter(
        str_length(token) > 0
    )
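
# what the cleaning chain does to a single token (made-up example); note
# that internal apostrophes become spaces, so a contraction like "Don't"
# survives as the two-word token 'don t':
"Don't!" %>%
    str_replace_all('\\W', ' ') %>%
    str_replace_all(' +', ' ') %>%
    str_trim() %>%
    tolower()
#> [1] "don t"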


# save to csv ----
# write.csv() errors out if the target directory is missing, so create it
dir.create('data', showWarnings = FALSE)

write.csv(essays,     file = 'data/essays.csv',     row.names = FALSE)
write.csv(tokens_raw, file = 'data/tokens_raw.csv', row.names = FALSE)
write.csv(tokens,     file = 'data/tokens.csv',     row.names = FALSE)
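
# note for reading the files back: pass colClasses so that ids like '001'
# keep their leading zeros (read.csv would otherwise coerce them to the
# integer 1); 'essays_back' is just an illustrative name
essays_back <- read.csv('data/essays.csv',
                        colClasses = c(id = 'character'),
                        stringsAsFactors = FALSE)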