# arbelt
# 10/9/2014 - 5:17 PM

# gistfile1.r


## ----, include=FALSE-----------------------------------------------------
library(knitr)


## ------------------------------------------------------------------------
source("pre_azw.R")


## ------------------------------------------------------------------------
# Language names: the first column of the header-less file languages.csv.
# strip.white = TRUE removes leading/trailing blanks from unquoted fields.
languages <- read.csv(
  "languages.csv",
  header = FALSE,
  stringsAsFactors = FALSE,
  strip.white = TRUE
)[[1]]

# Drop missing and empty entries. The names are later collapsed with "|" into
# a regular expression, where an empty alternative would match every string
# and an NA would be pasted in as the literal text "NA".
languages <- languages[!is.na(languages) & nzchar(languages)]


## ------------------------------------------------------------------------
# Every language name as one alternation group: "(name1|name2|...)".
lang_regex <- paste0("(", paste(languages, collapse = "|"), ")")

# A proficiency level, then whitespace, then any of the language names.
lang_level_regex <- paste0(
  "(Elementary|Intermediate|Advanced)\\s+",
  lang_regex
)

# The flags below are logical vectors with one element per row of `courses`.
# NOTE(review): `courses` is not defined in this file; presumably it is
# created by pre_azw.R -- confirm.

# TRUE where the course title contains a language name anywhere.
lang_in_title <- grepl(lang_regex, courses$course_title)

# TRUE where the title or the sub-title contains "<level> <language>".
lang_level_in_title <- with(
  courses,
  grepl(lang_level_regex, course_title) | grepl(lang_level_regex, sub_title)
)

# TRUE where the title starts with a word, a space, a capital A-C, optional
# lowercase letters and then a period.
has_lang_code <- grepl(
  "^\\w+ [A-C][a-z]*\\.",
  courses$course_title,
  perl = TRUE
)

# TRUE where the description mentions "language course" (case-insensitive).
lang_in_desc <- grepl(
  "language course",
  courses$course_description,
  ignore.case = TRUE
)


## ------------------------------------------------------------------------
# Courses flagged by at least one of: a "language course" mention in the
# description, the course-code pattern in the title, or "<level> <language>"
# in the title or sub-title.
matched <- courses %>%
  filter(lang_in_desc | has_lang_code | lang_level_in_title)

# Courses whose title contains a language name but that none of the flags
# above caught. The flags are computed per row of `courses`, so negating the
# `matched` condition selects exactly the rows that are absent from `matched`,
# without an implicit join on every shared column (and its "Joining, by"
# message).
unmatched <- courses %>%
  filter(
    lang_in_title & !(lang_in_desc | has_lang_code | lang_level_in_title)
  )