R code to work with CEEB cleaning
options(stringsAsFactors = F)
## the path
PATH = "N:/Enrollment Management/Documentation/Salesforce CRM/CEEB_Code-GC_Dbase"
## the libraries
library(RODBC)
library(stringr)
library(dplyr)
## connect to the access database
setwd(PATH)
list.files()
ch = odbcConnectAccess2007("CEEB_DBase_05192015.accdb")
sqlTables(ch)
## bring in the cboard
list.files()
cboard = read.csv("high-school-directory.csv", as.is=T, colClasses = "character")
head(cboard)
## ets high school raw
ets = sqlFetch(ch, "ETS_HS_CEEB", as.is=T)
head(ets)
nrow(ets) == length(unique(ets$ETS_HS_CEEB))
## ets college raw
ets_coll = sqlFetch(ch, "ETS_COLL_CEEB", as.is=T)
head(ets_coll)
nrow(ets_coll) == length(unique(ets_coll$ETS_COLL_CEEB))
## ets college raw
ets_coll = sqlFetch(ch, "ETS_COLL_CEEB", as.is=T)
head(ets_coll)
nrow(ets_coll) == length(unique(ets_coll$ETS_COLL_CEEB))
## collegeboard
cboard = sqlFetch(ch, "CollegeBoard_HS_Directory", as.is=T)
head(cboard)
colnames(cboard) = gsub(" ", "_", colnames(cboard))
nrow(cboard) == length(unique(cboard$AI_Code))
## what are the ceebs that are in there twice
dupes = tbl_df(cboard) %>%
group_by(AI_Code) %>%
summarise(tot = length(AI_Code)) %>%
filter(tot > 1)
dupes = subset(cboard, AI_Code %in% dupes$AI_Code)
dupes = arrange(dupes, AI_Code)
View(dupes)
## bring in the master population
dat = sqlFetch(ch, "TOTAL_CollegeBoard-ETS", as.is=T)
colnames(dat) = gsub(" ", "_", colnames(dat))
colnames(dat) = gsub("/", "_", colnames(dat))
colnames(dat) = tolower(colnames(dat))
## cross the high school types
tbl_df(dat) %>%
group_by(high_school_type, ets_hs_school_type) %>%
summarise(tot = length(clean_ceeb))
options(stringsAsFactors = F)
## the path
# PATH = "C:\\Users\\btibert\\Google Drive\\Bentley\\ceeb-project"
PATH = "C:/Users/btibert/Google Drive/Bentley/ceeb-project"
## the libraries
library(RODBC)
library(stringr)
library(dplyr)
## connect to the access database
setwd(PATH)
list.files()
ch = odbcConnectAccess2007("CEEB_Code-GC_Dbase/CEEB_DBase_05192015.accdb")
sqlTables(ch)[3:4]
# ## bring in the cboard
# list.files()
# cboard = read.csv("high-school-directory.csv", as.is=T, colClasses = "character")
#
#
# ## ets high school raw
# ets = sqlFetch(ch, "ETS_HS_CEEB", as.is=T)
# head(ets)
# nrow(ets) == length(unique(ets$ETS_HS_CEEB))
#
#
# ## ets college raw
# ets_coll = sqlFetch(ch, "ETS_COLL_CEEB", as.is=T)
# head(ets_coll)
# nrow(ets_coll) == length(unique(ets_coll$ETS_COLL_CEEB))
#
#
# ## ets college raw
# ets_coll = sqlFetch(ch, "ETS_COLL_CEEB", as.is=T)
# head(ets_coll)
# nrow(ets_coll) == length(unique(ets_coll$ETS_COLL_CEEB))
#
# ## collegeboard
# cboard = sqlFetch(ch, "CollegeBoard_HS_Directory", as.is=T)
# head(cboard)
# colnames(cboard) = gsub(" ", "_", colnames(cboard))
# nrow(cboard) == length(unique(cboard$AI_Code))
#
#
# ## what are the ceebs that are in there twice
# dupes = tbl_df(cboard) %>%
# group_by(AI_Code) %>%
# summarise(tot = length(AI_Code)) %>%
# filter(tot > 1)
# dupes = subset(cboard, AI_Code %in% dupes$AI_Code)
# dupes = arrange(dupes, AI_Code)
# View(dupes)
#
#
# ## bring in the master population
# dat = sqlFetch(ch, "TOTAL_CollegeBoard-ETS", as.is=T)
# colnames(dat) = gsub(" ", "_", colnames(dat))
# colnames(dat) = gsub("/", "_", colnames(dat))
# colnames(dat) = tolower(colnames(dat))
#
#
# ## cross the high school types
# tbl_df(dat) %>%
# group_by(high_school_type, ets_hs_school_type) %>%
# summarise(tot = length(clean_ceeb))