Btibert3
5/20/2015 - 5:46 PM

R code to work with CEEB cleaning

R code to work with CEEB cleaning

options(stringsAsFactors = F)


## the path
PATH = "N:/Enrollment Management/Documentation/Salesforce CRM/CEEB_Code-GC_Dbase"

## the libraries
library(RODBC)
library(stringr)
library(dplyr)

## connect to the access database
setwd(PATH)
list.files()
ch = odbcConnectAccess2007("CEEB_DBase_05192015.accdb")
sqlTables(ch)


## bring in the cboard
list.files()
cboard = read.csv("high-school-directory.csv", as.is=T, colClasses = "character")
head(cboard)


## ets high school raw
ets = sqlFetch(ch, "ETS_HS_CEEB", as.is=T)
head(ets)
nrow(ets) ==  length(unique(ets$ETS_HS_CEEB))


## ets college raw
ets_coll = sqlFetch(ch, "ETS_COLL_CEEB", as.is=T)
head(ets_coll)
nrow(ets_coll) ==  length(unique(ets_coll$ETS_COLL_CEEB))


## ets college raw
ets_coll = sqlFetch(ch, "ETS_COLL_CEEB", as.is=T)
head(ets_coll)
nrow(ets_coll) ==  length(unique(ets_coll$ETS_COLL_CEEB))

## collegeboard
cboard = sqlFetch(ch, "CollegeBoard_HS_Directory", as.is=T)
head(cboard)
colnames(cboard) = gsub(" ", "_", colnames(cboard))
nrow(cboard) == length(unique(cboard$AI_Code))


## what are the ceebs that are in there twice
dupes = tbl_df(cboard) %>%
 group_by(AI_Code) %>%
 summarise(tot = length(AI_Code)) %>%
 filter(tot > 1)
dupes = subset(cboard, AI_Code %in% dupes$AI_Code)
dupes = arrange(dupes, AI_Code)
View(dupes)


## bring in the master population
dat = sqlFetch(ch, "TOTAL_CollegeBoard-ETS", as.is=T)
colnames(dat) = gsub(" ", "_", colnames(dat))
colnames(dat) = gsub("/", "_", colnames(dat))
colnames(dat) = tolower(colnames(dat))


## cross the high school types
tbl_df(dat) %>% 
 group_by(high_school_type, ets_hs_school_type) %>% 
 summarise(tot = length(clean_ceeb))
 
options(stringsAsFactors = F)

## the path
# PATH = "C:\\Users\\btibert\\Google Drive\\Bentley\\ceeb-project"
PATH = "C:/Users/btibert/Google Drive/Bentley/ceeb-project"

## the libraries
library(RODBC)
library(stringr)
library(dplyr)

## connect to the access database
setwd(PATH)
list.files()
ch = odbcConnectAccess2007("CEEB_Code-GC_Dbase/CEEB_DBase_05192015.accdb")
sqlTables(ch)[3:4]


# ## bring in the cboard
# list.files()
# cboard = read.csv("high-school-directory.csv", as.is=T, colClasses = "character")
# 
# 
# ## ets high school raw
# ets = sqlFetch(ch, "ETS_HS_CEEB", as.is=T)
# head(ets)
# nrow(ets) ==  length(unique(ets$ETS_HS_CEEB))
# 
# 
# ## ets college raw
# ets_coll = sqlFetch(ch, "ETS_COLL_CEEB", as.is=T)
# head(ets_coll)
# nrow(ets_coll) ==  length(unique(ets_coll$ETS_COLL_CEEB))
# 
# 
# ## ets college raw
# ets_coll = sqlFetch(ch, "ETS_COLL_CEEB", as.is=T)
# head(ets_coll)
# nrow(ets_coll) ==  length(unique(ets_coll$ETS_COLL_CEEB))
# 
# ## collegeboard
# cboard = sqlFetch(ch, "CollegeBoard_HS_Directory", as.is=T)
# head(cboard)
# colnames(cboard) = gsub(" ", "_", colnames(cboard))
# nrow(cboard) == length(unique(cboard$AI_Code))
# 
# 
# ## what are the ceebs that are in there twice
# dupes = tbl_df(cboard) %>%
#  group_by(AI_Code) %>%
#  summarise(tot = length(AI_Code)) %>%
#  filter(tot > 1)
# dupes = subset(cboard, AI_Code %in% dupes$AI_Code)
# dupes = arrange(dupes, AI_Code)
# View(dupes)
# 
# 
# ## bring in the master population
# dat = sqlFetch(ch, "TOTAL_CollegeBoard-ETS", as.is=T)
# colnames(dat) = gsub(" ", "_", colnames(dat))
# colnames(dat) = gsub("/", "_", colnames(dat))
# colnames(dat) = tolower(colnames(dat))
# 
# 
# ## cross the high school types
# tbl_df(dat) %>% 
#  group_by(high_school_type, ets_hs_school_type) %>% 
#  summarise(tot = length(clean_ceeb))