Btibert3
1/12/2011 - 10:55 PM

Correlate 6-year Grad Rate with Freshmen Yield Rate

Correlate 6-year Grad Rate with Freshmen Yield Rate

################################################################
# Author:  	@BrockTibert
# Date: 	January 2011
# 
# Purpose: 	Study the relationship of freshmen yield rate and grad rate
# 			Comes from Chronicle blog post on research done about college selection
#			and more likely to be selected if have higher grad rate.  Let's use
#			actual data to see if this is true.
#
# http://chronicle.com/blogs/headcount/how-graduation-rates-shape-college-choice/27770		
#
# Windows 7 Pro, R 2.10
#
################################################################

# initial setup
DIR <- "C:/Users/Brock/Documents/My Dropbox/Eclipse/Projects/R/IPEDS/Grad and Yield Rate/"
setwd(DIR)

# Base URL for the IPEDS data raw datasets
BASE_URL <- 'http://nces.ed.gov/ipeds/datacenter/data/'

# the files of interest
FILES <- c('GR2009', 'IC2009', 'HD2009')


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Grab and unzip the datasets
# An example path: http://nces.ed.gov/ipeds/datacenter/data/EF2004D.zip
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# download the files
for(i in FILES) {
	
	# build the URL String
	URL <- paste(BASE_URL, i, ".zip", sep="")
	
	# build the destination -- needs to include path AND file
	dest <- paste(DIR, i, ".zip", sep="")
	
	# fetch the file
	download.file(URL, destfile=dest, mode="wb")
	
}

# unzip the files
for (i in dir(pattern="\\.zip$"))
	unzip(i)

# delete the zip files if you want
for (i in dir(pattern="\\.zip$"))
	unlink(i)



#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Read in the datasets, filter, clean as needed
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#------------------------------------------
# app and yield data
#------------------------------------------

# read in the data
app.raw <- read.table("ic2009.csv", header=T, stringsAsFactors=F, sep=",")
dim(app.raw)

# filter columns -- pull some extra fields in case I want to use them
app <- app.raw[c("unitid", "applcn", "admssn", "enrlt", "satactdt",
				"satnum", "satpct", "actnum", "actpct", "satvr25",
				"satvr75", "satmt25", "satmt75", "actcm25", "actcm75",
				"acten25", "acten75", "actmt25", "actmt75",
				"tuitvary")]
rm(app.raw)



#------------------------------------------
# directory info so we can filter on 4-year, etc.
#------------------------------------------

# read in the raw data
dir.raw <- read.table("hd2009.csv", header=T, sep=",", stringsAsFactors=F)
dim(dir.raw)

# filter the rows/columns I want
dir.raw <- dir.raw[dir.raw$sector %in% c(1,2), ]
dir.raw <- dir.raw[dir.raw$obereg %in% c(1,2), ]
dir.raw <- dir.raw[dir.raw$deggrant==1, ]
dir.raw <- dir.raw[dir.raw$pset4flg==1, ]
inst <- dir.raw[c("unitid", "instnm", "sector", "addr", "stabbr", "zip",
				"carnegie", "locale")]
rm(dir.raw)



#------------------------------------------
# directory info so we can filter on 6-year rate
# prefer to do 4-year, but 6 is more common
#
# definition:  	total cohort considered = GRRACE24 when GRTYPE=8
#				grad in 6-years = GRRACE24 when GRTYPE=12
#
# NOTES:		These data are multi-record per school
#				need to restructure - will use reshape package
#------------------------------------------

# read in the data
grad.raw <- read.table("gr2009.csv", header=T, sep=",", stringsAsFactors=F)
dim(grad.raw)

# filter the rows
grad.temp <- grad.raw[grad.raw$GRTYPE %in% c(8,12),]
grad.temp <- grad.temp[c("UNITID", "GRTYPE", "GRTOTLT")]  # there isn't a GRRACE24

# reshape the data
library(reshape)
grad <- cast(grad.temp, UNITID ~ GRTYPE)
colnames(grad) <- c("unitid", "cohort", "grad")
grad$sixyear<- grad$grad/grad$cohort

rm(grad.raw, grad.temp)




#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Merge the datasets
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# merge all app data that matches are filtered school list
ds <- merge(x=inst, y=app, by.x="unitid", by.y="unitid", all.x=T)

# merge on the grad rate data
ds <- merge(x=ds, y=grad, by.x="unitid", by.y="unitid", all.x=T)

# calculate a variable for yield rate
ds$yield <- ds$enrlt/ds$admssn


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Finally.... let's play with the data!
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# quick look at the variables of interest
summary(ds[c("sector", "sixyear", "yield")])

# lets correlate the two, ignoring sector
cor(ds$sixyear, ds$yield, use="complete.obs", method="pearson")
#hmmmm - not what I thought we would see

# just keep the complete observations for sixyear and yield
# could use ?complete.cases or na.omit, but looks across whole data frame
ds.comp <- ds[!is.na(ds$sixyear), ]
ds.comp <- ds.comp[!is.na(ds.comp$yield), ]
ds.comp <- ds.comp[ds.comp$yield <= .85, ]  # throw out cases with very high yield, Harvard=80%


# it's easier to see the relationships visually
plot(ds.comp$sixyear, ds.comp$yield,
		xlab="6-year Rate", ylab="Yield Rate",
		main="Grad Rate and Yield Rate - 2009 Data",
		pch=20)

# No real pattern - article says higher grad rates increased chances of selection as
#					as grad rates increased. To be fair, they used public schools and pitted
#					one school against another, all else equal.  Reality is that 
#					the selection set is much larger

# filter on public
public <- ds.comp[ds.comp$sector==1, ]
plot(public$sixyear, public$yield,
		xlab="6-year Rate", ylab="Yield Rate",
		main="Grad Rate and Yield Rate - 2009 Data - Public Schools Only",
		pch=20)


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Clean up R session
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
rm(list=ls())
q()
n