martinctc
5/18/2017 - 4:04 PM

Missing Values Exercise - Read in data from XLSX file, using the readxl package - Combine two data sets using merge(), referencing with a ke

Missing Values Exercise

• Combine two data sets using merge(), referencing with a key
• complete.cases() and filter() to create a data frame without missing values
#Read copied folder path as something that R can read in properly

library(tidyverse)
library(stringr)
library(forcats)

setwd("C:\\Users\\Martin.Chan\\Desktop\\R workdesk\\missing values")

names(dt.combined)

glimpse(dt.combined)
dt.combined\$countblank <-as.factor(dt.combined\$countblank)

dt.combined\$countblank%>%
is.na%>%
sum()

#get row numbers where countblank is NA
ind <-  dt.combined\$countblank%>%
is.na()%>%
which()

#get row numbers where countblank is NOT NA
ind.nona<-which(complete.cases(dt.combined\$countblank)==TRUE)

#column type corrections
dt.combined\$pid <-as.character(dt.combined\$pid)
dt.combined\$respid <-as.character(dt.combined\$respid)

#Replace the meaningful NAs with "Blank"
dt.combined[ind.nona,5:8]%>%
replace_na("Blank")

dt.complete <- dt.combined[ind.nona,]
names(dt.complete)

summary(dt.complete[,5:8])
# dt.complete[,5:8]%>%
#   replace_na(list("BLANK"))

#Replace all NAs with 0s in this complete case subset
for(i in 5:8){
x<-dt.complete[,i]%>%
is.na()%>%
which()

dt.complete[x,i]<-0
}
dt.complete[,5:8]<-lapply(dt.complete[,5:8],as.factor)

str(dt.complete)
dt.complete<-select(dt.complete,pid,5:8)
names(dt.complete)<-c("ID","howner","rentpl","rentha","lpa")

dt.combined <-merge(dt.combined,dt.complete,by.x="pid",by.y="ID",all.x=TRUE,all.y=TRUE)

summary(dt.combined)
glimpse(dt.combined)

dt.combined[,3:10]<-lapply(dt.combined[,3:10],as.factor) #Coerce multiple columns into factor

levels(dt.combined\$howner)

dt.combined %>%
select(pid,11:14)%>%
write_excel_csv("corrected with NAs 19-05-17.csv")

dt.filtered <- filter(dt.combined,complete.cases(dt.combined\$countblank)) #filter by data only with non-NAs in a column
#glimpse(dt.filtered)
dt.filtered<-lapply(dt.filtered[,1:10],as.factor) #Coerce multiple columns into factor
write_excel_csv(dt.filtered,"filtered segment file.csv")