martinctc
5/18/2017 - 4:04 PM

Missing Values Exercise - Read in data from XLSX file, using the readxl package - Combine two data sets using merge(), referencing with a ke

Missing Values Exercise

  • Read in data from XLSX file, using the readxl package
  • Combine two data sets using merge(), referencing with a key
  • complete.cases() and filter() to create a data frame without missing values
#readClipboard()
#Read copied folder path as something that R can read in properly

library(readxl)
library(tidyverse)
library(stringr)
library(forcats)

setwd("C:\\Users\\Martin.Chan\\Desktop\\R workdesk\\missing values")
add.dt<-read_excel("ORD-272769-F4S6  - Additional info for UK.xlsx",sheet="Additional info",na="NA")
seg.dt <-read_excel("ORD-272769-F4S6  - Additional info for UK.xlsx",sheet="Segments")

dt.combined <- merge(x=add.dt,y=seg.dt,by.x="pid",by.y="pid2")

names(dt.combined)

glimpse(dt.combined)
dt.combined$countblank <-as.factor(dt.combined$countblank)

dt.combined$countblank%>%
  is.na%>%
  sum()

#get row numbers where countblank is NA
ind <-  dt.combined$countblank%>%
  is.na()%>%
  which()

#get row numbers where countblank is NOT NA
ind.nona<-which(complete.cases(dt.combined$countblank)==TRUE)

#column type corrections
dt.combined$pid <-as.character(dt.combined$pid)
dt.combined$respid <-as.character(dt.combined$respid)

#Replace the meaningful NAs with "Blank"
dt.combined[ind.nona,5:8]%>%
  replace_na("Blank")

dt.complete <- dt.combined[ind.nona,]
names(dt.complete)

summary(dt.complete[,5:8])
# dt.complete[,5:8]%>%
#   replace_na(list("BLANK"))


#Replace all NAs with 0s in this complete case subset
for(i in 5:8){
x<-dt.complete[,i]%>%
    is.na()%>%
    which()

  dt.complete[x,i]<-0
}
dt.complete[,5:8]<-lapply(dt.complete[,5:8],as.factor)


str(dt.complete)
dt.complete<-select(dt.complete,pid,5:8)
names(dt.complete)<-c("ID","howner","rentpl","rentha","lpa")

dt.combined <-merge(dt.combined,dt.complete,by.x="pid",by.y="ID",all.x=TRUE,all.y=TRUE)

summary(dt.combined)
glimpse(dt.combined)

dt.combined[,3:10]<-lapply(dt.combined[,3:10],as.factor) #Coerce multiple columns into factor

levels(dt.combined$howner)

dt.combined %>%
  select(pid,11:14)%>%
  write_excel_csv("corrected with NAs 19-05-17.csv")

dt.filtered <- filter(dt.combined,complete.cases(dt.combined$countblank)) #filter by data only with non-NAs in a column
#glimpse(dt.filtered)  
dt.filtered<-lapply(dt.filtered[,1:10],as.factor) #Coerce multiple columns into factor
write_excel_csv(dt.filtered,"filtered segment file.csv")