Missing Values Exercise
#readClipboard()
#Read copied folder path as something that R can read in properly
library(readxl)
library(tidyverse)
library(stringr)
library(forcats)
setwd("C:\\Users\\Martin.Chan\\Desktop\\R workdesk\\missing values")
add.dt<-read_excel("ORD-272769-F4S6 - Additional info for UK.xlsx",sheet="Additional info",na="NA")
seg.dt <-read_excel("ORD-272769-F4S6 - Additional info for UK.xlsx",sheet="Segments")
dt.combined <- merge(x=add.dt,y=seg.dt,by.x="pid",by.y="pid2")
names(dt.combined)
glimpse(dt.combined)
dt.combined$countblank <-as.factor(dt.combined$countblank)
dt.combined$countblank%>%
is.na%>%
sum()
#get row numbers where countblank is NA
ind <- dt.combined$countblank%>%
is.na()%>%
which()
#get row numbers where countblank is NOT NA
ind.nona<-which(complete.cases(dt.combined$countblank)==TRUE)
#column type corrections
dt.combined$pid <-as.character(dt.combined$pid)
dt.combined$respid <-as.character(dt.combined$respid)
#Replace the meaningful NAs with "Blank"
dt.combined[ind.nona,5:8]%>%
replace_na("Blank")
dt.complete <- dt.combined[ind.nona,]
names(dt.complete)
summary(dt.complete[,5:8])
# dt.complete[,5:8]%>%
# replace_na(list("BLANK"))
#Replace all NAs with 0s in this complete case subset
for(i in 5:8){
x<-dt.complete[,i]%>%
is.na()%>%
which()
dt.complete[x,i]<-0
}
dt.complete[,5:8]<-lapply(dt.complete[,5:8],as.factor)
str(dt.complete)
dt.complete<-select(dt.complete,pid,5:8)
names(dt.complete)<-c("ID","howner","rentpl","rentha","lpa")
dt.combined <-merge(dt.combined,dt.complete,by.x="pid",by.y="ID",all.x=TRUE,all.y=TRUE)
summary(dt.combined)
glimpse(dt.combined)
dt.combined[,3:10]<-lapply(dt.combined[,3:10],as.factor) #Coerce multiple columns into factor
levels(dt.combined$howner)
dt.combined %>%
select(pid,11:14)%>%
write_excel_csv("corrected with NAs 19-05-17.csv")
dt.filtered <- filter(dt.combined,complete.cases(dt.combined$countblank)) #filter by data only with non-NAs in a column
#glimpse(dt.filtered)
dt.filtered<-lapply(dt.filtered[,1:10],as.factor) #Coerce multiple columns into factor
write_excel_csv(dt.filtered,"filtered segment file.csv")