hermesh2
10/26/2015 - 1:44 AM

Clean non english characters from a Portugues data base.R

library(rvest)
library(data.table)
library(dplyr)
# Download the Spanish-Wikipedia page for the 2014/15 Primeira Liga season.
# The iconv() calls further down treat the scraped text as UTF-8, so the
# page is parsed as UTF-8 as well (the previous value, "Windos-1251", was a
# misspelling and not a valid encoding name).
lpt <- read_html(
  "https://es.wikipedia.org/wiki/Primeira_Liga_2014/15",
  encoding = "UTF-8"
)

# Parse every <table> node on the page and keep the 11th one. Any other
# index (from one to eleven) can be chosen instead.
# NOTE(review): the index is positional, so it will silently point at a
# different table if the page layout changes -- confirm before re-running.
data <- lpt %>%
  html_nodes("table") %>%
  html_table(fill = TRUE) %>%
  .[[11]]

# Re-encode the two text columns from UTF-8 to latin1 (the encoding used on
# Windows). iconv() yields NA for any string it cannot convert.
data$Jugador <- iconv(x = data$Jugador, from = "UTF-8", to = "latin1")
data$Equipo <- iconv(x = data$Equipo, from = "UTF-8", to = "latin1")
data

# Convert the scraped table to a data.table so columns can be updated with :=.
data <- data.table(data)

# Turn a character vector of names into accent-free, upper-case,
# underscore-separated factor labels.
#
# x: character vector (NA entries stay NA).
# Returns a factor of the same length as x.
normalize_label <- function(x) {
  # Lower-case first so that only the lower-case accented letters need to be
  # listed in the translation table below.
  lowered <- tolower(x)

  # One-to-one replacement: the i-th character of `old` becomes the i-th
  # character of `new`.
  # NOTE(review): letters missing from `old` (for example "â" or "ü") are
  # left accented -- extend both strings together if they show up.
  unaccented <- chartr(
    old = "áéíóúàèìòùãõçñôê",
    new = "aeiouaeiouaocnoe",
    x = lowered
  )

  underscored <- gsub(pattern = " ", replacement = "_", x = toupper(unaccented))

  # NOTE(review): a double underscore (two adjacent spaces in the input) is
  # removed entirely, which joins the neighbouring words; kept as in the
  # original script -- confirm that a single "_" was not intended.
  collapsed <- gsub(pattern = "__", replacement = "", x = underscored)

  factor(collapsed)
}

# Erase the non-English characters from Jugador and Equipo. Inside
# data[ , j ] the columns are already in scope, so no data$ prefix is needed.
data[, Jugador := normalize_label(Jugador)]
data[, Equipo := normalize_label(Equipo)]
data