vgrabovets
2/21/2017 - 2:12 PM

preprocess_text

preprocess_text

import re
import unicodedata


def strip_accents(text):
    """Return *text* reduced to its plain-ASCII characters.

    The input is decomposed with Unicode NFD normalization so that an
    accented letter becomes a base letter followed by combining marks.
    Encoding to ASCII with ``'ignore'`` then discards the combining marks
    and every other character that has no ASCII representation (such
    characters are removed, not transliterated).

    Args:
        text: The string to clean.

    Returns:
        A ``str`` containing only the ASCII characters that survived.
    """
    decomposed = unicodedata.normalize('NFD', text)
    # Anything outside ASCII (combining accents included) is dropped here.
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('utf-8')
    
    
def preprocess_text(text, replace_dict=None, stop_list=None):
    """Normalize *text* into a lowercase, space-separated token string.

    Steps, in order: accents are removed with ``strip_accents``; punctuation
    and whitespace characters matched by the pattern below are replaced by
    spaces; the result is lowercased and split on whitespace; tokens found
    in *stop_list* are dropped; each remaining token that is a key of
    *replace_dict* is swapped for its mapped value.

    Note:
        Another function named ``preprocess_text`` is defined later in this
        file; if both stay in the same module the later one shadows this one.

    Args:
        text: The string to clean.
        replace_dict: Optional mapping of token -> replacement. Keys are
            compared against the lowercased tokens. ``None`` (the default)
            means no replacements are made.
        stop_list: Optional container of tokens to remove, compared against
            the lowercased tokens. ``None`` (the default) means nothing is
            removed.

    Returns:
        The surviving (and possibly replaced) tokens joined by single spaces.
    """
    # A None default must behave as "nothing to replace / nothing to drop"
    # instead of failing on the membership tests below.
    replacements = {} if replace_dict is None else replace_dict
    stop_words = () if stop_list is None else stop_list

    text = strip_accents(text)
    text = re.sub(r'[\.\,\:\;\-\/\(\)\'\|\’\s\!\?\/\\\@\#\$\%\^\&\*\+\=\~\s]', ' ', text).lower()
    # Stop words are filtered on the original token, before replacement.
    return ' '.join(
        replacements.get(w, w) for w in text.split() if w not in stop_words
    )


def preprocess_text(text, stop_list=None):
    """Normalize *text* into a lowercase, space-separated token string.

    Punctuation and whitespace characters matched by the pattern below are
    replaced by spaces, the result is lowercased and split on whitespace,
    and tokens found in *stop_list* are dropped.

    Args:
        text: The string to clean.
        stop_list: Optional container of tokens to remove, compared against
            the lowercased tokens. ``None`` (the default) means nothing is
            removed.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # A None default must behave as "no stop words" instead of failing on
    # the membership test below.
    stop_words = () if stop_list is None else stop_list

    text = re.sub(r'[\.\,\:\;\-\/\(\)\'\|\’\s\!\?\/\\\@\#\$\%\^\&\*\+\=\~\"\s]', ' ', text).lower()
    return ' '.join(w for w in text.split() if w not in stop_words)