alexanderholt
10/25/2017 - 2:40 PM

NLTK

Natural Language Toolkit (NLTK) — a Python library for working with human-language data.

Lemmatization — reducing words to their dictionary base form (lemma).

# Sample text that we want to lemmatize.
string = 'some string of characters'

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Create a lemmatizer instance.
lemmatizer = WordNetLemmatizer()

# Before we can lemmatize the text we need to tokenize it.
# The pattern r'\w+' grabs runs of word characters, so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')

# Lowercase the whole string first, then split it into a list of word tokens.
string_tokens = tokenizer.tokenize(string.lower())

# Lemmatize each token individually, keeping the results in order.
tokens_lem = [lemmatizer.lemmatize(word) for word in string_tokens]