alexanderholt
10/25/2017 - 3:02 PM

tf-idf document comparison

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Two short example documents whose TF-IDF weights are compared side by side.
string = 'a string of characters'
another_string = 'a comparison string of characters'
documents = [string, another_string]

# stop_words='english' makes the vectorizer ignore common English words
# (articles, prepositions, etc.) when building its vocabulary.
tvec = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and produce the weighted document-term matrix in one
# pass; tvec is left fitted, exactly as after a separate fit() + transform().
tfidf = tvec.fit_transform(documents)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on older releases that do not provide it yet.
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# One row per document, one column per vocabulary term. toarray() yields a
# plain ndarray rather than the legacy numpy.matrix that todense() returns.
df = pd.DataFrame(tfidf.toarray(),
                  columns=feature_names,
                  index=['string', 'another_string'])