fujiyuu75
5/10/2017 - 11:16 AM

sklearn.bow.sample.py

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words demo: turn a tiny corpus into a document-term count matrix.
vectorizer = CountVectorizer()

corpus = [
    'Steve Jobs biological father, Abdulfattah John Jandali.',
    'was born into an Arab Muslim household.',
    'Jobs moved back to the San Francisco Bay Area.',
]

# Learn the vocabulary and count term occurrences per document in one call.
# The result has one row per document and one column per vocabulary term.
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# .tolist() converts the returned array to a plain list of str so the printed
# form matches the sample output below.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

print(feature_names)
# ['abdulfattah', 'an', 'arab', 'area', 'back', 'bay', 'biological', 'born', 'father', 'francisco', 'household', 'into', 'jandali', 'jobs', 'john', 'moved', 'muslim', 'san', 'steve', 'the', 'to', 'was']

# Densify only for display; columns follow the order of the names printed above.
print(X.toarray())
#[[1 0 0 0 0 0 1 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0]
# [0 1 1 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 1]
# [0 0 0 1 1 1 0 0 0 1 0 0 0 1 0 1 0 1 0 1 1 0]]