"""Cluster words by features of their syllables, phonemes, vowels, and stresses."""
from __future__ import print_function
import csv
import re
from time import time
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from numpy import random
def parse_features(line):
    """Parse one dictionary line into a word and a dict of pronunciation features."""
    raw_text = re.split('[ \n]+', line)
    if not raw_text[-1]:
        raw_text = raw_text[:-1]
    token = raw_text[0]
    # Alternate pronunciations are listed as WORD(2), WORD(3), ...; strip the suffix
    lexeme = token[:-3] if token.endswith(')') else token
    features = {}
    syllables = []
    stresses = []
    vowels = []
    syllable = []
    stress = 0
    vowel = None
    for letter in raw_text[1:]:
        if letter == '-':
            # End of a syllable: record its phones, its stress, and its vowel
            syllables.append(syllable)
            features['syl{}'.format(len(syllables))] = ','.join(syllable)
            stresses.append(stress)
            vowels.append(vowel)
            features['vowels{}'.format(len(vowels))] = vowel
            syllable = []
        else:
            if letter[-1].isdigit():
                # Vowel phones carry a trailing stress digit, e.g. AH0 or AE1
                stress = int(letter[-1])
                phone = letter[:-1]
                vowel = phone
            else:
                phone = letter
            syllable.append(phone)
    # The final syllable has no trailing '-', so record it here
    syllables.append(syllable)
    features['syl{}'.format(len(syllables))] = ','.join(syllable)
    stresses.append(stress)
    vowels.append(vowel)
    features['vowels{}'.format(len(vowels))] = vowel
    # DictVectorizer needs scalar values, so encode the stress pattern as a string
    features['stresses'] = ''.join(str(s) for s in stresses)
    return {'word': lexeme.lower(), 'features': features}
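# Illustrative example (the exact spacing in cmudict.06d.syl may differ):
#   parse_features('ABANDON  AH0 - B AE1 N - D AH0 N\n')
# yields {'word': 'abandon', 'features': {'syl1': 'AH', 'vowels1': 'AH',
#   'syl2': 'B,AE,N', 'vowels2': 'AE', 'syl3': 'D,AH,N', 'vowels3': 'AH',
#   'stresses': '010'}}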
def read_data(file_path):
    with open(file_path) as f:
        content = f.readlines()
    return content[115:]
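# Note: the slice in read_data assumes the first 115 lines of the dictionary
# file are a license/comment preamble; adjust the offset if your copy differs.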
def vectorize(data):
    vectorizer = DictVectorizer()
    return vectorizer.fit_transform([d['features'] for d in data])
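# Illustrative: DictVectorizer one-hot encodes string-valued features, so a
# dict like {'syl1': 'AH', 'stresses': '010'} becomes the two active columns
# 'syl1=AH' and 'stresses=010' in the sparse output matrix.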
def cluster(X, num_clusters, mini=False):
    if mini:
        km = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=True)
    else:
        km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
                    n_init=1, verbose=True)
    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()
    return km
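# Note: MiniBatchKMeans fits on small random batches, trading some cluster
# quality for a much faster fit; full KMeans is still tractable for the
# 20,000-row sample used below.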
###############################################################################
# Load the words and their spellings
file_content = read_data('cmudict.06d.syl')
all_data = [parse_features(line) for line in file_content]
# Take a manageable random sample of the data and vectorize the features,
# sampling without replacement so no word is counted twice
sample_data = random.choice(all_data, 20000, replace=False)
X = vectorize(sample_data)
# Cluster the data
clustered = cluster(X, 75, mini=False)
# Now figure out which cluster each word was assigned to; fit() already
# stored a label for every row, so there is no need to call predict() again
like_words = {}
for idx, c in enumerate(clustered.labels_):
    like_words.setdefault(c, []).append(sample_data[idx]['word'])
# Save the words organized by cluster, one row per cluster
with open('word_clusters.csv', 'w') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerows(like_words.values())
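# Quick sanity check (illustrative): print a few words from one cluster to see
# whether they share syllable and stress structure.
some_cluster = next(iter(like_words.values()))
print(some_cluster[:10])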