stevenbeales
12/28/2018 - 3:17 AM

Spelling clustering

Cluster words by their spelling, using features built from syllables, phonemes, vowels, and stress markers
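
The parser below assumes the syllabified CMU dictionary format: each line is a word followed by its phones, '-' marks a syllable boundary, and a trailing digit on a phone gives the stress of that syllable's vowel (a made-up example line: ABACUS  AE1 - B AH0 - K AH0 S).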

from __future__ import print_function
import csv
import re
from time import time

from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from numpy import random


def parse_features(line):
    """Parse one dictionary line into a word and its spelling features."""
    raw_text = re.split('[ \n]+', line)
    if not raw_text[-1]:
        raw_text = raw_text[:-1]
    # Alternate pronunciations look like WORD(2); strip the variant marker
    if raw_text[0][-1] == ')':
        lexeme = raw_text[0][:-3]
    else:
        lexeme = raw_text[0]

    features = {}

    syllables = []
    stresses = []
    vowels = []
    syllable = []
    stress = 0
    vowel = None
    for letter in raw_text[1:]:
        if letter == '-':
            # Syllable boundary: record the finished syllable and its vowel/stress
            syllables.append(syllable)
            features["syl{}".format(len(syllables))] = ','.join(syllable)
            stresses.append(stress)
            vowels.append(vowel)
            features["vowels{}".format(len(vowels))] = vowel or ''
            syllable = []
        else:
            if letter[-1].isdigit():
                # A trailing digit marks the stress of this syllable's vowel
                stress = int(letter[-1])
                phone = letter[:-1]
                vowel = phone
            else:
                phone = letter
            syllable.append(phone)
    # Close out the final syllable (no trailing '-')
    syllables.append(syllable)
    features["syl{}".format(len(syllables))] = ','.join(syllable)
    stresses.append(stress)
    # DictVectorizer needs string or numeric values, so encode the stress
    # pattern as one string (e.g. [1, 0, 0] -> '100')
    features["stresses"] = ''.join(str(s) for s in stresses)
    vowels.append(vowel)
    features["vowels{}".format(len(vowels))] = vowel or ''

    return {'word': lexeme.lower(), 'features': features}
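
# A quick sanity check on the hypothetical line above:
# parse_features('ABACUS  AE1 - B AH0 - K AH0 S')
# => {'word': 'abacus',
#     'features': {'syl1': 'AE', 'vowels1': 'AE',
#                  'syl2': 'B,AH', 'vowels2': 'AH',
#                  'syl3': 'K,AH,S', 'vowels3': 'AH',
#                  'stresses': '100'}}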


def read_data(file_path):
    with open(file_path) as f:
        content = f.readlines()
    # Skip the comment/license header at the top of the dictionary file
    return content[115:]


def vectorize(data):
    vectorizer = DictVectorizer()
    return vectorizer.fit_transform([d['features'] for d in data])
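
# On the ABACUS example this turns {'syl1': 'AE', ..., 'stresses': '100'} into
# binary columns named 'syl1=AE', 'stresses=100', and so on: DictVectorizer
# one-hot encodes string-valued features, so the result is a sparse matrix
# with one row per word.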


def cluster(X, num_clusters, mini=False):
    if mini:
        km = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++',
                             n_init=1, init_size=1000, batch_size=1000,
                             verbose=True)
    else:
        km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
                    n_init=1, verbose=True)
    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()
    return km
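
# MiniBatchKMeans fits on small random batches instead of the full matrix,
# which is much faster on large samples at the cost of slightly noisier
# clusters; flip the mini flag in the call below to try it.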


###############################################################################
# Load the words and their spellings
file_content = read_data('cmudict.06d.syl')
all_data = [parse_features(line) for line in file_content]

# Get a manageable random sample of the data (without replacement, so no
# duplicate rows) and vectorize the features
sample_data = random.choice(all_data, 20000, replace=False)
X = vectorize(sample_data)

# Cluster the data
clustered = cluster(X, 75, False)

# Look up each word's cluster assignment; fit() already stored these in
# labels_, so there is no need to call predict() on the same matrix
labels = clustered.labels_
like_words = {}
for idx, c in enumerate(labels):
    like_words.setdefault(c, []).append(sample_data[idx]['word'])

# Save the words organized by cluster, one cluster per row
# (newline='' keeps the csv module from writing blank lines on Windows)
with open('word_clusters.csv', 'w', newline='') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerows(like_words.values())
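
To eyeball the result afterwards (a minimal sketch; note the CSV keeps one cluster per row but drops the cluster ids, so rows appear in first-encountered order):

import csv
with open('word_clusters.csv') as fp:
    for row in csv.reader(fp):
        print(', '.join(row[:10]))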