yankov
2/3/2015 - 7:47 PM

Partial one-hot encoder

Partial one-hot encoder

from scipy import sparse


class OneHotEncoder:
    """
    OneHotEncoder takes data matrix with categorical columns and
    converts it to a sparse binary matrix doing one-of-k encoding.

    The encoder keeps one dictionary per input column in ``self.keymap``;
    each dictionary maps a label to its position inside that column's
    block of the output matrix. Positions are assigned in first-seen
    order and are never reassigned, so labels learned in earlier
    ``partial_fit`` batches keep their output position.

    Parts of code borrowed from Paul Duan (www.paulduan.com)
    Licence: MIT (https://github.com/pyduan/amazonaccess/blob/master/MIT-LICENSE)
    """

    def __init__(self):
        # List of {label: index} dicts, one per column; None until fitted.
        self.keymap = None

    @staticmethod
    def _extend_keymap(km, col):
        """Add every unseen label of ``col`` to ``km`` at the next free index."""
        for val in col:
            if val not in km:
                km[val] = len(km)
        return km

    def fit(self, x):
        """
        Learn the label -> index mapping of every column of ``x`` from scratch.

        Any previously learned mapping is discarded.

        :param x: 2-D data matrix exposing ``.T`` (iterated column by column).
        :return: the encoder itself, so calls can be chained.
        """
        self.keymap = [self._extend_keymap({}, col) for col in x.T]
        return self

    def partial_fit(self, x):
        """
        This method can be used for doing one hot encoding in mini-batch mode.

        The first call behaves like ``fit``. Later calls only append labels
        that have not been seen before; indices of known labels are left
        untouched so that matrices produced from different batches agree on
        the meaning of their leading columns.

        :param x: 2-D data matrix exposing ``.T`` (iterated column by column).
        :return: the encoder itself, so calls can be chained.
        :raises IndexError: if ``x`` has more columns than the fitted data.
        """
        if self.keymap is None:
            return self.fit(x)

        for i, col in enumerate(x.T):
            self._extend_keymap(self.keymap[i], col)
        return self

    def transform(self, x):
        """
        Encode ``x`` as a sparse one-of-k matrix.

        If the encoder has not been fitted yet it is fitted on ``x`` first.
        A value that is absent from the column's mapping produces no entry,
        i.e. that column's block stays all-zero for the row.

        :param x: 2-D data matrix exposing ``.T`` and ``.shape``.
        :return: CSR matrix with ``x.shape[0]`` rows and one column per
            learned label of each processed input column.
        :raises IndexError: if ``x`` has more columns than the fitted data.
        """
        if self.keymap is None:
            self.fit(x)

        # Collect the coordinates of all ones and build the matrix in one go
        # instead of assigning into a sparse matrix element by element.
        rows = []
        cols = []
        offset = 0  # first output column of the current input column's block
        for i, col in enumerate(x.T):
            km = self.keymap[i]
            for j, val in enumerate(col):
                idx = km.get(val)
                if idx is not None:
                    rows.append(j)
                    cols.append(offset + idx)
            offset += len(km)

        data = [1.0] * len(rows)
        return sparse.csr_matrix((data, (rows, cols)), shape=(x.shape[0], offset))