# Hierarchical Clustering Algorithm.

"""

HIERARCHICAL CLUSTERING

input:
- DF: pandas df (rows = data to be classified, columns = features to be used to calculate distances)
- lvar: variables to be used as features
- isplot: plot or not results


source: 
Scipy Library: https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
Manual: https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/

"""
def hierarchical_clustering(DF, lvar, isplot=False):
  import numpy as np

  print('Hierarchical Clustering of: %s' % lvar)

  # prepare data: feature matrix (n_samples x n_features)
  X = DF[lvar].values  # .as_matrix() was removed in recent pandas versions

  ## Hierarchical Clustering
  from matplotlib import pyplot as plt
  from scipy.cluster.hierarchy import dendrogram, linkage
  # generate the linkage matrix
  Z = linkage(X, 'ward')
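  # each row of Z records one merge: [cluster index i, cluster index j, distance, size of the new cluster]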

  ## DENDROGRAM
  def fancy_dendrogram(*args, **kwargs):
      max_d = kwargs.pop('max_d', None)
      if max_d and 'color_threshold' not in kwargs:
          kwargs['color_threshold'] = max_d
      annotate_above = kwargs.pop('annotate_above', 0)

      ddata = dendrogram(*args, **kwargs)

      if not kwargs.get('no_plot', False):
          plt.title('Hierarchical Clustering Dendrogram (truncated)')
          plt.xlabel('sample index or (cluster size)')
          plt.ylabel('distance')
          for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
              x = 0.5 * sum(i[1:3])
              y = d[1]
              if y > annotate_above:
                  plt.plot(x, y, 'o', c=c)
                  plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                               textcoords='offset points',
                               va='top', ha='center')
          if max_d:
              plt.axhline(y=max_d, c='k')
      return ddata

  if isplot:
      fancy_dendrogram(
          Z,
          truncate_mode='lastp',
          p=20,
          leaf_rotation=90.,
          leaf_font_size=12.,
          show_contracted=True,
          annotate_above=10,  # useful in small plots so annotations don't overlap
      )
      plt.show()



  ## Elbow method to estimate the optimal number of clusters
  last = Z[-10:, 2]  # distances of the last 10 merges
  last_rev = last[::-1]
  idxs = np.arange(1, len(last) + 1)
  if isplot:
      plt.plot(idxs, last_rev)

  acceleration = np.diff(last, 2)  # 2nd derivative of the distances
  acceleration_rev = acceleration[::-1]
  if isplot:
      plt.plot(idxs[:-2] + 1, acceleration_rev)
      plt.show()
  k = acceleration_rev.argmax() + 2  # if idx 0 is the max of this we want 2 clusters
  print("clusters = %s" % k)

  ## CALCULATE CLUSTERS INDEX
  from scipy.cluster.hierarchy import fcluster
  clusters = fcluster(Z, k, criterion='maxclust')
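  # fcluster with criterion='maxclust' returns an integer label (starting at 1)
  # for each row of X, forming at most k flat clusters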

  ## PLOT CLUSTERS
  if isplot:
      plt.figure(figsize=(6, 6))
      plt.scatter(X[:,0], X[:,1], c=clusters, cmap='prism')  # plot points with cluster dependent colors
      plt.show()

  # clean
  del X
  
  # return
  return clusters
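

# Usage sketch (not part of the original gist): builds a toy two-feature DataFrame
# with hypothetical column names 'x' and 'y' and runs hierarchical_clustering on it.
def example_hierarchical_clustering():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    # two well separated blobs so the elbow heuristic has an obvious answer
    blob_a = rng.randn(50, 2)
    blob_b = rng.randn(50, 2) + 5.0
    df = pd.DataFrame(np.vstack([blob_a, blob_b]), columns=['x', 'y'])
    labels = hierarchical_clustering(df, ['x', 'y'], isplot=False)
    df['cluster'] = labels
    return df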



## calculate pairwise distances (dense or sparse inputs) for hierarchical clustering forecast
def cdist_sparse( X, Y, **kwargs ):
    """ -> |X| x |Y| distance matrix, for any cdist metric
        X or Y may be sparse -- CSR format works best
    """
    from scipy.sparse import issparse  # $scipy/sparse/csr.py
    from scipy.spatial.distance import cdist  # $scipy/spatial/distance.py
    import numpy as np

    # densify one row at a time; very slow if both X and Y are very sparse
    sxy = 2*issparse(X) + issparse(Y)  # 0: neither sparse, 1: only Y, 2: only X, 3: both
    if sxy == 0:
        return cdist( X, Y, **kwargs )
    d = np.empty( (X.shape[0], Y.shape[0]), np.float64 )
    if sxy == 2:
        for j, x in enumerate(X):
            d[j] = cdist( x.todense(), Y, **kwargs ) [0]
    elif sxy == 1:
        for k, y in enumerate(Y):
            d[:,k] = cdist( X, y.todense(), **kwargs ) [0]
    else:
        for j, x in enumerate(X):
            for k, y in enumerate(Y):
                d[j,k] = cdist( x.todense(), y.todense(), **kwargs ) [0]
    return d
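

# Usage sketch (not part of the original gist): checks cdist_sparse against a plain
# dense scipy.spatial.distance.cdist call on the same data, with a sparse X input.
def example_cdist_sparse():
    import numpy as np
    from scipy.sparse import csr_matrix
    from scipy.spatial.distance import cdist
    rng = np.random.RandomState(0)
    A = rng.rand(5, 3)
    B = rng.rand(4, 3)
    D_sparse = cdist_sparse(csr_matrix(A), B, metric='euclidean')  # sparse X, dense Y
    D_dense = cdist(A, B, metric='euclidean')                      # reference result
    assert np.allclose(D_sparse, D_dense)
    return D_sparse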