jmquintana79
3/9/2017 - 6:22 AM

Multivariate distances for different metrics

Multivariate distances for different metrics

## algorithm to calculate multivariate distances
def cdist_sparse( X, Y, **kwargs ):
    """Compute pairwise distances between the rows of X and the rows of Y.

    Wrapper around ``scipy.spatial.distance.cdist`` that also accepts scipy
    sparse inputs. A sparse operand is densified one row at a time, so the
    whole matrix is never expanded at once; this is very slow when both
    operands are sparse.

    Parameters
    ----------
    X, Y : array-like or scipy sparse matrix
        Two collections of row vectors with the same number of columns.
    **kwargs
        Forwarded unchanged to ``cdist`` (e.g. ``metric="cityblock"``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(X.shape[0], Y.shape[0])`` where entry ``[j, k]`` is
        the distance between row ``j`` of X and row ``k`` of Y.
    """
    import numpy as np
    from scipy.spatial.distance import cdist
    from scipy.sparse import issparse

    x_sparse = issparse(X)
    y_sparse = issparse(Y)
    if not (x_sparse or y_sparse):
        return cdist(X, Y, **kwargs)

    # CSR gives cheap row slicing; some formats (e.g. COO) cannot be indexed
    # or iterated row by row at all.
    X = X.tocsr() if x_sparse else np.asarray(X)
    Y = Y.tocsr() if y_sparse else np.asarray(Y)

    n_x, n_y = X.shape[0], Y.shape[0]
    d = np.empty((n_x, n_y), np.float64)

    if x_sparse and not y_sparse:
        for j in range(n_x):
            # cdist returns shape (1, n_y); take its only row.
            d[j] = cdist(X[j:j + 1].toarray(), Y, **kwargs)[0]
    elif y_sparse and not x_sparse:
        for k in range(n_y):
            # cdist returns shape (n_x, 1); take its only column.
            d[:, k] = cdist(X, Y[k:k + 1].toarray(), **kwargs)[:, 0]
    else:
        for j in range(n_x):
            x_row = X[j:j + 1].toarray()  # densify once per outer row
            for k in range(n_y):
                d[j, k] = cdist(x_row, Y[k:k + 1].toarray(), **kwargs)[0, 0]
    return d
    
    
if __name__ == "__main__":
    import numpy as np

    # Small example inputs: each row is one observation. For pandas
    # DataFrames pass ``df.to_numpy()`` instead (``DataFrame.as_matrix()``
    # no longer exists in current pandas).
    X = np.array([[0.0, 1.0], [2.0, 0.0], [3.0, 4.0]])
    TARGET = np.array([[1.0, 1.0], [0.0, 0.0]])

    # define metric (https://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.spatial.distance.html)
    metric = "cityblock"
    # calculate distances; no ``p`` is passed because it only applies to the
    # Minkowski metric, not to "cityblock".
    D = cdist_sparse(X, TARGET, metric=metric)

    # D: array of distances [D.shape = len(X) x len(TARGET)]