jmquintana79
7/6/2018 - 8:18 AM

outliers multi-gaussian methodology

Outlier identification using a multivariate Gaussian methodology. In this case, the threshold must be provided manually.

import numpy as np
from pandas import read_csv
from scipy.stats import multivariate_normal

def read_dataset(filePath,delimiter=','):
    """
    Load a delimited text file into a dataframe.
    filePath -- location of the file, passed straight to pandas `read_csv`.
    delimiter -- field separator used in the file (default ',').
    return -- dataframe produced by `read_csv`.
    """
    frame = read_csv(filePath, delimiter=delimiter)
    return frame

def feature_normalize(dataset):
    """
    Standardize every feature to zero mean and unit standard deviation.
    dataset -- samples by rows and features by columns (statistics are taken along axis 0).
    return -- (dataset - mean) / std, column by column. A feature whose standard
              deviation is exactly zero (constant column) is only centered, so it
              becomes all zeros instead of NaN.
    """
    mu = np.mean(dataset, axis=0)
    sigma = np.std(dataset, axis=0)
    # A constant feature has no spread: dividing by its zero std would give
    # 0/0 = NaN (plus a RuntimeWarning). Use a unit divisor for those features.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    return (dataset - mu) / safe_sigma

def estimateGaussian(dataset):
    """
    Estimate the parameters of a Gaussian from the samples.
    dataset -- samples by rows and features by columns.
    return -- tuple (mean along axis 0, covariance between the columns).
    """
    # rowvar=False makes every column a variable, same as passing the transpose
    covariance = np.cov(dataset, rowvar=False)
    center = np.mean(dataset, axis=0)
    return center, covariance
    
def multivariateGaussian(dataset,mu,sigma):
    """
    Evaluate the multivariate normal density at every sample.
    dataset -- points where the density is evaluated.
    mu -- mean of the distribution.
    sigma -- covariance of the distribution.
    return -- probability density value(s) given by scipy `multivariate_normal`.
    """
    return multivariate_normal.pdf(dataset, mean=mu, cov=sigma)

## outliers detection using a Multi-Gaussian method
def outliers_multigaussian(data:'df', threshold:float, snamex:str, snamey:str, path_output:str=None)->'df':
    """
    Outliers detection using a Multi-Gaussian method.

    A single multivariate Gaussian is fitted to the two selected columns and every
    row whose probability density is lower than the threshold is labeled as outlier.

    data -- dataframe with the columns to be checked. It is modified in place: the
            column 'isoutlier' is added (or overwritten).
    threshold -- density threshold; rows with density < threshold are outliers.
    snamex -- column name of the first column data.
    snamey -- column name of the second column data.
    path_output -- output path of final plot to be stored. If it is None, nothing is stored (default None).
    return -- the same dataframe with a new integer column 'isoutlier' (1 outlier / 0 non outlier),
              or None if any of the column names does not exist (an error message is printed).
    """

    # validate column name arguments
    for sname in (snamex, snamey):
        if sname not in data.columns:
            print('[error] "%s" column do not exists into the dataframe.'%sname)
            return None

    # df to array (`as_matrix()` was removed in pandas 1.0; `.values` is available in every version)
    tr_data = data[[snamex,snamey]].values

    # dimensions
    n_samples, n_dim = tr_data.shape
    print('[info] Number of datapoints: %d' % n_samples)
    print('[info] Number of dimensions/features: %d' % n_dim)
    print('[info] Outliers will be identify according to "%s" and "%s"' %(snamex,snamey))

    # calculate multivariable gaussian distribution
    mu, sigma = estimateGaussian(tr_data)
    p = multivariateGaussian(tr_data,mu,sigma)

    # selecting outlier datapoints: boolean mask aligned by position with the rows
    is_outlier = np.atleast_1d(p) < threshold
    n_outliers = int(np.count_nonzero(is_outlier))
    pct_outliers = n_outliers*100./n_samples

    # display
    print('[info] Threshold = %s'%threshold)
    print('[info] Number of Outliers = %s (%.3f%s)'%(n_outliers,pct_outliers,'%'))

    # set label of is outlier or not. The whole column is assigned at once: writing
    # through data['isoutlier'].iloc[...] is chained assignment, which pandas does
    # not guarantee to propagate to the dataframe.
    data['isoutlier'] = is_outlier.astype(int)

    # store final chart
    if path_output is not None:
        import matplotlib.pyplot as plt
        fig = plt.figure()
        try:
            plt.xlabel('%s'%snamex)
            plt.ylabel('%s'%snamey)
            plt.plot(tr_data[:,0],tr_data[:,1],'bx')
            plt.plot(tr_data[is_outlier,0],tr_data[is_outlier,1],'ro')
            plt.title('Number of Outliers = %s (%.3f%s)\nThreshold = %s'%(n_outliers,pct_outliers,'%',threshold),fontsize=14)
            plt.savefig(path_output,bbox_inches='tight',transparent=False)
        finally:
            # always release the figure, even if plotting or saving fails
            plt.close(fig)
        print('[info] it was stored "%s"'%path_output)
    # return
    return data
    

## main    
if __name__ == '__main__':
    # Example run: label the outliers of 'hwsm.csv' and store the chart.
    # settings
    path_output = 'outliers.png'
    snamex, snamey = 'hwsm', 'y'
    threshold = 0.0005
    # load the csv and use the 'dt' column as index
    data = read_dataset('hwsm.csv').set_index('dt')
    # label the outliers (the chart is written to path_output)
    data_out = outliers_multigaussian(
        data, threshold, snamex, snamey, path_output=path_output
    )