# Outlier identification using a multivariate Gaussian method. The threshold must be supplied manually.
import numpy as np
from pandas import read_csv
from scipy.stats import multivariate_normal
def read_dataset(filePath, delimiter=','):
    """Load a delimited text file into a pandas DataFrame.

    filePath -- path (or file-like object) handed straight to pandas.read_csv.
    delimiter -- field separator forwarded to read_csv (default ',').
    return -- the DataFrame produced by read_csv.
    """
    frame = read_csv(filePath, delimiter=delimiter)
    return frame
def feature_normalize(dataset):
    """Standardize every feature (column) to zero mean and unit spread.

    The mean and the population standard deviation (numpy default, ddof=0)
    are computed along axis 0 and applied column-wise.

    dataset -- array-like / DataFrame with samples along axis 0.
    return -- (dataset - mean) / std, same shape as the input. Features with
              zero standard deviation (constant columns) are returned as 0
              instead of NaN/inf.
    """
    mu = np.mean(dataset, axis=0)
    sigma = np.std(dataset, axis=0)
    # A constant feature has zero spread; dividing by it would produce
    # NaN/inf. Dividing by 1 instead leaves the centered values (all 0).
    sigma = np.where(sigma == 0, 1.0, sigma)
    return (dataset - mu) / sigma
def estimateGaussian(dataset):
    """Estimate the parameters of a multivariate Gaussian from samples.

    dataset -- 2-D data with one sample per row and one feature per column.
    return -- tuple (mu, sigma): the per-feature mean along axis 0 and the
              covariance matrix computed by np.cov on the transposed data.
    """
    # np.cov expects variables in rows, hence the transpose
    features_in_rows = dataset.T
    center = np.mean(dataset, axis=0)
    covariance = np.cov(features_in_rows)
    return center, covariance
def multivariateGaussian(dataset, mu, sigma):
    """Evaluate a multivariate normal density at every sample.

    dataset -- points at which the density is evaluated (one per row).
    mu -- mean vector of the distribution.
    sigma -- covariance matrix of the distribution.
    return -- probability density value(s) given by scipy's multivariate_normal.
    """
    densities = multivariate_normal.pdf(dataset, mean=mu, cov=sigma)
    return densities
## outliers detection using a Multi-Gaussian method
def outliers_multigaussian(data:'df', threshold:float, snamex:str, snamey:str, path_output:str=None)->'df':
    """
    Outliers detection using a Multi-Gaussian method.

    A single multivariate Gaussian is fitted to the two selected columns and
    every row whose probability density is below `threshold` is labeled as an
    outlier. NOTE: `data` is modified in place (the 'isoutlier' column is
    added or overwritten) and that same object is returned.

    data -- dataframe with the columns to be checked.
    threshold -- density value below which a row is labeled as an outlier.
    snamex -- column name of the first column data.
    snamey -- column name of the second column data.
    path_output -- output path of the final plot. If it is None, no plot is produced (default None).
    return -- `data` with an integer column 'isoutlier' (1 = outlier, 0 = non outlier),
              or None when one of the column names is not in the dataframe.
    """
    # validate column name arguments
    for sname in (snamex, snamey):
        if sname not in data.columns:
            print('[error] "%s" column do not exists into the dataframe.'%sname)
            return None
    # df to array (`as_matrix()` was removed in pandas 1.0; `.values` works on every version)
    tr_data = data[[snamex, snamey]].values
    # dimensions
    n_samples, n_dim = tr_data.shape
    print('[info] Number of datapoints: %d' % n_samples)
    print('[info] Number of dimensions/features: %d' % n_dim)
    print('[info] Outliers will be identify according to "%s" and "%s"' %(snamex,snamey))
    # calculate multivariable gaussian distribution
    mu, sigma = estimateGaussian(tr_data)
    # atleast_1d: scipy squeezes the result to a scalar for a single sample
    p = np.atleast_1d(multivariateGaussian(tr_data, mu, sigma))
    # selecting outlier datapoints as a boolean mask aligned with the rows
    is_outlier = p < threshold
    n_outliers = int(is_outlier.sum())
    pct_outliers = n_outliers*100./n_samples
    # display
    print('[info] Threshold = %s'%threshold)
    print('[info] Number of Outliers = %s (%.3f%s)'%(n_outliers,pct_outliers,'%'))
    # set label of is outlier or not; one direct column assignment avoids the
    # chained `data[col].iloc[...] = ...` form, which may write to a temporary copy
    data['isoutlier'] = is_outlier.astype(int)
    # store final chart
    if path_output is not None:
        # imported lazily so matplotlib is only required when a plot is requested
        import matplotlib.pyplot as plt
        fig = plt.figure()
        try:
            plt.xlabel('%s'%snamex)
            plt.ylabel('%s'%snamey)
            plt.plot(tr_data[:,0],tr_data[:,1],'bx')
            # 1-D mask keeps all outliers in a single series instead of one line per point
            plt.plot(tr_data[is_outlier,0],tr_data[is_outlier,1],'ro')
            plt.title('Number of Outliers = %s (%.3f%s)\nThreshold = %s'%(n_outliers,pct_outliers,'%',threshold),fontsize=14)
            plt.savefig(path_output,bbox_inches='tight',transparent=False)
        finally:
            # always release the figure, even if saving failed
            plt.close(fig)
        print('[info] it was stored "%s"'%path_output)
    # return
    return data
## main
if __name__=='__main__':
    # run configuration
    path_output = 'outliers.png'
    snamex, snamey = 'hwsm', 'y'
    threshold = 0.0005
    # load the CSV and index it by its 'dt' column
    data = read_dataset('hwsm.csv').set_index('dt')
    # label the outliers and store the chart
    data_out = outliers_multigaussian(data, threshold, snamex, snamey, path_output=path_output)