jmquintana79
10/21/2016 - 6:36 AM

scatter hist

Plot the chart = scatter plot + histogram of two variable to be compared (with regression)

## Plot the chart = scatter plot + histogram of two variable to be compared
def scaterhist(df,svarx,svary,isregression=False):
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import NullFormatter

    # the random data
    x = np.array(df[svarx].tolist())
    y = np.array(df[svary].tolist())

    ## REGRESSION
    
    # initialize
    results = {}
    # calculate polynomial regression
    coeffs = np.polyfit(x, y, 1) # degree 1
    # build model (function)
    p = np.poly1d(coeffs)
    # fit values, and mean
    yhat = p(x)                      # or [p(z) for z in x]
    ybar = np.sum(y)/len(y)          # or sum(y)/len(y)
    ssreg = np.sum((yhat-ybar)**2)   # or sum([ (yihat - ybar)**2 for yihat in yhat])
    sstot = np.sum((y - ybar)**2)    # or sum([ (yi - ybar)**2 for yi in y])
     # Set regression info 
    results['polynomial'] = coeffs.tolist() # Polynomial Coefficients
    results['determination'] = ssreg / sstot # R**2
    
    ###
    from scipy.stats import gaussian_kde
    # Calculate the point density
    xy = np.vstack([x,y])
    z = gaussian_kde(xy)(xy)

    # Sort the points by density, so that the densest points are plotted last
    idx = z.argsort()
    x, y, z = x[idx], y[idx], z[idx]
    
    # no labels
    nullfmt = NullFormatter()
    # definitions for the axes
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    # setting of spasces
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]
    # settings of figure
    plt.figure(1, figsize=(8, 8))
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)
    # no labels
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)
    # the scatter plot:
    axScatter.scatter(x, y, c=z, s=50, edgecolor='')
    axScatter.set_xlabel(svarx,fontsize=12)
    axScatter.set_ylabel(svary,fontsize=12)
    # plot regression line
    if isregression: axScatter.plot(x,yhat, label='fit', color="red")


    # calculate limits:
    binwidth = 0.25
    xmin = min(np.fabs(x))
    ymin = min(np.fabs(y))
    xmax = max(np.fabs(x))
    ymax = max(np.fabs(y))
    limxmin = (int(xmin/binwidth) - 1) * binwidth
    limymin = (int(ymin/binwidth) - 1) * binwidth
    limxmax = (int(xmax/binwidth) + 1) * binwidth
    limymax = (int(ymax/binwidth) + 1) * binwidth
    # set limits
    axScatter.set_xlim((limxmin,limxmax))
    axScatter.set_ylim((limymin,limymax))
    # set bins
    binsx = np.arange(limxmin, limxmax + binwidth, binwidth)
    binsy = np.arange(limymin, limymax + binwidth, binwidth)
    # plot histograms
    nx, binsx, rectanglesx = axHistx.hist(x, bins=binsx, normed=True)
    ny, binsy, rectanglesy = axHisty.hist(y, bins=binsy, orientation='horizontal', normed=True)
    # set limits in histogram
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())
    # set ticks labels
    axHisty.set_xticklabels(np.arange(0,max(nx)+0.1,0.05),rotation=270,fontsize=8)
    axHistx.set_yticklabels(np.arange(0,max(ny)+0.1,0.05),rotation=0,fontsize=8)
    
    # include text box
    xpos = 0.05; ypos = 0.95
    textstr = "R**2 = %.3f"%(results['determination'])
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    axScatter.text(xpos,ypos, textstr, transform=axScatter.transAxes, fontsize=10,
            verticalalignment='top', bbox=props)

    # display
    plt.show()