Plot the chart = scatter plot + histogram of two variable to be compared (with regression)
## Plot the chart = scatter plot + histogram of two variable to be compared
def scaterhist(df,svarx,svary,isregression=False):
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
# the random data
x = np.array(df[svarx].tolist())
y = np.array(df[svary].tolist())
## REGRESSION
# initialize
results = {}
# calculate polynomial regression
coeffs = np.polyfit(x, y, 1) # degree 1
# build model (function)
p = np.poly1d(coeffs)
# fit values, and mean
yhat = p(x) # or [p(z) for z in x]
ybar = np.sum(y)/len(y) # or sum(y)/len(y)
ssreg = np.sum((yhat-ybar)**2) # or sum([ (yihat - ybar)**2 for yihat in yhat])
sstot = np.sum((y - ybar)**2) # or sum([ (yi - ybar)**2 for yi in y])
# Set regression info
results['polynomial'] = coeffs.tolist() # Polynomial Coefficients
results['determination'] = ssreg / sstot # R**2
###
from scipy.stats import gaussian_kde
# Calculate the point density
xy = np.vstack([x,y])
z = gaussian_kde(xy)(xy)
# Sort the points by density, so that the densest points are plotted last
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]
# no labels
nullfmt = NullFormatter()
# definitions for the axes
left, width = 0.1, 0.65
bottom, height = 0.1, 0.65
bottom_h = left_h = left + width + 0.02
# setting of spasces
rect_scatter = [left, bottom, width, height]
rect_histx = [left, bottom_h, width, 0.2]
rect_histy = [left_h, bottom, 0.2, height]
# settings of figure
plt.figure(1, figsize=(8, 8))
axScatter = plt.axes(rect_scatter)
axHistx = plt.axes(rect_histx)
axHisty = plt.axes(rect_histy)
# no labels
axHistx.xaxis.set_major_formatter(nullfmt)
axHisty.yaxis.set_major_formatter(nullfmt)
# the scatter plot:
axScatter.scatter(x, y, c=z, s=50, edgecolor='')
axScatter.set_xlabel(svarx,fontsize=12)
axScatter.set_ylabel(svary,fontsize=12)
# plot regression line
if isregression: axScatter.plot(x,yhat, label='fit', color="red")
# calculate limits:
binwidth = 0.25
xmin = min(np.fabs(x))
ymin = min(np.fabs(y))
xmax = max(np.fabs(x))
ymax = max(np.fabs(y))
limxmin = (int(xmin/binwidth) - 1) * binwidth
limymin = (int(ymin/binwidth) - 1) * binwidth
limxmax = (int(xmax/binwidth) + 1) * binwidth
limymax = (int(ymax/binwidth) + 1) * binwidth
# set limits
axScatter.set_xlim((limxmin,limxmax))
axScatter.set_ylim((limymin,limymax))
# set bins
binsx = np.arange(limxmin, limxmax + binwidth, binwidth)
binsy = np.arange(limymin, limymax + binwidth, binwidth)
# plot histograms
nx, binsx, rectanglesx = axHistx.hist(x, bins=binsx, normed=True)
ny, binsy, rectanglesy = axHisty.hist(y, bins=binsy, orientation='horizontal', normed=True)
# set limits in histogram
axHistx.set_xlim(axScatter.get_xlim())
axHisty.set_ylim(axScatter.get_ylim())
# set ticks labels
axHisty.set_xticklabels(np.arange(0,max(nx)+0.1,0.05),rotation=270,fontsize=8)
axHistx.set_yticklabels(np.arange(0,max(ny)+0.1,0.05),rotation=0,fontsize=8)
# include text box
xpos = 0.05; ypos = 0.95
textstr = "R**2 = %.3f"%(results['determination'])
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
axScatter.text(xpos,ypos, textstr, transform=axScatter.transAxes, fontsize=10,
verticalalignment='top', bbox=props)
# display
plt.show()