# Histogram of out-of-box data using PySpark and NumPy
from typing import Iterable
from numpy import histogram2d, array, ndarray, linspace
from pyspark.sql import DataFrame
import matplotlib.pyplot as plt
def _hist(it: Iterable[list]) -> Iterable[ndarray]:
arr = array(list(it))
h, *_ = histogram2d(
arr[:, 0],
arr[:, 1],
bins=[1000, 200],
range=[[0, 8000], [-50, 50]]
)
yield h
return
# Annotation only: nothing here creates the DataFrame, so `df` must already be
# bound to one before the lines below run.
df: DataFrame

# Turn the two columns of interest into an RDD of [interest0, interest1] lists.
_pairs = df.select('interest0', 'interest1').rdd.map(list)

# Each partition is binned independently by `_hist`; summing the resulting
# per-partition arrays gives the histogram of the whole data set.
hist = _pairs.mapPartitions(_hist).sum()

# Bin edges for plotting: n bins need n + 1 edges. These mirror the bin counts
# and ranges hard-coded in `_hist`.
xedges = linspace(0, 8000, num=1000 + 1)
yedges = linspace(-50, 50, num=200 + 1)

plt.figure()
# histogram2d puts the first sample along axis 0, while pcolormesh draws rows
# along y, hence the transpose.
plt.pcolormesh(xedges, yedges, hist.T)
plt.show()