"""kNN implementations with Pandas, based on examples from Machine Learning in Action by Peter Harrington."""

import math
import numpy as np

def createDataSet():
    """Creates a basic data set with labels.

    The labels are the classification given to the points. The data
    is hardcoded in this toy example.

    Returns:
        A numpy array of (x,y) points and a corresponding list of labels
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels

def classify(point, training_set, labels, k=1):
    """Classify a given point using the training set and associated labels.

    Args:
        point: Sequence of coordinates, e.g. a tuple of (x,y) coordinates
        training_set: An array of coordinates with the same dimensions as point
        labels: Labels associated with training set
        k: Number of neighbors to take into account

    Returns:
        Classification for given point

    Raises:
        ValueError: If k is less than 1 or the training set is empty
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    # Calculate distance between points using Euclidean distance and sort
    # closest first; the index is kept so the label can be looked up later.
    distances = sorted(
        (math.sqrt(sum((a - b) ** 2 for a, b in zip(c, point))), i)
        for i, c in enumerate(training_set)
    )
    if not distances:
        raise ValueError('training_set must not be empty')

    # Get the top closest points
    top_knn = distances[:k]

    # Count how often each label occurs in top_knn
    label_count = {}
    for _, i in top_knn:
        label_count[labels[i]] = label_count.get(labels[i], 0) + 1

    # Return classification with most matches; on a tie the largest label wins
    _, label = max((count, name) for name, count in label_count.items())
    return label

def main():
    """Classify the point (0, 0) against the toy data set with k=2 and print the result."""
    group, labels = createDataSet()
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(classify([0, 0], group, labels, 2))

if __name__ == '__main__':
    main()

import itertools
from ggplot import ggplot, aes, geom_point
import numpy as np
import pandas as pd

# Location of the tab-separated dating data set (relative path)
dating_test_set = '../sample/Ch02/datingTestSet.txt'

# Names of the three feature columns followed by the label column;
# main() reads the label column name from index 3.
column_names = [
    'Number of frequent flyer miles earned per year',
    'Percentage of time spent playing video games',
    'Liters of ice cream consumed per week',
    'Classification',  # NOTE(review): label column name was reconstructed -- confirm
]

def load_file(filepath):
    """Loads data in tab-separated format.

    Each non-empty line is split on tabs. The values are kept as strings;
    converting them to numbers is left to the caller.

    NOTE(review): the last field of each line is treated as the label and the
    preceding fields as the data -- confirm against the layout of the data file.

    Args:
        filepath: Location of data file

    Returns:
        Data and labels extracted from text file
    """
    data = []
    labels = []
    with open(filepath) as infile:
        for line in infile:
            line = line.strip()
            if not line:
                # Skip blank lines, e.g. a trailing empty line at end of file
                continue
            row = line.split('\t')
            data.append(row[:-1])
            labels.append(row[-1])
    return data, labels

def normalize(df):
    """Normalizes data to give equal weight to each feature.

    General formula:

        norm_value = (value - min_value) / (max_value - min_value)

    A column whose values are all identical has a range of zero; such a
    column is normalized to 0 instead of dividing by zero.

    Args:
        df: Pandas data frame with unnormalized data

    Returns:
        Normalized dataframe, range of values, min values
    """
    min_values = df.min()
    max_values = df.max()
    range_values = max_values - min_values
    # Divide constant columns by 1 so they become 0 rather than NaN; the
    # returned range_values still reports the true (zero) range.
    divisor = range_values.where(range_values != 0, 1)
    norm_df = (df - min_values) / divisor
    return norm_df, range_values, min_values

def classify(input_data, training_set, labels, k=1):
    """Uses kNN algorithm to classify input data given a set of
    known data.

    Args:
        input_data: Pandas Series of input data
        training_set: Pandas Data frame of training data
        labels: Pandas Series of classifications for training set,
            sharing its index with training_set
        k: Number of neighbors to use

    Returns:
        Predicted classification for given input data

    Raises:
        ValueError: If k is less than 1
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    # Euclidean distance from the input to every row of the training set
    distance_diff = training_set - input_data
    distance = (distance_diff ** 2).sum(axis=1) ** 0.5

    # Pick the k closest rows and look their labels up by index, so the
    # result does not depend on whether the labels Series carries a name.
    nearest = distance.nsmallest(k).index
    top_labels = labels.loc[nearest]

    # value_counts() lists the most frequent label first
    return top_labels.value_counts().index.values[0]

def plot(df, x, y, color):
    """Scatter plot with two of the features (x, y) grouped by classification (color).

    The plot object is handed to print().

    Args:
        df: Dataframe of data
        x: Feature to plot on x axis
        y: Feature to plot on y axis
        color: Group by this column
    """
    print(ggplot(df, aes(x=x, y=y, color=color)) + geom_point())

def main():
    """Load the dating data set, plot it and report how well kNN predicts it.

    The first 10% of the rows are held out as test data and classified
    against the remaining 90% with k=3. The counts of correct (True) and
    incorrect (False) predictions are printed.
    """
    # Load data
    raw_data, raw_labels = load_file(dating_test_set)

    # Convert data to Pandas data structures
    feature_names = column_names[:3]
    label_name = column_names[3]
    labels = pd.Series(raw_labels, name=label_name)
    df = pd.DataFrame.from_records(np.array(raw_data, np.float32), columns=feature_names)
    df[label_name] = labels

    plot(df, column_names[1], column_names[2], label_name)

    # Normalize data since ranges of values are different. Only the numeric
    # feature columns are normalized; the label column is left out because
    # the min/max arithmetic does not apply to it.
    norm_df, _, _ = normalize(df[feature_names])

    # Use first 10% of data for testing
    num_test_rows = int(norm_df.shape[0] * .1)

    # 90% training data
    training_df = norm_df[num_test_rows:]
    training_labels = labels[num_test_rows:]

    # 10% test data
    test_df = norm_df[:num_test_rows]
    test_labels = labels[:num_test_rows]

    # Apply kNN algorithm to all test data
    predictions = test_df.apply(lambda row: classify(row, training_df, training_labels, k=3), axis=1)

    # Calculate the number of correct predictions
    correct = predictions == test_labels
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(correct.value_counts())

if __name__ == '__main__':
    main()

import numpy as np
import pandas as pd
import os

def classify(input_data, training_set, labels, k=1):
    """Uses kNN algorithm to classify input data given a set of
    known data.

    Args:
        input_data: Pandas Series of input data
        training_set: Pandas Data frame of training data
        labels: Pandas Series of classifications for training set,
            sharing its index with training_set
        k: Number of neighbors to use

    Returns:
        Predicted classification for given input data

    Raises:
        ValueError: If k is less than 1
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    # Euclidean distance from the input to every row of the training set
    distance_diff = training_set - input_data
    distance = (distance_diff ** 2).sum(axis=1) ** 0.5

    # Pick the k closest rows and look their labels up by index, so the
    # result does not depend on whether the labels Series carries a name.
    nearest = distance.nsmallest(k).index
    top_labels = labels.loc[nearest]

    # value_counts() lists the most frequent label first
    return top_labels.value_counts().index.values[0]

def load_data(directory):
    """Loads text files of digits in directory as list of lists,
    where each row represents the digit as a series of 0's and 1's.

    Each digit is 32 x 32.

    All lines of a file are concatenated into one flat row. Entries of the
    directory that are not regular files are skipped.

    NOTE(review): the label of a digit is taken from the part of the file
    name before the first underscore (e.g. '3' for '3_12.txt') and kept as a
    string -- confirm this matches the naming of the sample files.

    Args:
        directory: Directory that contains text files of digits

    Returns:
        List of lists containing 0's and 1's representing each digit,
        and a list with the label of each digit
    """
    dataset = []
    labels = []
    # Sort the listing so the rows come back in a reproducible order
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue
        with open(filepath) as infile:
            vector = []
            for line in infile:
                vector.extend(int(char) for char in line.strip())
        dataset.append(vector)
        labels.append(os.path.splitext(filename)[0].split('_')[0])
    return dataset, labels

def main():
    """Classify the test digits against the training digits with k=3.

    The counts of correct (True) and incorrect (False) predictions are
    printed.
    """
    # Load data
    raw_training_data, raw_training_labels = load_data('../sample/Ch02/trainingDigits/')
    raw_test_data, raw_test_labels = load_data('../sample/Ch02/testDigits/')

    # Convert data into Pandas data structures
    training_labels = pd.Series(raw_training_labels)
    training_data = pd.DataFrame.from_records(np.array(raw_training_data, int))

    test_labels = pd.Series(raw_test_labels)
    test_data = pd.DataFrame.from_records(np.array(raw_test_data, int))

    # Apply kNN algorithm to all test data
    predictions = test_data.apply(lambda row: classify(row, training_data, training_labels, k=3), axis=1)

    # Calculate the number of correct predictions
    correct = predictions == test_labels
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(correct.value_counts())

if __name__ == '__main__':