richard-to
11/4/2014 - 6:44 AM

kNN implementations with Pandas, based on examples from Machine Learning in Action by Peter Harrington

import math
import numpy as np


def createDataSet():
    """
    Creates a basic data set and labels.

    The labels are the classification given to the points. The data
    is hardcoded in this toy example.

    Returns:
        A numpy array of (x,y) points and a corresponding list of labels
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify(point, training_set, labels, k=1):
    """
    Classify a given point using the training set and associated labels.

    Args:
        point: A tuple of (x,y) coordinates
        training_set: An array of (x,y) coordinates
        labels: Labels associated with the training set
        k: Number of neighbors to take into account

    Returns:
        Classification for given point
    """

    # Compute the Euclidean distance from the point to every training example,
    # then sort so the closest points come first
    distances = []
    for i, c in enumerate(training_set):
        distances.append((math.sqrt((c[0] - point[0])**2 + (c[1] - point[1])**2), i))
    distances.sort()

    # Keep only the k closest points
    top_knn = distances[:k]

    # Count the labels among the k nearest neighbors
    label_count = {}
    for _, i in top_knn:
        if labels[i] in label_count:
            label_count[labels[i]] += 1
        else:
            label_count[labels[i]] = 1

    # Return the label with the most votes
    _, label = max((c, l) for (l, c) in label_count.items())
    return label
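
# Note: the vote counting above could also be written with collections.Counter;
# a rough equivalent sketch:
#
#     from collections import Counter
#     label = Counter(labels[i] for _, i in top_knn).most_common(1)[0][0]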


def main():
    group, labels = createDataSet()
    print(classify([0, 0], group, labels, k=2))


if __name__ == '__main__':
    main()
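
# --- kNN with Pandas on the dating data set (datingTestSet.txt) ---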

from ggplot import ggplot, aes, geom_point
import numpy as np
import pandas as pd

dating_test_set = '../sample/Ch02/datingTestSet.txt'

column_names = [
    'Number of frequent flyer miles earned per year',
    'Percentage of time spent playing video games',
    'Liters of ice cream consumed per week',
    'Category'
]


def load_file(filepath):
    """
    Loads data in tab-separated format

    Args:
        filepath: Location of data file

    Returns:
        Data and labels extracted from text file
    """
    data = []
    labels = []
    with open(filepath) as infile:
        for line in infile:
            row = line.strip().split('\t')
            data.append(row[:-1])
            labels.append(row[-1])
    return data, labels


def normalize(df):
    """
    Normalizes the data so that each feature is given equal weight.

    General formula:

        norm_value = (value - min_value) / (max_value - min_value)

    Args:
        df: Pandas data frame with unnormalized data

    Returns:
        Normalized dataframe, range of values, min values
    """
    min_values = df.min()
    max_values = df.max()
    range_values = max_values - min_values
    norm_df = (df - min_values) / range_values
    return norm_df, range_values, min_values
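
# A quick sanity check for normalize(), using made-up toy values (sketch only):
#
#     toy = pd.DataFrame({'miles': [0.0, 5000.0, 10000.0], 'games': [2.0, 5.0, 8.0]})
#     norm_toy, ranges, mins = normalize(toy)
#     # every column of norm_toy now falls in the range [0, 1]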


def classify(input_data, training_set, labels, k=1):
    """
    Uses the kNN algorithm to classify input data given a set of
    known data.

    Args:
        input_data: Pandas Series of input data
        training_set: Pandas Data frame of training data
        labels: Pandas Series of classifications for training set
        k: Number of neighbors to use

    Returns:
        Predicted classification for given input data
    """
    distance_diff = training_set - input_data
    distance_squared = distance_diff**2
    distance = distance_squared.sum(axis=1)**0.5
    # Pair each distance with its label, then sort by the distance column
    distance_df = pd.concat([distance, labels], axis=1)
    distance_df = distance_df.sort_values(by=distance_df.columns[0])
    # Majority vote among the k nearest neighbors
    top_knn = distance_df.iloc[:k]
    return top_knn.iloc[:, 1].value_counts().index[0]
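
# Example call (sketch; feature_df and labels are hypothetical names for a
# normalized feature DataFrame and its matching label Series):
#
#     prediction = classify(feature_df.iloc[0], feature_df[1:], labels[1:], k=3)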


def plot(df, x, y, color):
    """
    Scatter plot with two of the features (x, y) grouped by classification (color)

    Args:
        df: Dataframe of data
        x: Feature to plot on x axis
        y: Feature to plot on y axis
        color: Group by this column
    """
    print(ggplot(df, aes(x=x, y=y, color=color)) + geom_point())


def main():

    # Load data
    raw_data, raw_labels = load_file(dating_test_set)

    # Convert data to Pandas data structures
    labels = pd.Series(raw_labels, name=column_names[3])
    df = pd.DataFrame.from_records(np.array(raw_data, np.float32), columns=column_names[:3])
    df[column_names[3]] = labels

    plot(df, column_names[1], column_names[2], column_names[3])
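
    # The evaluation below is wrapped in a string literal and therefore not
    # executed; remove the triple quotes to run the train/test split and error check.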

    """
    # Normalize data since ranges of values are different
    norm_df, range_values, min_values = normalize(df)

    # Use first 10% of data for testing
    num_test_rows = int(norm_df.shape[0] * .1)

    # 90% training data
    training_df = norm_df[num_test_rows:]
    training_labels = labels[num_test_rows:]

    # 10% test data
    test_df = norm_df[:num_test_rows]
    test_labels = labels[:num_test_rows]

    # Apply kNN algorithm to all test data
    result_df = test_df.apply(lambda row: classify(row, training_df, training_labels, k=3), axis=1)

    # Count correct (True) vs. incorrect (False) predictions
    error_df = result_df == test_labels
    print(error_df.value_counts())
    """

if __name__ == '__main__':
    main()
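
# --- kNN with Pandas on the handwritten digit data set (trainingDigits/ and testDigits/) ---
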
import numpy as np
import pandas as pd
import os


def classify(input_data, training_set, labels, k=1):
    """
    Uses the kNN algorithm to classify input data given a set of
    known data.

    Args:
        input_data: Pandas Series of input data
        training_set: Pandas Data frame of training data
        labels: Pandas Series of classifications for training set
        k: Number of neighbors to use

    Returns:
        Predicted classification for given input data
    """
    distance_diff = training_set - input_data
    distance_squared = distance_diff**2
    distance = distance_squared.sum(axis=1)**0.5
    # Pair each distance with its label, then sort by the distance column
    distance_df = pd.concat([distance, labels], axis=1)
    distance_df = distance_df.sort_values(by=distance_df.columns[0])
    # Majority vote among the k nearest neighbors
    top_knn = distance_df.iloc[:k]
    return top_knn.iloc[:, 1].value_counts().index[0]


def load_data(directory):
    """
    Loads the text files of digits in a directory as a list of lists,
    where each row represents a digit as a flattened series of 0's and 1's.

    Each digit is 32 x 32.

    Args:
        directory: Directory that contains text files of digits

    Returns:
        A list of lists of 0's and 1's representing each digit, and a list
        of integer labels taken from the first character of each filename
    """
    dataset = []
    labels = []
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        with open(filepath) as infile:
            vector = []
            for line in infile:
                vector.extend(line.strip())
            dataset.append(vector)
        labels.append(int(filename[0]))
    return dataset, labels
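
# Example usage (sketch; the path assumes the book's sample data layout):
#
#     digits, digit_labels = load_data('../sample/Ch02/trainingDigits/')
#     # digits[0] is a flat list of 1024 '0'/'1' characters; digit_labels[0] is its digit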


def main():

    # Load data
    raw_training_data, raw_training_labels = load_data('../sample/Ch02/trainingDigits/')
    raw_test_data, raw_test_labels = load_data('../sample/Ch02/testDigits/')

    # Convert data into Pandas data structures
    training_labels = pd.Series(raw_training_labels)
    training_data = pd.DataFrame.from_records(np.array(raw_training_data, int))

    test_labels = pd.Series(raw_test_labels)
    test_data = pd.DataFrame.from_records(np.array(raw_test_data, int))

    # Apply kNN algorithm to all test data
    result_df = test_data.apply(lambda row: classify(row, training_data, training_labels, k=3), axis=1)

    # Count correct (True) vs. incorrect (False) predictions
    error_df = result_df == test_labels
    print(error_df.value_counts())


if __name__ == '__main__':
    main()