kNN implementations with pandas, based on examples from Machine Learning in Action by Peter Harrington
import math
import numpy as np
def createDataSet():
"""
    Creates a basic data set and the corresponding labels.
The labels are the classification given to the points. The data
is hardcoded in this toy example.
Returns:
A numpy array of (x,y) points and a corresponding list of labels
"""
group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
labels = ['A', 'A', 'B', 'B']
return group, labels
def classify(point, training_set, labels, k=1):
"""
Classify a given point using the training set and associated labels.
Args:
point: A tuple of (x,y) coordinates
training_set: An array of (x,y) coordinates
        labels: Labels associated with the training set
k: Number of neighbors to take into account
Returns:
Classification for given point
"""
    # Calculate the Euclidean distance from the query point to every training point
    distances = []
    for i, c in enumerate(training_set):
        distances.append((math.sqrt((c[0] - point[0])**2 + (c[1] - point[1])**2), i))
    # Sort by distance and keep the k closest points
    distances.sort()
    top_knn = distances[:k]
# Count most labels in top_knn
label_count = {}
for _, i in top_knn:
if labels[i] in label_count:
label_count[labels[i]] += 1
else:
label_count[labels[i]] = 1
# Return classification with most matches
    _, label = max((count, lbl) for (lbl, count) in label_count.items())
return label
def main():
group, labels = createDataSet()
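    # With k=2 the two nearest training points to (0, 0) are (0, 0) and
    # (0, 0.1), both labeled 'B', so the call below prints 'B'.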
    print(classify([0, 0], group, labels, 2))
if __name__ == '__main__':
main()
import itertools
from ggplot import ggplot, aes, geom_point
import numpy as np
import pandas as pd
dating_test_set = '../sample/Ch02/datingTestSet.txt'
column_names = [
'Number of frequent flyer miles earned per year',
'Percentage of time spent playing video games',
'Liters of ice cream consumed per week',
'Category'
]
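# Each line of the tab-separated data file is expected to hold the three
# numeric feature values followed by a text label (see load_file below).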
def load_file(filepath):
"""
Loads data in tab-separated format
Args:
filepath: Location of data file
Returns:
Data and labels extracted from text file
"""
data = []
labels = []
with open(filepath) as infile:
for line in infile:
row = line.strip().split('\t')
data.append(row[:-1])
labels.append(row[-1])
return data, labels
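# Note: load_file keeps the feature values as strings; main() converts them
# to float32 when it builds the DataFrame.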
def normalize(df):
"""
    Normalizes the data to give equal weight to each feature.
General formula:
norm_value = (value - min_value) / (max_value - min_value)
Args:
df: Pandas data frame with unnormalized data
Returns:
Normalized dataframe, range of values, min values
"""
min_values = df.min()
max_values = df.max()
range_values = max_values - min_values
norm_df = (df - min_values) / range_values
return norm_df, range_values, min_values
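# Illustrative sketch (not from the original script): min-max scaling maps the
# smallest value in a column to 0 and the largest to 1, e.g. with a
# hypothetical one-column frame:
#
#   toy = pd.DataFrame({'miles': [0.0, 5.0, 10.0]})
#   norm_toy, ranges, mins = normalize(toy)
#   # norm_toy['miles'] -> 0.0, 0.5, 1.0; ranges['miles'] -> 10.0; mins['miles'] -> 0.0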
def classify(input_data, training_set, labels, k=1):
"""
Uses kNN algorithm to classify input data given a set of
known data.
Args:
input_data: Pandas Series of input data
training_set: Pandas Data frame of training data
labels: Pandas Series of classifications for training set
k: Number of neighbors to use
Returns:
Predicted classification for given input data
"""
distance_diff = training_set - input_data
distance_squared = distance_diff**2
distance = distance_squared.sum(axis=1)**0.5
distance_df = pd.concat([distance, labels], axis=1)
    distance_df = distance_df.sort_values(by=0)
    top_knn = distance_df.iloc[:k]
    # Select the label column by position and return the most common label
    return top_knn.iloc[:, 1].value_counts().index[0]
def plot(df, x, y, color):
"""
Scatter plot with two of the features (x, y) grouped by classification (color)
Args:
df: Dataframe of data
x: Feature to plot on x axis
y: Feature to plot on y axis
color: Group by this column
"""
print(ggplot(df, aes(x=x, y=y, color=color)) + geom_point())
def main():
# Load data
raw_data, raw_labels = load_file(dating_test_set)
# Convert data to Pandas data structures
labels = pd.Series(raw_labels, name=column_names[3])
df = pd.DataFrame.from_records(np.array(raw_data, np.float32), columns=column_names[:3])
df[column_names[3]] = labels
plot(df, column_names[1], column_names[2], column_names[3])
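    # The evaluation below is wrapped in a string literal, so it does not run;
    # remove the surrounding triple quotes to test the classifier.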
"""
# Normalize data since ranges of values are different
norm_df, range_values, min_values = normalize(df)
# Use first 10% of data for testing
num_test_rows = int(norm_df.shape[0] * .1)
# 90% training data
training_df = norm_df[num_test_rows:]
training_labels = labels[num_test_rows:]
    # 10% test data
test_df = norm_df[:num_test_rows]
test_labels = labels[:num_test_rows]
# Apply kNN algorithm to all test data
result_df = test_df.apply(lambda row: classify(row, training_df, training_labels, k=3), axis=1)
# Calculate the number of correct predictions
error_df = result_df == test_labels
    print(error_df.value_counts())
"""
if __name__ == '__main__':
main()
import numpy as np
import pandas as pd
import os
def classify(input_data, training_set, labels, k=1):
"""
Uses kNN algorithm to classify input data given a set of
known data.
Args:
input_data: Pandas Series of input data
training_set: Pandas Data frame of training data
labels: Pandas Series of classifications for training set
k: Number of neighbors to use
Returns:
Predicted classification for given input data
"""
distance_diff = training_set - input_data
distance_squared = distance_diff**2
distance = distance_squared.sum(axis=1)**0.5
distance_df = pd.concat([distance, labels], axis=1)
    distance_df = distance_df.sort_values(by=0)
    top_knn = distance_df.iloc[:k]
    # Select the label column by position and return the most common label
    return top_knn.iloc[:, 1].value_counts().index[0]
def load_data(directory):
"""
    Loads the text files of digits in a directory as a list of lists,
    where each row represents a digit as a sequence of 0's and 1's.
Each digit is 32 x 32.
Args:
directory: Directory that contains text files of digits
Returns:
List of lists containing 0's and 1's representing each digit
"""
dataset = []
labels = []
for filename in os.listdir(directory):
filepath = os.path.join(directory, filename)
with open(filepath) as infile:
vector = []
for line in infile:
vector.extend(line.strip())
dataset.append(vector)
labels.append(int(filename[0]))
return dataset, labels
def main():
# Load data
raw_training_data, raw_training_labels = load_data('../sample/Ch02/trainingDigits/')
raw_test_data, raw_test_labels = load_data('../sample/Ch02/testDigits/')
# Convert data into Pandas data structures
training_labels = pd.Series(raw_training_labels)
training_data = pd.DataFrame.from_records(np.array(raw_training_data, int))
test_labels = pd.Series(raw_test_labels)
test_data = pd.DataFrame.from_records(np.array(raw_test_data, int))
# Apply kNN algorithm to all test data
result_df = test_data.apply(lambda row: classify(row, training_data, training_labels, k=3), axis=1)
# Calculate the number of correct predictions
error_df = result_df == test_labels
    print(error_df.value_counts())
if __name__ == '__main__':
main()