JerBouma
2/9/2019 - 7:30 PM

[Support Vector Machine ML] Machine Learning with the Support Vector Machine method. #machinelearning

[Support Vector Machine ML] Machine Learning with the Support Vector Machine method. #machinelearning

# Read the data file (also read the accompanying names.txt file, which
# documents the data).
# Missing values are documented in the txt file as '?'. Parsing them as NaN
# while reading keeps every column numeric, instead of leaving behind a
# column that mixes numeric strings with an integer placeholder.
df = pd.read_csv('breast-cancer-wisconsin.txt', na_values='?')

# Substitute -99999 for the missing values: such an extreme value causes
# algorithms to treat it as an outlier.
# You don't just want to drop the rows, since in real-world sets this
# might cause you to lose 15% of your data.
df.fillna(-99999, inplace=True)

# Drop the ID column since it does not add any information.
# axis=1 makes drop() look for a column label; without it 'id' would be
# searched among the row labels and nothing could be dropped.
# Author's observation: not dropping ID completely changes the accuracy
# (from 0.96 to 0.6).
df.drop(['id'], axis=1, inplace=True)

# Define X (features) and y (class)
X = np.array(df.drop(['class'], axis=1))  # Grabs everything but class
y = np.array(df['class'])  # Grabs only class

# Fit the model on the train set in order to make predictions on the
# held-out test set.
# test_size=0.2 --> 20% of the rows go to the test set; this can be altered
# but it is a 'standard' choice. No random_state is passed, so the split
# (and therefore the reported score) differs from run to run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2
)

# All you really do is change your initial method (Neighbors) to svm.SVC
clf = svm.SVC(gamma='auto')
clf.fit(X_train, y_train)

# For a classifier, score() is the mean accuracy on the given data, i.e.
# the fraction of test samples whose class was predicted correctly. It is
# not an R^2 value (that is what regressors report).
accuracy = clf.score(X_test, y_test)

# Show the accuracy on the test set
print("Accuracy is:", accuracy)
# Test these measures (which are not in the .txt file).
# predict() needs a 2D array with one row per sample. The nested list below
# already yields that shape (3 samples x 9 values), so no reshape is
# required; only a single flat sample would have to be reshaped to (1, -1).
# Each row must supply one value per feature column of X.
example_measures = np.array([
    [4, 2, 1, 1, 1, 2, 3, 2, 1],
    [4, 2, 2, 1, 2, 2, 3, 2, 1],
    [4, 2, 2, 1, 2, 2, 3, 2, 2],
])

prediction = clf.predict(example_measures)
print("The prediction is:", prediction)