# [Linear Regression ML] Machine Learning with the Linear Regression method
# --- Data acquisition and feature engineering ---
# Download GOOGL daily OHLCV data from Yahoo Finance for the given range.
df = pdr.DataReader('GOOGL', 'yahoo', '2004-08-19', '2016-04-10')
# Preview the frame without 'Adj Close' (display only; df itself is unchanged).
df.drop('Adj Close', axis=1).head()
# Volatility proxy: the daily high-low spread as a percentage of the low.
df['HL_PCT'] = (df['High'] - df['Low']) / df['Low'] * 100.0
# Intraday move: close relative to open, as a percentage.
df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0
# Keep only the columns used as model inputs.
df = df.loc[:, ['Close', 'HL_PCT', 'PCT_change', 'Volume']]
# Preview the engineered frame.
df.head()
# Replace missing values with a large negative sentinel so those rows are
# treated as outliers instead of being discarded.
df.fillna(-99999, inplace=True)
# --- Build the prediction target ---
# We forecast the closing price.
forecast_col = 'Close'
# Forecast horizon = 1% of the dataset length, rounded UP to a whole number
# of rows with math.ceil and cast to int (we need an integer shift, not a
# float). For this date range that works out to roughly 30 trading days —
# it is NOT a fixed 30; it scales with the amount of data.
forecast_out = int(math.ceil(0.01 * len(df)))
# Each row's label is the close price forecast_out rows into the future.
df['Label'] = df[forecast_col].shift(-forecast_out)
# The final forecast_out rows have no future price to label them with; drop them.
df.dropna(inplace=True)
df.head()
# --- Assemble feature matrix X and target vector y ---
# X = every column except the label; y = the label column.
# FIX: pass axis by keyword — the positional form df.drop(['Label'], 1) was
# deprecated and then removed in pandas 2.0.
X = np.array(df.drop(['Label'], axis=1))
y = np.array(df['Label'])
# Standardize the FEATURES to zero mean / unit variance (the original comment
# said "labels", which was wrong). In a latency-critical setting (e.g. HFT)
# you might skip this step to save processing time.
X = preprocessing.scale(X)
# Hold out 20% of the rows as the test set (test_size=0.2 is the TEST
# fraction; the remaining 80% is used for training).
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
# --- Train, persist, and reload the model ---
clf = LinearRegression()
# Fit on the training split ONLY. Scoring on data the model has already seen
# would be like grading a test it saw the answers to — near-perfect and
# meaningless; that is why we keep x_test/y_test separate.
clf.fit(x_train, y_train)
# Persist the fitted model so later runs can skip retraining.
with open('linearregression.pickle', 'wb') as f:  # 'wb' = write binary (save)
    pickle.dump(clf, f)
# FIX: reload through a context manager as well — the original opened the
# file with a bare open() and never closed the handle (resource leak).
with open('linearregression.pickle', 'rb') as f:  # 'rb' = read binary (load)
    clf = pickle.load(f)
# Evaluate the model on the held-out test split.
# NOTE: for a regressor, score() returns the coefficient of determination
# (R^2), not a classification accuracy, despite the variable name.
accuracy = clf.score(x_test, y_test) # THIS IS THE R^2 COEFFICIENT
# Bare expression: displays the value in a notebook cell (no effect in a script).
accuracy
# X_lately is intended to be the rows we want to predict, for which no y exists.
# NOTE(review): X was built AFTER df.dropna() removed the unlabeled tail, so
# these last forecast_out rows DO have known labels and were available to
# train_test_split/fit above — the "forecast" below overlaps data the model
# may have trained on. To predict genuinely unseen dates, X_lately must be
# sliced off BEFORE the labels are dropped and before fitting.
X_lately = X[-forecast_out:]
# Trimming X here has no effect on the already-fitted model — TODO confirm
# this ordering against the original tutorial, which slices before training.
X = X[:-forecast_out]
df.dropna(inplace=True)  # redundant: NaN rows were already dropped earlier
# Predict a value for each row in X_lately.
forecast_set = clf.predict(X_lately)
# Show Output
print(forecast_set)
print("R^2 is:", accuracy)
print("Days is:",forecast_out)
# --- Graph the price history plus the forecast ---
# Placeholder column: NaN for every historical row; only the appended future
# rows will carry a value here.
df['Forecast'] = np.nan

# Future dates start one day after the last row of the frame.
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400  # seconds in a day
next_unix = last_unix + one_day

# Append one row per predicted value: NaN in every existing column, the
# prediction in the final ('Forecast') column.
for i in forecast_set:
    next_date = dt.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns) - 1)] + [i]

df['Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)  # lower-right legend
plt.xlabel('Date')
plt.ylabel('Price')
# FIX: the original wrote `plt.show` without parentheses, which merely
# references the function and never renders the figure — it must be called.
plt.show()