JerBouma
2/9/2019 - 7:26 PM

[Linear Regression ML] Machine Learning with the Linear Regression method #machinelearning

import math
import pickle
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_datareader as pdr
from sklearn import preprocessing, model_selection
from sklearn.linear_model import LinearRegression

# Gather Data
df = pdr.DataReader('GOOGL', 'yahoo', '2004-08-19', '2016-04-10')
df.drop('Adj Close', axis=1).head() # Preview without the Adj Close column (df itself is unchanged)
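# NOTE: the 'yahoo' source in pandas_datareader has become unreliable over time.
# A minimal alternative sketch, assuming the yfinance package is installed
# (it returns largely the same columns, so the rest of the code still applies):
# import yfinance as yf
# df = yf.download('GOOGL', start='2004-08-19', end='2016-04-10')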

# High Minus Low Percentage
df['HL_PCT'] = (df['High'] - df['Low']) / df['Low'] * 100.0

# Percentage Change
df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

# Choose Only the Important Columns
df = df[['Close','HL_PCT','PCT_change','Volume']]

# Show df.head()
df.head()

# Fill NaN data with an extreme outlier value instead of dropping rows
df.fillna(-99999, inplace=True)
# Select the column that you wish to Forecast
forecast_col = 'Close'

# math.ceil rounds up to the nearest whole number,
# and int() makes it an integer since we don't want a float.
# The forecast horizon is 1% of the dataset length, roughly 30 trading days here.
forecast_out = int(math.ceil(0.01*len(df)))
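# Worked example (approximate numbers): this date range holds roughly 2,925 rows
# of daily data, so 0.01 * 2925 = 29.25 and math.ceil rounds that up to 30,
# i.e. a forecast horizon of about 30 trading days.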

df['Label'] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace=True)
df.head()
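# A tiny sketch (illustrative only) of what the shift does:
# pd.Series([1, 2, 3, 4, 5]).shift(-2)  ->  [3, 4, 5, NaN, NaN]
# so each row's 'Label' is the Close price forecast_out days into the future,
# and the final forecast_out rows lose their label (hence the dropna above).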
# Define our X and Y
# X = Features, Y = Labels
X = np.array(df.drop(['Label'], axis=1)) # Gives everything but the Label column
y = np.array(df['Label'])

# Scale our Features
X = preprocessing.scale(X) # With HFT, you would skip this step due to higher processing times.
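# preprocessing.scale standardizes each feature column to zero mean and unit variance.
# A rough by-hand equivalent (sketch; X_raw is a hypothetical name for the unscaled array):
# X_manual = (X_raw - X_raw.mean(axis=0)) / X_raw.std(axis=0)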

# Training and Testing Sets
# 20% of the data is held out for testing; the remaining 80% is used for training
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.2) 
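# Note: train_test_split shuffles rows by default, which mixes future prices into
# the training set for time-series data. A stricter sketch keeps chronological order:
# x_train, x_test, y_train, y_test = model_selection.train_test_split(
#     X, y, test_size=0.2, shuffle=False)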

# Prepare a Regression
clf = LinearRegression()

# Fit on the training data only. If you test on the same data
# the model was trained on, it has effectively already seen the
# answers, so the score would be misleadingly close to 100%.
clf.fit(x_train, y_train)

# Saving and Loading a Pickle
# Done to save time: the trained model can be reloaded later without refitting
with open('linearregression.pickle', 'wb') as f: # wb means write binary (thus save)
    pickle.dump(clf, f)

pickle_in = open('linearregression.pickle', 'rb') # rb means read binary (thus load)
clf = pickle.load(pickle_in)
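# An alternative sketch using joblib, which is commonly used for scikit-learn
# estimators (assumes the joblib package is available):
# from joblib import dump, load
# dump(clf, 'linearregression.joblib')
# clf = load('linearregression.joblib')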

# Show how accurate our model is on the held-out test data
accuracy = clf.score(x_test, y_test) # This is the R^2 coefficient

# Display the accuracy
accuracy
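# As a cross-check sketch, the same R^2 value can be computed from explicit
# predictions with sklearn.metrics:
# from sklearn.metrics import r2_score
# r2_score(y_test, clf.predict(x_test))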
# X_lately holds the most recent forecast_out rows, which we treat as the "future"
# data to predict. Strictly, this split should happen before scaling, splitting
# and fitting, so the model never sees these rows during training.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]
df.dropna(inplace=True) # Redundant here: NaN rows were already dropped above

# Create Forecast Set
forecast_set = clf.predict(X_lately)

# Show Output
print(forecast_set)
print("R^2 is:", accuracy)
print("Days is:",forecast_out)
# Creating a Graph (and creating dates for the future)
df['Forecast'] = np.nan
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400 # Amount of seconds in a day
next_unix = last_unix + one_day

# Iterate through the forecast set, adding a row for each future date
# with NaN for every feature column and the prediction as the Forecast value
for i in forecast_set:
    next_date = dt.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)] + [i]
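# Note: stepping by 86,400 seconds also lands forecast dates on weekends and holidays.
# A possible alternative sketch using pandas business days instead:
# future_dates = pd.bdate_range(start=last_date, periods=forecast_out + 1)[1:]
# for date, value in zip(future_dates, forecast_set):
#     df.loc[date] = [np.nan] * (len(df.columns) - 1) + [value]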
    
df['Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()