Hi! I have created a stock predicter using an LSTM and have provided the code. You can change whatever stock the predicter predicts by changing sumbol to the stock symbol of any company. The code works by taking daily stock data from yahoo finance and training a model on it. The code predicts the last 250 days and compares it to the actual stock price of the last 250 days. It is supposed to be trained on the 2000 days before the last 250 days. The thing is, it is too accurate and I beleive there is data leak but I cannot find it. Data leak is when a model is trained on data it is supposed to predict and not know. I would really love help! Thanks :)) The code is below BTW!!! It was all ran in google colab.
# -*- coding: utf-8 -*-
"""predicter(edited) (1).ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1Z6jD7tR1xul_8G0JWOPDhQM20J2-fWAx
"""
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout,BatchNormalization
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error
from tensorflow.keras.optimizers import Adam,RMSprop,SGD,Adagrad
from tensorflow.keras.callbacks import EarlyStopping
from datetime import date, timedelta
import yfinance
symbol = "BABA"
def get_data(symbol):
prices = yfinance.download(symbol,"2016-01-01","2025-01-01",progress=False,multi_level_index=False)
prices = prices.reset_index().rename(columns={"index":"day"})
prices.rename(columns={"Date":"day"},inplace=True)
prices["day"] = pd.to_datetime(prices["day"])
return prices
def create_sequences(data, sequence_length):
"""
Create sliding window sequences from the data.
Returns X (input sequences) and y (the next value after each sequence).
"""
X, y = [], []
for i in range(len(data) - sequence_length):
X.append(data[i:i+sequence_length])
y.append(data[i+sequence_length])
return np.array(X), np.array(y)
# data[0:10]
# 0,1,2,3,4,5,6,7,8,9 X1, y1 = 10
df = get_data(symbol)
df
sequence_length = 5 # Use the past 24 hours to predict the next hour
#prices = df["4. close"].values.reshape(-1, 1)
prices = df["Close"].values.reshape(-1,1)
print(len(prices))
training_data = prices[:2000]
test_data = prices[2000:]
scaler = MinMaxScaler()
#scaler = StandardScaler()
training_data = scaler.fit_transform(training_data)
test_data = scaler.transform(test_data)
# scaled_prices = scaler.fit_transform(prices)
trainX,trainY = create_sequences(training_data,sequence_length)
testX,testY = create_sequences(test_data,sequence_length)
#W = 50 => 19
#W = 10 => 14
#1.Architecture
#2.Sequence Length ex 5,10,20,50,30
#3.Optimizer and its learning rate
#4.Batch size and epochs(Early Stopping)
#Scaler (Minmax scaler, Robust or Standard?)
#Loss function
model = Sequential()
model.add(LSTM(64,input_shape=(trainX.shape[1],1), return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(32,return_sequences=False))
#model.add(Dropout(0.5))
model.add(Dense(1))
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,loss='mae')
es = EarlyStopping(restore_best_weights=True,patience=30)
history = model.fit(trainX,trainY,epochs=200,batch_size=64,verbose=1,validation_data=(testX,testY),callbacks=[es],shuffle=False)
#
#
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
#plt.legend(['train', 'test'], loc='upper left')
plt.show()
#
predictions = model.predict(testX)
print(predictions.shape)
predictions = scaler.inverse_transform(predictions)
testY_original = scaler.inverse_transform(testY.reshape(-1, 1))
mse = mean_squared_error(testY_original, predictions)
mae = mean_absolute_error(testY_original, predictions)
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
plt.plot(testY_original, label='Actual')
plt.plot(predictions, label='Predicted')
plt.title('Actual vs. Predicted Stock Prices')
plt.xlabel('Time')
plt.ylabel('Stock Price')
# plt.axis('equal')
plt.legend()
plt.show()