# -*- coding: utf-8 -*-
"""Platforms_train_v2.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1yD7QxO8rUrHXvYLn_z5eofUKenJqXZoU
"""

import os
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

column_names = ['product_url', 'price', 'datetime']

df1 = pd.read_csv('parsed_data_public_price_history_1.csv')
df2 = pd.read_csv('parsed_data_public_price_history.csv', names=column_names)
df3 = pd.read_csv('price_history.csv', names=column_names)

df = pd.concat([df1, df2, df3])

# Convert the 'datetime' column to datetime dtype
df['datetime'] = pd.to_datetime(df['datetime'], format='mixed', utc=True)
df['price'] = df['price'].astype(float)

df.head()

df.describe()

df.info()

len(df.product_url.unique())

# Filter out price outliers with an IQR-style fence built on the
# 0.55 and 0.75 quantiles
q_low = df['price'].quantile(0.55)
q_hi = df['price'].quantile(0.75)
q_range = q_hi - q_low
df = df[(df['price'] < q_hi + 1.5 * q_range) & (df['price'] > q_low - 1.5 * q_range)]

df.describe()
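
# For reference, a textbook Tukey fence uses the 0.25/0.75 quantiles; a sketch
# of that alternative (commented out, not the filter actually applied here):
#   q1, q3 = df['price'].quantile([0.25, 0.75])
#   iqr = q3 - q1
#   df = df[df['price'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]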

# Keep only the 'price' and 'datetime' columns (copy to avoid a
# SettingWithCopyWarning on the assignment below)
df_hourly_avg = df[['price', 'datetime']].copy()

# Round timestamps down to the hour
df_hourly_avg['datetime'] = df_hourly_avg['datetime'].dt.floor('1H')

df_hourly_avg.head()

# Group by hour and compute the mean price
df_hourly_avg = df_hourly_avg.groupby('datetime').agg({'price': 'mean'}).reset_index()

# Descriptive statistics
df_hourly_avg.describe()

# Preview the first rows
df_hourly_avg.head()

df_hourly_avg

df_hourly_avg.set_index('datetime', inplace=True)

df_hourly_avg
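
# Note: hours with no observations are simply absent after the groupby, so
# consecutive rows are not guaranteed to be one hour apart. If a regular
# hourly grid is needed, one option (a sketch, commented out; it changes the
# data) is:
#   df_hourly_avg = df_hourly_avg.resample('1H').mean().interpolate()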

# Keep only the values as a numpy array
df_hourly_avg_arr = df_hourly_avg.values

# Chronological 80/20 train/test split
split = int(0.8 * len(df_hourly_avg_arr))

train, test = df_hourly_avg_arr[:split], df_hourly_avg_arr[split:]

train.shape, test.shape

# Normalise the data by scaling to the range 0-1 to improve learning and
# convergence of the model
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df_hourly_avg_arr)
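
# Caveat: fit_transform over the full series lets the test set's min/max leak
# into training. A leakage-free variant (a sketch, not the flow used here):
#   scaler = MinMaxScaler(feature_range=(0, 1)).fit(train)
#   scaled_data = scaler.transform(df_hourly_avg_arr)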

# Create a supervised data set with n time-steps per sample and 1 output
n = 3
X_train, y_train = [], []
for i in range(n, len(train)):
    X_train.append(scaled_data[i - n:i, 0])
    y_train.append(scaled_data[i, 0])
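
# Example: with n = 3, the first sample pairs the scaled prices at hours
# 0..2 (X_train[0]) with the scaled price at hour 3 (y_train[0]).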

# Convert X_train and y_train to numpy arrays for training the LSTM model
X_train, y_train = np.array(X_train), np.array(y_train)

# Reshape the data as the LSTM expects 3-D input (samples, time steps, features)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_train.shape

# Create and fit the LSTM network
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(LSTM(units=50))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=1000, batch_size=1, verbose=2)
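
# 1000 epochs at batch_size=1 is very slow; an early-stopping variant (a
# sketch, not part of the original run) could cut training time:
#   from keras.callbacks import EarlyStopping
#   early_stop = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
#   model.fit(X_train, y_train, epochs=1000, batch_size=32, verbose=2,
#             callbacks=[early_stop])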

# Build the test inputs: the last n training points plus the test window,
# scaled with the already-fitted scaler
inputs = df_hourly_avg_arr[len(df_hourly_avg_arr) - len(test) - n:]
inputs = inputs.reshape(-1, 1)
inputs = scaler.transform(inputs)

# Create the test data set
X_test = []
for i in range(n, inputs.shape[0]):
    X_test.append(inputs[i - n:i, 0])

# Convert to a numpy array and reshape to 3-D for the LSTM
X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

predict_price = model.predict(X_test)
predict_price = scaler.inverse_transform(predict_price)

print(X_test.shape)

# Root-mean-squared error on the original price scale
rmse = np.sqrt(np.mean(np.power(test - predict_price, 2)))
rmse
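
# The metrics imported above can round out the picture; a short sketch using
# the already-imported sklearn functions:
mae = mean_absolute_error(test, predict_price)
r2 = r2_score(test, predict_price)
print(f"MAE: {mae:.4f}, R^2: {r2:.4f}")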

# Plot predicted vs actual values
train = df_hourly_avg[:split]
test = df_hourly_avg[split:].copy()
test['Predictions'] = predict_price

plt.figure(figsize=(20, 10))
sns.set_style("whitegrid")
plt.plot(train['price'], label='Training')
plt.plot(test['price'], label='Actual')
plt.plot(test['Predictions'], label='Predicted')
plt.title("Hourly Average Price - LSTM", color='black', fontsize=20)
plt.xlabel('Date', color='black', fontsize=15)
plt.ylabel('Price', color='black', fontsize=15)
plt.legend();

model.save("/content/drive/MyDrive/Colab Notebooks/Platforms/my_model_.keras")
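
# To reuse the saved model later (a sketch; the path mirrors the save call above):
#   from keras.models import load_model
#   model = load_model("/content/drive/MyDrive/Colab Notebooks/Platforms/my_model_.keras")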