# -*- coding: utf-8 -*-
"""Platforms_train_v2.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1yD7QxO8rUrHXvYLn_z5eofUKenJqXZoU
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM

df = pd.read_csv('parsed_data_public_price_history_all.csv')

# Convert the 'datetime' column to timezone-aware datetimes
df['datetime'] = pd.to_datetime(df['datetime'], format='mixed', utc=True)
df['price'] = df['price'].astype(float)

# Drop price outliers with an IQR-style fence built on the 0.55-0.75
# quantile band (narrower and higher than the conventional 0.25-0.75)
q_low = df['price'].quantile(0.55)
q_hi = df['price'].quantile(0.75)
q_range = q_hi - q_low
df = df[(df['price'] < q_hi + 1.5 * q_range) & (df['price'] > q_low - 1.5 * q_range)]

print(df.describe())

# Keep only the 'price' and 'datetime' columns
df_hourly_avg = df[['price', 'datetime']].copy()

# Floor timestamps to the hour
df_hourly_avg['datetime'] = df_hourly_avg['datetime'].dt.floor('1h')

# Group by hour and take the mean price
df_hourly_avg = df_hourly_avg.groupby('datetime').agg({'price': 'mean'}).reset_index()
df_hourly_avg.set_index('datetime', inplace=True)

# Values only
df_hourly_avg_arr = df_hourly_avg.values

# Chronological 80/20 train/test split
split = int(0.8 * len(df_hourly_avg_arr))
train, test = df_hourly_avg_arr[:split], df_hourly_avg_arr[split:]
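# --- Optional sketch: the floor-then-groupby hourly aggregation above can
# --- also be expressed with pandas' resample API, which makes the hourly
# --- grid explicit (empty hours become NaN instead of silently disappearing).
# --- A minimal, self-contained illustration on synthetic data; the names
# --- `demo_idx`, `demo`, and `hourly` are hypothetical and not part of the
# --- pipeline above.
demo_idx = pd.date_range('2024-01-01', periods=96, freq='15min', tz='UTC')
demo = pd.DataFrame({'price': np.random.uniform(10, 20, size=len(demo_idx))},
                    index=demo_idx)
hourly = demo['price'].resample('1h').mean()  # one mean price per hour
print(hourly.head())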
# Normalise the data to the [0, 1] range to improve learning and convergence
# of the model. The scaler is fitted on the training split only, so no
# information from the test period leaks into the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_train = scaler.fit_transform(train)

# Create a supervised data structure with n time steps of input and 1 output
n = 3
X_train, y_train = [], []
for i in range(n, len(train)):
    X_train.append(scaled_train[i - n:i, 0])
    y_train.append(scaled_train[i, 0])

# Convert X_train and y_train to numpy arrays for training the LSTM model
X_train, y_train = np.array(X_train), np.array(y_train)

# Reshape the data: an LSTM expects 3-D input (samples, time steps, features)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

# Create and fit the LSTM network
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(LSTM(units=50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
# Note: batch_size=1 over 1000 epochs trains very slowly; kept as in the original run
model.fit(X_train, y_train, epochs=1000, batch_size=1, verbose=2)

# Test inputs: the last n training points followed by the test period,
# scaled with the scaler fitted on the training data
inputs = df_hourly_avg_arr[len(df_hourly_avg_arr) - len(test) - n:]
inputs = inputs.reshape(-1, 1)
inputs = scaler.transform(inputs)

# Create the test data set
X_test = []
for i in range(n, inputs.shape[0]):
    X_test.append(inputs[i - n:i, 0])

# Convert to a numpy array and reshape to 3-D
X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

predict_price = model.predict(X_test)
predict_price = scaler.inverse_transform(predict_price)
print(X_test.shape)

rmse = np.sqrt(np.mean(np.power(test - predict_price, 2)))
print(f'Test RMSE: {rmse:.4f}')

# Plot predicted vs actual values
train_df = df_hourly_avg[:split]
test_df = df_hourly_avg[split:].copy()
test_df['Predictions'] = predict_price

plt.figure(figsize=(20, 10))
sns.set_style("whitegrid")
plt.plot(train_df['price'], label='Training')
plt.plot(test_df['price'], label='Actual')
plt.plot(test_df['Predictions'], label='Predicted')
plt.title('Hourly Average Price (LSTM)', color='black', fontsize=20)
plt.xlabel('Date', color='black', fontsize=15)
plt.ylabel('Price', color='black', fontsize=15)
plt.legend()
plt.show()

model.save("/content/drive/MyDrive/Colab Notebooks/Platforms/my_model_.keras")
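# --- Optional sketch: reload the saved model and roll the forecast forward
# --- beyond the observed data. This is a minimal illustration, assuming the
# --- .keras file above was written and that `scaler`, `df_hourly_avg_arr`,
# --- and `n` are still in scope; `horizon` is a hypothetical parameter,
# --- not part of the original notebook.
from keras.models import load_model

loaded = load_model("/content/drive/MyDrive/Colab Notebooks/Platforms/my_model_.keras")

horizon = 24  # number of future hours to predict (assumption)
window = scaler.transform(df_hourly_avg_arr[-n:]).ravel().tolist()  # last n scaled prices
future_scaled = []
for _ in range(horizon):
    x = np.array(window[-n:]).reshape(1, n, 1)
    yhat = float(loaded.predict(x, verbose=0)[0, 0])
    future_scaled.append(yhat)
    window.append(yhat)  # feed each prediction back in as the newest step

# Undo the scaling to get prices back in the original units
future = scaler.inverse_transform(np.array(future_scaled).reshape(-1, 1))
print(future.ravel())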