Needs refactoring

This commit is contained in:
VictoriaPresnyakova 2024-11-05 20:50:07 +04:00
parent 01d05083dd
commit 203945b3e4
8 changed files with 136387 additions and 87 deletions

172
analysis/app.py Normal file
View File

@ -0,0 +1,172 @@
import numpy as np
import pandas as pd
from datetime import timedelta
from tensorflow.keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import io
import joblib
from flask import Flask, request, jsonify, Blueprint, send_file
from flasgger import Swagger
app = Flask(__name__)
api = Blueprint('api', __name__)
Swagger(app)

# Load the trained LSTM model and create the scaler used to normalise prices.
model = load_model("my_model_1H.keras")
scaler = MinMaxScaler(feature_range=(0, 1))

# Load the raw price history from the three CSV sources.
column_names = ['product_url', 'price', 'datetime']
df1 = pd.read_csv('parsed_data_public_price_history_1.csv')
df2 = pd.read_csv('parsed_data_public_price_history.csv', names=column_names, )
df3 = pd.read_csv('price_history.csv', names=column_names,)
df = pd.concat([df1, df2, df3])

# Normalise types: parse mixed-format timestamps as UTC, coerce prices to float.
df['datetime'] = pd.to_datetime(df['datetime'], format='mixed', utc=True)
df['price'] = df['price'].astype(float)

# Outlier filter with an IQR-style rule built on the 0.55/0.75 quantiles.
# NOTE(review): the conventional IQR uses the 0.25/0.75 quantiles — confirm
# that the 0.55 lower quantile is intentional.
q_low = df['price'].quantile(0.55)
q_hi = df['price'].quantile(0.75)
q_range = q_hi - q_low
df = df[(df['price'] < q_hi + 1.5 * q_range) & (df['price'] > q_low - 1.5 * q_range)]

# Aggregate to an hourly mean-price series indexed by timestamp.
# .copy() is required: df_hourly_avg is otherwise a slice of df, and the
# floor() assignment below would raise pandas' SettingWithCopyWarning and
# could silently fail to write.
df_hourly_avg = df[['price', 'datetime']].copy()
df_hourly_avg['datetime'] = df_hourly_avg['datetime'].dt.floor('1H')
df_hourly_avg = df_hourly_avg.groupby('datetime').agg({'price': 'mean'}).reset_index()
df_hourly_avg.set_index('datetime', inplace=True)
# Подготовка данных для прогнозирования
def prepare_data(df, days_forward=7):
    """Autoregressively forecast `days_forward` future prices.

    Parameters:
        df: DataFrame with a datetime index and a 'price' column.
        days_forward: number of future steps to predict (default 7).

    Returns:
        DataFrame with columns 'date' and 'predicted_price'.

    Side effects: re-fits the module-level `scaler` on the full series so
    predictions can be inverse-transformed back to price units.
    """
    last_date = df.index[-1]
    scaled_data = scaler.fit_transform(df[['price']].values)
    n = 3  # window length (time steps) the model was trained with

    # Seed the autoregressive loop with the most recent n observations.
    # BUGFIX: the original built windows with `range(n, len(scaled_data))`,
    # whose last row is scaled_data[-n-1:-1] — it excluded the newest data
    # point (off-by-one) and materialised the whole X_test only to keep the
    # final row.
    current_input = scaled_data[-n:, 0].reshape(n, 1)

    predictions = []
    for _ in range(days_forward):
        pred = model.predict(np.expand_dims(current_input, axis=0))
        predictions.append(pred[0, 0])
        # Slide the window: drop the oldest step, append the new prediction.
        current_input = np.append(current_input[1:], pred).reshape(n, 1)

    # Map the scaled predictions back to the original price range.
    predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

    # NOTE(review): the series is hourly but forecast steps are labelled one
    # *day* apart — confirm this mismatch is intentional.
    future_dates = [last_date + timedelta(days=i) for i in range(1, days_forward + 1)]
    return pd.DataFrame({'date': future_dates, 'predicted_price': predictions})
# Построение графика
def plot_price(forecast_df):
    """Render actual vs. predicted prices and return the chart as a PNG buffer.

    Parameters:
        forecast_df: DataFrame with 'date' and 'predicted_price' columns.

    Returns:
        io.BytesIO positioned at the start of the PNG bytes.
    """
    plt.figure(figsize=(14, 7))
    # Historical hourly averages first, forecast overlaid on top.
    plt.plot(df_hourly_avg.index, df_hourly_avg['price'], label='Actual Price', color='blue')
    plt.plot(forecast_df['date'], forecast_df['predicted_price'], label='Predicted Price', color='orange')
    plt.title("Price Prediction")
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.legend()
    plt.grid(True)
    # Serialise the figure into an in-memory buffer and release the figure.
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    buffer.seek(0)
    plt.close()
    return buffer
@api.route('/predict_price', methods=['GET'])
def predict_price():
    """
    Предсказание цены на 7 дней вперед
    ---
    responses:
      200:
        description: JSON с предсказаниями цен и днем минимальной цены
        schema:
          type: object
          properties:
            forecast:
              type: array
              items:
                type: object
                properties:
                  date:
                    type: string
                    format: date
                  predicted_price:
                    type: number
            min_price_day:
              type: object
              properties:
                date:
                  type: string
                  format: date
                price:
                  type: number
    """
    forecast_df = prepare_data(df_hourly_avg)

    # Serialise the forecast with plain floats and ISO (%Y-%m-%d) dates.
    # BUGFIX: the original left raw pandas Timestamps in the records, which
    # Flask renders in RFC-1123 HTTP-date format — inconsistent with the
    # 'date' format declared in the Swagger spec and with min_price_day below.
    forecast_list = [
        {
            'date': record['date'].strftime('%Y-%m-%d'),
            'predicted_price': float(record['predicted_price']),
        }
        for record in forecast_df.to_dict(orient='records')
    ]

    # Day with the lowest predicted price.
    min_price_day = forecast_df.loc[forecast_df['predicted_price'].idxmin()]

    return jsonify({
        'forecast': forecast_list,
        'min_price_day': {
            'date': min_price_day['date'].strftime('%Y-%m-%d'),
            'price': float(min_price_day['predicted_price'])
        }
    })
# Эндпоинт для получения графика
# Endpoint returning the forecast chart as a PNG image.
@api.route('/plot', methods=['GET'])
def plot():
    """
    Получение графика предсказанных и фактических цен
    ---
    responses:
      200:
        description: Возвращает график предсказанных и фактических цен в формате PNG
        content:
          image/png:
            schema:
              type: string
              format: binary
    """
    # Build a fresh forecast, render it, and stream the PNG back.
    forecast = prepare_data(df_hourly_avg)
    image_buffer = plot_price(forecast)
    return send_file(image_buffer, mimetype='image/png')
# Mount the API blueprint so every endpoint lives under the /api prefix.
app.register_blueprint(api, url_prefix='/api')
# Development-server entry point; debug mode must be disabled in production.
if __name__ == "__main__":
    app.run(debug=True)

View File

@ -0,0 +1,9 @@
# Minimal Compose file running a single-node ClickHouse server.
version: '3'
services:
  clickhouse:
    # NOTE(review): the yandex/clickhouse-server image is no longer maintained;
    # the project moved to clickhouse/clickhouse-server — consider migrating.
    # Pinning a version instead of :latest would make builds reproducible.
    image: yandex/clickhouse-server:latest
    ports:
      - "8123:8123"
      - "9000:9000"
    volumes:
      # Persist database files on the host across container restarts.
      - ./clickhouse-data:/var/lib/clickhouse

View File

@ -1,87 +0,0 @@
"""Exploratory price analysis and 30-day Prophet forecasts from test.csv."""

import pandas as pd
import matplotlib.pyplot as plt
from prophet import Prophet

# Load the price history and derive hour-of-day / day-of-week features.
data = pd.read_csv('test.csv')
data['date'] = pd.to_datetime(data['date'])
data['time_of_day'] = data['date'].dt.hour
data['day_of_week'] = data['date'].dt.dayofweek
print(data.head())

# Average price grouped by hour of day and by weekday.
price_by_time = data.groupby('time_of_day')['price'].mean()
price_by_day = data.groupby('day_of_week')['price'].mean()

# Visualise both aggregations side by side.
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(price_by_time)
plt.title('Цены в зависимости от времени суток')
plt.xlabel('Час дня')
plt.ylabel('Средняя цена')
plt.subplot(1, 2, 2)
plt.plot(price_by_day)
plt.title('Цены в зависимости от дня недели')
plt.xlabel('День недели')
plt.ylabel('Средняя цена')
plt.tight_layout()
plt.show()

# Forecast the next 30 days with Prophet (expects columns 'ds' and 'y').
df = data[['date', 'price']].rename(columns={'date': 'ds', 'price': 'y'})
model = Prophet()
model.fit(df)
future = model.make_future_dataframe(periods=30)
forecast = model.predict(future)
model.plot(forecast)
plt.title('Прогноз цен на следующие 30 дней')
plt.show()

# Same forecast with temperature as an extra regressor.
# (The original contained a no-op self-assignment of the temperature column;
# the CSV is assumed to already provide it — TODO confirm.)
df = data[['date', 'price', 'temperature']].rename(columns={'date': 'ds', 'price': 'y', 'temperature': 'temp'})
model = Prophet()
model.add_regressor('temp')
model.fit(df)
future = model.make_future_dataframe(periods=30)
future['temp'] = 22.0  # placeholder forecast temperature for all future rows
forecast = model.predict(future)
model.plot(forecast)
plt.title('Прогноз цен с учетом погодных условий')
plt.show()

BIN
analysis/my_model_1H.keras Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,154 @@
# -*- coding: utf-8 -*-
"""Platforms_train_v2.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1yD7QxO8rUrHXvYLn_z5eofUKenJqXZoU

Trains a stacked LSTM on hourly-averaged product prices and saves the model.
"""

import os
import numpy as np
import pandas as pd
from datetime import datetime
# Duplicate RandomForestRegressor import removed; metric imports merged.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw price history from the three CSV sources.
column_names = ['product_url', 'price', 'datetime']
df1 = pd.read_csv('parsed_data_public_price_history_1.csv')
df2 = pd.read_csv('parsed_data_public_price_history.csv', names=column_names,)
df3 = pd.read_csv('price_history.csv', names=column_names,)
df = pd.concat([df1, df2, df3])

# Parse mixed-format timestamps as UTC and coerce prices to float.
df['datetime'] = pd.to_datetime(df['datetime'], format='mixed', utc=True)
df['price'] = df['price'].astype(float)

# Outlier filter with an IQR-style rule over the 0.55/0.75 quantiles.
# NOTE(review): 0.25/0.75 is the conventional IQR — confirm 0.55 is intended.
q_low = df['price'].quantile(0.55)
q_hi = df['price'].quantile(0.75)
q_range = q_hi - q_low
df = df[(df['price'] < q_hi + 1.5 * q_range) & (df['price'] > q_low - 1.5 * q_range)]

# Hourly mean-price series, indexed by timestamp.
# .copy() prevents pandas' SettingWithCopyWarning on the floor() assignment.
df_hourly_avg = df[['price', 'datetime']].copy()
df_hourly_avg['datetime'] = df_hourly_avg['datetime'].dt.floor('1H')
df_hourly_avg = df_hourly_avg.groupby('datetime').agg({'price': 'mean'}).reset_index()
df_hourly_avg.set_index('datetime', inplace=True)

# Chronological 80/20 train/test split on the raw values.
df_hourly_avg_arr = df_hourly_avg.values
split = int(0.8 * len(df_hourly_avg_arr))
train, test = df_hourly_avg_arr[:split], df_hourly_avg_arr[split:]

# Scale prices to [0, 1] to improve LSTM convergence.
# NOTE(review): the scaler is fitted on the *entire* series, so test-set
# statistics leak into training — fitting on `train` only would avoid this.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df_hourly_avg_arr)

# Build supervised windows: n past steps -> next value.
n = 3
X_train, y_train = [], []
for i in range(n, len(train)):
    X_train.append(scaled_data[i - n:i, 0])
    y_train.append(scaled_data[i, 0])
X_train, y_train = np.array(X_train), np.array(y_train)
# LSTM expects 3-D input: (samples, time steps, features).
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

# Two stacked 50-unit LSTM layers with a single linear output unit.
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(LSTM(units=50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
# NOTE(review): 1000 epochs at batch_size=1 is very slow — confirm intended.
model.fit(X_train, y_train, epochs=1000, batch_size=1, verbose=2)

# Build test windows; the last n training points provide context for the
# first test prediction.
inputs = df_hourly_avg_arr[len(df_hourly_avg_arr) - len(test) - n:]
inputs = inputs.reshape(-1, 1)
inputs = scaler.transform(inputs)
X_test = []
for i in range(n, inputs.shape[0]):
    X_test.append(inputs[i - n:i, 0])
X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Predict and invert the scaling back to price units.
predict_price = model.predict(X_test)
predict_price = scaler.inverse_transform(predict_price)
print(X_test.shape)

rmse = np.sqrt(np.mean(np.power((test - predict_price), 2)))
print(rmse)

# Plot predicted vs actual values.
train = df_hourly_avg[:split]
# .copy() so adding the Predictions column does not write into a slice view.
test = df_hourly_avg[split:].copy()
test['Predictions'] = predict_price
plt.figure(figsize=(20, 10))
sns.set_style("whitegrid")
plt.plot(train['price'], label='Training')
plt.plot(test['price'], label='Actual')
plt.plot(test['Predictions'], label='Predicted')
plt.title("AZN Close Price - LSTM", color='black', fontsize=20)
plt.xlabel('Date', color='black', fontsize=15)
plt.ylabel('Price', color='black', fontsize=15)
plt.legend()

model.save("/content/drive/MyDrive/Colab Notebooks/Platforms/my_model_.keras")

45158
analysis/price_history.csv Normal file

File diff suppressed because it is too large Load Diff