{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Лабораторная работа 3\n", "\n", "Датасет - **Цены на кофе**\thttps://www.kaggle.com/datasets/mayankanand2701/starbucks-stock-price-dataset\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Бизнес-цели**: \n", "1. Прогнозирование цены закрытия акции для поддержки принятия решений по инвестициям.\n", "2. Оценка волатильности акций Starbucks для долгосрочных стратегий инвестирования." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Цели технического проекта**: \n", "1. Создание модели машинного обучения для прогнозирования цены закрытия акций на основе исторических данных (дат, цен открытия, максимальных и минимальных цен, объёма торгов).\n", "2. Разработка системы, которая вычисляет и анализирует волатильность на основе исторической ценовой информации и объёмов торгов." ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')\n", "0 8212\n", "1 8215\n", "2 8216\n", "3 8217\n", "4 8218\n", " ... \n", "8031 19860\n", "8032 19863\n", "8033 19864\n", "8034 19865\n", "8035 19866\n", "Name: Date_numeric, Length: 8036, dtype: int64\n", "Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',\n", " 'Date_numeric', 'Close_binned'],\n", " dtype='object')\n", "Обучающая выборка: (4821, 9)\n", "Close\n", "0.765625 16\n", "0.835938 10\n", "0.750000 10\n", "0.882813 10\n", "0.753906 9\n", " ..\n", "17.745001 1\n", "4.757500 1\n", "20.174999 1\n", "94.080002 1\n", "54.599998 1\n", "Name: count, Length: 3663, dtype: int64\n", "Контрольная выборка: (1607, 9)\n", "Close\n", "0.750000 9\n", "0.773438 6\n", "0.703125 6\n", "0.765625 5\n", "0.898438 4\n", " ..\n", "1.804688 1\n", "0.656250 1\n", "13.740000 1\n", "27.799999 1\n", "84.260002 1\n", "Name: count, Length: 1421, dtype: int64\n", "Тестовая выборка: (1607, 9)\n", "Close\n", "0.742188 6\n", "0.789063 6\n", "1.367188 5\n", "0.750000 5\n", "0.781250 5\n", " ..\n", "57.810001 1\n", "111.239998 1\n", "4.132813 1\n", "38.915001 1\n", "96.760002 1\n", "Name: count, Length: 1425, dtype: int64\n", "Обучающая выборка: (4821, 9)\n", "Close_binned\n", "High 1639\n", "Low 1591\n", "Medium 1591\n", "Name: count, dtype: int64\n", "Контрольная выборка: (1607, 9)\n", "Close_binned\n", "High 547\n", "Low 530\n", "Medium 530\n", "Name: count, dtype: int64\n", "Тестовая выборка: (1607, 9)\n", "Close_binned\n", "High 546\n", "Medium 531\n", "Low 530\n", "Name: count, dtype: int64\n", "Обучающая выборка после undersampling: (4773, 9)\n", "Close\n", "0.765625 16\n", "0.835938 10\n", "0.882813 10\n", "0.750000 10\n", "0.753906 9\n", " ..\n", "80.669998 1\n", "57.799999 1\n", "57.230000 1\n", "85.169998 1\n", "11.795000 1\n", "Name: count, Length: 3630, dtype: int64\n", "Open float64\n", "High float64\n", "Low float64\n", "Adj Close float64\n", "Volume float64\n", "Date_numeric int64\n", "Close_binned_Low bool\n", "Close_binned_Medium bool\n", "Close_binned_High bool\n", "Volume_binned int64\n", "Price_change float64\n", "Volatility float64\n", "dtype: object\n", "Средняя абсолютная ошибка: 0.28906502132778844\n", "Среднеквадратичная ошибка: 0.2722364217628283\n", "время, затраченное на обучение модели: 0.016208171844482422. Время, затраченное на предсказание: 0.0012598037719726562\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Python312\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", " warnings.warn(\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from imblearn.under_sampling import RandomUnderSampler\n", "\n", "df = pd.read_csv(\"data/Coffe.csv\")\n", "print(df.columns)\n", "\n", "df['Date'] = pd.to_datetime(df['Date'])\n", "df['Date_numeric'] = (df['Date'] - pd.Timestamp('1970-01-01')).dt.days\n", "print(df['Date_numeric'])\n", "\n", "def split_stratified_into_train_val_test(\n", " df_input,\n", " stratify_colname=\"y\",\n", " frac_train=0.6,\n", " frac_val=0.15,\n", " frac_test=0.25,\n", " random_state=None,\n", "):\n", " if frac_train + frac_val + frac_test != 1.0:\n", " raise ValueError(\n", " \"fractions %f, %f, %f do not add up to 1.0\"\n", " % (frac_train, frac_val, frac_test)\n", " )\n", "\n", " if stratify_colname not in df_input.columns:\n", " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", "\n", " X = df_input \n", " y = df_input[\n", " [stratify_colname]\n", " ] \n", "\n", " df_train, df_temp, y_train, y_temp = train_test_split(\n", " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", " )\n", "\n", " relative_frac_test = frac_test / (frac_val + frac_test)\n", " df_val, df_test, y_val, y_test = train_test_split(\n", " df_temp,\n", " y_temp,\n", " stratify=y_temp,\n", " test_size=relative_frac_test,\n", " random_state=random_state,\n", " )\n", "\n", " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", "\n", " return df_train, df_val, df_test\n", "\n", "bins = [df['Close'].min(), df['Close'].quantile(0.33), df['Close'].quantile(0.66), df['Close'].max()]\n", "labels = ['Low', 'Medium', 'High']\n", "df['Close_binned'] = pd.cut(df['Close'], bins=bins, labels=labels)\n", "df = df.dropna()\n", "# Now stratify using the binned values\n", "df_train, df_val, df_test = split_stratified_into_train_val_test(\n", " df, stratify_colname=\"Close_binned\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", ")\n", "\n", "print(df_train.columns) \n", " \n", "print(\"Обучающая выборка: \", df_train.shape)\n", "print(df_train.Close.value_counts()) \n", "\n", "print(\"Контрольная выборка: \", df_val.shape)\n", "print(df_val.Close.value_counts())\n", "\n", "print(\"Тестовая выборка: \", df_test.shape)\n", "print(df_test.Close.value_counts())\n", "\n", "print(\"Обучающая выборка: \", df_train.shape)\n", "print(df_train['Close_binned'].value_counts())\n", "\n", "print(\"Контрольная выборка: \", df_val.shape)\n", "print(df_val['Close_binned'].value_counts())\n", "\n", "print(\"Тестовая выборка: \", df_test.shape)\n", "print(df_test['Close_binned'].value_counts())\n", "\n", "rus = RandomUnderSampler(random_state=42)\n", "X_resampled, y_resampled = rus.fit_resample(df_train, df_train[\"Close_binned\"])\n", "df_train_rus = pd.DataFrame(X_resampled)\n", "print(\"Обучающая выборка после undersampling: \", df_train_rus.shape)\n", "print(df_train_rus.Close.value_counts())\n", "\n", "df_train = pd.get_dummies(df_train, columns=['Close_binned'])\n", "\n", "df_train['Volume_binned'] = pd.qcut(df_train['Volume'], q=4, labels=False)\n", "\n", "df_train['Price_change'] = df_train['Close'] - df_train['Open']\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "scaler = StandardScaler()\n", "df_train[['Open', 'Close', 'High', 'Low', 'Volume']] = scaler.fit_transform(df_train[['Open', 'Close', 'High', 'Low', 'Volume']])\n", "\n", "df_train['Volatility'] = df_train['High'] - df_train['Low']\n", "\n", "import featuretools as ft\n", "\n", "es = ft.EntitySet(id=\"stocks\")\n", "es = es.add_dataframe(\n", " dataframe_name=\"stock_data\", \n", " dataframe=df_train, \n", " index=\"Date\")\n", "\n", "feature_matrix, feature_defs = ft.dfs(\n", " entityset=es, \n", " target_dataframe_name=\"stock_data\")\n", "\n", "feature_defs\n", "\n", "# Оценка предсказательной способности\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error\n", "df_train_regression = df_train.copy()\n", "\n", "X_train = df_train_regression.drop(['Close', 'Date'], axis=1)\n", "y_train = df_train_regression['Close']\n", "\n", "X_test = df_test.drop(['Close', 'Date'], axis=1)\n", "y_test = df_test['Close']\n", "\n", "X_train_encoded = pd.get_dummies(X_train, drop_first=True)\n", "X_test_encoded = pd.get_dummies(X_test, drop_first=True)\n", "\n", "X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)\n", "\n", "print(X_train_encoded.dtypes)\n", "\n", "model = LinearRegression()\n", "model.fit(X_train_encoded, y_train)\n", "\n", "predictions = model.predict(X_test_encoded)\n", "\n", "mae = mean_absolute_error(y_test, predictions)\n", "mse = mean_squared_error(y_test, predictions)\n", "print(\"Средняя абсолютная ошибка:\", mae)\n", "print(\"Среднеквадратичная ошибка:\", mse)\n", "\n", "# Оценка скорости вычисления\n", "import time\n", "start_time = time.time()\n", "model.fit(X_train_encoded, y_train)\n", "training_time = time.time() - start_time\n", "\n", "start_time = time.time()\n", "predictions = model.predict(X_test_encoded)\n", "prediction_time = time.time() - start_time\n", "\n", "print(f'время, затраченное на обучение модели: {training_time}. Время, затраченное на предсказание: {prediction_time}')\n", "\n", "# Оценка корреляции\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", "corr_matrix = df_train_regression.corr()\n", "sns.heatmap(corr_matrix, annot=False)\n", "plt.show()\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 2 }