{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Лабораторная работа 3\n", "\n", "Датасет - **Цены на кофе**\thttps://www.kaggle.com/datasets/mayankanand2701/starbucks-stock-price-dataset\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Бизнес-цели**: \n", "1. Прогнозирование цены закрытия акции для поддержки принятия решений по инвестициям.\n", "2. Оценка волатильности акций Starbucks для долгосрочных стратегий инвестирования." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Цели технического проекта**: \n", "1. Создание модели машинного обучения для прогнозирования цены закрытия акций на основе исторических данных (дат, цен открытия, максимальных и минимальных цен, объёма торгов).\n", "2. Разработка системы, которая вычисляет и анализирует волатильность на основе исторической ценовой информации и объёмов торгов." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Date Open High Low Close Adj Close \\\n", "0 1992-06-26 0.328125 0.347656 0.320313 0.335938 0.260703 \n", "1 1992-06-29 0.339844 0.367188 0.332031 0.359375 0.278891 \n", "2 1992-06-30 0.367188 0.371094 0.343750 0.347656 0.269797 \n", "3 1992-07-01 0.351563 0.359375 0.339844 0.355469 0.275860 \n", "4 1992-07-02 0.359375 0.359375 0.347656 0.355469 0.275860 \n", "... ... ... ... ... ... ... \n", "8031 2024-05-17 75.269997 78.000000 74.919998 77.849998 77.849998 \n", "8032 2024-05-20 77.680000 78.320000 76.709999 77.540001 77.540001 \n", "8033 2024-05-21 77.559998 78.220001 77.500000 77.720001 77.720001 \n", "8034 2024-05-22 77.699997 81.019997 77.440002 80.720001 80.720001 \n", "8035 2024-05-23 80.099998 80.699997 79.169998 79.260002 79.260002 \n", "\n", " Volume \n", "0 224358400 \n", "1 58732800 \n", "2 34777600 \n", "3 18316800 \n", "4 13996800 \n", "... ... \n", "8031 14436500 \n", "8032 11183800 \n", "8033 8916600 \n", "8034 22063400 \n", "8035 4651418 \n", "\n", "[8036 rows x 7 columns]\n", "0 8212\n", "1 8215\n", "2 8216\n", "3 8217\n", "4 8218\n", " ... \n", "8031 19860\n", "8032 19863\n", "8033 19864\n", "8034 19865\n", "8035 19866\n", "Name: Date_numeric, Length: 8036, dtype: int64\n" ] } ], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from imblearn.under_sampling import RandomUnderSampler\n", "\n", "df = pd.read_csv(\"data/Coffe.csv\")\n", "print(df)\n", "\n", "df['Date'] = pd.to_datetime(df['Date'])\n", "df['Date_numeric'] = (df['Date'] - pd.Timestamp('1970-01-01')).dt.days\n", "print(df['Date_numeric'])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',\n", " 'Date_numeric', 'Close_binned'],\n", " dtype='object')\n", "Обучающая выборка: (4821, 9)\n", "Close\n", "0.750000 17\n", "0.765625 15\n", "0.882813 11\n", "0.753906 9\n", "0.773438 9\n", " ..\n", "7.760000 1\n", "88.459999 1\n", "104.330002 1\n", "10.850000 1\n", "100.930000 1\n", "Name: count, Length: 3690, dtype: int64\n", "Контрольная выборка: (1607, 9)\n", "Close\n", "0.835938 6\n", "0.781250 5\n", "0.757813 5\n", "1.851563 4\n", "0.738281 4\n", " ..\n", "100.620003 1\n", "6.020000 1\n", "85.959999 1\n", "91.529999 1\n", "111.000000 1\n", "Name: count, Length: 1436, dtype: int64\n", "Тестовая выборка: (1607, 9)\n", "Close\n", "0.703125 6\n", "0.851563 6\n", "0.750000 6\n", "0.742188 5\n", "0.781250 5\n", " ..\n", "47.275002 1\n", "31.760000 1\n", "75.500000 1\n", "2.406250 1\n", "8.107500 1\n", "Name: count, Length: 1427, dtype: int64\n", "Обучающая выборка: (4821, 9)\n", "Close_binned\n", "High 1639\n", "Low 1591\n", "Medium 1591\n", "Name: count, dtype: int64\n", "Контрольная выборка: (1607, 9)\n", "Close_binned\n", "High 546\n", "Medium 531\n", "Low 530\n", "Name: count, dtype: int64\n", "Тестовая выборка: (1607, 9)\n", "Close_binned\n", "High 547\n", "Low 530\n", "Medium 530\n", "Name: count, dtype: int64\n" ] } ], "source": [ "def split_stratified_into_train_val_test(\n", " df_input,\n", " stratify_colname=\"y\",\n", " frac_train=0.6,\n", " frac_val=0.15,\n", " frac_test=0.25,\n", " random_state=None,\n", "):\n", " if frac_train + frac_val + frac_test != 1.0:\n", " raise ValueError(\n", " \"fractions %f, %f, %f do not add up to 1.0\"\n", " % (frac_train, frac_val, frac_test)\n", " )\n", "\n", " if stratify_colname not in df_input.columns:\n", " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n", "\n", " X = df_input \n", " y = df_input[\n", " [stratify_colname]\n", " ] \n", "\n", " df_train, df_temp, y_train, y_temp = train_test_split(\n", " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n", " )\n", "\n", " relative_frac_test = frac_test / (frac_val + frac_test)\n", " df_val, df_test, y_val, y_test = train_test_split(\n", " df_temp,\n", " y_temp,\n", " stratify=y_temp,\n", " test_size=relative_frac_test,\n", " random_state=random_state,\n", " )\n", "\n", " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n", "\n", " return df_train, df_val, df_test\n", "\n", "bins = [df['Close'].min(), df['Close'].quantile(0.33), df['Close'].quantile(0.66), df['Close'].max()]\n", "labels = ['Low', 'Medium', 'High']\n", "df['Close_binned'] = pd.cut(df['Close'], bins=bins, labels=labels)\n", "df = df.dropna()\n", "# Now stratify using the binned values\n", "df_train, df_val, df_test = split_stratified_into_train_val_test(\n", " df, stratify_colname=\"Close_binned\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", ")\n", "\n", "print(df_train.columns) \n", " \n", "print(\"Обучающая выборка: \", df_train.shape)\n", "print(df_train.Close.value_counts()) \n", "\n", "print(\"Контрольная выборка: \", df_val.shape)\n", "print(df_val.Close.value_counts())\n", "\n", "print(\"Тестовая выборка: \", df_test.shape)\n", "print(df_test.Close.value_counts())\n", "\n", "print(\"Обучающая выборка: \", df_train.shape)\n", "print(df_train['Close_binned'].value_counts())\n", "\n", "print(\"Контрольная выборка: \", df_val.shape)\n", "print(df_val['Close_binned'].value_counts())\n", "\n", "print(\"Тестовая выборка: \", df_test.shape)\n", "print(df_test['Close_binned'].value_counts())\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Обучающая выборка после undersampling: (4773, 9)\n", "Close\n", "0.750000 17\n", "0.765625 15\n", "0.882813 11\n", "0.773438 9\n", "0.753906 9\n", " ..\n", "58.810001 1\n", "40.535000 1\n", "91.860001 1\n", "90.779999 1\n", "96.970001 1\n", "Name: count, Length: 3651, dtype: int64\n" ] } ], "source": [ "rus = RandomUnderSampler(random_state=42)\n", "X_resampled, y_resampled = rus.fit_resample(df_train, df_train[\"Close_binned\"])\n", "df_train_rus = pd.DataFrame(X_resampled)\n", "print(\"Обучающая выборка после undersampling: \", df_train_rus.shape)\n", "print(df_train_rus.Close.value_counts())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Date Open High Low Close Adj Close \\\n", "75 1992-10-13 0.464844 0.472656 0.457031 0.472656 0.366803 \n", "7819 2023-07-17 100.830002 101.809998 100.040001 100.930000 98.501541 \n", "6447 2018-01-31 57.230000 57.450001 56.700001 56.810001 49.579262 \n", "706 1995-04-12 0.769531 0.789063 0.769531 0.785156 0.609317 \n", "4437 2010-02-05 10.895000 11.020000 10.630000 10.850000 8.420099 \n", "... ... ... ... ... ... ... \n", "4113 2008-10-22 5.120000 5.245000 4.880000 4.995000 3.876349 \n", "4544 2010-07-12 12.635000 12.760000 12.490000 12.635000 9.845955 \n", "6517 2018-05-11 57.720001 57.860001 57.070000 57.270000 50.514595 \n", "3336 2005-09-21 11.642500 11.775000 11.530000 11.667500 9.054512 \n", "3122 2004-11-15 13.797500 13.860000 13.687500 13.790000 10.701671 \n", "\n", " Volume Date_numeric Close_binned_Low Close_binned_Medium \\\n", "75 4390400 8321 True False \n", "7819 5244500 19555 False False \n", "6447 13118400 17562 False False \n", "706 10294400 9232 True False \n", "4437 22069800 14645 False True \n", "... ... ... ... ... \n", "4113 29681400 14174 True False \n", "4544 12906200 14802 False True \n", "6517 5843400 17662 False False \n", "3336 16207600 13047 False True \n", "3122 10700400 12737 False True \n", "\n", " Close_binned_High Volume_binned Price_change \n", "75 False 0 0.007812 \n", "7819 True 0 0.099998 \n", "6447 True 2 -0.419999 \n", "706 False 1 0.015625 \n", "4437 False 3 -0.045000 \n", "... ... ... ... \n", "4113 False 3 -0.125000 \n", "4544 False 2 0.000000 \n", "6517 True 0 -0.450001 \n", "3336 False 2 0.025000 \n", "3122 False 1 -0.007500 \n", "\n", "[4821 rows x 13 columns]\n" ] } ], "source": [ "df_train = pd.get_dummies(df_train, columns=['Close_binned'])\n", "df_train['Volume_binned'] = pd.qcut(df_train['Volume'], q=4, labels=False)\n", "df_train['Price_change'] = df_train['Close'] - df_train['Open']\n", "print(df_train) " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Date Open High Low Close Adj Close Volume \\\n", "75 1992-10-13 -0.881061 -0.882121 -0.880250 -0.881109 0.366803 -0.731159 \n", "7819 2023-07-17 2.093371 2.095750 2.098759 2.096893 98.501541 -0.670368 \n", "6447 2018-01-31 0.801237 0.792199 0.802249 0.788980 49.579262 -0.109940 \n", "706 1995-04-12 -0.872031 -0.872824 -0.870902 -0.871845 0.609317 -0.310940 \n", "4437 2010-02-05 -0.571952 -0.572180 -0.575927 -0.573479 8.420099 0.527179 \n", "... ... ... ... ... ... ... ... \n", "4113 2008-10-22 -0.743100 -0.741883 -0.747938 -0.747047 3.876349 1.068937 \n", "4544 2010-07-12 -0.520385 -0.521049 -0.520286 -0.520563 9.845955 -0.125044 \n", "6517 2018-05-11 0.815758 0.804247 0.813318 0.802616 50.514595 -0.627741 \n", "3336 2005-09-21 -0.549799 -0.549994 -0.549004 -0.549244 9.054512 0.109935 \n", "3122 2004-11-15 -0.485933 -0.488725 -0.484463 -0.486324 10.701671 -0.282042 \n", "\n", " Date_numeric Close_binned_Low Close_binned_Medium Close_binned_High \\\n", "75 8321 True False False \n", "7819 19555 False False True \n", "6447 17562 False False True \n", "706 9232 True False False \n", "4437 14645 False True False \n", "... ... ... ... ... \n", "4113 14174 True False False \n", "4544 14802 False True False \n", "6517 17662 False False True \n", "3336 13047 False True False \n", "3122 12737 False True False \n", "\n", " Volume_binned Price_change Volatility \n", "75 0 0.007812 -0.001871 \n", "7819 0 0.099998 -0.003009 \n", "6447 2 -0.419999 -0.010050 \n", "706 1 0.015625 -0.001922 \n", "4437 3 -0.045000 0.003747 \n", "... ... ... ... \n", "4113 3 -0.125000 0.006055 \n", "4544 2 0.000000 -0.000763 \n", "6517 0 -0.450001 -0.009070 \n", "3336 2 0.025000 -0.000990 \n", "3122 1 -0.007500 -0.004262 \n", "\n", "[4821 rows x 14 columns]\n" ] } ], "source": [ "from sklearn.preprocessing import StandardScaler\n", "\n", "scaler = StandardScaler()\n", "df_train[['Open', 'Close', 'High', 'Low', 'Volume']] = scaler.fit_transform(\n", " df_train[['Open', 'Close', 'High', 'Low', 'Volume']])\n", "df_train['Volatility'] = df_train['High'] - df_train['Low']\n", "print(df_train) " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Python312\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", " warnings.warn(\n" ] }, { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import featuretools as ft\n", "\n", "es = ft.EntitySet(id=\"stocks\")\n", "es = es.add_dataframe(\n", " dataframe_name=\"stock_data\", \n", " dataframe=df_train, \n", " index=\"Date\")\n", "\n", "feature_matrix, feature_defs = ft.dfs(\n", " entityset=es, \n", " target_dataframe_name=\"stock_data\")\n", "\n", "feature_defs" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Open float64\n", "High float64\n", "Low float64\n", "Adj Close float64\n", "Volume float64\n", "Date_numeric int64\n", "Close_binned_Low bool\n", "Close_binned_Medium bool\n", "Close_binned_High bool\n", "Volume_binned int64\n", "Price_change float64\n", "Volatility float64\n", "dtype: object\n" ] } ], "source": [ "# Оценка предсказательной способности\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error\n", "df_train_regression = df_train.copy()\n", "\n", "X_train = df_train_regression.drop(['Close', 'Date'], axis=1)\n", "y_train = df_train_regression['Close']\n", "X_test = df_test.drop(['Close', 'Date'], axis=1)\n", "y_test = df_test['Close']\n", "\n", "X_train_encoded = pd.get_dummies(X_train, drop_first=True)\n", "X_test_encoded = pd.get_dummies(X_test, drop_first=True)\n", "\n", "X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)\n", "\n", "print(X_train_encoded.dtypes)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Средняя абсолютная ошибка: 0.28573230577357767\n", "Среднеквадратичная ошибка: 0.2813734754209575\n" ] } ], "source": [ "model = LinearRegression()\n", "model.fit(X_train_encoded, y_train)\n", "\n", "predictions = model.predict(X_test_encoded)\n", "\n", "mae = mean_absolute_error(y_test, predictions)\n", "mse = mean_squared_error(y_test, predictions)\n", "print(\"Средняя абсолютная ошибка:\", mae)\n", "print(\"Среднеквадратичная ошибка:\", mse)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "время, затраченное на обучение модели: 0.025032997131347656. Время, затраченное на предсказание: 0.0\n" ] } ], "source": [ "# Оценка скорости вычисления\n", "import time\n", "start_time = time.time()\n", "model.fit(X_train_encoded, y_train)\n", "training_time = time.time() - start_time\n", "\n", "start_time = time.time()\n", "predictions = model.predict(X_test_encoded)\n", "prediction_time = time.time() - start_time\n", "\n", "print(f'время, затраченное на обучение модели: {training_time}. Время, затраченное на предсказание: {prediction_time}')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Оценка корреляции\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", "corr_matrix = df_train_regression.corr()\n", "sns.heatmap(corr_matrix, annot=False)\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 2 }