351 lines
76 KiB
Plaintext
351 lines
76 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Лабораторная работа 3\n",
|
|
"\n",
|
|
"Датасет - **Цены на кофе**\thttps://www.kaggle.com/datasets/mayankanand2701/starbucks-stock-price-dataset\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**Бизнес-цели**: \n",
|
|
"1. Прогнозирование цены закрытия акции для поддержки принятия решений по инвестициям.\n",
|
|
"2. Оценка волатильности акций Starbucks для долгосрочных стратегий инвестирования."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**Цели технического проекта**: \n",
|
|
"1. Создание модели машинного обучения для прогнозирования цены закрытия акций на основе исторических данных (дат, цен открытия, максимальных и минимальных цен, объёма торгов).\n",
|
|
"2. Разработка системы, которая вычисляет и анализирует волатильность на основе исторической ценовой информации и объёмов торгов."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')\n",
|
|
"0 8212\n",
|
|
"1 8215\n",
|
|
"2 8216\n",
|
|
"3 8217\n",
|
|
"4 8218\n",
|
|
" ... \n",
|
|
"8031 19860\n",
|
|
"8032 19863\n",
|
|
"8033 19864\n",
|
|
"8034 19865\n",
|
|
"8035 19866\n",
|
|
"Name: Date_numeric, Length: 8036, dtype: int64\n",
|
|
"Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',\n",
|
|
" 'Date_numeric', 'Close_binned'],\n",
|
|
" dtype='object')\n",
|
|
"Обучающая выборка: (4821, 9)\n",
|
|
"Close\n",
|
|
"0.765625 16\n",
|
|
"0.835938 10\n",
|
|
"0.750000 10\n",
|
|
"0.882813 10\n",
|
|
"0.753906 9\n",
|
|
" ..\n",
|
|
"17.745001 1\n",
|
|
"4.757500 1\n",
|
|
"20.174999 1\n",
|
|
"94.080002 1\n",
|
|
"54.599998 1\n",
|
|
"Name: count, Length: 3663, dtype: int64\n",
|
|
"Контрольная выборка: (1607, 9)\n",
|
|
"Close\n",
|
|
"0.750000 9\n",
|
|
"0.773438 6\n",
|
|
"0.703125 6\n",
|
|
"0.765625 5\n",
|
|
"0.898438 4\n",
|
|
" ..\n",
|
|
"1.804688 1\n",
|
|
"0.656250 1\n",
|
|
"13.740000 1\n",
|
|
"27.799999 1\n",
|
|
"84.260002 1\n",
|
|
"Name: count, Length: 1421, dtype: int64\n",
|
|
"Тестовая выборка: (1607, 9)\n",
|
|
"Close\n",
|
|
"0.742188 6\n",
|
|
"0.789063 6\n",
|
|
"1.367188 5\n",
|
|
"0.750000 5\n",
|
|
"0.781250 5\n",
|
|
" ..\n",
|
|
"57.810001 1\n",
|
|
"111.239998 1\n",
|
|
"4.132813 1\n",
|
|
"38.915001 1\n",
|
|
"96.760002 1\n",
|
|
"Name: count, Length: 1425, dtype: int64\n",
|
|
"Обучающая выборка: (4821, 9)\n",
|
|
"Close_binned\n",
|
|
"High 1639\n",
|
|
"Low 1591\n",
|
|
"Medium 1591\n",
|
|
"Name: count, dtype: int64\n",
|
|
"Контрольная выборка: (1607, 9)\n",
|
|
"Close_binned\n",
|
|
"High 547\n",
|
|
"Low 530\n",
|
|
"Medium 530\n",
|
|
"Name: count, dtype: int64\n",
|
|
"Тестовая выборка: (1607, 9)\n",
|
|
"Close_binned\n",
|
|
"High 546\n",
|
|
"Medium 531\n",
|
|
"Low 530\n",
|
|
"Name: count, dtype: int64\n",
|
|
"Обучающая выборка после undersampling: (4773, 9)\n",
|
|
"Close\n",
|
|
"0.765625 16\n",
|
|
"0.835938 10\n",
|
|
"0.882813 10\n",
|
|
"0.750000 10\n",
|
|
"0.753906 9\n",
|
|
" ..\n",
|
|
"80.669998 1\n",
|
|
"57.799999 1\n",
|
|
"57.230000 1\n",
|
|
"85.169998 1\n",
|
|
"11.795000 1\n",
|
|
"Name: count, Length: 3630, dtype: int64\n",
|
|
"Open float64\n",
|
|
"High float64\n",
|
|
"Low float64\n",
|
|
"Adj Close float64\n",
|
|
"Volume float64\n",
|
|
"Date_numeric int64\n",
|
|
"Close_binned_Low bool\n",
|
|
"Close_binned_Medium bool\n",
|
|
"Close_binned_High bool\n",
|
|
"Volume_binned int64\n",
|
|
"Price_change float64\n",
|
|
"Volatility float64\n",
|
|
"dtype: object\n",
|
|
"Средняя абсолютная ошибка: 0.28906502132778844\n",
|
|
"Среднеквадратичная ошибка: 0.2722364217628283\n",
|
|
"время, затраченное на обучение модели: 0.016208171844482422. Время, затраченное на предсказание: 0.0012598037719726562\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"c:\\Python312\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|
" warnings.warn(\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 2 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|
"\n",
|
|
"df = pd.read_csv(\"data/Coffe.csv\")\n",
|
|
"print(df.columns)\n",
|
|
"\n",
|
|
"df['Date'] = pd.to_datetime(df['Date'])\n",
|
|
"df['Date_numeric'] = (df['Date'] - pd.Timestamp('1970-01-01')).dt.days\n",
|
|
"print(df['Date_numeric'])\n",
|
|
"\n",
|
|
"def split_stratified_into_train_val_test(\n",
|
|
" df_input,\n",
|
|
" stratify_colname=\"y\",\n",
|
|
" frac_train=0.6,\n",
|
|
" frac_val=0.15,\n",
|
|
" frac_test=0.25,\n",
|
|
" random_state=None,\n",
|
|
"):\n",
|
|
" if frac_train + frac_val + frac_test != 1.0:\n",
|
|
" raise ValueError(\n",
|
|
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
|
" % (frac_train, frac_val, frac_test)\n",
|
|
" )\n",
|
|
"\n",
|
|
" if stratify_colname not in df_input.columns:\n",
|
|
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
|
"\n",
|
|
" X = df_input \n",
|
|
" y = df_input[\n",
|
|
" [stratify_colname]\n",
|
|
" ] \n",
|
|
"\n",
|
|
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
|
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
|
" )\n",
|
|
"\n",
|
|
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
|
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
|
" df_temp,\n",
|
|
" y_temp,\n",
|
|
" stratify=y_temp,\n",
|
|
" test_size=relative_frac_test,\n",
|
|
" random_state=random_state,\n",
|
|
" )\n",
|
|
"\n",
|
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
|
"\n",
|
|
" return df_train, df_val, df_test\n",
|
|
"\n",
|
|
"bins = [df['Close'].min(), df['Close'].quantile(0.33), df['Close'].quantile(0.66), df['Close'].max()]\n",
|
|
"labels = ['Low', 'Medium', 'High']\n",
|
|
"df['Close_binned'] = pd.cut(df['Close'], bins=bins, labels=labels)\n",
|
|
"df = df.dropna()\n",
|
|
"# Now stratify using the binned values\n",
|
|
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
|
" df, stratify_colname=\"Close_binned\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
|
")\n",
|
|
"\n",
|
|
"print(df_train.columns) \n",
|
|
" \n",
|
|
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
|
"print(df_train.Close.value_counts()) \n",
|
|
"\n",
|
|
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
|
"print(df_val.Close.value_counts())\n",
|
|
"\n",
|
|
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
|
"print(df_test.Close.value_counts())\n",
|
|
"\n",
|
|
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
|
"print(df_train['Close_binned'].value_counts())\n",
|
|
"\n",
|
|
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
|
"print(df_val['Close_binned'].value_counts())\n",
|
|
"\n",
|
|
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
|
"print(df_test['Close_binned'].value_counts())\n",
|
|
"\n",
|
|
"rus = RandomUnderSampler(random_state=42)\n",
|
|
"X_resampled, y_resampled = rus.fit_resample(df_train, df_train[\"Close_binned\"])\n",
|
|
"df_train_rus = pd.DataFrame(X_resampled)\n",
|
|
"print(\"Обучающая выборка после undersampling: \", df_train_rus.shape)\n",
|
|
"print(df_train_rus.Close.value_counts())\n",
|
|
"\n",
|
|
"df_train = pd.get_dummies(df_train, columns=['Close_binned'])\n",
|
|
"\n",
|
|
"df_train['Volume_binned'] = pd.qcut(df_train['Volume'], q=4, labels=False)\n",
|
|
"\n",
|
|
"df_train['Price_change'] = df_train['Close'] - df_train['Open']\n",
|
|
"\n",
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|
"\n",
|
|
"scaler = StandardScaler()\n",
|
|
"df_train[['Open', 'Close', 'High', 'Low', 'Volume']] = scaler.fit_transform(df_train[['Open', 'Close', 'High', 'Low', 'Volume']])\n",
|
|
"\n",
|
|
"df_train['Volatility'] = df_train['High'] - df_train['Low']\n",
|
|
"\n",
|
|
"import featuretools as ft\n",
|
|
"\n",
|
|
"es = ft.EntitySet(id=\"stocks\")\n",
|
|
"es = es.add_dataframe(\n",
|
|
" dataframe_name=\"stock_data\", \n",
|
|
" dataframe=df_train, \n",
|
|
" index=\"Date\")\n",
|
|
"\n",
|
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|
" entityset=es, \n",
|
|
" target_dataframe_name=\"stock_data\")\n",
|
|
"\n",
|
|
"feature_defs\n",
|
|
"\n",
|
|
"# Оценка предсказательной способности\n",
|
|
"from sklearn.linear_model import LinearRegression\n",
|
|
"from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
|
|
"df_train_regression = df_train.copy()\n",
|
|
"\n",
|
|
"X_train = df_train_regression.drop(['Close', 'Date'], axis=1)\n",
|
|
"y_train = df_train_regression['Close']\n",
|
|
"\n",
|
|
"X_test = df_test.drop(['Close', 'Date'], axis=1)\n",
|
|
"y_test = df_test['Close']\n",
|
|
"\n",
|
|
"X_train_encoded = pd.get_dummies(X_train, drop_first=True)\n",
|
|
"X_test_encoded = pd.get_dummies(X_test, drop_first=True)\n",
|
|
"\n",
|
|
"X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)\n",
|
|
"\n",
|
|
"print(X_train_encoded.dtypes)\n",
|
|
"\n",
|
|
"model = LinearRegression()\n",
|
|
"model.fit(X_train_encoded, y_train)\n",
|
|
"\n",
|
|
"predictions = model.predict(X_test_encoded)\n",
|
|
"\n",
|
|
"mae = mean_absolute_error(y_test, predictions)\n",
|
|
"mse = mean_squared_error(y_test, predictions)\n",
|
|
"print(\"Средняя абсолютная ошибка:\", mae)\n",
|
|
"print(\"Среднеквадратичная ошибка:\", mse)\n",
|
|
"\n",
|
|
"# Оценка скорости вычисления\n",
|
|
"import time\n",
|
|
"start_time = time.time()\n",
|
|
"model.fit(X_train_encoded, y_train)\n",
|
|
"training_time = time.time() - start_time\n",
|
|
"\n",
|
|
"start_time = time.time()\n",
|
|
"predictions = model.predict(X_test_encoded)\n",
|
|
"prediction_time = time.time() - start_time\n",
|
|
"\n",
|
|
"print(f'время, затраченное на обучение модели: {training_time}. Время, затраченное на предсказание: {prediction_time}')\n",
|
|
"\n",
|
|
"# Оценка корреляции\n",
|
|
"import seaborn as sns\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"\n",
|
|
"corr_matrix = df_train_regression.corr()\n",
|
|
"sns.heatmap(corr_matrix, annot=False)\n",
|
|
"plt.show()\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|