857 lines
188 KiB
Plaintext
857 lines
188 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Вариант задания: Прогнозирование объема продаж в кофейне\n",
|
||
"### Бизнес-цели:\n",
|
||
"Цель: Разработать модель машинного обучения, которая позволит прогнозировать объем продаж кофе в зависимости от его других характеристик (стоимость открытия, стоимость закрытия)\n",
|
||
"\n",
|
||
"### Цели технического проекта:\n",
|
||
"\n",
|
||
"Сбор и подготовка данных:\n",
|
||
"Очистка данных от пропусков, выбросов и дубликатов.\n",
|
||
"Преобразование категориальных переменных в числовые.\n",
|
||
"Разделение данных на обучающую и тестовую выборки.\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 8036 entries, 0 to 8035\n",
|
||
"Data columns (total 8 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Date 8036 non-null object \n",
|
||
" 1 Open 8036 non-null float64 \n",
|
||
" 2 High 8036 non-null float64 \n",
|
||
" 3 Low 8036 non-null float64 \n",
|
||
" 4 Close 8036 non-null float64 \n",
|
||
" 5 Adj Close 8036 non-null float64 \n",
|
||
" 6 Volume 8036 non-null int64 \n",
|
||
" 7 date 8036 non-null datetime64[ns]\n",
|
||
"dtypes: datetime64[ns](1), float64(5), int64(1), object(1)\n",
|
||
"memory usage: 502.4+ KB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pn\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import matplotlib\n",
|
||
"import matplotlib.ticker as ticker\n",
|
||
"from datetime import datetime\n",
|
||
"import matplotlib.dates as md\n",
|
||
"\n",
|
||
"df = pn.read_csv(\".//static//csv//Starbucks Dataset.csv\")\n",
|
||
"print(df.columns)\n",
|
||
"\n",
|
||
"df[\"date\"] = df.apply(lambda row: datetime.strptime(row[\"Date\"], \"%Y-%m-%d\"), axis=1)\n",
|
||
"df.info()\n",
|
||
"#print(df['date'].head)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Разделим на 3 выборки\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Размер обучающей выборки: 5142\n",
|
||
"Размер контрольной выборки: 1286\n",
|
||
"Размер тестовой выборки: 1608\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"\n",
|
||
"# Разделение данных на обучающую и тестовую выборки (80% - обучение, 20% - тест)\n",
|
||
"train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Разделение обучающей выборки на обучающую и контрольную (80% - обучение, 20% - контроль)\n",
|
||
"train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"print(\"Размер обучающей выборки:\", len(train_data))\n",
|
||
"print(\"Размер контрольной выборки:\", len(val_data))\n",
|
||
"print(\"Размер тестовой выборки:\", len(test_data))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"\n",
|
||
"import seaborn as sns\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"\n",
|
||
"# Гистограмма распределения объема в обучающей выборке\n",
|
||
"sns.histplot(train_data[\"Volume\"], kde=True)\n",
|
||
"plt.title('Распределение цены в обучающей выборке')\n",
|
||
"plt.show()\n",
|
||
"\n",
|
||
"# Гистограмма распределения объема в контрольной выборке\n",
|
||
"sns.histplot(val_data[\"Volume\"], kde=True)\n",
|
||
"plt.title('Распределение цены в контрольной выборке')\n",
|
||
"plt.show()\n",
|
||
"\n",
|
||
"# Гистограмма распределения объема в тестовой выборке\n",
|
||
"sns.histplot(test_data[\"Volume\"], kde=True)\n",
|
||
"plt.title('Распределение цены в тестовой выборке')\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Процесс конструирования признаков\n",
|
||
"\n",
|
||
"\n",
|
||
"\n",
|
||
"### Унитарное кодирование категориальных признаков (one-hot encoding)\n",
|
||
"\n",
|
||
"One-hot encoding: Преобразование категориальных признаков в бинарные векторы."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Пример категориальных признаков\n",
|
||
"categorical_features = [\n",
|
||
" \"Date\",\n",
|
||
" \"date\"\n",
|
||
"]\n",
|
||
"\n",
|
||
"# Применение one-hot encoding\n",
|
||
"train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)\n",
|
||
"val_data_encoded = pd.get_dummies(val_data, columns=categorical_features)\n",
|
||
"test_data_encoded = pd.get_dummies(test_data, columns=categorical_features)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Дискретизация числовых признаков "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>High</th>\n",
|
||
" <th>High</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>8016</th>\n",
|
||
" <td>89.250000</td>\n",
|
||
" <td>(84.329, 126.32]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8017</th>\n",
|
||
" <td>88.610001</td>\n",
|
||
" <td>(84.329, 126.32]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8018</th>\n",
|
||
" <td>88.989998</td>\n",
|
||
" <td>(84.329, 126.32]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8019</th>\n",
|
||
" <td>76.989998</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8020</th>\n",
|
||
" <td>75.150002</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8021</th>\n",
|
||
" <td>75.510002</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8022</th>\n",
|
||
" <td>74.190002</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8023</th>\n",
|
||
" <td>72.849998</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8024</th>\n",
|
||
" <td>74.470001</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8025</th>\n",
|
||
" <td>75.760002</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8026</th>\n",
|
||
" <td>76.309998</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8027</th>\n",
|
||
" <td>76.839996</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8028</th>\n",
|
||
" <td>76.730003</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8029</th>\n",
|
||
" <td>76.029999</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8030</th>\n",
|
||
" <td>75.550003</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8031</th>\n",
|
||
" <td>78.000000</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8032</th>\n",
|
||
" <td>78.320000</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8033</th>\n",
|
||
" <td>78.220001</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8034</th>\n",
|
||
" <td>81.019997</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8035</th>\n",
|
||
" <td>80.699997</td>\n",
|
||
" <td>(42.338, 84.329]</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" High High\n",
|
||
"8016 89.250000 (84.329, 126.32]\n",
|
||
"8017 88.610001 (84.329, 126.32]\n",
|
||
"8018 88.989998 (84.329, 126.32]\n",
|
||
"8019 76.989998 (42.338, 84.329]\n",
|
||
"8020 75.150002 (42.338, 84.329]\n",
|
||
"8021 75.510002 (42.338, 84.329]\n",
|
||
"8022 74.190002 (42.338, 84.329]\n",
|
||
"8023 72.849998 (42.338, 84.329]\n",
|
||
"8024 74.470001 (42.338, 84.329]\n",
|
||
"8025 75.760002 (42.338, 84.329]\n",
|
||
"8026 76.309998 (42.338, 84.329]\n",
|
||
"8027 76.839996 (42.338, 84.329]\n",
|
||
"8028 76.730003 (42.338, 84.329]\n",
|
||
"8029 76.029999 (42.338, 84.329]\n",
|
||
"8030 75.550003 (42.338, 84.329]\n",
|
||
"8031 78.000000 (42.338, 84.329]\n",
|
||
"8032 78.320000 (42.338, 84.329]\n",
|
||
"8033 78.220001 (42.338, 84.329]\n",
|
||
"8034 81.019997 (42.338, 84.329]\n",
|
||
"8035 80.699997 (42.338, 84.329]"
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"\n",
|
||
"labels = [\"low hight price\", \"medium hight price\", \"big hight price\"]\n",
|
||
"num_bins = 3\n",
|
||
"\n",
|
||
"hist1, bins1 = np.histogram(\n",
|
||
" df[\"High\"].fillna(df[\"High\"].median()), bins=num_bins\n",
|
||
")\n",
|
||
"bins1, hist1\n",
|
||
"\n",
|
||
"pd.concat([df[\"High\"], pd.cut(df[\"High\"], list(bins1))], axis=1).tail(20)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>High</th>\n",
|
||
" <th>High</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.347656</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.367188</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.371094</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0.359375</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0.359375</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>0.355469</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>0.355469</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>0.355469</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>0.359375</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>0.367188</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>0.371094</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>0.382813</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>0.382813</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>0.414063</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>0.437500</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>0.437500</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>0.445313</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>0.437500</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>0.441406</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>0.449219</td>\n",
|
||
" <td>low hight price</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" High High\n",
|
||
"0 0.347656 NaN\n",
|
||
"1 0.367188 low hight price\n",
|
||
"2 0.371094 low hight price\n",
|
||
"3 0.359375 low hight price\n",
|
||
"4 0.359375 low hight price\n",
|
||
"5 0.355469 low hight price\n",
|
||
"6 0.355469 low hight price\n",
|
||
"7 0.355469 low hight price\n",
|
||
"8 0.359375 low hight price\n",
|
||
"9 0.367188 low hight price\n",
|
||
"10 0.371094 low hight price\n",
|
||
"11 0.382813 low hight price\n",
|
||
"12 0.382813 low hight price\n",
|
||
"13 0.414063 low hight price\n",
|
||
"14 0.437500 low hight price\n",
|
||
"15 0.437500 low hight price\n",
|
||
"16 0.445313 low hight price\n",
|
||
"17 0.437500 low hight price\n",
|
||
"18 0.441406 low hight price\n",
|
||
"19 0.449219 low hight price"
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd.concat(\n",
|
||
" [df[\"High\"], pd.cut(df[\"High\"], list(bins1), labels=labels)], axis=1\n",
|
||
").head(20)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Ручной синтез"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Пример синтеза признака среднего значения в максимальной и минимальной цене\n",
|
||
"train_data_encoded[\"medium\"] = train_data_encoded[\"High\"] / train_data_encoded[\"Low\"]\n",
|
||
"val_data_encoded[\"medium\"] = val_data_encoded[\"High\"] / val_data_encoded[\"Low\"]\n",
|
||
"test_data_encoded[\"medium\"] = test_data_encoded[\"High\"] / test_data_encoded[\"Low\"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
|
||
"\n",
|
||
"# Пример масштабирования числовых признаков\n",
|
||
"numerical_features = [\"Open\", \"Close\"]\n",
|
||
"\n",
|
||
"scaler = StandardScaler()\n",
|
||
"train_data_encoded[numerical_features] = scaler.fit_transform(train_data_encoded[numerical_features])\n",
|
||
"val_data_encoded[numerical_features] = scaler.transform(val_data_encoded[numerical_features])\n",
|
||
"test_data_encoded[numerical_features] = scaler.transform(test_data_encoded[numerical_features])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Конструирование признаков с применением фреймворка Featuretools"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
||
" warnings.warn(\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
||
" warnings.warn(\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
||
" df = pd.concat([df, default_df], sort=True)\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
||
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
||
" df = pd.concat([df, default_df], sort=True)\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
||
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import featuretools as ft\n",
|
||
"\n",
|
||
"# Определение сущностей\n",
|
||
"es = ft.EntitySet(id='coffee_data')\n",
|
||
"es = es.add_dataframe(dataframe_name='starbucks', dataframe=train_data_encoded, index='id')\n",
|
||
"\n",
|
||
"\n",
|
||
"# Генерация признаков\n",
|
||
"feature_matrix, feature_defs = ft.dfs(\n",
|
||
" entityset=es, target_dataframe_name=\"starbucks\", max_depth=2\n",
|
||
")\n",
|
||
"\n",
|
||
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
||
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n",
|
||
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Оценка качества каждого набора признаков\n",
|
||
"Предсказательная способность\n",
|
||
"Метрики: RMSE, MAE, R²\n",
|
||
"\n",
|
||
"Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
|
||
"\n",
|
||
"Скорость вычисления\n",
|
||
"Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
|
||
"\n",
|
||
"Надежность\n",
|
||
"Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
|
||
"\n",
|
||
"Корреляция\n",
|
||
"Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
|
||
"\n",
|
||
"Цельность\n",
|
||
"Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:724: UserWarning: A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: index\n",
|
||
" warnings.warn(\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
||
" warnings.warn(\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
||
" df = pd.concat([df, default_df], sort=True)\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
||
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
||
" df = pd.concat([df, default_df], sort=True)\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
||
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import featuretools as ft\n",
|
||
"\n",
|
||
"# Определение сущностей\n",
|
||
"es = ft.EntitySet(id='coffee_data')\n",
|
||
"es = es.add_dataframe(\n",
|
||
" dataframe_name=\"starbucks\", dataframe=train_data_encoded, index=\"id\"\n",
|
||
")\n",
|
||
"\n",
|
||
"# Генерация признаков\n",
|
||
"feature_matrix, feature_defs = ft.dfs(\n",
|
||
" entityset=es, target_dataframe_name=\"starbucks\", max_depth=2\n",
|
||
")\n",
|
||
"\n",
|
||
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
||
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n",
|
||
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"RMSE: 2885972.9324181927\n",
|
||
"R²: 0.9328285916832842\n",
|
||
"MAE: 1680373.6776608187\n",
|
||
"Cross-validated RMSE: 12160466.835803727\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Train RMSE: 4388457.199779966\n",
|
||
"Train R²: 0.9082228071090095\n",
|
||
"Train MAE: 1787810.5665033064\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1000x600 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.ensemble import RandomForestRegressor\n",
|
||
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
|
||
"from sklearn.model_selection import cross_val_score\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"\n",
|
||
"# Удаление строк с NaN\n",
|
||
"feature_matrix = feature_matrix.dropna()\n",
|
||
"val_feature_matrix = val_feature_matrix.dropna()\n",
|
||
"test_feature_matrix = test_feature_matrix.dropna()\n",
|
||
"\n",
|
||
"# Разделение данных на обучающую и тестовую выборки\n",
|
||
"X_train = feature_matrix.drop(\"Volume\", axis=1)\n",
|
||
"y_train = feature_matrix[\"Volume\"]\n",
|
||
"X_val = val_feature_matrix.drop(\"Volume\", axis=1)\n",
|
||
"y_val = val_feature_matrix[\"Volume\"]\n",
|
||
"X_test = test_feature_matrix.drop(\"Volume\", axis=1)\n",
|
||
"y_test = test_feature_matrix[\"Volume\"]\n",
|
||
"\n",
|
||
"# Выбор модели\n",
|
||
"model = RandomForestRegressor(random_state=42)\n",
|
||
"\n",
|
||
"# Обучение модели\n",
|
||
"model.fit(X_train, y_train)\n",
|
||
"\n",
|
||
"# Предсказание и оценка\n",
|
||
"y_pred = model.predict(X_test)\n",
|
||
"\n",
|
||
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
|
||
"r2 = r2_score(y_test, y_pred)\n",
|
||
"mae = mean_absolute_error(y_test, y_pred)\n",
|
||
"\n",
|
||
"print(f\"RMSE: {rmse}\")\n",
|
||
"print(f\"R²: {r2}\")\n",
|
||
"print(f\"MAE: {mae}\")\n",
|
||
"\n",
|
||
"# Кросс-валидация\n",
|
||
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
|
||
"rmse_cv = (-scores.mean())**0.5\n",
|
||
"print(f\"Cross-validated RMSE: {rmse_cv}\")\n",
|
||
"\n",
|
||
"# Анализ важности признаков\n",
|
||
"feature_importances = model.feature_importances_\n",
|
||
"feature_names = X_train.columns\n",
|
||
"\n",
|
||
"\n",
|
||
"# Проверка на переобучение\n",
|
||
"y_train_pred = model.predict(X_train)\n",
|
||
"\n",
|
||
"rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)\n",
|
||
"r2_train = r2_score(y_train, y_train_pred)\n",
|
||
"mae_train = mean_absolute_error(y_train, y_train_pred)\n",
|
||
"\n",
|
||
"print(f\"Train RMSE: {rmse_train}\")\n",
|
||
"print(f\"Train R²: {r2_train}\")\n",
|
||
"print(f\"Train MAE: {mae_train}\")\n",
|
||
"\n",
|
||
"# Визуализация результатов\n",
|
||
"plt.figure(figsize=(10, 6))\n",
|
||
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
|
||
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n",
|
||
"plt.xlabel(\"Actual Volume\")\n",
|
||
"plt.ylabel(\"Predicted Volume\")\n",
|
||
"plt.title(\"Actual vs Predicted Volume\")\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Точность предсказаний: Модель показывает довольно высокий R² (0.9328 на тестовой выборке), что указывает на хорошее объяснение вариации объема продаж. Значения RMSE (≈2.89 млн) и MAE (≈1.68 млн) следует оценивать относительно масштаба целевой переменной Volume; при таком R² модель достаточно точно предсказывает объем продаж.\n",
|
||
"\n",
|
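||
"Переобучение: Разница между RMSE на обучающей (≈4.39 млн) и тестовой (≈2.89 млн) выборках не очень большая, что указывает на то, что переобучение не является критическим. Однако RMSE по кросс-валидации (≈12.16 млн) заметно выше обеих оценок, поэтому стоит быть осторожным, перепроверить оценку на тестовой выборке и продолжать мониторинг этого показателя.\n"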
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "aisenv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.6"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|