From e765be06652a8e79ed5c2ad372d27b081a42bdb1 Mon Sep 17 00:00:00 2001 From: "a.puchkina" Date: Sat, 9 Nov 2024 11:43:06 +0400 Subject: [PATCH] =?UTF-8?q?=D0=BB=D1=8F=D0=BB=D1=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + lab_3/lab3.ipynb | 1403 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1406 insertions(+) create mode 100644 lab_3/lab3.ipynb diff --git a/.gitignore b/.gitignore index 207d123..8b52d1a 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ ipython_config.py # Remove previous ipynb_checkpoints # git rm -r .ipynb_checkpoints/ +# virtual +aimenv/ +static/ \ No newline at end of file diff --git a/lab_3/lab3.ipynb b/lab_3/lab3.ipynb new file mode 100644 index 0000000..665e5d5 --- /dev/null +++ b/lab_3/lab3.ipynb @@ -0,0 +1,1403 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Набор данных с ценами на мобильные устройства" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Вывод всех столбцов" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['Unnamed: 0', 'Name', 'Rating', 'Spec_score', 'No_of_sim', 'Ram',\n", + " 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version',\n", + " 'Price', 'company', 'Inbuilt_memory', 'fast_charging',\n", + " 'Screen_resolution', 'Processor', 'Processor_name'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd \n", + "df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", + "print(df.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Бизнес-цели:\n", + "1. Классифицировать мобильные устройства по ценовым категориям (например, бюджетные, средний класс, флагманы).\n", + "2. 
Определить, какие характеристики мобильных устройств наиболее сильно влияют на их рейтинг." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выполним разбиение на 3 выборки: обучающую, контрольную и тестовую" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 671\n", + "Размер контрольной выборки: 288\n", + "Размер тестовой выборки: 411\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Загрузка данных\n", + "df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", + "\n", + "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", + "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", + "\n", + "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", + "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", + "\n", + "# Вывод размеров выборок\n", + "print(\"Размер обучающей выборки:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Распределение классов в company:\n", + "company\n", + "Vivo 186\n", + "Realme 186\n", + "Samsung 181\n", + "Motorola 127\n", + "Xiaomi 90\n", + "Honor 88\n", + "Poco 75\n", + "OnePlus 75\n", + "Huawei 62\n", + "iQOO 57\n", + "OPPO 38\n", + "Oppo 27\n", + "TCL 26\n", + "Google 23\n", + "Asus 21\n", + "POCO 19\n", + "Lava 19\n", + "Nothing 15\n", + "Lenovo 14\n", + "Tecno 13\n", + "itel 12\n", + "LG 6\n", + "Gionee 5\n", + "Itel 3\n", + "IQOO 1\n", + "Coolpad 1\n", + "Name: count, dtype: int64\n" + ] + }, + { + 
"data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Распределение классов в Обучающей выборке:\n", + "company\n", + "Vivo 138\n", + "Samsung 128\n", + "Realme 125\n", + "Motorola 89\n", + "Xiaomi 66\n", + "Honor 59\n", + "OnePlus 56\n", + "Poco 52\n", + "Huawei 46\n", + "iQOO 37\n", + "Oppo 21\n", + "OPPO 20\n", + "Google 16\n", + "Lava 16\n", + "POCO 14\n", + "TCL 14\n", + "Asus 12\n", + "Lenovo 12\n", + "itel 10\n", + "Nothing 8\n", + "Tecno 8\n", + "LG 5\n", + "Gionee 4\n", + "IQOO 1\n", + "Itel 1\n", + "Coolpad 1\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Распределение классов в Контрольной выборке:\n", + "company\n", + "Realme 26\n", + "Samsung 26\n", + "Vivo 22\n", + "Motorola 18\n", + "Honor 15\n", + "OPPO 13\n", + "Poco 12\n", + "Xiaomi 11\n", + "iQOO 11\n", + "OnePlus 8\n", + "Huawei 7\n", + "Asus 7\n", + "TCL 6\n", + "POCO 5\n", + "Oppo 4\n", + "Google 4\n", + "Tecno 3\n", + "Nothing 3\n", + "itel 2\n", + "Lava 1\n", + "Lenovo 1\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Распределение классов в Тестовой выборке:\n", + "company\n", + "Realme 35\n", + "Samsung 27\n", + "Vivo 26\n", + "Motorola 20\n", + "Honor 14\n", + "Xiaomi 13\n", + "Poco 11\n", + "OnePlus 11\n", + "Huawei 9\n", + "iQOO 9\n", + "TCL 6\n", + "OPPO 5\n", + "Nothing 4\n", + "Google 3\n", + "Lava 2\n", + "Asus 2\n", + "Oppo 2\n", + "Tecno 2\n", + "Itel 2\n", + "Gionee 1\n", + "Lenovo 1\n", + "LG 1\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "# Загрузка данных\n", + "df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", + "\n", + "# Проверка распределения классов в столбце company\n", + "class_distribution = df['company'].value_counts()\n", + "print(\"Распределение классов в company:\")\n", + "print(class_distribution)\n", + "\n", + "# Визуализация распределения классов\n", + "sns.countplot(y='company', data=df, order=class_distribution.index)\n", + "plt.title('Распределение классов в company')\n", + "plt.show()\n", + "\n", + "# Проверка сбалансированности для каждой выборки\n", + "def check_balance(df, title):\n", + " class_distribution = df['company'].value_counts()\n", + " print(f\"Распределение классов в {title}:\")\n", + " print(class_distribution)\n", + " sns.countplot(y='company', data=df, order=class_distribution.index)\n", + " plt.title(f'Распределение классов в {title}')\n", + " plt.show()\n", + "\n", + "# Разделение данных на обучающую, контрольную и тестовую выборки\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)\n", + "val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n", + "\n", + "# Проверка сбалансированности для обучающей, контрольной и тестовой выборок\n", + "check_balance(train_df, 'Обучающей выборке')\n", + "check_balance(val_df, 'Контрольной выборке')\n", + "check_balance(test_df, 'Тестовой выборке')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Данные по столбцу company являются несбалансированными. Некоторые компании, такие как Vivo, Realme, и Samsung, имеют значительно больше устройств, чем другие, такие как LG, Gionee, и Itel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки до upsampling: 671\n", + "Размер контрольной выборки: 288\n", + "Размер тестовой выборки: 411\n", + "\n", + "Распределение классов в всем датасете:\n", + "Класс Vivo: 186 (13.58%)\n", + "Класс Realme: 186 (13.58%)\n", + "Класс Samsung: 181 (13.21%)\n", + "Класс Motorola: 127 (9.27%)\n", + "Класс Xiaomi: 90 (6.57%)\n", + "Класс Honor: 88 (6.42%)\n", + "Класс Poco: 75 (5.47%)\n", + "Класс OnePlus: 75 (5.47%)\n", + "Класс Huawei: 62 (4.53%)\n", + "Класс iQOO: 57 (4.16%)\n", + "Класс OPPO: 38 (2.77%)\n", + "Класс Oppo: 27 (1.97%)\n", + "Класс TCL: 26 (1.90%)\n", + "Класс Google: 23 (1.68%)\n", + "Класс Asus: 21 (1.53%)\n", + "Класс POCO: 19 (1.39%)\n", + "Класс Lava: 19 (1.39%)\n", + "Класс Nothing: 15 (1.09%)\n", + "Класс Lenovo: 14 (1.02%)\n", + "Класс Tecno: 13 (0.95%)\n", + "Класс itel: 12 (0.88%)\n", + "Класс LG: 6 (0.44%)\n", + "Класс Gionee: 5 (0.36%)\n", + "Класс Itel: 3 (0.22%)\n", + "Класс IQOO: 1 (0.07%)\n", + "Класс Coolpad: 1 (0.07%)\n", + "\n", + "Распределение классов в Обучающей выборке до upsampling:\n", + "Класс Vivo: 94 (14.01%)\n", + "Класс Samsung: 89 (13.26%)\n", + "Класс Realme: 82 (12.22%)\n", + "Класс Motorola: 66 (9.84%)\n", + "Класс Xiaomi: 46 (6.86%)\n", + "Класс Honor: 40 (5.96%)\n", + "Класс OnePlus: 40 (5.96%)\n", + "Класс Poco: 37 (5.51%)\n", + "Класс Huawei: 35 (5.22%)\n", + "Класс iQOO: 28 (4.17%)\n", + "Класс OPPO: 15 (2.24%)\n", + "Класс Oppo: 14 (2.09%)\n", + "Класс Lava: 12 (1.79%)\n", + "Класс Google: 12 (1.79%)\n", + "Класс TCL: 10 (1.49%)\n", + "Класс Lenovo: 9 (1.34%)\n", + "Класс POCO: 9 (1.34%)\n", + "Класс Asus: 8 (1.19%)\n", + "Класс itel: 7 (1.04%)\n", + "Класс Nothing: 5 (0.75%)\n", + "Класс Tecno: 5 (0.75%)\n", + "Класс LG: 3 (0.45%)\n", + "Класс Gionee: 3 (0.45%)\n", + "Класс Coolpad: 1 (0.15%)\n", + "Класс Itel: 1 (0.15%)\n", + "Размер 
обучающей выборки после upsampling: 2350\n", + "\n", + "Распределение классов в Обучающей выборке после upsampling:\n", + "Класс Realme: 94 (4.00%)\n", + "Класс Motorola: 94 (4.00%)\n", + "Класс Vivo: 94 (4.00%)\n", + "Класс Lava: 94 (4.00%)\n", + "Класс Lenovo: 94 (4.00%)\n", + "Класс TCL: 94 (4.00%)\n", + "Класс OPPO: 94 (4.00%)\n", + "Класс Honor: 94 (4.00%)\n", + "Класс Poco: 94 (4.00%)\n", + "Класс itel: 94 (4.00%)\n", + "Класс Oppo: 94 (4.00%)\n", + "Класс iQOO: 94 (4.00%)\n", + "Класс Samsung: 94 (4.00%)\n", + "Класс Xiaomi: 94 (4.00%)\n", + "Класс LG: 94 (4.00%)\n", + "Класс Huawei: 94 (4.00%)\n", + "Класс OnePlus: 94 (4.00%)\n", + "Класс Google: 94 (4.00%)\n", + "Класс Tecno: 94 (4.00%)\n", + "Класс Asus: 94 (4.00%)\n", + "Класс Gionee: 94 (4.00%)\n", + "Класс POCO: 94 (4.00%)\n", + "Класс Nothing: 94 (4.00%)\n", + "Класс Coolpad: 94 (4.00%)\n", + "Класс Itel: 94 (4.00%)\n", + "\n", + "Распределение классов в Контрольной выборке:\n", + "Класс Vivo: 44 (15.28%)\n", + "Класс Realme: 43 (14.93%)\n", + "Класс Samsung: 39 (13.54%)\n", + "Класс Motorola: 23 (7.99%)\n", + "Класс Xiaomi: 20 (6.94%)\n", + "Класс Honor: 19 (6.60%)\n", + "Класс OnePlus: 16 (5.56%)\n", + "Класс Poco: 15 (5.21%)\n", + "Класс Huawei: 11 (3.82%)\n", + "Класс iQOO: 9 (3.12%)\n", + "Класс Oppo: 7 (2.43%)\n", + "Класс POCO: 5 (1.74%)\n", + "Класс OPPO: 5 (1.74%)\n", + "Класс Google: 4 (1.39%)\n", + "Класс Asus: 4 (1.39%)\n", + "Класс TCL: 4 (1.39%)\n", + "Класс Lava: 4 (1.39%)\n", + "Класс itel: 3 (1.04%)\n", + "Класс Nothing: 3 (1.04%)\n", + "Класс Tecno: 3 (1.04%)\n", + "Класс Lenovo: 3 (1.04%)\n", + "Класс LG: 2 (0.69%)\n", + "Класс Gionee: 1 (0.35%)\n", + "Класс IQOO: 1 (0.35%)\n", + "\n", + "Распределение классов в Тестовой выборке:\n", + "Класс Realme: 61 (14.84%)\n", + "Класс Samsung: 53 (12.90%)\n", + "Класс Vivo: 48 (11.68%)\n", + "Класс Motorola: 38 (9.25%)\n", + "Класс Honor: 29 (7.06%)\n", + "Класс Xiaomi: 24 (5.84%)\n", + "Класс Poco: 23 (5.60%)\n", + "Класс iQOO: 20 
(4.87%)\n", + "Класс OnePlus: 19 (4.62%)\n", + "Класс OPPO: 18 (4.38%)\n", + "Класс Huawei: 16 (3.89%)\n", + "Класс TCL: 12 (2.92%)\n", + "Класс Asus: 9 (2.19%)\n", + "Класс Google: 7 (1.70%)\n", + "Класс Nothing: 7 (1.70%)\n", + "Класс Oppo: 6 (1.46%)\n", + "Класс POCO: 5 (1.22%)\n", + "Класс Tecno: 5 (1.22%)\n", + "Класс Lava: 3 (0.73%)\n", + "Класс Lenovo: 2 (0.49%)\n", + "Класс itel: 2 (0.49%)\n", + "Класс Itel: 2 (0.49%)\n", + "Класс LG: 1 (0.24%)\n", + "Класс Gionee: 1 (0.24%)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "\n", + "# Загрузка данных\n", + "df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", + "\n", + "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", + "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", + "\n", + "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", + "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", + "\n", + "# Вывод размеров выборок\n", + "print(\"Размер обучающей выборки до upsampling:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))\n", + "\n", + "# Функция для проверки балансировки данных\n", + "def check_balance(df, title):\n", + " class_distribution = df['company'].value_counts()\n", + " print(f\"\\nРаспределение классов в {title}:\")\n", + " for cls, count in class_distribution.items():\n", + " print(f\"Класс {cls}: {count} ({count / len(df) * 100:.2f}%)\")\n", + "\n", + "# Проверка балансировки для всего датасета\n", + "check_balance(df, 'всем датасете')\n", + "\n", + "# Проверка балансировки для обучающей выборки до upsampling\n", + "check_balance(train_df, 'Обучающей выборке до upsampling')\n", + "\n", + "# Применение upsampling 
к обучающей выборке\n", + "X_train = train_df.drop('company', axis=1) # Отделяем признаки от целевой переменной\n", + "y_train = train_df['company'] # Целевая переменная\n", + "\n", + "# Инициализация RandomOverSampler\n", + "ros = RandomOverSampler(random_state=42)\n", + "\n", + "# Применение upsampling\n", + "X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n", + "\n", + "# Создание нового DataFrame с балансированными данными\n", + "train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n", + "\n", + "# Вывод размеров выборок после upsampling\n", + "print(\"Размер обучающей выборки после upsampling:\", len(train_df_resampled))\n", + "\n", + "# Проверка балансировки для обучающей выборки после upsampling\n", + "check_balance(train_df_resampled, 'Обучающей выборке после upsampling')\n", + "\n", + "# Проверка балансировки для контрольной и тестовой выборок (они не должны измениться)\n", + "check_balance(val_df, 'Контрольной выборке')\n", + "check_balance(test_df, 'Тестовой выборке')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Данные были сбалансированы. Теперь можно перейти к конструированию признаков. Поставлены следующие задачи:\n", + "1. Классифицировать мобильные устройства по ценовым категориям (например, бюджетные, средний класс, флагманы).\n", + "2. Определить, какие характеристики мобильных устройств наиболее сильно влияют на их рейтинг." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "\n", + "# Определение категориальных признаков\n", + "categorical_features = [\n", + " 'Rating', 'Ram',\n", + " 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version',\n", + " 'Price', 'company', 'Inbuilt_memory', 'fast_charging',\n", + " 'Screen_resolution', 'Processor'\n", + "]\n", + "\n", + "# Применение one-hot encoding к обучающей выборке\n", + "train_df_resampled_encoded = pd.get_dummies(train_df_resampled, columns=categorical_features)\n", + "\n", + "# Применение one-hot encoding к контрольной выборке\n", + "val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n", + "\n", + "# Применение one-hot encoding к тестовой выборке\n", + "test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Дискретизация числовых признаков" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки после балансировки: (5600, 22)\n", + "Размер контрольной выборки: (288, 22)\n", + "Размер тестовой выборки: (411, 22)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "import re\n", + "\n", + "# Загрузка данных\n", + "df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", + "\n", + "# Извлечение числовых значений из столбца Battery\n", + "df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", + "df['Ram'] = df['Ram'].apply(lambda x: int(re.search(r'\\d+', x).group()) if 
re.search(r'\\d+', x) else None)\n", + "df['Camera'] = df['Camera'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", + "\n", + "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", + "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", + "\n", + "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", + "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", + "\n", + "# Применение upsampling к обучающей выборке (если это необходимо)\n", + "X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n", + "y_train = train_df['Price'] # Целевая переменная\n", + "\n", + "# Инициализация RandomOverSampler\n", + "ros = RandomOverSampler(random_state=42)\n", + "\n", + "# Применение upsampling\n", + "X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n", + "\n", + "# Создание нового DataFrame с балансированными данными\n", + "train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n", + "\n", + "# Определение числовых признаков для дискретизации\n", + "numerical_features = ['Spec_score', 'Battery', 'Ram', 'Camera' ]\n", + "\n", + "# Функция для дискретизации числовых признаков\n", + "def discretize_features(df, features, bins=5, labels=False):\n", + " for feature in features:\n", + " try:\n", + " # Заполнение NaN значений, если они есть\n", + " df[feature] = df[feature].fillna(df[feature].median())\n", + " df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n", + " except Exception as e:\n", + " print(f\"Ошибка при дискретизации признака {feature}: {e}\")\n", + " return df\n", + "\n", + "# Применение дискретизации к обучающей, контрольной и тестовой выборкам\n", + "train_df_resampled = discretize_features(train_df_resampled, numerical_features)\n", + "val_df = discretize_features(val_df, numerical_features)\n", 
+ "test_df = discretize_features(test_df, numerical_features)\n", + "\n", + "# Вывод размеров выборок\n", + "print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n", + "print(\"Размер контрольной выборки:\", val_df.shape)\n", + "print(\"Размер тестовой выборки:\", test_df.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ручной синтез. Создание новых признаков на основе экспертных знаний и логики предметной области." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки после балансировки: (5600, 19)\n", + "Размер контрольной выборки: (288, 19)\n", + "Размер тестовой выборки: (411, 19)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "\n", + "# Загрузка данных\n", + "df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", + "\n", + "# Преобразование столбца Battery в числовой формат\n", + "df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", + "\n", + "# Преобразование столбцов Camera и Display в числовой формат\n", + "df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n", + "df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n", + "\n", + "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", + "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", + "\n", + "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", + "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", + "\n", + "# Применение upsampling к обучающей выборке (если это необходимо)\n", + "X_train = train_df.drop('Price', axis=1) # Отделяем 
признаки от целевой переменной\n", + "y_train = train_df['Price'] # Целевая переменная\n", + "\n", + "# Инициализация RandomOverSampler\n", + "ros = RandomOverSampler(random_state=42)\n", + "\n", + "# Применение upsampling\n", + "X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n", + "\n", + "# Создание нового DataFrame с балансированными данными\n", + "train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n", + "\n", + "# Создание нового признака \"Camera_to_Display_Ratio\" на основе признаков \"Camera\" и \"Display\"\n", + "train_df_resampled['Camera_to_Display_Ratio'] = train_df_resampled['Camera'] / train_df_resampled['Display']\n", + "val_df['Camera_to_Display_Ratio'] = val_df['Camera'] / val_df['Display']\n", + "test_df['Camera_to_Display_Ratio'] = test_df['Camera'] / test_df['Display']\n", + "\n", + "# Вывод размеров выборок\n", + "print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n", + "print(\"Размер контрольной выборки:\", val_df.shape)\n", + "print(\"Размер тестовой выборки:\", test_df.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки после балансировки: (5600, 19)\n", + "Размер контрольной выборки: (288, 19)\n", + "Размер тестовой выборки: (411, 19)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1137: RuntimeWarning: invalid value encountered in divide\n", + " updated_mean = (last_sum + new_sum) / updated_sample_count\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1142: RuntimeWarning: invalid value encountered in divide\n", + " T = new_sum / new_sample_count\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\extmath.py:1162: RuntimeWarning: invalid value encountered in divide\n", + " new_unnormalized_variance -= correction**2 / new_sample_count\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from sklearn.preprocessing import StandardScaler\n", + "import re\n", + "\n", + "# Загрузка данных\n", + "df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", + "\n", + "# Преобразование столбца Battery в числовой формат\n", + "df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", + "\n", + "# Преобразование столбцов Camera и Display в числовой формат\n", + "df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n", + "df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n", + "\n", + "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", + "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", + "\n", + "# 
Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", + "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", + "\n", + "# Применение upsampling к обучающей выборке (если это необходимо)\n", + "X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n", + "y_train = train_df['Price'] # Целевая переменная\n", + "\n", + "# Инициализация RandomOverSampler\n", + "ros = RandomOverSampler(random_state=42)\n", + "\n", + "# Применение upsampling\n", + "X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n", + "\n", + "# Создание нового DataFrame с балансированными данными\n", + "train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n", + "\n", + "# Создание нового признака \"Camera_to_Display_Ratio\" на основе признаков \"Camera\" и \"Display\"\n", + "train_df_resampled['Camera_to_Display_Ratio'] = train_df_resampled['Camera'] / train_df_resampled['Display']\n", + "val_df['Camera_to_Display_Ratio'] = val_df['Camera'] / val_df['Display']\n", + "test_df['Camera_to_Display_Ratio'] = test_df['Camera'] / test_df['Display']\n", + "\n", + "# Определение числовых признаков для масштабирования\n", + "numerical_features_to_scale = ['Spec_score', 'No_of_sim', 'Ram', 'Battery', 'Display', 'Camera', 'Inbuilt_memory', 'Screen_resolution', 'Camera_to_Display_Ratio']\n", + "\n", + "# Удаление строковых значений из числовых признаков\n", + "for feature in numerical_features_to_scale:\n", + " train_df_resampled[feature] = pd.to_numeric(train_df_resampled[feature], errors='coerce')\n", + " val_df[feature] = pd.to_numeric(val_df[feature], errors='coerce')\n", + " test_df[feature] = pd.to_numeric(test_df[feature], errors='coerce')\n", + "\n", + "# Инициализация StandardScaler\n", + "scaler = StandardScaler()\n", + "\n", + "# Масштабирование числовых признаков в обучающей выборке\n", + "train_df_resampled[numerical_features_to_scale] = 
scaler.fit_transform(train_df_resampled[numerical_features_to_scale])\n", + "\n", + "# Масштабирование числовых признаков в контрольной и тестовой выборках\n", + "val_df[numerical_features_to_scale] = scaler.transform(val_df[numerical_features_to_scale])\n", + "test_df[numerical_features_to_scale] = scaler.transform(test_df[numerical_features_to_scale])\n", + "\n", + "# Вывод размеров выборок\n", + "print(\"Размер обучающей выборки после балансировки:\", train_df_resampled.shape)\n", + "print(\"Размер контрольной выборки:\", val_df.shape)\n", + "print(\"Размер тестовой выборки:\", test_df.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Конструирование признаков с применением фреймворка Featuretools" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Обучающая выборка после конструирования признаков:\n", + " Unnamed: 0 Rating Spec_score No_of_sim Ram \\\n", + "id \n", + "0 305 4.70 86 Dual Sim, 3G, 4G, 5G, VoLTE, 12 GB RAM \n", + "1 941 4.45 71 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n", + "2 800 4.20 68 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n", + "3 97 4.25 69 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM \n", + "4 1339 4.30 74 Dual Sim, 3G, 4G, VoLTE, 6 GB RAM \n", + "\n", + " Battery External_Memory Android_version company \\\n", + "id \n", + "0 5000 Android v12 NaN Realme \n", + "1 5000 Memory Card Supported, upto 1 TB 12 Motorola \n", + "2 5000 Memory Card Supported 12 Vivo \n", + "3 5000 Memory Card Supported 12 Vivo \n", + "4 5000 Memory Card Supported, upto 256 GB 12 Lava \n", + "\n", + " Inbuilt_memory fast_charging \\\n", + "id \n", + "0 256 GB inbuilt 65W Fast Charging \n", + "1 64 GB inbuilt 10W Fast Charging \n", + "2 64 GB inbuilt 10W Fast Charging \n", + "3 128 GB inbuilt 10W Fast Charging \n", + "4 128 GB inbuilt NaN \n", + "\n", + " Screen_resolution Processor \n", + "id \n", + "0 1080 x 2400 px Octa Core \n", + "1 720 x 1600 px 
Octa Core \n", + "2 720 x 1600 px Display with Water Drop Notch Octa Core \n", + "3 720 x 1600 px Display with Water Drop Notch Octa Core \n", + "4 1600 x 720 px Octa Core \n", + "Контрольная выборка после конструирования признаков:\n", + " Unnamed: 0 Rating Spec_score No_of_sim Ram \\\n", + "id \n", + "1028 NaN NaN NaN \n", + "825 NaN NaN NaN \n", + "900 NaN NaN NaN \n", + "702 NaN NaN NaN \n", + "230 1050 4.05 90 Dual Sim, 3G, 4G, 5G, VoLTE, 8 GB RAM \n", + "\n", + " Battery External_Memory Android_version company Inbuilt_memory \\\n", + "id \n", + "1028 NaN NaN NaN NaN \n", + "825 NaN NaN NaN NaN \n", + "900 NaN NaN NaN NaN \n", + "702 NaN NaN NaN NaN \n", + "230 4500 Android v12 NaN Motorola 128 GB inbuilt \n", + "\n", + " fast_charging Screen_resolution Processor \n", + "id \n", + "1028 NaN NaN NaN \n", + "825 NaN NaN NaN \n", + "900 NaN NaN NaN \n", + "702 NaN NaN NaN \n", + "230 125W Fast Charging 1080 x 2400 px Octa Core \n", + "Тестовая выборка после конструирования признаков:\n", + " Unnamed: 0 Rating Spec_score No_of_sim \\\n", + "id \n", + "427 187 4.40 91 Dual Sim, 3G, 4G, 5G, VoLTE, \n", + "1088 NaN NaN \n", + "668 592 4.45 91 Dual Sim, 3G, 4G, 5G, VoLTE, \n", + "572 1130 4.60 75 Dual Sim, 3G, 4G, VoLTE, \n", + "115 117 4.60 72 Dual Sim, 3G, 4G, VoLTE, \n", + "\n", + " Ram Battery External_Memory Android_version \\\n", + "id \n", + "427 12 GB RAM 5000 Memory Card Not Supported 14 \n", + "1088 NaN NaN NaN \n", + "668 12 GB RAM 4500 Android v12 NaN \n", + "572 6 GB RAM 5000 Memory Card Supported, upto 1 TB 13 \n", + "115 4 GB RAM 5000 Memory Card Supported, upto 1 TB 12 \n", + "\n", + " company Inbuilt_memory fast_charging \\\n", + "id \n", + "427 Vivo 256 GB inbuilt 120W Fast Charging \n", + "1088 NaN NaN NaN \n", + "668 Honor 256 GB inbuilt 100W Fast Charging \n", + "572 Xiaomi 128 GB inbuilt 18W Fast Charging \n", + "115 Vivo 64 GB inbuilt 18W Fast Charging \n", + "\n", + " Screen_resolution Processor \n", + "id \n", + "427 1260 x 2800 px Octa Core 
\n", + "1088 NaN NaN \n", + "668 1200 x 2652 px Octa Core \n", + "572 720 x 1600 px Octa Core \n", + "115 720 x 1612 px Display with Water Drop Notch Octa Core \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n", + " warnings.warn(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", + " warnings.warn(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. 
In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. 
In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. 
In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. 
In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. 
In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "import featuretools as ft\n", + "import re\n", + "\n", + "# Загрузка данных\n", + "df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", + "\n", + "# Преобразование столбца Battery в числовой формат\n", + "df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", + "\n", + "# Преобразование столбцов Camera и Display в числовой формат\n", + "df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n", + "df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n", + "\n", + "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", + "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", + "\n", + "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", + "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", + "\n", + "# Создание нового признака \"Camera_to_Display_Ratio\" на основе признаков \"Camera\" и \"Display\"\n", + "train_df['Camera_to_Display_Ratio'] = train_df['Camera'] / train_df['Display']\n", + 
"val_df['Camera_to_Display_Ratio'] = val_df['Camera'] / val_df['Display']\n", + "test_df['Camera_to_Display_Ratio'] = test_df['Camera'] / test_df['Display']\n", + "\n", + "# Определение сущностей\n", + "es = ft.EntitySet(id='mobile_data')\n", + "es = es.add_dataframe(dataframe_name='train', dataframe=train_df, index='id')\n", + "\n", + "# Генерация признаков\n", + "feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='train', max_depth=2)\n", + "\n", + "# Преобразование признаков для контрольной и тестовой выборок\n", + "val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n", + "test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n", + "\n", + "# Вывод первых нескольких строк для проверки\n", + "print(\"Обучающая выборка после конструирования признаков:\")\n", + "print(feature_matrix.head())\n", + "print(\"Контрольная выборка после конструирования признаков:\")\n", + "print(val_feature_matrix.head())\n", + "print(\"Тестовая выборка после конструирования признаков:\")\n", + "print(test_feature_matrix.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оценка качества каждого набора признаков\n", + "\n", + "Предсказательная способность. Метрики: RMSE, MAE, R².\n", + "\n", + "Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n", + "\n", + "Скорость вычисления. Методы: Измерение времени выполнения генерации признаков и обучения модели.\n", + "\n", + "Надежность. Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n", + "\n", + "Корреляция. Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n", + "\n", + "Цельность. Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 671\n", + "Размер контрольной выборки: 288\n", + "Размер тестовой выборки: 411\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n", + " warnings.warn(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature Importance:\n", + " feature importance\n", + "4 Price 0.999443\n", + "2 Spec_score 0.000227\n", + "3 Battery 0.000146\n", + "0 Unnamed: 0 0.000146\n", + "1 Rating 0.000039\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "import featuretools as ft\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "import re\n", + "\n", + "# Загрузка данных\n", + "df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", + "\n", + "# Преобразование столбца Battery в числовой формат\n", + "df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", + "\n", + "# Преобразование столбцов Camera, Display, Inbuilt_memory и fast_charging в числовой формат\n", + "df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n", + "df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n", + "df['Inbuilt_memory'] = pd.to_numeric(df['Inbuilt_memory'], errors='coerce')\n", + "df['fast_charging'] = 
pd.to_numeric(df['fast_charging'], errors='coerce')\n", + "\n", + "# Удаление запятых из столбца Price и преобразование в числовой формат\n", + "df['Price'] = df['Price'].str.replace(',', '').astype(float)\n", + "\n", + "# Удаление столбцов с текстовыми значениями, которые не могут быть преобразованы в числа\n", + "df = df.drop(columns=['Name', 'company', 'Android_version', 'Processor_name', 'External_Memory', 'No_of_sim', 'Ram', 'Screen_resolution', 'Processor' ])\n", + "\n", + "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", + "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", + "\n", + "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", + "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", + "\n", + "# Вывод размеров выборок\n", + "print(\"Размер обучающей выборки:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))\n", + "\n", + "# Применение upsampling к обучающей выборке (если это необходимо)\n", + "X_train = train_df.drop('Price', axis=1) # Отделяем признаки от целевой переменной\n", + "y_train = train_df['Price'] # Целевая переменная\n", + "\n", + "# Инициализация RandomOverSampler\n", + "ros = RandomOverSampler(random_state=42)\n", + "\n", + "# Применение upsampling\n", + "X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n", + "\n", + "# Создание нового DataFrame с балансированными данными\n", + "train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n", + "\n", + "# Определение сущностей\n", + "es = ft.EntitySet(id='mobile_data')\n", + "es = es.add_dataframe(dataframe_name='mobile', dataframe=train_df_resampled, index='id')\n", + "\n", + "# Генерация признаков\n", + "feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='mobile', max_depth=2)\n", + "\n", + 
"# Преобразование признаков для контрольной и тестовой выборок\n", + "val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n", + "test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n", + "\n", + "# Оценка важности признаков\n", + "X = feature_matrix\n", + "y = train_df_resampled['Price']\n", + "\n", + "# Разделение данных на обучающую и тестовую выборки\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "# Обучение модели\n", + "model = RandomForestRegressor(n_estimators=100, random_state=42)\n", + "model.fit(X_train, y_train)\n", + "\n", + "# Получение важности признаков\n", + "importances = model.feature_importances_\n", + "feature_names = feature_matrix.columns\n", + "\n", + "# Сортировка признаков по важности\n", + "feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n", + "feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n", + "\n", + "print(\"Feature Importance:\")\n", + "print(feature_importance)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 66\n", + "Размер контрольной выборки: 29\n", + "Размер тестовой выборки: 42\n", + "Mean Squared Error: 13048795.366100002\n", + "R2 Score: -0.23881710583662308\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n", + " warnings.warn(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame 
concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cross-validated Mean Squared Error: 394482934.1724652\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train Mean Squared Error: 46662951.69621668\n", + "Train R2 Score: 0.9411587287387594\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "from sklearn.model_selection import cross_val_score\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import featuretools as ft\n", + "import re\n", + "\n", + "# Загрузка данных\n", + "df = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", + "\n", + "# Уменьшение размера выборки для ускорения работы (опционально)\n", + "df = df.sample(frac=0.1, random_state=42)\n", + "\n", + "# Преобразование столбца Battery в числовой формат\n", + "df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", + "\n", + "# Преобразование столбцов Camera, Display, Inbuilt_memory и fast_charging в числовой формат\n", + "df['Camera'] = pd.to_numeric(df['Camera'], errors='coerce')\n", + "df['Display'] = pd.to_numeric(df['Display'], errors='coerce')\n", + "df['Inbuilt_memory'] = pd.to_numeric(df['Inbuilt_memory'], errors='coerce')\n", + "df['fast_charging'] = pd.to_numeric(df['fast_charging'], errors='coerce')\n", + "\n", + "# Удаление запятых из столбца Price и преобразование в числовой формат\n", + "df['Price'] = df['Price'].str.replace(',', '').astype(float)\n", + "\n", + "# Удаление столбцов с текстовыми значениями, которые не могут быть преобразованы в числа\n", + "df = df.drop(columns=['Name', 'company', 'Android_version', 'Processor_name', 'External_Memory', 'No_of_sim', 'Ram', 'Screen_resolution', 'Processor' ])\n", + "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", + "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", + "\n", + "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", + "train_df, val_df = 
train_test_split(train_df, test_size=0.3, random_state=42)\n", + "\n", + "# Вывод размеров выборок\n", + "print(\"Размер обучающей выборки:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))\n", + "\n", + "# Определение сущностей\n", + "es = ft.EntitySet(id='mobile_data')\n", + "es = es.add_dataframe(dataframe_name='mobile', dataframe=train_df, index='id')\n", + "\n", + "# Генерация признаков с уменьшенной глубиной\n", + "feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='mobile', max_depth=1)\n", + "\n", + "# Преобразование признаков для контрольной и тестовой выборок\n", + "val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n", + "test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n", + "\n", + "# Удаление строк с NaN\n", + "feature_matrix = feature_matrix.dropna()\n", + "val_feature_matrix = val_feature_matrix.dropna()\n", + "test_feature_matrix = test_feature_matrix.dropna()\n", + "\n", + "# Отделение признаков от целевой переменной для обучающей, контрольной и тестовой выборок\n", + "X_train = feature_matrix.drop('Price', axis=1)\n", + "y_train = feature_matrix['Price']\n", + "X_val = val_feature_matrix.drop('Price', axis=1)\n", + "y_val = val_feature_matrix['Price']\n", + "X_test = test_feature_matrix.drop('Price', axis=1)\n", + "y_test = test_feature_matrix['Price']\n", + "\n", + "# Выбор модели\n", + "model = RandomForestRegressor(random_state=42)\n", + "\n", + "# Обучение модели\n", + "model.fit(X_train, y_train)\n", + "\n", + "# Предсказание и оценка\n", + "y_pred = model.predict(X_test)\n", + "\n", + "mse = mean_squared_error(y_test, y_pred)\n", + "r2 = r2_score(y_test, y_pred)\n", + "\n", + "print(f\"Mean Squared Error: {mse}\")\n", + "print(f\"R2 Score: {r2}\")\n", + "\n", + "# Кросс-валидация\n", + "scores = cross_val_score(model, X_train, y_train, cv=5, 
scoring='neg_mean_squared_error')\n", + "mse_cv = -scores.mean()\n", + "print(f\"Cross-validated Mean Squared Error: {mse_cv}\")\n", + "\n", + "# Анализ важности признаков\n", + "feature_importances = model.feature_importances_\n", + "feature_names = X_train.columns\n", + "\n", + "importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n", + "importance_df = importance_df.sort_values(by='Importance', ascending=False)\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "sns.barplot(x='Importance', y='Feature', data=importance_df)\n", + "plt.title('Feature Importance')\n", + "plt.show()\n", + "\n", + "# Проверка на переобучение\n", + "y_train_pred = model.predict(X_train)\n", + "\n", + "mse_train = mean_squared_error(y_train, y_train_pred)\n", + "r2_train = r2_score(y_train, y_train_pred)\n", + "\n", + "print(f\"Train Mean Squared Error: {mse_train}\")\n", + "print(f\"Train R2 Score: {r2_train}\")\n", + "\n", + "# Визуализация результатов\n", + "plt.figure(figsize=(10, 6))\n", + "plt.scatter(y_test, y_pred, alpha=0.5)\n", + "plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n", + "plt.xlabel('Actual Price')\n", + "plt.ylabel('Predicted Price')\n", + "plt.title('Actual vs Predicted Price')\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aimenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}