{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Начало лабораторной работы" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Unnamed: 0', 'Name', 'Brand', 'Model', 'Battery capacity (mAh)',\n", " 'Screen size (inches)', 'Touchscreen', 'Resolution x', 'Resolution y',\n", " 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera',\n", " 'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS',\n", " 'Number of SIMs', '3G', '4G/ LTE', 'Price'],\n", " dtype='object')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0NameBrandModelBattery capacity (mAh)Screen size (inches)TouchscreenResolution xResolution yProcessor...Rear cameraFront cameraOperating systemWi-FiBluetoothGPSNumber of SIMs3G4G/ LTEPrice
00OnePlus 7T Pro McLaren EditionOnePlus7T Pro McLaren Edition40856.67Yes144031208...48.016.0AndroidYesYesYes2YesYes58998
11Realme X2 ProRealmeX2 Pro40006.50Yes108024008...64.016.0AndroidYesYesYes2YesYes27999
22iPhone 11 Pro MaxAppleiPhone 11 Pro Max39696.50Yes124226886...12.012.0iOSYesYesYes2YesYes106900
33iPhone 11AppleiPhone 1131106.10Yes82817926...12.012.0iOSYesYesYes2YesYes62900
44LG G8X ThinQLGG8X ThinQ40006.40Yes108023408...12.032.0AndroidYesYesYes1NoNo49990
55OnePlus 7TOnePlus7T38006.55Yes108024008...48.016.0AndroidYesYesNo2YesYes34930
66OnePlus 7T ProOnePlus7T Pro40856.67Yes144031208...48.016.0AndroidYesYesYes2YesYes52990
77Samsung Galaxy Note 10+SamsungGalaxy Note 10+43006.80Yes144030408...12.010.0AndroidYesYesYes2YesYes79699
88Asus ROG Phone 2AsusROG Phone 260006.59Yes108023408...48.024.0AndroidYesYesYes1YesYes37999
99Xiaomi Redmi K20 ProXiaomiRedmi K20 Pro40006.39Yes108023408...48.020.0AndroidYesYesYes2NoNo23190
\n", "

10 rows × 22 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 Name Brand \\\n", "0 0 OnePlus 7T Pro McLaren Edition OnePlus \n", "1 1 Realme X2 Pro Realme \n", "2 2 iPhone 11 Pro Max Apple \n", "3 3 iPhone 11 Apple \n", "4 4 LG G8X ThinQ LG \n", "5 5 OnePlus 7T OnePlus \n", "6 6 OnePlus 7T Pro OnePlus \n", "7 7 Samsung Galaxy Note 10+ Samsung \n", "8 8 Asus ROG Phone 2 Asus \n", "9 9 Xiaomi Redmi K20 Pro Xiaomi \n", "\n", " Model Battery capacity (mAh) Screen size (inches) \\\n", "0 7T Pro McLaren Edition 4085 6.67 \n", "1 X2 Pro 4000 6.50 \n", "2 iPhone 11 Pro Max 3969 6.50 \n", "3 iPhone 11 3110 6.10 \n", "4 G8X ThinQ 4000 6.40 \n", "5 7T 3800 6.55 \n", "6 7T Pro 4085 6.67 \n", "7 Galaxy Note 10+ 4300 6.80 \n", "8 ROG Phone 2 6000 6.59 \n", "9 Redmi K20 Pro 4000 6.39 \n", "\n", " Touchscreen Resolution x Resolution y Processor ... Rear camera \\\n", "0 Yes 1440 3120 8 ... 48.0 \n", "1 Yes 1080 2400 8 ... 64.0 \n", "2 Yes 1242 2688 6 ... 12.0 \n", "3 Yes 828 1792 6 ... 12.0 \n", "4 Yes 1080 2340 8 ... 12.0 \n", "5 Yes 1080 2400 8 ... 48.0 \n", "6 Yes 1440 3120 8 ... 48.0 \n", "7 Yes 1440 3040 8 ... 12.0 \n", "8 Yes 1080 2340 8 ... 48.0 \n", "9 Yes 1080 2340 8 ... 
48.0 \n", "\n", " Front camera Operating system Wi-Fi Bluetooth GPS Number of SIMs 3G \\\n", "0 16.0 Android Yes Yes Yes 2 Yes \n", "1 16.0 Android Yes Yes Yes 2 Yes \n", "2 12.0 iOS Yes Yes Yes 2 Yes \n", "3 12.0 iOS Yes Yes Yes 2 Yes \n", "4 32.0 Android Yes Yes Yes 1 No \n", "5 16.0 Android Yes Yes No 2 Yes \n", "6 16.0 Android Yes Yes Yes 2 Yes \n", "7 10.0 Android Yes Yes Yes 2 Yes \n", "8 24.0 Android Yes Yes Yes 1 Yes \n", "9 20.0 Android Yes Yes Yes 2 No \n", "\n", " 4G/ LTE Price \n", "0 Yes 58998 \n", "1 Yes 27999 \n", "2 Yes 106900 \n", "3 Yes 62900 \n", "4 No 49990 \n", "5 Yes 34930 \n", "6 Yes 52990 \n", "7 Yes 79699 \n", "8 Yes 37999 \n", "9 No 23190 \n", "\n", "[10 rows x 22 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", "print(df.columns)\n", "display(df.head(10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Регрессия\n", "\n", "Цель: Разработать модель регрессии, которая будет предсказывать цену мобильного телефона на основе его технических характеристик и функциональных особенностей.\n", "\n", "Применение:\n", "\n", "Производители и продавцы мобильных телефонов: Модель может помочь в установлении справедливой цены для новых моделей, основываясь на их технических характеристиках, что может быть полезно для ценообразования и конкуренции на рынке.\n", "\n", "Потребители: Модель может помочь пользователям принимать более обоснованные решения при покупке, сравнивая цены и характеристики различных моделей.\n", "\n", "Рыночные аналитики: Модель может быть использована для анализа тенденций на рынке мобильных телефонов, выявления факторов, влияющих на цену, и прогнозирования будущих изменений.\n", "\n", "Исследования в области технологий: Модель может помочь в изучении влияния различных технических характеристик на цену и популярность моделей." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Удаление выбросов" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Размер данных до удаления выбросов: (1359, 21)\n", "Размер данных после удаления выбросов: (1256, 21)\n" ] } ], "source": [
  "import pandas as pd\n",
  "from scipy import stats\n",
  "\n",
  "# Load the data set\n",
  "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n",
  "\n",
  "# Drop the 'Unnamed: 0' column, which only repeats the row number\n",
  "df = df.drop('Unnamed: 0', axis=1)\n",
  "\n",
  "# Numeric columns that are screened for outliers\n",
  "numeric_features = [\n",
  "    'Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x',\n",
  "    'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)',\n",
  "    'Rear camera', 'Front camera', 'Number of SIMs', 'Price'\n",
  "]\n",
  "\n",
  "# Z-score of every value relative to its own column\n",
  "z_scores = stats.zscore(df[numeric_features])\n",
  "\n",
  "# A value is an outlier when it lies more than this many standard deviations from the column mean\n",
  "threshold = 3\n",
  "\n",
  "print(\"Размер данных до удаления выбросов:\", df.shape)\n",
  "\n",
  "# Keep a row only if every numeric column is within the threshold. The absolute value\n",
  "# matters: without it a strongly negative z-score always passes `< threshold`, so\n",
  "# abnormally low values would never be removed.\n",
  "df = df[(abs(z_scores) < threshold).all(axis=1)]\n",
  "\n",
  "print(\"Размер данных после удаления выбросов:\", df.shape)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Подготовка данных и оценка базовой модели" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Размер обучающей выборки: (1004, 18)\n", "Размер тестовой выборки: (252, 18)\n", "Baseline MAE: 4662.689511794094\n", "Baseline MSE: 50560680.710365206\n", "Baseline R²: -0.001378207894705552\n" ] } ], "source": [
  "import pandas as pd\n",
  "from sklearn.model_selection import train_test_split\n",
  "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
  "\n",
  "\n",
  "# Feature columns and the regression target\n",
  "features = [\n",
  "    'Brand', 'Battery capacity (mAh)',\n",
  "    'Screen size (inches)', 'Touchscreen', 'Resolution x', 'Resolution y',\n",
  "    'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera',\n",
  "    'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS',\n",
  "    'Number of SIMs', '3G', '4G/ LTE'\n",
  "]\n",
  "target = 'Price'\n",
  "\n",
  "# Cell code already runs at module level, so these names are visible to later cells\n",
  "# without a `global` declaration\n",
  "X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)\n",
  "\n",
  "print(\"Размер обучающей выборки:\", X_train.shape)\n",
  "print(\"Размер тестовой выборки:\", X_test.shape)\n",
  "\n",
  "# Baseline: always predict the mean target value of the training set\n",
  "baseline_predictions = [y_train.mean()] * len(y_test)\n",
  "\n",
  "# Baseline metrics that the trained models have to beat\n",
  "print('Baseline MAE:', mean_absolute_error(y_test, baseline_predictions))\n",
  "print('Baseline MSE:', mean_squared_error(y_test, baseline_predictions))\n",
  "print('Baseline R²:', r2_score(y_test, baseline_predictions))"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Обучение и оценка моделей" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: Linear Regression trained.\n", "MAE: 3251.7122571814075\n", "MSE: 25623200.493888523\n", "R²: 0.4925203887566195\n", "--------------------------------------------------\n", "Model: Decision Tree trained.\n", "MAE: 4112.809523809524\n", "MSE: 56543100.29960317\n", "R²: -0.11986285887206449\n", "--------------------------------------------------\n", "Model: Gradient Boosting trained.\n", "MAE: 2793.2991365668017\n", "MSE: 23739724.17710429\n", "R²: 0.5298235285129411\n", "--------------------------------------------------\n" ] } ], "source": [
  "import pandas as pd\n",
  "from scipy import stats\n",
  "from sklearn.model_selection import train_test_split, 
RandomizedSearchCV\n",
  "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
  "from sklearn.compose import ColumnTransformer\n",
  "from sklearn.pipeline import Pipeline\n",
  "from sklearn.linear_model import LinearRegression\n",
  "from sklearn.tree import DecisionTreeRegressor\n",
  "from sklearn.ensemble import GradientBoostingRegressor\n",
  "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
  "\n",
  "# Feature columns by type, and the regression target\n",
  "categorical_features = [\n",
  "    'Brand', 'Touchscreen', 'Operating system', 'Wi-Fi',\n",
  "    'Bluetooth', 'GPS', '3G', '4G/ LTE'\n",
  "]\n",
  "numeric_features = [\n",
  "    'Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x',\n",
  "    'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera',\n",
  "    'Front camera', 'Number of SIMs'\n",
  "]\n",
  "target = 'Price'\n",
  "\n",
  "\n",
  "def build_regression_pipeline(regressor):\n",
  "    \"\"\"Return a Pipeline that preprocesses the features and then fits `regressor`.\n",
  "\n",
  "    A new ColumnTransformer is created on every call. Pipeline stores the step\n",
  "    objects it is given, so passing one shared transformer to several pipelines\n",
  "    would make them all refit the same object.\n",
  "    \"\"\"\n",
  "    preprocessor = ColumnTransformer(\n",
  "        transformers=[\n",
  "            ('num', StandardScaler(), numeric_features),  # scale numeric columns\n",
  "            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)  # one-hot encode categorical columns\n",
  "        ]\n",
  "    )\n",
  "    return Pipeline(steps=[\n",
  "        ('preprocessor', preprocessor),\n",
  "        ('regressor', regressor)\n",
  "    ])\n",
  "\n",
  "\n",
  "# One independent pipeline per model\n",
  "pipeline_linear_regression = build_regression_pipeline(LinearRegression())\n",
  "pipeline_decision_tree = build_regression_pipeline(DecisionTreeRegressor(random_state=42))\n",
  "pipeline_gradient_boosting = build_regression_pipeline(GradientBoostingRegressor(random_state=42))\n",
  "\n",
  "# Models to compare, as (display name, pipeline) pairs\n",
  "pipelines = [\n",
  "    ('Linear Regression', pipeline_linear_regression),\n",
  "    ('Decision Tree', pipeline_decision_tree),\n",
  "    ('Gradient Boosting', pipeline_gradient_boosting)\n",
  "]\n",
  "\n",
  "# Обучение 
моделей и вывод результатов\n", "for name, pipeline in pipelines:\n", " pipeline.fit(X_train, y_train)\n", " predictions = pipeline.predict(X_test)\n", " print(f\"Model: {name} trained.\")\n", " print(f\"MAE: {mean_absolute_error(y_test, predictions)}\")\n", " print(f\"MSE: {mean_squared_error(y_test, predictions)}\")\n", " print(f\"R²: {r2_score(y_test, predictions)}\")\n", " print(\"-\" * 50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Линейная регрессия улучшила качество предсказаний по сравнению с базовой моделью, но показала меньшую эффективность, чем градиентный бустинг.\n", "\n", "* MAE уменьшился на 1410.98 (30% улучшение).\n", "\n", "* MSE уменьшился на 24937480.22 (49% улучшение).\n", "\n", "* R² стал положительным (0.4925), что означает, что модель объясняет 49.25% изменчивости цены.\n", "\n", "Дерево решений оказалось наименее эффективным среди обученных моделей: по MSE и R² оно уступает даже базовой модели, хотя по MAE немного её превосходит.\n", "\n", "* MAE уменьшился на 549.88 (11.8% улучшение) по сравнению с базовой моделью.\n", "\n", "* MSE увеличился на 5982419.59 (11.8% ухудшение) по сравнению с базовой моделью.\n", "\n", "* R² стал еще ниже (-0.1199), что указывает на то, что по этой метрике модель работает хуже, чем базовая.\n", "\n", "Градиентный бустинг показал лучшие результаты среди всех моделей:\n", "\n", "* MAE уменьшился на 1869.39 (40% улучшение) по сравнению с базовой моделью.\n", "\n", "* MSE уменьшился на 26820956.53 (53% улучшение) по сравнению с базовой моделью.\n", "\n", "* R² достиг 0.5298, что означает, что модель объясняет 52.98% изменчивости цены.\n", "\n", "Таким образом, градиентный бустинг является наиболее подходящей моделью для предсказания цены мобильных телефонов на основе выбранных признаков." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Классификация \n", "\n", "Цель: Целью классификации является разработка модели, которая будет предсказывать категорию цены мобильного телефона на основе его технических характеристик и функциональных особенностей.\n", "\n", "Применение классификации:\n", "\n", "1. Рыночная аналитика:\n", "\n", "* Помогает производителям и продавцам телефонов определять целевую аудиторию для конкретных моделей.\n", "\n", "* Позволяет анализировать конкуренцию в разных ценовых сегментах.\n", "\n", "2. Потребительские рекомендации:\n", "\n", "* Помогает пользователям выбирать телефоны, соответствующие их бюджету и требованиям.\n", "\n", "* Упрощает процесс сравнения телефонов в рамках одной ценовой категории." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Размер обучающей выборки: (1004, 18)\n", "Размер тестовой выборки: (252, 18)\n" ] } ], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", "# Выбор признаков и целевой переменной\n", "features = [\n", " 'Brand', 'Battery capacity (mAh)',\n", " 'Screen size (inches)', 'Touchscreen', 'Resolution x', 'Resolution y',\n", " 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera',\n", " 'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS',\n", " 'Number of SIMs', '3G', '4G/ LTE'\n", "]\n", "target = 'Price'\n", "\n", "# Разделение данных на обучающую и тестовую выборки\n", "X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)\n", "\n", "print(\"Размер обучающей выборки:\", X_train.shape)\n", "print(\"Размер тестовой выборки:\", X_test.shape)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PriceCategory\n", "1 946\n", "0 946\n", "Name: count, dtype: int64\n", "Гиперпараметры для 
логистической регрессии:\n", "Accuracy: 0.8760\n", "Precision: 0.8784\n", "Recall: 0.8760\n", "F1-Score: 0.8762\n", "ROC-AUC: 0.9521\n", "Гиперпараметры для случайного леса:\n", "Accuracy: 0.9393\n", "Precision: 0.9395\n", "Recall: 0.9393\n", "F1-Score: 0.9392\n", "ROC-AUC: 0.9833\n", "Гиперпараметры для градиентного бустинга:\n", "Accuracy: 0.9261\n", "Precision: 0.9261\n", "Recall: 0.9261\n", "F1-Score: 0.9261\n", "ROC-AUC: 0.9777\n", "\n", "Результаты моделей:\n", "\n", "Logistic Regression:\n", "Accuracy: 0.8760\n", "Precision: 0.8784\n", "Recall: 0.8760\n", "F1: 0.8762\n", "Roc_auc: 0.9521\n", "\n", "Random Forest:\n", "Accuracy: 0.9393\n", "Precision: 0.9395\n", "Recall: 0.9393\n", "F1: 0.9392\n", "Roc_auc: 0.9833\n", "\n", "Gradient Boosting:\n", "Accuracy: 0.9261\n", "Precision: 0.9261\n", "Recall: 0.9261\n", "F1: 0.9261\n", "Roc_auc: 0.9777\n" ] } ], "source": [ "import pandas as pd\n", "from imblearn.over_sampling import SMOTE\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n", "from scipy.stats import uniform, randint\n", "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "# Выбор признаков и целевой переменной\n", "features = [\n", " 'Brand', 'Battery capacity (mAh)', 'Screen size (inches)', 'Touchscreen', \n", " 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', \n", " 'Rear camera', 'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', \n", " 'Number of SIMs', '3G', '4G/ LTE'\n", "]\n", "target = 'PriceCategory' # Целевая переменная: 
категория цены\n",
  "\n",
  "# Bin the price into named segments (budget, mid-range, premium)\n",
  "bins = [0, 10000, 60000, float('inf')]\n",
  "labels = ['Budget', 'Mid-Range', 'Premium']\n",
  "price_segments = pd.cut(df['Price'], bins=bins, labels=labels)\n",
  "\n",
  "# Encode the segment names as integers; label_encoder.classes_ keeps the names that occur in the data.\n",
  "# assign() builds a new frame instead of writing into the filtered one.\n",
  "label_encoder = LabelEncoder()\n",
  "df = df.assign(**{target: label_encoder.fit_transform(price_segments)})\n",
  "\n",
  "# Feature columns by type\n",
  "categorical_features = [\n",
  "    'Brand', 'Touchscreen', 'Operating system', 'Wi-Fi',\n",
  "    'Bluetooth', 'GPS', '3G', '4G/ LTE'\n",
  "]\n",
  "numeric_features = [\n",
  "    'Battery capacity (mAh)', 'Processor', 'Screen size (inches)', 'Resolution x',\n",
  "    'Resolution y', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera',\n",
  "    'Front camera', 'Number of SIMs'\n",
  "]\n",
  "\n",
  "# Preprocessing: scale numeric columns, one-hot encode categorical ones\n",
  "categorical_transformer = Pipeline(steps=[\n",
  "    ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
  "])\n",
  "\n",
  "numeric_transformer = Pipeline(steps=[\n",
  "    ('scaler', StandardScaler())\n",
  "])\n",
  "\n",
  "preprocessor = ColumnTransformer(\n",
  "    transformers=[\n",
  "        ('num', numeric_transformer, numeric_features),\n",
  "        ('cat', categorical_transformer, categorical_features)\n",
  "    ])\n",
  "\n",
  "# Split BEFORE fitting anything, so the test rows are never seen by the scaler,\n",
  "# the encoder or SMOTE and the reported metrics describe real, unseen phones only\n",
  "X = df[features]\n",
  "y = df[target]\n",
  "X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
  "\n",
  "# Fit the preprocessing on the training rows only, then apply it to both parts\n",
  "X_train_prepared = preprocessor.fit_transform(X_train_raw)\n",
  "X_test = preprocessor.transform(X_test_raw)\n",
  "\n",
  "# Balance the classes of the training part only; the test part keeps its real class distribution\n",
  "smote = SMOTE(random_state=42)\n",
  "X_resampled, y_resampled = smote.fit_resample(X_train_prepared, y_train_raw)\n",
  "X_train, y_train = X_resampled, y_resampled\n",
  "\n",
  "print(pd.Series(y_resampled).value_counts())\n",
  "\n",
  "\n",
  "def roc_auc_inputs(model, X):\n",
  "    \"\"\"Return the class scores of `model` on `X` in the shape roc_auc_score expects.\n",
  "\n",
  "    With two classes that is the probability column of the second class;\n",
  "    with more classes it is the full probability matrix.\n",
  "    \"\"\"\n",
  "    probabilities = model.predict_proba(X)\n",
  "    if probabilities.shape[1] == 2:\n",
  "        return probabilities[:, 1]\n",
  "    return probabilities\n",
  "\n",
  "\n",
  "def evaluate_model(model, X_test, y_test, roc_auc_average='macro'):\n",
  "    \"\"\"Print and return the test metrics of a fitted classifier.\n",
  "\n",
  "    Precision, recall and F1 are class-weighted averages; ROC-AUC is computed\n",
  "    one-vs-rest and averaged according to `roc_auc_average`.\n",
  "    Returns a dict with the keys accuracy, precision, recall, f1 and roc_auc.\n",
  "    \"\"\"\n",
  "    y_pred = model.predict(X_test)\n",
  "    y_score = roc_auc_inputs(model, X_test)\n",
  "\n",
  "    accuracy = accuracy_score(y_test, y_pred)\n",
  "    precision = precision_score(y_test, y_pred, average='weighted')\n",
  "    recall = recall_score(y_test, y_pred, average='weighted')\n",
  "    f1 = f1_score(y_test, y_pred, average='weighted')\n",
  "    roc_auc = roc_auc_score(y_test, y_score, multi_class='ovr', average=roc_auc_average)\n",
  "\n",
  "    print(f\"Accuracy: {accuracy:.4f}\")\n",
  "    print(f\"Precision: {precision:.4f}\")\n",
  "    print(f\"Recall: {recall:.4f}\")\n",
  "    print(f\"F1-Score: {f1:.4f}\")\n",
  "    print(f\"ROC-AUC: {roc_auc:.4f}\")\n",
  "\n",
  "    return {\n",
  "        'accuracy': accuracy,\n",
  "        'precision': precision,\n",
  "        'recall': recall,\n",
  "        'f1': f1,\n",
  "        'roc_auc': roc_auc\n",
  "    }\n",
  "\n",
  "\n",
  "# Candidate tree depths are drawn once with a fixed seed, so every run searches the same values\n",
  "depth_candidates = [None] + list(randint(10, 100).rvs(10, random_state=42))\n",
  "\n",
  "# Logistic regression\n",
  "print(\"Гиперпараметры для логистической регрессии:\")\n",
  "logreg_param_dist = {\n",
  "    'classifier__C': uniform(loc=0, scale=4),\n",
  "    'classifier__penalty': ['l1', 'l2'],\n",
  "    'classifier__solver': ['liblinear', 'saga']\n",
  "}\n",
  "\n",
  "logreg_pipeline = Pipeline([\n",
  "    ('classifier', LogisticRegression(max_iter=1000, random_state=42))\n",
  "])\n",
  "\n",
  "logreg_random_search = RandomizedSearchCV(logreg_pipeline, param_distributions=logreg_param_dist, n_iter=50, cv=5, random_state=42, n_jobs=-1)\n",
  "logreg_random_search.fit(X_train, y_train)\n",
  "logreg_best_model = logreg_random_search.best_estimator_\n",
  "logreg_results = evaluate_model(logreg_best_model, X_test, y_test)\n",
  "\n",
  "# Random forest\n",
  "print(\"Гиперпараметры для случайного леса:\")\n",
  "rf_param_dist = {\n",
  "    'classifier__n_estimators': randint(100, 1000),\n",
  "    'classifier__max_depth': depth_candidates,\n",
  "    'classifier__min_samples_split': randint(2, 20),\n",
  "    'classifier__min_samples_leaf': randint(1, 20),\n",
  "    'classifier__bootstrap': [True, False]\n",
  "}\n",
  "\n",
  "rf_pipeline = Pipeline([\n",
  "    ('classifier', RandomForestClassifier(random_state=42))\n",
  "])\n",
  "\n",
  "rf_random_search = RandomizedSearchCV(rf_pipeline, param_distributions=rf_param_dist, n_iter=50, cv=5, random_state=42, n_jobs=-1)\n",
  "rf_random_search.fit(X_train, y_train)\n",
  "rf_best_model = rf_random_search.best_estimator_\n",
  "rf_results = evaluate_model(rf_best_model, X_test, y_test)\n",
  "\n",
  "# Gradient boosting\n",
  "print(\"Гиперпараметры для градиентного бустинга:\")\n",
  "gb_param_dist = {\n",
  "    'classifier__n_estimators': randint(100, 1000),\n",
  "    'classifier__learning_rate': uniform(0.01, 0.5),\n",
  "    'classifier__max_depth': depth_candidates,\n",
  "    'classifier__min_samples_split': randint(2, 20),\n",
  "    'classifier__min_samples_leaf': randint(1, 20),\n",
  "    'classifier__subsample': uniform(0.5, 0.5)\n",
  "}\n",
  "\n",
  "gb_pipeline = Pipeline([\n",
  "    ('classifier', GradientBoostingClassifier(random_state=42))\n",
  "])\n",
  "\n",
  "gb_random_search = RandomizedSearchCV(gb_pipeline, param_distributions=gb_param_dist, n_iter=50, cv=5, random_state=42, n_jobs=-1)\n",
  "gb_random_search.fit(X_train, y_train)\n",
  "gb_best_model = gb_random_search.best_estimator_\n",
  "gb_results = evaluate_model(gb_best_model, X_test, y_test)\n",
  "\n",
  "# Summary of all three models\n",
  "print(\"\\nРезультаты моделей:\")\n",
  "for model_title, model_results in [\n",
  "    ('Logistic Regression', logreg_results),\n",
  "    ('Random Forest', rf_results),\n",
  "    ('Gradient Boosting', gb_results)\n",
  "]:\n",
  "    print(f\"\\n{model_title}:\")\n",
  "    for metric, value in model_results.items():\n",
  "        print(f\"{metric.capitalize()}: {value:.4f}\")"
] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Logistic Regression Metrics:\n", "Accuracy: 0.8760\n", "Precision: 0.8784\n", "Recall: 0.8760\n", "F1-Score: 0.8762\n", "ROC-AUC: 0.9521\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Random Forest Metrics:\n", "Accuracy: 0.9393\n", "Precision: 0.9395\n", "Recall: 0.9393\n", "F1-Score: 0.9392\n", "ROC-AUC: 0.9833\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Gradient Boosting Metrics:\n", "Accuracy: 0.9261\n", "Precision: 0.9261\n", "Recall: 0.9261\n", "F1-Score: 0.9261\n", "ROC-AUC: 0.9777\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "import seaborn as sns\n",
  "import matplotlib.pyplot as plt\n",
  "from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score\n",
  "from sklearn.preprocessing import label_binarize\n",
  "import numpy as np\n",
  "\n",
  "\n",
  "def plot_confusion_matrix(y_true, y_pred, title, class_names=None):\n",
  "    \"\"\"Draw the confusion matrix of `y_true` against `y_pred` as an annotated heatmap.\n",
  "\n",
  "    `class_names` is optional; it labels the ticks only when it has exactly one\n",
  "    name per row of the matrix, otherwise the default tick labels are kept.\n",
  "    \"\"\"\n",
  "    cm = confusion_matrix(y_true, y_pred)\n",
  "    tick_labels = 'auto'\n",
  "    if class_names is not None and len(class_names) == cm.shape[0]:\n",
  "        tick_labels = list(class_names)\n",
  "    fig, ax = plt.subplots()\n",
  "    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,\n",
  "                xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)\n",
  "    ax.set_title(title)\n",
  "    ax.set_xlabel('Предсказанные значения')\n",
  "    ax.set_ylabel('Истинные значения')\n",
  "    plt.show()\n",
  "\n",
  "\n",
  "def evaluate_and_plot_model(model, X_test, y_test, model_name, class_names):\n",
  "    \"\"\"Print the test metrics of a fitted classifier and draw its confusion matrix.\n",
  "\n",
  "    The metrics come from evaluate_model (ROC-AUC with weighted averaging), so the\n",
  "    numbers are computed the same way as in the model-selection cell.\n",
  "    \"\"\"\n",
  "    print(f\"{model_name} Metrics:\")\n",
  "    evaluate_model(model, X_test, y_test, roc_auc_average='weighted')\n",
  "    plot_confusion_matrix(y_test, model.predict(X_test), f'Confusion Matrix for {model_name}', class_names)\n",
  "\n",
  "\n",
  "# Use the names of the classes that were actually encoded, in label order\n",
  "class_names = list(label_encoder.classes_)\n",
  "evaluate_and_plot_model(logreg_best_model, X_test, y_test, 'Logistic Regression', class_names)\n",
  "evaluate_and_plot_model(rf_best_model, X_test, y_test, 'Random Forest', class_names)\n",
  "evaluate_and_plot_model(gb_best_model, X_test, y_test, 'Gradient Boosting', class_names)"
] } 
], "metadata": { "kernelspec": { "display_name": "aimenv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 2 }