From f143115f43448e8921fc5aefbf7bd1324ce94fa4 Mon Sep 17 00:00:00 2001 From: "a.puchkina" Date: Fri, 6 Dec 2024 22:30:14 +0400 Subject: [PATCH 1/4] =?UTF-8?q?4=20=D0=BB=D0=B0=D0=B1=D0=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + lab_4/lab4.ipynb | 1197 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1200 insertions(+) create mode 100644 lab_4/lab4.ipynb diff --git a/.gitignore b/.gitignore index 207d123..8b52d1a 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ ipython_config.py # Remove previous ipynb_checkpoints # git rm -r .ipynb_checkpoints/ +# virtual +aimenv/ +static/ \ No newline at end of file diff --git a/lab_4/lab4.ipynb b/lab_4/lab4.ipynb new file mode 100644 index 0000000..2af2575 --- /dev/null +++ b/lab_4/lab4.ipynb @@ -0,0 +1,1197 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Начало лабораторной работы" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['Unnamed: 0', 'Name', 'Rating', 'Spec_score', 'No_of_sim', 'Ram',\n", + " 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version',\n", + " 'Price', 'company', 'Inbuilt_memory', 'fast_charging',\n", + " 'Screen_resolution', 'Processor', 'Processor_name'],\n", + " dtype='object')\n", + "Index(['Unnamed: 0', 'Name', 'Brand', 'Model', 'Battery capacity (mAh)',\n", + " 'Screen size (inches)', 'Touchscreen', 'Resolution x', 'Resolution y',\n", + " 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera',\n", + " 'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS',\n", + " 'Number of SIMs', '3G', '4G/ LTE', 'Price'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "df1 = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", + "df = 
pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "print(df1.columns)\n", + "print(df.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Бизнес-цели:\n", + "1. Прогнозирование цен на мобильные телефоны: \n", + "\n", + "Цель: Разработать точную модель машинного обучения для прогнозирования цен на мобильные телефоны на основе характеристик.\n", + "\n", + "Применение:\n", + "Оптимизация ценообразования для производителей и продавцов.\n", + "Помощь потребителям в принятии обоснованных решений при покупке.\n", + "\n", + "2. Оптимизация характеристик продукта:\n", + "\n", + "Цель: Определить оптимальный набор характеристик для мобильных телефонов, максимизирующий ценность для потребителей при заданном бюджете.\n", + "\n", + "Применение:\n", + "Разработка новых моделей мобильных телефонов.\n", + "Улучшение существующих моделей." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Прогнозирование цен на мобильные телефоны " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Среднее значение поля 'Price': 11465.825607064018\n", + " Name Brand Model \\\n", + "0 OnePlus 7T Pro McLaren Edition OnePlus 7T Pro McLaren Edition \n", + "1 Realme X2 Pro Realme X2 Pro \n", + "2 iPhone 11 Pro Max Apple iPhone 11 Pro Max \n", + "3 iPhone 11 Apple iPhone 11 \n", + "4 LG G8X ThinQ LG G8X ThinQ \n", + "5 OnePlus 7T OnePlus 7T \n", + "6 OnePlus 7T Pro OnePlus 7T Pro \n", + "7 Samsung Galaxy Note 10+ Samsung Galaxy Note 10+ \n", + "8 Asus ROG Phone 2 Asus ROG Phone 2 \n", + "9 Xiaomi Redmi K20 Pro Xiaomi Redmi K20 Pro \n", + "10 Oppo K3 Oppo K3 \n", + "11 Realme X Realme X \n", + "12 Xiaomi Redmi K20 Xiaomi Redmi K20 \n", + "13 OnePlus 7 Pro OnePlus 7 Pro \n", + "14 Oppo Reno 10x Zoom Oppo Reno 10x Zoom \n", + "15 Realme 3 Pro Realme 3 Pro \n", + "16 Huawei P30 Pro Huawei P30 Pro \n", + "17 Redmi Note 7 Pro 
Xiaomi Redmi Note 7 Pro \n", + "18 Huawei Mate 20 Pro Huawei Mate 20 Pro \n", + "19 LG V40 ThinQ LG V40 ThinQ \n", + "\n", + " Battery capacity (mAh) Screen size (inches) Touchscreen Resolution x \\\n", + "0 4085 6.67 Yes 1440 \n", + "1 4000 6.50 Yes 1080 \n", + "2 3969 6.50 Yes 1242 \n", + "3 3110 6.10 Yes 828 \n", + "4 4000 6.40 Yes 1080 \n", + "5 3800 6.55 Yes 1080 \n", + "6 4085 6.67 Yes 1440 \n", + "7 4300 6.80 Yes 1440 \n", + "8 6000 6.59 Yes 1080 \n", + "9 4000 6.39 Yes 1080 \n", + "10 3765 6.50 Yes 1080 \n", + "11 3765 6.53 Yes 1080 \n", + "12 4000 6.39 Yes 1080 \n", + "13 4000 6.67 Yes 1440 \n", + "14 4065 6.60 Yes 1080 \n", + "15 4045 6.30 Yes 1080 \n", + "16 4200 6.47 Yes 1080 \n", + "17 4000 6.30 Yes 1080 \n", + "18 4200 6.39 Yes 1440 \n", + "19 3300 6.40 Yes 1440 \n", + "\n", + " Resolution y Processor RAM (MB) ... Operating system Wi-Fi \\\n", + "0 3120 8 12000 ... Android Yes \n", + "1 2400 8 6000 ... Android Yes \n", + "2 2688 6 4000 ... iOS Yes \n", + "3 1792 6 4000 ... iOS Yes \n", + "4 2340 8 6000 ... Android Yes \n", + "5 2400 8 8000 ... Android Yes \n", + "6 3120 8 8000 ... Android Yes \n", + "7 3040 8 12000 ... Android Yes \n", + "8 2340 8 8000 ... Android Yes \n", + "9 2340 8 6000 ... Android Yes \n", + "10 2340 8 6000 ... Android Yes \n", + "11 2340 8 4000 ... Android Yes \n", + "12 2340 8 6000 ... Android Yes \n", + "13 3120 8 6000 ... Android Yes \n", + "14 2340 8 6000 ... Android Yes \n", + "15 2340 8 4000 ... Android Yes \n", + "16 2340 8 8000 ... Android Yes \n", + "17 2340 8 4000 ... Android Yes \n", + "18 3120 8 6000 ... Android Yes \n", + "19 3120 8 6000 ... 
Android Yes \n", + "\n", + " Bluetooth GPS Number of SIMs 3G 4G/ LTE Price above_average_price \\\n", + "0 Yes Yes 2 Yes Yes 58998 1 \n", + "1 Yes Yes 2 Yes Yes 27999 1 \n", + "2 Yes Yes 2 Yes Yes 106900 1 \n", + "3 Yes Yes 2 Yes Yes 62900 1 \n", + "4 Yes Yes 1 No No 49990 1 \n", + "5 Yes No 2 Yes Yes 34930 1 \n", + "6 Yes Yes 2 Yes Yes 52990 1 \n", + "7 Yes Yes 2 Yes Yes 79699 1 \n", + "8 Yes Yes 1 Yes Yes 37999 1 \n", + "9 Yes Yes 2 No No 23190 1 \n", + "10 Yes Yes 2 Yes Yes 23990 1 \n", + "11 Yes Yes 2 Yes Yes 14999 1 \n", + "12 Yes Yes 2 Yes Yes 19282 1 \n", + "13 Yes Yes 2 Yes Yes 39995 1 \n", + "14 Yes Yes 2 Yes Yes 36990 1 \n", + "15 Yes Yes 2 Yes Yes 13999 1 \n", + "16 Yes No 2 Yes Yes 54280 1 \n", + "17 Yes Yes 2 Yes Yes 9799 0 \n", + "18 Yes Yes 2 Yes Yes 63990 1 \n", + "19 Yes Yes 2 Yes Yes 29999 1 \n", + "\n", + " price_volatility \n", + "0 174496 \n", + "1 174496 \n", + "2 174496 \n", + "3 174496 \n", + "4 174496 \n", + "5 174496 \n", + "6 174496 \n", + "7 174496 \n", + "8 174496 \n", + "9 174496 \n", + "10 174496 \n", + "11 174496 \n", + "12 174496 \n", + "13 174496 \n", + "14 174496 \n", + "15 174496 \n", + "16 174496 \n", + "17 174496 \n", + "18 174496 \n", + "19 174496 \n", + "\n", + "[20 rows x 23 columns]\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Загружаем набор данных\n", + "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "\n", + "# Удаление пустого столбца по имени\n", + "df = df.drop('Unnamed: 0', axis=1)\n", + "\n", + "# Устанавливаем случайное состояние\n", + "random_state = 42\n", + "\n", + "# Рассчитываем среднее значение цены\n", + "average_price = df['Price'].mean()\n", + "print(f\"Среднее значение поля 'Price': {average_price}\")\n", + "\n", + "# Создаем новую переменную, указывающую, превышает ли цена среднюю\n", + "df['above_average_price'] = (df['Price'] > average_price).astype(int)\n", + "\n", + "# Рассчитываем волатильность (разницу между максимальной и минимальной ценой)\n", + 
"df['price_volatility'] = df['Price'].max() - df['Price'].min()\n", + "\n", + "# Выводим первые строки измененной таблицы для проверки\n", + "print(df.head(20))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Оптимизация характеристик продукта " + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Средняя цена для 'Brand':\n", + "Brand\n", + "10.or 5999.000000\n", + "Acer 6470.000000\n", + "Alcatel 9201.600000\n", + "Apple 45510.470588\n", + "Aqua 4599.000000\n", + " ... \n", + "Zopo 6027.916667\n", + "Zuk 29124.000000\n", + "iBall 4051.000000\n", + "iVoomi 4018.875000\n", + "mPhone 6949.000000\n", + "Name: Price, Length: 76, dtype: float64\n", + "\n", + "Средняя цена для 'Battery capacity (mAh)':\n", + "Battery capacity (mAh)\n", + "1010 3499.000000\n", + "1050 4790.000000\n", + "1200 3396.000000\n", + "1250 2498.500000\n", + "1300 3338.333333\n", + " ... \n", + "5000 13542.530612\n", + "5020 23900.000000\n", + "5100 16999.000000\n", + "5300 8999.000000\n", + "6000 19832.333333\n", + "Name: Price, Length: 165, dtype: float64\n", + "\n", + "Средняя цена для 'Screen size (inches)':\n", + "Screen size (inches)\n", + "2.40 1249.000000\n", + "2.44 6999.000000\n", + "2.45 2999.000000\n", + "2.60 5555.000000\n", + "2.80 5345.000000\n", + " ... 
\n", + "6.70 58896.428571\n", + "6.80 79699.000000\n", + "6.90 92999.000000\n", + "7.00 6199.000000\n", + "7.30 164999.000000\n", + "Name: Price, Length: 80, dtype: float64\n", + "\n", + "Средняя цена для 'Touchscreen':\n", + "Touchscreen\n", + "No 5255.411765\n", + "Yes 11544.497019\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Resolution x':\n", + "Resolution x\n", + "240 3819.888889\n", + "320 2499.000000\n", + "360 5797.500000\n", + "400 2860.500000\n", + "480 4128.602459\n", + "540 7445.848485\n", + "560 15990.000000\n", + "600 4829.000000\n", + "640 7697.200000\n", + "720 7619.360684\n", + "750 25999.000000\n", + "768 8872.000000\n", + "800 2190.000000\n", + "828 54199.500000\n", + "850 4115.000000\n", + "854 4415.666667\n", + "1024 6199.000000\n", + "1080 17165.865952\n", + "1125 75632.666667\n", + "1176 77299.000000\n", + "1242 88449.500000\n", + "1280 7144.333333\n", + "1440 34240.461538\n", + "1520 7935.000000\n", + "1536 164999.000000\n", + "1600 29950.000000\n", + "1880 20800.000000\n", + "2160 45500.000000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Resolution y':\n", + "Resolution y\n", + "320 3811.125000\n", + "480 4264.818182\n", + "485 3299.000000\n", + "584 4499.000000\n", + "600 6199.000000\n", + "640 4095.000000\n", + "720 8539.900000\n", + "800 4189.321429\n", + "854 4087.901316\n", + "960 6921.651163\n", + "1080 7583.800000\n", + "1136 12749.000000\n", + "1280 7449.136674\n", + "1290 14990.000000\n", + "1334 25999.000000\n", + "1440 6930.808824\n", + "1480 13596.166667\n", + "1498 4490.000000\n", + "1500 9083.125000\n", + "1520 8139.425000\n", + "1544 10010.000000\n", + "1548 8499.000000\n", + "1560 9135.750000\n", + "1580 9950.000000\n", + "1600 9476.111111\n", + "1620 21492.000000\n", + "1792 54199.500000\n", + "1820 6498.000000\n", + "1920 14698.088496\n", + "2152 164999.000000\n", + "2160 17465.194444\n", + "2220 20531.333333\n", + "2240 49990.000000\n", + "2244 18000.000000\n", + "2246 
16177.200000\n", + "2248 39990.000000\n", + "2270 8961.000000\n", + "2280 19073.190476\n", + "2310 25120.000000\n", + "2316 29999.000000\n", + "2340 19768.017241\n", + "2400 32131.083333\n", + "2436 75632.666667\n", + "2520 11844.500000\n", + "2560 26501.476190\n", + "2636 174990.000000\n", + "2688 88449.500000\n", + "2880 34993.250000\n", + "2960 38425.714286\n", + "3040 81799.500000\n", + "3120 45710.000000\n", + "3200 77999.000000\n", + "3840 45500.000000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Processor':\n", + "Processor\n", + "1 7861.095238\n", + "2 7929.444444\n", + "4 7209.401171\n", + "6 39365.400000\n", + "8 16193.927434\n", + "10 8542.000000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'RAM (MB)':\n", + "RAM (MB)\n", + "64 4249.000000\n", + "256 2625.625000\n", + "289 3190.000000\n", + "384 5555.000000\n", + "512 3419.847222\n", + "768 4749.500000\n", + "1000 5747.082153\n", + "2000 8645.608187\n", + "3000 11587.829787\n", + "4000 17741.965000\n", + "6000 28291.587302\n", + "8000 45080.586207\n", + "12000 99173.750000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Internal storage (GB)':\n", + "Internal storage (GB)\n", + "0.064 3499.000000\n", + "0.128 4999.000000\n", + "0.160 5555.000000\n", + "0.512 3390.444444\n", + "1.000 5197.000000\n", + "2.000 2399.000000\n", + "3.000 3190.000000\n", + "4.000 4513.032258\n", + "8.000 5311.654206\n", + "16.000 8379.403341\n", + "32.000 12089.644366\n", + "64.000 21208.814607\n", + "128.000 29477.043478\n", + "256.000 68682.777778\n", + "512.000 164999.000000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Rear camera':\n", + "Rear camera\n", + "0.0 4193.500000\n", + "0.3 3054.666667\n", + "2.0 3000.804878\n", + "3.0 4329.333333\n", + "3.2 3506.727273\n", + "5.0 4413.183486\n", + "8.0 6658.109966\n", + "8.7 8999.000000\n", + "10.0 10499.250000\n", + "12.0 28533.950980\n", + "12.2 29162.500000\n", + "12.3 30569.250000\n", + 
"13.0 9531.776018\n", + "13.1 32490.000000\n", + "13.2 7496.333333\n", + "15.0 88719.000000\n", + "16.0 17923.896825\n", + "18.0 19999.000000\n", + "19.0 39095.000000\n", + "20.0 23739.777778\n", + "20.7 35718.600000\n", + "21.0 14513.900000\n", + "21.5 6999.000000\n", + "23.0 31966.090909\n", + "24.0 18587.250000\n", + "25.0 13980.000000\n", + "32.0 19589.000000\n", + "40.0 61389.750000\n", + "41.0 10990.000000\n", + "48.0 24205.615385\n", + "64.0 19332.333333\n", + "108.0 92999.000000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Front camera':\n", + "Front camera\n", + "0.0 6799.333333\n", + "0.3 4398.430233\n", + "0.9 19999.000000\n", + "1.1 18745.000000\n", + "1.2 18579.333333\n", + "1.3 5640.352941\n", + "1.5 18748.000000\n", + "1.6 15833.000000\n", + "1.9 11246.750000\n", + "2.0 6711.995122\n", + "2.1 24839.666667\n", + "2.2 35823.333333\n", + "2.4 5061.333333\n", + "3.0 8500.000000\n", + "3.2 4660.900000\n", + "3.7 28925.000000\n", + "4.0 13519.285714\n", + "5.0 8347.064655\n", + "7.0 49061.625000\n", + "8.0 13894.422222\n", + "10.0 105114.166667\n", + "12.0 41929.500000\n", + "13.0 11380.436364\n", + "16.0 17646.113402\n", + "20.0 13498.520000\n", + "24.0 21860.562500\n", + "25.0 19182.692308\n", + "32.0 23749.782609\n", + "40.0 92999.000000\n", + "48.0 27999.000000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Operating system':\n", + "Operating system\n", + "Android 10989.947652\n", + "BlackBerry 10509.600000\n", + "Cyanogen 9173.600000\n", + "Sailfish 4799.000000\n", + "Tizen 4459.666667\n", + "Windows 16706.684211\n", + "iOS 45510.470588\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Wi-Fi':\n", + "Wi-Fi\n", + "No 6334.375000\n", + "Yes 11496.211695\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Bluetooth':\n", + "Bluetooth\n", + "No 7794.466667\n", + "Yes 11506.800595\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'GPS':\n", + "GPS\n", + "No 
8316.148148\n", + "Yes 11737.740208\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Number of SIMs':\n", + "Number of SIMs\n", + "1 16479.718062\n", + "2 10465.580018\n", + "3 4590.000000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для '3G':\n", + "3G\n", + "No 11616.882759\n", + "Yes 11447.783361\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для '4G/ LTE':\n", + "4G/ LTE\n", + "No 7922.270893\n", + "Yes 12680.858696\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для комбинации 'Brand' и 'Operating system':\n", + "Brand Operating system\n", + "10.or Android 5999.000000\n", + "Acer Android 6470.000000\n", + "Alcatel Android 9201.600000\n", + "Apple iOS 45510.470588\n", + "Aqua Android 4599.000000\n", + " ... \n", + "Zopo Android 6027.916667\n", + "Zuk Android 29124.000000\n", + "iBall Android 4051.000000\n", + "iVoomi Android 4018.875000\n", + "mPhone Android 6949.000000\n", + "Name: Price, Length: 85, dtype: float64\n", + "\n", + "Средняя цена для комбинации 'RAM (MB)' и 'Internal storage (GB)':\n", + "RAM (MB) Internal storage (GB)\n", + "64 0.064 3499.000000\n", + " 0.128 4999.000000\n", + "256 0.512 2287.857143\n", + " 8.000 4990.000000\n", + "289 3.000 3190.000000\n", + "384 0.160 5555.000000\n", + "512 0.512 7249.500000\n", + " 2.000 2399.000000\n", + " 4.000 3222.244444\n", + " 8.000 3513.750000\n", + "768 4.000 6500.000000\n", + " 8.000 2999.000000\n", + "1000 1.000 5197.000000\n", + " 4.000 8019.187500\n", + " 8.000 5398.572464\n", + " 16.000 6772.263158\n", + " 32.000 6997.000000\n", + "2000 8.000 6458.736842\n", + " 16.000 8518.889273\n", + " 32.000 10155.272727\n", + " 64.000 36999.000000\n", + "3000 16.000 9082.082192\n", + " 32.000 11742.751269\n", + " 64.000 24287.833333\n", + "4000 32.000 14827.288462\n", + " 64.000 19201.702290\n", + " 128.000 15408.882353\n", + "6000 64.000 26554.636364\n", + " 128.000 29692.000000\n", + " 256.000 44999.000000\n", + "8000 64.000 
54990.000000\n", + " 128.000 37177.181818\n", + " 256.000 72408.166667\n", + "12000 128.000 92999.000000\n", + " 256.000 69348.500000\n", + " 512.000 164999.000000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для комбинации 'Screen size (inches)' и 'Battery capacity (mAh)':\n", + "Screen size (inches) Battery capacity (mAh)\n", + "2.40 2000 1249.0\n", + "2.44 1450 6999.0\n", + "2.45 1500 2999.0\n", + "2.60 1350 5555.0\n", + "2.80 1200 3190.0\n", + " ... \n", + "6.70 4500 40259.0\n", + "6.80 4300 79699.0\n", + "6.90 5000 92999.0\n", + "7.00 3450 6199.0\n", + "7.30 4380 164999.0\n", + "Name: Price, Length: 431, dtype: float64\n", + "\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Загружаем набор данных\n", + "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "\n", + "# Удаление пустого столбца по имени\n", + "df = df.drop('Unnamed: 0', axis=1)\n", + "\n", + "# Устанавливаем случайное состояние\n", + "random_state = 42\n", + "\n", + "# Рассчитываем среднюю цену для каждого значения каждого признака (Model уберем, у всех разная)\n", + "for column in ['Brand', 'Battery capacity (mAh)', 'Screen size (inches)', 'Touchscreen', \n", + " 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', \n", + " 'Rear camera', 'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', \n", + " 'Number of SIMs', '3G', '4G/ LTE']:\n", + " print(f\"Средняя цена для '{column}':\")\n", + " print(df.groupby(column)['Price'].mean())\n", + " print()\n", + "\n", + "# Рассчитываем среднюю цену для комбинаций признаков\n", + "# для комбинации 'Brand' и 'Operating system'\n", + "print(\"Средняя цена для комбинации 'Brand' и 'Operating system':\")\n", + "print(df.groupby(['Brand', 'Operating system'])['Price'].mean())\n", + "print()\n", + "\n", + "# Рассчитываем среднюю цену для комбинации 'RAM (MB)' и 'Internal storage (GB)'\n", + "print(\"Средняя цена для комбинации 'RAM (MB)' и 'Internal storage (GB)':\")\n", 
+ "print(df.groupby(['RAM (MB)', 'Internal storage (GB)'])['Price'].mean())\n", + "print()\n", + "\n", + "# Рассчитываем среднюю цену для комбинации 'Screen size (inches)' и 'Battery capacity (mAh)'\n", + "print(\"Средняя цена для комбинации 'Screen size (inches)' и 'Battery capacity (mAh)':\")\n", + "print(df.groupby(['Screen size (inches)', 'Battery capacity (mAh)'])['Price'].mean())\n", + "print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Выбор ориентира для каждой задачи:\n", + "\n", + "1. Прогнозирование цен на мобильные телефоны:\n", + "Ориентир:\n", + "\n", + "R² (коэффициент детерминации): 0.75 - 0.85\n", + "\n", + "MAE (средняя абсолютная ошибка): 5000 - 10000 рублей\n", + "\n", + "RMSE (среднеквадратичная ошибка): 10000 - 20000 рублей\n", + "\n", + "Объяснение:\n", + "\n", + "R²: Значение 0.75 - 0.85 будет означать, что модель объясняет 75-85% вариации цен на мобильные телефоны, что является хорошим результатом для задачи регрессии.\n", + "\n", + "MAE: Значение 5000 - 10000 рублей будет означать, что в среднем модель ошибается на 5000 - 10000 рублей при прогнозировании цен на мобильные телефоны. Учитывая диапазон цен от 9000 до 150000 рублей, этот ориентир является разумным.\n", + "\n", + "RMSE: Значение 10000 - 20000 рублей будет означать, что среднеквадратичная ошибка модели составляет 10000 - 20000 рублей.\n", + "\n", + "2. Оптимизация характеристик продукта:\n", + "Ориентир:\n", + "\n", + "Увеличение прибыли компании: 5% - 10%\n", + "\n", + "Сохранение конкурентоспособных тарифов:\n", + "\n", + "Средняя цена мобильных телефонов не должна увеличиваться более чем на 5% по сравнению с текущими тарифами.\n", + "\n", + "Доля клиентов, считающих цены высокими, не должна увеличиваться более чем на 2%.\n", + "\n", + "Объяснение:\n", + "\n", + "Увеличение прибыли компании: \n", + "\n", + "Цель оптимизации характеристик продукта - максимизировать прибыль компании. 
Ориентир в 5% - 10% увеличения прибыли является реалистичным и достижимым.\n", + "\n", + "Сохранение конкурентоспособных тарифов: \n", + "\n", + "Важно, чтобы оптимизация характеристик продукта не привела к значительному увеличению цен для клиентов. Ориентир в 5% увеличения средней цены мобильных телефонов и 2% увеличения доли клиентов, считающих цены высокими, позволяет сохранить конкурентоспособность компании." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MAE: 5424.128251450574\n", + "MSE: 143399695.59118932\n", + "RMSE: 11974.961193723733\n", + "R²: 0.4559750691933513\n", + "Ориентиры для прогнозирования цены мобильных устройств не достигнуты.\n", + "Средняя цена для 'Battery capacity (mAh)':\n", + "Battery capacity (mAh)\n", + "1010 3499.000000\n", + "1050 4790.000000\n", + "1200 3396.000000\n", + "1250 2498.500000\n", + "1300 3338.333333\n", + " ... \n", + "5000 13542.530612\n", + "5020 23900.000000\n", + "5100 16999.000000\n", + "5300 8999.000000\n", + "6000 19832.333333\n", + "Name: Price, Length: 165, dtype: float64\n", + "\n", + "Средняя цена для 'Screen size (inches)':\n", + "Screen size (inches)\n", + "2.40 1249.000000\n", + "2.44 6999.000000\n", + "2.45 2999.000000\n", + "2.60 5555.000000\n", + "2.80 5345.000000\n", + " ... 
\n", + "6.70 58896.428571\n", + "6.80 79699.000000\n", + "6.90 92999.000000\n", + "7.00 6199.000000\n", + "7.30 164999.000000\n", + "Name: Price, Length: 80, dtype: float64\n", + "\n", + "Средняя цена для 'RAM (MB)':\n", + "RAM (MB)\n", + "64 4249.000000\n", + "256 2625.625000\n", + "289 3190.000000\n", + "384 5555.000000\n", + "512 3419.847222\n", + "768 4749.500000\n", + "1000 5747.082153\n", + "2000 8645.608187\n", + "3000 11587.829787\n", + "4000 17741.965000\n", + "6000 28291.587302\n", + "8000 45080.586207\n", + "12000 99173.750000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Internal storage (GB)':\n", + "Internal storage (GB)\n", + "0.064 3499.000000\n", + "0.128 4999.000000\n", + "0.160 5555.000000\n", + "0.512 3390.444444\n", + "1.000 5197.000000\n", + "2.000 2399.000000\n", + "3.000 3190.000000\n", + "4.000 4513.032258\n", + "8.000 5311.654206\n", + "16.000 8379.403341\n", + "32.000 12089.644366\n", + "64.000 21208.814607\n", + "128.000 29477.043478\n", + "256.000 68682.777778\n", + "512.000 164999.000000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для 'Rear camera':\n", + "Rear camera\n", + "0.0 4193.500000\n", + "0.3 3054.666667\n", + "2.0 3000.804878\n", + "3.0 4329.333333\n", + "3.2 3506.727273\n", + "5.0 4413.183486\n", + "8.0 6658.109966\n", + "8.7 8999.000000\n", + "10.0 10499.250000\n", + "12.0 28533.950980\n", + "12.2 29162.500000\n", + "12.3 30569.250000\n", + "13.0 9531.776018\n", + "13.1 32490.000000\n", + "13.2 7496.333333\n", + "15.0 88719.000000\n", + "16.0 17923.896825\n", + "18.0 19999.000000\n", + "19.0 39095.000000\n", + "20.0 23739.777778\n", + "20.7 35718.600000\n", + "21.0 14513.900000\n", + "21.5 6999.000000\n", + "23.0 31966.090909\n", + "24.0 18587.250000\n", + "25.0 13980.000000\n", + "32.0 19589.000000\n", + "40.0 61389.750000\n", + "41.0 10990.000000\n", + "48.0 24205.615385\n", + "64.0 19332.333333\n", + "108.0 92999.000000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя 
цена для 'Front camera':\n", + "Front camera\n", + "0.0 6799.333333\n", + "0.3 4398.430233\n", + "0.9 19999.000000\n", + "1.1 18745.000000\n", + "1.2 18579.333333\n", + "1.3 5640.352941\n", + "1.5 18748.000000\n", + "1.6 15833.000000\n", + "1.9 11246.750000\n", + "2.0 6711.995122\n", + "2.1 24839.666667\n", + "2.2 35823.333333\n", + "2.4 5061.333333\n", + "3.0 8500.000000\n", + "3.2 4660.900000\n", + "3.7 28925.000000\n", + "4.0 13519.285714\n", + "5.0 8347.064655\n", + "7.0 49061.625000\n", + "8.0 13894.422222\n", + "10.0 105114.166667\n", + "12.0 41929.500000\n", + "13.0 11380.436364\n", + "16.0 17646.113402\n", + "20.0 13498.520000\n", + "24.0 21860.562500\n", + "25.0 19182.692308\n", + "32.0 23749.782609\n", + "40.0 92999.000000\n", + "48.0 27999.000000\n", + "Name: Price, dtype: float64\n", + "\n", + "Средняя цена для комбинации 'Battery capacity (mAh)' и 'RAM (MB)':\n", + "Battery capacity (mAh) RAM (MB)\n", + "1010 64 3499.0\n", + "1050 1000 4790.0\n", + "1200 64 4999.0\n", + " 289 3190.0\n", + " 512 1999.0\n", + " ... \n", + "5020 3000 23900.0\n", + "5100 3000 16999.0\n", + "5300 4000 8999.0\n", + "6000 4000 10749.0\n", + " 8000 37999.0\n", + "Name: Price, Length: 323, dtype: float64\n", + "\n", + "Средняя цена для комбинации 'Screen size (inches)' и 'Internal storage (GB)':\n", + "Screen size (inches) Internal storage (GB)\n", + "2.40 4.000 1249.0\n", + "2.44 0.512 6999.0\n", + "2.45 4.000 2999.0\n", + "2.60 0.160 5555.0\n", + "2.80 0.512 7500.0\n", + " ... \n", + "6.70 256.000 174990.0\n", + "6.80 256.000 79699.0\n", + "6.90 128.000 92999.0\n", + "7.00 8.000 6199.0\n", + "7.30 512.000 164999.0\n", + "Name: Price, Length: 177, dtype: float64\n", + "\n", + "Средняя цена для комбинации 'Rear camera' и 'Front camera':\n", + "Rear camera Front camera\n", + "0.0 0.0 4193.500000\n", + "0.3 0.3 3054.666667\n", + "2.0 0.0 4396.000000\n", + " 0.3 2940.937500\n", + " 1.3 2240.000000\n", + " ... 
\n", + "48.0 48.0 27999.000000\n", + "64.0 16.0 21499.000000\n", + " 20.0 14999.000000\n", + " 32.0 21499.000000\n", + "108.0 40.0 92999.000000\n", + "Name: Price, Length: 126, dtype: float64\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", + "\n", + "# Загружаем набор данных\n", + "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "\n", + "# Удаление пустого столбца по имени\n", + "df = df.drop('Unnamed: 0', axis=1)\n", + "\n", + "# Преобразуем категориальные переменные в числовые\n", + "df = pd.get_dummies(df, columns=['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE'], drop_first=True)\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y)\n", + "X = df.drop('Price', axis=1)\n", + "y = df['Price']\n", + "\n", + "# Разделяем данные на обучающую и тестовую выборки\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "# Стандартизируем признаки\n", + "scaler = StandardScaler()\n", + "X_train = scaler.fit_transform(X_train)\n", + "X_test = scaler.transform(X_test)\n", + "\n", + "# Обучаем модель линейной регрессии\n", + "model = LinearRegression()\n", + "model.fit(X_train, y_train)\n", + "\n", + "# Делаем предсказания на тестовой выборке\n", + "y_pred = model.predict(X_test)\n", + 
"\n", + "# Оцениваем качество модели\n", + "mae = mean_absolute_error(y_test, y_pred)\n", + "mse = mean_squared_error(y_test, y_pred)\n", + "rmse = mean_squared_error(y_test, y_pred, squared=False)\n", + "r2 = r2_score(y_test, y_pred)\n", + "\n", + "print(f\"MAE: {mae}\")\n", + "print(f\"MSE: {mse}\")\n", + "print(f\"RMSE: {rmse}\")\n", + "print(f\"R²: {r2}\")\n", + "\n", + "# Проверяем, достигнуты ли ориентиры\n", + "if r2 >= 0.75 and mae <= 10000 and rmse <= 20000:\n", + " print(\"Ориентиры для прогнозирования цены мобильных устройств достигнуты!\")\n", + "else:\n", + " print(\"Ориентиры для прогнозирования цены мобильных устройств не достигнуты.\")\n", + "\n", + "# Оптимизация тарифной сетки\n", + "# Убедитесь, что столбцы существуют\n", + "columns_to_group = ['Battery capacity (mAh)', 'Screen size (inches)', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera']\n", + "\n", + "# Рассчитываем среднюю цену для каждого значения каждого признака\n", + "for column in columns_to_group:\n", + " print(f\"Средняя цена для '{column}':\")\n", + " print(df.groupby(column)['Price'].mean())\n", + " print()\n", + "\n", + "# Рассчитываем среднюю цену для комбинаций признаков\n", + "# Например, для комбинации 'Battery capacity (mAh)' и 'RAM (MB)'\n", + "print(\"Средняя цена для комбинации 'Battery capacity (mAh)' и 'RAM (MB)':\")\n", + "print(df.groupby(['Battery capacity (mAh)', 'RAM (MB)'])['Price'].mean())\n", + "print()\n", + "\n", + "# Рассчитываем среднюю цену для комбинации 'Screen size (inches)' и 'Internal storage (GB)'\n", + "print(\"Средняя цена для комбинации 'Screen size (inches)' и 'Internal storage (GB)':\")\n", + "print(df.groupby(['Screen size (inches)', 'Internal storage (GB)'])['Price'].mean())\n", + "print()\n", + "\n", + "# Рассчитываем среднюю цену для комбинации 'Rear camera' и 'Front camera'\n", + "print(\"Средняя цена для комбинации 'Rear camera' и 'Front camera':\")\n", + "print(df.groupby(['Rear camera', 'Front 
camera'])['Price'].mean())\n", + "print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Анализ применимости алгоритмов обучения с учителем для решения поставленных задач:\n", + "\n", + "1. Прогнозирование цен на мобильные телефоны: Задача: Регрессия\n", + "\n", + "Свойства алгоритмов:\n", + "\n", + "Линейная регрессия: Применимость: Хорошо подходит для задач, где зависимость между признаками и целевой переменной линейна. Преимущества: Проста в реализации, интерпретируема. Недостатки: Может плохо работать, если зависимость нелинейна.\n", + "\n", + "Деревья решений (регрессия): Применимость: Подходит для задач с нелинейными зависимостями. Преимущества: Может обрабатывать категориальные признаки, не требует масштабирования данных. Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n", + "\n", + "Случайный лес (регрессия): Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков. Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки. Недостатки: Менее интерпретируем, чем линейная регрессия.\n", + "\n", + "Градиентный бустинг (регрессия): Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками. Преимущества: Может достигать высокой точности, устойчив к переобучению. Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n", + "\n", + "Нейронные сети (регрессия): Применимость: Подходит для задач с очень сложными зависимостями и большим количеством данных. Преимущества: Может моделировать очень сложные зависимости. 
Недостатки: Требует большого количества данных, сложнее в настройке и интерпретации.\n", + "\n", + "#### Вывод:\n", + "\n", + "Линейная регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n", + "\n", + "Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n", + "\n", + "Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n", + "\n", + "Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно.\n", + "\n", + "2. Оптимизация характеристик продукта: Задача: Классификация (группировка телефонов по ценовым категориям, например: цена выше или ниже средней)\n", + "\n", + "Свойства алгоритмов:\n", + "\n", + "Логистическая регрессия: Применимость: Хорошо подходит для задач бинарной классификации, где зависимость между признаками и целевой переменной линейна. Преимущества: Проста в реализации, интерпретируема. Недостатки: Может плохо работать, если зависимость нелинейна.\n", + "\n", + "Деревья решений (классификация): Применимость: Подходит для задач с нелинейными зависимостями. Преимущества: Может обрабатывать категориальные признаки, не требует масштабирования данных. Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n", + "\n", + "Случайный лес (классификация): Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков. Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки. Недостатки: Менее интерпретируем, чем логистическая регрессия.\n", + "\n", + "Градиентный бустинг (классификация): Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками. Преимущества: Может достигать высокой точности, устойчив к переобучению. 
Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n", + "\n", + "Нейронные сети (классификация): Применимость: Подходит для задач с очень сложными зависимостями и большим количеством данных. Преимущества: Может моделировать очень сложные зависимости. Недостатки: Требует большого количества данных, сложнее в настройке и интерпретации.\n", + "\n", + "#### Вывод:\n", + "\n", + "Логистическая регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n", + "\n", + "Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n", + "\n", + "Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n", + "\n", + "Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно.\n", + "\n", + "1. Прогнозирование цен на мобильные телефоны: Выбранные модели:\n", + "\n", + "Линейная регрессия\n", + "\n", + "Случайный лес (регрессия)\n", + "\n", + "Градиентный бустинг (регрессия)\n", + "\n", + "2. 
Оптимизация тарифной сетки: Выбранные модели:\n", + "\n", + "Логистическая регрессия\n", + "\n", + "Случайный лес (классификация)\n", + "\n", + "Градиентный бустинг (классификация)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "could not convert string to float: '9.0 (Pie)'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[19], line 39\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;66;03m# Стандартизируем признаки для задачи регрессии\u001b[39;00m\n\u001b[0;32m 38\u001b[0m scaler_reg \u001b[38;5;241m=\u001b[39m StandardScaler()\n\u001b[1;32m---> 39\u001b[0m X_train_reg \u001b[38;5;241m=\u001b[39m \u001b[43mscaler_reg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train_reg\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 40\u001b[0m X_test_reg \u001b[38;5;241m=\u001b[39m scaler_reg\u001b[38;5;241m.\u001b[39mtransform(X_test_reg)\n\u001b[0;32m 42\u001b[0m \u001b[38;5;66;03m# Список моделей для задачи регрессии\u001b[39;00m\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output..wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1098\u001b[0m, in \u001b[0;36mTransformerMixin.fit_transform\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m 1083\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 1084\u001b[0m (\n\u001b[0;32m 1085\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis object (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) has a `transform`\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1093\u001b[0m \u001b[38;5;167;01mUserWarning\u001b[39;00m,\n\u001b[0;32m 1094\u001b[0m )\n\u001b[0;32m 1096\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m y \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 1097\u001b[0m \u001b[38;5;66;03m# fit method of arity 1 (unsupervised 
transformation)\u001b[39;00m\n\u001b[1;32m-> 1098\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_params\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mtransform(X)\n\u001b[0;32m 1099\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1100\u001b[0m \u001b[38;5;66;03m# fit method of arity 2 (supervised transformation)\u001b[39;00m\n\u001b[0;32m 1101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\u001b[38;5;241m.\u001b[39mtransform(X)\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_data.py:878\u001b[0m, in \u001b[0;36mStandardScaler.fit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 876\u001b[0m \u001b[38;5;66;03m# Reset internal state before fitting\u001b[39;00m\n\u001b[0;32m 877\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset()\n\u001b[1;32m--> 878\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpartial_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_data.py:914\u001b[0m, in \u001b[0;36mStandardScaler.partial_fit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 882\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Online computation of mean and std on X for later scaling.\u001b[39;00m\n\u001b[0;32m 883\u001b[0m \n\u001b[0;32m 884\u001b[0m \u001b[38;5;124;03mAll of X is processed as a single batch. 
This is intended for cases\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 911\u001b[0m \u001b[38;5;124;03m Fitted scaler.\u001b[39;00m\n\u001b[0;32m 912\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 913\u001b[0m first_call \u001b[38;5;241m=\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_samples_seen_\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 914\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 915\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 916\u001b[0m \u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcsr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcsc\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 917\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mFLOAT_DTYPES\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 918\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_all_finite\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mallow-nan\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 919\u001b[0m \u001b[43m \u001b[49m\u001b[43mreset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfirst_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 920\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 921\u001b[0m n_features \u001b[38;5;241m=\u001b[39m X\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 923\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m sample_weight \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\base.py:633\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[0;32m 631\u001b[0m out \u001b[38;5;241m=\u001b[39m X, y\n\u001b[0;32m 632\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_val_X \u001b[38;5;129;01mand\u001b[39;00m no_val_y:\n\u001b[1;32m--> 633\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minput_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mX\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcheck_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 634\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m no_val_X \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_val_y:\n\u001b[0;32m 635\u001b[0m out \u001b[38;5;241m=\u001b[39m _check_y(y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcheck_params)\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py:929\u001b[0m, in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 924\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pandas_requires_conversion:\n\u001b[0;32m 925\u001b[0m \u001b[38;5;66;03m# pandas dataframe requires conversion earlier to handle 
extension dtypes with\u001b[39;00m\n\u001b[0;32m 926\u001b[0m \u001b[38;5;66;03m# nans\u001b[39;00m\n\u001b[0;32m 927\u001b[0m \u001b[38;5;66;03m# Use the original dtype for conversion if dtype is None\u001b[39;00m\n\u001b[0;32m 928\u001b[0m new_dtype \u001b[38;5;241m=\u001b[39m dtype_orig \u001b[38;5;28;01mif\u001b[39;00m dtype \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m dtype\n\u001b[1;32m--> 929\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43marray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnew_dtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 930\u001b[0m \u001b[38;5;66;03m# Since we converted here, we do not need to convert again later\u001b[39;00m\n\u001b[0;32m 931\u001b[0m dtype \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\generic.py:6643\u001b[0m, in \u001b[0;36mNDFrame.astype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 6637\u001b[0m results \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 6638\u001b[0m ser\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy, errors\u001b[38;5;241m=\u001b[39merrors) \u001b[38;5;28;01mfor\u001b[39;00m _, ser \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems()\n\u001b[0;32m 6639\u001b[0m ]\n\u001b[0;32m 6641\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 6642\u001b[0m \u001b[38;5;66;03m# else, only a single dtype is given\u001b[39;00m\n\u001b[1;32m-> 6643\u001b[0m new_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mgr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 6644\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_constructor_from_mgr(new_data, axes\u001b[38;5;241m=\u001b[39mnew_data\u001b[38;5;241m.\u001b[39maxes)\n\u001b[0;32m 6645\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mastype\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:430\u001b[0m, in \u001b[0;36mBaseBlockManager.astype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 427\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m using_copy_on_write():\n\u001b[0;32m 428\u001b[0m copy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m--> 430\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mastype\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 435\u001b[0m \u001b[43m 
\u001b[49m\u001b[43musing_cow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43musing_copy_on_write\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 436\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:363\u001b[0m, in \u001b[0;36mBaseBlockManager.apply\u001b[1;34m(self, f, align_keys, **kwargs)\u001b[0m\n\u001b[0;32m 361\u001b[0m applied \u001b[38;5;241m=\u001b[39m b\u001b[38;5;241m.\u001b[39mapply(f, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 362\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 363\u001b[0m applied \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 364\u001b[0m result_blocks \u001b[38;5;241m=\u001b[39m extend_blocks(applied, result_blocks)\n\u001b[0;32m 366\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mfrom_blocks(result_blocks, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maxes)\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\internals\\blocks.py:758\u001b[0m, in \u001b[0;36mBlock.astype\u001b[1;34m(self, dtype, copy, errors, using_cow, squeeze)\u001b[0m\n\u001b[0;32m 755\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCan not squeeze with more than one column.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 756\u001b[0m values \u001b[38;5;241m=\u001b[39m values[\u001b[38;5;241m0\u001b[39m, :] 
\u001b[38;5;66;03m# type: ignore[call-overload]\u001b[39;00m\n\u001b[1;32m--> 758\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array_safe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 760\u001b[0m new_values \u001b[38;5;241m=\u001b[39m maybe_coerce_values(new_values)\n\u001b[0;32m 762\u001b[0m refs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:237\u001b[0m, in \u001b[0;36mastype_array_safe\u001b[1;34m(values, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 234\u001b[0m dtype \u001b[38;5;241m=\u001b[39m dtype\u001b[38;5;241m.\u001b[39mnumpy_dtype\n\u001b[0;32m 236\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 237\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 238\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[0;32m 239\u001b[0m \u001b[38;5;66;03m# e.g. 
_astype_nansafe can fail on object-dtype of strings\u001b[39;00m\n\u001b[0;32m 240\u001b[0m \u001b[38;5;66;03m# trying to convert to float\u001b[39;00m\n\u001b[0;32m 241\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:182\u001b[0m, in \u001b[0;36mastype_array\u001b[1;34m(values, dtype, copy)\u001b[0m\n\u001b[0;32m 179\u001b[0m values \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[0;32m 181\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 182\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43m_astype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 184\u001b[0m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[0;32m 185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:133\u001b[0m, in \u001b[0;36m_astype_nansafe\u001b[1;34m(arr, dtype, copy, skipna)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n\u001b[0;32m 131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m copy \u001b[38;5;129;01mor\u001b[39;00m 
arr\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mobject\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m dtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[0;32m 132\u001b[0m \u001b[38;5;66;03m# Explicit copy, or required since NumPy can't view from / to object.\u001b[39;00m\n\u001b[1;32m--> 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43marr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[0;32m 135\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n", + "\u001b[1;31mValueError\u001b[0m: could not convert string to float: '9.0 (Pie)'" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LinearRegression, LogisticRegression\n", + "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", + "from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score\n", + "import re\n", + "\n", + "# Загружаем набор данных\n", + "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "\n", + "# Удаление пустого столбца по имени\n", + "df = df.drop('Unnamed: 0', axis=1)\n", + "\n", + "# Извлечение числовых значений из столбца Battery\n", + "df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", + "df['Ram'] = df['Ram'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", + "df['Camera'] = 
df['Camera'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", + "df['Display'] = df['Display'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", + "\n", + "# Преобразуем категориальные переменные в числовые\n", + "# Предположим, что 'company' и 'Processor_name' являются категориальными\n", + "df = pd.get_dummies(df, columns=['company', 'Processor_name'], drop_first=True)\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", + "X_reg = df.drop('Price', axis=1)\n", + "y_reg = df['Price']\n", + "\n", + "# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n", + "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n", + "\n", + "# Стандартизируем признаки для задачи регрессии\n", + "scaler_reg = StandardScaler()\n", + "X_train_reg = scaler_reg.fit_transform(X_train_reg)\n", + "X_test_reg = scaler_reg.transform(X_test_reg)\n", + "\n", + "# Список моделей для задачи регрессии\n", + "models_reg = {\n", + " \"Linear Regression\": LinearRegression(),\n", + " \"Random Forest Regression\": RandomForestRegressor(),\n", + " \"Gradient Boosting Regression\": GradientBoostingRegressor()\n", + "}\n", + "\n", + "# Обучаем и оцениваем модели для задачи регрессии\n", + "print(\"Результаты для задачи регрессии:\")\n", + "for name, model in models_reg.items():\n", + " model.fit(X_train_reg, y_train_reg)\n", + " y_pred_reg = model.predict(X_test_reg)\n", + " mae = mean_absolute_error(y_test_reg, y_pred_reg)\n", + " mse = mean_squared_error(y_test_reg, y_pred_reg)\n", + " rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n", + " r2 = r2_score(y_test_reg, y_pred_reg)\n", + " print(f\"Model: {name}\")\n", + " print(f\"MAE: {mae}\")\n", + " print(f\"MSE: {mse}\")\n", + " print(f\"RMSE: {rmse}\")\n", + " print(f\"R²: {r2}\")\n", + " print()\n", + "\n", + "# Разделяем 
данные на признаки (X) и целевую переменную (y) для задачи классификации\n", + "# Предположим, что мы классифицируем устройства по рейтингу выше среднего\n", + "X_class = df.drop('Rating', axis=1)\n", + "y_class = (df['Rating'] > df['Rating'].mean()).astype(int)\n", + "\n", + "# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n", + "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n", + "\n", + "# Стандартизируем признаки для задачи классификации\n", + "scaler_class = StandardScaler()\n", + "X_train_class = scaler_class.fit_transform(X_train_class)\n", + "X_test_class = scaler_class.transform(X_test_class)\n", + "\n", + "# Список моделей для задачи классификации\n", + "models_class = {\n", + " \"Logistic Regression\": LogisticRegression(),\n", + " \"Random Forest Classification\": RandomForestClassifier(),\n", + " \"Gradient Boosting Classification\": GradientBoostingClassifier()\n", + "}\n", + "\n", + "# Обучаем и оцениваем модели для задачи классификации\n", + "print(\"Результаты для задачи классификации:\")\n", + "for name, model in models_class.items():\n", + " model.fit(X_train_class, y_train_class)\n", + " y_pred_class = model.predict(X_test_class)\n", + " accuracy = accuracy_score(y_test_class, y_pred_class)\n", + " print(f\"Model: {name}\")\n", + " print(f\"Accuracy: {accuracy}\")\n", + " print()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aimenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 98aa97904bae8d416c17366392de34665dd8922d Mon Sep 17 00:00:00 2001 From: "a.puchkina" Date: Sun, 8 Dec 2024 23:25:41 +0400 Subject: [PATCH 
2/4] =?UTF-8?q?=D0=BB=D1=8F=D0=BB=D1=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_4/lab4.ipynb | 1188 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 1141 insertions(+), 47 deletions(-) diff --git a/lab_4/lab4.ipynb b/lab_4/lab4.ipynb index 2af2575..45a40d5 100644 --- a/lab_4/lab4.ipynb +++ b/lab_4/lab4.ipynb @@ -9,18 +9,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Index(['Unnamed: 0', 'Name', 'Rating', 'Spec_score', 'No_of_sim', 'Ram',\n", - " 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version',\n", - " 'Price', 'company', 'Inbuilt_memory', 'fast_charging',\n", - " 'Screen_resolution', 'Processor', 'Processor_name'],\n", - " dtype='object')\n", "Index(['Unnamed: 0', 'Name', 'Brand', 'Model', 'Battery capacity (mAh)',\n", " 'Screen size (inches)', 'Touchscreen', 'Resolution x', 'Resolution y',\n", " 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera',\n", @@ -32,9 +27,7 @@ ], "source": [ "import pandas as pd\n", - "df1 = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n", "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", - "print(df1.columns)\n", "print(df.columns)" ] }, @@ -69,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -226,7 +219,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -695,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -1051,32 +1044,84 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { - "ename": "ValueError", - "evalue": "could not convert string to float: '9.0 (Pie)'", - "output_type": "error", - "traceback": [ - 
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[19], line 39\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;66;03m# Стандартизируем признаки для задачи регрессии\u001b[39;00m\n\u001b[0;32m 38\u001b[0m scaler_reg \u001b[38;5;241m=\u001b[39m StandardScaler()\n\u001b[1;32m---> 39\u001b[0m X_train_reg \u001b[38;5;241m=\u001b[39m \u001b[43mscaler_reg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train_reg\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 40\u001b[0m X_test_reg \u001b[38;5;241m=\u001b[39m scaler_reg\u001b[38;5;241m.\u001b[39mtransform(X_test_reg)\n\u001b[0;32m 42\u001b[0m \u001b[38;5;66;03m# Список моделей для задачи регрессии\u001b[39;00m\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:316\u001b[0m, in \u001b[0;36m_wrap_method_output..wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 315\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 316\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, 
\u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 318\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 319\u001b[0m return_tuple \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 320\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 321\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 322\u001b[0m )\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1098\u001b[0m, in \u001b[0;36mTransformerMixin.fit_transform\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m 1083\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 1084\u001b[0m (\n\u001b[0;32m 1085\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThis object (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) has a `transform`\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1093\u001b[0m \u001b[38;5;167;01mUserWarning\u001b[39;00m,\n\u001b[0;32m 1094\u001b[0m )\n\u001b[0;32m 1096\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m y \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 1097\u001b[0m \u001b[38;5;66;03m# fit method of arity 1 (unsupervised transformation)\u001b[39;00m\n\u001b[1;32m-> 1098\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_params\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mtransform(X)\n\u001b[0;32m 1099\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1100\u001b[0m \u001b[38;5;66;03m# fit method of arity 2 (supervised transformation)\u001b[39;00m\n\u001b[0;32m 1101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\u001b[38;5;241m.\u001b[39mtransform(X)\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_data.py:878\u001b[0m, in \u001b[0;36mStandardScaler.fit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 876\u001b[0m \u001b[38;5;66;03m# Reset internal state before fitting\u001b[39;00m\n\u001b[0;32m 877\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset()\n\u001b[1;32m--> 878\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpartial_fit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 
1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\preprocessing\\_data.py:914\u001b[0m, in \u001b[0;36mStandardScaler.partial_fit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 882\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Online computation of mean and std on X for later scaling.\u001b[39;00m\n\u001b[0;32m 883\u001b[0m \n\u001b[0;32m 884\u001b[0m \u001b[38;5;124;03mAll of X is processed as a single batch. This is intended for cases\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 911\u001b[0m \u001b[38;5;124;03m Fitted scaler.\u001b[39;00m\n\u001b[0;32m 912\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 913\u001b[0m first_call \u001b[38;5;241m=\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_samples_seen_\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 914\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 915\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 916\u001b[0m \u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcsr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcsc\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 917\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mFLOAT_DTYPES\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 918\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_all_finite\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mallow-nan\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 919\u001b[0m \u001b[43m \u001b[49m\u001b[43mreset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfirst_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 920\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 921\u001b[0m n_features \u001b[38;5;241m=\u001b[39m X\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 923\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m sample_weight \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\base.py:633\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[0;32m 631\u001b[0m out \u001b[38;5;241m=\u001b[39m X, y\n\u001b[0;32m 632\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_val_X \u001b[38;5;129;01mand\u001b[39;00m no_val_y:\n\u001b[1;32m--> 633\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minput_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mX\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcheck_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 634\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m no_val_X \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_val_y:\n\u001b[0;32m 635\u001b[0m out \u001b[38;5;241m=\u001b[39m _check_y(y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcheck_params)\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\validation.py:929\u001b[0m, in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 924\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pandas_requires_conversion:\n\u001b[0;32m 925\u001b[0m \u001b[38;5;66;03m# pandas dataframe requires conversion earlier to handle extension dtypes with\u001b[39;00m\n\u001b[0;32m 926\u001b[0m \u001b[38;5;66;03m# nans\u001b[39;00m\n\u001b[0;32m 927\u001b[0m \u001b[38;5;66;03m# Use the original dtype for conversion if dtype is None\u001b[39;00m\n\u001b[0;32m 928\u001b[0m new_dtype \u001b[38;5;241m=\u001b[39m dtype_orig \u001b[38;5;28;01mif\u001b[39;00m dtype \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m dtype\n\u001b[1;32m--> 929\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43marray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnew_dtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 930\u001b[0m \u001b[38;5;66;03m# Since we converted here, we do not need to convert again later\u001b[39;00m\n\u001b[0;32m 931\u001b[0m dtype \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File 
\u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\generic.py:6643\u001b[0m, in \u001b[0;36mNDFrame.astype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 6637\u001b[0m results \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 6638\u001b[0m ser\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy, errors\u001b[38;5;241m=\u001b[39merrors) \u001b[38;5;28;01mfor\u001b[39;00m _, ser \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems()\n\u001b[0;32m 6639\u001b[0m ]\n\u001b[0;32m 6641\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 6642\u001b[0m \u001b[38;5;66;03m# else, only a single dtype is given\u001b[39;00m\n\u001b[1;32m-> 6643\u001b[0m new_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mgr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 6644\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_constructor_from_mgr(new_data, axes\u001b[38;5;241m=\u001b[39mnew_data\u001b[38;5;241m.\u001b[39maxes)\n\u001b[0;32m 6645\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mastype\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:430\u001b[0m, in 
\u001b[0;36mBaseBlockManager.astype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 427\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m using_copy_on_write():\n\u001b[0;32m 428\u001b[0m copy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m--> 430\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mastype\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 435\u001b[0m \u001b[43m \u001b[49m\u001b[43musing_cow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43musing_copy_on_write\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 436\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:363\u001b[0m, in \u001b[0;36mBaseBlockManager.apply\u001b[1;34m(self, f, align_keys, **kwargs)\u001b[0m\n\u001b[0;32m 361\u001b[0m applied \u001b[38;5;241m=\u001b[39m b\u001b[38;5;241m.\u001b[39mapply(f, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 362\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 363\u001b[0m applied \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 364\u001b[0m result_blocks \u001b[38;5;241m=\u001b[39m extend_blocks(applied, result_blocks)\n\u001b[0;32m 366\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mfrom_blocks(result_blocks, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maxes)\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\internals\\blocks.py:758\u001b[0m, in \u001b[0;36mBlock.astype\u001b[1;34m(self, dtype, copy, errors, using_cow, squeeze)\u001b[0m\n\u001b[0;32m 755\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCan not squeeze with more than one column.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 756\u001b[0m values \u001b[38;5;241m=\u001b[39m values[\u001b[38;5;241m0\u001b[39m, :] \u001b[38;5;66;03m# type: ignore[call-overload]\u001b[39;00m\n\u001b[1;32m--> 758\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array_safe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 760\u001b[0m new_values \u001b[38;5;241m=\u001b[39m maybe_coerce_values(new_values)\n\u001b[0;32m 762\u001b[0m refs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File 
\u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:237\u001b[0m, in \u001b[0;36mastype_array_safe\u001b[1;34m(values, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 234\u001b[0m dtype \u001b[38;5;241m=\u001b[39m dtype\u001b[38;5;241m.\u001b[39mnumpy_dtype\n\u001b[0;32m 236\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 237\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 238\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[0;32m 239\u001b[0m \u001b[38;5;66;03m# e.g. _astype_nansafe can fail on object-dtype of strings\u001b[39;00m\n\u001b[0;32m 240\u001b[0m \u001b[38;5;66;03m# trying to convert to float\u001b[39;00m\n\u001b[0;32m 241\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:182\u001b[0m, in \u001b[0;36mastype_array\u001b[1;34m(values, dtype, copy)\u001b[0m\n\u001b[0;32m 179\u001b[0m values \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[0;32m 181\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 182\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43m_astype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 184\u001b[0m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[0;32m 185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:133\u001b[0m, in \u001b[0;36m_astype_nansafe\u001b[1;34m(arr, dtype, copy, skipna)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n\u001b[0;32m 131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m copy \u001b[38;5;129;01mor\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mobject\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m dtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[0;32m 132\u001b[0m \u001b[38;5;66;03m# Explicit copy, or required since NumPy can't view from / to object.\u001b[39;00m\n\u001b[1;32m--> 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43marr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[0;32m 135\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n", - "\u001b[1;31mValueError\u001b[0m: could not convert string to float: '9.0 (Pie)'" + "name": "stdout", + "output_type": "stream", + "text": [ + "Результаты для задачи 
регрессии:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: Linear Regression\n", + "MAE: 5424.128251450574\n", + "MSE: 143399695.59118932\n", + "RMSE: 11974.961193723733\n", + "R²: 0.4559750691933513\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: Random Forest Regression\n", + "MAE: 4022.856102941176\n", + "MSE: 107013897.40805219\n", + "RMSE: 10344.752167551052\n", + "R²: 0.5940142836932047\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: Gradient Boosting Regression\n", + "MAE: 4136.035916841655\n", + "MSE: 90628419.64103268\n", + "RMSE: 9519.895988981849\n", + "R²: 0.656176956854307\n", + "\n", + "Результаты для задачи классификации:\n", + "Model: Logistic Regression\n", + "Accuracy: 0.8137254901960784\n", + "\n", + "Model: Random Forest Classification\n", + "Accuracy: 0.8627450980392157\n", + "\n", + "Model: Gradient Boosting Classification\n", + "Accuracy: 0.8578431372549019\n", + "\n" ] } ], @@ -1088,7 +1133,6 @@ "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", "from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score\n", - "import re\n", "\n", "# Загружаем набор данных\n", "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", @@ -1096,15 +1140,16 @@ "# Удаление пустого столбца по имени\n", "df = df.drop('Unnamed: 0', axis=1)\n", "\n", - "# Извлечение числовых значений из столбца Battery\n", - "df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", - "df['Ram'] = df['Ram'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", - "df['Camera'] = df['Camera'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", - "df['Display'] = df['Display'].apply(lambda x: int(re.search(r'\\d+', x).group()) if re.search(r'\\d+', x) else None)\n", - "\n", "# Преобразуем категориальные переменные в числовые\n", - "# Предположим, что 'company' и 'Processor_name' являются категориальными\n", - "df = pd.get_dummies(df, columns=['company', 'Processor_name'], drop_first=True)\n", + "df = pd.get_dummies(df, columns=['Name', 
'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE'], drop_first=True)\n", + "\n", + "# Убеждаемся, что все столбцы, которые должны быть числовыми, действительно являются числовыми\n", + "numeric_columns = ['Battery capacity (mAh)', 'Screen size (inches)', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Price']\n", + "for column in numeric_columns:\n", + " df[column] = pd.to_numeric(df[column], errors='coerce')\n", + "\n", + "# Удаляем строки с пропущенными значениями, которые могли возникнуть при преобразовании\n", + "df.dropna(subset=numeric_columns, inplace=True)\n", "\n", "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", "X_reg = df.drop('Price', axis=1)\n", @@ -1142,12 +1187,11 @@ " print()\n", "\n", "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", - "# Предположим, что мы классифицируем устройства по рейтингу выше среднего\n", - "X_class = df.drop('Rating', axis=1)\n", - "y_class = (df['Rating'] > df['Rating'].mean()).astype(int)\n", + "X_class = df.drop('Price', axis=1)\n", + "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", "\n", "# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n", - "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n", + "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.3, random_state=42)\n", "\n", "# Стандартизируем признаки для задачи классификации\n", "scaler_class = StandardScaler()\n", @@ -1171,6 +1215,1056 @@ " print(f\"Accuracy: {accuracy}\")\n", " print()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. 
Прогнозирование стоимости мобильных телефонов: \n", + "\n", + "Конвейер для задачи регрессии:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Результаты для задачи регрессии:\n", + "Model: Linear Regression\n", + "MAE: 4695.532622690401\n", + "MSE: 65643669.229680344\n", + "RMSE: 8102.078080941972\n", + "R²: 0.6878208068254403\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: Random Forest Regression\n", + "MAE: 4002.1279166666664\n", + "MSE: 88540074.860938\n", + "RMSE: 9409.573574872455\n", + "R²: 0.578933209278842\n", + "\n", + "Model: Gradient Boosting Regression\n", + "MAE: 4080.6847402556627\n", + "MSE: 75338600.25148971\n", + "RMSE: 8679.781117717757\n", + "R²: 0.6417149784982356\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", + "\n", + "# Загружаем набор данных\n", + "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "\n", + "# Удаление пустого столбца по имени\n", + "df = df.drop('Unnamed: 0', axis=1)\n", + "\n", + "# Определяем категориальные и числовые столбцы\n", + "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", + "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Number of SIMs']\n", + "\n", + "# Создаем преобразователь для категориальных и числовых столбцов\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", + " ('num', StandardScaler(), numerical_cols)\n", + " ])\n", + "\n", + "# Список моделей для задачи регрессии\n", + "models_reg = {\n", + " \"Linear Regression\": LinearRegression(),\n", + " \"Random Forest Regression\": RandomForestRegressor(),\n", + " \"Gradient Boosting Regression\": GradientBoostingRegressor()\n", + "}\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", + "X_reg = df[categorical_cols + numerical_cols]\n", + "y_reg = df['Price']\n", 
+ "\n", + "# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n", + "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)\n", + "\n", + "# Обучаем и оцениваем модели для задачи регрессии\n", + "print(\"Результаты для задачи регрессии:\")\n", + "for name, model in models_reg.items():\n", + " pipeline = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + " pipeline.fit(X_train_reg, y_train_reg)\n", + " y_pred_reg = pipeline.predict(X_test_reg)\n", + " mae = mean_absolute_error(y_test_reg, y_pred_reg)\n", + " mse = mean_squared_error(y_test_reg, y_pred_reg)\n", + " rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n", + " r2 = r2_score(y_test_reg, y_pred_reg)\n", + " print(f\"Model: {name}\")\n", + " print(f\"MAE: {mae}\")\n", + " print(f\"MSE: {mse}\")\n", + " print(f\"RMSE: {rmse}\")\n", + " print(f\"R²: {r2}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. 
Оптимизация характеристик продукта:\n", + "\n", + " Конвейер для задачи классификации:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Результаты для задачи классификации:\n", + "Model: Logistic Regression\n", + "Accuracy: 0.8713235294117647\n", + "\n", + "Model: Random Forest Classification\n", + "Accuracy: 0.8786764705882353\n", + "\n", + "Model: Gradient Boosting Classification\n", + "Accuracy: 0.8676470588235294\n", + "\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", + "\n", + "# Загружаем набор данных\n", + "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "\n", + "# Удаление пустого столбца по имени\n", + "df = df.drop('Unnamed: 0', axis=1)\n", + "\n", + "# Определяем категориальные и числовые столбцы\n", + "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", + "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Number of SIMs']\n", + "\n", + "# Создаем преобразователь для категориальных и числовых столбцов\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", + " ('num', StandardScaler(), numerical_cols)\n", + " ])\n", + "\n", + "# 
Список моделей для задачи классификации\n", + "models_class = {\n", + " \"Logistic Regression\": LogisticRegression(),\n", + " \"Random Forest Classification\": RandomForestClassifier(),\n", + " \"Gradient Boosting Classification\": GradientBoostingClassifier()\n", + "}\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", + "X_class = df[categorical_cols + numerical_cols]\n", + "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", + "\n", + "# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n", + "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n", + "\n", + "# Обучаем и оцениваем модели для задачи классификации\n", + "print(\"Результаты для задачи классификации:\")\n", + "for name, model in models_class.items():\n", + " pipeline = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + " pipeline.fit(X_train_class, y_train_class)\n", + " y_pred_class = pipeline.predict(X_test_class)\n", + " accuracy = accuracy_score(y_test_class, y_pred_class)\n", + " print(f\"Model: {name}\")\n", + " print(f\"Accuracy: {accuracy}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. 
Прогнозирование стоимости мобильных телефонов:\n", + "\n", + "Настройка гиперпараметров для задачи регрессии:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Результаты для задачи регрессии:\n", + "Model: Linear Regression\n", + "Best Parameters: {}\n", + "MAE: 4695.532622690401\n", + "MSE: 65643669.229680344\n", + "RMSE: 8102.078080941972\n", + "R²: 0.6878208068254403\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", + "\n", + "# Загружаем набор данных\n", + "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "\n", + "# Удаление пустого столбца по имени\n", + "df = df.drop('Unnamed: 0', axis=1)\n", + "\n", + "# Определяем категориальные и числовые столбцы\n", + "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", + "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 
'Front camera', 'Number of SIMs']\n", + "\n", + "# Создаем преобразователь для категориальных и числовых столбцов\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", + " ('num', StandardScaler(), numerical_cols)\n", + " ])\n", + "\n", + "# Список моделей и их гиперпараметров для задачи регрессии\n", + "models_reg = {\n", + " \"Linear Regression\": (LinearRegression(), {}),\n", + " \"Random Forest Regression\": (RandomForestRegressor(), {\n", + " 'model__n_estimators': [100, 200],\n", + " 'model__max_depth': [None, 10, 20]\n", + " }),\n", + " \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n", + " 'model__n_estimators': [100, 200],\n", + " 'model__learning_rate': [0.01, 0.1],\n", + " 'model__max_depth': [3, 5]\n", + " })\n", + "}\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", + "X_reg = df[categorical_cols + numerical_cols]\n", + "y_reg = df['Price']\n", + "\n", + "# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n", + "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)\n", + "\n", + "# Обучаем и оцениваем модели для задачи регрессии\n", + "print(\"Результаты для задачи регрессии:\")\n", + "for name, (model, params) in models_reg.items():\n", + " pipeline = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + " grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n", + " grid_search.fit(X_train_reg, y_train_reg)\n", + " best_model = grid_search.best_estimator_\n", + " y_pred_reg = best_model.predict(X_test_reg)\n", + " mae = mean_absolute_error(y_test_reg, y_pred_reg)\n", + " mse = mean_squared_error(y_test_reg, y_pred_reg)\n", + " rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n", + " r2 = r2_score(y_test_reg, y_pred_reg)\n", 
+ " print(f\"Model: {name}\")\n", + " print(f\"Best Parameters: {grid_search.best_params_}\")\n", + " print(f\"MAE: {mae}\")\n", + " print(f\"MSE: {mse}\")\n", + " print(f\"RMSE: {rmse}\")\n", + " print(f\"R²: {r2}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. Оптимизация характеристик продукта:\n", + "\n", + "Настройка гиперпараметров для задачи классификации:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Результаты для задачи классификации:\n", + "Model: Logistic Regression\n", + "Best Parameters: {'model__C': 1, 'model__solver': 'lbfgs'}\n", + "Accuracy: 0.8713235294117647\n", + "\n", + "Model: Random Forest Classification\n", + "Best Parameters: {'model__max_depth': None, 'model__n_estimators': 200}\n", + "Accuracy: 0.8676470588235294\n", + "\n", + "Model: Gradient Boosting Classification\n", + "Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}\n", + "Accuracy: 0.8602941176470589\n", + "\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "# Загружаем набор данных\n", + "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "\n", + "# Удаление пустого столбца по имени\n", + "df = df.drop('Unnamed: 0', axis=1)\n", + "\n", + "# Определяем категориальные и числовые столбцы\n", + "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 
'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", + "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Number of SIMs']\n", + "\n", + "# Создаем преобразователь для категориальных и числовых столбцов\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", + " ('num', StandardScaler(), numerical_cols)\n", + " ])\n", + "\n", + "# Список моделей и их гиперпараметров для задачи классификации\n", + "models_class = {\n", + " \"Logistic Regression\": (LogisticRegression(), {\n", + " 'model__C': [0.1, 1, 10],\n", + " 'model__solver': ['liblinear', 'lbfgs']\n", + " }),\n", + " \"Random Forest Classification\": (RandomForestClassifier(), {\n", + " 'model__n_estimators': [100, 200],\n", + " 'model__max_depth': [None, 10, 20]\n", + " }),\n", + " \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n", + " 'model__n_estimators': [100, 200],\n", + " 'model__learning_rate': [0.01, 0.1],\n", + " 'model__max_depth': [3, 5]\n", + " })\n", + "}\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", + "X_class = df[categorical_cols + numerical_cols]\n", + "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", + "\n", + "# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n", + "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n", + "\n", + "# Обучаем и оцениваем модели для задачи классификации\n", + "print(\"Результаты для задачи классификации:\")\n", + "for name, (model, params) in models_class.items():\n", + " pipeline = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + " grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n", + 
" grid_search.fit(X_train_class, y_train_class)\n", + " best_model = grid_search.best_estimator_\n", + " y_pred_class = best_model.predict(X_test_class)\n", + " accuracy = accuracy_score(y_test_class, y_pred_class)\n", + " print(f\"Model: {name}\")\n", + " print(f\"Best Parameters: {grid_search.best_params_}\")\n", + " print(f\"Accuracy: {accuracy}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Прогнозирование стоимости страховых взносов: Задача: Регрессия\n", + "\n", + "Выбор метрик:\n", + "\n", + "MAE (Mean Absolute Error): Средняя абсолютная ошибка. Показывает среднее отклонение предсказанных значений от фактических. Эта метрика легко интерпретируется, так как она измеряется в тех же единицах, что и целевая переменная (доллары).\n", + "\n", + "MSE (Mean Squared Error): Среднеквадратичная ошибка. Показывает среднее квадратичное отклонение предсказанных значений от фактических. Эта метрика чувствительна к выбросам, так как ошибки возводятся в квадрат.\n", + "\n", + "RMSE (Root Mean Squared Error): Квадратный корень из среднеквадратичной ошибки. Показывает среднее отклонение предсказанных значений от фактических в тех же единицах, что и целевая переменная. Эта метрика также чувствительна к выбросам, но легче интерпретируется, чем MSE.\n", + "\n", + "R² (R-squared): Коэффициент детерминации. Показывает, какую долю дисперсии целевой переменной объясняет модель. Значение R² близкое к 1 указывает на хорошее качество модели.\n", + "\n", + "Обоснование:\n", + "\n", + "MAE: Хорошо подходит для задач, где важно понимать среднее отклонение предсказаний от фактических значений.\n", + "\n", + "MSE и RMSE: Полезны для задач, где важно минимизировать влияние выбросов, так как они возводят ошибки в квадрат.\n", + "\n", + "R²: Позволяет оценить, насколько хорошо модель объясняет вариацию целевой переменной.\n", + "\n", + "2. 
Оптимизация характеристик продукта: Задача: Классификация\n", + "\n", + "Выбор метрик:\n", + "\n", + "Accuracy: Доля правильных предсказаний среди всех предсказаний. Эта метрика показывает общую точность модели.\n", + "\n", + "Precision: Доля правильных положительных предсказаний среди всех положительных предсказаний. Эта метрика важна, если нужно минимизировать количество ложноположительных результатов.\n", + "\n", + "Recall (Sensitivity): Доля правильных положительных предсказаний среди всех фактических положительных случаев. Эта метрика важна, если нужно минимизировать количество ложноотрицательных результатов.\n", + "\n", + "F1-score: Гармоническое среднее между precision и recall. Эта метрика показывает баланс между precision и recall.\n", + "\n", + "Обоснование:\n", + "\n", + "Accuracy: Хорошо подходит для задач, где классы сбалансированы.\n", + "\n", + "Precision и Recall: Важны для задач, где важно минимизировать ошибки определенного типа (ложноположительные или ложноотрицательные).\n", + "\n", + "F1-score: Позволяет оценить баланс между precision и recall." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Результаты для задачи регрессии:\n", + "Model: Linear Regression\n", + "Best Parameters: {}\n", + "MAE: 4808.184185261903\n", + "MSE: 77191075.38529098\n", + "RMSE: 8785.845171939407\n", + "R²: 0.7071551004885543\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: Random Forest Regression\n", + "Best Parameters: {'model__max_depth': None, 'model__n_estimators': 100}\n", + "MAE: 3931.812022058823\n", + "MSE: 99133452.49149376\n", + "RMSE: 9956.578352601548\n", + "R²: 0.6239108499500701\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: Gradient Boosting Regression\n", + "Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}\n", + "MAE: 3741.1262483893015\n", + "MSE: 76106789.83221506\n", + "RMSE: 8723.920553983458\n", + "R²: 0.711268626466102\n", + "\n", + "Результаты для задачи классификации:\n", + "Model: Logistic Regression\n", + "Best Parameters: {'model__C': 1, 'model__solver': 'lbfgs'}\n", + "Accuracy: 0.8713235294117647\n", + "Precision: 0.8035714285714286\n", + "Recall: 0.6521739130434783\n", + "F1-score: 0.72\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: Random Forest Classification\n", + "Best Parameters: {'model__max_depth': 20, 'model__n_estimators': 200}\n", + "Accuracy: 0.8676470588235294\n", + "Precision: 0.851063829787234\n", + "Recall: 0.5797101449275363\n", + "F1-score: 0.6896551724137931\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: Gradient Boosting Classification\n", + "Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}\n", + "Accuracy: 0.8676470588235294\n", + "Precision: 0.8113207547169812\n", + "Recall: 0.6231884057971014\n", + "F1-score: 0.7049180327868853\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LinearRegression, LogisticRegression\n", + "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", + "from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay\n", + "\n", + "# Загружаем набор данных\n", + "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "\n", + "# Удаление пустого столбца по имени\n", + "df = df.drop('Unnamed: 0', axis=1)\n", + "\n", + "# Определяем категориальные и числовые столбцы\n", + "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", + "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Number of SIMs']\n", + "\n", + "# Создаем преобразователь для категориальных и числовых столбцов\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", + " ('num', StandardScaler(), numerical_cols)\n", + " ])\n", + "\n", + "# Список моделей и их гиперпараметров для задачи регрессии\n", + "models_reg = {\n", + " \"Linear Regression\": (LinearRegression(), {}),\n", + " \"Random Forest Regression\": (RandomForestRegressor(), {\n", + " 
'model__n_estimators': [100, 200],\n", + " 'model__max_depth': [None, 10, 20]\n", + " }),\n", + " \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n", + " 'model__n_estimators': [100, 200],\n", + " 'model__learning_rate': [0.01, 0.1],\n", + " 'model__max_depth': [3, 5]\n", + " })\n", + "}\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", + "X_reg = df[categorical_cols + numerical_cols]\n", + "y_reg = df['Price']\n", + "\n", + "# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n", + "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n", + "\n", + "# Обучаем и оцениваем модели для задачи регрессии\n", + "print(\"Результаты для задачи регрессии:\")\n", + "for name, (model, params) in models_reg.items():\n", + " pipeline = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + " grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n", + " grid_search.fit(X_train_reg, y_train_reg)\n", + " best_model = grid_search.best_estimator_\n", + " y_pred_reg = best_model.predict(X_test_reg)\n", + " mae = mean_absolute_error(y_test_reg, y_pred_reg)\n", + " mse = mean_squared_error(y_test_reg, y_pred_reg)\n", + " rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n", + " r2 = r2_score(y_test_reg, y_pred_reg)\n", + " print(f\"Model: {name}\")\n", + " print(f\"Best Parameters: {grid_search.best_params_}\")\n", + " print(f\"MAE: {mae}\")\n", + " print(f\"MSE: {mse}\")\n", + " print(f\"RMSE: {rmse}\")\n", + " print(f\"R²: {r2}\")\n", + " print()\n", + "\n", + "# Список моделей и их гиперпараметров для задачи классификации\n", + "models_class = {\n", + " \"Logistic Regression\": (LogisticRegression(), {\n", + " 'model__C': [0.1, 1, 10],\n", + " 'model__solver': ['liblinear', 'lbfgs']\n", + " }),\n", + " \"Random Forest Classification\": 
(RandomForestClassifier(), {\n", + " 'model__n_estimators': [100, 200],\n", + " 'model__max_depth': [None, 10, 20]\n", + " }),\n", + " \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n", + " 'model__n_estimators': [100, 200],\n", + " 'model__learning_rate': [0.01, 0.1],\n", + " 'model__max_depth': [3, 5]\n", + " })\n", + "}\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", + "X_class = df[categorical_cols + numerical_cols]\n", + "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", + "\n", + "# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n", + "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n", + "\n", + "# Обучаем и оцениваем модели для задачи классификации\n", + "print(\"Результаты для задачи классификации:\")\n", + "for name, (model, params) in models_class.items():\n", + " pipeline = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + " grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n", + " grid_search.fit(X_train_class, y_train_class)\n", + " best_model = grid_search.best_estimator_\n", + " y_pred_class = best_model.predict(X_test_class)\n", + " accuracy = accuracy_score(y_test_class, y_pred_class)\n", + " precision = precision_score(y_test_class, y_pred_class)\n", + " recall = recall_score(y_test_class, y_pred_class)\n", + " f1 = f1_score(y_test_class, y_pred_class)\n", + " print(f\"Model: {name}\")\n", + " print(f\"Best Parameters: {grid_search.best_params_}\")\n", + " print(f\"Accuracy: {accuracy}\")\n", + " print(f\"Precision: {precision}\")\n", + " print(f\"Recall: {recall}\")\n", + " print(f\"F1-score: {f1}\")\n", + " print()\n", + "\n", + " # Визуализация матрицы ошибок\n", + " cm = confusion_matrix(y_test_class, y_pred_class)\n", + " disp = ConfusionMatrixDisplay(confusion_matrix=cm, 
display_labels=['Less', 'More'])\n", + " disp.plot(cmap=plt.cm.Blues)\n", + " plt.title(f'Confusion Matrix for {name}')\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Давайте проанализируем полученные значения метрик и определим, являются ли они нормальными или их можно улучшить.\n", + "\n", + "### Оценка смещения и дисперсии для задачи регрессии:\n", + "#### Вывод для задачи регрессии:\n", + "\n", + "- Gradient Boosting Regression демонстрирует наилучшие результаты по метрикам MAE (≈ 3741) и R² (≈ 0.711) на тестовой выборке, что указывает на наиболее высокую точность среди рассмотренных моделей.\n", + "- Linear Regression показывает близкое значение R² (≈ 0.707), но более высокую MAE (≈ 4808); Random Forest Regression занимает второе место по MAE (≈ 3932), но уступает по R² (≈ 0.624).\n", + "\n", + "#### Вывод для задачи классификации:\n", + "- Logistic Regression демонстрирует наилучшие результаты по метрикам Accuracy (≈ 0.871), Recall (≈ 0.652) и F1-score (0.72); Random Forest Classification лидирует только по Precision (≈ 0.851).\n", + "- Gradient Boosting Classification показывает промежуточные результаты (Accuracy ≈ 0.868, Precision ≈ 0.811, Recall ≈ 0.623, F1-score ≈ 0.705).\n", + "\n", + "Для оценки смещения (bias) и дисперсии (variance) моделей можно использовать метод перекрестной проверки (cross-validation). 
Этот метод позволяет оценить, насколько хорошо модель обобщается на новых данных.\n", + "\n", + "Оценка смещения и дисперсии для задачи регрессии: Для задачи регрессии мы будем использовать метрики MAE (Mean Absolute Error) и R² (R-squared) для оценки смещения и дисперсии.\n", + "\n", + "Оценка смещения и дисперсии для задачи классификации: Для задачи классификации мы будем использовать метрики Accuracy, Precision, Recall и F1-score для оценки смещения и дисперсии.\n", + "\n", + "Пример кода для оценки смещения и дисперсии:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Оценка смещения и дисперсии для задачи регрессии:\n", + "Model: Linear Regression\n", + "MAE (Cross-Validation): Mean = 5393.4164031476985, Std = 2364.7028993279946\n", + "R² (Cross-Validation): Mean = 0.31163924349046784, Std = 0.20066100088089248\n", + "\n", + "Model: Random Forest Regression\n", + "MAE (Cross-Validation): Mean = 4632.622847297591, Std = 2958.2113579489355\n", + "R² (Cross-Validation): Mean = 0.37197535985326297, Std = 0.11953059546045496\n", + "\n", + "Model: Gradient Boosting Regression\n", + "MAE (Cross-Validation): Mean = 4705.057344875317, Std = 2676.698617934764\n", + "R² (Cross-Validation): Mean = 0.30094204368967814, Std = 0.29605037554256863\n", + "\n", + "Оценка смещения и дисперсии для задачи классификации:\n", + "Model: Logistic Regression\n", + "Accuracy (Cross-Validation): Mean = 0.8373806164532234, Std = 0.04588247503012167\n", + "Precision (Cross-Validation): Mean = 0.7753980572437796, Std = 0.128244461080632\n", + "Recall (Cross-Validation): Mean = 0.663888888888889, Std = 0.22281171811283865\n", + "F1-score (Cross-Validation): Mean = 0.6730794905263655, Std = 0.09462791851118425\n", + "\n", + "Model: Random Forest Classification\n", + "Accuracy (Cross-Validation): Mean = 0.835177447362709, Std = 0.04406396622710083\n", + "Precision 
(Cross-Validation): Mean = 0.807362406312931, Std = 0.11894947251838416\n", + "Recall (Cross-Validation): Mean = 0.6194444444444445, Std = 0.2382420182487952\n", + "F1-score (Cross-Validation): Mean = 0.6475067867122541, Std = 0.10752644539783694\n", + "\n", + "Model: Gradient Boosting Classification\n", + "Accuracy (Cross-Validation): Mean = 0.8469530062947689, Std = 0.03006931461001492\n", + "Precision (Cross-Validation): Mean = 0.8072507504934535, Std = 0.11323561235031351\n", + "Recall (Cross-Validation): Mean = 0.6250380517503805, Std = 0.23296204577560678\n", + "F1-score (Cross-Validation): Mean = 0.661152357410075, Std = 0.1082825984327667\n", + "\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LinearRegression, LogisticRegression\n", + "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", + "from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "# Загружаем набор данных\n", + "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "\n", + "# Удаление пустого столбца по имени\n", + "df = df.drop('Unnamed: 0', axis=1)\n", + "\n", + "# Определяем категориальные и числовые столбцы\n", + "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", + "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Number of SIMs']\n", + "\n", + "# Создаем преобразователь для категориальных и числовых столбцов\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('cat', 
OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", + " ('num', StandardScaler(), numerical_cols)\n", + " ])\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", + "X_reg = df[categorical_cols + numerical_cols]\n", + "y_reg = df['Price']\n", + "\n", + "# Список моделей для задачи регрессии\n", + "models_reg = {\n", + " \"Linear Regression\": LinearRegression(),\n", + " \"Random Forest Regression\": RandomForestRegressor(),\n", + " \"Gradient Boosting Regression\": GradientBoostingRegressor()\n", + "}\n", + "\n", + "# Оценка смещения и дисперсии для задачи регрессии\n", + "print(\"Оценка смещения и дисперсии для задачи регрессии:\")\n", + "for name, model in models_reg.items():\n", + " pipeline = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + " mae_scores = -cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='neg_mean_absolute_error')\n", + " r2_scores = cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='r2')\n", + " print(f\"Model: {name}\")\n", + " print(f\"MAE (Cross-Validation): Mean = {mae_scores.mean()}, Std = {mae_scores.std()}\")\n", + " print(f\"R² (Cross-Validation): Mean = {r2_scores.mean()}, Std = {r2_scores.std()}\")\n", + " print()\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", + "X_class = df[categorical_cols + numerical_cols]\n", + "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", + "\n", + "# Список моделей для задачи классификации\n", + "models_class = {\n", + " \"Logistic Regression\": LogisticRegression(),\n", + " \"Random Forest Classification\": RandomForestClassifier(),\n", + " \"Gradient Boosting Classification\": GradientBoostingClassifier()\n", + "}\n", + "\n", + "# Оценка смещения и дисперсии для задачи классификации\n", + "print(\"Оценка смещения и дисперсии для задачи классификации:\")\n", + "for name, model in models_class.items():\n", + " pipeline = 
Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + " accuracy_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='accuracy')\n", + " precision_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='precision')\n", + " recall_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='recall')\n", + " f1_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='f1')\n", + " print(f\"Model: {name}\")\n", + " print(f\"Accuracy (Cross-Validation): Mean = {accuracy_scores.mean()}, Std = {accuracy_scores.std()}\")\n", + " print(f\"Precision (Cross-Validation): Mean = {precision_scores.mean()}, Std = {precision_scores.std()}\")\n", + " print(f\"Recall (Cross-Validation): Mean = {recall_scores.mean()}, Std = {recall_scores.std()}\")\n", + " print(f\"F1-score (Cross-Validation): Mean = {f1_scores.mean()}, Std = {f1_scores.std()}\")\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LinearRegression, LogisticRegression\n", + "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", + "from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "# Загружаем набор данных\n", + "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "\n", + "# Удаление пустого столбца по имени\n", + "df = df.drop('Unnamed: 0', axis=1)\n", + "\n", + "# Определяем категориальные и числовые столбцы\n", + "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", + "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Number of SIMs']\n", + "\n", + "# Создаем преобразователь для категориальных и числовых столбцов\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", + " ('num', StandardScaler(), numerical_cols)\n", + " ])\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", + "X_reg = df[categorical_cols + numerical_cols]\n", + "y_reg = df['Price']\n", + "\n", + "# Список моделей для задачи регрессии\n", + "models_reg = {\n", + " \"Linear Regression\": LinearRegression(),\n", + " \"Random Forest Regression\": RandomForestRegressor(),\n", + " \"Gradient Boosting Regression\": GradientBoostingRegressor()\n", + "}\n", + "\n", 
+ "# Оценка смещения и дисперсии для задачи регрессии\n", + "mae_means = []\n", + "mae_stds = []\n", + "r2_means = []\n", + "r2_stds = []\n", + "\n", + "for name, model in models_reg.items():\n", + " pipeline = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + " mae_scores = -cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='neg_mean_absolute_error')\n", + " r2_scores = cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='r2')\n", + " mae_means.append(mae_scores.mean())\n", + " mae_stds.append(mae_scores.std())\n", + " r2_means.append(r2_scores.mean())\n", + " r2_stds.append(r2_scores.std())\n", + "\n", + "# Визуализация результатов для задачи регрессии\n", + "fig, ax = plt.subplots(1, 2, figsize=(12, 6))\n", + "\n", + "ax[0].bar(models_reg.keys(), mae_means, yerr=mae_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", + "ax[0].set_ylabel('MAE')\n", + "ax[0].set_title('Mean Absolute Error (MAE) for Regression Models')\n", + "ax[0].yaxis.grid(True)\n", + "\n", + "ax[1].bar(models_reg.keys(), r2_means, yerr=r2_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", + "ax[1].set_ylabel('R²')\n", + "ax[1].set_title('R-squared (R²) for Regression Models')\n", + "ax[1].yaxis.grid(True)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", + "X_class = df[categorical_cols + numerical_cols]\n", + "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", + "\n", + "# Список моделей для задачи классификации\n", + "models_class = {\n", + " \"Logistic Regression\": LogisticRegression(),\n", + " \"Random Forest Classification\": RandomForestClassifier(),\n", + " \"Gradient Boosting Classification\": GradientBoostingClassifier()\n", + "}\n", + "\n", + "# Оценка смещения и дисперсии для задачи классификации\n", + "accuracy_means = []\n", + "accuracy_stds = []\n", + "precision_means = []\n", + 
"precision_stds = []\n", + "recall_means = []\n", + "recall_stds = []\n", + "f1_means = []\n", + "f1_stds = []\n", + "\n", + "for name, model in models_class.items():\n", + " pipeline = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('model', model)\n", + " ])\n", + " accuracy_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='accuracy')\n", + " precision_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='precision')\n", + " recall_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='recall')\n", + " f1_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='f1')\n", + " accuracy_means.append(accuracy_scores.mean())\n", + " accuracy_stds.append(accuracy_scores.std())\n", + " precision_means.append(precision_scores.mean())\n", + " precision_stds.append(precision_scores.std())\n", + " recall_means.append(recall_scores.mean())\n", + " recall_stds.append(recall_scores.std())\n", + " f1_means.append(f1_scores.mean())\n", + " f1_stds.append(f1_scores.std())\n", + "\n", + "# Визуализация результатов для задачи классификации\n", + "fig, ax = plt.subplots(2, 2, figsize=(12, 12))\n", + "\n", + "ax[0, 0].bar(models_class.keys(), accuracy_means, yerr=accuracy_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", + "ax[0, 0].set_ylabel('Accuracy')\n", + "ax[0, 0].set_title('Accuracy for Classification Models')\n", + "ax[0, 0].yaxis.grid(True)\n", + "\n", + "ax[0, 1].bar(models_class.keys(), precision_means, yerr=precision_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", + "ax[0, 1].set_ylabel('Precision')\n", + "ax[0, 1].set_title('Precision for Classification Models')\n", + "ax[0, 1].yaxis.grid(True)\n", + "\n", + "ax[1, 0].bar(models_class.keys(), recall_means, yerr=recall_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", + "ax[1, 0].set_ylabel('Recall')\n", + "ax[1, 0].set_title('Recall for Classification Models')\n", + "ax[1, 0].yaxis.grid(True)\n", + 
"\n", + "ax[1, 1].bar(models_class.keys(), f1_means, yerr=f1_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", + "ax[1, 1].set_ylabel('F1-score')\n", + "ax[1, 1].set_title('F1-score for Classification Models')\n", + "ax[1, 1].yaxis.grid(True)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] } ], "metadata": { From 8480d8559c306afeadd9a47a3d7b07595e0ca7a8 Mon Sep 17 00:00:00 2001 From: "a.puchkina" Date: Thu, 12 Dec 2024 15:54:39 +0400 Subject: [PATCH 3/4] =?UTF-8?q?=D0=BB=D0=B0=D0=B1=D0=B04!?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_4/lab4.ipynb | 2776 +++++++++++++--------------------------------- 1 file changed, 768 insertions(+), 2008 deletions(-) diff --git a/lab_4/lab4.ipynb b/lab_4/lab4.ipynb index 45a40d5..61ff186 100644 --- a/lab_4/lab4.ipynb +++ b/lab_4/lab4.ipynb @@ -23,41 +23,396 @@ " 'Number of SIMs', '3G', '4G/ LTE', 'Price'],\n", " dtype='object')\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0NameBrandModelBattery capacity (mAh)Screen size (inches)TouchscreenResolution xResolution yProcessor...Rear cameraFront cameraOperating systemWi-FiBluetoothGPSNumber of SIMs3G4G/ LTEPrice
00OnePlus 7T Pro McLaren EditionOnePlus7T Pro McLaren Edition40856.67Yes144031208...48.016.0AndroidYesYesYes2YesYes58998
11Realme X2 ProRealmeX2 Pro40006.50Yes108024008...64.016.0AndroidYesYesYes2YesYes27999
22iPhone 11 Pro MaxAppleiPhone 11 Pro Max39696.50Yes124226886...12.012.0iOSYesYesYes2YesYes106900
33iPhone 11AppleiPhone 1131106.10Yes82817926...12.012.0iOSYesYesYes2YesYes62900
44LG G8X ThinQLGG8X ThinQ40006.40Yes108023408...12.032.0AndroidYesYesYes1NoNo49990
55OnePlus 7TOnePlus7T38006.55Yes108024008...48.016.0AndroidYesYesNo2YesYes34930
66OnePlus 7T ProOnePlus7T Pro40856.67Yes144031208...48.016.0AndroidYesYesYes2YesYes52990
77Samsung Galaxy Note 10+SamsungGalaxy Note 10+43006.80Yes144030408...12.010.0AndroidYesYesYes2YesYes79699
88Asus ROG Phone 2AsusROG Phone 260006.59Yes108023408...48.024.0AndroidYesYesYes1YesYes37999
99Xiaomi Redmi K20 ProXiaomiRedmi K20 Pro40006.39Yes108023408...48.020.0AndroidYesYesYes2NoNo23190
\n", + "

10 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Name Brand \\\n", + "0 0 OnePlus 7T Pro McLaren Edition OnePlus \n", + "1 1 Realme X2 Pro Realme \n", + "2 2 iPhone 11 Pro Max Apple \n", + "3 3 iPhone 11 Apple \n", + "4 4 LG G8X ThinQ LG \n", + "5 5 OnePlus 7T OnePlus \n", + "6 6 OnePlus 7T Pro OnePlus \n", + "7 7 Samsung Galaxy Note 10+ Samsung \n", + "8 8 Asus ROG Phone 2 Asus \n", + "9 9 Xiaomi Redmi K20 Pro Xiaomi \n", + "\n", + " Model Battery capacity (mAh) Screen size (inches) \\\n", + "0 7T Pro McLaren Edition 4085 6.67 \n", + "1 X2 Pro 4000 6.50 \n", + "2 iPhone 11 Pro Max 3969 6.50 \n", + "3 iPhone 11 3110 6.10 \n", + "4 G8X ThinQ 4000 6.40 \n", + "5 7T 3800 6.55 \n", + "6 7T Pro 4085 6.67 \n", + "7 Galaxy Note 10+ 4300 6.80 \n", + "8 ROG Phone 2 6000 6.59 \n", + "9 Redmi K20 Pro 4000 6.39 \n", + "\n", + " Touchscreen Resolution x Resolution y Processor ... Rear camera \\\n", + "0 Yes 1440 3120 8 ... 48.0 \n", + "1 Yes 1080 2400 8 ... 64.0 \n", + "2 Yes 1242 2688 6 ... 12.0 \n", + "3 Yes 828 1792 6 ... 12.0 \n", + "4 Yes 1080 2340 8 ... 12.0 \n", + "5 Yes 1080 2400 8 ... 48.0 \n", + "6 Yes 1440 3120 8 ... 48.0 \n", + "7 Yes 1440 3040 8 ... 12.0 \n", + "8 Yes 1080 2340 8 ... 48.0 \n", + "9 Yes 1080 2340 8 ... 
48.0 \n", + "\n", + " Front camera Operating system Wi-Fi Bluetooth GPS Number of SIMs 3G \\\n", + "0 16.0 Android Yes Yes Yes 2 Yes \n", + "1 16.0 Android Yes Yes Yes 2 Yes \n", + "2 12.0 iOS Yes Yes Yes 2 Yes \n", + "3 12.0 iOS Yes Yes Yes 2 Yes \n", + "4 32.0 Android Yes Yes Yes 1 No \n", + "5 16.0 Android Yes Yes No 2 Yes \n", + "6 16.0 Android Yes Yes Yes 2 Yes \n", + "7 10.0 Android Yes Yes Yes 2 Yes \n", + "8 24.0 Android Yes Yes Yes 1 Yes \n", + "9 20.0 Android Yes Yes Yes 2 No \n", + "\n", + " 4G/ LTE Price \n", + "0 Yes 58998 \n", + "1 Yes 27999 \n", + "2 Yes 106900 \n", + "3 Yes 62900 \n", + "4 No 49990 \n", + "5 Yes 34930 \n", + "6 Yes 52990 \n", + "7 Yes 79699 \n", + "8 Yes 37999 \n", + "9 No 23190 \n", + "\n", + "[10 rows x 22 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", - "print(df.columns)" + "print(df.columns)\n", + "display(df.head(10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Бизнес-цели:\n", - "1. Прогнозирование цен на мобильные телефоны: \n", + "## Регрессия\n", "\n", - "Цель: Разработать точную модель машинного обучения для прогнозирования цен на мобильные телефоны на основе характеристик.\n", + "Цель: Разработать модель регрессии, которая будет предсказывать цену мобильного телефона на основе его технических характеристик и функциональных особенностей.\n", "\n", "Применение:\n", - "Оптимизация ценообразования для производителей и продавцов.\n", - "Помощь потребителям в принятии обоснованных решений при покупке.\n", "\n", - "2. 
Оптимизация характеристик продукта:\n", + "Производители и продавцы мобильных телефонов: Модель может помочь в установлении справедливой цены для новых моделей, основываясь на их технических характеристиках, что может быть полезно для ценообразования и конкуренции на рынке.\n", "\n", - "Цель: Определить оптимальный набор характеристик для мобильных телефонов, максимизирующий ценность для потребителей при заданном бюджете.\n", + "Потребители: Модель может помочь пользователям принимать более обоснованные решения при покупке, сравнивая цены и характеристики различных моделей.\n", "\n", - "Применение:\n", - "Разработка новых моделей мобильных телефонов.\n", - "Улучшение существующих моделей." + "Рыночные аналитики: Модель может быть использована для анализа тенденций на рынке мобильных телефонов, выявления факторов, влияющих на цену, и прогнозирования будущих изменений.\n", + "\n", + "Исследования в области технологий: Модель может помочь в изучении влияния различных технических характеристик на цену и популярность моделей." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 1. 
Прогнозирование цен на мобильные телефоны " + "#### Удаление выбросов" ] }, { @@ -69,123 +424,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Среднее значение поля 'Price': 11465.825607064018\n", - " Name Brand Model \\\n", - "0 OnePlus 7T Pro McLaren Edition OnePlus 7T Pro McLaren Edition \n", - "1 Realme X2 Pro Realme X2 Pro \n", - "2 iPhone 11 Pro Max Apple iPhone 11 Pro Max \n", - "3 iPhone 11 Apple iPhone 11 \n", - "4 LG G8X ThinQ LG G8X ThinQ \n", - "5 OnePlus 7T OnePlus 7T \n", - "6 OnePlus 7T Pro OnePlus 7T Pro \n", - "7 Samsung Galaxy Note 10+ Samsung Galaxy Note 10+ \n", - "8 Asus ROG Phone 2 Asus ROG Phone 2 \n", - "9 Xiaomi Redmi K20 Pro Xiaomi Redmi K20 Pro \n", - "10 Oppo K3 Oppo K3 \n", - "11 Realme X Realme X \n", - "12 Xiaomi Redmi K20 Xiaomi Redmi K20 \n", - "13 OnePlus 7 Pro OnePlus 7 Pro \n", - "14 Oppo Reno 10x Zoom Oppo Reno 10x Zoom \n", - "15 Realme 3 Pro Realme 3 Pro \n", - "16 Huawei P30 Pro Huawei P30 Pro \n", - "17 Redmi Note 7 Pro Xiaomi Redmi Note 7 Pro \n", - "18 Huawei Mate 20 Pro Huawei Mate 20 Pro \n", - "19 LG V40 ThinQ LG V40 ThinQ \n", - "\n", - " Battery capacity (mAh) Screen size (inches) Touchscreen Resolution x \\\n", - "0 4085 6.67 Yes 1440 \n", - "1 4000 6.50 Yes 1080 \n", - "2 3969 6.50 Yes 1242 \n", - "3 3110 6.10 Yes 828 \n", - "4 4000 6.40 Yes 1080 \n", - "5 3800 6.55 Yes 1080 \n", - "6 4085 6.67 Yes 1440 \n", - "7 4300 6.80 Yes 1440 \n", - "8 6000 6.59 Yes 1080 \n", - "9 4000 6.39 Yes 1080 \n", - "10 3765 6.50 Yes 1080 \n", - "11 3765 6.53 Yes 1080 \n", - "12 4000 6.39 Yes 1080 \n", - "13 4000 6.67 Yes 1440 \n", - "14 4065 6.60 Yes 1080 \n", - "15 4045 6.30 Yes 1080 \n", - "16 4200 6.47 Yes 1080 \n", - "17 4000 6.30 Yes 1080 \n", - "18 4200 6.39 Yes 1440 \n", - "19 3300 6.40 Yes 1440 \n", - "\n", - " Resolution y Processor RAM (MB) ... Operating system Wi-Fi \\\n", - "0 3120 8 12000 ... Android Yes \n", - "1 2400 8 6000 ... Android Yes \n", - "2 2688 6 4000 ... iOS Yes \n", - "3 1792 6 4000 ... 
iOS Yes \n", - "4 2340 8 6000 ... Android Yes \n", - "5 2400 8 8000 ... Android Yes \n", - "6 3120 8 8000 ... Android Yes \n", - "7 3040 8 12000 ... Android Yes \n", - "8 2340 8 8000 ... Android Yes \n", - "9 2340 8 6000 ... Android Yes \n", - "10 2340 8 6000 ... Android Yes \n", - "11 2340 8 4000 ... Android Yes \n", - "12 2340 8 6000 ... Android Yes \n", - "13 3120 8 6000 ... Android Yes \n", - "14 2340 8 6000 ... Android Yes \n", - "15 2340 8 4000 ... Android Yes \n", - "16 2340 8 8000 ... Android Yes \n", - "17 2340 8 4000 ... Android Yes \n", - "18 3120 8 6000 ... Android Yes \n", - "19 3120 8 6000 ... Android Yes \n", - "\n", - " Bluetooth GPS Number of SIMs 3G 4G/ LTE Price above_average_price \\\n", - "0 Yes Yes 2 Yes Yes 58998 1 \n", - "1 Yes Yes 2 Yes Yes 27999 1 \n", - "2 Yes Yes 2 Yes Yes 106900 1 \n", - "3 Yes Yes 2 Yes Yes 62900 1 \n", - "4 Yes Yes 1 No No 49990 1 \n", - "5 Yes No 2 Yes Yes 34930 1 \n", - "6 Yes Yes 2 Yes Yes 52990 1 \n", - "7 Yes Yes 2 Yes Yes 79699 1 \n", - "8 Yes Yes 1 Yes Yes 37999 1 \n", - "9 Yes Yes 2 No No 23190 1 \n", - "10 Yes Yes 2 Yes Yes 23990 1 \n", - "11 Yes Yes 2 Yes Yes 14999 1 \n", - "12 Yes Yes 2 Yes Yes 19282 1 \n", - "13 Yes Yes 2 Yes Yes 39995 1 \n", - "14 Yes Yes 2 Yes Yes 36990 1 \n", - "15 Yes Yes 2 Yes Yes 13999 1 \n", - "16 Yes No 2 Yes Yes 54280 1 \n", - "17 Yes Yes 2 Yes Yes 9799 0 \n", - "18 Yes Yes 2 Yes Yes 63990 1 \n", - "19 Yes Yes 2 Yes Yes 29999 1 \n", - "\n", - " price_volatility \n", - "0 174496 \n", - "1 174496 \n", - "2 174496 \n", - "3 174496 \n", - "4 174496 \n", - "5 174496 \n", - "6 174496 \n", - "7 174496 \n", - "8 174496 \n", - "9 174496 \n", - "10 174496 \n", - "11 174496 \n", - "12 174496 \n", - "13 174496 \n", - "14 174496 \n", - "15 174496 \n", - "16 174496 \n", - "17 174496 \n", - "18 174496 \n", - "19 174496 \n", - "\n", - "[20 rows x 23 columns]\n" + "Размер данных до удаления выбросов: (1359, 21)\n", + "Размер данных после удаления выбросов: (1256, 21)\n" ] } ], "source": [ "import 
pandas as pd\n", + "from scipy import stats\n", "\n", "# Загружаем набор данных\n", "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", @@ -193,28 +439,34 @@ "# Удаление пустого столбца по имени\n", "df = df.drop('Unnamed: 0', axis=1)\n", "\n", - "# Устанавливаем случайное состояние\n", - "random_state = 42\n", + "# Выбор числовых признаков для анализа выбросов\n", + "numeric_features = [\n", + " 'Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', \n", + " 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', \n", + " 'Rear camera', 'Front camera', 'Number of SIMs', 'Price'\n", + "]\n", "\n", - "# Рассчитываем среднее значение цены\n", - "average_price = df['Price'].mean()\n", - "print(f\"Среднее значение поля 'Price': {average_price}\")\n", + "# Вычисление Z-оценок для числовых признаков\n", + "z_scores = stats.zscore(df[numeric_features])\n", "\n", - "# Создаем новую переменную, указывающую, превышает ли цена среднюю\n", - "df['above_average_price'] = (df['Price'] > average_price).astype(int)\n", + "# Определение порога для выбросов\n", + "threshold = 3\n", "\n", - "# Рассчитываем волатильность (разницу между максимальной и минимальной ценой)\n", - "df['price_volatility'] = df['Price'].max() - df['Price'].min()\n", + "# Вывод размеров данных до удаления выбросов\n", + "print(\"Размер данных до удаления выбросов:\", df.shape)\n", "\n", - "# Выводим первые строки измененной таблицы для проверки\n", - "print(df.head(20))" + "# Удаление строк, содержащих выбросы\n", + "df = df[(z_scores < threshold).all(axis=1)]\n", + "\n", + "# Вывод размеров данных после удаления выбросов\n", + "print(\"Размер данных после удаления выбросов:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 2. 
Оптимизация характеристик продукта " + "#### Подготовка данных и оценка базовой модели" ] }, { @@ -226,464 +478,50 @@ "name": "stdout", "output_type": "stream", "text": [ - "Средняя цена для 'Brand':\n", - "Brand\n", - "10.or 5999.000000\n", - "Acer 6470.000000\n", - "Alcatel 9201.600000\n", - "Apple 45510.470588\n", - "Aqua 4599.000000\n", - " ... \n", - "Zopo 6027.916667\n", - "Zuk 29124.000000\n", - "iBall 4051.000000\n", - "iVoomi 4018.875000\n", - "mPhone 6949.000000\n", - "Name: Price, Length: 76, dtype: float64\n", - "\n", - "Средняя цена для 'Battery capacity (mAh)':\n", - "Battery capacity (mAh)\n", - "1010 3499.000000\n", - "1050 4790.000000\n", - "1200 3396.000000\n", - "1250 2498.500000\n", - "1300 3338.333333\n", - " ... \n", - "5000 13542.530612\n", - "5020 23900.000000\n", - "5100 16999.000000\n", - "5300 8999.000000\n", - "6000 19832.333333\n", - "Name: Price, Length: 165, dtype: float64\n", - "\n", - "Средняя цена для 'Screen size (inches)':\n", - "Screen size (inches)\n", - "2.40 1249.000000\n", - "2.44 6999.000000\n", - "2.45 2999.000000\n", - "2.60 5555.000000\n", - "2.80 5345.000000\n", - " ... 
\n", - "6.70 58896.428571\n", - "6.80 79699.000000\n", - "6.90 92999.000000\n", - "7.00 6199.000000\n", - "7.30 164999.000000\n", - "Name: Price, Length: 80, dtype: float64\n", - "\n", - "Средняя цена для 'Touchscreen':\n", - "Touchscreen\n", - "No 5255.411765\n", - "Yes 11544.497019\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Resolution x':\n", - "Resolution x\n", - "240 3819.888889\n", - "320 2499.000000\n", - "360 5797.500000\n", - "400 2860.500000\n", - "480 4128.602459\n", - "540 7445.848485\n", - "560 15990.000000\n", - "600 4829.000000\n", - "640 7697.200000\n", - "720 7619.360684\n", - "750 25999.000000\n", - "768 8872.000000\n", - "800 2190.000000\n", - "828 54199.500000\n", - "850 4115.000000\n", - "854 4415.666667\n", - "1024 6199.000000\n", - "1080 17165.865952\n", - "1125 75632.666667\n", - "1176 77299.000000\n", - "1242 88449.500000\n", - "1280 7144.333333\n", - "1440 34240.461538\n", - "1520 7935.000000\n", - "1536 164999.000000\n", - "1600 29950.000000\n", - "1880 20800.000000\n", - "2160 45500.000000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Resolution y':\n", - "Resolution y\n", - "320 3811.125000\n", - "480 4264.818182\n", - "485 3299.000000\n", - "584 4499.000000\n", - "600 6199.000000\n", - "640 4095.000000\n", - "720 8539.900000\n", - "800 4189.321429\n", - "854 4087.901316\n", - "960 6921.651163\n", - "1080 7583.800000\n", - "1136 12749.000000\n", - "1280 7449.136674\n", - "1290 14990.000000\n", - "1334 25999.000000\n", - "1440 6930.808824\n", - "1480 13596.166667\n", - "1498 4490.000000\n", - "1500 9083.125000\n", - "1520 8139.425000\n", - "1544 10010.000000\n", - "1548 8499.000000\n", - "1560 9135.750000\n", - "1580 9950.000000\n", - "1600 9476.111111\n", - "1620 21492.000000\n", - "1792 54199.500000\n", - "1820 6498.000000\n", - "1920 14698.088496\n", - "2152 164999.000000\n", - "2160 17465.194444\n", - "2220 20531.333333\n", - "2240 49990.000000\n", - "2244 18000.000000\n", - "2246 
16177.200000\n", - "2248 39990.000000\n", - "2270 8961.000000\n", - "2280 19073.190476\n", - "2310 25120.000000\n", - "2316 29999.000000\n", - "2340 19768.017241\n", - "2400 32131.083333\n", - "2436 75632.666667\n", - "2520 11844.500000\n", - "2560 26501.476190\n", - "2636 174990.000000\n", - "2688 88449.500000\n", - "2880 34993.250000\n", - "2960 38425.714286\n", - "3040 81799.500000\n", - "3120 45710.000000\n", - "3200 77999.000000\n", - "3840 45500.000000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Processor':\n", - "Processor\n", - "1 7861.095238\n", - "2 7929.444444\n", - "4 7209.401171\n", - "6 39365.400000\n", - "8 16193.927434\n", - "10 8542.000000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'RAM (MB)':\n", - "RAM (MB)\n", - "64 4249.000000\n", - "256 2625.625000\n", - "289 3190.000000\n", - "384 5555.000000\n", - "512 3419.847222\n", - "768 4749.500000\n", - "1000 5747.082153\n", - "2000 8645.608187\n", - "3000 11587.829787\n", - "4000 17741.965000\n", - "6000 28291.587302\n", - "8000 45080.586207\n", - "12000 99173.750000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Internal storage (GB)':\n", - "Internal storage (GB)\n", - "0.064 3499.000000\n", - "0.128 4999.000000\n", - "0.160 5555.000000\n", - "0.512 3390.444444\n", - "1.000 5197.000000\n", - "2.000 2399.000000\n", - "3.000 3190.000000\n", - "4.000 4513.032258\n", - "8.000 5311.654206\n", - "16.000 8379.403341\n", - "32.000 12089.644366\n", - "64.000 21208.814607\n", - "128.000 29477.043478\n", - "256.000 68682.777778\n", - "512.000 164999.000000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Rear camera':\n", - "Rear camera\n", - "0.0 4193.500000\n", - "0.3 3054.666667\n", - "2.0 3000.804878\n", - "3.0 4329.333333\n", - "3.2 3506.727273\n", - "5.0 4413.183486\n", - "8.0 6658.109966\n", - "8.7 8999.000000\n", - "10.0 10499.250000\n", - "12.0 28533.950980\n", - "12.2 29162.500000\n", - "12.3 30569.250000\n", - 
"13.0 9531.776018\n", - "13.1 32490.000000\n", - "13.2 7496.333333\n", - "15.0 88719.000000\n", - "16.0 17923.896825\n", - "18.0 19999.000000\n", - "19.0 39095.000000\n", - "20.0 23739.777778\n", - "20.7 35718.600000\n", - "21.0 14513.900000\n", - "21.5 6999.000000\n", - "23.0 31966.090909\n", - "24.0 18587.250000\n", - "25.0 13980.000000\n", - "32.0 19589.000000\n", - "40.0 61389.750000\n", - "41.0 10990.000000\n", - "48.0 24205.615385\n", - "64.0 19332.333333\n", - "108.0 92999.000000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Front camera':\n", - "Front camera\n", - "0.0 6799.333333\n", - "0.3 4398.430233\n", - "0.9 19999.000000\n", - "1.1 18745.000000\n", - "1.2 18579.333333\n", - "1.3 5640.352941\n", - "1.5 18748.000000\n", - "1.6 15833.000000\n", - "1.9 11246.750000\n", - "2.0 6711.995122\n", - "2.1 24839.666667\n", - "2.2 35823.333333\n", - "2.4 5061.333333\n", - "3.0 8500.000000\n", - "3.2 4660.900000\n", - "3.7 28925.000000\n", - "4.0 13519.285714\n", - "5.0 8347.064655\n", - "7.0 49061.625000\n", - "8.0 13894.422222\n", - "10.0 105114.166667\n", - "12.0 41929.500000\n", - "13.0 11380.436364\n", - "16.0 17646.113402\n", - "20.0 13498.520000\n", - "24.0 21860.562500\n", - "25.0 19182.692308\n", - "32.0 23749.782609\n", - "40.0 92999.000000\n", - "48.0 27999.000000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Operating system':\n", - "Operating system\n", - "Android 10989.947652\n", - "BlackBerry 10509.600000\n", - "Cyanogen 9173.600000\n", - "Sailfish 4799.000000\n", - "Tizen 4459.666667\n", - "Windows 16706.684211\n", - "iOS 45510.470588\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Wi-Fi':\n", - "Wi-Fi\n", - "No 6334.375000\n", - "Yes 11496.211695\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Bluetooth':\n", - "Bluetooth\n", - "No 7794.466667\n", - "Yes 11506.800595\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'GPS':\n", - "GPS\n", - "No 
8316.148148\n", - "Yes 11737.740208\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Number of SIMs':\n", - "Number of SIMs\n", - "1 16479.718062\n", - "2 10465.580018\n", - "3 4590.000000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для '3G':\n", - "3G\n", - "No 11616.882759\n", - "Yes 11447.783361\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для '4G/ LTE':\n", - "4G/ LTE\n", - "No 7922.270893\n", - "Yes 12680.858696\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для комбинации 'Brand' и 'Operating system':\n", - "Brand Operating system\n", - "10.or Android 5999.000000\n", - "Acer Android 6470.000000\n", - "Alcatel Android 9201.600000\n", - "Apple iOS 45510.470588\n", - "Aqua Android 4599.000000\n", - " ... \n", - "Zopo Android 6027.916667\n", - "Zuk Android 29124.000000\n", - "iBall Android 4051.000000\n", - "iVoomi Android 4018.875000\n", - "mPhone Android 6949.000000\n", - "Name: Price, Length: 85, dtype: float64\n", - "\n", - "Средняя цена для комбинации 'RAM (MB)' и 'Internal storage (GB)':\n", - "RAM (MB) Internal storage (GB)\n", - "64 0.064 3499.000000\n", - " 0.128 4999.000000\n", - "256 0.512 2287.857143\n", - " 8.000 4990.000000\n", - "289 3.000 3190.000000\n", - "384 0.160 5555.000000\n", - "512 0.512 7249.500000\n", - " 2.000 2399.000000\n", - " 4.000 3222.244444\n", - " 8.000 3513.750000\n", - "768 4.000 6500.000000\n", - " 8.000 2999.000000\n", - "1000 1.000 5197.000000\n", - " 4.000 8019.187500\n", - " 8.000 5398.572464\n", - " 16.000 6772.263158\n", - " 32.000 6997.000000\n", - "2000 8.000 6458.736842\n", - " 16.000 8518.889273\n", - " 32.000 10155.272727\n", - " 64.000 36999.000000\n", - "3000 16.000 9082.082192\n", - " 32.000 11742.751269\n", - " 64.000 24287.833333\n", - "4000 32.000 14827.288462\n", - " 64.000 19201.702290\n", - " 128.000 15408.882353\n", - "6000 64.000 26554.636364\n", - " 128.000 29692.000000\n", - " 256.000 44999.000000\n", - "8000 64.000 
54990.000000\n", - " 128.000 37177.181818\n", - " 256.000 72408.166667\n", - "12000 128.000 92999.000000\n", - " 256.000 69348.500000\n", - " 512.000 164999.000000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для комбинации 'Screen size (inches)' и 'Battery capacity (mAh)':\n", - "Screen size (inches) Battery capacity (mAh)\n", - "2.40 2000 1249.0\n", - "2.44 1450 6999.0\n", - "2.45 1500 2999.0\n", - "2.60 1350 5555.0\n", - "2.80 1200 3190.0\n", - " ... \n", - "6.70 4500 40259.0\n", - "6.80 4300 79699.0\n", - "6.90 5000 92999.0\n", - "7.00 3450 6199.0\n", - "7.30 4380 164999.0\n", - "Name: Price, Length: 431, dtype: float64\n", - "\n" + "Размер обучающей выборки: (1004, 18)\n", + "Размер тестовой выборки: (252, 18)\n", + "Baseline MAE: 4662.689511794094\n", + "Baseline MSE: 50560680.710365206\n", + "Baseline R²: -0.001378207894705552\n" ] } ], "source": [ "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", "\n", - "# Загружаем набор данных\n", - "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", "\n", - "# Удаление пустого столбца по имени\n", - "df = df.drop('Unnamed: 0', axis=1)\n", + "# Выбор признаков и целевой переменной\n", + "features = [\n", + " 'Brand', 'Battery capacity (mAh)',\n", + " 'Screen size (inches)', 'Touchscreen', 'Resolution x', 'Resolution y',\n", + " 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera',\n", + " 'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS',\n", + " 'Number of SIMs', '3G', '4G/ LTE'\n", + "]\n", + "target = 'Price'\n", "\n", - "# Устанавливаем случайное состояние\n", - "random_state = 42\n", + "global X_train, X_test, y_train, y_test\n", + "X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)\n", "\n", - "# Рассчитываем среднюю цену для каждого значения каждого признака (Model уберем, у всех 
разная)\n", - "for column in ['Brand', 'Battery capacity (mAh)', 'Screen size (inches)', 'Touchscreen', \n", - " 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', \n", - " 'Rear camera', 'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', \n", - " 'Number of SIMs', '3G', '4G/ LTE']:\n", - " print(f\"Средняя цена для '{column}':\")\n", - " print(df.groupby(column)['Price'].mean())\n", - " print()\n", + "print(\"Размер обучающей выборки:\", X_train.shape)\n", + "print(\"Размер тестовой выборки:\", X_test.shape)\n", "\n", - "# Рассчитываем среднюю цену для комбинаций признаков\n", - "# для комбинации 'Brand' и 'Operating system'\n", - "print(\"Средняя цена для комбинации 'Brand' и 'Operating system':\")\n", - "print(df.groupby(['Brand', 'Operating system'])['Price'].mean())\n", - "print()\n", + "# Базовые предсказания (среднее значение целевой переменной)\n", + "baseline_predictions = [y_train.mean()] * len(y_test)\n", "\n", - "# Рассчитываем среднюю цену для комбинации 'RAM (MB)' и 'Internal storage (GB)'\n", - "print(\"Средняя цена для комбинации 'RAM (MB)' и 'Internal storage (GB)':\")\n", - "print(df.groupby(['RAM (MB)', 'Internal storage (GB)'])['Price'].mean())\n", - "print()\n", - "\n", - "# Рассчитываем среднюю цену для комбинации 'Screen size (inches)' и 'Battery capacity (mAh)'\n", - "print(\"Средняя цена для комбинации 'Screen size (inches)' и 'Battery capacity (mAh)':\")\n", - "print(df.groupby(['Screen size (inches)', 'Battery capacity (mAh)'])['Price'].mean())\n", - "print()" + "# Оценка базовой модели\n", + "print('Baseline MAE:', mean_absolute_error(y_test, baseline_predictions))\n", + "print('Baseline MSE:', mean_squared_error(y_test, baseline_predictions))\n", + "print('Baseline R²:', r2_score(y_test, baseline_predictions))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Выбор ориентира для каждой задачи:\n", - "\n", - "1. 
Прогнозирование цен на мобильные телефоны:\n", - "Ориентир:\n", - "\n", - "R² (коэффициент детерминации): 0.75 - 0.85\n", - "\n", - "MAE (средняя абсолютная ошибка): 5000 - 10000 рублей\n", - "\n", - "RMSE (среднеквадратичная ошибка): 10000 - 20000 рублей\n", - "\n", - "Объяснение:\n", - "\n", - "R²: Значение 0.75 - 0.85 будет означать, что модель объясняет 75-85% вариации цен на мобильные телефоны, что является хорошим результатом для задачи регрессии.\n", - "\n", - "MAE: Значение 5000 - 10000 рублей будет означать, что в среднем модель ошибается на 5000 - 10000 рублей при прогнозировании цен на мобильные телефоны. Учитывая диапазон цен от 9000 до 150000 рублей, этот ориентир является разумным.\n", - "\n", - "RMSE: Значение 10000 - 20000 рублей будет означать, что среднеквадратичная ошибка модели составляет 10000 - 20000 рублей.\n", - "\n", - "2. Оптимизация характеристик продукта:\n", - "Ориентир:\n", - "\n", - "Увеличение прибыли компании: 5% - 10%\n", - "\n", - "Сохранение конкурентоспособных тарифов:\n", - "\n", - "Средняя цена мобильных телефонов не должна увеличиваться более чем на 5% по сравнению с текущими тарифами.\n", - "\n", - "Доля клиентов, считающих цены высокими, не должна увеличиваться более чем на 2%.\n", - "\n", - "Объяснение:\n", - "\n", - "Увеличение прибыли компании: \n", - "\n", - "Цель оптимизации характеристик продукта - максимизировать прибыль компании. Ориентир в 5% - 10% увеличения прибыли является реалистичным и достижимым.\n", - "\n", - "Сохранение конкурентоспособных тарифов: \n", - "\n", - "Важно, чтобы оптимизация характеристик продукта не привела к значительному увеличению цен для клиентов. Ориентир в 5% увеличения средней цены мобильных телефонов и 2% увеличения доли клиентов, считающих цены высокими, позволяет сохранить конкурентоспособность компании." 
+ "#### Обучение и оценка моделей" ] }, { @@ -695,351 +533,142 @@ "name": "stdout", "output_type": "stream", "text": [ - "MAE: 5424.128251450574\n", - "MSE: 143399695.59118932\n", - "RMSE: 11974.961193723733\n", - "R²: 0.4559750691933513\n", - "Ориентиры для прогнозирования цены мобильных устройств не достигнуты.\n", - "Средняя цена для 'Battery capacity (mAh)':\n", - "Battery capacity (mAh)\n", - "1010 3499.000000\n", - "1050 4790.000000\n", - "1200 3396.000000\n", - "1250 2498.500000\n", - "1300 3338.333333\n", - " ... \n", - "5000 13542.530612\n", - "5020 23900.000000\n", - "5100 16999.000000\n", - "5300 8999.000000\n", - "6000 19832.333333\n", - "Name: Price, Length: 165, dtype: float64\n", - "\n", - "Средняя цена для 'Screen size (inches)':\n", - "Screen size (inches)\n", - "2.40 1249.000000\n", - "2.44 6999.000000\n", - "2.45 2999.000000\n", - "2.60 5555.000000\n", - "2.80 5345.000000\n", - " ... \n", - "6.70 58896.428571\n", - "6.80 79699.000000\n", - "6.90 92999.000000\n", - "7.00 6199.000000\n", - "7.30 164999.000000\n", - "Name: Price, Length: 80, dtype: float64\n", - "\n", - "Средняя цена для 'RAM (MB)':\n", - "RAM (MB)\n", - "64 4249.000000\n", - "256 2625.625000\n", - "289 3190.000000\n", - "384 5555.000000\n", - "512 3419.847222\n", - "768 4749.500000\n", - "1000 5747.082153\n", - "2000 8645.608187\n", - "3000 11587.829787\n", - "4000 17741.965000\n", - "6000 28291.587302\n", - "8000 45080.586207\n", - "12000 99173.750000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Internal storage (GB)':\n", - "Internal storage (GB)\n", - "0.064 3499.000000\n", - "0.128 4999.000000\n", - "0.160 5555.000000\n", - "0.512 3390.444444\n", - "1.000 5197.000000\n", - "2.000 2399.000000\n", - "3.000 3190.000000\n", - "4.000 4513.032258\n", - "8.000 5311.654206\n", - "16.000 8379.403341\n", - "32.000 12089.644366\n", - "64.000 21208.814607\n", - "128.000 29477.043478\n", - "256.000 68682.777778\n", - "512.000 164999.000000\n", - "Name: Price, dtype: 
float64\n", - "\n", - "Средняя цена для 'Rear camera':\n", - "Rear camera\n", - "0.0 4193.500000\n", - "0.3 3054.666667\n", - "2.0 3000.804878\n", - "3.0 4329.333333\n", - "3.2 3506.727273\n", - "5.0 4413.183486\n", - "8.0 6658.109966\n", - "8.7 8999.000000\n", - "10.0 10499.250000\n", - "12.0 28533.950980\n", - "12.2 29162.500000\n", - "12.3 30569.250000\n", - "13.0 9531.776018\n", - "13.1 32490.000000\n", - "13.2 7496.333333\n", - "15.0 88719.000000\n", - "16.0 17923.896825\n", - "18.0 19999.000000\n", - "19.0 39095.000000\n", - "20.0 23739.777778\n", - "20.7 35718.600000\n", - "21.0 14513.900000\n", - "21.5 6999.000000\n", - "23.0 31966.090909\n", - "24.0 18587.250000\n", - "25.0 13980.000000\n", - "32.0 19589.000000\n", - "40.0 61389.750000\n", - "41.0 10990.000000\n", - "48.0 24205.615385\n", - "64.0 19332.333333\n", - "108.0 92999.000000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для 'Front camera':\n", - "Front camera\n", - "0.0 6799.333333\n", - "0.3 4398.430233\n", - "0.9 19999.000000\n", - "1.1 18745.000000\n", - "1.2 18579.333333\n", - "1.3 5640.352941\n", - "1.5 18748.000000\n", - "1.6 15833.000000\n", - "1.9 11246.750000\n", - "2.0 6711.995122\n", - "2.1 24839.666667\n", - "2.2 35823.333333\n", - "2.4 5061.333333\n", - "3.0 8500.000000\n", - "3.2 4660.900000\n", - "3.7 28925.000000\n", - "4.0 13519.285714\n", - "5.0 8347.064655\n", - "7.0 49061.625000\n", - "8.0 13894.422222\n", - "10.0 105114.166667\n", - "12.0 41929.500000\n", - "13.0 11380.436364\n", - "16.0 17646.113402\n", - "20.0 13498.520000\n", - "24.0 21860.562500\n", - "25.0 19182.692308\n", - "32.0 23749.782609\n", - "40.0 92999.000000\n", - "48.0 27999.000000\n", - "Name: Price, dtype: float64\n", - "\n", - "Средняя цена для комбинации 'Battery capacity (mAh)' и 'RAM (MB)':\n", - "Battery capacity (mAh) RAM (MB)\n", - "1010 64 3499.0\n", - "1050 1000 4790.0\n", - "1200 64 4999.0\n", - " 289 3190.0\n", - " 512 1999.0\n", - " ... 
\n", - "5020 3000 23900.0\n", - "5100 3000 16999.0\n", - "5300 4000 8999.0\n", - "6000 4000 10749.0\n", - " 8000 37999.0\n", - "Name: Price, Length: 323, dtype: float64\n", - "\n", - "Средняя цена для комбинации 'Screen size (inches)' и 'Internal storage (GB)':\n", - "Screen size (inches) Internal storage (GB)\n", - "2.40 4.000 1249.0\n", - "2.44 0.512 6999.0\n", - "2.45 4.000 2999.0\n", - "2.60 0.160 5555.0\n", - "2.80 0.512 7500.0\n", - " ... \n", - "6.70 256.000 174990.0\n", - "6.80 256.000 79699.0\n", - "6.90 128.000 92999.0\n", - "7.00 8.000 6199.0\n", - "7.30 512.000 164999.0\n", - "Name: Price, Length: 177, dtype: float64\n", - "\n", - "Средняя цена для комбинации 'Rear camera' и 'Front camera':\n", - "Rear camera Front camera\n", - "0.0 0.0 4193.500000\n", - "0.3 0.3 3054.666667\n", - "2.0 0.0 4396.000000\n", - " 0.3 2940.937500\n", - " 1.3 2240.000000\n", - " ... \n", - "48.0 48.0 27999.000000\n", - "64.0 16.0 21499.000000\n", - " 20.0 14999.000000\n", - " 32.0 21499.000000\n", - "108.0 40.0 92999.000000\n", - "Name: Price, Length: 126, dtype: float64\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", - " warnings.warn(\n" + "Model: Linear Regression trained.\n", + "MAE: 3251.7122571814075\n", + "MSE: 25623200.493888523\n", + "R²: 0.4925203887566195\n", + "--------------------------------------------------\n", + "Model: Decision Tree trained.\n", + "MAE: 4112.809523809524\n", + "MSE: 56543100.29960317\n", + "R²: -0.11986285887206449\n", + "--------------------------------------------------\n", + "Model: Gradient Boosting trained.\n", + "MAE: 2793.2991365668017\n", + "MSE: 23739724.17710429\n", + "R²: 0.5298235285129411\n", + "--------------------------------------------------\n" ] } ], "source": [ "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler\n", + "from scipy import stats\n", + "from sklearn.model_selection import train_test_split, RandomizedSearchCV\n", + "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.pipeline import Pipeline\n", "from sklearn.linear_model import LinearRegression\n", + "from sklearn.tree import DecisionTreeRegressor\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", "\n", - "# Загружаем набор данных\n", - "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "# Выбор признаков и целевой переменной\n", + "categorical_features = [\n", + " 'Brand', 'Touchscreen', 'Operating system', 'Wi-Fi', \n", + " 'Bluetooth', 'GPS', '3G', '4G/ LTE'\n", + "]\n", + "numeric_features = [\n", + " 'Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', \n", + " 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', \n", + " 'Front camera', 'Number of SIMs'\n", + "]\n", + "target = 'Price'\n", "\n", - "# Удаление пустого столбца по имени\n", - "df = 
df.drop('Unnamed: 0', axis=1)\n", + "# Препроцессор для обработки числовых и категориальных признаков\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('num', StandardScaler(), numeric_features), # Масштабирование числовых признаков\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features) # Однократная кодировка категориальных признаков\n", + " ]\n", + ")\n", "\n", - "# Преобразуем категориальные переменные в числовые\n", - "df = pd.get_dummies(df, columns=['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE'], drop_first=True)\n", + "# Создание пайплайнов для моделей\n", + "pipeline_linear_regression = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('regressor', LinearRegression())\n", + "])\n", "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y)\n", - "X = df.drop('Price', axis=1)\n", - "y = df['Price']\n", + "pipeline_decision_tree = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('regressor', DecisionTreeRegressor(random_state=42))\n", + "])\n", "\n", - "# Разделяем данные на обучающую и тестовую выборки\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "pipeline_gradient_boosting = Pipeline(steps=[\n", + " ('preprocessor', preprocessor),\n", + " ('regressor', GradientBoostingRegressor(random_state=42))\n", + "])\n", "\n", - "# Стандартизируем признаки\n", - "scaler = StandardScaler()\n", - "X_train = scaler.fit_transform(X_train)\n", - "X_test = scaler.transform(X_test)\n", + "# Список моделей для сравнения\n", + "pipelines = [\n", + " ('Linear Regression', pipeline_linear_regression),\n", + " ('Decision Tree', pipeline_decision_tree),\n", + " ('Gradient Boosting', pipeline_gradient_boosting)\n", + "]\n", "\n", - "# Обучаем модель линейной регрессии\n", - "model = LinearRegression()\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Делаем предсказания на 
тестовой выборке\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Оцениваем качество модели\n", - "mae = mean_absolute_error(y_test, y_pred)\n", - "mse = mean_squared_error(y_test, y_pred)\n", - "rmse = mean_squared_error(y_test, y_pred, squared=False)\n", - "r2 = r2_score(y_test, y_pred)\n", - "\n", - "print(f\"MAE: {mae}\")\n", - "print(f\"MSE: {mse}\")\n", - "print(f\"RMSE: {rmse}\")\n", - "print(f\"R²: {r2}\")\n", - "\n", - "# Проверяем, достигнуты ли ориентиры\n", - "if r2 >= 0.75 and mae <= 10000 and rmse <= 20000:\n", - " print(\"Ориентиры для прогнозирования цены мобильных устройств достигнуты!\")\n", - "else:\n", - " print(\"Ориентиры для прогнозирования цены мобильных устройств не достигнуты.\")\n", - "\n", - "# Оптимизация тарифной сетки\n", - "# Убедитесь, что столбцы существуют\n", - "columns_to_group = ['Battery capacity (mAh)', 'Screen size (inches)', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera']\n", - "\n", - "# Рассчитываем среднюю цену для каждого значения каждого признака\n", - "for column in columns_to_group:\n", - " print(f\"Средняя цена для '{column}':\")\n", - " print(df.groupby(column)['Price'].mean())\n", - " print()\n", - "\n", - "# Рассчитываем среднюю цену для комбинаций признаков\n", - "# Например, для комбинации 'Battery capacity (mAh)' и 'RAM (MB)'\n", - "print(\"Средняя цена для комбинации 'Battery capacity (mAh)' и 'RAM (MB)':\")\n", - "print(df.groupby(['Battery capacity (mAh)', 'RAM (MB)'])['Price'].mean())\n", - "print()\n", - "\n", - "# Рассчитываем среднюю цену для комбинации 'Screen size (inches)' и 'Internal storage (GB)'\n", - "print(\"Средняя цена для комбинации 'Screen size (inches)' и 'Internal storage (GB)':\")\n", - "print(df.groupby(['Screen size (inches)', 'Internal storage (GB)'])['Price'].mean())\n", - "print()\n", - "\n", - "# Рассчитываем среднюю цену для комбинации 'Rear camera' и 'Front camera'\n", - "print(\"Средняя цена для комбинации 'Rear camera' и 'Front camera':\")\n", - 
"print(df.groupby(['Rear camera', 'Front camera'])['Price'].mean())\n", - "print()" + "# Обучение моделей и вывод результатов\n", + "for name, pipeline in pipelines:\n", + " pipeline.fit(X_train, y_train)\n", + " predictions = pipeline.predict(X_test)\n", + " print(f\"Model: {name} trained.\")\n", + " print(f\"MAE: {mean_absolute_error(y_test, predictions)}\")\n", + " print(f\"MSE: {mean_squared_error(y_test, predictions)}\")\n", + " print(f\"R²: {r2_score(y_test, predictions)}\")\n", + " print(\"-\" * 50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Анализ применимости алгоритмов обучения с учителем для решения поставленных задач:\n", + "Линейная регрессия улучшила качество предсказаний по сравнению с базовой моделью, но показала меньшую эффективность, чем градиентный бустинг\n", "\n", - "1. Прогнозирование стоимости страховых взносов: Задача: Регрессия\n", + "* MAE уменьшился на 1410.98 (30% улучшение).\n", "\n", - "Свойства алгоритмов:\n", + "* MSE уменьшился на 24937480.22 (49% улучшение).\n", "\n", - "Линейная регрессия: Применимость: Хорошо подходит для задач, где зависимость между признаками и целевой переменной линейна. Преимущества: Проста в реализации, интерпретируема. Недостатки: Может плохо работать, если зависимость нелинейна.\n", + "* R² стал положительным (0.4925), что означает, что модель объясняет 49.25% изменчивости цены.\n", "\n", - "Деревья решений (регрессия): Применимость: Подходит для задач с нелинейными зависимостями. Преимущества: Может обрабатывать категориальные признаки, не требует масштабирования данных. Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n", + "Дерево решений оказалось наименее эффективным среди всех моделей, показав худшие результаты даже по сравнению с базовой моделью.\n", "\n", - "Случайный лес (регрессия): Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков. 
Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки. Недостатки: Менее интерпретируем, чем линейная регрессия.\n", + "* MAE увеличился на 450.12 (9.6% ухудшение).\n", "\n", - "Градиентный бустинг (регрессия): Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками. Преимущества: Может достигать высокой точности, устойчив к переобучению. Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n", + "* MSE увеличился на 5982419.59 (11.8% ухудшение).\n", "\n", - "Нейронные сети (регрессия): Применимость: Подходит для задач с очень сложными зависимостями и большим количеством данных. Преимущества: Может моделировать очень сложные зависимости. Недостатки: Требует большого количества данных, сложнее в настройке и интерпретации.\n", + "* R² стал еще ниже (-0.1199), что указывает на то, что модель работает хуже, чем базовая.\n", "\n", - "#### Вывод:\n", + "Градиентный бустинг показал лучшие результаты среди всех моделей:\n", "\n", - "Линейная регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n", + "* MAE уменьшился на 1869.39 (40% улучшение) по сравнению с базовой моделью.\n", "\n", - "Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n", + "* MSE уменьшился на 26820956.53 (53% улучшение) по сравнению с базовой моделью.\n", "\n", - "Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n", + "* R² достиг 0.5298, что означает, что модель объясняет 52.98% изменчивости цены.\n", "\n", - "Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно много.\n", + "Таким образом, градиентный бустинг является наиболее подходящей моделью для предсказания цены мобильных телефонов на основе выбранных признаков." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Классификация \n", "\n", - "2. Оптимизация тарифной сетки: Задача: Классификация (группировка клиентов по группам риска)\n", + "Цель: Целью классификации является разработка модели, которая будет предсказывать категорию цены мобильного телефона на основе его технических характеристик и функциональных особенностей.\n", "\n", - "Свойства алгоритмов:\n", + "Применение классификации:\n", "\n", - "Логистическая регрессия: Применимость: Хорошо подходит для задач бинарной классификации, где зависимость между признаками и целевой переменной линейна. Преимущества: Проста в реализации, интерпретируема. Недостатки: Может плохо работать, если зависимость нелинейна.\n", + "1. Рыночная аналитика:\n", "\n", - "Деревья решений (классификация): Применимость: Подходит для задач с нелинейными зависимостями. Преимущества: Может обрабатывать категориальные признаки, не требует масштабирования данных. Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n", + "* Помогает производителям и продавцам телефонов определять целевую аудиторию для конкретных моделей.\n", "\n", - "Случайный лес (классификация): Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков. Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки. Недостатки: Менее интерпретируем, чем линейная регрессия.\n", + "* Позволяет анализировать конкуренцию в разных ценовых сегментах.\n", "\n", - "Градиентный бустинг (классификация): Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками. Преимущества: Может достигать высокой точности, устойчив к переобучению. Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n", + "2. Потребительские рекомендации:\n", "\n", - "Нейронные сети (классификация): Применимость: Подходит для задач с очень сложными зависимостями и большим количеством данных. 
Преимущества: Может моделировать очень сложные зависимости. Недостатки: Требует большого количества данных, сложнее в настройке и интерпретации.\n", + "* Помогает пользователям выбирать телефоны, соответствующие их бюджету и требованиям.\n", "\n", - "#### Вывод:\n", - "\n", - "Логистическая регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n", - "\n", - "Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n", - "\n", - "Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n", - "\n", - "Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно много.\n", - "\n", - "1. Прогнозирование стоимости страховых взносов: Выбранные модели:\n", - "\n", - "Линейная регрессия\n", - "\n", - "Случайный лес (регрессия)\n", - "\n", - "Градиентный бустинг (регрессия)\n", - "\n", - "2. Оптимизация тарифной сетки: Выбранные модели:\n", - "\n", - "Логистическая регрессия\n", - "\n", - "Случайный лес (классификация)\n", - "\n", - "Градиентный бустинг (классификация)" + "* Упрощает процесс сравнения телефонов в рамках одной ценовой категории." ] }, { @@ -1051,178 +680,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "Результаты для задачи регрессии:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: Linear Regression\n", - "MAE: 5424.128251450574\n", - "MSE: 143399695.59118932\n", - "RMSE: 11974.961193723733\n", - "R²: 0.4559750691933513\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: Random Forest Regression\n", - "MAE: 4022.856102941176\n", - "MSE: 107013897.40805219\n", - "RMSE: 10344.752167551052\n", - "R²: 0.5940142836932047\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: Gradient Boosting Regression\n", - "MAE: 4136.035916841655\n", - "MSE: 90628419.64103268\n", - "RMSE: 9519.895988981849\n", - "R²: 0.656176956854307\n", - "\n", - "Результаты для задачи классификации:\n", - "Model: Logistic Regression\n", - "Accuracy: 0.8137254901960784\n", - "\n", - "Model: Random Forest Classification\n", - "Accuracy: 0.8627450980392157\n", - "\n", - "Model: Gradient Boosting Classification\n", - "Accuracy: 0.8578431372549019\n", - "\n" + "Размер обучающей выборки: (1004, 18)\n", + "Размер тестовой выборки: (252, 18)\n" ] } ], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.linear_model import LinearRegression, LogisticRegression\n", - "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", - "from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n", - "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score\n", "\n", - "# Загружаем набор данных\n", - "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "# Выбор признаков и целевой переменной\n", + "features = [\n", + " 'Brand', 'Battery capacity (mAh)',\n", + " 'Screen size (inches)', 'Touchscreen', 'Resolution x', 'Resolution y',\n", + " 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera',\n", + " 'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS',\n", + " 'Number of SIMs', '3G', '4G/ LTE'\n", + "]\n", + "target = 'Price'\n", "\n", - "# Удаление пустого столбца по имени\n", - "df = df.drop('Unnamed: 0', axis=1)\n", + "# Разделение данных на обучающую и тестовую выборки\n", + "X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], 
test_size=0.2, random_state=42)\n", "\n", - "# Преобразуем категориальные переменные в числовые\n", - "df = pd.get_dummies(df, columns=['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE'], drop_first=True)\n", - "\n", - "# Убеждаемся, что все столбцы, которые должны быть числовыми, действительно являются числовыми\n", - "numeric_columns = ['Battery capacity (mAh)', 'Screen size (inches)', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Price']\n", - "for column in numeric_columns:\n", - " df[column] = pd.to_numeric(df[column], errors='coerce')\n", - "\n", - "# Удаляем строки с пропущенными значениями, которые могли возникнуть при преобразовании\n", - "df.dropna(subset=numeric_columns, inplace=True)\n", - "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", - "X_reg = df.drop('Price', axis=1)\n", - "y_reg = df['Price']\n", - "\n", - "# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n", - "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n", - "\n", - "# Стандартизируем признаки для задачи регрессии\n", - "scaler_reg = StandardScaler()\n", - "X_train_reg = scaler_reg.fit_transform(X_train_reg)\n", - "X_test_reg = scaler_reg.transform(X_test_reg)\n", - "\n", - "# Список моделей для задачи регрессии\n", - "models_reg = {\n", - " \"Linear Regression\": LinearRegression(),\n", - " \"Random Forest Regression\": RandomForestRegressor(),\n", - " \"Gradient Boosting Regression\": GradientBoostingRegressor()\n", - "}\n", - "\n", - "# Обучаем и оцениваем модели для задачи регрессии\n", - "print(\"Результаты для задачи регрессии:\")\n", - "for name, model in models_reg.items():\n", - " model.fit(X_train_reg, y_train_reg)\n", - " y_pred_reg = model.predict(X_test_reg)\n", - " mae = mean_absolute_error(y_test_reg, y_pred_reg)\n", - " mse = mean_squared_error(y_test_reg, 
y_pred_reg)\n", - " rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n", - " r2 = r2_score(y_test_reg, y_pred_reg)\n", - " print(f\"Model: {name}\")\n", - " print(f\"MAE: {mae}\")\n", - " print(f\"MSE: {mse}\")\n", - " print(f\"RMSE: {rmse}\")\n", - " print(f\"R²: {r2}\")\n", - " print()\n", - "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", - "X_class = df.drop('Price', axis=1)\n", - "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", - "\n", - "# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n", - "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.3, random_state=42)\n", - "\n", - "# Стандартизируем признаки для задачи классификации\n", - "scaler_class = StandardScaler()\n", - "X_train_class = scaler_class.fit_transform(X_train_class)\n", - "X_test_class = scaler_class.transform(X_test_class)\n", - "\n", - "# Список моделей для задачи классификации\n", - "models_class = {\n", - " \"Logistic Regression\": LogisticRegression(),\n", - " \"Random Forest Classification\": RandomForestClassifier(),\n", - " \"Gradient Boosting Classification\": GradientBoostingClassifier()\n", - "}\n", - "\n", - "# Обучаем и оцениваем модели для задачи классификации\n", - "print(\"Результаты для задачи классификации:\")\n", - "for name, model in models_class.items():\n", - " model.fit(X_train_class, y_train_class)\n", - " y_pred_class = model.predict(X_test_class)\n", - " accuracy = accuracy_score(y_test_class, y_pred_class)\n", - " print(f\"Model: {name}\")\n", - " print(f\"Accuracy: {accuracy}\")\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. 
Прогнозирование стоимости мобильных телефонов: \n", - "\n", - "Конвейер для задачи регрессии:" + "print(\"Размер обучающей выборки:\", X_train.shape)\n", + "print(\"Размер тестовой выборки:\", X_test.shape)" ] }, { @@ -1234,22 +715,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "Результаты для задачи регрессии:\n", - "Model: Linear Regression\n", - "MAE: 4695.532622690401\n", - "MSE: 65643669.229680344\n", - "RMSE: 8102.078080941972\n", - "R²: 0.6878208068254403\n", - "\n" + "PriceCategory\n", + "1 946\n", + "2 946\n", + "0 946\n", + "Name: count, dtype: int64\n", + "Гиперпараметры для логистической регрессии:\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", - " warnings.warn(\n", - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", " warnings.warn(\n" ] }, @@ -1257,437 +735,205 @@ "name": "stdout", "output_type": "stream", "text": [ - "Model: Random Forest Regression\n", - "MAE: 4002.1279166666664\n", - "MSE: 88540074.860938\n", - "RMSE: 9409.573574872455\n", - "R²: 0.578933209278842\n", - "\n", - "Model: Gradient Boosting Regression\n", - "MAE: 4080.6847402556627\n", - "MSE: 75338600.25148971\n", - "RMSE: 8679.781117717757\n", - "R²: 0.6417149784982356\n", - "\n" + "Accuracy: 0.8891\n", + "Precision: 0.8888\n", + "Recall: 0.8891\n", + "F1-Score: 0.8881\n", + "ROC-AUC: 0.9674\n", + "Гиперпараметры для случайного леса:\n", + "Accuracy: 0.9525\n", + "Precision: 0.9540\n", + "Recall: 0.9525\n", + "F1-Score: 0.9524\n", + "ROC-AUC: 0.9929\n", + "Гиперпараметры для градиентного бустинга:\n" ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", - " warnings.warn(\n" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[6], line 148\u001b[0m\n\u001b[0;32m 143\u001b[0m gb_pipeline \u001b[38;5;241m=\u001b[39m Pipeline([\n\u001b[0;32m 144\u001b[0m (\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassifier\u001b[39m\u001b[38;5;124m'\u001b[39m, GradientBoostingClassifier(random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m))\n\u001b[0;32m 145\u001b[0m ])\n\u001b[0;32m 147\u001b[0m gb_random_search \u001b[38;5;241m=\u001b[39m RandomizedSearchCV(gb_pipeline, param_distributions\u001b[38;5;241m=\u001b[39mgb_param_dist, n_iter\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m50\u001b[39m, cv\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m, n_jobs\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m--> 148\u001b[0m \u001b[43mgb_random_search\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 149\u001b[0m gb_best_model \u001b[38;5;241m=\u001b[39m gb_random_search\u001b[38;5;241m.\u001b[39mbest_estimator_\n\u001b[0;32m 150\u001b[0m gb_results \u001b[38;5;241m=\u001b[39m evaluate_model(gb_best_model, X_test, y_test)\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m 
estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1019\u001b[0m, in \u001b[0;36mBaseSearchCV.fit\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 1013\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_format_results(\n\u001b[0;32m 1014\u001b[0m all_candidate_params, n_splits, all_out, all_more_results\n\u001b[0;32m 1015\u001b[0m )\n\u001b[0;32m 1017\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m results\n\u001b[1;32m-> 1019\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevaluate_candidates\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1021\u001b[0m \u001b[38;5;66;03m# multimetric is determined here because in the case of a callable\u001b[39;00m\n\u001b[0;32m 1022\u001b[0m \u001b[38;5;66;03m# self.scoring the return type is only known after calling\u001b[39;00m\n\u001b[0;32m 1023\u001b[0m first_test_score \u001b[38;5;241m=\u001b[39m 
all_out[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_scores\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1960\u001b[0m, in \u001b[0;36mRandomizedSearchCV._run_search\u001b[1;34m(self, evaluate_candidates)\u001b[0m\n\u001b[0;32m 1958\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_run_search\u001b[39m(\u001b[38;5;28mself\u001b[39m, evaluate_candidates):\n\u001b[0;32m 1959\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Search n_iter candidates from param_distributions\"\"\"\u001b[39;00m\n\u001b[1;32m-> 1960\u001b[0m \u001b[43mevaluate_candidates\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1961\u001b[0m \u001b[43m \u001b[49m\u001b[43mParameterSampler\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1962\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparam_distributions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_iter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrandom_state\u001b[49m\n\u001b[0;32m 1963\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1964\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:965\u001b[0m, in \u001b[0;36mBaseSearchCV.fit..evaluate_candidates\u001b[1;34m(candidate_params, cv, more_results)\u001b[0m\n\u001b[0;32m 957\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mverbose \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 958\u001b[0m 
\u001b[38;5;28mprint\u001b[39m(\n\u001b[0;32m 959\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFitting \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;124m folds for each of \u001b[39m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;124m candidates,\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 960\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m totalling \u001b[39m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m fits\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 961\u001b[0m n_splits, n_candidates, n_candidates \u001b[38;5;241m*\u001b[39m n_splits\n\u001b[0;32m 962\u001b[0m )\n\u001b[0;32m 963\u001b[0m )\n\u001b[1;32m--> 965\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mparallel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 966\u001b[0m \u001b[43m \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_fit_and_score\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 967\u001b[0m \u001b[43m \u001b[49m\u001b[43mclone\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbase_estimator\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 968\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 969\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 970\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 971\u001b[0m \u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 972\u001b[0m \u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparameters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 973\u001b[0m \u001b[43m \u001b[49m\u001b[43msplit_progress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msplit_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mn_splits\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 974\u001b[0m \u001b[43m \u001b[49m\u001b[43mcandidate_progress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcand_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_candidates\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 975\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_and_score_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 976\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 977\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mcand_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mproduct\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 978\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcandidate_params\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 979\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplitter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 980\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 981\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 983\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(out) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m 984\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 985\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo fits were performed. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 986\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWas the CV iterator empty? \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 987\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWere there no candidates?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 988\u001b[0m )\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:74\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 69\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m 70\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 71\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m 73\u001b[0m )\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterable_with_config\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:2007\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 2001\u001b[0m \u001b[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001b[39;00m\n\u001b[0;32m 2002\u001b[0m \u001b[38;5;66;03m# progress until it enters the Try/Except block of the generator and\u001b[39;00m\n\u001b[0;32m 2003\u001b[0m \u001b[38;5;66;03m# reaches the first `yield` statement. This starts the asynchronous\u001b[39;00m\n\u001b[0;32m 2004\u001b[0m \u001b[38;5;66;03m# dispatch of the tasks to the workers.\u001b[39;00m\n\u001b[0;32m 2005\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 2007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1650\u001b[0m, in \u001b[0;36mParallel._get_outputs\u001b[1;34m(self, iterator, pre_dispatch)\u001b[0m\n\u001b[0;32m 1647\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[0;32m 1649\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backend\u001b[38;5;241m.\u001b[39mretrieval_context():\n\u001b[1;32m-> 1650\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_retrieve()\n\u001b[0;32m 1652\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m 
\u001b[38;5;167;01mGeneratorExit\u001b[39;00m:\n\u001b[0;32m 1653\u001b[0m \u001b[38;5;66;03m# The generator has been garbage collected before being fully\u001b[39;00m\n\u001b[0;32m 1654\u001b[0m \u001b[38;5;66;03m# consumed. This aborts the remaining tasks if possible and warn\u001b[39;00m\n\u001b[0;32m 1655\u001b[0m \u001b[38;5;66;03m# the user if necessary.\u001b[39;00m\n\u001b[0;32m 1656\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1762\u001b[0m, in \u001b[0;36mParallel._retrieve\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1757\u001b[0m \u001b[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001b[39;00m\n\u001b[0;32m 1758\u001b[0m \u001b[38;5;66;03m# async callbacks to progress.\u001b[39;00m\n\u001b[0;32m 1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ((\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m\n\u001b[0;32m 1760\u001b[0m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mget_status(\n\u001b[0;32m 1761\u001b[0m timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtimeout) \u001b[38;5;241m==\u001b[39m TASK_PENDING)):\n\u001b[1;32m-> 1762\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0.01\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1763\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m 1765\u001b[0m \u001b[38;5;66;03m# We need to be careful: the job list can be filling up as\u001b[39;00m\n\u001b[0;32m 1766\u001b[0m \u001b[38;5;66;03m# we empty it and Python list are not thread-safe 
by\u001b[39;00m\n\u001b[0;32m 1767\u001b[0m \u001b[38;5;66;03m# default hence the use of the lock\u001b[39;00m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "import pandas as pd\n", - "from sklearn.model_selection import train_test_split, GridSearchCV\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.linear_model import LinearRegression\n", - "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n", - "from sklearn.pipeline import Pipeline\n", + "from imblearn.over_sampling import SMOTE\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", "from sklearn.compose import ColumnTransformer\n", - "from sklearn.preprocessing import OneHotEncoder\n", - "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", - "\n", - "# Загружаем набор данных\n", - "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", - "\n", - "# Удаление пустого столбца по имени\n", - "df = df.drop('Unnamed: 0', axis=1)\n", - "\n", - "# Определяем категориальные и числовые столбцы\n", - "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", - "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Number of SIMs']\n", - "\n", - "# Создаем преобразователь для категориальных и числовых столбцов\n", - "preprocessor = ColumnTransformer(\n", - " transformers=[\n", - " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", - " ('num', StandardScaler(), numerical_cols)\n", - " ])\n", - "\n", - "# Список моделей для задачи регрессии\n", - "models_reg = {\n", - " \"Linear Regression\": LinearRegression(),\n", - " \"Random Forest Regression\": RandomForestRegressor(),\n", - " \"Gradient Boosting Regression\": 
GradientBoostingRegressor()\n", - "}\n", - "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", - "X_reg = df[categorical_cols + numerical_cols]\n", - "y_reg = df['Price']\n", - "\n", - "# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n", - "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)\n", - "\n", - "# Обучаем и оцениваем модели для задачи регрессии\n", - "print(\"Результаты для задачи регрессии:\")\n", - "for name, model in models_reg.items():\n", - " pipeline = Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('model', model)\n", - " ])\n", - " pipeline.fit(X_train_reg, y_train_reg)\n", - " y_pred_reg = pipeline.predict(X_test_reg)\n", - " mae = mean_absolute_error(y_test_reg, y_pred_reg)\n", - " mse = mean_squared_error(y_test_reg, y_pred_reg)\n", - " rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n", - " r2 = r2_score(y_test_reg, y_pred_reg)\n", - " print(f\"Model: {name}\")\n", - " print(f\"MAE: {mae}\")\n", - " print(f\"MSE: {mse}\")\n", - " print(f\"RMSE: {rmse}\")\n", - " print(f\"R²: {r2}\")\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "2. 
Оптимизация тарифной сетки:\n", - "\n", - " Конвейер для задачи классификации:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Результаты для задачи классификации:\n", - "Model: Logistic Regression\n", - "Accuracy: 0.8713235294117647\n", - "\n", - "Model: Random Forest Classification\n", - "Accuracy: 0.8786764705882353\n", - "\n", - "Model: Gradient Boosting Classification\n", - "Accuracy: 0.8676470588235294\n", - "\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split, GridSearchCV\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.linear_model import LinearRegression\n", - "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n", "from sklearn.pipeline import Pipeline\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.preprocessing import OneHotEncoder\n", - "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", - "\n", - "# Загружаем набор данных\n", - "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", - "\n", - "# Удаление пустого столбца по имени\n", - "df = df.drop('Unnamed: 0', axis=1)\n", - "\n", - "# Определяем категориальные и числовые столбцы\n", - "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", - "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Number of SIMs']\n", - "\n", - "# Создаем преобразователь для категориальных и числовых столбцов\n", - "preprocessor = ColumnTransformer(\n", - " transformers=[\n", - " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", - " ('num', StandardScaler(), numerical_cols)\n", - " ])\n", - "\n", - "# Список 
моделей для задачи классификации\n", - "models_class = {\n", - " \"Logistic Regression\": LogisticRegression(),\n", - " \"Random Forest Classification\": RandomForestClassifier(),\n", - " \"Gradient Boosting Classification\": GradientBoostingClassifier()\n", - "}\n", - "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", - "X_class = df[categorical_cols + numerical_cols]\n", - "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", - "\n", - "# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n", - "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n", - "\n", - "# Обучаем и оцениваем модели для задачи классификации\n", - "print(\"Результаты для задачи классификации:\")\n", - "for name, model in models_class.items():\n", - " pipeline = Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('model', model)\n", - " ])\n", - " pipeline.fit(X_train_class, y_train_class)\n", - " y_pred_class = pipeline.predict(X_test_class)\n", - " accuracy = accuracy_score(y_test_class, y_pred_class)\n", - " print(f\"Model: {name}\")\n", - " print(f\"Accuracy: {accuracy}\")\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. 
Прогнозирование стоимости страховых взносов:\n", - "\n", - "Настройка гиперпараметров для задачи регрессии:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Результаты для задачи регрессии:\n", - "Model: Linear Regression\n", - "Best Parameters: {}\n", - "MAE: 4695.532622690401\n", - "MSE: 65643669.229680344\n", - "RMSE: 8102.078080941972\n", - "R²: 0.6878208068254403\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split, GridSearchCV\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.linear_model import LinearRegression\n", - "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.preprocessing import OneHotEncoder\n", - "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", - "\n", - "# Загружаем набор данных\n", - "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", - "\n", - "# Удаление пустого столбца по имени\n", - "df = df.drop('Unnamed: 0', axis=1)\n", - "\n", - "# Определяем категориальные и числовые столбцы\n", - "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", - "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 
'Front camera', 'Number of SIMs']\n", - "\n", - "# Создаем преобразователь для категориальных и числовых столбцов\n", - "preprocessor = ColumnTransformer(\n", - " transformers=[\n", - " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", - " ('num', StandardScaler(), numerical_cols)\n", - " ])\n", - "\n", - "# Список моделей и их гиперпараметров для задачи регрессии\n", - "models_reg = {\n", - " \"Linear Regression\": (LinearRegression(), {}),\n", - " \"Random Forest Regression\": (RandomForestRegressor(), {\n", - " 'model__n_estimators': [100, 200],\n", - " 'model__max_depth': [None, 10, 20]\n", - " }),\n", - " \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n", - " 'model__n_estimators': [100, 200],\n", - " 'model__learning_rate': [0.01, 0.1],\n", - " 'model__max_depth': [3, 5]\n", - " })\n", - "}\n", - "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", - "X_reg = df[categorical_cols + numerical_cols]\n", - "y_reg = df['Price']\n", - "\n", - "# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n", - "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)\n", - "\n", - "# Обучаем и оцениваем модели для задачи регрессии\n", - "print(\"Результаты для задачи регрессии:\")\n", - "for name, (model, params) in models_reg.items():\n", - " pipeline = Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('model', model)\n", - " ])\n", - " grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n", - " grid_search.fit(X_train_reg, y_train_reg)\n", - " best_model = grid_search.best_estimator_\n", - " y_pred_reg = best_model.predict(X_test_reg)\n", - " mae = mean_absolute_error(y_test_reg, y_pred_reg)\n", - " mse = mean_squared_error(y_test_reg, y_pred_reg)\n", - " rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n", - " r2 = r2_score(y_test_reg, y_pred_reg)\n", 
- " print(f\"Model: {name}\")\n", - " print(f\"Best Parameters: {grid_search.best_params_}\")\n", - " print(f\"MAE: {mae}\")\n", - " print(f\"MSE: {mse}\")\n", - " print(f\"RMSE: {rmse}\")\n", - " print(f\"R²: {r2}\")\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "2. Оптимизация тарифной сетки:\n", - "\n", - "Настройка гиперпараметров для задачи классификации:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Результаты для задачи классификации:\n", - "Model: Logistic Regression\n", - "Best Parameters: {'model__C': 1, 'model__solver': 'lbfgs'}\n", - "Accuracy: 0.8713235294117647\n", - "\n", - "Model: Random Forest Classification\n", - "Best Parameters: {'model__max_depth': None, 'model__n_estimators': 200}\n", - "Accuracy: 0.8676470588235294\n", - "\n", - "Model: Gradient Boosting Classification\n", - "Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}\n", - "Accuracy: 0.8602941176470589\n", - "\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split, GridSearchCV\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.preprocessing import OneHotEncoder\n", - "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n", + "from scipy.stats import uniform, randint\n", + "from sklearn.model_selection import RandomizedSearchCV\n", "\n", - "# Загружаем набор данных\n", - "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "# Выбор признаков и целевой 
переменной\n", + "features = [\n", + " 'Brand', 'Battery capacity (mAh)', 'Screen size (inches)', 'Touchscreen', \n", + " 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', \n", + " 'Rear camera', 'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', \n", + " 'Number of SIMs', '3G', '4G/ LTE'\n", + "]\n", + "target = 'PriceCategory' # Целевая переменная: категория цены\n", "\n", - "# Удаление пустого столбца по имени\n", - "df = df.drop('Unnamed: 0', axis=1)\n", + "# Преобразование целевой переменной в категории (например, бюджетные, средний класс, премиум)\n", + "bins = [0, 10000, 30000, float('inf')]\n", + "labels = ['Budget', 'Mid-Range', 'Premium']\n", + "df['PriceCategory'] = pd.cut(df['Price'], bins=bins, labels=labels)\n", "\n", - "# Определяем категориальные и числовые столбцы\n", - "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", - "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Number of SIMs']\n", + "# Преобразование категорий в числа\n", + "label_encoder = LabelEncoder()\n", + "df[target] = label_encoder.fit_transform(df[target])\n", + "\n", + "# Определение категориальных и числовых признаков\n", + "categorical_features = [\n", + " 'Brand', 'Touchscreen', 'Operating system', 'Wi-Fi', \n", + " 'Bluetooth', 'GPS', '3G', '4G/ LTE'\n", + "]\n", + "numeric_features = [\n", + " 'Battery capacity (mAh)', 'Processor', 'Screen size (inches)', 'Resolution x', \n", + " 'Resolution y', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', \n", + " 'Front camera', 'Number of SIMs'\n", + "]\n", + "\n", + "# Препроцессор для обработки числовых и категориальных признаков\n", + "categorical_transformer = Pipeline(steps=[\n", + " ('onehot', OneHotEncoder(handle_unknown='ignore'))\n", + "])\n", + "\n", + 
"numeric_transformer = Pipeline(steps=[\n", + " ('scaler', StandardScaler())\n", + "])\n", "\n", - "# Создаем преобразователь для категориальных и числовых столбцов\n", "preprocessor = ColumnTransformer(\n", " transformers=[\n", - " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", - " ('num', StandardScaler(), numerical_cols)\n", + " ('num', numeric_transformer, numeric_features),\n", + " ('cat', categorical_transformer, categorical_features)\n", " ])\n", "\n", - "# Список моделей и их гиперпараметров для задачи классификации\n", - "models_class = {\n", - " \"Logistic Regression\": (LogisticRegression(), {\n", - " 'model__C': [0.1, 1, 10],\n", - " 'model__solver': ['liblinear', 'lbfgs']\n", - " }),\n", - " \"Random Forest Classification\": (RandomForestClassifier(), {\n", - " 'model__n_estimators': [100, 200],\n", - " 'model__max_depth': [None, 10, 20]\n", - " }),\n", - " \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n", - " 'model__n_estimators': [100, 200],\n", - " 'model__learning_rate': [0.01, 0.1],\n", - " 'model__max_depth': [3, 5]\n", - " })\n", + "# Применение препроцессора\n", + "X = preprocessor.fit_transform(df[features])\n", + "y = df[target]\n", + "\n", + "# Балансировка классов с помощью SMOTE\n", + "smote = SMOTE(random_state=42)\n", + "X_resampled, y_resampled = smote.fit_resample(X, y)\n", + "\n", + "print(pd.Series(y_resampled).value_counts())\n", + "\n", + "# Разделение данных на обучающую и тестовую выборки\n", + "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n", + "\n", + "# Функция для оценки модели\n", + "def evaluate_model(model, X_test, y_test):\n", + " y_pred = model.predict(X_test)\n", + " y_pred_proba = model.predict_proba(X_test)\n", + " \n", + " accuracy = accuracy_score(y_test, y_pred)\n", + " precision = precision_score(y_test, y_pred, average='weighted') \n", + " recall = recall_score(y_test, y_pred, average='weighted') 
\n", + " f1 = f1_score(y_test, y_pred, average='weighted') \n", + " roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')\n", + " \n", + " print(f\"Accuracy: {accuracy:.4f}\")\n", + " print(f\"Precision: {precision:.4f}\")\n", + " print(f\"Recall: {recall:.4f}\")\n", + " print(f\"F1-Score: {f1:.4f}\")\n", + " print(f\"ROC-AUC: {roc_auc:.4f}\")\n", + " \n", + " return {\n", + " 'accuracy': accuracy,\n", + " 'precision': precision,\n", + " 'recall': recall,\n", + " 'f1': f1,\n", + " 'roc_auc': roc_auc\n", + " }\n", + "\n", + "# Логистическая регрессия\n", + "print(\"Гиперпараметры для логистической регрессии:\")\n", + "logreg_param_dist = {\n", + " 'classifier__C': uniform(loc=0, scale=4),\n", + " 'classifier__penalty': ['l1', 'l2'],\n", + " 'classifier__solver': ['liblinear', 'saga']\n", "}\n", "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", - "X_class = df[categorical_cols + numerical_cols]\n", - "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", + "logreg_pipeline = Pipeline([\n", + " ('classifier', LogisticRegression(max_iter=1000, random_state=42))\n", + "])\n", "\n", - "# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n", - "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n", + "logreg_random_search = RandomizedSearchCV(logreg_pipeline, param_distributions=logreg_param_dist, n_iter=50, cv=5, random_state=42, n_jobs=-1)\n", + "logreg_random_search.fit(X_train, y_train)\n", + "logreg_best_model = logreg_random_search.best_estimator_\n", + "logreg_results = evaluate_model(logreg_best_model, X_test, y_test)\n", "\n", - "# Обучаем и оцениваем модели для задачи классификации\n", - "print(\"Результаты для задачи классификации:\")\n", - "for name, (model, params) in models_class.items():\n", - " pipeline = Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('model', model)\n", - " 
])\n", - " grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n", - " grid_search.fit(X_train_class, y_train_class)\n", - " best_model = grid_search.best_estimator_\n", - " y_pred_class = best_model.predict(X_test_class)\n", - " accuracy = accuracy_score(y_test_class, y_pred_class)\n", - " print(f\"Model: {name}\")\n", - " print(f\"Best Parameters: {grid_search.best_params_}\")\n", - " print(f\"Accuracy: {accuracy}\")\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. Прогнозирование стоимости страховых взносов: Задача: Регрессия\n", + "# Случайный лес\n", + "print(\"Гиперпараметры для случайного леса:\")\n", + "rf_param_dist = {\n", + " 'classifier__n_estimators': randint(100, 1000),\n", + " 'classifier__max_depth': [None] + list(randint(10, 100).rvs(10)),\n", + " 'classifier__min_samples_split': randint(2, 20),\n", + " 'classifier__min_samples_leaf': randint(1, 20),\n", + " 'classifier__bootstrap': [True, False]\n", + "}\n", "\n", - "Выбор метрик:\n", + "rf_pipeline = Pipeline([\n", + " ('classifier', RandomForestClassifier(random_state=42))\n", + "])\n", "\n", - "MAE (Mean Absolute Error): Средняя абсолютная ошибка. Показывает среднее отклонение предсказанных значений от фактических. Эта метрика легко интерпретируется, так как она измеряется в тех же единицах, что и целевая переменная (доллары).\n", + "rf_random_search = RandomizedSearchCV(rf_pipeline, param_distributions=rf_param_dist, n_iter=50, cv=5, random_state=42, n_jobs=-1)\n", + "rf_random_search.fit(X_train, y_train)\n", + "rf_best_model = rf_random_search.best_estimator_\n", + "rf_results = evaluate_model(rf_best_model, X_test, y_test)\n", "\n", - "MSE (Mean Squared Error): Среднеквадратичная ошибка. Показывает среднее квадратичное отклонение предсказанных значений от фактических. 
Эта метрика чувствительна к выбросам, так как ошибки возводятся в квадрат.\n", + "# Градиентный бустинг\n", + "print(\"Гиперпараметры для градиентного бустинга:\")\n", + "gb_param_dist = {\n", + " 'classifier__n_estimators': randint(100, 1000),\n", + " 'classifier__learning_rate': uniform(0.01, 0.5),\n", + " 'classifier__max_depth': [None] + list(randint(10, 100).rvs(10)),\n", + " 'classifier__min_samples_split': randint(2, 20),\n", + " 'classifier__min_samples_leaf': randint(1, 20),\n", + " 'classifier__subsample': uniform(0.5, 0.5)\n", + "}\n", "\n", - "RMSE (Root Mean Squared Error): Квадратный корень из среднеквадратичной ошибки. Показывает среднее отклонение предсказанных значений от фактических в тех же единицах, что и целевая переменная. Эта метрика также чувствительна к выбросам, но легче интерпретируется, чем MSE.\n", + "gb_pipeline = Pipeline([\n", + " ('classifier', GradientBoostingClassifier(random_state=42))\n", + "])\n", "\n", - "R² (R-squared): Коэффициент детерминации. Показывает, какую долю дисперсии целевой переменной объясняет модель. 
Значение R² близкое к 1 указывает на хорошее качество модели.\n", + "gb_random_search = RandomizedSearchCV(gb_pipeline, param_distributions=gb_param_dist, n_iter=50, cv=5, random_state=42, n_jobs=-1)\n", + "gb_random_search.fit(X_train, y_train)\n", + "gb_best_model = gb_random_search.best_estimator_\n", + "gb_results = evaluate_model(gb_best_model, X_test, y_test)\n", "\n", - "Обоснование:\n", + "# Вывод результатов\n", + "print(\"\\nРезультаты моделей:\")\n", + "print(\"\\nLogistic Regression:\")\n", + "for metric, value in logreg_results.items():\n", + " print(f\"{metric.capitalize()}: {value:.4f}\")\n", "\n", - "MAE: Хорошо подходит для задач, где важно понимать среднее отклонение предсказаний от фактических значений.\n", + "print(\"\\nRandom Forest:\")\n", + "for metric, value in rf_results.items():\n", + " print(f\"{metric.capitalize()}: {value:.4f}\")\n", "\n", - "MSE и RMSE: Полезны для задач, где важно минимизировать влияние выбросов, так как они возводят ошибки в квадрат.\n", - "\n", - "R²: Позволяет оценить, насколько хорошо модель объясняет вариацию целевой переменной.\n", - "\n", - "2. Оптимизация тарифной сетки: Задача: Классификация\n", - "\n", - "Выбор метрик:\n", - "\n", - "Accuracy: Доля правильных предсказаний среди всех предсказаний. Эта метрика показывает общую точность модели.\n", - "\n", - "Precision: Доля правильных положительных предсказаний среди всех положительных предсказаний. Эта метрика важна, если важно минимизировать количество ложноположительных результатов.\n", - "\n", - "Recall (Sensitivity): Доля правильных положительных предсказаний среди всех фактических положительных случаев. Эта метрика важна, если важно минимизировать количество ложноотрицательных результатов.\n", - "\n", - "F1-score: Гармоническое среднее между precision и recall. 
Эта метрика показывает баланс между precision и recall.\n", - "\n", - "Обоснование:\n", - "\n", - "Accuracy: Хорошо подходит для задач, где классы сбалансированы.\n", - "\n", - "Precision и Recall: Важны для задач, где важно минимизировать ошибки определенного типа (ложноположительные или ложноотрицательные).\n", - "\n", - "F1-score: Позволяет оценить баланс между precision и recall." + "print(\"\\nGradient Boosting:\")\n", + "for metric, value in gb_results.items():\n", + " print(f\"{metric.capitalize()}: {value:.4f}\")" ] }, { @@ -1696,574 +942,88 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Результаты для задачи регрессии:\n", - "Model: Linear Regression\n", - "Best Parameters: {}\n", - "MAE: 4808.184185261903\n", - "MSE: 77191075.38529098\n", - "RMSE: 8785.845171939407\n", - "R²: 0.7071551004885543\n", - "\n" + "ename": "AttributeError", + "evalue": "'dict' object has no attribute 'best_estimator'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[38], line 66\u001b[0m\n\u001b[0;32m 64\u001b[0m plot_roc_curve_multiclass(y_test, y_pred_proba, class_names, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mROC Curve for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodel_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 65\u001b[0m \u001b[38;5;66;03m# Обученные модели\u001b[39;00m\n\u001b[1;32m---> 66\u001b[0m logreg_best_model \u001b[38;5;241m=\u001b[39m \u001b[43mlogreg_results\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbest_estimator\u001b[49m \u001b[38;5;66;03m# Обученная модель\u001b[39;00m\n\u001b[0;32m 67\u001b[0m rf_best_model \u001b[38;5;241m=\u001b[39m rf_results\u001b[38;5;241m.\u001b[39mbest_estimator_ \u001b[38;5;66;03m# Обученная 
модель\u001b[39;00m\n\u001b[0;32m 68\u001b[0m gb_best_model \u001b[38;5;241m=\u001b[39m gb_results\u001b[38;5;241m.\u001b[39mbest_estimator_ \n", + "\u001b[1;31mAttributeError\u001b[0m: 'dict' object has no attribute 'best_estimator'" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", - " warnings.warn(\n", - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: Random Forest Regression\n", - "Best Parameters: {'model__max_depth': None, 'model__n_estimators': 100}\n", - "MAE: 3931.812022058823\n", - "MSE: 99133452.49149376\n", - "RMSE: 9956.578352601548\n", - "R²: 0.6239108499500701\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: Gradient Boosting Regression\n", - "Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}\n", - "MAE: 3741.1262483893015\n", - "MSE: 76106789.83221506\n", - "RMSE: 8723.920553983458\n", - "R²: 0.711268626466102\n", - "\n", - "Результаты для задачи классификации:\n", - "Model: Logistic Regression\n", - "Best Parameters: {'model__C': 1, 'model__solver': 'lbfgs'}\n", - "Accuracy: 0.8713235294117647\n", - "Precision: 0.8035714285714286\n", - "Recall: 0.6521739130434783\n", - "F1-score: 0.72\n", - "\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: Random Forest Classification\n", - "Best Parameters: {'model__max_depth': 20, 'model__n_estimators': 200}\n", - "Accuracy: 0.8676470588235294\n", - "Precision: 0.851063829787234\n", - "Recall: 0.5797101449275363\n", - "F1-score: 0.6896551724137931\n", - "\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: Gradient Boosting Classification\n", - "Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}\n", - "Accuracy: 0.8676470588235294\n", - "Precision: 0.8113207547169812\n", - "Recall: 0.6231884057971014\n", - "F1-score: 0.7049180327868853\n", - "\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", - "from sklearn.model_selection import train_test_split, GridSearchCV\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.linear_model import LinearRegression, LogisticRegression\n", - "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", - "from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.preprocessing import OneHotEncoder\n", - "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay\n", - "\n", - "# Загружаем набор данных\n", - "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", - "\n", - "# Удаление пустого столбца по имени\n", - "df = df.drop('Unnamed: 0', axis=1)\n", - "\n", - "# Определяем категориальные и числовые столбцы\n", - "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", - "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Number of SIMs']\n", - "\n", - "# Создаем преобразователь для категориальных и числовых столбцов\n", - "preprocessor = ColumnTransformer(\n", - " transformers=[\n", - " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", - " ('num', StandardScaler(), numerical_cols)\n", - " ])\n", - "\n", - "# Список моделей и их гиперпараметров для задачи регрессии\n", - "models_reg = {\n", - " \"Linear Regression\": (LinearRegression(), {}),\n", - " \"Random Forest Regression\": (RandomForestRegressor(), {\n", - " 
'model__n_estimators': [100, 200],\n", - " 'model__max_depth': [None, 10, 20]\n", - " }),\n", - " \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n", - " 'model__n_estimators': [100, 200],\n", - " 'model__learning_rate': [0.01, 0.1],\n", - " 'model__max_depth': [3, 5]\n", - " })\n", - "}\n", - "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", - "X_reg = df[categorical_cols + numerical_cols]\n", - "y_reg = df['Price']\n", - "\n", - "# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n", - "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n", - "\n", - "# Обучаем и оцениваем модели для задачи регрессии\n", - "print(\"Результаты для задачи регрессии:\")\n", - "for name, (model, params) in models_reg.items():\n", - " pipeline = Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('model', model)\n", - " ])\n", - " grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n", - " grid_search.fit(X_train_reg, y_train_reg)\n", - " best_model = grid_search.best_estimator_\n", - " y_pred_reg = best_model.predict(X_test_reg)\n", - " mae = mean_absolute_error(y_test_reg, y_pred_reg)\n", - " mse = mean_squared_error(y_test_reg, y_pred_reg)\n", - " rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n", - " r2 = r2_score(y_test_reg, y_pred_reg)\n", - " print(f\"Model: {name}\")\n", - " print(f\"Best Parameters: {grid_search.best_params_}\")\n", - " print(f\"MAE: {mae}\")\n", - " print(f\"MSE: {mse}\")\n", - " print(f\"RMSE: {rmse}\")\n", - " print(f\"R²: {r2}\")\n", - " print()\n", - "\n", - "# Список моделей и их гиперпараметров для задачи классификации\n", - "models_class = {\n", - " \"Logistic Regression\": (LogisticRegression(), {\n", - " 'model__C': [0.1, 1, 10],\n", - " 'model__solver': ['liblinear', 'lbfgs']\n", - " }),\n", - " \"Random Forest Classification\": 
(RandomForestClassifier(), {\n", - " 'model__n_estimators': [100, 200],\n", - " 'model__max_depth': [None, 10, 20]\n", - " }),\n", - " \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n", - " 'model__n_estimators': [100, 200],\n", - " 'model__learning_rate': [0.01, 0.1],\n", - " 'model__max_depth': [3, 5]\n", - " })\n", - "}\n", - "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", - "X_class = df[categorical_cols + numerical_cols]\n", - "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", - "\n", - "# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n", - "X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n", - "\n", - "# Обучаем и оцениваем модели для задачи классификации\n", - "print(\"Результаты для задачи классификации:\")\n", - "for name, (model, params) in models_class.items():\n", - " pipeline = Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('model', model)\n", - " ])\n", - " grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n", - " grid_search.fit(X_train_class, y_train_class)\n", - " best_model = grid_search.best_estimator_\n", - " y_pred_class = best_model.predict(X_test_class)\n", - " accuracy = accuracy_score(y_test_class, y_pred_class)\n", - " precision = precision_score(y_test_class, y_pred_class)\n", - " recall = recall_score(y_test_class, y_pred_class)\n", - " f1 = f1_score(y_test_class, y_pred_class)\n", - " print(f\"Model: {name}\")\n", - " print(f\"Best Parameters: {grid_search.best_params_}\")\n", - " print(f\"Accuracy: {accuracy}\")\n", - " print(f\"Precision: {precision}\")\n", - " print(f\"Recall: {recall}\")\n", - " print(f\"F1-score: {f1}\")\n", - " print()\n", - "\n", - " # Визуализация матрицы ошибок\n", - " cm = confusion_matrix(y_test_class, y_pred_class)\n", - " disp = ConfusionMatrixDisplay(confusion_matrix=cm, 
display_labels=['Less', 'More'])\n", - " disp.plot(cmap=plt.cm.Blues)\n", - " plt.title(f'Confusion Matrix for {name}')\n", - " plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Давайте проанализируем полученные значения метрик и определим, являются ли они нормальными или их можно улучшить.\n", - "\n", - "### Оценка смещения и дисперсии для задачи регрессии:\n", - "#### Вывод для задачи регрессии:\n", - "\n", - "- Random Forest Regression демонстрирует наилучшие результаты по метрикам MAE и R², что указывает на высокую точность и стабильность модели.\n", - "- Linear Regression и Gradient Boosting Regression также показывают хорошие результаты, но уступают случайному лесу.\n", - "\n", - "#### Вывод для задачи классификации:\n", - "- Random Forest Classification демонстрирует наилучшие результаты по всем метрикам (Accuracy, Precision, Recall, F1-score), что указывает на высокую точность и стабильность модели.\n", - "- Logistic Regression и Gradient Boosting Classification также показывают хорошие результаты, но уступают случайному лесу.\n", - "\n", - "Для оценки смещения (bias) и дисперсии (variance) моделей можно использовать метод перекрестной проверки (cross-validation). 
Этот метод позволяет оценить, насколько хорошо модель обобщается на новых данных.\n", - "\n", - "Оценка смещения и дисперсии для задачи регрессии: Для задачи регрессии мы будем использовать метрики MAE (Mean Absolute Error) и R² (R-squared) для оценки смещения и дисперсии.\n", - "\n", - "Оценка смещения и дисперсии для задачи классификации: Для задачи классификации мы будем использовать метрики Accuracy, Precision, Recall и F1-score для оценки смещения и дисперсии.\n", - "\n", - "Пример кода для оценки смещения и дисперсии:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Оценка смещения и дисперсии для задачи регрессии:\n", - "Model: Linear Regression\n", - "MAE (Cross-Validation): Mean = 5393.4164031476985, Std = 2364.7028993279946\n", - "R² (Cross-Validation): Mean = 0.31163924349046784, Std = 0.20066100088089248\n", - "\n", - "Model: Random Forest Regression\n", - "MAE (Cross-Validation): Mean = 4632.622847297591, Std = 2958.2113579489355\n", - "R² (Cross-Validation): Mean = 0.37197535985326297, Std = 0.11953059546045496\n", - "\n", - "Model: Gradient Boosting Regression\n", - "MAE (Cross-Validation): Mean = 4705.057344875317, Std = 2676.698617934764\n", - "R² (Cross-Validation): Mean = 0.30094204368967814, Std = 0.29605037554256863\n", - "\n", - "Оценка смещения и дисперсии для задачи классификации:\n", - "Model: Logistic Regression\n", - "Accuracy (Cross-Validation): Mean = 0.8373806164532234, Std = 0.04588247503012167\n", - "Precision (Cross-Validation): Mean = 0.7753980572437796, Std = 0.128244461080632\n", - "Recall (Cross-Validation): Mean = 0.663888888888889, Std = 0.22281171811283865\n", - "F1-score (Cross-Validation): Mean = 0.6730794905263655, Std = 0.09462791851118425\n", - "\n", - "Model: Random Forest Classification\n", - "Accuracy (Cross-Validation): Mean = 0.835177447362709, Std = 0.04406396622710083\n", - "Precision 
(Cross-Validation): Mean = 0.807362406312931, Std = 0.11894947251838416\n", - "Recall (Cross-Validation): Mean = 0.6194444444444445, Std = 0.2382420182487952\n", - "F1-score (Cross-Validation): Mean = 0.6475067867122541, Std = 0.10752644539783694\n", - "\n", - "Model: Gradient Boosting Classification\n", - "Accuracy (Cross-Validation): Mean = 0.8469530062947689, Std = 0.03006931461001492\n", - "Precision (Cross-Validation): Mean = 0.8072507504934535, Std = 0.11323561235031351\n", - "Recall (Cross-Validation): Mean = 0.6250380517503805, Std = 0.23296204577560678\n", - "F1-score (Cross-Validation): Mean = 0.661152357410075, Std = 0.1082825984327667\n", - "\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from sklearn.model_selection import cross_val_score\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.linear_model import LinearRegression, LogisticRegression\n", - "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", - "from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.preprocessing import OneHotEncoder\n", - "\n", - "# Загружаем набор данных\n", - "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", - "\n", - "# Удаление пустого столбца по имени\n", - "df = df.drop('Unnamed: 0', axis=1)\n", - "\n", - "# Определяем категориальные и числовые столбцы\n", - "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", - "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera', 'Number of SIMs']\n", - "\n", - "# Создаем преобразователь для категориальных и числовых столбцов\n", - "preprocessor = ColumnTransformer(\n", - " transformers=[\n", - " ('cat', 
OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", - " ('num', StandardScaler(), numerical_cols)\n", - " ])\n", - "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", - "X_reg = df[categorical_cols + numerical_cols]\n", - "y_reg = df['Price']\n", - "\n", - "# Список моделей для задачи регрессии\n", - "models_reg = {\n", - " \"Linear Regression\": LinearRegression(),\n", - " \"Random Forest Regression\": RandomForestRegressor(),\n", - " \"Gradient Boosting Regression\": GradientBoostingRegressor()\n", - "}\n", - "\n", - "# Оценка смещения и дисперсии для задачи регрессии\n", - "print(\"Оценка смещения и дисперсии для задачи регрессии:\")\n", - "for name, model in models_reg.items():\n", - " pipeline = Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('model', model)\n", - " ])\n", - " mae_scores = -cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='neg_mean_absolute_error')\n", - " r2_scores = cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='r2')\n", - " print(f\"Model: {name}\")\n", - " print(f\"MAE (Cross-Validation): Mean = {mae_scores.mean()}, Std = {mae_scores.std()}\")\n", - " print(f\"R² (Cross-Validation): Mean = {r2_scores.mean()}, Std = {r2_scores.std()}\")\n", - " print()\n", - "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", - "X_class = df[categorical_cols + numerical_cols]\n", - "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", - "\n", - "# Список моделей для задачи классификации\n", - "models_class = {\n", - " \"Logistic Regression\": LogisticRegression(),\n", - " \"Random Forest Classification\": RandomForestClassifier(),\n", - " \"Gradient Boosting Classification\": GradientBoostingClassifier()\n", - "}\n", - "\n", - "# Оценка смещения и дисперсии для задачи классификации\n", - "print(\"Оценка смещения и дисперсии для задачи классификации:\")\n", - "for name, model in models_class.items():\n", - " pipeline = 
Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('model', model)\n", - " ])\n", - " accuracy_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='accuracy')\n", - " precision_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='precision')\n", - " recall_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='recall')\n", - " f1_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='f1')\n", - " print(f\"Model: {name}\")\n", - " print(f\"Accuracy (Cross-Validation): Mean = {accuracy_scores.mean()}, Std = {accuracy_scores.std()}\")\n", - " print(f\"Precision (Cross-Validation): Mean = {precision_scores.mean()}, Std = {precision_scores.std()}\")\n", - " print(f\"Recall (Cross-Validation): Mean = {recall_scores.mean()}, Std = {recall_scores.std()}\")\n", - " print(f\"F1-score (Cross-Validation): Mean = {f1_scores.mean()}, Std = {f1_scores.std()}\")\n", - " print()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import pandas as pd\n", "import matplotlib.pyplot as plt\n", - "from sklearn.model_selection import cross_val_score\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.linear_model import LinearRegression, LogisticRegression\n", - "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", - "from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score\n", + "from sklearn.preprocessing import label_binarize\n", + "import numpy as np\n", "\n", - "# Загружаем набор данных\n", - "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n", + "def plot_confusion_matrix(y_true, y_pred, title):\n", + " cm = confusion_matrix(y_true, y_pred)\n", + " sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)\n", + " plt.title(title)\n", + " plt.xlabel('Предсказанные значения')\n", + " plt.ylabel('Истинные значения')\n", + " plt.show()\n", "\n", - "# Удаление пустого столбца по имени\n", - "df = df.drop('Unnamed: 0', axis=1)\n", + "def plot_roc_curve_multiclass(y_true, y_pred_proba, class_names, title):\n", + " # Бинаризация меток для мультиклассовой классификации\n", + " y_true_bin = label_binarize(y_true, classes=np.unique(y_true))\n", + " n_classes = y_true_bin.shape[1]\n", "\n", - "# Определяем категориальные и числовые столбцы\n", - "categorical_cols = ['Name', 'Brand', 'Model', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n", - "numerical_cols = ['Battery capacity (mAh)', 'Screen size (inches)', 'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 
'Front camera', 'Number of SIMs']\n", + " # Вычисление ROC-кривых для каждого класса\n", + " fpr = dict()\n", + " tpr = dict()\n", + " roc_auc = dict()\n", + " for i in range(n_classes):\n", + " fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])\n", + " roc_auc[i] = auc(fpr[i], tpr[i])\n", "\n", - "# Создаем преобразователь для категориальных и числовых столбцов\n", - "preprocessor = ColumnTransformer(\n", - " transformers=[\n", - " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),\n", - " ('num', StandardScaler(), numerical_cols)\n", - " ])\n", + " # Построение ROC-кривых\n", + " plt.figure(figsize=(8, 6))\n", + " colors = ['blue', 'red', 'green', 'purple', 'orange'] # Цвета для каждого класса\n", + " for i, color in zip(range(n_classes), colors):\n", + " plt.plot(fpr[i], tpr[i], color=color, lw=2,\n", + " label=f'ROC curve (class {class_names[i]}, area = {roc_auc[i]:.2f})')\n", "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n", - "X_reg = df[categorical_cols + numerical_cols]\n", - "y_reg = df['Price']\n", + " plt.plot([0, 1], [0, 1], 'k--', lw=2)\n", + " plt.xlim([0.0, 1.0])\n", + " plt.ylim([0.0, 1.05])\n", + " plt.xlabel('False Positive Rate')\n", + " plt.ylabel('True Positive Rate')\n", + " plt.title(title)\n", + " plt.legend(loc=\"lower right\")\n", + " plt.show()\n", "\n", - "# Список моделей для задачи регрессии\n", - "models_reg = {\n", - " \"Linear Regression\": LinearRegression(),\n", - " \"Random Forest Regression\": RandomForestRegressor(),\n", - " \"Gradient Boosting Regression\": GradientBoostingRegressor()\n", - "}\n", + "def evaluate_and_plot_model(model, X_test, y_test, model_name, class_names):\n", + " y_pred = model.predict(X_test)\n", + " y_pred_proba = model.predict_proba(X_test)\n", + " \n", + " # Метрики\n", + " accuracy = accuracy_score(y_test, y_pred)\n", + " precision = precision_score(y_test, y_pred, average='weighted')\n", + " recall = recall_score(y_test, 
y_pred, average='weighted')\n", + " f1 = f1_score(y_test, y_pred, average='weighted')\n", + " roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted')\n", + " \n", + " print(f\"{model_name} Metrics:\")\n", + " print(f\"Accuracy: {accuracy:.4f}\")\n", + " print(f\"Precision: {precision:.4f}\")\n", + " print(f\"Recall: {recall:.4f}\")\n", + " print(f\"F1-Score: {f1:.4f}\")\n", + " print(f\"ROC-AUC: {roc_auc:.4f}\")\n", + " \n", + " # Визуализация\n", + " plot_confusion_matrix(y_test, y_pred, f'Confusion Matrix for {model_name}')\n", + " plot_roc_curve_multiclass(y_test, y_pred_proba, class_names, f'ROC Curve for {model_name}')\n", "\n", - "# Оценка смещения и дисперсии для задачи регрессии\n", - "mae_means = []\n", - "mae_stds = []\n", - "r2_means = []\n", - "r2_stds = []\n", - "\n", - "for name, model in models_reg.items():\n", - " pipeline = Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('model', model)\n", - " ])\n", - " mae_scores = -cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='neg_mean_absolute_error')\n", - " r2_scores = cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='r2')\n", - " mae_means.append(mae_scores.mean())\n", - " mae_stds.append(mae_scores.std())\n", - " r2_means.append(r2_scores.mean())\n", - " r2_stds.append(r2_scores.std())\n", - "\n", - "# Визуализация результатов для задачи регрессии\n", - "fig, ax = plt.subplots(1, 2, figsize=(12, 6))\n", - "\n", - "ax[0].bar(models_reg.keys(), mae_means, yerr=mae_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", - "ax[0].set_ylabel('MAE')\n", - "ax[0].set_title('Mean Absolute Error (MAE) for Regression Models')\n", - "ax[0].yaxis.grid(True)\n", - "\n", - "ax[1].bar(models_reg.keys(), r2_means, yerr=r2_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", - "ax[1].set_ylabel('R²')\n", - "ax[1].set_title('R-squared (R²) for Regression Models')\n", - "ax[1].yaxis.grid(True)\n", - "\n", - "plt.tight_layout()\n", - 
"plt.show()\n", - "\n", - "# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n", - "X_class = df[categorical_cols + numerical_cols]\n", - "y_class = (df['Price'] > df['Price'].mean()).astype(int)\n", - "\n", - "# Список моделей для задачи классификации\n", - "models_class = {\n", - " \"Logistic Regression\": LogisticRegression(),\n", - " \"Random Forest Classification\": RandomForestClassifier(),\n", - " \"Gradient Boosting Classification\": GradientBoostingClassifier()\n", - "}\n", - "\n", - "# Оценка смещения и дисперсии для задачи классификации\n", - "accuracy_means = []\n", - "accuracy_stds = []\n", - "precision_means = []\n", - "precision_stds = []\n", - "recall_means = []\n", - "recall_stds = []\n", - "f1_means = []\n", - "f1_stds = []\n", - "\n", - "for name, model in models_class.items():\n", - " pipeline = Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('model', model)\n", - " ])\n", - " accuracy_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='accuracy')\n", - " precision_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='precision')\n", - " recall_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='recall')\n", - " f1_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='f1')\n", - " accuracy_means.append(accuracy_scores.mean())\n", - " accuracy_stds.append(accuracy_scores.std())\n", - " precision_means.append(precision_scores.mean())\n", - " precision_stds.append(precision_scores.std())\n", - " recall_means.append(recall_scores.mean())\n", - " recall_stds.append(recall_scores.std())\n", - " f1_means.append(f1_scores.mean())\n", - " f1_stds.append(f1_scores.std())\n", - "\n", - "# Визуализация результатов для задачи классификации\n", - "fig, ax = plt.subplots(2, 2, figsize=(12, 12))\n", - "\n", - "ax[0, 0].bar(models_class.keys(), accuracy_means, yerr=accuracy_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", - "ax[0, 
0].set_ylabel('Accuracy')\n", - "ax[0, 0].set_title('Accuracy for Classification Models')\n", - "ax[0, 0].yaxis.grid(True)\n", - "\n", - "ax[0, 1].bar(models_class.keys(), precision_means, yerr=precision_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", - "ax[0, 1].set_ylabel('Precision')\n", - "ax[0, 1].set_title('Precision for Classification Models')\n", - "ax[0, 1].yaxis.grid(True)\n", - "\n", - "ax[1, 0].bar(models_class.keys(), recall_means, yerr=recall_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", - "ax[1, 0].set_ylabel('Recall')\n", - "ax[1, 0].set_title('Recall for Classification Models')\n", - "ax[1, 0].yaxis.grid(True)\n", - "\n", - "ax[1, 1].bar(models_class.keys(), f1_means, yerr=f1_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n", - "ax[1, 1].set_ylabel('F1-score')\n", - "ax[1, 1].set_title('F1-score for Classification Models')\n", - "ax[1, 1].yaxis.grid(True)\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" + "# Пример использования\n", + "class_names = ['Budget', 'Mid-Range', 'Premium'] # Замените на ваши классы\n", + "evaluate_and_plot_model(logreg_best_model, X_test, y_test, 'Logistic Regression', class_names)\n", + "evaluate_and_plot_model(rf_best_model, X_test, y_test, 'Random Forest', class_names)\n", + "evaluate_and_plot_model(gb_best_model, X_test, y_test, 'Gradient Boosting', class_names)" ] } ], From 8d8ec3b4c3c5307607ed3e57964e3cdb8edbf3a4 Mon Sep 17 00:00:00 2001 From: "a.puchkina" Date: Thu, 12 Dec 2024 17:27:43 +0400 Subject: [PATCH 4/4] =?UTF-8?q?=D0=BE=D1=88=D1=80=D0=B8=D0=B1=D0=BA=D0=B8?= =?UTF-8?q?=20=D0=BE=D1=88=D0=B8=D0=B1=D0=BA=D0=B8=20=D0=BE=D1=82=D0=BA?= =?UTF-8?q?=D1=83=D0=B4=D0=B0=20=D0=B2=D1=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_4/lab4.ipynb | 212 ++++++++++++++++++++++++++--------------------- 1 file changed, 116 insertions(+), 96 deletions(-) diff --git a/lab_4/lab4.ipynb b/lab_4/lab4.ipynb index 
61ff186..208db8e 100644 --- a/lab_4/lab4.ipynb +++ b/lab_4/lab4.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -417,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -471,7 +471,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -526,7 +526,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -673,7 +673,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -708,7 +708,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -717,55 +717,49 @@ "text": [ "PriceCategory\n", "1 946\n", - "2 946\n", "0 946\n", "Name: count, dtype: int64\n", - "Гиперпараметры для логистической регрессии:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.8891\n", - "Precision: 0.8888\n", - "Recall: 0.8891\n", - "F1-Score: 0.8881\n", - "ROC-AUC: 0.9674\n", + "Гиперпараметры для логистической регрессии:\n", + "Accuracy: 0.8760\n", + "Precision: 0.8784\n", + "Recall: 0.8760\n", + "F1-Score: 0.8762\n", + "ROC-AUC: 0.9521\n", "Гиперпараметры для случайного леса:\n", - "Accuracy: 0.9525\n", - "Precision: 0.9540\n", - "Recall: 0.9525\n", - "F1-Score: 0.9524\n", - "ROC-AUC: 0.9929\n", - "Гиперпараметры для градиентного бустинга:\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - 
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[6], line 148\u001b[0m\n\u001b[0;32m 143\u001b[0m gb_pipeline \u001b[38;5;241m=\u001b[39m Pipeline([\n\u001b[0;32m 144\u001b[0m (\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassifier\u001b[39m\u001b[38;5;124m'\u001b[39m, GradientBoostingClassifier(random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m))\n\u001b[0;32m 145\u001b[0m ])\n\u001b[0;32m 147\u001b[0m gb_random_search \u001b[38;5;241m=\u001b[39m RandomizedSearchCV(gb_pipeline, param_distributions\u001b[38;5;241m=\u001b[39mgb_param_dist, n_iter\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m50\u001b[39m, cv\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m, n_jobs\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m--> 148\u001b[0m \u001b[43mgb_random_search\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 149\u001b[0m gb_best_model \u001b[38;5;241m=\u001b[39m gb_random_search\u001b[38;5;241m.\u001b[39mbest_estimator_\n\u001b[0;32m 150\u001b[0m gb_results \u001b[38;5;241m=\u001b[39m evaluate_model(gb_best_model, X_test, y_test)\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m 
prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1019\u001b[0m, in \u001b[0;36mBaseSearchCV.fit\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 1013\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_format_results(\n\u001b[0;32m 1014\u001b[0m all_candidate_params, n_splits, all_out, all_more_results\n\u001b[0;32m 1015\u001b[0m )\n\u001b[0;32m 1017\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m results\n\u001b[1;32m-> 1019\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevaluate_candidates\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1021\u001b[0m \u001b[38;5;66;03m# multimetric is determined here because in the case of a callable\u001b[39;00m\n\u001b[0;32m 1022\u001b[0m \u001b[38;5;66;03m# self.scoring the return type is only known after calling\u001b[39;00m\n\u001b[0;32m 1023\u001b[0m first_test_score \u001b[38;5;241m=\u001b[39m all_out[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_scores\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:1960\u001b[0m, in \u001b[0;36mRandomizedSearchCV._run_search\u001b[1;34m(self, 
evaluate_candidates)\u001b[0m\n\u001b[0;32m 1958\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_run_search\u001b[39m(\u001b[38;5;28mself\u001b[39m, evaluate_candidates):\n\u001b[0;32m 1959\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Search n_iter candidates from param_distributions\"\"\"\u001b[39;00m\n\u001b[1;32m-> 1960\u001b[0m \u001b[43mevaluate_candidates\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1961\u001b[0m \u001b[43m \u001b[49m\u001b[43mParameterSampler\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1962\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparam_distributions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_iter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrandom_state\u001b[49m\n\u001b[0;32m 1963\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1964\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_search.py:965\u001b[0m, in \u001b[0;36mBaseSearchCV.fit..evaluate_candidates\u001b[1;34m(candidate_params, cv, more_results)\u001b[0m\n\u001b[0;32m 957\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mverbose \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 958\u001b[0m \u001b[38;5;28mprint\u001b[39m(\n\u001b[0;32m 959\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFitting \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;124m folds for each of \u001b[39m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;124m candidates,\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 960\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m totalling \u001b[39m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m fits\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 961\u001b[0m n_splits, n_candidates, n_candidates \u001b[38;5;241m*\u001b[39m n_splits\n\u001b[0;32m 962\u001b[0m )\n\u001b[0;32m 963\u001b[0m )\n\u001b[1;32m--> 965\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mparallel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 966\u001b[0m \u001b[43m \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_fit_and_score\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 967\u001b[0m \u001b[43m \u001b[49m\u001b[43mclone\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbase_estimator\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 968\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 969\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 970\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 971\u001b[0m \u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 972\u001b[0m \u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparameters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 973\u001b[0m \u001b[43m \u001b[49m\u001b[43msplit_progress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msplit_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_splits\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 974\u001b[0m \u001b[43m \u001b[49m\u001b[43mcandidate_progress\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcand_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mn_candidates\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 975\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_and_score_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 976\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 977\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mcand_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mproduct\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 978\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcandidate_params\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 979\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplitter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 980\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 981\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 983\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(out) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m 984\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 985\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo fits were performed. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 986\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWas the CV iterator empty? \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 987\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWere there no candidates?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 988\u001b[0m )\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:74\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 69\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m 70\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 71\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m 73\u001b[0m )\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterable_with_config\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:2007\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 2001\u001b[0m \u001b[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001b[39;00m\n\u001b[0;32m 2002\u001b[0m \u001b[38;5;66;03m# progress until it enters the Try/Except block of the 
generator and\u001b[39;00m\n\u001b[0;32m 2003\u001b[0m \u001b[38;5;66;03m# reaches the first `yield` statement. This starts the asynchronous\u001b[39;00m\n\u001b[0;32m 2004\u001b[0m \u001b[38;5;66;03m# dispatch of the tasks to the workers.\u001b[39;00m\n\u001b[0;32m 2005\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 2007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1650\u001b[0m, in \u001b[0;36mParallel._get_outputs\u001b[1;34m(self, iterator, pre_dispatch)\u001b[0m\n\u001b[0;32m 1647\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[0;32m 1649\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backend\u001b[38;5;241m.\u001b[39mretrieval_context():\n\u001b[1;32m-> 1650\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_retrieve()\n\u001b[0;32m 1652\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mGeneratorExit\u001b[39;00m:\n\u001b[0;32m 1653\u001b[0m \u001b[38;5;66;03m# The generator has been garbage collected before being fully\u001b[39;00m\n\u001b[0;32m 1654\u001b[0m \u001b[38;5;66;03m# consumed. 
This aborts the remaining tasks if possible and warn\u001b[39;00m\n\u001b[0;32m 1655\u001b[0m \u001b[38;5;66;03m# the user if necessary.\u001b[39;00m\n\u001b[0;32m 1656\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", - "File \u001b[1;32md:\\ULSTU\\AIM2\\AIM-PIbd-32-Puchkina-A-A\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1762\u001b[0m, in \u001b[0;36mParallel._retrieve\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1757\u001b[0m \u001b[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001b[39;00m\n\u001b[0;32m 1758\u001b[0m \u001b[38;5;66;03m# async callbacks to progress.\u001b[39;00m\n\u001b[0;32m 1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ((\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m\n\u001b[0;32m 1760\u001b[0m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mget_status(\n\u001b[0;32m 1761\u001b[0m timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtimeout) \u001b[38;5;241m==\u001b[39m TASK_PENDING)):\n\u001b[1;32m-> 1762\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0.01\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1763\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m 1765\u001b[0m \u001b[38;5;66;03m# We need to be careful: the job list can be filling up as\u001b[39;00m\n\u001b[0;32m 1766\u001b[0m \u001b[38;5;66;03m# we empty it and Python list are not thread-safe by\u001b[39;00m\n\u001b[0;32m 1767\u001b[0m \u001b[38;5;66;03m# default hence the use of the lock\u001b[39;00m\n", - "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + "Accuracy: 0.9393\n", + "Precision: 0.9395\n", + "Recall: 0.9393\n", 
+ "F1-Score: 0.9392\n", + "ROC-AUC: 0.9833\n", + "Гиперпараметры для градиентного бустинга:\n", + "Accuracy: 0.9261\n", + "Precision: 0.9261\n", + "Recall: 0.9261\n", + "F1-Score: 0.9261\n", + "ROC-AUC: 0.9777\n", + "\n", + "Результаты моделей:\n", + "\n", + "Logistic Regression:\n", + "Accuracy: 0.8760\n", + "Precision: 0.8784\n", + "Recall: 0.8760\n", + "F1: 0.8762\n", + "Roc_auc: 0.9521\n", + "\n", + "Random Forest:\n", + "Accuracy: 0.9393\n", + "Precision: 0.9395\n", + "Recall: 0.9393\n", + "F1: 0.9392\n", + "Roc_auc: 0.9833\n", + "\n", + "Gradient Boosting:\n", + "Accuracy: 0.9261\n", + "Precision: 0.9261\n", + "Recall: 0.9261\n", + "F1: 0.9261\n", + "Roc_auc: 0.9777\n" ] } ], @@ -793,7 +787,7 @@ "target = 'PriceCategory' # Целевая переменная: категория цены\n", "\n", "# Преобразование целевой переменной в категории (например, бюджетные, средний класс, премиум)\n", - "bins = [0, 10000, 30000, float('inf')]\n", + "bins = [0, 10000, 60000, float('inf')]\n", "labels = ['Budget', 'Mid-Range', 'Premium']\n", "df['PriceCategory'] = pd.cut(df['Price'], bins=bins, labels=labels)\n", "\n", @@ -843,7 +837,7 @@ "# Функция для оценки модели\n", "def evaluate_model(model, X_test, y_test):\n", " y_pred = model.predict(X_test)\n", - " y_pred_proba = model.predict_proba(X_test)\n", + " y_pred_proba = model.predict_proba(X_test)[:, 1]\n", " \n", " accuracy = accuracy_score(y_test, y_pred)\n", " precision = precision_score(y_test, y_pred, average='weighted') \n", @@ -938,19 +932,74 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "'dict' object has no attribute 'best_estimator'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[38], line 66\u001b[0m\n\u001b[0;32m 64\u001b[0m 
plot_roc_curve_multiclass(y_test, y_pred_proba, class_names, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mROC Curve for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodel_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 65\u001b[0m \u001b[38;5;66;03m# Обученные модели\u001b[39;00m\n\u001b[1;32m---> 66\u001b[0m logreg_best_model \u001b[38;5;241m=\u001b[39m \u001b[43mlogreg_results\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbest_estimator\u001b[49m \u001b[38;5;66;03m# Обученная модель\u001b[39;00m\n\u001b[0;32m 67\u001b[0m rf_best_model \u001b[38;5;241m=\u001b[39m rf_results\u001b[38;5;241m.\u001b[39mbest_estimator_ \u001b[38;5;66;03m# Обученная модель\u001b[39;00m\n\u001b[0;32m 68\u001b[0m gb_best_model \u001b[38;5;241m=\u001b[39m gb_results\u001b[38;5;241m.\u001b[39mbest_estimator_ \n", - "\u001b[1;31mAttributeError\u001b[0m: 'dict' object has no attribute 'best_estimator'" + "name": "stdout", + "output_type": "stream", + "text": [ + "Logistic Regression Metrics:\n", + "Accuracy: 0.8760\n", + "Precision: 0.8784\n", + "Recall: 0.8760\n", + "F1-Score: 0.8762\n", + "ROC-AUC: 0.9521\n" ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Random Forest Metrics:\n", + "Accuracy: 0.9393\n", + "Precision: 0.9395\n", + "Recall: 0.9393\n", + "F1-Score: 0.9392\n", + "ROC-AUC: 0.9833\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Gradient Boosting Metrics:\n", + "Accuracy: 0.9261\n", + "Precision: 0.9261\n", + "Recall: 0.9261\n", + "F1-Score: 0.9261\n", + "ROC-AUC: 0.9777\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -960,6 +1009,7 @@ "from sklearn.preprocessing import label_binarize\n", "import numpy as np\n", "\n", + "# Функция для построения матрицы ошибок\n", "def plot_confusion_matrix(y_true, y_pred, title):\n", " cm = confusion_matrix(y_true, y_pred)\n", " sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)\n", @@ -968,39 +1018,10 @@ " plt.ylabel('Истинные значения')\n", " plt.show()\n", "\n", - "def plot_roc_curve_multiclass(y_true, y_pred_proba, class_names, title):\n", - " # Бинаризация меток для мультиклассовой классификации\n", - " y_true_bin = label_binarize(y_true, classes=np.unique(y_true))\n", - " n_classes = y_true_bin.shape[1]\n", - "\n", - " # Вычисление ROC-кривых для каждого класса\n", - " fpr = dict()\n", - " tpr = dict()\n", - " roc_auc = dict()\n", - " for i in range(n_classes):\n", - " fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])\n", - " roc_auc[i] = auc(fpr[i], tpr[i])\n", - "\n", - " # Построение ROC-кривых\n", - " plt.figure(figsize=(8, 6))\n", - " colors = ['blue', 'red', 'green', 'purple', 'orange'] # Цвета для каждого класса\n", - " for i, color in zip(range(n_classes), colors):\n", - " plt.plot(fpr[i], tpr[i], color=color, lw=2,\n", - " label=f'ROC curve (class {class_names[i]}, area = {roc_auc[i]:.2f})')\n", - "\n", - " plt.plot([0, 1], [0, 1], 'k--', lw=2)\n", - " plt.xlim([0.0, 1.0])\n", - " plt.ylim([0.0, 1.05])\n", - " plt.xlabel('False Positive Rate')\n", - " plt.ylabel('True Positive Rate')\n", - " plt.title(title)\n", - " plt.legend(loc=\"lower right\")\n", - " plt.show()\n", - "\n", + "# Функция для оценки и визуализации модели\n", "def evaluate_and_plot_model(model, X_test, y_test, model_name, class_names):\n", " y_pred = model.predict(X_test)\n", - " y_pred_proba = model.predict_proba(X_test)\n", - " \n", + " y_pred_proba = model.predict_proba(X_test)[:, 1]\n", " # Метрики\n", " accuracy = accuracy_score(y_test, y_pred)\n", 
" precision = precision_score(y_test, y_pred, average='weighted')\n", @@ -1017,10 +1038,9 @@ " \n", " # Визуализация\n", " plot_confusion_matrix(y_test, y_pred, f'Confusion Matrix for {model_name}')\n", - " plot_roc_curve_multiclass(y_test, y_pred_proba, class_names, f'ROC Curve for {model_name}')\n", "\n", "# Пример использования\n", - "class_names = ['Budget', 'Mid-Range', 'Premium'] # Замените на ваши классы\n", + "class_names = ['Budget', 'Mid-Range', 'Premium']\n", "evaluate_and_plot_model(logreg_best_model, X_test, y_test, 'Logistic Regression', class_names)\n", "evaluate_and_plot_model(rf_best_model, X_test, y_test, 'Random Forest', class_names)\n", "evaluate_and_plot_model(gb_best_model, X_test, y_test, 'Gradient Boosting', class_names)"