{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Начало лабораторной работы" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Unnamed: 0', 'Name', 'Rating', 'Spec_score', 'No_of_sim', 'Ram',\n", " 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version',\n", " 'Price', 'company', 'Inbuilt_memory', 'fast_charging',\n", " 'Screen_resolution', 'Processor', 'Processor_name'],\n", " dtype='object')\n", "Index(['Unnamed: 0', 'Name', 'Brand', 'Model', 'Battery capacity (mAh)',\n", " 'Screen size (inches)', 'Touchscreen', 'Resolution x', 'Resolution y',\n", " 'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera',\n", " 'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS',\n", " 'Number of SIMs', '3G', '4G/ LTE', 'Price'],\n", " dtype='object')\n" ] } ], "source": [
 "import pandas as pd\n",
 "\n",
 "# Read both candidate datasets so that their column sets can be compared.\n",
 "df1 = pd.read_csv(\"..//static//csv//mobile phone price prediction.csv\")\n",
 "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n",
 "\n",
 "# Print the column index of each frame, in load order.\n",
 "for loaded_frame in (df1, df):\n",
 "    print(loaded_frame.columns)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Бизнес-цели:\n", "1. Прогнозирование цен на мобильные телефоны: \n", "\n", "Цель: Разработать точную модель машинного обучения для прогнозирования цен на мобильные телефоны на основе характеристик.\n", "\n", "Применение:\n", "Оптимизация ценообразования для производителей и продавцов.\n", "Помощь потребителям в принятии обоснованных решений при покупке.\n", "\n", "2. Оптимизация характеристик продукта:\n", "\n", "Цель: Определить оптимальный набор характеристик для мобильных телефонов, максимизирующий ценность для потребителей при заданном бюджете.\n", "\n", "Применение:\n", "Разработка новых моделей мобильных телефонов.\n", "Улучшение существующих моделей." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1. 
Прогнозирование цен на мобильные телефоны " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import pandas as pd\n",
 "\n",
 "# Load the NDTV phone dataset and drop the leftover CSV index column.\n",
 "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n",
 "df = df.drop('Unnamed: 0', axis=1)\n",
 "\n",
 "# Fixed seed; nothing in this cell consumes it.\n",
 "random_state = 42\n",
 "\n",
 "# Mean price over the whole dataset.\n",
 "average_price = df['Price'].mean()\n",
 "print(f\"Среднее значение поля 'Price': {average_price}\")\n",
 "\n",
 "# Binary flag: 1 when a phone costs more than the dataset-wide mean, else 0.\n",
 "df['above_average_price'] = (df['Price'] > average_price).astype(int)\n",
 "\n",
 "# Price spread (max - min) inside each brand. The dataset-wide spread is a single\n",
 "# scalar, so assigning it would put the same constant into every row and the\n",
 "# column would carry no information.\n",
 "brand_prices = df.groupby('Brand')['Price']\n",
 "df['price_volatility'] = brand_prices.transform('max') - brand_prices.transform('min')\n",
 "\n",
 "# Show the first rows to check the new columns (rich display instead of print).\n",
 "df.head(20)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. Оптимизация характеристик продукта " ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Средняя цена для 'Brand':\n", "Brand\n", "10.or 5999.000000\n", "Acer 6470.000000\n", "Alcatel 9201.600000\n", "Apple 45510.470588\n", "Aqua 4599.000000\n", " ... \n", "Zopo 6027.916667\n", "Zuk 29124.000000\n", "iBall 4051.000000\n", "iVoomi 4018.875000\n", "mPhone 6949.000000\n", "Name: Price, Length: 76, dtype: float64\n", "\n", "Средняя цена для 'Battery capacity (mAh)':\n", "Battery capacity (mAh)\n", "1010 3499.000000\n", "1050 4790.000000\n", "1200 3396.000000\n", "1250 2498.500000\n", "1300 3338.333333\n", " ... \n", "5000 13542.530612\n", "5020 23900.000000\n", "5100 16999.000000\n", "5300 8999.000000\n", "6000 19832.333333\n", "Name: Price, Length: 165, dtype: float64\n", "\n", "Средняя цена для 'Screen size (inches)':\n", "Screen size (inches)\n", "2.40 1249.000000\n", "2.44 6999.000000\n", "2.45 2999.000000\n", "2.60 5555.000000\n", "2.80 5345.000000\n", " ... 
\n", "6.70 58896.428571\n", "6.80 79699.000000\n", "6.90 92999.000000\n", "7.00 6199.000000\n", "7.30 164999.000000\n", "Name: Price, Length: 80, dtype: float64\n", "\n", "Средняя цена для 'Touchscreen':\n", "Touchscreen\n", "No 5255.411765\n", "Yes 11544.497019\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'Resolution x':\n", "Resolution x\n", "240 3819.888889\n", "320 2499.000000\n", "360 5797.500000\n", "400 2860.500000\n", "480 4128.602459\n", "540 7445.848485\n", "560 15990.000000\n", "600 4829.000000\n", "640 7697.200000\n", "720 7619.360684\n", "750 25999.000000\n", "768 8872.000000\n", "800 2190.000000\n", "828 54199.500000\n", "850 4115.000000\n", "854 4415.666667\n", "1024 6199.000000\n", "1080 17165.865952\n", "1125 75632.666667\n", "1176 77299.000000\n", "1242 88449.500000\n", "1280 7144.333333\n", "1440 34240.461538\n", "1520 7935.000000\n", "1536 164999.000000\n", "1600 29950.000000\n", "1880 20800.000000\n", "2160 45500.000000\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'Resolution y':\n", "Resolution y\n", "320 3811.125000\n", "480 4264.818182\n", "485 3299.000000\n", "584 4499.000000\n", "600 6199.000000\n", "640 4095.000000\n", "720 8539.900000\n", "800 4189.321429\n", "854 4087.901316\n", "960 6921.651163\n", "1080 7583.800000\n", "1136 12749.000000\n", "1280 7449.136674\n", "1290 14990.000000\n", "1334 25999.000000\n", "1440 6930.808824\n", "1480 13596.166667\n", "1498 4490.000000\n", "1500 9083.125000\n", "1520 8139.425000\n", "1544 10010.000000\n", "1548 8499.000000\n", "1560 9135.750000\n", "1580 9950.000000\n", "1600 9476.111111\n", "1620 21492.000000\n", "1792 54199.500000\n", "1820 6498.000000\n", "1920 14698.088496\n", "2152 164999.000000\n", "2160 17465.194444\n", "2220 20531.333333\n", "2240 49990.000000\n", "2244 18000.000000\n", "2246 16177.200000\n", "2248 39990.000000\n", "2270 8961.000000\n", "2280 19073.190476\n", "2310 25120.000000\n", "2316 29999.000000\n", "2340 19768.017241\n", "2400 
32131.083333\n", "2436 75632.666667\n", "2520 11844.500000\n", "2560 26501.476190\n", "2636 174990.000000\n", "2688 88449.500000\n", "2880 34993.250000\n", "2960 38425.714286\n", "3040 81799.500000\n", "3120 45710.000000\n", "3200 77999.000000\n", "3840 45500.000000\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'Processor':\n", "Processor\n", "1 7861.095238\n", "2 7929.444444\n", "4 7209.401171\n", "6 39365.400000\n", "8 16193.927434\n", "10 8542.000000\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'RAM (MB)':\n", "RAM (MB)\n", "64 4249.000000\n", "256 2625.625000\n", "289 3190.000000\n", "384 5555.000000\n", "512 3419.847222\n", "768 4749.500000\n", "1000 5747.082153\n", "2000 8645.608187\n", "3000 11587.829787\n", "4000 17741.965000\n", "6000 28291.587302\n", "8000 45080.586207\n", "12000 99173.750000\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'Internal storage (GB)':\n", "Internal storage (GB)\n", "0.064 3499.000000\n", "0.128 4999.000000\n", "0.160 5555.000000\n", "0.512 3390.444444\n", "1.000 5197.000000\n", "2.000 2399.000000\n", "3.000 3190.000000\n", "4.000 4513.032258\n", "8.000 5311.654206\n", "16.000 8379.403341\n", "32.000 12089.644366\n", "64.000 21208.814607\n", "128.000 29477.043478\n", "256.000 68682.777778\n", "512.000 164999.000000\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'Rear camera':\n", "Rear camera\n", "0.0 4193.500000\n", "0.3 3054.666667\n", "2.0 3000.804878\n", "3.0 4329.333333\n", "3.2 3506.727273\n", "5.0 4413.183486\n", "8.0 6658.109966\n", "8.7 8999.000000\n", "10.0 10499.250000\n", "12.0 28533.950980\n", "12.2 29162.500000\n", "12.3 30569.250000\n", "13.0 9531.776018\n", "13.1 32490.000000\n", "13.2 7496.333333\n", "15.0 88719.000000\n", "16.0 17923.896825\n", "18.0 19999.000000\n", "19.0 39095.000000\n", "20.0 23739.777778\n", "20.7 35718.600000\n", "21.0 14513.900000\n", "21.5 6999.000000\n", "23.0 31966.090909\n", "24.0 18587.250000\n", "25.0 13980.000000\n", "32.0 
19589.000000\n", "40.0 61389.750000\n", "41.0 10990.000000\n", "48.0 24205.615385\n", "64.0 19332.333333\n", "108.0 92999.000000\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'Front camera':\n", "Front camera\n", "0.0 6799.333333\n", "0.3 4398.430233\n", "0.9 19999.000000\n", "1.1 18745.000000\n", "1.2 18579.333333\n", "1.3 5640.352941\n", "1.5 18748.000000\n", "1.6 15833.000000\n", "1.9 11246.750000\n", "2.0 6711.995122\n", "2.1 24839.666667\n", "2.2 35823.333333\n", "2.4 5061.333333\n", "3.0 8500.000000\n", "3.2 4660.900000\n", "3.7 28925.000000\n", "4.0 13519.285714\n", "5.0 8347.064655\n", "7.0 49061.625000\n", "8.0 13894.422222\n", "10.0 105114.166667\n", "12.0 41929.500000\n", "13.0 11380.436364\n", "16.0 17646.113402\n", "20.0 13498.520000\n", "24.0 21860.562500\n", "25.0 19182.692308\n", "32.0 23749.782609\n", "40.0 92999.000000\n", "48.0 27999.000000\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'Operating system':\n", "Operating system\n", "Android 10989.947652\n", "BlackBerry 10509.600000\n", "Cyanogen 9173.600000\n", "Sailfish 4799.000000\n", "Tizen 4459.666667\n", "Windows 16706.684211\n", "iOS 45510.470588\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'Wi-Fi':\n", "Wi-Fi\n", "No 6334.375000\n", "Yes 11496.211695\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'Bluetooth':\n", "Bluetooth\n", "No 7794.466667\n", "Yes 11506.800595\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'GPS':\n", "GPS\n", "No 8316.148148\n", "Yes 11737.740208\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для 'Number of SIMs':\n", "Number of SIMs\n", "1 16479.718062\n", "2 10465.580018\n", "3 4590.000000\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для '3G':\n", "3G\n", "No 11616.882759\n", "Yes 11447.783361\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для '4G/ LTE':\n", "4G/ LTE\n", "No 7922.270893\n", "Yes 12680.858696\n", "Name: Price, dtype: float64\n", "\n", "Средняя 
цена для комбинации 'Brand' и 'Operating system':\n", "Brand Operating system\n", "10.or Android 5999.000000\n", "Acer Android 6470.000000\n", "Alcatel Android 9201.600000\n", "Apple iOS 45510.470588\n", "Aqua Android 4599.000000\n", " ... \n", "Zopo Android 6027.916667\n", "Zuk Android 29124.000000\n", "iBall Android 4051.000000\n", "iVoomi Android 4018.875000\n", "mPhone Android 6949.000000\n", "Name: Price, Length: 85, dtype: float64\n", "\n", "Средняя цена для комбинации 'RAM (MB)' и 'Internal storage (GB)':\n", "RAM (MB) Internal storage (GB)\n", "64 0.064 3499.000000\n", " 0.128 4999.000000\n", "256 0.512 2287.857143\n", " 8.000 4990.000000\n", "289 3.000 3190.000000\n", "384 0.160 5555.000000\n", "512 0.512 7249.500000\n", " 2.000 2399.000000\n", " 4.000 3222.244444\n", " 8.000 3513.750000\n", "768 4.000 6500.000000\n", " 8.000 2999.000000\n", "1000 1.000 5197.000000\n", " 4.000 8019.187500\n", " 8.000 5398.572464\n", " 16.000 6772.263158\n", " 32.000 6997.000000\n", "2000 8.000 6458.736842\n", " 16.000 8518.889273\n", " 32.000 10155.272727\n", " 64.000 36999.000000\n", "3000 16.000 9082.082192\n", " 32.000 11742.751269\n", " 64.000 24287.833333\n", "4000 32.000 14827.288462\n", " 64.000 19201.702290\n", " 128.000 15408.882353\n", "6000 64.000 26554.636364\n", " 128.000 29692.000000\n", " 256.000 44999.000000\n", "8000 64.000 54990.000000\n", " 128.000 37177.181818\n", " 256.000 72408.166667\n", "12000 128.000 92999.000000\n", " 256.000 69348.500000\n", " 512.000 164999.000000\n", "Name: Price, dtype: float64\n", "\n", "Средняя цена для комбинации 'Screen size (inches)' и 'Battery capacity (mAh)':\n", "Screen size (inches) Battery capacity (mAh)\n", "2.40 2000 1249.0\n", "2.44 1450 6999.0\n", "2.45 1500 2999.0\n", "2.60 1350 5555.0\n", "2.80 1200 3190.0\n", " ... 
\n", "6.70 4500 40259.0\n", "6.80 4300 79699.0\n", "6.90 5000 92999.0\n", "7.00 3450 6199.0\n", "7.30 4380 164999.0\n", "Name: Price, Length: 431, dtype: float64\n", "\n" ] } ], "source": [
 "import pandas as pd\n",
 "\n",
 "# Load the NDTV phone dataset and drop the leftover CSV index column in one step.\n",
 "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\").drop('Unnamed: 0', axis=1)\n",
 "\n",
 "# Fixed seed; nothing in this cell consumes it.\n",
 "random_state = 42\n",
 "\n",
 "# Features summarised one at a time ('Model' is left out: it differs for every phone).\n",
 "single_features = [\n",
 "    'Brand', 'Battery capacity (mAh)', 'Screen size (inches)', 'Touchscreen',\n",
 "    'Resolution x', 'Resolution y', 'Processor', 'RAM (MB)', 'Internal storage (GB)',\n",
 "    'Rear camera', 'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS',\n",
 "    'Number of SIMs', '3G', '4G/ LTE',\n",
 "]\n",
 "\n",
 "# Feature pairs summarised together.\n",
 "feature_pairs = [\n",
 "    ('Brand', 'Operating system'),\n",
 "    ('RAM (MB)', 'Internal storage (GB)'),\n",
 "    ('Screen size (inches)', 'Battery capacity (mAh)'),\n",
 "]\n",
 "\n",
 "# Mean price for every distinct value of each single feature.\n",
 "for feature in single_features:\n",
 "    print(f\"Средняя цена для '{feature}':\")\n",
 "    print(df.groupby(feature)['Price'].mean())\n",
 "    print()\n",
 "\n",
 "# Mean price for every observed value combination of each feature pair.\n",
 "for first_feature, second_feature in feature_pairs:\n",
 "    print(f\"Средняя цена для комбинации '{first_feature}' и '{second_feature}':\")\n",
 "    print(df.groupby([first_feature, second_feature])['Price'].mean())\n",
 "    print()"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Выбор ориентира для каждой задачи:\n", "\n", "1. 
Прогнозирование цен на мобильные телефоны:\n", "Ориентир:\n", "\n", "R² (коэффициент детерминации): 0.75 - 0.85\n", "\n", "MAE (средняя абсолютная ошибка): 5000 - 10000 рублей\n", "\n", "RMSE (корень из среднеквадратичной ошибки): 10000 - 20000 рублей\n", "\n", "Объяснение:\n", "\n", "R²: Значение 0.75 - 0.85 будет означать, что модель объясняет 75-85% вариации цен на мобильные телефоны, что является хорошим результатом для задачи регрессии.\n", "\n", "MAE: Значение 5000 - 10000 рублей будет означать, что в среднем модель ошибается на 5000 - 10000 рублей при прогнозировании цен на мобильные телефоны. Учитывая диапазон цен в наборе данных (примерно от 500 до 175000 рублей), этот ориентир является разумным.\n", "\n", "RMSE: Значение 10000 - 20000 рублей будет означать, что корень из среднеквадратичной ошибки модели составляет 10000 - 20000 рублей.\n", "\n", "2. Оптимизация характеристик продукта:\n", "Ориентир:\n", "\n", "Увеличение прибыли компании: 5% - 10%\n", "\n", "Сохранение конкурентоспособных цен:\n", "\n", "Средняя цена мобильных телефонов не должна увеличиваться более чем на 5% по сравнению с текущими ценами.\n", "\n", "Доля клиентов, считающих цены высокими, не должна увеличиваться более чем на 2%.\n", "\n", "Объяснение:\n", "\n", "Увеличение прибыли компании: \n", "\n", "Цель оптимизации характеристик продукта - максимизировать прибыль компании. Ориентир в 5% - 10% увеличения прибыли является реалистичным и достижимым.\n", "\n", "Сохранение конкурентоспособных цен: \n", "\n", "Важно, чтобы оптимизация характеристик продукта не привела к значительному увеличению цен для клиентов. Ориентир в 5% увеличения средней цены мобильных телефонов и 2% увеличения доли клиентов, считающих цены высокими, позволяет сохранить конкурентоспособность компании." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import pandas as pd\n",
 "from sklearn.model_selection import train_test_split\n",
 "from sklearn.preprocessing import StandardScaler\n",
 "from sklearn.linear_model import LinearRegression\n",
 "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
 "\n",
 "# Load the NDTV phone dataset and drop the leftover CSV index column.\n",
 "df = pd.read_csv(\"..//static//csv//ndtv_data_final.csv\")\n",
 "df = df.drop('Unnamed: 0', axis=1)\n",
 "\n",
 "# 'Name' and 'Model' identify individual phones, so one-hot encoding them adds\n",
 "# roughly one column per row and gives the model nothing it can reuse on unseen\n",
 "# phones. They are dropped; only the genuinely categorical columns are encoded.\n",
 "# The encoded copy gets its own name so that `df` keeps the raw columns for the\n",
 "# group-by summaries further down.\n",
 "identifier_columns = ['Name', 'Model']\n",
 "categorical_columns = ['Brand', 'Touchscreen', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS', '3G', '4G/ LTE']\n",
 "df_encoded = pd.get_dummies(df.drop(columns=identifier_columns), columns=categorical_columns, drop_first=True)\n",
 "\n",
 "# Split into features (X) and the target (y).\n",
 "X = df_encoded.drop('Price', axis=1)\n",
 "y = df_encoded['Price']\n",
 "\n",
 "# Hold out 20% of the rows for testing.\n",
 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
 "\n",
 "# Standardise the features; the scaler is fitted on the training split only.\n",
 "scaler = StandardScaler()\n",
 "X_train = scaler.fit_transform(X_train)\n",
 "X_test = scaler.transform(X_test)\n",
 "\n",
 "# Fit a linear regression and predict on the held-out split.\n",
 "model = LinearRegression()\n",
 "model.fit(X_train, y_train)\n",
 "y_pred = model.predict(X_test)\n",
 "\n",
 "# Quality metrics on the test split.\n",
 "mae = mean_absolute_error(y_test, y_pred)\n",
 "mse = mean_squared_error(y_test, y_pred)\n",
 "# mean_squared_error(..., squared=False) is deprecated since scikit-learn 1.4 and\n",
 "# removed in 1.6; the square root of the MSE is the same RMSE on every version.\n",
 "rmse = mse ** 0.5\n",
 "r2 = r2_score(y_test, y_pred)\n",
 "\n",
 "print(f\"MAE: {mae}\")\n",
 "print(f\"MSE: {mse}\")\n",
 "print(f\"RMSE: {rmse}\")\n",
 "print(f\"R²: {r2}\")\n",
 "\n",
 "# Compare against the benchmarks: R² >= 0.75, MAE <= 10000, RMSE <= 20000.\n",
 "if r2 >= 0.75 and mae <= 10000 and rmse <= 20000:\n",
 "    print(\"Ориентиры для прогнозирования цены мобильных устройств достигнуты!\")\n",
 "else:\n",
 "    print(\"Ориентиры для прогнозирования цены мобильных устройств не достигнуты.\")\n",
 "\n",
 "# Product-characteristics summaries: mean price per value of selected numeric features.\n",
 "columns_to_group = ['Battery capacity (mAh)', 'Screen size (inches)', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera', 'Front camera']\n",
 "for column in columns_to_group:\n",
 "    print(f\"Средняя цена для '{column}':\")\n",
 "    print(df.groupby(column)['Price'].mean())\n",
 "    print()\n",
 "\n",
 "# Mean price for every observed value combination of selected feature pairs.\n",
 "column_pairs_to_group = [\n",
 "    ('Battery capacity (mAh)', 'RAM (MB)'),\n",
 "    ('Screen size (inches)', 'Internal storage (GB)'),\n",
 "    ('Rear camera', 'Front camera'),\n",
 "]\n",
 "for first_column, second_column in column_pairs_to_group:\n",
 "    print(f\"Средняя цена для комбинации '{first_column}' и '{second_column}':\")\n",
 "    print(df.groupby([first_column, second_column])['Price'].mean())\n",
 "    print()"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Анализ применимости алгоритмов обучения с учителем для решения поставленных задач:\n", "\n", "1. Прогнозирование стоимости страховых взносов: Задача: Регрессия\n", "\n", "Свойства алгоритмов:\n", "\n", "Линейная регрессия: Применимость: Хорошо подходит для задач, где зависимость между признаками и целевой переменной линейна. Преимущества: Проста в реализации, интерпретируема. Недостатки: Может плохо работать, если зависимость нелинейна.\n", "\n", "Деревья решений (регрессия): Применимость: Подходит для задач с нелинейными зависимостями. 
Преимущества: Могут обрабатывать категориальные признаки, не требуют масштабирования данных. Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n", "\n", "Случайный лес (регрессия): Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков. Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки. Недостатки: Менее интерпретируем, чем линейная регрессия.\n", "\n", "Градиентный бустинг (регрессия): Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками. Преимущества: Может достигать высокой точности, устойчив к переобучению. Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n", "\n", "Нейронные сети (регрессия): Применимость: Подходят для задач с очень сложными зависимостями и большим количеством данных. Преимущества: Могут моделировать очень сложные зависимости. Недостатки: Требуют большого количества данных, сложнее в настройке и интерпретации.\n", "\n", "#### Вывод:\n", "\n", "Линейная регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n", "\n", "Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n", "\n", "Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n", "\n", "Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно.\n", "\n", "2. Оптимизация характеристик продукта: Задача: Классификация (отнесение телефонов к ценовым категориям, например выше или ниже средней цены)\n", "\n", "Свойства алгоритмов:\n", "\n", "Логистическая регрессия: Применимость: Хорошо подходит для задач бинарной классификации, где зависимость между признаками и целевой переменной линейна. Преимущества: Проста в реализации, интерпретируема. 
Недостатки: Может плохо работать, если зависимость нелинейна.\n", "\n", "Деревья решений (классификация): Применимость: Подходят для задач с нелинейными зависимостями. Преимущества: Могут обрабатывать категориальные признаки, не требуют масштабирования данных. Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n", "\n", "Случайный лес (классификация): Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков. Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки. Недостатки: Менее интерпретируем, чем логистическая регрессия.\n", "\n", "Градиентный бустинг (классификация): Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками. Преимущества: Может достигать высокой точности, устойчив к переобучению. Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n", "\n", "Нейронные сети (классификация): Применимость: Подходят для задач с очень сложными зависимостями и большим количеством данных. Преимущества: Могут моделировать очень сложные зависимости. Недостатки: Требуют большого количества данных, сложнее в настройке и интерпретации.\n", "\n", "#### Вывод:\n", "\n", "Логистическая регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n", "\n", "Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n", "\n", "Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n", "\n", "Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно.\n", "\n", "1. Прогнозирование цен на мобильные телефоны: Выбранные модели:\n", "\n", "Линейная регрессия\n", "\n", "Случайный лес (регрессия)\n", "\n", "Градиентный бустинг (регрессия)\n", "\n", "2. 
Оптимизация характеристик продукта: Выбранные модели:\n", "\n", "Логистическая регрессия\n", "\n", "Случайный лес (классификация)\n", "\n", "Градиентный бустинг (классификация)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"import pandas as pd\n",
"from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor\n",
"from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# One seed shared by the splits and the tree ensembles so that reruns give the same numbers\n",
"SEED = 42\n",
"\n",
"\n",
"def first_number(column):\n",
"    \"\"\"Return the first number found in each value of `column` as float; NaN when there is none.\"\"\"\n",
"    digits = column.astype(str).str.extract(r'(\\d+(?:\\.\\d+)?)', expand=False)\n",
"    return pd.to_numeric(digits, errors='coerce')\n",
"\n",
"\n",
"def numeric_features(frame, target):\n",
"    \"\"\"Return the numeric and boolean columns of `frame`, without `target`, cast to float.\"\"\"\n",
"    return frame.drop(columns=[target]).select_dtypes(include=['number', 'bool']).astype(float)\n",
"\n",
"\n",
"def fit_predict(model, X_train, y_train, X_test):\n",
"    \"\"\"Fit `model` behind median imputation and standardisation, then predict for `X_test`.\n",
"\n",
"    The imputer and the scaler are fitted on the training part only, so no\n",
"    test-set statistics leak into training. `model` itself is fitted in place.\n",
"    \"\"\"\n",
"    pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), model)\n",
"    pipeline.fit(X_train, y_train)\n",
"    return pipeline.predict(X_test)\n",
"\n",
"\n",
"# Battery, Ram, Camera, Display, company, Processor_name and Rating are columns of the\n",
"# 'mobile phone price prediction' dataset; ndtv_data_final.csv has none of them.\n",
"# A separate name is used so the notebook-wide `df` is not overwritten.\n",
"phones_raw = pd.read_csv('..//static//csv//mobile phone price prediction.csv')\n",
"\n",
"# Drop the unnamed index column if the CSV carries one\n",
"phones = phones_raw.drop(columns=['Unnamed: 0'], errors='ignore')\n",
"\n",
"# Turn free-text specs into numbers, e.g. '9.0 (Pie)' -> 9.0; values without digits become NaN\n",
"for spec in ['Battery', 'Ram', 'Camera', 'Display', 'Android_version']:\n",
"    phones[spec] = first_number(phones[spec])\n",
"\n",
"# Price is presumably stored as text with thousands separators (TODO confirm);\n",
"# stripping commas is harmless if it is already numeric\n",
"phones['Price'] = pd.to_numeric(phones['Price'].astype(str).str.replace(',', '', regex=False), errors='coerce')\n",
"phones['Rating'] = pd.to_numeric(phones['Rating'], errors='coerce')\n",
"\n",
"# Rows without a usable target cannot be used for training or evaluation\n",
"phones = phones.dropna(subset=['Price', 'Rating']).reset_index(drop=True)\n",
"\n",
"# One-hot encode the categorical columns\n",
"phones = pd.get_dummies(phones, columns=['company', 'Processor_name'], drop_first=True)\n",
"\n",
"# --- Regression: predict Price ---\n",
"# Only numeric columns are used as features; the remaining text columns are left out\n",
"X_reg = numeric_features(phones, 'Price')\n",
"y_reg = phones['Price']\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=SEED)\n",
"\n",
"models_reg = {\n",
"    'Linear Regression': LinearRegression(),\n",
"    'Random Forest Regression': RandomForestRegressor(random_state=SEED),\n",
"    'Gradient Boosting Regression': GradientBoostingRegressor(random_state=SEED),\n",
"}\n",
"\n",
"print('Результаты для задачи регрессии:')\n",
"for name, model in models_reg.items():\n",
"    y_pred_reg = fit_predict(model, X_train_reg, y_train_reg, X_test_reg)\n",
"    mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
"    mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
"    # Square root taken by hand: the squared=False argument is deprecated in recent scikit-learn releases\n",
"    rmse = mse ** 0.5\n",
"    r2 = r2_score(y_test_reg, y_pred_reg)\n",
"    print(f'Model: {name}')\n",
"    print(f'MAE: {mae}')\n",
"    print(f'MSE: {mse}')\n",
"    print(f'RMSE: {rmse}')\n",
"    print(f'R²: {r2}')\n",
"    print()\n",
"\n",
"# --- Classification: is the rating above the dataset mean? ---\n",
"X_class = numeric_features(phones, 'Rating')\n",
"y_class = (phones['Rating'] > phones['Rating'].mean()).astype(int)\n",
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=SEED)\n",
"\n",
"models_class = {\n",
"    # max_iter raised above the default so the solver has room to converge on the one-hot features\n",
"    'Logistic Regression': LogisticRegression(max_iter=1000),\n",
"    'Random Forest Classification': RandomForestClassifier(random_state=SEED),\n",
"    'Gradient Boosting Classification': GradientBoostingClassifier(random_state=SEED),\n",
"}\n",
"\n",
"print('Результаты для задачи классификации:')\n",
"for name, model in models_class.items():\n",
"    y_pred_class = fit_predict(model, X_train_class, y_train_class, X_test_class)\n",
"    accuracy = accuracy_score(y_test_class, y_pred_class)\n",
"    print(f'Model: {name}')\n",
"    print(f'Accuracy: {accuracy}')\n",
"    print()"
] } ], "metadata": { "kernelspec": { "display_name": "aimenv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 2 }