AIM-PIbd-31-Alekseev-I-S/Lab_2/Lab2.ipynb

626 lines
152 KiB
Plaintext
Raw Normal View History

2024-10-12 01:22:15 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1-й Датасет: Mobile Phone Price Prediction"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://www.kaggle.com/datasets/dewangmoghe/mobile-phone-price-prediction?resource=download\n",
"\n",
"* Из названия датасета (описание на Kaggle не было представлено) очевидно, что объектами исследования являются смартфоны.\n",
"* Атрибуты объектов: Unnamed: 0 (id), Name, Rating, Spec_score, No_of_sim, Ram, Battery, Display, Camera, External_Memory, Android_version, Price, company, Inbuilt_memory, fast_charging, Screen_resolution, Processor, Processor_name\n",
"* Очевидная цель этого датасета - научиться предсказывать цену смартфона."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"количество колонок: 18\n",
"колонки: Unnamed: 0, Name, Rating, Spec_score, No_of_sim, Ram, Battery, Display, Camera, External_Memory, Android_version, Price, company, Inbuilt_memory, fast_charging, Screen_resolution, Processor, Processor_name\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1370 entries, 0 to 1369\n",
"Data columns (total 18 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Unnamed: 0 1370 non-null int64 \n",
" 1 Name 1370 non-null object \n",
" 2 Rating 1370 non-null float64\n",
" 3 Spec_score 1370 non-null int64 \n",
" 4 No_of_sim 1370 non-null object \n",
" 5 Ram 1370 non-null object \n",
" 6 Battery 1370 non-null object \n",
" 7 Display 1370 non-null object \n",
" 8 Camera 1370 non-null object \n",
" 9 External_Memory 1370 non-null object \n",
" 10 Android_version 927 non-null object \n",
" 11 Price 1370 non-null object \n",
" 12 company 1370 non-null object \n",
" 13 Inbuilt_memory 1351 non-null object \n",
" 14 fast_charging 1281 non-null object \n",
" 15 Screen_resolution 1368 non-null object \n",
" 16 Processor 1342 non-null object \n",
" 17 Processor_name 1370 non-null object \n",
"dtypes: float64(1), int64(2), object(15)\n",
"memory usage: 192.8+ KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Name</th>\n",
" <th>Rating</th>\n",
" <th>Spec_score</th>\n",
" <th>No_of_sim</th>\n",
" <th>Ram</th>\n",
" <th>Battery</th>\n",
" <th>Display</th>\n",
" <th>Camera</th>\n",
" <th>External_Memory</th>\n",
" <th>Android_version</th>\n",
" <th>Price</th>\n",
" <th>company</th>\n",
" <th>Inbuilt_memory</th>\n",
" <th>fast_charging</th>\n",
" <th>Screen_resolution</th>\n",
" <th>Processor</th>\n",
" <th>Processor_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Samsung Galaxy F14 5G</td>\n",
" <td>4.65</td>\n",
" <td>68</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>6000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP Dual Rear &amp;amp; 13 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>13</td>\n",
" <td>9,999</td>\n",
" <td>Samsung</td>\n",
" <td>128 GB inbuilt</td>\n",
" <td>25W Fast Charging</td>\n",
" <td>2408 x 1080 px Display with Water Drop Notch</td>\n",
" <td>Octa Core Processor</td>\n",
" <td>Exynos 1330</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Samsung Galaxy A11</td>\n",
" <td>4.20</td>\n",
" <td>63</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>2 GB RAM</td>\n",
" <td>4000 mAh Battery</td>\n",
" <td>6.4 inches</td>\n",
" <td>13 MP + 5 MP + 2 MP Triple Rear &amp;amp; 8 MP Fro...</td>\n",
" <td>Memory Card Supported, upto 512 GB</td>\n",
" <td>10</td>\n",
" <td>9,990</td>\n",
" <td>Samsung</td>\n",
" <td>32 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1560 px Display with Punch Hole</td>\n",
" <td>1.8 GHz Processor</td>\n",
" <td>Octa Core</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Samsung Galaxy A13</td>\n",
" <td>4.30</td>\n",
" <td>75</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP Quad Rear &amp;amp; 8 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>12</td>\n",
" <td>11,999</td>\n",
" <td>Samsung</td>\n",
" <td>64 GB inbuilt</td>\n",
" <td>25W Fast Charging</td>\n",
" <td>1080 x 2408 px Display with Water Drop Notch</td>\n",
" <td>2 GHz Processor</td>\n",
" <td>Octa Core</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Samsung Galaxy F23</td>\n",
" <td>4.10</td>\n",
" <td>73</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>6000 mAh Battery</td>\n",
" <td>6.4 inches</td>\n",
" <td>48 MP Quad Rear &amp;amp; 13 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>12</td>\n",
" <td>11,999</td>\n",
" <td>Samsung</td>\n",
" <td>64 GB inbuilt</td>\n",
" <td>NaN</td>\n",
" <td>720 x 1600 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Helio G88</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>Samsung Galaxy A03s (4GB RAM + 64GB)</td>\n",
" <td>4.10</td>\n",
" <td>69</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.5 inches</td>\n",
" <td>13 MP + 2 MP + 2 MP Triple Rear &amp;amp; 5 MP Fro...</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>11</td>\n",
" <td>11,999</td>\n",
" <td>Samsung</td>\n",
" <td>64 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1600 px Display with Water Drop Notch</td>\n",
" <td>Octa Core</td>\n",
" <td>Helio P35</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Name Rating Spec_score \\\n",
"0 0 Samsung Galaxy F14 5G 4.65 68 \n",
"1 1 Samsung Galaxy A11 4.20 63 \n",
"2 2 Samsung Galaxy A13 4.30 75 \n",
"3 3 Samsung Galaxy F23 4.10 73 \n",
"4 4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69 \n",
"\n",
" No_of_sim Ram Battery Display \\\n",
"0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches \n",
"1 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches \n",
"2 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches \n",
"3 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n",
"4 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n",
"\n",
" Camera \\\n",
"0 50 MP + 2 MP Dual Rear &amp; 13 MP Front Camera \n",
"1 13 MP + 5 MP + 2 MP Triple Rear &amp; 8 MP Fro... \n",
"2 50 MP Quad Rear &amp; 8 MP Front Camera \n",
"3 48 MP Quad Rear &amp; 13 MP Front Camera \n",
"4 13 MP + 2 MP + 2 MP Triple Rear &amp; 5 MP Fro... \n",
"\n",
" External_Memory Android_version Price company \\\n",
"0 Memory Card Supported, upto 1 TB 13 9,999 Samsung \n",
"1 Memory Card Supported, upto 512 GB 10 9,990 Samsung \n",
"2 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
"3 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
"4 Memory Card Supported, upto 1 TB 11 11,999 Samsung \n",
"\n",
" Inbuilt_memory fast_charging \\\n",
"0 128 GB inbuilt 25W Fast Charging \n",
"1 32 GB inbuilt 15W Fast Charging \n",
"2 64 GB inbuilt 25W Fast Charging \n",
"3 64 GB inbuilt NaN \n",
"4 64 GB inbuilt 15W Fast Charging \n",
"\n",
" Screen_resolution Processor \\\n",
"0 2408 x 1080 px Display with Water Drop Notch Octa Core Processor \n",
"1 720 x 1560 px Display with Punch Hole 1.8 GHz Processor \n",
"2 1080 x 2408 px Display with Water Drop Notch 2 GHz Processor \n",
"3 720 x 1600 px Octa Core \n",
"4 720 x 1600 px Display with Water Drop Notch Octa Core \n",
"\n",
" Processor_name \n",
"0 Exynos 1330 \n",
"1 Octa Core \n",
"2 Octa Core \n",
"3 Helio G88 \n",
"4 Helio P35 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"df = pd.read_csv(\".//static//csv//mppp.csv\", sep=\",\")\n",
"print('количество колонок: ' + str(df.columns.size)) \n",
"print('колонки: ' + ', '.join(df.columns))\n",
"\n",
"df.info()\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Получение сведений о пропущенных данных\n",
"\n",
"Типы пропущенных данных:\n",
"\n",
"* None - представление пустых данных в Python\n",
"* NaN - представление пустых данных в Pandas\n",
"* '' - пустая строка"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unnamed: 0 0\n",
"Name 0\n",
"Rating 0\n",
"Spec_score 0\n",
"No_of_sim 0\n",
"Ram 0\n",
"Battery 0\n",
"Display 0\n",
"Camera 0\n",
"External_Memory 0\n",
"Android_version 443\n",
"Price 0\n",
"company 0\n",
"Inbuilt_memory 19\n",
"fast_charging 89\n",
"Screen_resolution 2\n",
"Processor 28\n",
"Processor_name 0\n",
"dtype: int64\n",
"\n",
"Unnamed: 0 False\n",
"Name False\n",
"Rating False\n",
"Spec_score False\n",
"No_of_sim False\n",
"Ram False\n",
"Battery False\n",
"Display False\n",
"Camera False\n",
"External_Memory False\n",
"Android_version True\n",
"Price False\n",
"company False\n",
"Inbuilt_memory True\n",
"fast_charging True\n",
"Screen_resolution True\n",
"Processor True\n",
"Processor_name False\n",
"dtype: bool\n",
"\n",
"Android_version процент пустых значений: %32.34\n",
"Inbuilt_memory процент пустых значений: %1.39\n",
"fast_charging процент пустых значений: %6.50\n",
"Screen_resolution процент пустых значений: %0.15\n",
"Processor процент пустых значений: %2.04\n"
]
}
],
"source": [
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Судя по статистике выше, пустые значения всё-таки присутствуют. Наиболее корректным решением будет заполнение пропущенных данных, потому что объекты с пропущенными значениями составляют значительную часть датасета (только в колонке Android_version пропуски есть у 32% объектов), и их удаление привело бы к большой потере данных. Так как все атрибуты с пропусками имеют строковый тип данных, воспользуемся заполнением наиболее частым значением (модой)."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unnamed: 0 0\n",
"Name 0\n",
"Rating 0\n",
"Spec_score 0\n",
"No_of_sim 0\n",
"Ram 0\n",
"Battery 0\n",
"Display 0\n",
"Camera 0\n",
"External_Memory 0\n",
"Android_version 0\n",
"Price 0\n",
"company 0\n",
"Inbuilt_memory 0\n",
"fast_charging 0\n",
"Screen_resolution 0\n",
"Processor 0\n",
"Processor_name 0\n",
"dtype: int64\n",
"\n",
"Unnamed: 0 False\n",
"Name False\n",
"Rating False\n",
"Spec_score False\n",
"No_of_sim False\n",
"Ram False\n",
"Battery False\n",
"Display False\n",
"Camera False\n",
"External_Memory False\n",
"Android_version False\n",
"Price False\n",
"company False\n",
"Inbuilt_memory False\n",
"fast_charging False\n",
"Screen_resolution False\n",
"Processor False\n",
"Processor_name False\n",
"dtype: bool\n",
"\n"
]
}
],
"source": [
"df['Inbuilt_memory'].fillna(df['Inbuilt_memory'].mode()[0], inplace=True)\n",
"df['Processor'].fillna(df['Processor'].mode()[0], inplace=True)\n",
"df['Android_version'].fillna(df['Android_version'].mode()[0], inplace=True)\n",
"df['fast_charging'].fillna(df['fast_charging'].mode()[0], inplace=True)\n",
"df['Screen_resolution'].fillna(df['Screen_resolution'].mode()[0], inplace=True)\n",
"\n",
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверим выбросы и устраним их:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Колонка Rating:\n",
" Есть выбросы: Нет\n",
" Количество выбросов: 0\n",
" Минимальное значение: 3.75\n",
" Максимальное значение: 4.75\n",
" 1-й квартиль (Q1): 4.15\n",
" 3-й квартиль (Q3): 4.55\n",
"\n",
"Колонка Spec_score:\n",
" Есть выбросы: Нет\n",
" Количество выбросов: 0\n",
" Минимальное значение: 58.5\n",
" Максимальное значение: 98.0\n",
" 1-й квартиль (Q1): 75.0\n",
" 3-й квартиль (Q3): 86.0\n",
"\n"
]
}
],
"source": [
"numeric_columns = ['Name', 'Rating', 'Spec_score', 'No_of_sim', 'Ram', 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version', 'Price', 'company', 'Inbuilt_memory', 'fast_charging', 'Screen_resolution', 'Processor', 'Processor_name']\n",
"for column in numeric_columns:\n",
" if pd.api.types.is_numeric_dtype(df[column]): # Проверяем, является ли колонка числовой\n",
" q1 = df[column].quantile(0.25) # Находим 1-й квартиль (Q1)\n",
" q3 = df[column].quantile(0.75) # Находим 3-й квартиль (Q3)\n",
" iqr = q3 - q1 # Вычисляем межквартильный размах (IQR)\n",
"\n",
" # Определяем границы для выбросов\n",
" lower_bound = q1 - 1.5 * iqr # Нижняя граница\n",
" upper_bound = q3 + 1.5 * iqr # Верхняя граница\n",
"\n",
" # Подсчитываем количество выбросов\n",
" outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]\n",
" outlier_count = outliers.shape[0]\n",
"\n",
" # Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю\n",
" df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n",
"\n",
" print(f\"Колонка {column}:\")\n",
" print(f\" Есть выбросы: {'Да' if outlier_count > 0 else 'Нет'}\")\n",
" print(f\" Количество выбросов: {outlier_count}\")\n",
" print(f\" Минимальное значение: {df[column].min()}\")\n",
" print(f\" Максимальное значение: {df[column].max()}\")\n",
" print(f\" 1-й квартиль (Q1): {q1}\")\n",
" print(f\" 3-й квартиль (Q3): {q3}\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Постараемся выявить зависимости Price от остальных колонок:"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABIoAAAK9CAYAAABLvofaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzde1yO9/8H8Net4123Skl3KUWlkpItx1DMSk4jJIcRxTYsp6K+kw7UaNpkwxy2u5CwzbDNMkyz2ZecYpuUEH7rtkyU2BLdvz+s6+tyVzqR8Xo+Htfjsev6fK7353N9uvL99f59Pp9LolKpVCAiIiIiIiIiohdes6buABERERERERERPRuYKCIiIiIiIiIiIgBMFBERERERERER0T+YKCIiIiIiIiIiIgBMFBERERERERER0T+YKCIiIiIiIiIiIgBMFBERERERERER0T+YKCIiIiIiIiIiIgBMFBERERERERER0T+YKCIiIiIiesGVlpYiODgYcrkcEokEs2bNauou1Sg/Px8SiQTJyclN3ZWnKjk5GRKJBPn5+cI1Ly8veHl5NVob0dHRkEgkjRaPiP59mCgiIiIiesIq/7ir6ggPD38ibf7888+Ijo7GzZs3n0j8hqgcj2PHjjV1V+pt1apVz1WSIj4+HsnJyXjrrbewceNGvP7660+0PRsbG9HvQatWrdC7d298+eWXT7Tdhvq39vtRd+7cQXR0NDIyMpq6K0T0DNJs6g4QERERvShiY2PRtm1b0bWOHTs+kbZ+/vlnxMTEIDAwEEZGRk+kjRfZqlWr0LJlSwQGBjZ1VxrF999/j+7duyMqKuqptenm5oa5c+cCAAoKCrBmzRr4+flh9erVePPNN2u819raGn/99Re0tLSeRldFGtLvJ+G7776r8z137txBTEwMAKjNRlqwYMETS2AT0b8DE0VERERET4mvry/c3d2buhsNcvv2bejr6zd1N5rMnTt3oKen19TdaHSFhYXo0KFDo8W7d+8eKioqoK2tXW2d1q1bY/z48cL5hAkTYGdnhw8++KDahMvDcXV1dRutv3XR0H43tsaOqampCU1N/plI9CLj0jMiIiKiZ8S3336L3r17Q19fH82bN8egQYPw22+/ieqcPn0agYGBaNeuHXR1dSGXyzF58mRcv35dqBMdHY2wsDAAQNu2bYVlMvn5+TXu7SKRSBAdHS2KI5FIcObMGYwdOxYtWrRAr169hPJNmzbh5ZdfhlQqhbGxMQICAnDlypV6PXtgYCBkMhkuX76MwYMHQyaToXXr1li5ciUA4JdffkG/fv2gr68Pa2trbN68WXR/5XK2gwcP4o033oCJiQkMDAwwYcIE3LhxQ629VatWwdnZGTo6OrCwsMD06dPVlul5eXmhY8eOOH78OPr06QM9PT385z//gY2NDX777Tf88MMPwthWzsooKipCaGgoXFxcIJPJYGBgAF9fX5w6dUoUOyMjAxKJBNu2bUNcXBwsLS2hq6uLV155BXl5eWr9PXLkCAYOHIgWLVpAX18frq6uSEpKEtU5e/YsRo4cCWNjY+jq6sLd3R27du2qcdwr+3Hx4kV88803oncFeJBACgoKgpmZGXR1ddGpUyekpKSIYlS+U8uWLcPy5ctha2sLHR0dnDlzpsa2HyWXy+Hk5ISLFy8+Nm517/HZs2fh7+8PU1NTSKVSODg44J133hHV+f333zF58mSYmZlBR0cHzs7O+PTTT+vU1/r2u7KPtfk5/fbbb+jXrx+kUiksLS2xePFiVFRUqNWrao+iv//+G9HR0Wjfvj10dXVhbm4OPz8/nD9/Hvn5+TA1NQUAxMTECD/zyt/9qvYounfvHhYtWiQ8i42NDf7zn/+grKxMVM/GxgaDBw/GTz/9hK5du0JXVxft2rXDhg0b6jW2RNQ0mComIiIiekqKi4vx559/iq61bNkSALBx40ZMnDgRPj4+WLp0Ke7cuYPVq1ejV69eOHnyJG
xsbAAAe/fuxYULFzBp0iTI5XL89ttvWLt2LX777TccPnwYEokEfn5+yM3NRVpaGj744AOhDVNTU1y7dq3O/R41ahTs7e0RHx8PlUoFAIiLi0NkZCT8/f0RHByMa9eu4cMPP0SfPn1w8uTJei13u3//Pnx9fdGnTx8kJCQgNTUVM2bMgL6+Pt555x2MGzcOfn5++PjjjzFhwgT06NFDbSnfjBkzYGRkhOjoaOTk5GD16tW4dOmSkBABHvwhHBMTg/79++Ott94S6h09ehSHDh0SLWe6fv06fH19ERAQgPHjx8PMzAxeXl54++23IZPJhCSEmZkZAODChQvYsWMHRo0ahbZt2+KPP/7AmjVr4OnpiTNnzsDCwkLU3yVLlqBZs2YIDQ1FcXExEhISMG7cOBw5ckSos3fvXgwePBjm5uaYOXMm5HI5srOz8fXXX2PmzJkAHiQVPDw80Lp1a4SHh0NfXx/btm3DsGHD8MUXX2D48OFVjrmTkxM2btyI2bNnw9LSUlhSZWpqir/++gteXl7Iy8vDjBkz0LZtW3z22WcIDAzEzZs3hbYrKRQK/P3335g6dSp0dHRgbGxcp59/eXk5rly5AhMTk8fGrSphcvr0afTu3RtaWlqYOnUqbGxscP78eXz11VeIi4sDAPzxxx/o3r07JBIJZsyYAVNTU3z77bcICgpCSUlJvTbxrku/a/tzunr1Kvr27Yt79+4J9dauXQupVPrY/ty/fx+DBw/G/v37ERAQgJkzZ+LWrVvYu3cvfv31V/Tv3x+rV6/GW2+9heHDh8PPzw8A4OrqWm3M4OBgpKSkYOTIkZg7dy6OHDmCd999F9nZ2Wr7M+Xl5WHkyJEICgrCxIkT8emnnyIwMBAvv/wynJ2d6zq8RNQUVERERET0RCkUChWAKg+VSqW6deuWysjISDVlyhTRfVevXlUZGhqKrt+5c0ctflpamgqA6uDBg8K19957TwVAdfHiRVHdixcvqgCoFAqFWhwAqqioKOE8KipKBUA1ZswYUb38/HyVhoaGKi4uTnT9l19+UWlqaqpdr248jh49KlybOHGiCoAqPj5euHbjxg2VVCpVSSQS1ZYtW4TrZ8+eVetrZcyXX35ZdffuXeF6QkKCCoBq586dKpVKpSosLFRpa2urvL29Vffv3xfqffTRRyoAqk8//VS45unpqQKg+vjjj9WewdnZWeXp6al2/e+//xbFVakejLmOjo4qNjZWuHbgwAEVAJWTk5OqrKxMuJ6UlKQCoPrll19UKpVKde/ePVXbtm1V1tbWqhs3bojiVlRUCP/9yiuvqFxcXFR///23qLxnz54qe3t7tX4+ytraWjVo0CDRteXLl6sAqDZt2iRcu3v3rqpHjx4qmUymKikpEZ4PgMrAwEBVWFj42LYq2/P29lZdu3ZNde3aNdWpU6dUAQEBKgCqt99++7Fxq3qP+/Tpo2revLnq0qVLoroPj1NQUJDK3Nxc9eeff4rqBAQEqAwNDav8/WrMftf25zRr1iwVANWRI0eEa4WFhSpDQ0O132tPT0/Ru/jpp5+qAKjef/99tf5XjsW1a9fUfocqVf7eV8rKylIBUAUHB4vqhYaGqgCovv/+e9H4PPpvUWFhoUpHR0c1d+5ctbaI6NnEpWdERERET8nKlSuxd+9e0QE8mDFy8+ZNjBkzBn/++adwaGhooFu3bjhw4IAQ4+EZBX///Tf+/PNPdO/eHQBw4sSJJ9LvR/dd2b59OyoqKuDv7y/qr1wuh729vai/dRUcHCz8t5GRERwcHKCvrw9/f3/huoODA4yMjHDhwgW1+6dOnSqaEfTWW29BU1MTu3fvBgDs27cPd+/exaxZs9Cs2f/+T+EpU6bAwMAA33zzjSiejo4OJk2aVOv+6+joCHHv37+P69evQyaTwcHBocqfz6RJk0R7zPTu3RsAhGc7efIkLl68iFmzZqnN0qqcIVVUVITvv/8e/v7+uHXrlvDzuH79Onx8fHDu3Dn8/v
vvtX6GSrt374ZcLseYMWOEa1paWggJCUFpaSl++OEHUf0RI0YIS5pq47vvvoOpqSlMTU3RqVMnfPbZZ3j99dexdOn
"text/plain": [
"<Figure size 1000x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.preprocessing import LabelEncoder\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Сначала Label Encoding для упорядоченных категорий (если нужно)\n",
"label_encoder = LabelEncoder()\n",
"\n",
"for column in df.columns:\n",
" if df[column].dtype == 'object': # Если тип данных - строка\n",
" if df[column].nunique() <= 10: # Если небольшое количество уникальных значений, применяем One-Hot Encoding\n",
" df = pd.get_dummies(df, columns=[column], drop_first=True)\n",
" else:\n",
" df[column] = label_encoder.fit_transform(df[column])\n",
"\n",
"# Удаление строки \"Price\" из признаков (она является целевой переменной)\n",
"X = df.drop('Price', axis=1) # Все признаки, кроме цены\n",
"y = df['Price']\n",
"\n",
"# Создание модели RandomForestRegressor\n",
"model = RandomForestRegressor()\n",
"model.fit(X, y)\n",
"\n",
"# Определение важности признаков\n",
"feature_importances = model.feature_importances_\n",
"features = X.columns\n",
"\n",
"# Визуализация важности признаков\n",
"plt.figure(figsize=(10, 8))\n",
"plt.barh(features, feature_importances)\n",
"plt.title('Feature Importance for Price Prediction')\n",
"plt.xlabel('Importance')\n",
"plt.ylabel('Features')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: (822, 32)\n",
"Размер контрольной выборки: (274, 32)\n",
"Размер тестовой выборки: (274, 32)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Разделение на признаки (X) и целевую переменную (y)\n",
"X = df.drop('Price', axis=1) # Все признаки, кроме цены\n",
"y = df['Price'] # Целевая переменная (цена)\n",
"\n",
"# Сначала разбиваем данные на обучающую и промежуточную выборки (80% обучающих данных и 20% тестовых)\n",
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n",
"\n",
"# Теперь делим временные данные (X_temp и y_temp) на контрольную (валидационную) и тестовую выборки (по 20% каждой)\n",
"X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n",
"\n",
"# Выводим размеры выборок для проверки\n",
"print(f'Размер обучающей выборки: {X_train.shape}')\n",
"print(f'Размер контрольной выборки: {X_val.shape}')\n",
"print(f'Размер тестовой выборки: {X_test.shape}')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}