626 lines
152 KiB
Plaintext
626 lines
152 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## 1-й Датасет: Mobile Phone Price Prediction"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"https://www.kaggle.com/datasets/dewangmoghe/mobile-phone-price-prediction?resource=download\n",
|
|||
|
"\n",
|
|||
|
"* Из названия датасета (описание на Kaggle не было представлено) очевидно, что объектами исследования являются смартфоны.\n",
|
|||
|
"* Атрибуты объектов: Unnamed: 0 (порядковый номер записи), Name, Rating, Spec_score, No_of_sim, Ram, Battery, Display, Camera, External_Memory, Android_version, Price, company, Inbuilt_memory, fast_charging, Screen_resolution, Processor, Processor_name\n",
|
|||
|
"* Очевидная цель этого датасета - научиться предсказывать цену смартфона."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"количество колонок: 18\n",
|
|||
|
"колонки: Unnamed: 0, Name, Rating, Spec_score, No_of_sim, Ram, Battery, Display, Camera, External_Memory, Android_version, Price, company, Inbuilt_memory, fast_charging, Screen_resolution, Processor, Processor_name\n",
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 1370 entries, 0 to 1369\n",
|
|||
|
"Data columns (total 18 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 Unnamed: 0 1370 non-null int64 \n",
|
|||
|
" 1 Name 1370 non-null object \n",
|
|||
|
" 2 Rating 1370 non-null float64\n",
|
|||
|
" 3 Spec_score 1370 non-null int64 \n",
|
|||
|
" 4 No_of_sim 1370 non-null object \n",
|
|||
|
" 5 Ram 1370 non-null object \n",
|
|||
|
" 6 Battery 1370 non-null object \n",
|
|||
|
" 7 Display 1370 non-null object \n",
|
|||
|
" 8 Camera 1370 non-null object \n",
|
|||
|
" 9 External_Memory 1370 non-null object \n",
|
|||
|
" 10 Android_version 927 non-null object \n",
|
|||
|
" 11 Price 1370 non-null object \n",
|
|||
|
" 12 company 1370 non-null object \n",
|
|||
|
" 13 Inbuilt_memory 1351 non-null object \n",
|
|||
|
" 14 fast_charging 1281 non-null object \n",
|
|||
|
" 15 Screen_resolution 1368 non-null object \n",
|
|||
|
" 16 Processor 1342 non-null object \n",
|
|||
|
" 17 Processor_name 1370 non-null object \n",
|
|||
|
"dtypes: float64(1), int64(2), object(15)\n",
|
|||
|
"memory usage: 192.8+ KB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Unnamed: 0</th>\n",
|
|||
|
" <th>Name</th>\n",
|
|||
|
" <th>Rating</th>\n",
|
|||
|
" <th>Spec_score</th>\n",
|
|||
|
" <th>No_of_sim</th>\n",
|
|||
|
" <th>Ram</th>\n",
|
|||
|
" <th>Battery</th>\n",
|
|||
|
" <th>Display</th>\n",
|
|||
|
" <th>Camera</th>\n",
|
|||
|
" <th>External_Memory</th>\n",
|
|||
|
" <th>Android_version</th>\n",
|
|||
|
" <th>Price</th>\n",
|
|||
|
" <th>company</th>\n",
|
|||
|
" <th>Inbuilt_memory</th>\n",
|
|||
|
" <th>fast_charging</th>\n",
|
|||
|
" <th>Screen_resolution</th>\n",
|
|||
|
" <th>Processor</th>\n",
|
|||
|
" <th>Processor_name</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>Samsung Galaxy F14 5G</td>\n",
|
|||
|
" <td>4.65</td>\n",
|
|||
|
" <td>68</td>\n",
|
|||
|
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
|||
|
" <td>4 GB RAM</td>\n",
|
|||
|
" <td>6000 mAh Battery</td>\n",
|
|||
|
" <td>6.6 inches</td>\n",
|
|||
|
" <td>50 MP + 2 MP Dual Rear &amp; 13 MP Front Camera</td>\n",
|
|||
|
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
|||
|
" <td>13</td>\n",
|
|||
|
" <td>9,999</td>\n",
|
|||
|
" <td>Samsung</td>\n",
|
|||
|
" <td>128 GB inbuilt</td>\n",
|
|||
|
" <td>25W Fast Charging</td>\n",
|
|||
|
" <td>2408 x 1080 px Display with Water Drop Notch</td>\n",
|
|||
|
" <td>Octa Core Processor</td>\n",
|
|||
|
" <td>Exynos 1330</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>Samsung Galaxy A11</td>\n",
|
|||
|
" <td>4.20</td>\n",
|
|||
|
" <td>63</td>\n",
|
|||
|
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
|||
|
" <td>2 GB RAM</td>\n",
|
|||
|
" <td>4000 mAh Battery</td>\n",
|
|||
|
" <td>6.4 inches</td>\n",
|
|||
|
" <td>13 MP + 5 MP + 2 MP Triple Rear &amp; 8 MP Fro...</td>\n",
|
|||
|
" <td>Memory Card Supported, upto 512 GB</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>9,990</td>\n",
|
|||
|
" <td>Samsung</td>\n",
|
|||
|
" <td>32 GB inbuilt</td>\n",
|
|||
|
" <td>15W Fast Charging</td>\n",
|
|||
|
" <td>720 x 1560 px Display with Punch Hole</td>\n",
|
|||
|
" <td>1.8 GHz Processor</td>\n",
|
|||
|
" <td>Octa Core</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>Samsung Galaxy A13</td>\n",
|
|||
|
" <td>4.30</td>\n",
|
|||
|
" <td>75</td>\n",
|
|||
|
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
|||
|
" <td>4 GB RAM</td>\n",
|
|||
|
" <td>5000 mAh Battery</td>\n",
|
|||
|
" <td>6.6 inches</td>\n",
|
|||
|
" <td>50 MP Quad Rear &amp; 8 MP Front Camera</td>\n",
|
|||
|
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
|||
|
" <td>12</td>\n",
|
|||
|
" <td>11,999</td>\n",
|
|||
|
" <td>Samsung</td>\n",
|
|||
|
" <td>64 GB inbuilt</td>\n",
|
|||
|
" <td>25W Fast Charging</td>\n",
|
|||
|
" <td>1080 x 2408 px Display with Water Drop Notch</td>\n",
|
|||
|
" <td>2 GHz Processor</td>\n",
|
|||
|
" <td>Octa Core</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Samsung Galaxy F23</td>\n",
|
|||
|
" <td>4.10</td>\n",
|
|||
|
" <td>73</td>\n",
|
|||
|
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
|||
|
" <td>4 GB RAM</td>\n",
|
|||
|
" <td>6000 mAh Battery</td>\n",
|
|||
|
" <td>6.4 inches</td>\n",
|
|||
|
" <td>48 MP Quad Rear &amp; 13 MP Front Camera</td>\n",
|
|||
|
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
|||
|
" <td>12</td>\n",
|
|||
|
" <td>11,999</td>\n",
|
|||
|
" <td>Samsung</td>\n",
|
|||
|
" <td>64 GB inbuilt</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>720 x 1600 px</td>\n",
|
|||
|
" <td>Octa Core</td>\n",
|
|||
|
" <td>Helio G88</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>Samsung Galaxy A03s (4GB RAM + 64GB)</td>\n",
|
|||
|
" <td>4.10</td>\n",
|
|||
|
" <td>69</td>\n",
|
|||
|
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
|||
|
" <td>4 GB RAM</td>\n",
|
|||
|
" <td>5000 mAh Battery</td>\n",
|
|||
|
" <td>6.5 inches</td>\n",
|
|||
|
" <td>13 MP + 2 MP + 2 MP Triple Rear &amp; 5 MP Fro...</td>\n",
|
|||
|
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
|||
|
" <td>11</td>\n",
|
|||
|
" <td>11,999</td>\n",
|
|||
|
" <td>Samsung</td>\n",
|
|||
|
" <td>64 GB inbuilt</td>\n",
|
|||
|
" <td>15W Fast Charging</td>\n",
|
|||
|
" <td>720 x 1600 px Display with Water Drop Notch</td>\n",
|
|||
|
" <td>Octa Core</td>\n",
|
|||
|
" <td>Helio P35</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Unnamed: 0 Name Rating Spec_score \\\n",
|
|||
|
"0 0 Samsung Galaxy F14 5G 4.65 68 \n",
|
|||
|
"1 1 Samsung Galaxy A11 4.20 63 \n",
|
|||
|
"2 2 Samsung Galaxy A13 4.30 75 \n",
|
|||
|
"3 3 Samsung Galaxy F23 4.10 73 \n",
|
|||
|
"4 4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69 \n",
|
|||
|
"\n",
|
|||
|
" No_of_sim Ram Battery Display \\\n",
|
|||
|
"0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches \n",
|
|||
|
"1 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches \n",
|
|||
|
"2 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches \n",
|
|||
|
"3 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n",
|
|||
|
"4 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n",
|
|||
|
"\n",
|
|||
|
" Camera \\\n",
|
|||
|
"0 50 MP + 2 MP Dual Rear & 13 MP Front Camera \n",
|
|||
|
"1 13 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
|||
|
"2 50 MP Quad Rear & 8 MP Front Camera \n",
|
|||
|
"3 48 MP Quad Rear & 13 MP Front Camera \n",
|
|||
|
"4 13 MP + 2 MP + 2 MP Triple Rear & 5 MP Fro... \n",
|
|||
|
"\n",
|
|||
|
" External_Memory Android_version Price company \\\n",
|
|||
|
"0 Memory Card Supported, upto 1 TB 13 9,999 Samsung \n",
|
|||
|
"1 Memory Card Supported, upto 512 GB 10 9,990 Samsung \n",
|
|||
|
"2 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
|
|||
|
"3 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
|
|||
|
"4 Memory Card Supported, upto 1 TB 11 11,999 Samsung \n",
|
|||
|
"\n",
|
|||
|
" Inbuilt_memory fast_charging \\\n",
|
|||
|
"0 128 GB inbuilt 25W Fast Charging \n",
|
|||
|
"1 32 GB inbuilt 15W Fast Charging \n",
|
|||
|
"2 64 GB inbuilt 25W Fast Charging \n",
|
|||
|
"3 64 GB inbuilt NaN \n",
|
|||
|
"4 64 GB inbuilt 15W Fast Charging \n",
|
|||
|
"\n",
|
|||
|
" Screen_resolution Processor \\\n",
|
|||
|
"0 2408 x 1080 px Display with Water Drop Notch Octa Core Processor \n",
|
|||
|
"1 720 x 1560 px Display with Punch Hole 1.8 GHz Processor \n",
|
|||
|
"2 1080 x 2408 px Display with Water Drop Notch 2 GHz Processor \n",
|
|||
|
"3 720 x 1600 px Octa Core \n",
|
|||
|
"4 720 x 1600 px Display with Water Drop Notch Octa Core \n",
|
|||
|
"\n",
|
|||
|
" Processor_name \n",
|
|||
|
"0 Exynos 1330 \n",
|
|||
|
"1 Octa Core \n",
|
|||
|
"2 Octa Core \n",
|
|||
|
"3 Helio G88 \n",
|
|||
|
"4 Helio P35 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"import pandas as pd\n",
"\n",
"# Read the phone catalogue; the path is relative to the notebook's working directory.\n",
"df = pd.read_csv(\".//static//csv//mppp.csv\", sep=\",\")\n",
"\n",
"# Report how many columns were loaded and what they are called.\n",
"column_names = list(df.columns)\n",
"print('количество колонок: ' + str(len(column_names)))\n",
"print('колонки: ' + ', '.join(column_names))\n",
"\n",
"# Dtypes and non-null counts, followed by a preview of the first rows.\n",
"df.info()\n",
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Получение сведений о пропущенных данных\n",
|
|||
|
"\n",
|
|||
|
"Типы пропущенных данных:\n",
|
|||
|
"\n",
|
|||
|
"* None - представление пустых данных в Python\n",
|
|||
|
"* NaN - представление пустых данных в Pandas\n",
|
|||
|
"* '' - пустая строка"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Unnamed: 0 0\n",
|
|||
|
"Name 0\n",
|
|||
|
"Rating 0\n",
|
|||
|
"Spec_score 0\n",
|
|||
|
"No_of_sim 0\n",
|
|||
|
"Ram 0\n",
|
|||
|
"Battery 0\n",
|
|||
|
"Display 0\n",
|
|||
|
"Camera 0\n",
|
|||
|
"External_Memory 0\n",
|
|||
|
"Android_version 443\n",
|
|||
|
"Price 0\n",
|
|||
|
"company 0\n",
|
|||
|
"Inbuilt_memory 19\n",
|
|||
|
"fast_charging 89\n",
|
|||
|
"Screen_resolution 2\n",
|
|||
|
"Processor 28\n",
|
|||
|
"Processor_name 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Unnamed: 0 False\n",
|
|||
|
"Name False\n",
|
|||
|
"Rating False\n",
|
|||
|
"Spec_score False\n",
|
|||
|
"No_of_sim False\n",
|
|||
|
"Ram False\n",
|
|||
|
"Battery False\n",
|
|||
|
"Display False\n",
|
|||
|
"Camera False\n",
|
|||
|
"External_Memory False\n",
|
|||
|
"Android_version True\n",
|
|||
|
"Price False\n",
|
|||
|
"company False\n",
|
|||
|
"Inbuilt_memory True\n",
|
|||
|
"fast_charging True\n",
|
|||
|
"Screen_resolution True\n",
|
|||
|
"Processor True\n",
|
|||
|
"Processor_name False\n",
|
|||
|
"dtype: bool\n",
|
|||
|
"\n",
|
|||
|
"Android_version процент пустых значений: %32.34\n",
|
|||
|
"Inbuilt_memory процент пустых значений: %1.39\n",
|
|||
|
"fast_charging процент пустых значений: %6.50\n",
|
|||
|
"Screen_resolution процент пустых значений: %0.15\n",
|
|||
|
"Processor процент пустых значений: %2.04\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Number of missing values per column\n",
"missing_counts = df.isnull().sum()\n",
"print(missing_counts)\n",
"\n",
"print()\n",
"\n",
"# Whether each column contains at least one missing value\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Share of missing values (in percent), printed only for columns that have any\n",
"for column_name, missing_count in missing_counts.items():\n",
"    missing_share = missing_count / len(df) * 100\n",
"    if missing_share > 0:\n",
"        print(f\"{column_name} процент пустых значений: %{missing_share:.2f}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Судя по статистике выше, пустые значения всё-таки присутствуют. Наиболее корректным решением будет заполнение пропущенных данных, потому что удаление объектов с пропусками привело бы к потере значительной части датасета (только в Android_version пропущено 32,34 % значений). Так как все атрибуты с пропусками имеют строковый тип данных, воспользуемся заполнением наиболее частым значением (модой)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Unnamed: 0 0\n",
|
|||
|
"Name 0\n",
|
|||
|
"Rating 0\n",
|
|||
|
"Spec_score 0\n",
|
|||
|
"No_of_sim 0\n",
|
|||
|
"Ram 0\n",
|
|||
|
"Battery 0\n",
|
|||
|
"Display 0\n",
|
|||
|
"Camera 0\n",
|
|||
|
"External_Memory 0\n",
|
|||
|
"Android_version 0\n",
|
|||
|
"Price 0\n",
|
|||
|
"company 0\n",
|
|||
|
"Inbuilt_memory 0\n",
|
|||
|
"fast_charging 0\n",
|
|||
|
"Screen_resolution 0\n",
|
|||
|
"Processor 0\n",
|
|||
|
"Processor_name 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Unnamed: 0 False\n",
|
|||
|
"Name False\n",
|
|||
|
"Rating False\n",
|
|||
|
"Spec_score False\n",
|
|||
|
"No_of_sim False\n",
|
|||
|
"Ram False\n",
|
|||
|
"Battery False\n",
|
|||
|
"Display False\n",
|
|||
|
"Camera False\n",
|
|||
|
"External_Memory False\n",
|
|||
|
"Android_version False\n",
|
|||
|
"Price False\n",
|
|||
|
"company False\n",
|
|||
|
"Inbuilt_memory False\n",
|
|||
|
"fast_charging False\n",
|
|||
|
"Screen_resolution False\n",
|
|||
|
"Processor False\n",
|
|||
|
"Processor_name False\n",
|
|||
|
"dtype: bool\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Fill the gaps in each text column with that column's most frequent value (mode).\n",
"# The result is assigned back to the frame: calling fillna(inplace=True) on df[col] is\n",
"# chained assignment, which recent pandas versions warn about and which no longer\n",
"# updates df once Copy-on-Write is enabled.\n",
"columns_with_gaps = ['Inbuilt_memory', 'Processor', 'Android_version', 'fast_charging', 'Screen_resolution']\n",
"for column_name in columns_with_gaps:\n",
"    most_frequent = df[column_name].mode()[0]\n",
"    df[column_name] = df[column_name].fillna(most_frequent)\n",
"\n",
"# Number of missing values per column (every count should now be zero)\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Whether each column still contains at least one missing value\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Share of missing values (in percent); prints nothing when no gaps are left\n",
"for i in df.columns:\n",
"    null_rate = df[i].isnull().sum() / len(df) * 100\n",
"    if null_rate > 0:\n",
"        print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверим выбросы и устраним их:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 14,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Колонка Rating:\n",
|
|||
|
" Есть выбросы: Нет\n",
|
|||
|
" Количество выбросов: 0\n",
|
|||
|
" Минимальное значение: 3.75\n",
|
|||
|
" Максимальное значение: 4.75\n",
|
|||
|
" 1-й квартиль (Q1): 4.15\n",
|
|||
|
" 3-й квартиль (Q3): 4.55\n",
|
|||
|
"\n",
|
|||
|
"Колонка Spec_score:\n",
|
|||
|
" Есть выбросы: Нет\n",
|
|||
|
" Количество выбросов: 0\n",
|
|||
|
" Минимальное значение: 58.5\n",
|
|||
|
" Максимальное значение: 98.0\n",
|
|||
|
" 1-й квартиль (Q1): 75.0\n",
|
|||
|
" 3-й квартиль (Q3): 86.0\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Candidate columns for the outlier check; the loop skips every column whose dtype is not numeric.\n",
"numeric_columns = ['Name', 'Rating', 'Spec_score', 'No_of_sim', 'Ram', 'Battery', 'Display', 'Camera', 'External_Memory', 'Android_version', 'Price', 'company', 'Inbuilt_memory', 'fast_charging', 'Screen_resolution', 'Processor', 'Processor_name']\n",
"\n",
"for column in numeric_columns:\n",
"    if not pd.api.types.is_numeric_dtype(df[column]):\n",
"        continue\n",
"\n",
"    values = df[column]\n",
"    q1 = values.quantile(0.25)  # first quartile\n",
"    q3 = values.quantile(0.75)  # third quartile\n",
"    iqr = q3 - q1  # interquartile range\n",
"\n",
"    # Values further than 1.5 * IQR outside the quartiles are treated as outliers\n",
"    lower_bound = q1 - 1.5 * iqr\n",
"    upper_bound = q3 + 1.5 * iqr\n",
"\n",
"    # Count the outliers before the column is modified\n",
"    is_outlier = (values < lower_bound) | (values > upper_bound)\n",
"    outlier_count = int(is_outlier.sum())\n",
"\n",
"    # Pull every outlier back to the nearest bound; other values are left untouched\n",
"    def clamp(value):\n",
"        if value < lower_bound:\n",
"            return lower_bound\n",
"        if value > upper_bound:\n",
"            return upper_bound\n",
"        return value\n",
"\n",
"    df[column] = values.apply(clamp)\n",
"\n",
"    print(f\"Колонка {column}:\")\n",
"    print(f\" Есть выбросы: {'Да' if outlier_count > 0 else 'Нет'}\")\n",
"    print(f\" Количество выбросов: {outlier_count}\")\n",
"    print(f\" Минимальное значение: {df[column].min()}\")\n",
"    print(f\" Максимальное значение: {df[column].max()}\")\n",
"    print(f\" 1-й квартиль (Q1): {q1}\")\n",
"    print(f\" 3-й квартиль (Q3): {q3}\\n\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Постараемся выявить зависимости Price от остальных колонок:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABIoAAAK9CAYAAABLvofaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzde1yO9/8H8Net4123Skl3KUWlkpItx1DMSk4jJIcRxTYsp6K+kw7UaNpkwxy2u5CwzbDNMkyz2ZecYpuUEH7rtkyU2BLdvz+s6+tyVzqR8Xo+Htfjsev6fK7353N9uvL99f59Pp9LolKpVCAiIiIiIiIiohdes6buABERERERERERPRuYKCIiIiIiIiIiIgBMFBERERERERER0T+YKCIiIiIiIiIiIgBMFBERERERERER0T+YKCIiIiIiIiIiIgBMFBERERERERER0T+YKCIiIiIiIiIiIgBMFBERERERERER0T+YKCIiIiIiesGVlpYiODgYcrkcEokEs2bNauou1Sg/Px8SiQTJyclN3ZWnKjk5GRKJBPn5+cI1Ly8veHl5NVob0dHRkEgkjRaPiP59mCgiIiIiesIq/7ir6ggPD38ibf7888+Ijo7GzZs3n0j8hqgcj2PHjjV1V+pt1apVz1WSIj4+HsnJyXjrrbewceNGvP7660+0PRsbG9HvQatWrdC7d298+eWXT7Tdhvq39vtRd+7cQXR0NDIyMpq6K0T0DNJs6g4QERERvShiY2PRtm1b0bWOHTs+kbZ+/vlnxMTEIDAwEEZGRk+kjRfZqlWr0LJlSwQGBjZ1VxrF999/j+7duyMqKuqptenm5oa5c+cCAAoKCrBmzRr4+flh9erVePPNN2u819raGn/99Re0tLSeRldFGtLvJ+G7776r8z137txBTEwMAKjNRlqwYMETS2AT0b8DE0VERERET4mvry/c3d2buhsNcvv2bejr6zd1N5rMnTt3oKen19TdaHSFhYXo0KFDo8W7d+8eKioqoK2tXW2d1q1bY/z48cL5hAkTYGdnhw8++KDahMvDcXV1dRutv3XR0H43tsaOqampCU1N/plI9CLj0jMiIiKiZ8S3336L3r17Q19fH82bN8egQYPw22+/ieqcPn0agYGBaNeuHXR1dSGXyzF58mRcv35dqBMdHY2wsDAAQNu2bYVlMvn5+TXu7SKRSBAdHS2KI5FIcObMGYwdOxYtWrRAr169hPJNmzbh5ZdfhlQqhbGxMQICAnDlypV6PXtgYCBkMhkuX76MwYMHQyaToXXr1li5ciUA4JdffkG/fv2gr68Pa2trbN68WXR/5XK2gwcP4o033oCJiQkMDAwwYcIE3LhxQ629VatWwdnZGTo6OrCwsMD06dPVlul5eXmhY8eOOH78OPr06QM9PT385z//gY2NDX777Tf88MMPwthWzsooKipCaGgoXFxcIJPJYGBgAF9fX5w6dUoUOyMjAxKJBNu2bUNcXBwsLS2hq6uLV155BXl5eWr9PXLkCAYOHIgWLVpAX18frq6uSEpKEtU5e/YsRo4cCWNjY+jq6sLd3R27du2qcdwr+3Hx4kV88803oncFeJBACgoKgpmZGXR1ddGpUyekpKSIYlS+U8uWLcPy5ctha2sLHR0dnDlzpsa2HyWXy+Hk5ISLFy8+Nm517/HZs2fh7+8PU1NTSKVSODg44J133hHV+f333zF58mSYmZlBR0cHzs7O+PTTT+vU1/r2u7KPtfk5/fbbb+jXrx+kUiksLS2xePFiVFRUqNWrao+iv//+G9HR0Wjfvj10dXVhbm4OPz8/nD9/Hvn5+TA1NQUAxMTECD/zyt/9qvYounfvHhYtWiQ8i42NDf7zn/+grKxMVM/GxgaDBw/GTz/9hK5du0JXVxft2rXDhg0b6jW2RNQ0mComIiIiekqKi4vx559/iq61bNkSALBx40ZMnDgRPj4+WLp0Ke7cuYPVq1ejV69eOHnyJG
xsbAAAe/fuxYULFzBp0iTI5XL89ttvWLt2LX777TccPnwYEokEfn5+yM3NRVpaGj744AOhDVNTU1y7dq3O/R41ahTs7e0RHx8PlUoFAIiLi0NkZCT8/f0RHByMa9eu4cMPP0SfPn1w8uTJei13u3//Pnx9fdGnTx8kJCQgNTUVM2bMgL6+Pt555x2MGzcOfn5++PjjjzFhwgT06NFDbSnfjBkzYGRkhOjoaOTk5GD16tW4dOmSkBABHvwhHBMTg/79++Ott94S6h09ehSHDh0SLWe6fv06fH19ERAQgPHjx8PMzAxeXl54++23IZPJhCSEmZkZAODChQvYsWMHRo0ahbZt2+KPP/7AmjVr4OnpiTNnzsDCwkLU3yVLlqBZs2YIDQ1FcXExEhISMG7cOBw5ckSos3fvXgwePBjm5uaYOXMm5HI5srOz8fXXX2PmzJkAHiQVPDw80Lp1a4SHh0NfXx/btm3DsGHD8MUXX2D48OFVjrmTkxM2btyI2bNnw9LSUlhSZWpqir/++gteXl7Iy8vDjBkz0LZtW3z22WcIDAzEzZs3hbYrKRQK/P3335g6dSp0dHRgbGxcp59/eXk5rly5AhMTk8fGrSphcvr0afTu3RtaWlqYOnUqbGxscP78eXz11VeIi4sDAPzxxx/o3r07JBIJZsyYAVNTU3z77bcICgpCSUlJvTbxrku/a/tzunr1Kvr27Yt79+4J9dauXQupVPrY/ty/fx+DBw/G/v37ERAQgJkzZ+LWrVvYu3cvfv31V/Tv3x+rV6/GW2+9heHDh8PPzw8A4OrqWm3M4OBgpKSkYOTIkZg7dy6OHDmCd999F9nZ2Wr7M+Xl5WHkyJEICgrCxIkT8emnnyIwMBAvv/wynJ2d6zq8RNQUVERERET0RCkUChWAKg+VSqW6deuWysjISDVlyhTRfVevXlUZGhqKrt+5c0ctflpamgqA6uDBg8K19957TwVAdfHiRVHdixcvqgCoFAqFWhwAqqioKOE8KipKBUA1ZswYUb38/HyVhoaGKi4uTnT9l19+UWlqaqpdr248jh49KlybOHGiCoAqPj5euHbjxg2VVCpVSSQS1ZYtW4TrZ8+eVetrZcyXX35ZdffuXeF6QkKCCoBq586dKpVKpSosLFRpa2urvL29Vffv3xfqffTRRyoAqk8//VS45unpqQKg+vjjj9WewdnZWeXp6al2/e+//xbFVakejLmOjo4qNjZWuHbgwAEVAJWTk5OqrKxMuJ6UlKQCoPrll19UKpVKde/ePVXbtm1V1tbWqhs3bojiVlRUCP/9yiuvqFxcXFR///23qLxnz54qe3t7tX4+ytraWjVo0CDRteXLl6sAqDZt2iRcu3v3rqpHjx4qmUymKikpEZ4PgMrAwEBVWFj42LYq2/P29lZdu3ZNde3aNdWpU6dUAQEBKgCqt99++7Fxq3qP+/Tpo2revLnq0qVLoroPj1NQUJDK3Nxc9eeff4rqBAQEqAwNDav8/WrMftf25zRr1iwVANWRI0eEa4WFhSpDQ0O132tPT0/Ru/jpp5+qAKjef/99tf5XjsW1a9fUfocqVf7eV8rKylIBUAUHB4vqhYaGqgCovv/+e9H4PPpvUWFhoUpHR0c1d+5ctbaI6NnEpWdERERET8nKlSuxd+9e0QE8mDFy8+ZNjBkzBn/++adwaGhooFu3bjhw4IAQ4+EZBX///Tf+/PNPdO/eHQBw4sSJJ9LvR/dd2b59OyoqKuDv7y/qr1wuh729vai/dRUcHCz8t5GRERwcHKCvrw9/f3/huoODA4yMjHDhwgW1+6dOnSqaEfTWW29BU1MTu3fvBgDs27cPd+/exaxZs9Cs2f/+T+EpU6bAwMAA33zzjSiejo4OJk2aVOv+6+joCHHv37+P69evQyaTwcHBocqfz6RJk0R7zPTu3RsAhGc7efIkLl68iFmzZqnN0qqcIVVUVITvv/8e/v7+uHXrlvDzuH79Onx8fHDu3Dn8/v
vvtX6GSrt374ZcLseYMWOEa1paWggJCUFpaSl++OEHUf0RI0YIS5pq47vvvoOpqSlMTU3RqVMnfPbZZ3j99dexdOn
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x800 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.preprocessing import LabelEncoder\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# The target is stored as text with thousands separators (e.g. '9,999'). Convert it to a\n",
"# number first; otherwise the encoding loop below would label-encode it and the regressor\n",
"# would be fitted to arbitrary category codes instead of prices. Going through str keeps\n",
"# the conversion safe to re-run, and to_numeric raises if a value cannot be parsed.\n",
"df['Price'] = pd.to_numeric(df['Price'].astype(str).str.replace(',', '', regex=False))\n",
"\n",
"# Encode the remaining text columns: one-hot for columns with at most 10 distinct values,\n",
"# integer labels for the rest.\n",
"label_encoder = LabelEncoder()\n",
"\n",
"for column in df.columns:\n",
"    if column == 'Price' or df[column].dtype != 'object':\n",
"        continue\n",
"    if df[column].nunique() <= 10:\n",
"        df = pd.get_dummies(df, columns=[column], drop_first=True)\n",
"    else:\n",
"        df[column] = label_encoder.fit_transform(df[column])\n",
"\n",
"# Features are every column except the target.\n",
"# NOTE(review): 'Unnamed: 0' looks like a row index saved in the CSV - consider excluding it.\n",
"X = df.drop('Price', axis=1)\n",
"y = df['Price']\n",
"\n",
"# Fixed seed so the importances are reproducible between runs\n",
"model = RandomForestRegressor(random_state=42)\n",
"model.fit(X, y)\n",
"\n",
"# Importance of each feature according to the fitted forest\n",
"feature_importances = model.feature_importances_\n",
"features = X.columns\n",
"\n",
"# Horizontal bar chart of the importances\n",
"plt.figure(figsize=(10, 8))\n",
"plt.barh(features, feature_importances)\n",
"plt.title('Feature Importance for Price Prediction')\n",
"plt.xlabel('Importance')\n",
"plt.ylabel('Features')\n",
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: (822, 32)\n",
|
|||
|
"Размер контрольной выборки: (274, 32)\n",
|
|||
|
"Размер тестовой выборки: (274, 32)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Separate the features (X) from the target (y)\n",
"X = df.drop(columns=['Price'])\n",
"y = df['Price']\n",
"\n",
"# Step 1: keep 60% of the rows for training and set the remaining 40% aside\n",
"X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.4, random_state=42)\n",
"\n",
"# Step 2: halve the held-out rows into validation and test parts (20% of all rows each)\n",
"X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=42)\n",
"\n",
"# Print the shape of each part as a sanity check\n",
"print(f'Размер обучающей выборки: {X_train.shape}')\n",
"print(f'Размер контрольной выборки: {X_val.shape}')\n",
"print(f'Размер тестовой выборки: {X_test.shape}')"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|