1358 lines
50 KiB
Plaintext
1358 lines
50 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Загрузка данных в DataFrame"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 83,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 1370 entries, 0 to 1369\n",
|
||
"Data columns (total 18 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Name 1370 non-null object \n",
|
||
" 1 Rating 1370 non-null float64\n",
|
||
" 2 Spec_score 1367 non-null float64\n",
|
||
" 3 No_of_sim 1370 non-null object \n",
|
||
" 4 Ram 1370 non-null object \n",
|
||
" 5 Battery 1370 non-null object \n",
|
||
" 6 Display 1370 non-null object \n",
|
||
" 7 Camera 1370 non-null object \n",
|
||
" 8 External_Memory 1370 non-null object \n",
|
||
" 9 Android_version 927 non-null object \n",
|
||
" 10 Price 1370 non-null object \n",
|
||
" 11 company 1370 non-null object \n",
|
||
" 12 Inbuilt_memory 1351 non-null object \n",
|
||
" 13 fast_charging 1281 non-null object \n",
|
||
" 14 Screen_resolution 1368 non-null object \n",
|
||
" 15 Processor 1342 non-null object \n",
|
||
" 16 Processor_name 1370 non-null object \n",
|
||
" 17 Rating_index 1370 non-null int64 \n",
|
||
"dtypes: float64(2), int64(1), object(15)\n",
|
||
"memory usage: 203.4+ KB\n",
|
||
"(1370, 18)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Rating</th>\n",
|
||
" <th>Spec_score</th>\n",
|
||
" <th>No_of_sim</th>\n",
|
||
" <th>Ram</th>\n",
|
||
" <th>Battery</th>\n",
|
||
" <th>Display</th>\n",
|
||
" <th>Camera</th>\n",
|
||
" <th>External_Memory</th>\n",
|
||
" <th>Android_version</th>\n",
|
||
" <th>Price</th>\n",
|
||
" <th>company</th>\n",
|
||
" <th>Inbuilt_memory</th>\n",
|
||
" <th>fast_charging</th>\n",
|
||
" <th>Screen_resolution</th>\n",
|
||
" <th>Processor</th>\n",
|
||
" <th>Processor_name</th>\n",
|
||
" <th>Rating_index</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>ID</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Samsung Galaxy F14 5G</td>\n",
|
||
" <td>4.65</td>\n",
|
||
" <td>68.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>4 GB RAM</td>\n",
|
||
" <td>6000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP + 2 MP Dual Rear &amp; 13 MP Front Camera</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>9.999</td>\n",
|
||
" <td>Samsung</td>\n",
|
||
" <td>128 GB inbuilt</td>\n",
|
||
" <td>25W Fast Charging</td>\n",
|
||
" <td>2408 x 1080 px Display with Water Drop Notch</td>\n",
|
||
" <td>Octa Core Processor</td>\n",
|
||
" <td>Exynos 1330</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Samsung Galaxy A11</td>\n",
|
||
" <td>4.20</td>\n",
|
||
" <td>63.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
||
" <td>2 GB RAM</td>\n",
|
||
" <td>4000 mAh Battery</td>\n",
|
||
" <td>6.4 inches</td>\n",
|
||
" <td>13 MP + 5 MP + 2 MP Triple Rear &amp; 8 MP Fro...</td>\n",
|
||
" <td>Memory Card Supported, upto 512 GB</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>9,990</td>\n",
|
||
" <td>Samsung</td>\n",
|
||
" <td>32 GB inbuilt</td>\n",
|
||
" <td>15W Fast Charging</td>\n",
|
||
" <td>720 x 1560 px Display with Punch Hole</td>\n",
|
||
" <td>1.8 GHz Processor</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Samsung Galaxy A13</td>\n",
|
||
" <td>4.30</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
||
" <td>4 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP Quad Rear &amp; 8 MP Front Camera</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>11,999</td>\n",
|
||
" <td>Samsung</td>\n",
|
||
" <td>64 GB inbuilt</td>\n",
|
||
" <td>25W Fast Charging</td>\n",
|
||
" <td>1080 x 2408 px Display with Water Drop Notch</td>\n",
|
||
" <td>2 GHz Processor</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Samsung Galaxy F23</td>\n",
|
||
" <td>4.10</td>\n",
|
||
" <td>73.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
||
" <td>4 GB RAM</td>\n",
|
||
" <td>6000 mAh Battery</td>\n",
|
||
" <td>6.4 inches</td>\n",
|
||
" <td>48 MP Quad Rear &amp; 13 MP Front Camera</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>11,999</td>\n",
|
||
" <td>Samsung</td>\n",
|
||
" <td>64 GB inbuilt</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>720 x 1600 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Helio G88</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Samsung Galaxy A03s (4GB RAM + 64GB)</td>\n",
|
||
" <td>4.10</td>\n",
|
||
" <td>69.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
||
" <td>4 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.5 inches</td>\n",
|
||
" <td>13 MP + 2 MP + 2 MP Triple Rear &amp; 5 MP Fro...</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>11,999</td>\n",
|
||
" <td>Samsung</td>\n",
|
||
" <td>64 GB inbuilt</td>\n",
|
||
" <td>15W Fast Charging</td>\n",
|
||
" <td>720 x 1600 px Display with Water Drop Notch</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Helio P35</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name Rating Spec_score \\\n",
|
||
"ID \n",
|
||
"0 Samsung Galaxy F14 5G 4.65 68.0 \n",
|
||
"1 Samsung Galaxy A11 4.20 63.0 \n",
|
||
"2 Samsung Galaxy A13 4.30 NaN \n",
|
||
"3 Samsung Galaxy F23 4.10 73.0 \n",
|
||
"4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69.0 \n",
|
||
"\n",
|
||
" No_of_sim Ram Battery Display \\\n",
|
||
"ID \n",
|
||
"0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches \n",
|
||
"1 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches \n",
|
||
"2 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches \n",
|
||
"3 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n",
|
||
"4 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n",
|
||
"\n",
|
||
" Camera \\\n",
|
||
"ID \n",
|
||
"0 50 MP + 2 MP Dual Rear & 13 MP Front Camera \n",
|
||
"1 13 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
||
"2 50 MP Quad Rear & 8 MP Front Camera \n",
|
||
"3 48 MP Quad Rear & 13 MP Front Camera \n",
|
||
"4 13 MP + 2 MP + 2 MP Triple Rear & 5 MP Fro... \n",
|
||
"\n",
|
||
" External_Memory Android_version Price company \\\n",
|
||
"ID \n",
|
||
"0 Memory Card Supported, upto 1 TB 13 9.999 Samsung \n",
|
||
"1 Memory Card Supported, upto 512 GB 10 9,990 Samsung \n",
|
||
"2 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
|
||
"3 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
|
||
"4 Memory Card Supported, upto 1 TB 11 11,999 Samsung \n",
|
||
"\n",
|
||
" Inbuilt_memory fast_charging \\\n",
|
||
"ID \n",
|
||
"0 128 GB inbuilt 25W Fast Charging \n",
|
||
"1 32 GB inbuilt 15W Fast Charging \n",
|
||
"2 64 GB inbuilt 25W Fast Charging \n",
|
||
"3 64 GB inbuilt NaN \n",
|
||
"4 64 GB inbuilt 15W Fast Charging \n",
|
||
"\n",
|
||
" Screen_resolution Processor \\\n",
|
||
"ID \n",
|
||
"0 2408 x 1080 px Display with Water Drop Notch Octa Core Processor \n",
|
||
"1 720 x 1560 px Display with Punch Hole 1.8 GHz Processor \n",
|
||
"2 1080 x 2408 px Display with Water Drop Notch 2 GHz Processor \n",
|
||
"3 720 x 1600 px Octa Core \n",
|
||
"4 720 x 1600 px Display with Water Drop Notch Octa Core \n",
|
||
"\n",
|
||
" Processor_name Rating_index \n",
|
||
"ID \n",
|
||
"0 Exynos 1330 1 \n",
|
||
"1 Octa Core 0 \n",
|
||
"2 Octa Core 0 \n",
|
||
"3 Helio G88 0 \n",
|
||
"4 Helio P35 0 "
|
||
]
|
||
},
|
||
"execution_count": 83,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"df1 = pd.read_csv(\"../data/mobile_phone_price_prediction.csv\", index_col=\"ID\")\n",
|
||
"df1[\"Spec_score\"] = df1[\"Spec_score\"].replace({\"\": None})\n",
|
||
"df1[\"Rating_index\"] = df1[\"Rating\"].apply(lambda x: 1 if float(x) > 4.5 else 0)\n",
|
||
"\n",
|
||
"df1.info()\n",
|
||
"\n",
|
||
"print(df1.shape)\n",
|
||
"\n",
|
||
"df1.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Получение сведений о пропущенных данных"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Типы пропущенных данных:\n",
|
||
"- None - представление пустых данных в Python\n",
|
||
"- NaN - представление пустых данных в Pandas\n",
|
||
"- '' - пустая строка"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 68,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Name 0\n",
|
||
"Rating 0\n",
|
||
"Spec_score 3\n",
|
||
"No_of_sim 0\n",
|
||
"Ram 0\n",
|
||
"Battery 0\n",
|
||
"Display 0\n",
|
||
"Camera 0\n",
|
||
"External_Memory 0\n",
|
||
"Android_version 443\n",
|
||
"Price 0\n",
|
||
"company 0\n",
|
||
"Inbuilt_memory 19\n",
|
||
"fast_charging 89\n",
|
||
"Screen_resolution 2\n",
|
||
"Processor 28\n",
|
||
"Processor_name 0\n",
|
||
"company_index 0\n",
|
||
"dtype: int64\n",
|
||
"\n",
|
||
"Name False\n",
|
||
"Rating False\n",
|
||
"Spec_score True\n",
|
||
"No_of_sim False\n",
|
||
"Ram False\n",
|
||
"Battery False\n",
|
||
"Display False\n",
|
||
"Camera False\n",
|
||
"External_Memory False\n",
|
||
"Android_version True\n",
|
||
"Price False\n",
|
||
"company False\n",
|
||
"Inbuilt_memory True\n",
|
||
"fast_charging True\n",
|
||
"Screen_resolution True\n",
|
||
"Processor True\n",
|
||
"Processor_name False\n",
|
||
"company_index False\n",
|
||
"dtype: bool\n",
|
||
"\n",
|
||
"Spec_score процент пустых значений: %0.22\n",
|
||
"Android_version процент пустых значений: %32.34\n",
|
||
"Inbuilt_memory процент пустых значений: %1.39\n",
|
||
"fast_charging процент пустых значений: %6.50\n",
|
||
"Screen_resolution процент пустых значений: %0.15\n",
|
||
"Processor процент пустых значений: %2.04\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Количество пустых значений признаков\n",
|
||
"print(df1.isnull().sum())\n",
|
||
"\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Есть ли пустые значения признаков\n",
|
||
"print(df1.isnull().any())\n",
|
||
"\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Процент пустых значений признаков\n",
|
||
"for i in df1.columns:\n",
|
||
" null_rate = df1[i].isnull().sum() / len(df1) * 100\n",
|
||
" if null_rate > 0:\n",
|
||
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Заполнение пропущенных данных\n",
|
||
"\n",
|
||
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
|
||
"\n",
|
||
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 86,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(1370, 18)\n",
|
||
"Name False\n",
|
||
"Rating False\n",
|
||
"Spec_score False\n",
|
||
"No_of_sim False\n",
|
||
"Ram False\n",
|
||
"Battery False\n",
|
||
"Display False\n",
|
||
"Camera False\n",
|
||
"External_Memory False\n",
|
||
"Android_version False\n",
|
||
"Price False\n",
|
||
"company False\n",
|
||
"Inbuilt_memory False\n",
|
||
"fast_charging False\n",
|
||
"Screen_resolution False\n",
|
||
"Processor False\n",
|
||
"Processor_name False\n",
|
||
"Rating_index False\n",
|
||
"dtype: bool\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Rating</th>\n",
|
||
" <th>Spec_score</th>\n",
|
||
" <th>No_of_sim</th>\n",
|
||
" <th>Ram</th>\n",
|
||
" <th>Battery</th>\n",
|
||
" <th>Display</th>\n",
|
||
" <th>Camera</th>\n",
|
||
" <th>External_Memory</th>\n",
|
||
" <th>Android_version</th>\n",
|
||
" <th>Price</th>\n",
|
||
" <th>company</th>\n",
|
||
" <th>Inbuilt_memory</th>\n",
|
||
" <th>fast_charging</th>\n",
|
||
" <th>Screen_resolution</th>\n",
|
||
" <th>Processor</th>\n",
|
||
" <th>Processor_name</th>\n",
|
||
" <th>Rating_index</th>\n",
|
||
" <th>Spec_scoreFillNA</th>\n",
|
||
" <th>Spec_scoreFillMedian</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>ID</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1365</th>\n",
|
||
" <td>TCL 40R</td>\n",
|
||
" <td>4.05</td>\n",
|
||
" <td>75.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>4 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro...</td>\n",
|
||
" <td>Memory Card (Hybrid)</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>18,999</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>64 GB inbuilt</td>\n",
|
||
" <td>15W Fast Charging</td>\n",
|
||
" <td>720 x 1612 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 700 5G</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>75.0</td>\n",
|
||
" <td>75.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1366</th>\n",
|
||
" <td>TCL 50 XL NxtPaper 5G</td>\n",
|
||
" <td>4.10</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
||
" <td>8 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.8 inches</td>\n",
|
||
" <td>50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera</td>\n",
|
||
" <td>Memory Card (Hybrid)</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>24,990</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>128 GB inbuilt</td>\n",
|
||
" <td>33W Fast Charging</td>\n",
|
||
" <td>1200 x 2400 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 7050</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1367</th>\n",
|
||
" <td>TCL 50 XE NxtPaper 5G</td>\n",
|
||
" <td>4.00</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>6 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>23,990</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>256 GB inbuilt</td>\n",
|
||
" <td>18W Fast Charging</td>\n",
|
||
" <td>720 x 1612 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 6080</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1368</th>\n",
|
||
" <td>TCL 40 NxtPaper 5G</td>\n",
|
||
" <td>4.50</td>\n",
|
||
" <td>79.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>6 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro...</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>22,499</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>256 GB inbuilt</td>\n",
|
||
" <td>15W Fast Charging</td>\n",
|
||
" <td>720 x 1612 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 6020</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>79.0</td>\n",
|
||
" <td>79.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1369</th>\n",
|
||
" <td>TCL Trifold</td>\n",
|
||
" <td>4.65</td>\n",
|
||
" <td>93.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G,</td>\n",
|
||
" <td>12 GB RAM</td>\n",
|
||
" <td>4600 mAh Battery</td>\n",
|
||
" <td>10 inches</td>\n",
|
||
" <td>Foldable Display, Dual Display</td>\n",
|
||
" <td>50 MP + 48 MP + 8 MP Triple Rear &amp; 32 MP F...</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>1,19,990</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>256 GB inbuilt</td>\n",
|
||
" <td>67W Fast Charging</td>\n",
|
||
" <td>1916 x 2160 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Snapdragon 8 Gen2</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>93.0</td>\n",
|
||
" <td>93.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name Rating Spec_score \\\n",
|
||
"ID \n",
|
||
"1365 TCL 40R 4.05 75.0 \n",
|
||
"1366 TCL 50 XL NxtPaper 5G 4.10 80.0 \n",
|
||
"1367 TCL 50 XE NxtPaper 5G 4.00 80.0 \n",
|
||
"1368 TCL 40 NxtPaper 5G 4.50 79.0 \n",
|
||
"1369 TCL Trifold 4.65 93.0 \n",
|
||
"\n",
|
||
" No_of_sim Ram Battery \\\n",
|
||
"ID \n",
|
||
"1365 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery \n",
|
||
"1366 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery \n",
|
||
"1367 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
|
||
"1368 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
|
||
"1369 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery \n",
|
||
"\n",
|
||
" Display Camera \\\n",
|
||
"ID \n",
|
||
"1365 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
||
"1366 6.8 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n",
|
||
"1367 6.6 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n",
|
||
"1368 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
||
"1369 10 inches Foldable Display, Dual Display \n",
|
||
"\n",
|
||
" External_Memory Android_version \\\n",
|
||
"ID \n",
|
||
"1365 Memory Card (Hybrid) 12 \n",
|
||
"1366 Memory Card (Hybrid) 14 \n",
|
||
"1367 Memory Card Supported, upto 1 TB 13 \n",
|
||
"1368 Memory Card Supported, upto 1 TB 13 \n",
|
||
"1369 50 MP + 48 MP + 8 MP Triple Rear & 32 MP F... 13 \n",
|
||
"\n",
|
||
" Price company Inbuilt_memory fast_charging Screen_resolution \\\n",
|
||
"ID \n",
|
||
"1365 18,999 TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
|
||
"1366 24,990 TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px \n",
|
||
"1367 23,990 TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px \n",
|
||
"1368 22,499 TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
|
||
"1369 1,19,990 TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px \n",
|
||
"\n",
|
||
" Processor Processor_name Rating_index Spec_scoreFillNA \\\n",
|
||
"ID \n",
|
||
"1365 Octa Core Dimensity 700 5G 0 75.0 \n",
|
||
"1366 Octa Core Dimensity 7050 0 80.0 \n",
|
||
"1367 Octa Core Dimensity 6080 0 80.0 \n",
|
||
"1368 Octa Core Dimensity 6020 0 79.0 \n",
|
||
"1369 Octa Core Snapdragon 8 Gen2 1 93.0 \n",
|
||
"\n",
|
||
" Spec_scoreFillMedian \n",
|
||
"ID \n",
|
||
"1365 75.0 \n",
|
||
"1366 80.0 \n",
|
||
"1367 80.0 \n",
|
||
"1368 79.0 \n",
|
||
"1369 93.0 "
|
||
]
|
||
},
|
||
"execution_count": 86,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"fillna_df = df1.fillna(0)\n",
|
||
"\n",
|
||
"print(fillna_df.shape)\n",
|
||
"\n",
|
||
"print(fillna_df.isnull().any())\n",
|
||
"\n",
|
||
"# Замена пустых данных на 0\n",
|
||
"df1[\"Spec_scoreFillNA\"] = df1[\"Spec_score\"].fillna(0)\n",
|
||
"\n",
|
||
"# Replace missing values with the median of the observed Spec_score values (NaN is skipped), not of the zero-filled column\n",
|
||
"df1[\"Spec_scoreFillMedian\"] = df1[\"Spec_score\"].fillna(df1[\"Spec_score\"].median())\n",
|
||
"\n",
|
||
"df1.tail()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 70,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name</th>\n",
|
||
" <th>Rating</th>\n",
|
||
" <th>Spec_score</th>\n",
|
||
" <th>No_of_sim</th>\n",
|
||
" <th>Ram</th>\n",
|
||
" <th>Battery</th>\n",
|
||
" <th>Display</th>\n",
|
||
" <th>Camera</th>\n",
|
||
" <th>External_Memory</th>\n",
|
||
" <th>Android_version</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>company</th>\n",
|
||
" <th>Inbuilt_memory</th>\n",
|
||
" <th>fast_charging</th>\n",
|
||
" <th>Screen_resolution</th>\n",
|
||
" <th>Processor</th>\n",
|
||
" <th>Processor_name</th>\n",
|
||
" <th>company_index</th>\n",
|
||
" <th>Spec_scoreFillNA</th>\n",
|
||
" <th>Spec_scoreFillMedian</th>\n",
|
||
" <th>Android_versionCopy</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>ID</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1365</th>\n",
|
||
" <td>TCL 40R</td>\n",
|
||
" <td>4.05</td>\n",
|
||
" <td>75.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>4 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro...</td>\n",
|
||
" <td>Memory Card (Hybrid)</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>64 GB inbuilt</td>\n",
|
||
" <td>15W Fast Charging</td>\n",
|
||
" <td>720 x 1612 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 700 5G</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>75.0</td>\n",
|
||
" <td>75.0</td>\n",
|
||
" <td>12</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1366</th>\n",
|
||
" <td>TCL 50 XL NxtPaper 5G</td>\n",
|
||
" <td>4.10</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
||
" <td>8 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.8 inches</td>\n",
|
||
" <td>50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera</td>\n",
|
||
" <td>Memory Card (Hybrid)</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>128 GB inbuilt</td>\n",
|
||
" <td>33W Fast Charging</td>\n",
|
||
" <td>1200 x 2400 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 7050</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>14</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1367</th>\n",
|
||
" <td>TCL 50 XE NxtPaper 5G</td>\n",
|
||
" <td>4.00</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>6 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>256 GB inbuilt</td>\n",
|
||
" <td>18W Fast Charging</td>\n",
|
||
" <td>720 x 1612 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 6080</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>13</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1368</th>\n",
|
||
" <td>TCL 40 NxtPaper 5G</td>\n",
|
||
" <td>4.50</td>\n",
|
||
" <td>79.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>6 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro...</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>256 GB inbuilt</td>\n",
|
||
" <td>15W Fast Charging</td>\n",
|
||
" <td>720 x 1612 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 6020</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>79.0</td>\n",
|
||
" <td>79.0</td>\n",
|
||
" <td>13</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1369</th>\n",
|
||
" <td>TCL Trifold</td>\n",
|
||
" <td>4.65</td>\n",
|
||
" <td>93.0</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G,</td>\n",
|
||
" <td>12 GB RAM</td>\n",
|
||
" <td>4600 mAh Battery</td>\n",
|
||
" <td>10 inches</td>\n",
|
||
" <td>Foldable Display, Dual Display</td>\n",
|
||
" <td>50 MP + 48 MP + 8 MP Triple Rear &amp; 32 MP F...</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>256 GB inbuilt</td>\n",
|
||
" <td>67W Fast Charging</td>\n",
|
||
" <td>1916 x 2160 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Snapdragon 8 Gen2</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>93.0</td>\n",
|
||
" <td>93.0</td>\n",
|
||
" <td>13</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 21 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name Rating Spec_score \\\n",
|
||
"ID \n",
|
||
"1365 TCL 40R 4.05 75.0 \n",
|
||
"1366 TCL 50 XL NxtPaper 5G 4.10 80.0 \n",
|
||
"1367 TCL 50 XE NxtPaper 5G 4.00 80.0 \n",
|
||
"1368 TCL 40 NxtPaper 5G 4.50 79.0 \n",
|
||
"1369 TCL Trifold 4.65 93.0 \n",
|
||
"\n",
|
||
" No_of_sim Ram Battery \\\n",
|
||
"ID \n",
|
||
"1365 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery \n",
|
||
"1366 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery \n",
|
||
"1367 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
|
||
"1368 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
|
||
"1369 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery \n",
|
||
"\n",
|
||
" Display Camera \\\n",
|
||
"ID \n",
|
||
"1365 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
||
"1366 6.8 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n",
|
||
"1367 6.6 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n",
|
||
"1368 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
||
"1369 10 inches Foldable Display, Dual Display \n",
|
||
"\n",
|
||
" External_Memory Android_version ... \\\n",
|
||
"ID ... \n",
|
||
"1365 Memory Card (Hybrid) 12 ... \n",
|
||
"1366 Memory Card (Hybrid) 14 ... \n",
|
||
"1367 Memory Card Supported, upto 1 TB 13 ... \n",
|
||
"1368 Memory Card Supported, upto 1 TB 13 ... \n",
|
||
"1369 50 MP + 48 MP + 8 MP Triple Rear & 32 MP F... 13 ... \n",
|
||
"\n",
|
||
" company Inbuilt_memory fast_charging Screen_resolution \\\n",
|
||
"ID \n",
|
||
"1365 TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
|
||
"1366 TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px \n",
|
||
"1367 TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px \n",
|
||
"1368 TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
|
||
"1369 TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px \n",
|
||
"\n",
|
||
" Processor Processor_name company_index Spec_scoreFillNA \\\n",
|
||
"ID \n",
|
||
"1365 Octa Core Dimensity 700 5G TCL 75.0 \n",
|
||
"1366 Octa Core Dimensity 7050 TCL 80.0 \n",
|
||
"1367 Octa Core Dimensity 6080 TCL 80.0 \n",
|
||
"1368 Octa Core Dimensity 6020 TCL 79.0 \n",
|
||
"1369 Octa Core Snapdragon 8 Gen2 TCL 93.0 \n",
|
||
"\n",
|
||
" Spec_scoreFillMedian Android_versionCopy \n",
|
||
"ID \n",
|
||
"1365 75.0 12 \n",
|
||
"1366 80.0 14 \n",
|
||
"1367 80.0 13 \n",
|
||
"1368 79.0 13 \n",
|
||
"1369 93.0 13 \n",
|
||
"\n",
|
||
"[5 rows x 21 columns]"
|
||
]
|
||
},
|
||
"execution_count": 70,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df1[\"Android_versionCopy\"] = df1[\"Android_version\"]\n",
|
||
"\n",
|
||
"# Замена данных сразу в DataFrame без копирования\n",
|
||
"df1.fillna({\"Android_versionCopy\": 0}, inplace=True)\n",
|
||
"\n",
|
||
"df1.tail()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Удаление наблюдений с пропусками"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 71,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"(814, 21)\n",
|
||
"Name False\n",
|
||
"Rating False\n",
|
||
"Spec_score False\n",
|
||
"No_of_sim False\n",
|
||
"Ram False\n",
|
||
"Battery False\n",
|
||
"Display False\n",
|
||
"Camera False\n",
|
||
"External_Memory False\n",
|
||
"Android_version False\n",
|
||
"Price False\n",
|
||
"company False\n",
|
||
"Inbuilt_memory False\n",
|
||
"fast_charging False\n",
|
||
"Screen_resolution False\n",
|
||
"Processor False\n",
|
||
"Processor_name False\n",
|
||
"company_index False\n",
|
||
"dtype: bool\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"dropna_df = df1.dropna()\n",
|
||
"\n",
|
||
"print(dropna_df.shape)\n",
|
||
"\n",
|
||
"# Verify that no missing values remain after dropping incomplete rows\n",
|
||
"print(dropna_df.isnull().any())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Создание выборок данных\n",
|
||
"\n",
|
||
"Библиотека scikit-learn\n",
|
||
"\n",
|
||
"https://scikit-learn.org/stable/index.html"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 72,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Функция для создания выборок\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"\n",
|
||
"\n",
|
||
"def split_stratified_into_train_val_test(\n",
|
||
" df_input,\n",
|
||
" stratify_colname=\"y\",\n",
|
||
" frac_train=0.6,\n",
|
||
" frac_val=0.15,\n",
|
||
" frac_test=0.25,\n",
|
||
" random_state=None,\n",
|
||
"):\n",
|
||
" \"\"\"\n",
|
||
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
|
||
" following fractional ratios provided by the user, where each subset is\n",
|
||
" stratified by the values in a specific column (that is, each subset has\n",
|
||
" the same relative frequency of the values in the column). It performs this\n",
|
||
" splitting by running train_test_split() twice.\n",
|
||
"\n",
|
||
" Parameters\n",
|
||
" ----------\n",
|
||
" df_input : Pandas dataframe\n",
|
||
" Input dataframe to be split.\n",
|
||
" stratify_colname : str\n",
|
||
" The name of the column that will be used for stratification. Usually\n",
|
||
" this column would be for the label.\n",
|
||
" frac_train : float\n",
|
||
" frac_val : float\n",
|
||
" frac_test : float\n",
|
||
" The ratios with which the dataframe will be split into train, val, and\n",
|
||
" test data. The values should be expressed as float fractions and should\n",
|
||
" sum to 1.0.\n",
|
||
" random_state : int, None, or RandomStateInstance\n",
|
||
" Value to be passed to train_test_split().\n",
|
||
"\n",
|
||
" Returns\n",
|
||
" -------\n",
|
||
" df_train, df_val, df_test :\n",
|
||
" Dataframes containing the three splits.\n",
|
||
" \"\"\"\n",
|
||
"\n",
|
||
"    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:\n",
|
||
" raise ValueError(\n",
|
||
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
||
" % (frac_train, frac_val, frac_test)\n",
|
||
" )\n",
|
||
"\n",
|
||
" if stratify_colname not in df_input.columns:\n",
|
||
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
||
"\n",
|
||
" X = df_input # Contains all columns.\n",
|
||
" y = df_input[\n",
|
||
" [stratify_colname]\n",
|
||
" ] # Dataframe of just the column on which to stratify.\n",
|
||
"\n",
|
||
" # Split original dataframe into train and temp dataframes.\n",
|
||
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
||
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Split the temp dataframe into val and test dataframes.\n",
|
||
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
||
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
||
" df_temp,\n",
|
||
" y_temp,\n",
|
||
" stratify=y_temp,\n",
|
||
" test_size=relative_frac_test,\n",
|
||
" random_state=random_state,\n",
|
||
" )\n",
|
||
"\n",
|
||
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
||
"\n",
|
||
" return df_train, df_val, df_test"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 89,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Rating_index\n",
|
||
"0 942\n",
|
||
"1 428\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Обучающая выборка: (822, 2)\n",
|
||
"Rating_index\n",
|
||
"0 565\n",
|
||
"1 257\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Контрольная выборка: (274, 2)\n",
|
||
"Rating_index\n",
|
||
"0 189\n",
|
||
"1 85\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Тестовая выборка: (274, 2)\n",
|
||
"Rating_index\n",
|
||
"0 188\n",
|
||
"1 86\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Вывод распределения количества наблюдений по меткам (классам)\n",
|
||
"print(df1.Rating_index.value_counts())\n",
|
||
"\n",
|
||
"data = df1[[\"Rating_index\", \"Spec_scoreFillMedian\"]].copy()\n",
|
||
"\n",
|
||
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
||
" data,\n",
|
||
" stratify_colname=\"Rating_index\",\n",
|
||
" frac_train=0.60,\n",
|
||
" frac_val=0.20,\n",
|
||
" frac_test=0.20,\n",
|
||
")\n",
|
||
"\n",
|
||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"print(df_train.Rating_index.value_counts())\n",
|
||
"\n",
|
||
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
||
"print(df_val.Rating_index.value_counts())\n",
|
||
"\n",
|
||
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
||
"print(df_test.Rating_index.value_counts())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выборка с избытком (oversampling)\n",
|
||
"\n",
|
||
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
|
||
"\n",
|
||
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
|
||
"\n",
|
||
"Выборка с недостатком (undersampling)\n",
|
||
"\n",
|
||
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
|
||
"\n",
|
||
"Библиотека imbalanced-learn\n",
|
||
"\n",
|
||
"https://imbalanced-learn.org/stable/"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 90,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Обучающая выборка: (822, 2)\n",
|
||
"Rating_index\n",
|
||
"0 565\n",
|
||
"1 257\n",
|
||
"Name: count, dtype: int64\n",
|
||
"Обучающая выборка после oversampling: (1127, 2)\n",
|
||
"Rating_index\n",
|
||
"0 565\n",
|
||
"1 562\n",
|
||
"Name: count, dtype: int64\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Rating_index</th>\n",
|
||
" <th>Spec_scoreFillMedian</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>75.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>83.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>76.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>80.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>65.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1122</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>64.392639</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1123</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>63.805529</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1124</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>64.607184</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1125</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>64.688453</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1126</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>64.376356</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>1127 rows × 2 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Rating_index Spec_scoreFillMedian\n",
|
||
"0 0 75.000000\n",
|
||
"1 0 83.000000\n",
|
||
"2 0 76.000000\n",
|
||
"3 0 80.000000\n",
|
||
"4 0 65.000000\n",
|
||
"... ... ...\n",
|
||
"1122 1 64.392639\n",
|
||
"1123 1 63.805529\n",
|
||
"1124 1 64.607184\n",
|
||
"1125 1 64.688453\n",
|
||
"1126 1 64.376356\n",
|
||
"\n",
|
||
"[1127 rows x 2 columns]"
|
||
]
|
||
},
|
||
"execution_count": 90,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from imblearn.over_sampling import ADASYN\n",
|
||
"\n",
|
||
"ada = ADASYN()\n",
|
||
"\n",
|
||
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"print(df_train.Rating_index.value_counts())\n",
|
||
"\n",
|
||
"X_resampled, y_resampled = ada.fit_resample(df_train.drop(columns=[\"Rating_index\"]), df_train[\"Rating_index\"])\n",
|
||
"df_train_adasyn = pd.concat([y_resampled, X_resampled], axis=1)\n",
|
||
"\n",
|
||
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
|
||
"print(df_train_adasyn.Rating_index.value_counts())\n",
|
||
"\n",
|
||
"df_train_adasyn"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.6"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|