MII_Yunusov_Niyaz/notebooks/lec2.ipynb
2024-09-30 23:02:17 +04:00

1358 lines
50 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Загрузка данных в DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 1370 entries, 0 to 1369\n",
"Data columns (total 18 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Name 1370 non-null object \n",
" 1 Rating 1370 non-null float64\n",
" 2 Spec_score 1367 non-null float64\n",
" 3 No_of_sim 1370 non-null object \n",
" 4 Ram 1370 non-null object \n",
" 5 Battery 1370 non-null object \n",
" 6 Display 1370 non-null object \n",
" 7 Camera 1370 non-null object \n",
" 8 External_Memory 1370 non-null object \n",
" 9 Android_version 927 non-null object \n",
" 10 Price 1370 non-null object \n",
" 11 company 1370 non-null object \n",
" 12 Inbuilt_memory 1351 non-null object \n",
" 13 fast_charging 1281 non-null object \n",
" 14 Screen_resolution 1368 non-null object \n",
" 15 Processor 1342 non-null object \n",
" 16 Processor_name 1370 non-null object \n",
" 17 Rating_index 1370 non-null int64 \n",
"dtypes: float64(2), int64(1), object(15)\n",
"memory usage: 203.4+ KB\n",
"(1370, 18)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Rating</th>\n",
" <th>Spec_score</th>\n",
" <th>No_of_sim</th>\n",
" <th>Ram</th>\n",
" <th>Battery</th>\n",
" <th>Display</th>\n",
" <th>Camera</th>\n",
" <th>External_Memory</th>\n",
" <th>Android_version</th>\n",
" <th>Price</th>\n",
" <th>company</th>\n",
" <th>Inbuilt_memory</th>\n",
" <th>fast_charging</th>\n",
" <th>Screen_resolution</th>\n",
" <th>Processor</th>\n",
" <th>Processor_name</th>\n",
" <th>Rating_index</th>\n",
" </tr>\n",
" <tr>\n",
" <th>ID</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Samsung Galaxy F14 5G</td>\n",
" <td>4.65</td>\n",
" <td>68.0</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>6000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP Dual Rear &amp;amp; 13 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>13</td>\n",
" <td>9.999</td>\n",
" <td>Samsung</td>\n",
" <td>128 GB inbuilt</td>\n",
" <td>25W Fast Charging</td>\n",
" <td>2408 x 1080 px Display with Water Drop Notch</td>\n",
" <td>Octa Core Processor</td>\n",
" <td>Exynos 1330</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Samsung Galaxy A11</td>\n",
" <td>4.20</td>\n",
" <td>63.0</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>2 GB RAM</td>\n",
" <td>4000 mAh Battery</td>\n",
" <td>6.4 inches</td>\n",
" <td>13 MP + 5 MP + 2 MP Triple Rear &amp;amp; 8 MP Fro...</td>\n",
" <td>Memory Card Supported, upto 512 GB</td>\n",
" <td>10</td>\n",
" <td>9,990</td>\n",
" <td>Samsung</td>\n",
" <td>32 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1560 px Display with Punch Hole</td>\n",
" <td>1.8 GHz Processor</td>\n",
" <td>Octa Core</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Samsung Galaxy A13</td>\n",
" <td>4.30</td>\n",
" <td>NaN</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP Quad Rear &amp;amp; 8 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>12</td>\n",
" <td>11,999</td>\n",
" <td>Samsung</td>\n",
" <td>64 GB inbuilt</td>\n",
" <td>25W Fast Charging</td>\n",
" <td>1080 x 2408 px Display with Water Drop Notch</td>\n",
" <td>2 GHz Processor</td>\n",
" <td>Octa Core</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Samsung Galaxy F23</td>\n",
" <td>4.10</td>\n",
" <td>73.0</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>6000 mAh Battery</td>\n",
" <td>6.4 inches</td>\n",
" <td>48 MP Quad Rear &amp;amp; 13 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>12</td>\n",
" <td>11,999</td>\n",
" <td>Samsung</td>\n",
" <td>64 GB inbuilt</td>\n",
" <td>NaN</td>\n",
" <td>720 x 1600 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Helio G88</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Samsung Galaxy A03s (4GB RAM + 64GB)</td>\n",
" <td>4.10</td>\n",
" <td>69.0</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.5 inches</td>\n",
" <td>13 MP + 2 MP + 2 MP Triple Rear &amp;amp; 5 MP Fro...</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>11</td>\n",
" <td>11,999</td>\n",
" <td>Samsung</td>\n",
" <td>64 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1600 px Display with Water Drop Notch</td>\n",
" <td>Octa Core</td>\n",
" <td>Helio P35</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Rating Spec_score \\\n",
"ID \n",
"0 Samsung Galaxy F14 5G 4.65 68.0 \n",
"1 Samsung Galaxy A11 4.20 63.0 \n",
"2 Samsung Galaxy A13 4.30 NaN \n",
"3 Samsung Galaxy F23 4.10 73.0 \n",
"4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69.0 \n",
"\n",
" No_of_sim Ram Battery Display \\\n",
"ID \n",
"0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches \n",
"1 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches \n",
"2 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches \n",
"3 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n",
"4 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n",
"\n",
" Camera \\\n",
"ID \n",
"0 50 MP + 2 MP Dual Rear &amp; 13 MP Front Camera \n",
"1 13 MP + 5 MP + 2 MP Triple Rear &amp; 8 MP Fro... \n",
"2 50 MP Quad Rear &amp; 8 MP Front Camera \n",
"3 48 MP Quad Rear &amp; 13 MP Front Camera \n",
"4 13 MP + 2 MP + 2 MP Triple Rear &amp; 5 MP Fro... \n",
"\n",
" External_Memory Android_version Price company \\\n",
"ID \n",
"0 Memory Card Supported, upto 1 TB 13 9.999 Samsung \n",
"1 Memory Card Supported, upto 512 GB 10 9,990 Samsung \n",
"2 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
"3 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
"4 Memory Card Supported, upto 1 TB 11 11,999 Samsung \n",
"\n",
" Inbuilt_memory fast_charging \\\n",
"ID \n",
"0 128 GB inbuilt 25W Fast Charging \n",
"1 32 GB inbuilt 15W Fast Charging \n",
"2 64 GB inbuilt 25W Fast Charging \n",
"3 64 GB inbuilt NaN \n",
"4 64 GB inbuilt 15W Fast Charging \n",
"\n",
" Screen_resolution Processor \\\n",
"ID \n",
"0 2408 x 1080 px Display with Water Drop Notch Octa Core Processor \n",
"1 720 x 1560 px Display with Punch Hole 1.8 GHz Processor \n",
"2 1080 x 2408 px Display with Water Drop Notch 2 GHz Processor \n",
"3 720 x 1600 px Octa Core \n",
"4 720 x 1600 px Display with Water Drop Notch Octa Core \n",
"\n",
" Processor_name Rating_index \n",
"ID \n",
"0 Exynos 1330 1 \n",
"1 Octa Core 0 \n",
"2 Octa Core 0 \n",
"3 Helio G88 0 \n",
"4 Helio P35 0 "
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Read the phone dataset; the \"ID\" column of the CSV becomes the row index.\n",
"df1 = pd.read_csv(\"../data/mobile_phone_price_prediction.csv\", index_col=\"ID\")\n",
"\n",
"# Replace empty-string entries of Spec_score with None so they count as missing.\n",
"df1[\"Spec_score\"] = df1[\"Spec_score\"].replace(to_replace={\"\": None})\n",
"\n",
"# Binary label: 1 when Rating is strictly above 4.5, otherwise 0.\n",
"df1[\"Rating_index\"] = df1[\"Rating\"].map(lambda rating: int(float(rating) > 4.5))\n",
"\n",
"# Column dtypes and non-null counts, followed by the frame dimensions.\n",
"df1.info()\n",
"print(df1.shape)\n",
"\n",
"# First five rows are the displayed result of the cell.\n",
"df1.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Получение сведений о пропущенных данных"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Типы пропущенных данных:\n",
"- None - представление пустых данных в Python\n",
"- NaN - представление пустых данных в Pandas\n",
"- '' - пустая строка"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Name 0\n",
"Rating 0\n",
"Spec_score 3\n",
"No_of_sim 0\n",
"Ram 0\n",
"Battery 0\n",
"Display 0\n",
"Camera 0\n",
"External_Memory 0\n",
"Android_version 443\n",
"Price 0\n",
"company 0\n",
"Inbuilt_memory 19\n",
"fast_charging 89\n",
"Screen_resolution 2\n",
"Processor 28\n",
"Processor_name 0\n",
"company_index 0\n",
"dtype: int64\n",
"\n",
"Name False\n",
"Rating False\n",
"Spec_score True\n",
"No_of_sim False\n",
"Ram False\n",
"Battery False\n",
"Display False\n",
"Camera False\n",
"External_Memory False\n",
"Android_version True\n",
"Price False\n",
"company False\n",
"Inbuilt_memory True\n",
"fast_charging True\n",
"Screen_resolution True\n",
"Processor True\n",
"Processor_name False\n",
"company_index False\n",
"dtype: bool\n",
"\n",
"Spec_score процент пустых значений: %0.22\n",
"Android_version процент пустых значений: %32.34\n",
"Inbuilt_memory процент пустых значений: %1.39\n",
"fast_charging процент пустых значений: %6.50\n",
"Screen_resolution процент пустых значений: %0.15\n",
"Processor процент пустых значений: %2.04\n"
]
}
],
"source": [
"# Number of missing values per feature\n",
"missing_counts = df1.isnull().sum()\n",
"print(missing_counts)\n",
"\n",
"print()\n",
"\n",
"# Whether each feature contains at least one missing value\n",
"has_missing = df1.isnull().any()\n",
"print(has_missing)\n",
"\n",
"print()\n",
"\n",
"# Percentage of missing values, reported only for features that actually have gaps\n",
"total_rows = len(df1)\n",
"for column_name in df1.columns:\n",
"    share_missing = df1[column_name].isnull().sum() / total_rows * 100\n",
"    if not share_missing > 0:\n",
"        continue\n",
"    print(f\"{column_name} процент пустых значений: %{share_missing:.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Заполнение пропущенных данных\n",
"\n",
"https://pythonmldaily.com/posts/pandas-dataframes-search-drop-empty-values\n",
"\n",
"https://scales.arabpsychology.com/stats/how-to-fill-nan-values-with-median-in-pandas/"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1370, 18)\n",
"Name False\n",
"Rating False\n",
"Spec_score False\n",
"No_of_sim False\n",
"Ram False\n",
"Battery False\n",
"Display False\n",
"Camera False\n",
"External_Memory False\n",
"Android_version False\n",
"Price False\n",
"company False\n",
"Inbuilt_memory False\n",
"fast_charging False\n",
"Screen_resolution False\n",
"Processor False\n",
"Processor_name False\n",
"Rating_index False\n",
"dtype: bool\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Rating</th>\n",
" <th>Spec_score</th>\n",
" <th>No_of_sim</th>\n",
" <th>Ram</th>\n",
" <th>Battery</th>\n",
" <th>Display</th>\n",
" <th>Camera</th>\n",
" <th>External_Memory</th>\n",
" <th>Android_version</th>\n",
" <th>Price</th>\n",
" <th>company</th>\n",
" <th>Inbuilt_memory</th>\n",
" <th>fast_charging</th>\n",
" <th>Screen_resolution</th>\n",
" <th>Processor</th>\n",
" <th>Processor_name</th>\n",
" <th>Rating_index</th>\n",
" <th>Spec_scoreFillNA</th>\n",
" <th>Spec_scoreFillMedian</th>\n",
" </tr>\n",
" <tr>\n",
" <th>ID</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1365</th>\n",
" <td>TCL 40R</td>\n",
" <td>4.05</td>\n",
" <td>75.0</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp;amp; 8 MP Fro...</td>\n",
" <td>Memory Card (Hybrid)</td>\n",
" <td>12</td>\n",
" <td>18,999</td>\n",
" <td>TCL</td>\n",
" <td>64 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1612 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 700 5G</td>\n",
" <td>0</td>\n",
" <td>75.0</td>\n",
" <td>75.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1366</th>\n",
" <td>TCL 50 XL NxtPaper 5G</td>\n",
" <td>4.10</td>\n",
" <td>80.0</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>8 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.8 inches</td>\n",
" <td>50 MP + 2 MP Dual Rear &amp;amp; 16 MP Front Camera</td>\n",
" <td>Memory Card (Hybrid)</td>\n",
" <td>14</td>\n",
" <td>24,990</td>\n",
" <td>TCL</td>\n",
" <td>128 GB inbuilt</td>\n",
" <td>33W Fast Charging</td>\n",
" <td>1200 x 2400 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 7050</td>\n",
" <td>0</td>\n",
" <td>80.0</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1367</th>\n",
" <td>TCL 50 XE NxtPaper 5G</td>\n",
" <td>4.00</td>\n",
" <td>80.0</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>6 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP Dual Rear &amp;amp; 16 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>13</td>\n",
" <td>23,990</td>\n",
" <td>TCL</td>\n",
" <td>256 GB inbuilt</td>\n",
" <td>18W Fast Charging</td>\n",
" <td>720 x 1612 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 6080</td>\n",
" <td>0</td>\n",
" <td>80.0</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1368</th>\n",
" <td>TCL 40 NxtPaper 5G</td>\n",
" <td>4.50</td>\n",
" <td>79.0</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>6 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp;amp; 8 MP Fro...</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>13</td>\n",
" <td>22,499</td>\n",
" <td>TCL</td>\n",
" <td>256 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1612 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 6020</td>\n",
" <td>0</td>\n",
" <td>79.0</td>\n",
" <td>79.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1369</th>\n",
" <td>TCL Trifold</td>\n",
" <td>4.65</td>\n",
" <td>93.0</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G,</td>\n",
" <td>12 GB RAM</td>\n",
" <td>4600 mAh Battery</td>\n",
" <td>10 inches</td>\n",
" <td>Foldable Display, Dual Display</td>\n",
" <td>50 MP + 48 MP + 8 MP Triple Rear &amp;amp; 32 MP F...</td>\n",
" <td>13</td>\n",
" <td>1,19,990</td>\n",
" <td>TCL</td>\n",
" <td>256 GB inbuilt</td>\n",
" <td>67W Fast Charging</td>\n",
" <td>1916 x 2160 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Snapdragon 8 Gen2</td>\n",
" <td>1</td>\n",
" <td>93.0</td>\n",
" <td>93.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Rating Spec_score \\\n",
"ID \n",
"1365 TCL 40R 4.05 75.0 \n",
"1366 TCL 50 XL NxtPaper 5G 4.10 80.0 \n",
"1367 TCL 50 XE NxtPaper 5G 4.00 80.0 \n",
"1368 TCL 40 NxtPaper 5G 4.50 79.0 \n",
"1369 TCL Trifold 4.65 93.0 \n",
"\n",
" No_of_sim Ram Battery \\\n",
"ID \n",
"1365 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery \n",
"1366 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery \n",
"1367 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
"1368 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
"1369 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery \n",
"\n",
" Display Camera \\\n",
"ID \n",
"1365 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... \n",
"1366 6.8 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera \n",
"1367 6.6 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera \n",
"1368 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... \n",
"1369 10 inches Foldable Display, Dual Display \n",
"\n",
" External_Memory Android_version \\\n",
"ID \n",
"1365 Memory Card (Hybrid) 12 \n",
"1366 Memory Card (Hybrid) 14 \n",
"1367 Memory Card Supported, upto 1 TB 13 \n",
"1368 Memory Card Supported, upto 1 TB 13 \n",
"1369 50 MP + 48 MP + 8 MP Triple Rear &amp; 32 MP F... 13 \n",
"\n",
" Price company Inbuilt_memory fast_charging Screen_resolution \\\n",
"ID \n",
"1365 18,999 TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
"1366 24,990 TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px \n",
"1367 23,990 TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px \n",
"1368 22,499 TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
"1369 1,19,990 TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px \n",
"\n",
" Processor Processor_name Rating_index Spec_scoreFillNA \\\n",
"ID \n",
"1365 Octa Core Dimensity 700 5G 0 75.0 \n",
"1366 Octa Core Dimensity 7050 0 80.0 \n",
"1367 Octa Core Dimensity 6080 0 80.0 \n",
"1368 Octa Core Dimensity 6020 0 79.0 \n",
"1369 Octa Core Snapdragon 8 Gen2 1 93.0 \n",
"\n",
" Spec_scoreFillMedian \n",
"ID \n",
"1365 75.0 \n",
"1366 80.0 \n",
"1367 80.0 \n",
"1368 79.0 \n",
"1369 93.0 "
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fill every gap of the whole frame with 0 and confirm that nothing is missing afterwards.\n",
"fillna_df = df1.fillna(0)\n",
"\n",
"print(fillna_df.shape)\n",
"\n",
"print(fillna_df.isnull().any())\n",
"\n",
"# Replace missing Spec_score values with 0\n",
"df1[\"Spec_scoreFillNA\"] = df1[\"Spec_score\"].fillna(0)\n",
"\n",
"# Replace missing Spec_score values with the median of the observed scores.\n",
"# The median is taken from the original column (median() skips NaN by default),\n",
"# not from the zero-filled copy, so the artificial zeros cannot shift it.\n",
"df1[\"Spec_scoreFillMedian\"] = df1[\"Spec_score\"].fillna(df1[\"Spec_score\"].median())\n",
"\n",
"df1.tail()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Rating</th>\n",
" <th>Spec_score</th>\n",
" <th>No_of_sim</th>\n",
" <th>Ram</th>\n",
" <th>Battery</th>\n",
" <th>Display</th>\n",
" <th>Camera</th>\n",
" <th>External_Memory</th>\n",
" <th>Android_version</th>\n",
" <th>...</th>\n",
" <th>company</th>\n",
" <th>Inbuilt_memory</th>\n",
" <th>fast_charging</th>\n",
" <th>Screen_resolution</th>\n",
" <th>Processor</th>\n",
" <th>Processor_name</th>\n",
" <th>company_index</th>\n",
" <th>Spec_scoreFillNA</th>\n",
" <th>Spec_scoreFillMedian</th>\n",
" <th>Android_versionCopy</th>\n",
" </tr>\n",
" <tr>\n",
" <th>ID</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1365</th>\n",
" <td>TCL 40R</td>\n",
" <td>4.05</td>\n",
" <td>75.0</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp;amp; 8 MP Fro...</td>\n",
" <td>Memory Card (Hybrid)</td>\n",
" <td>12</td>\n",
" <td>...</td>\n",
" <td>TCL</td>\n",
" <td>64 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1612 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 700 5G</td>\n",
" <td>TCL</td>\n",
" <td>75.0</td>\n",
" <td>75.0</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1366</th>\n",
" <td>TCL 50 XL NxtPaper 5G</td>\n",
" <td>4.10</td>\n",
" <td>80.0</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>8 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.8 inches</td>\n",
" <td>50 MP + 2 MP Dual Rear &amp;amp; 16 MP Front Camera</td>\n",
" <td>Memory Card (Hybrid)</td>\n",
" <td>14</td>\n",
" <td>...</td>\n",
" <td>TCL</td>\n",
" <td>128 GB inbuilt</td>\n",
" <td>33W Fast Charging</td>\n",
" <td>1200 x 2400 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 7050</td>\n",
" <td>TCL</td>\n",
" <td>80.0</td>\n",
" <td>80.0</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1367</th>\n",
" <td>TCL 50 XE NxtPaper 5G</td>\n",
" <td>4.00</td>\n",
" <td>80.0</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>6 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP Dual Rear &amp;amp; 16 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>13</td>\n",
" <td>...</td>\n",
" <td>TCL</td>\n",
" <td>256 GB inbuilt</td>\n",
" <td>18W Fast Charging</td>\n",
" <td>720 x 1612 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 6080</td>\n",
" <td>TCL</td>\n",
" <td>80.0</td>\n",
" <td>80.0</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1368</th>\n",
" <td>TCL 40 NxtPaper 5G</td>\n",
" <td>4.50</td>\n",
" <td>79.0</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>6 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp;amp; 8 MP Fro...</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>13</td>\n",
" <td>...</td>\n",
" <td>TCL</td>\n",
" <td>256 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1612 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 6020</td>\n",
" <td>TCL</td>\n",
" <td>79.0</td>\n",
" <td>79.0</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1369</th>\n",
" <td>TCL Trifold</td>\n",
" <td>4.65</td>\n",
" <td>93.0</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G,</td>\n",
" <td>12 GB RAM</td>\n",
" <td>4600 mAh Battery</td>\n",
" <td>10 inches</td>\n",
" <td>Foldable Display, Dual Display</td>\n",
" <td>50 MP + 48 MP + 8 MP Triple Rear &amp;amp; 32 MP F...</td>\n",
" <td>13</td>\n",
" <td>...</td>\n",
" <td>TCL</td>\n",
" <td>256 GB inbuilt</td>\n",
" <td>67W Fast Charging</td>\n",
" <td>1916 x 2160 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Snapdragon 8 Gen2</td>\n",
" <td>TCL</td>\n",
" <td>93.0</td>\n",
" <td>93.0</td>\n",
" <td>13</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" Name Rating Spec_score \\\n",
"ID \n",
"1365 TCL 40R 4.05 75.0 \n",
"1366 TCL 50 XL NxtPaper 5G 4.10 80.0 \n",
"1367 TCL 50 XE NxtPaper 5G 4.00 80.0 \n",
"1368 TCL 40 NxtPaper 5G 4.50 79.0 \n",
"1369 TCL Trifold 4.65 93.0 \n",
"\n",
" No_of_sim Ram Battery \\\n",
"ID \n",
"1365 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery \n",
"1366 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery \n",
"1367 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
"1368 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
"1369 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery \n",
"\n",
" Display Camera \\\n",
"ID \n",
"1365 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... \n",
"1366 6.8 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera \n",
"1367 6.6 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera \n",
"1368 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... \n",
"1369 10 inches Foldable Display, Dual Display \n",
"\n",
" External_Memory Android_version ... \\\n",
"ID ... \n",
"1365 Memory Card (Hybrid) 12 ... \n",
"1366 Memory Card (Hybrid) 14 ... \n",
"1367 Memory Card Supported, upto 1 TB 13 ... \n",
"1368 Memory Card Supported, upto 1 TB 13 ... \n",
"1369 50 MP + 48 MP + 8 MP Triple Rear &amp; 32 MP F... 13 ... \n",
"\n",
" company Inbuilt_memory fast_charging Screen_resolution \\\n",
"ID \n",
"1365 TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
"1366 TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px \n",
"1367 TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px \n",
"1368 TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
"1369 TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px \n",
"\n",
" Processor Processor_name company_index Spec_scoreFillNA \\\n",
"ID \n",
"1365 Octa Core Dimensity 700 5G TCL 75.0 \n",
"1366 Octa Core Dimensity 7050 TCL 80.0 \n",
"1367 Octa Core Dimensity 6080 TCL 80.0 \n",
"1368 Octa Core Dimensity 6020 TCL 79.0 \n",
"1369 Octa Core Snapdragon 8 Gen2 TCL 93.0 \n",
"\n",
" Spec_scoreFillMedian Android_versionCopy \n",
"ID \n",
"1365 75.0 12 \n",
"1366 80.0 14 \n",
"1367 80.0 13 \n",
"1368 79.0 13 \n",
"1369 93.0 13 \n",
"\n",
"[5 rows x 21 columns]"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Duplicate Android_version into a separate column; only the duplicate is filled below.\n",
"df1[\"Android_versionCopy\"] = df1[\"Android_version\"]\n",
"\n",
"# Fill the gaps of that column with 0 directly in df1 (inplace=True, no new frame is returned)\n",
"df1.fillna(value={\"Android_versionCopy\": 0}, inplace=True)\n",
"\n",
"df1.tail(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Удаление наблюдений с пропусками"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(814, 21)\n",
"Name False\n",
"Rating False\n",
"Spec_score False\n",
"No_of_sim False\n",
"Ram False\n",
"Battery False\n",
"Display False\n",
"Camera False\n",
"External_Memory False\n",
"Android_version False\n",
"Price False\n",
"company False\n",
"Inbuilt_memory False\n",
"fast_charging False\n",
"Screen_resolution False\n",
"Processor False\n",
"Processor_name False\n",
"company_index False\n",
"dtype: bool\n"
]
}
],
"source": [
"# Drop every observation (row) that has at least one missing value.\n",
"dropna_df = df1.dropna()\n",
"\n",
"print(dropna_df.shape)\n",
"\n",
"# Verify the frame produced by dropna() itself: no column may still contain gaps.\n",
"print(dropna_df.isnull().any())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Создание выборок данных\n",
"\n",
"Библиотека scikit-learn\n",
"\n",
"https://scikit-learn.org/stable/index.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"assets/lec2-split.png\" width=\"600\" style=\"background-color: white\">"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"# Функция для создания выборок\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
"    df_input,\n",
"    stratify_colname=\"y\",\n",
"    frac_train=0.6,\n",
"    frac_val=0.15,\n",
"    frac_test=0.25,\n",
"    random_state=None,\n",
"):\n",
"    \"\"\"\n",
"    Split a Pandas dataframe into train, validation and test subsets.\n",
"\n",
"    The split follows the fractional ratios provided by the user and is\n",
"    stratified by the values in a specific column (that is, each subset has\n",
"    the same relative frequency of the values in the column). It is performed\n",
"    by running train_test_split() twice: first train against the rest, then\n",
"    the rest into validation and test.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df_input : Pandas dataframe\n",
"        Input dataframe to be split.\n",
"    stratify_colname : str\n",
"        The name of the column that will be used for stratification. Usually\n",
"        this column would be for the label.\n",
"    frac_train : float\n",
"    frac_val : float\n",
"    frac_test : float\n",
"        The ratios with which the dataframe will be split into train, val, and\n",
"        test data. The values should be expressed as float fractions and should\n",
"        sum to 1.0 (checked with a floating-point tolerance).\n",
"    random_state : int, None, or RandomState instance\n",
"        Value passed to both train_test_split() calls.\n",
"\n",
"    Returns\n",
"    -------\n",
"    df_train, df_val, df_test :\n",
"        Dataframes containing the three splits.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If the fractions do not add up to 1.0, or if stratify_colname is not\n",
"        a column of df_input.\n",
"    \"\"\"\n",
"    import math\n",
"\n",
"    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0\n",
"    # in binary floating point, so an exact inequality test would reject them.\n",
"    if not math.isclose(frac_train + frac_val + frac_test, 1.0, rel_tol=1e-9):\n",
"        raise ValueError(\n",
"            \"fractions %f, %f, %f do not add up to 1.0\"\n",
"            % (frac_train, frac_val, frac_test)\n",
"        )\n",
"\n",
"    if stratify_colname not in df_input.columns:\n",
"        raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
"    X = df_input  # Contains all columns.\n",
"    y = df_input[[stratify_colname]]  # Dataframe of just the column on which to stratify.\n",
"\n",
"    # Split original dataframe into train and temp dataframes.\n",
"    df_train, df_temp, _, y_temp = train_test_split(\n",
"        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
"    )\n",
"\n",
"    # Split the temp dataframe into val and test dataframes; the test share is\n",
"    # re-expressed relative to the size of the temp part.\n",
"    relative_frac_test = frac_test / (frac_val + frac_test)\n",
"    df_val, df_test, _, _ = train_test_split(\n",
"        df_temp,\n",
"        y_temp,\n",
"        stratify=y_temp,\n",
"        test_size=relative_frac_test,\n",
"        random_state=random_state,\n",
"    )\n",
"\n",
"    # Every input row must land in exactly one of the three subsets.\n",
"    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
"    return df_train, df_val, df_test"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rating_index\n",
"0 942\n",
"1 428\n",
"Name: count, dtype: int64\n",
"Обучающая выборка: (822, 2)\n",
"Rating_index\n",
"0 565\n",
"1 257\n",
"Name: count, dtype: int64\n",
"Контрольная выборка: (274, 2)\n",
"Rating_index\n",
"0 189\n",
"1 85\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: (274, 2)\n",
"Rating_index\n",
"0 188\n",
"1 86\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Distribution of observations over the labels (classes) in the whole dataset\n",
"print(df1[\"Rating_index\"].value_counts())\n",
"\n",
"data = df1[[\"Rating_index\", \"Spec_scoreFillMedian\"]].copy()\n",
"\n",
"# A fixed seed makes the three subsets identical on every run of the notebook.\n",
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
"    data,\n",
"    stratify_colname=\"Rating_index\",\n",
"    frac_train=0.60,\n",
"    frac_val=0.20,\n",
"    frac_test=0.20,\n",
"    random_state=42,\n",
")\n",
"\n",
"# Size and class distribution of each subset (train, validation, test)\n",
"for subset_title, subset in (\n",
"    (\"Обучающая выборка: \", df_train),\n",
"    (\"Контрольная выборка: \", df_val),\n",
"    (\"Тестовая выборка: \", df_test),\n",
"):\n",
"    print(subset_title, subset.shape)\n",
"    print(subset[\"Rating_index\"].value_counts())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выборка с избытком (oversampling)\n",
"\n",
"https://www.blog.trainindata.com/oversampling-techniques-for-imbalanced-data/\n",
"\n",
"https://datacrayon.com/machine-learning/class-imbalance-and-oversampling/\n",
"\n",
"Выборка с недостатком (undersampling)\n",
"\n",
"https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/\n",
"\n",
"Библиотека imbalanced-learn\n",
"\n",
"https://imbalanced-learn.org/stable/"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка: (822, 2)\n",
"Rating_index\n",
"0 565\n",
"1 257\n",
"Name: count, dtype: int64\n",
"Обучающая выборка после oversampling: (1127, 2)\n",
"Rating_index\n",
"0 565\n",
"1 562\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rating_index</th>\n",
" <th>Spec_scoreFillMedian</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>75.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>83.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>76.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>80.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>65.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1122</th>\n",
" <td>1</td>\n",
" <td>64.392639</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1123</th>\n",
" <td>1</td>\n",
" <td>63.805529</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1124</th>\n",
" <td>1</td>\n",
" <td>64.607184</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1125</th>\n",
" <td>1</td>\n",
" <td>64.688453</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1126</th>\n",
" <td>1</td>\n",
" <td>64.376356</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1127 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" Rating_index Spec_scoreFillMedian\n",
"0 0 75.000000\n",
"1 0 83.000000\n",
"2 0 76.000000\n",
"3 0 80.000000\n",
"4 0 65.000000\n",
"... ... ...\n",
"1122 1 64.392639\n",
"1123 1 63.805529\n",
"1124 1 64.607184\n",
"1125 1 64.688453\n",
"1126 1 64.376356\n",
"\n",
"[1127 rows x 2 columns]"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"# Fixed seed so the generated synthetic samples are the same on every run.\n",
"ada = ADASYN(random_state=42)\n",
"\n",
"print(\"Обучающая выборка: \", df_train.shape)\n",
"print(df_train[\"Rating_index\"].value_counts())\n",
"\n",
"# Keep the label out of the feature matrix: the sampler works with distances in X,\n",
"# so the target column must not be one of the coordinates.\n",
"X_train = df_train.drop(columns=[\"Rating_index\"])\n",
"y_train = df_train[\"Rating_index\"]\n",
"X_resampled, y_resampled = ada.fit_resample(X_train, y_train)\n",
"\n",
"# Re-attach the resampled labels and restore the column order of df_train.\n",
"df_train_adasyn = pd.DataFrame(X_resampled, columns=X_train.columns)\n",
"df_train_adasyn[\"Rating_index\"] = y_resampled\n",
"df_train_adasyn = df_train_adasyn[list(df_train.columns)]\n",
"\n",
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn[\"Rating_index\"].value_counts())\n",
"\n",
"df_train_adasyn"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}