3089 lines
91 KiB
Plaintext
3089 lines
91 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 1354 entries, 0 to 1369\n",
|
||
"Data columns (total 18 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Name_phone 1354 non-null object \n",
|
||
" 1 Rating 1354 non-null float64\n",
|
||
" 2 Spec_score 1354 non-null int64 \n",
|
||
" 3 No_of_sim 1354 non-null object \n",
|
||
" 4 Ram 1354 non-null object \n",
|
||
" 5 Battery 1354 non-null object \n",
|
||
" 6 Display 1354 non-null object \n",
|
||
" 7 Camera 1354 non-null object \n",
|
||
" 8 External_Memory 1354 non-null object \n",
|
||
" 9 Android_version 914 non-null object \n",
|
||
" 10 Price 1354 non-null object \n",
|
||
" 11 company 1354 non-null object \n",
|
||
" 12 Inbuilt_memory 1335 non-null object \n",
|
||
" 13 fast_charging 1267 non-null object \n",
|
||
" 14 Screen_resolution 1353 non-null object \n",
|
||
" 15 Processor 1327 non-null object \n",
|
||
" 16 Processor_name 1354 non-null object \n",
|
||
" 17 Class 1354 non-null int64 \n",
|
||
"dtypes: float64(1), int64(2), object(15)\n",
|
||
"memory usage: 201.0+ KB\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(1354, 18)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name_phone</th>\n",
|
||
" <th>Rating</th>\n",
|
||
" <th>Spec_score</th>\n",
|
||
" <th>No_of_sim</th>\n",
|
||
" <th>Ram</th>\n",
|
||
" <th>Battery</th>\n",
|
||
" <th>Display</th>\n",
|
||
" <th>Camera</th>\n",
|
||
" <th>External_Memory</th>\n",
|
||
" <th>Android_version</th>\n",
|
||
" <th>Price</th>\n",
|
||
" <th>company</th>\n",
|
||
" <th>Inbuilt_memory</th>\n",
|
||
" <th>fast_charging</th>\n",
|
||
" <th>Screen_resolution</th>\n",
|
||
" <th>Processor</th>\n",
|
||
" <th>Processor_name</th>\n",
|
||
" <th>Class</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Id</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Samsung Galaxy F14 5G</td>\n",
|
||
" <td>4.65</td>\n",
|
||
" <td>68</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>4 GB RAM</td>\n",
|
||
" <td>6000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP + 2 MP Dual Rear &amp; 13 MP Front Camera</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>9,999</td>\n",
|
||
" <td>Samsung</td>\n",
|
||
" <td>128 GB inbuilt</td>\n",
|
||
" <td>25W Fast Charging</td>\n",
|
||
" <td>2408 x 1080 px Display with Water Drop Notch</td>\n",
|
||
" <td>Octa Core Processor</td>\n",
|
||
" <td>Exynos 1330</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Samsung Galaxy A11</td>\n",
|
||
" <td>4.20</td>\n",
|
||
" <td>63</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
||
" <td>2 GB RAM</td>\n",
|
||
" <td>4000 mAh Battery</td>\n",
|
||
" <td>6.4 inches</td>\n",
|
||
" <td>13 MP + 5 MP + 2 MP Triple Rear &amp; 8 MP Fro...</td>\n",
|
||
" <td>Memory Card Supported, upto 512 GB</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>9,990</td>\n",
|
||
" <td>Samsung</td>\n",
|
||
" <td>32 GB inbuilt</td>\n",
|
||
" <td>15W Fast Charging</td>\n",
|
||
" <td>720 x 1560 px Display with Punch Hole</td>\n",
|
||
" <td>1.8 GHz Processor</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Samsung Galaxy F23</td>\n",
|
||
" <td>4.10</td>\n",
|
||
" <td>73</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
||
" <td>4 GB RAM</td>\n",
|
||
" <td>6000 mAh Battery</td>\n",
|
||
" <td>6.4 inches</td>\n",
|
||
" <td>48 MP Quad Rear &amp; 13 MP Front Camera</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>11,999</td>\n",
|
||
" <td>Samsung</td>\n",
|
||
" <td>64 GB inbuilt</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>720 x 1600 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Helio G88</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Samsung Galaxy A03s (4GB RAM + 64GB)</td>\n",
|
||
" <td>4.10</td>\n",
|
||
" <td>69</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
||
" <td>4 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.5 inches</td>\n",
|
||
" <td>13 MP + 2 MP + 2 MP Triple Rear &amp; 5 MP Fro...</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>11,999</td>\n",
|
||
" <td>Samsung</td>\n",
|
||
" <td>64 GB inbuilt</td>\n",
|
||
" <td>15W Fast Charging</td>\n",
|
||
" <td>720 x 1600 px Display with Water Drop Notch</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Helio P35</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Samsung Galaxy M13 5G</td>\n",
|
||
" <td>4.40</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>6 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.5 inches</td>\n",
|
||
" <td>50 MP + 2 MP Dual Rear &amp; 5 MP Front Camera</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>11,990</td>\n",
|
||
" <td>Samsung</td>\n",
|
||
" <td>128 GB inbuilt</td>\n",
|
||
" <td>15W Fast Charging</td>\n",
|
||
" <td>720 x 1600 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 700</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name_phone Rating Spec_score \\\n",
|
||
"Id \n",
|
||
"0 Samsung Galaxy F14 5G 4.65 68 \n",
|
||
"1 Samsung Galaxy A11 4.20 63 \n",
|
||
"3 Samsung Galaxy F23 4.10 73 \n",
|
||
"4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69 \n",
|
||
"5 Samsung Galaxy M13 5G 4.40 75 \n",
|
||
"\n",
|
||
" No_of_sim Ram Battery Display \\\n",
|
||
"Id \n",
|
||
"0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches \n",
|
||
"1 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches \n",
|
||
"3 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n",
|
||
"4 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n",
|
||
"5 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.5 inches \n",
|
||
"\n",
|
||
" Camera \\\n",
|
||
"Id \n",
|
||
"0 50 MP + 2 MP Dual Rear & 13 MP Front Camera \n",
|
||
"1 13 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
||
"3 48 MP Quad Rear & 13 MP Front Camera \n",
|
||
"4 13 MP + 2 MP + 2 MP Triple Rear & 5 MP Fro... \n",
|
||
"5 50 MP + 2 MP Dual Rear & 5 MP Front Camera \n",
|
||
"\n",
|
||
" External_Memory Android_version Price company \\\n",
|
||
"Id \n",
|
||
"0 Memory Card Supported, upto 1 TB 13 9,999 Samsung \n",
|
||
"1 Memory Card Supported, upto 512 GB 10 9,990 Samsung \n",
|
||
"3 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
|
||
"4 Memory Card Supported, upto 1 TB 11 11,999 Samsung \n",
|
||
"5 Memory Card Supported, upto 1 TB 12 11,990 Samsung \n",
|
||
"\n",
|
||
" Inbuilt_memory fast_charging \\\n",
|
||
"Id \n",
|
||
"0 128 GB inbuilt 25W Fast Charging \n",
|
||
"1 32 GB inbuilt 15W Fast Charging \n",
|
||
"3 64 GB inbuilt NaN \n",
|
||
"4 64 GB inbuilt 15W Fast Charging \n",
|
||
"5 128 GB inbuilt 15W Fast Charging \n",
|
||
"\n",
|
||
" Screen_resolution Processor \\\n",
|
||
"Id \n",
|
||
"0 2408 x 1080 px Display with Water Drop Notch Octa Core Processor \n",
|
||
"1 720 x 1560 px Display with Punch Hole 1.8 GHz Processor \n",
|
||
"3 720 x 1600 px Octa Core \n",
|
||
"4 720 x 1600 px Display with Water Drop Notch Octa Core \n",
|
||
"5 720 x 1600 px Octa Core \n",
|
||
"\n",
|
||
" Processor_name Class \n",
|
||
"Id \n",
|
||
"0 Exynos 1330 2 \n",
|
||
"1 Octa Core 2 \n",
|
||
"3 Helio G88 1 \n",
|
||
"4 Helio P35 2 \n",
|
||
"5 Dimensity 700 1 "
|
||
]
|
||
},
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"phone = pd.read_csv(\"data/phone_price.csv\", index_col=\"Id\")\n",
|
||
"\n",
|
||
"phone.info()\n",
|
||
"\n",
|
||
"display(phone.shape)\n",
|
||
"\n",
|
||
"phone.head()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 19227 entries, 0 to 19226\n",
|
||
"Data columns (total 20 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Unnamed: 0 19227 non-null int64 \n",
|
||
" 1 ID 19227 non-null int64 \n",
|
||
" 2 Price 19227 non-null int64 \n",
|
||
" 3 Levy 19227 non-null object \n",
|
||
" 4 Manufacturer 19227 non-null object \n",
|
||
" 5 Model 19227 non-null object \n",
|
||
" 6 ProdYear 19227 non-null int64 \n",
|
||
" 7 Category 19227 non-null object \n",
|
||
" 8 Leather interior 19227 non-null object \n",
|
||
" 9 Fuel type 19227 non-null object \n",
|
||
" 10 Engine volume 19227 non-null object \n",
|
||
" 11 Mileage 19227 non-null object \n",
|
||
" 12 Cylinders 19227 non-null float64\n",
|
||
" 13 Gear box type 19227 non-null object \n",
|
||
" 14 Drive wheels 19227 non-null object \n",
|
||
" 15 Doors 19227 non-null object \n",
|
||
" 16 Wheel 19227 non-null object \n",
|
||
" 17 Color 19227 non-null object \n",
|
||
" 18 Airbags 19227 non-null int64 \n",
|
||
" 19 LeatherInterior1 19227 non-null int64 \n",
|
||
"dtypes: float64(1), int64(6), object(13)\n",
|
||
"memory usage: 2.9+ MB\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(19227, 20)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Unnamed: 0</th>\n",
|
||
" <th>ID</th>\n",
|
||
" <th>Price</th>\n",
|
||
" <th>Levy</th>\n",
|
||
" <th>Manufacturer</th>\n",
|
||
" <th>Model</th>\n",
|
||
" <th>ProdYear</th>\n",
|
||
" <th>Category</th>\n",
|
||
" <th>Leather interior</th>\n",
|
||
" <th>Fuel type</th>\n",
|
||
" <th>Engine volume</th>\n",
|
||
" <th>Mileage</th>\n",
|
||
" <th>Cylinders</th>\n",
|
||
" <th>Gear box type</th>\n",
|
||
" <th>Drive wheels</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Wheel</th>\n",
|
||
" <th>Color</th>\n",
|
||
" <th>Airbags</th>\n",
|
||
" <th>LeatherInterior1</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>45654403</td>\n",
|
||
" <td>13328</td>\n",
|
||
" <td>1399</td>\n",
|
||
" <td>LEXUS</td>\n",
|
||
" <td>RX 450</td>\n",
|
||
" <td>2010</td>\n",
|
||
" <td>Jeep</td>\n",
|
||
" <td>Yes</td>\n",
|
||
" <td>Hybrid</td>\n",
|
||
" <td>3.5</td>\n",
|
||
" <td>186005 km</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>4x4</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Silver</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>44731507</td>\n",
|
||
" <td>16621</td>\n",
|
||
" <td>1018</td>\n",
|
||
" <td>CHEVROLET</td>\n",
|
||
" <td>Equinox</td>\n",
|
||
" <td>2011</td>\n",
|
||
" <td>Jeep</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>Petrol</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>192000 km</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>Tiptronic</td>\n",
|
||
" <td>4x4</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>45774419</td>\n",
|
||
" <td>8467</td>\n",
|
||
" <td>-</td>\n",
|
||
" <td>HONDA</td>\n",
|
||
" <td>FIT</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>Hatchback</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>Petrol</td>\n",
|
||
" <td>1.3</td>\n",
|
||
" <td>200000 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Variator</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Right-hand drive</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>45769185</td>\n",
|
||
" <td>3607</td>\n",
|
||
" <td>862</td>\n",
|
||
" <td>FORD</td>\n",
|
||
" <td>Escape</td>\n",
|
||
" <td>2011</td>\n",
|
||
" <td>Jeep</td>\n",
|
||
" <td>Yes</td>\n",
|
||
" <td>Hybrid</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>168966 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>4x4</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>45809263</td>\n",
|
||
" <td>11726</td>\n",
|
||
" <td>446</td>\n",
|
||
" <td>HONDA</td>\n",
|
||
" <td>FIT</td>\n",
|
||
" <td>2014</td>\n",
|
||
" <td>Hatchback</td>\n",
|
||
" <td>Yes</td>\n",
|
||
" <td>Petrol</td>\n",
|
||
" <td>1.3</td>\n",
|
||
" <td>91901 km</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Automatic</td>\n",
|
||
" <td>Front</td>\n",
|
||
" <td>04-May</td>\n",
|
||
" <td>Left wheel</td>\n",
|
||
" <td>Silver</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Unnamed: 0 ID Price Levy Manufacturer Model ProdYear \\\n",
|
||
"0 0 45654403 13328 1399 LEXUS RX 450 2010 \n",
|
||
"1 1 44731507 16621 1018 CHEVROLET Equinox 2011 \n",
|
||
"2 2 45774419 8467 - HONDA FIT 2006 \n",
|
||
"3 3 45769185 3607 862 FORD Escape 2011 \n",
|
||
"4 4 45809263 11726 446 HONDA FIT 2014 \n",
|
||
"\n",
|
||
" Category Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
|
||
"0 Jeep Yes Hybrid 3.5 186005 km 6.0 \n",
|
||
"1 Jeep No Petrol 3 192000 km 6.0 \n",
|
||
"2 Hatchback No Petrol 1.3 200000 km 4.0 \n",
|
||
"3 Jeep Yes Hybrid 2.5 168966 km 4.0 \n",
|
||
"4 Hatchback Yes Petrol 1.3 91901 km 4.0 \n",
|
||
"\n",
|
||
" Gear box type Drive wheels Doors Wheel Color Airbags \\\n",
|
||
"0 Automatic 4x4 04-May Left wheel Silver 12 \n",
|
||
"1 Tiptronic 4x4 04-May Left wheel Black 8 \n",
|
||
"2 Variator Front 04-May Right-hand drive Black 2 \n",
|
||
"3 Automatic 4x4 04-May Left wheel White 0 \n",
|
||
"4 Automatic Front 04-May Left wheel Silver 4 \n",
|
||
"\n",
|
||
" LeatherInterior1 \n",
|
||
"0 1 \n",
|
||
"1 0 \n",
|
||
"2 0 \n",
|
||
"3 1 \n",
|
||
"4 1 "
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"car = pd.read_csv(\"data/car_price.csv\")\n",
|
||
"\n",
|
||
"car.info()\n",
|
||
"\n",
|
||
"display(car.shape)\n",
|
||
"\n",
|
||
"car.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 3299 entries, 0 to 3298\n",
|
||
"Data columns (total 21 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 id 3299 non-null int64 \n",
|
||
" 1 date 3299 non-null object \n",
|
||
" 2 price 3299 non-null float64\n",
|
||
" 3 bedrooms 3299 non-null int64 \n",
|
||
" 4 bathrooms 3299 non-null float64\n",
|
||
" 5 sqft_living 3299 non-null int64 \n",
|
||
" 6 sqft_lot 3299 non-null int64 \n",
|
||
" 7 floors 3299 non-null float64\n",
|
||
" 8 waterfront 3299 non-null int64 \n",
|
||
" 9 view 3299 non-null int64 \n",
|
||
" 10 condition 3299 non-null int64 \n",
|
||
" 11 grade 3299 non-null int64 \n",
|
||
" 12 sqft_above 3299 non-null int64 \n",
|
||
" 13 sqft_basement 3299 non-null int64 \n",
|
||
" 14 yr_built 3299 non-null int64 \n",
|
||
" 15 yr_renovated 3299 non-null int64 \n",
|
||
" 16 zipcode 3299 non-null int64 \n",
|
||
" 17 lat 3299 non-null float64\n",
|
||
" 18 long 3299 non-null float64\n",
|
||
" 19 sqft_living15 3299 non-null int64 \n",
|
||
" 20 sqft_lot15 3299 non-null int64 \n",
|
||
"dtypes: float64(5), int64(15), object(1)\n",
|
||
"memory usage: 541.4+ KB\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(3299, 21)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>price</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>bathrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>floors</th>\n",
|
||
" <th>waterfront</th>\n",
|
||
" <th>view</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>grade</th>\n",
|
||
" <th>sqft_above</th>\n",
|
||
" <th>sqft_basement</th>\n",
|
||
" <th>yr_built</th>\n",
|
||
" <th>yr_renovated</th>\n",
|
||
" <th>zipcode</th>\n",
|
||
" <th>lat</th>\n",
|
||
" <th>long</th>\n",
|
||
" <th>sqft_living15</th>\n",
|
||
" <th>sqft_lot15</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>7129300520</td>\n",
|
||
" <td>20141013T000000</td>\n",
|
||
" <td>221900.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1180</td>\n",
|
||
" <td>5650</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1180</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1955</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98178</td>\n",
|
||
" <td>47.5112</td>\n",
|
||
" <td>-122.257</td>\n",
|
||
" <td>1340</td>\n",
|
||
" <td>5650</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>6414100192</td>\n",
|
||
" <td>20141209T000000</td>\n",
|
||
" <td>538000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.25</td>\n",
|
||
" <td>2570</td>\n",
|
||
" <td>7242</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>2170</td>\n",
|
||
" <td>400</td>\n",
|
||
" <td>1951</td>\n",
|
||
" <td>1991</td>\n",
|
||
" <td>98125</td>\n",
|
||
" <td>47.7210</td>\n",
|
||
" <td>-122.319</td>\n",
|
||
" <td>1690</td>\n",
|
||
" <td>7639</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>5631500400</td>\n",
|
||
" <td>20150225T000000</td>\n",
|
||
" <td>180000.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>10000</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>770</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1933</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98028</td>\n",
|
||
" <td>47.7379</td>\n",
|
||
" <td>-122.233</td>\n",
|
||
" <td>2720</td>\n",
|
||
" <td>8062</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2487200875</td>\n",
|
||
" <td>20141209T000000</td>\n",
|
||
" <td>604000.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>3.00</td>\n",
|
||
" <td>1960</td>\n",
|
||
" <td>5000</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>1050</td>\n",
|
||
" <td>910</td>\n",
|
||
" <td>1965</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98136</td>\n",
|
||
" <td>47.5208</td>\n",
|
||
" <td>-122.393</td>\n",
|
||
" <td>1360</td>\n",
|
||
" <td>5000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1954400510</td>\n",
|
||
" <td>20150218T000000</td>\n",
|
||
" <td>510000.0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>8080</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>1680</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1987</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>98074</td>\n",
|
||
" <td>47.6168</td>\n",
|
||
" <td>-122.045</td>\n",
|
||
" <td>1800</td>\n",
|
||
" <td>7503</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 21 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" id date price bedrooms bathrooms sqft_living \\\n",
|
||
"0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
|
||
"1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
|
||
"2 5631500400 20150225T000000 180000.0 2 1.00 770 \n",
|
||
"3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
|
||
"4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
|
||
"\n",
|
||
" sqft_lot floors waterfront view ... grade sqft_above sqft_basement \\\n",
|
||
"0 5650 1.0 0 0 ... 7 1180 0 \n",
|
||
"1 7242 2.0 0 0 ... 7 2170 400 \n",
|
||
"2 10000 1.0 0 0 ... 6 770 0 \n",
|
||
"3 5000 1.0 0 0 ... 7 1050 910 \n",
|
||
"4 8080 1.0 0 0 ... 8 1680 0 \n",
|
||
"\n",
|
||
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
|
||
"0 1955 0 98178 47.5112 -122.257 1340 \n",
|
||
"1 1951 1991 98125 47.7210 -122.319 1690 \n",
|
||
"2 1933 0 98028 47.7379 -122.233 2720 \n",
|
||
"3 1965 0 98136 47.5208 -122.393 1360 \n",
|
||
"4 1987 0 98074 47.6168 -122.045 1800 \n",
|
||
"\n",
|
||
" sqft_lot15 \n",
|
||
"0 5650 \n",
|
||
"1 7639 \n",
|
||
"2 8062 \n",
|
||
"3 5000 \n",
|
||
"4 7503 \n",
|
||
"\n",
|
||
"[5 rows x 21 columns]"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"house = pd.read_csv(\"data/house_data.csv\")\n",
|
||
"\n",
|
||
"house.info()\n",
|
||
"\n",
|
||
"display(house.shape)\n",
|
||
"\n",
|
||
"house.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Name_phone 0\n",
|
||
"Rating 0\n",
|
||
"Spec_score 0\n",
|
||
"No_of_sim 0\n",
|
||
"Ram 0\n",
|
||
"Battery 0\n",
|
||
"Display 0\n",
|
||
"Camera 0\n",
|
||
"External_Memory 0\n",
|
||
"Android_version 440\n",
|
||
"Price 0\n",
|
||
"company 0\n",
|
||
"Inbuilt_memory 19\n",
|
||
"fast_charging 87\n",
|
||
"Screen_resolution 1\n",
|
||
"Processor 27\n",
|
||
"Processor_name 0\n",
|
||
"Class 0\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Name_phone False\n",
|
||
"Rating False\n",
|
||
"Spec_score False\n",
|
||
"No_of_sim False\n",
|
||
"Ram False\n",
|
||
"Battery False\n",
|
||
"Display False\n",
|
||
"Camera False\n",
|
||
"External_Memory False\n",
|
||
"Android_version True\n",
|
||
"Price False\n",
|
||
"company False\n",
|
||
"Inbuilt_memory True\n",
|
||
"fast_charging True\n",
|
||
"Screen_resolution True\n",
|
||
"Processor True\n",
|
||
"Processor_name False\n",
|
||
"Class False\n",
|
||
"dtype: bool"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Android_version процент пустых значений: %32.50'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Inbuilt_memory процент пустых значений: %1.40'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'fast_charging процент пустых значений: %6.43'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Screen_resolution процент пустых значений: %0.07'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Processor процент пустых значений: %1.99'"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Количество пустых значений признаков\n",
|
||
"display(phone.isnull().sum())\n",
|
||
"display()\n",
|
||
"\n",
|
||
"# Есть ли пустые значения признаков\n",
|
||
"display(phone.isnull().any())\n",
|
||
"display()\n",
|
||
"\n",
|
||
"# Процент пустых значений признаков\n",
|
||
"for i in phone.columns:\n",
|
||
" null_rate = phone[i].isnull().sum() / len(phone) * 100\n",
|
||
" if null_rate > 0:\n",
|
||
" display(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Unnamed: 0 0\n",
|
||
"ID 0\n",
|
||
"Price 0\n",
|
||
"Levy 0\n",
|
||
"Manufacturer 0\n",
|
||
"Model 0\n",
|
||
"ProdYear 0\n",
|
||
"Category 0\n",
|
||
"Leather interior 0\n",
|
||
"Fuel type 0\n",
|
||
"Engine volume 0\n",
|
||
"Mileage 0\n",
|
||
"Cylinders 0\n",
|
||
"Gear box type 0\n",
|
||
"Drive wheels 0\n",
|
||
"Doors 0\n",
|
||
"Wheel 0\n",
|
||
"Color 0\n",
|
||
"Airbags 0\n",
|
||
"LeatherInterior1 0\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Unnamed: 0 False\n",
|
||
"ID False\n",
|
||
"Price False\n",
|
||
"Levy False\n",
|
||
"Manufacturer False\n",
|
||
"Model False\n",
|
||
"ProdYear False\n",
|
||
"Category False\n",
|
||
"Leather interior False\n",
|
||
"Fuel type False\n",
|
||
"Engine volume False\n",
|
||
"Mileage False\n",
|
||
"Cylinders False\n",
|
||
"Gear box type False\n",
|
||
"Drive wheels False\n",
|
||
"Doors False\n",
|
||
"Wheel False\n",
|
||
"Color False\n",
|
||
"Airbags False\n",
|
||
"LeatherInterior1 False\n",
|
||
"dtype: bool"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Количество пустых значений признаков\n",
|
||
"display(car.isnull().sum())\n",
|
||
"display()\n",
|
||
"\n",
|
||
"# Есть ли пустые значения признаков\n",
|
||
"display(car.isnull().any())\n",
|
||
"display()\n",
|
||
"\n",
|
||
"# Процент пустых значений признаков\n",
|
||
"for i in car.columns:\n",
|
||
" null_rate = car[i].isnull().sum() / len(car) * 100\n",
|
||
" if null_rate > 0:\n",
|
||
" display(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"id 0\n",
|
||
"date 0\n",
|
||
"price 0\n",
|
||
"bedrooms 0\n",
|
||
"bathrooms 0\n",
|
||
"sqft_living 0\n",
|
||
"sqft_lot 0\n",
|
||
"floors 0\n",
|
||
"waterfront 0\n",
|
||
"view 0\n",
|
||
"condition 0\n",
|
||
"grade 0\n",
|
||
"sqft_above 0\n",
|
||
"sqft_basement 0\n",
|
||
"yr_built 0\n",
|
||
"yr_renovated 0\n",
|
||
"zipcode 0\n",
|
||
"lat 0\n",
|
||
"long 0\n",
|
||
"sqft_living15 0\n",
|
||
"sqft_lot15 0\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"id False\n",
|
||
"date False\n",
|
||
"price False\n",
|
||
"bedrooms False\n",
|
||
"bathrooms False\n",
|
||
"sqft_living False\n",
|
||
"sqft_lot False\n",
|
||
"floors False\n",
|
||
"waterfront False\n",
|
||
"view False\n",
|
||
"condition False\n",
|
||
"grade False\n",
|
||
"sqft_above False\n",
|
||
"sqft_basement False\n",
|
||
"yr_built False\n",
|
||
"yr_renovated False\n",
|
||
"zipcode False\n",
|
||
"lat False\n",
|
||
"long False\n",
|
||
"sqft_living15 False\n",
|
||
"sqft_lot15 False\n",
|
||
"dtype: bool"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Количество пустых значений признаков\n",
|
||
"display(house.isnull().sum())\n",
|
||
"display()\n",
|
||
"\n",
|
||
"# Есть ли пустые значения признаков\n",
|
||
"display(house.isnull().any())\n",
|
||
"display()\n",
|
||
"\n",
|
||
"# Процент пустых значений признаков\n",
|
||
"for i in house.columns:\n",
|
||
" null_rate = house[i].isnull().sum() / len(house) * 100\n",
|
||
" if null_rate > 0:\n",
|
||
" display(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"В 1 наборе данных phone имеются пустые значения, в остальных они отсутствуют.\n",
|
||
"Заполним пустые значения из 1 датасета."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(805, 18)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Name_phone False\n",
|
||
"Rating False\n",
|
||
"Spec_score False\n",
|
||
"No_of_sim False\n",
|
||
"Ram False\n",
|
||
"Battery False\n",
|
||
"Display False\n",
|
||
"Camera False\n",
|
||
"External_Memory False\n",
|
||
"Android_version False\n",
|
||
"Price False\n",
|
||
"company False\n",
|
||
"Inbuilt_memory False\n",
|
||
"fast_charging False\n",
|
||
"Screen_resolution False\n",
|
||
"Processor False\n",
|
||
"Processor_name False\n",
|
||
"Class False\n",
|
||
"dtype: bool"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Name_phone</th>\n",
|
||
" <th>Rating</th>\n",
|
||
" <th>Spec_score</th>\n",
|
||
" <th>No_of_sim</th>\n",
|
||
" <th>Ram</th>\n",
|
||
" <th>Battery</th>\n",
|
||
" <th>Display</th>\n",
|
||
" <th>Camera</th>\n",
|
||
" <th>External_Memory</th>\n",
|
||
" <th>Android_version</th>\n",
|
||
" <th>Price</th>\n",
|
||
" <th>company</th>\n",
|
||
" <th>Inbuilt_memory</th>\n",
|
||
" <th>fast_charging</th>\n",
|
||
" <th>Screen_resolution</th>\n",
|
||
" <th>Processor</th>\n",
|
||
" <th>Processor_name</th>\n",
|
||
" <th>Class</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Id</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1365</th>\n",
|
||
" <td>TCL 40R</td>\n",
|
||
" <td>4.05</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>4 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro...</td>\n",
|
||
" <td>Memory Card (Hybrid)</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>18,999</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>64 GB inbuilt</td>\n",
|
||
" <td>15W Fast Charging</td>\n",
|
||
" <td>720 x 1612 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 700 5G</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1366</th>\n",
|
||
" <td>TCL 50 XL NxtPaper 5G</td>\n",
|
||
" <td>4.10</td>\n",
|
||
" <td>80</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
|
||
" <td>8 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.8 inches</td>\n",
|
||
" <td>50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera</td>\n",
|
||
" <td>Memory Card (Hybrid)</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>24,990</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>128 GB inbuilt</td>\n",
|
||
" <td>33W Fast Charging</td>\n",
|
||
" <td>1200 x 2400 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 7050</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1367</th>\n",
|
||
" <td>TCL 50 XE NxtPaper 5G</td>\n",
|
||
" <td>4.00</td>\n",
|
||
" <td>80</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>6 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>23,990</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>256 GB inbuilt</td>\n",
|
||
" <td>18W Fast Charging</td>\n",
|
||
" <td>720 x 1612 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 6080</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1368</th>\n",
|
||
" <td>TCL 40 NxtPaper 5G</td>\n",
|
||
" <td>4.50</td>\n",
|
||
" <td>79</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
|
||
" <td>6 GB RAM</td>\n",
|
||
" <td>5000 mAh Battery</td>\n",
|
||
" <td>6.6 inches</td>\n",
|
||
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro...</td>\n",
|
||
" <td>Memory Card Supported, upto 1 TB</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>22,499</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>256 GB inbuilt</td>\n",
|
||
" <td>15W Fast Charging</td>\n",
|
||
" <td>720 x 1612 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Dimensity 6020</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1369</th>\n",
|
||
" <td>TCL Trifold</td>\n",
|
||
" <td>4.65</td>\n",
|
||
" <td>93</td>\n",
|
||
" <td>Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G,</td>\n",
|
||
" <td>12 GB RAM</td>\n",
|
||
" <td>4600 mAh Battery</td>\n",
|
||
" <td>10 inches</td>\n",
|
||
" <td>Foldable Display, Dual Display</td>\n",
|
||
" <td>50 MP + 48 MP + 8 MP Triple Rear &amp; 32 MP F...</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>1,19,990</td>\n",
|
||
" <td>TCL</td>\n",
|
||
" <td>256 GB inbuilt</td>\n",
|
||
" <td>67W Fast Charging</td>\n",
|
||
" <td>1916 x 2160 px</td>\n",
|
||
" <td>Octa Core</td>\n",
|
||
" <td>Snapdragon 8 Gen2</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Name_phone Rating Spec_score \\\n",
|
||
"Id \n",
|
||
"1365 TCL 40R 4.05 75 \n",
|
||
"1366 TCL 50 XL NxtPaper 5G 4.10 80 \n",
|
||
"1367 TCL 50 XE NxtPaper 5G 4.00 80 \n",
|
||
"1368 TCL 40 NxtPaper 5G 4.50 79 \n",
|
||
"1369 TCL Trifold 4.65 93 \n",
|
||
"\n",
|
||
" No_of_sim Ram Battery \\\n",
|
||
"Id \n",
|
||
"1365 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery \n",
|
||
"1366 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery \n",
|
||
"1367 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
|
||
"1368 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
|
||
"1369 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery \n",
|
||
"\n",
|
||
" Display Camera \\\n",
|
||
"Id \n",
|
||
"1365 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
||
"1366 6.8 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n",
|
||
"1367 6.6 inches 50 MP + 2 MP Dual Rear & 16 MP Front Camera \n",
|
||
"1368 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro... \n",
|
||
"1369 10 inches Foldable Display, Dual Display \n",
|
||
"\n",
|
||
" External_Memory Android_version \\\n",
|
||
"Id \n",
|
||
"1365 Memory Card (Hybrid) 12 \n",
|
||
"1366 Memory Card (Hybrid) 14 \n",
|
||
"1367 Memory Card Supported, upto 1 TB 13 \n",
|
||
"1368 Memory Card Supported, upto 1 TB 13 \n",
|
||
"1369 50 MP + 48 MP + 8 MP Triple Rear & 32 MP F... 13 \n",
|
||
"\n",
|
||
" Price company Inbuilt_memory fast_charging Screen_resolution \\\n",
|
||
"Id \n",
|
||
"1365 18,999 TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
|
||
"1366 24,990 TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px \n",
|
||
"1367 23,990 TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px \n",
|
||
"1368 22,499 TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
|
||
"1369 1,19,990 TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px \n",
|
||
"\n",
|
||
" Processor Processor_name Class \n",
|
||
"Id \n",
|
||
"1365 Octa Core Dimensity 700 5G 2 \n",
|
||
"1366 Octa Core Dimensity 7050 1 \n",
|
||
"1367 Octa Core Dimensity 6080 2 \n",
|
||
"1368 Octa Core Dimensity 6020 2 \n",
|
||
"1369 Octa Core Snapdragon 8 Gen2 2 "
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Замена пустых данных на моду (часто встречающееся значение)\n",
|
||
"mode_Android = phone['Android_version'].mode() \n",
|
||
"phone.fillna({'Android_version':mode_Android}, inplace=True)\n",
|
||
"\n",
|
||
"Inbuilt_memory = phone['Inbuilt_memory'].mode() \n",
|
||
"phone.fillna({'Inbuilt_memory':Inbuilt_memory}, inplace=True)\n",
|
||
"\n",
|
||
"fillna_df = phone.fillna(0)\n",
|
||
"\n",
|
||
"dropna_df = phone.dropna()\n",
|
||
"\n",
|
||
"display(dropna_df.shape)\n",
|
||
"\n",
|
||
"display(fillna_df.isnull().any())\n",
|
||
"\n",
|
||
"phone.tail()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Class\n",
|
||
"1 718\n",
|
||
"2 636\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(812, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Class\n",
|
||
"1 431\n",
|
||
"2 381\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Контрольная выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(271, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Class\n",
|
||
"1 143\n",
|
||
"2 128\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Тестовая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(271, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Class\n",
|
||
"1 144\n",
|
||
"2 127\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Вывод распределения количества наблюдений по меткам (классам)\n",
|
||
"from src.utils import split_stratified_into_train_val_test\n",
|
||
"\n",
|
||
"\n",
|
||
"display(phone.Class.value_counts())\n",
|
||
"display()\n",
|
||
"\n",
|
||
"data = phone[[\"Class\", \"Spec_score\", \"Rating\"]].copy()\n",
|
||
"\n",
|
||
"df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
|
||
" data, stratify_colname=\"Class\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
||
")\n",
|
||
"\n",
|
||
"display(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"display(df_train.Class.value_counts())\n",
|
||
"\n",
|
||
"display(\"Контрольная выборка: \", df_val.shape)\n",
|
||
"display(df_val.Class.value_counts())\n",
|
||
"\n",
|
||
"display(\"Тестовая выборка: \", df_test.shape)\n",
|
||
"display(df_test.Class.value_counts())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(812, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Class\n",
|
||
"1 431\n",
|
||
"2 381\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка после oversampling: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(856, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Class\n",
|
||
"1 431\n",
|
||
"2 425\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Class</th>\n",
|
||
" <th>Spec_score</th>\n",
|
||
" <th>Rating</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>90</td>\n",
|
||
" <td>4.550000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>80</td>\n",
|
||
" <td>4.400000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>80</td>\n",
|
||
" <td>4.650000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>75</td>\n",
|
||
" <td>4.500000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>92</td>\n",
|
||
" <td>4.650000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>851</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>96</td>\n",
|
||
" <td>4.061287</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>852</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>95</td>\n",
|
||
" <td>4.191740</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>853</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>73</td>\n",
|
||
" <td>4.612413</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>854</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>94</td>\n",
|
||
" <td>4.193295</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>855</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>92</td>\n",
|
||
" <td>4.650000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>856 rows × 3 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Class Spec_score Rating\n",
|
||
"0 1 90 4.550000\n",
|
||
"1 1 80 4.400000\n",
|
||
"2 1 80 4.650000\n",
|
||
"3 1 75 4.500000\n",
|
||
"4 2 92 4.650000\n",
|
||
".. ... ... ...\n",
|
||
"851 2 96 4.061287\n",
|
||
"852 2 95 4.191740\n",
|
||
"853 2 73 4.612413\n",
|
||
"854 2 94 4.193295\n",
|
||
"855 2 92 4.650000\n",
|
||
"\n",
|
||
"[856 rows x 3 columns]"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from imblearn.over_sampling import ADASYN\n",
|
||
"\n",
|
||
"ada = ADASYN()\n",
|
||
"\n",
|
||
"display(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"display(df_train.Class.value_counts())\n",
|
||
"\n",
|
||
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Class\"]) # type: ignore\n",
|
||
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
|
||
"\n",
|
||
"display(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
|
||
"display(df_train_adasyn.Class.value_counts())\n",
|
||
"\n",
|
||
"df_train_adasyn"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(812, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Class\n",
|
||
"1 431\n",
|
||
"2 381\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка после undersampling: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(762, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Class\n",
|
||
"1 381\n",
|
||
"2 381\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Spec_score</th>\n",
|
||
" <th>Rating</th>\n",
|
||
" <th>Class</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Id</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>468</th>\n",
|
||
" <td>69</td>\n",
|
||
" <td>4.45</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1093</th>\n",
|
||
" <td>86</td>\n",
|
||
" <td>4.45</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1102</th>\n",
|
||
" <td>89</td>\n",
|
||
" <td>4.00</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>422</th>\n",
|
||
" <td>70</td>\n",
|
||
" <td>4.15</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1192</th>\n",
|
||
" <td>81</td>\n",
|
||
" <td>4.00</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>298</th>\n",
|
||
" <td>75</td>\n",
|
||
" <td>4.70</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>249</th>\n",
|
||
" <td>75</td>\n",
|
||
" <td>4.25</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1341</th>\n",
|
||
" <td>66</td>\n",
|
||
" <td>4.25</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>923</th>\n",
|
||
" <td>88</td>\n",
|
||
" <td>4.15</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>301</th>\n",
|
||
" <td>80</td>\n",
|
||
" <td>4.50</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>762 rows × 3 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Spec_score Rating Class\n",
|
||
"Id \n",
|
||
"468 69 4.45 1\n",
|
||
"1093 86 4.45 1\n",
|
||
"1102 89 4.00 1\n",
|
||
"422 70 4.15 1\n",
|
||
"1192 81 4.00 1\n",
|
||
"... ... ... ...\n",
|
||
"298 75 4.70 2\n",
|
||
"249 75 4.25 2\n",
|
||
"1341 66 4.25 2\n",
|
||
"923 88 4.15 2\n",
|
||
"301 80 4.50 2\n",
|
||
"\n",
|
||
"[762 rows x 3 columns]"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Создание экземпляра RandomUnderSampler\n",
|
||
"rus = RandomUnderSampler()\n",
|
||
"\n",
|
||
"# Проверка исходной обучающей выборки\n",
|
||
"display(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"display(df_train.Class.value_counts())\n",
|
||
"\n",
|
||
"# Выполнение undersampling\n",
|
||
"X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=[\"Class\"]), df_train[\"Class\"]) # type: ignore\n",
|
||
"df_train_undersampled = pd.DataFrame(X_resampled)\n",
|
||
"\n",
|
||
"# Добавление целевой переменной обратно в выборку\n",
|
||
"df_train_undersampled[\"Class\"] = y_resampled\n",
|
||
"\n",
|
||
"# Проверка обучающей выборки после undersampling\n",
|
||
"display(\"Обучающая выборка после undersampling: \", df_train_undersampled.shape)\n",
|
||
"display(df_train_undersampled.Class.value_counts())\n",
|
||
"\n",
|
||
"df_train_undersampled\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"LeatherInterior1\n",
|
||
"1 13949\n",
|
||
"0 5278\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(11536, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"LeatherInterior1\n",
|
||
"1 8369\n",
|
||
"0 3167\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Контрольная выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(3845, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"LeatherInterior1\n",
|
||
"1 2790\n",
|
||
"0 1055\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Тестовая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(3846, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"LeatherInterior1\n",
|
||
"1 2790\n",
|
||
"0 1056\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Вывод распределения количества наблюдений по меткам (классам)\n",
|
||
"from src.utils import split_stratified_into_train_val_test\n",
|
||
"\n",
|
||
"\n",
|
||
"display(car.LeatherInterior1.value_counts())\n",
|
||
"display()\n",
|
||
"\n",
|
||
"data = car[[\"LeatherInterior1\", \"Airbags\", \"ProdYear\"]].copy()\n",
|
||
"\n",
|
||
"df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
|
||
" data, stratify_colname=\"LeatherInterior1\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
||
")\n",
|
||
"\n",
|
||
"display(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"display(df_train.LeatherInterior1.value_counts())\n",
|
||
"\n",
|
||
"display(\"Контрольная выборка: \", df_val.shape)\n",
|
||
"display(df_val.LeatherInterior1.value_counts())\n",
|
||
"\n",
|
||
"display(\"Тестовая выборка: \", df_test.shape)\n",
|
||
"display(df_test.LeatherInterior1.value_counts())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(11536, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"LeatherInterior1\n",
|
||
"1 8369\n",
|
||
"0 3167\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка после oversampling: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(16728, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"LeatherInterior1\n",
|
||
"1 8369\n",
|
||
"0 8359\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>LeatherInterior1</th>\n",
|
||
" <th>Airbags</th>\n",
|
||
" <th>ProdYear</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>16</td>\n",
|
||
" <td>2011</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>2017</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2018</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2011</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2010</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16723</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>2004</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16724</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>2004</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16725</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>2004</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16726</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>2004</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16727</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>2003</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>16728 rows × 3 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" LeatherInterior1 Airbags ProdYear\n",
|
||
"0 0 16 2011\n",
|
||
"1 1 10 2017\n",
|
||
"2 1 4 2018\n",
|
||
"3 1 0 2011\n",
|
||
"4 1 6 2010\n",
|
||
"... ... ... ...\n",
|
||
"16723 0 12 2004\n",
|
||
"16724 0 12 2004\n",
|
||
"16725 0 12 2004\n",
|
||
"16726 0 12 2004\n",
|
||
"16727 0 12 2003\n",
|
||
"\n",
|
||
"[16728 rows x 3 columns]"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from imblearn.over_sampling import ADASYN\n",
|
||
"\n",
|
||
"ada = ADASYN()\n",
|
||
"\n",
|
||
"display(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"display(df_train.LeatherInterior1.value_counts())\n",
|
||
"\n",
|
||
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"LeatherInterior1\"]) # type: ignore\n",
|
||
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
|
||
"\n",
|
||
"display(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
|
||
"display(df_train_adasyn.LeatherInterior1.value_counts())\n",
|
||
"\n",
|
||
"df_train_adasyn"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(11536, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"LeatherInterior1\n",
|
||
"1 8369\n",
|
||
"0 3167\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка после undersampling: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(6334, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"LeatherInterior1\n",
|
||
"0 3167\n",
|
||
"1 3167\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Airbags</th>\n",
|
||
" <th>ProdYear</th>\n",
|
||
" <th>LeatherInterior1</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1315</th>\n",
|
||
" <td>16</td>\n",
|
||
" <td>2011</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1569</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2014</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6317</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2006</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>626</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2000</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12439</th>\n",
|
||
" <td>12</td>\n",
|
||
" <td>2012</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3361</th>\n",
|
||
" <td>12</td>\n",
|
||
" <td>2012</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3286</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2017</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17666</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2015</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4902</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2012</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4613</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2014</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>6334 rows × 3 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Airbags ProdYear LeatherInterior1\n",
|
||
"1315 16 2011 0\n",
|
||
"1569 0 2014 0\n",
|
||
"6317 6 2006 0\n",
|
||
"626 4 2000 0\n",
|
||
"12439 12 2012 0\n",
|
||
"... ... ... ...\n",
|
||
"3361 12 2012 1\n",
|
||
"3286 4 2017 1\n",
|
||
"17666 4 2015 1\n",
|
||
"4902 0 2012 1\n",
|
||
"4613 4 2014 1\n",
|
||
"\n",
|
||
"[6334 rows x 3 columns]"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Создание экземпляра RandomUnderSampler\n",
|
||
"rus = RandomUnderSampler()\n",
|
||
"\n",
|
||
"# Проверка исходной обучающей выборки\n",
|
||
"display(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"display(df_train.LeatherInterior1.value_counts())\n",
|
||
"\n",
|
||
"# Выполнение undersampling\n",
|
||
"X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=[\"LeatherInterior1\"]), df_train[\"LeatherInterior1\"]) # type: ignore\n",
|
||
"df_train_undersampled = pd.DataFrame(X_resampled)\n",
|
||
"\n",
|
||
"# Добавление целевой переменной обратно в выборку\n",
|
||
"df_train_undersampled[\"LeatherInterior1\"] = y_resampled\n",
|
||
"\n",
|
||
"# Проверка обучающей выборки после undersampling\n",
|
||
"display(\"Обучающая выборка после undersampling: \", df_train_undersampled.shape)\n",
|
||
"display(df_train_undersampled.LeatherInterior1.value_counts())\n",
|
||
"\n",
|
||
"df_train_undersampled"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"bedrooms\n",
|
||
"3 1518\n",
|
||
"4 1058\n",
|
||
"2 404\n",
|
||
"5 234\n",
|
||
"6 55\n",
|
||
"1 30\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(1979, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"bedrooms\n",
|
||
"3 911\n",
|
||
"4 635\n",
|
||
"2 242\n",
|
||
"5 140\n",
|
||
"6 33\n",
|
||
"1 18\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Контрольная выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(660, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"bedrooms\n",
|
||
"3 304\n",
|
||
"4 211\n",
|
||
"2 81\n",
|
||
"5 47\n",
|
||
"6 11\n",
|
||
"1 6\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Тестовая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(660, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"bedrooms\n",
|
||
"3 303\n",
|
||
"4 212\n",
|
||
"2 81\n",
|
||
"5 47\n",
|
||
"6 11\n",
|
||
"1 6\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Вывод распределения количества наблюдений по меткам (классам)\n",
|
||
"from src.utils import split_stratified_into_train_val_test\n",
|
||
"\n",
|
||
"\n",
|
||
"display(house.bedrooms.value_counts())\n",
|
||
"display()\n",
|
||
"\n",
|
||
"data = house[[\"bedrooms\", \"sqft_living\", \"sqft_lot\"]].copy()\n",
|
||
"\n",
|
||
"df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
|
||
" data, stratify_colname=\"bedrooms\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
||
")\n",
|
||
"\n",
|
||
"display(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"display(df_train.bedrooms.value_counts())\n",
|
||
"\n",
|
||
"display(\"Контрольная выборка: \", df_val.shape)\n",
|
||
"display(df_val.bedrooms.value_counts())\n",
|
||
"\n",
|
||
"display(\"Тестовая выборка: \", df_test.shape)\n",
|
||
"display(df_test.bedrooms.value_counts())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(1979, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"bedrooms\n",
|
||
"3 911\n",
|
||
"4 635\n",
|
||
"2 242\n",
|
||
"5 140\n",
|
||
"6 33\n",
|
||
"1 18\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка после oversampling: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(5380, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"bedrooms\n",
|
||
"2 932\n",
|
||
"1 914\n",
|
||
"3 911\n",
|
||
"6 907\n",
|
||
"5 888\n",
|
||
"4 828\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>1940</td>\n",
|
||
" <td>10035</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1920</td>\n",
|
||
" <td>4862</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2340</td>\n",
|
||
" <td>3784</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>3450</td>\n",
|
||
" <td>33460</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>2230</td>\n",
|
||
" <td>26989</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5375</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2575</td>\n",
|
||
" <td>6858</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5376</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2635</td>\n",
|
||
" <td>8286</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5377</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2815</td>\n",
|
||
" <td>7930</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5378</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2857</td>\n",
|
||
" <td>7735</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5379</th>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2923</td>\n",
|
||
" <td>7315</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5380 rows × 3 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" bedrooms sqft_living sqft_lot\n",
|
||
"0 3 1940 10035\n",
|
||
"1 4 1920 4862\n",
|
||
"2 4 2340 3784\n",
|
||
"3 4 3450 33460\n",
|
||
"4 4 2230 26989\n",
|
||
"... ... ... ...\n",
|
||
"5375 6 2575 6858\n",
|
||
"5376 6 2635 8286\n",
|
||
"5377 6 2815 7930\n",
|
||
"5378 6 2857 7735\n",
|
||
"5379 6 2923 7315\n",
|
||
"\n",
|
||
"[5380 rows x 3 columns]"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from imblearn.over_sampling import ADASYN\n",
|
||
"\n",
|
||
"ada = ADASYN()\n",
|
||
"\n",
|
||
"display(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"display(df_train.bedrooms.value_counts())\n",
|
||
"\n",
|
||
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"bedrooms\"]) # type: ignore\n",
|
||
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
|
||
"\n",
|
||
"display(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
|
||
"display(df_train_adasyn.bedrooms.value_counts())\n",
|
||
"\n",
|
||
"df_train_adasyn"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(1979, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"bedrooms\n",
|
||
"3 911\n",
|
||
"4 635\n",
|
||
"2 242\n",
|
||
"5 140\n",
|
||
"6 33\n",
|
||
"1 18\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Обучающая выборка после undersampling: '"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(108, 3)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"bedrooms\n",
|
||
"1 18\n",
|
||
"2 18\n",
|
||
"3 18\n",
|
||
"4 18\n",
|
||
"5 18\n",
|
||
"6 18\n",
|
||
"Name: count, dtype: int64"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>sqft_living</th>\n",
|
||
" <th>sqft_lot</th>\n",
|
||
" <th>bedrooms</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>2003</th>\n",
|
||
" <td>1090</td>\n",
|
||
" <td>8750</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2922</th>\n",
|
||
" <td>780</td>\n",
|
||
" <td>10235</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>350</th>\n",
|
||
" <td>560</td>\n",
|
||
" <td>12120</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2682</th>\n",
|
||
" <td>710</td>\n",
|
||
" <td>6000</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3037</th>\n",
|
||
" <td>890</td>\n",
|
||
" <td>211576</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>832</th>\n",
|
||
" <td>2450</td>\n",
|
||
" <td>25600</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>783</th>\n",
|
||
" <td>3610</td>\n",
|
||
" <td>10003</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>486</th>\n",
|
||
" <td>4860</td>\n",
|
||
" <td>11793</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1499</th>\n",
|
||
" <td>3840</td>\n",
|
||
" <td>14040</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1302</th>\n",
|
||
" <td>3010</td>\n",
|
||
" <td>17864</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>108 rows × 3 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" sqft_living sqft_lot bedrooms\n",
|
||
"2003 1090 8750 1\n",
|
||
"2922 780 10235 1\n",
|
||
"350 560 12120 1\n",
|
||
"2682 710 6000 1\n",
|
||
"3037 890 211576 1\n",
|
||
"... ... ... ...\n",
|
||
"832 2450 25600 6\n",
|
||
"783 3610 10003 6\n",
|
||
"486 4860 11793 6\n",
|
||
"1499 3840 14040 6\n",
|
||
"1302 3010 17864 6\n",
|
||
"\n",
|
||
"[108 rows x 3 columns]"
|
||
]
|
||
},
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Создание экземпляра RandomUnderSampler\n",
|
||
"rus = RandomUnderSampler()\n",
|
||
"\n",
|
||
"# Проверка исходной обучающей выборки\n",
|
||
"display(\"Обучающая выборка: \", df_train.shape)\n",
|
||
"display(df_train.bedrooms.value_counts())\n",
|
||
"\n",
|
||
"# Выполнение undersampling\n",
|
||
"X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=[\"bedrooms\"]), df_train[\"bedrooms\"]) # type: ignore\n",
|
||
"df_train_undersampled = pd.DataFrame(X_resampled)\n",
|
||
"\n",
|
||
"# Добавление целевой переменной обратно в выборку\n",
|
||
"df_train_undersampled[\"bedrooms\"] = y_resampled\n",
|
||
"\n",
|
||
"# Проверка обучающей выборки после undersampling\n",
|
||
"display(\"Обучающая выборка после undersampling: \", df_train_undersampled.shape)\n",
|
||
"display(df_train_undersampled.bedrooms.value_counts())\n",
|
||
"\n",
|
||
"df_train_undersampled"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.5"
|
||
},
|
||
"orig_nbformat": 4
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|