PredictiveAnalytics/lab2.ipynb

3089 lines
91 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 1354 entries, 0 to 1369\n",
"Data columns (total 18 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Name_phone 1354 non-null object \n",
" 1 Rating 1354 non-null float64\n",
" 2 Spec_score 1354 non-null int64 \n",
" 3 No_of_sim 1354 non-null object \n",
" 4 Ram 1354 non-null object \n",
" 5 Battery 1354 non-null object \n",
" 6 Display 1354 non-null object \n",
" 7 Camera 1354 non-null object \n",
" 8 External_Memory 1354 non-null object \n",
" 9 Android_version 914 non-null object \n",
" 10 Price 1354 non-null object \n",
" 11 company 1354 non-null object \n",
" 12 Inbuilt_memory 1335 non-null object \n",
" 13 fast_charging 1267 non-null object \n",
" 14 Screen_resolution 1353 non-null object \n",
" 15 Processor 1327 non-null object \n",
" 16 Processor_name 1354 non-null object \n",
" 17 Class 1354 non-null int64 \n",
"dtypes: float64(1), int64(2), object(15)\n",
"memory usage: 201.0+ KB\n"
]
},
{
"data": {
"text/plain": [
"(1354, 18)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name_phone</th>\n",
" <th>Rating</th>\n",
" <th>Spec_score</th>\n",
" <th>No_of_sim</th>\n",
" <th>Ram</th>\n",
" <th>Battery</th>\n",
" <th>Display</th>\n",
" <th>Camera</th>\n",
" <th>External_Memory</th>\n",
" <th>Android_version</th>\n",
" <th>Price</th>\n",
" <th>company</th>\n",
" <th>Inbuilt_memory</th>\n",
" <th>fast_charging</th>\n",
" <th>Screen_resolution</th>\n",
" <th>Processor</th>\n",
" <th>Processor_name</th>\n",
" <th>Class</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Samsung Galaxy F14 5G</td>\n",
" <td>4.65</td>\n",
" <td>68</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>6000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP Dual Rear &amp;amp; 13 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>13</td>\n",
" <td>9,999</td>\n",
" <td>Samsung</td>\n",
" <td>128 GB inbuilt</td>\n",
" <td>25W Fast Charging</td>\n",
" <td>2408 x 1080 px Display with Water Drop Notch</td>\n",
" <td>Octa Core Processor</td>\n",
" <td>Exynos 1330</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Samsung Galaxy A11</td>\n",
" <td>4.20</td>\n",
" <td>63</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>2 GB RAM</td>\n",
" <td>4000 mAh Battery</td>\n",
" <td>6.4 inches</td>\n",
" <td>13 MP + 5 MP + 2 MP Triple Rear &amp;amp; 8 MP Fro...</td>\n",
" <td>Memory Card Supported, upto 512 GB</td>\n",
" <td>10</td>\n",
" <td>9,990</td>\n",
" <td>Samsung</td>\n",
" <td>32 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1560 px Display with Punch Hole</td>\n",
" <td>1.8 GHz Processor</td>\n",
" <td>Octa Core</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Samsung Galaxy F23</td>\n",
" <td>4.10</td>\n",
" <td>73</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>6000 mAh Battery</td>\n",
" <td>6.4 inches</td>\n",
" <td>48 MP Quad Rear &amp;amp; 13 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>12</td>\n",
" <td>11,999</td>\n",
" <td>Samsung</td>\n",
" <td>64 GB inbuilt</td>\n",
" <td>NaN</td>\n",
" <td>720 x 1600 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Helio G88</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Samsung Galaxy A03s (4GB RAM + 64GB)</td>\n",
" <td>4.10</td>\n",
" <td>69</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.5 inches</td>\n",
" <td>13 MP + 2 MP + 2 MP Triple Rear &amp;amp; 5 MP Fro...</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>11</td>\n",
" <td>11,999</td>\n",
" <td>Samsung</td>\n",
" <td>64 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1600 px Display with Water Drop Notch</td>\n",
" <td>Octa Core</td>\n",
" <td>Helio P35</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Samsung Galaxy M13 5G</td>\n",
" <td>4.40</td>\n",
" <td>75</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>6 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.5 inches</td>\n",
" <td>50 MP + 2 MP Dual Rear &amp;amp; 5 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>12</td>\n",
" <td>11,990</td>\n",
" <td>Samsung</td>\n",
" <td>128 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1600 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 700</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name_phone Rating Spec_score \\\n",
"Id \n",
"0 Samsung Galaxy F14 5G 4.65 68 \n",
"1 Samsung Galaxy A11 4.20 63 \n",
"3 Samsung Galaxy F23 4.10 73 \n",
"4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69 \n",
"5 Samsung Galaxy M13 5G 4.40 75 \n",
"\n",
" No_of_sim Ram Battery Display \\\n",
"Id \n",
"0 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches \n",
"1 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches \n",
"3 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches \n",
"4 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches \n",
"5 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.5 inches \n",
"\n",
" Camera \\\n",
"Id \n",
"0 50 MP + 2 MP Dual Rear &amp; 13 MP Front Camera \n",
"1 13 MP + 5 MP + 2 MP Triple Rear &amp; 8 MP Fro... \n",
"3 48 MP Quad Rear &amp; 13 MP Front Camera \n",
"4 13 MP + 2 MP + 2 MP Triple Rear &amp; 5 MP Fro... \n",
"5 50 MP + 2 MP Dual Rear &amp; 5 MP Front Camera \n",
"\n",
" External_Memory Android_version Price company \\\n",
"Id \n",
"0 Memory Card Supported, upto 1 TB 13 9,999 Samsung \n",
"1 Memory Card Supported, upto 512 GB 10 9,990 Samsung \n",
"3 Memory Card Supported, upto 1 TB 12 11,999 Samsung \n",
"4 Memory Card Supported, upto 1 TB 11 11,999 Samsung \n",
"5 Memory Card Supported, upto 1 TB 12 11,990 Samsung \n",
"\n",
" Inbuilt_memory fast_charging \\\n",
"Id \n",
"0 128 GB inbuilt 25W Fast Charging \n",
"1 32 GB inbuilt 15W Fast Charging \n",
"3 64 GB inbuilt NaN \n",
"4 64 GB inbuilt 15W Fast Charging \n",
"5 128 GB inbuilt 15W Fast Charging \n",
"\n",
" Screen_resolution Processor \\\n",
"Id \n",
"0 2408 x 1080 px Display with Water Drop Notch Octa Core Processor \n",
"1 720 x 1560 px Display with Punch Hole 1.8 GHz Processor \n",
"3 720 x 1600 px Octa Core \n",
"4 720 x 1600 px Display with Water Drop Notch Octa Core \n",
"5 720 x 1600 px Octa Core \n",
"\n",
" Processor_name Class \n",
"Id \n",
"0 Exynos 1330 2 \n",
"1 Octa Core 2 \n",
"3 Helio G88 1 \n",
"4 Helio P35 2 \n",
"5 Dimensity 700 1 "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Load the phone dataset; the CSV's \"Id\" column becomes the row index.\n",
"phone = pd.read_csv(\n",
"    \"data/phone_price.csv\",\n",
"    index_col=\"Id\",\n",
")\n",
"\n",
"# Schema overview (dtypes, non-null counts), then the (rows, columns) shape.\n",
"phone.info()\n",
"display(phone.shape)\n",
"\n",
"# The last expression is rendered as the cell output: the first five rows.\n",
"phone.head(n=5)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 19227 entries, 0 to 19226\n",
"Data columns (total 20 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Unnamed: 0 19227 non-null int64 \n",
" 1 ID 19227 non-null int64 \n",
" 2 Price 19227 non-null int64 \n",
" 3 Levy 19227 non-null object \n",
" 4 Manufacturer 19227 non-null object \n",
" 5 Model 19227 non-null object \n",
" 6 ProdYear 19227 non-null int64 \n",
" 7 Category 19227 non-null object \n",
" 8 Leather interior 19227 non-null object \n",
" 9 Fuel type 19227 non-null object \n",
" 10 Engine volume 19227 non-null object \n",
" 11 Mileage 19227 non-null object \n",
" 12 Cylinders 19227 non-null float64\n",
" 13 Gear box type 19227 non-null object \n",
" 14 Drive wheels 19227 non-null object \n",
" 15 Doors 19227 non-null object \n",
" 16 Wheel 19227 non-null object \n",
" 17 Color 19227 non-null object \n",
" 18 Airbags 19227 non-null int64 \n",
" 19 LeatherInterior1 19227 non-null int64 \n",
"dtypes: float64(1), int64(6), object(13)\n",
"memory usage: 2.9+ MB\n"
]
},
{
"data": {
"text/plain": [
"(19227, 20)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>ID</th>\n",
" <th>Price</th>\n",
" <th>Levy</th>\n",
" <th>Manufacturer</th>\n",
" <th>Model</th>\n",
" <th>ProdYear</th>\n",
" <th>Category</th>\n",
" <th>Leather interior</th>\n",
" <th>Fuel type</th>\n",
" <th>Engine volume</th>\n",
" <th>Mileage</th>\n",
" <th>Cylinders</th>\n",
" <th>Gear box type</th>\n",
" <th>Drive wheels</th>\n",
" <th>Doors</th>\n",
" <th>Wheel</th>\n",
" <th>Color</th>\n",
" <th>Airbags</th>\n",
" <th>LeatherInterior1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>45654403</td>\n",
" <td>13328</td>\n",
" <td>1399</td>\n",
" <td>LEXUS</td>\n",
" <td>RX 450</td>\n",
" <td>2010</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Hybrid</td>\n",
" <td>3.5</td>\n",
" <td>186005 km</td>\n",
" <td>6.0</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>44731507</td>\n",
" <td>16621</td>\n",
" <td>1018</td>\n",
" <td>CHEVROLET</td>\n",
" <td>Equinox</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>3</td>\n",
" <td>192000 km</td>\n",
" <td>6.0</td>\n",
" <td>Tiptronic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Black</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>45774419</td>\n",
" <td>8467</td>\n",
" <td>-</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2006</td>\n",
" <td>Hatchback</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>200000 km</td>\n",
" <td>4.0</td>\n",
" <td>Variator</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Right-hand drive</td>\n",
" <td>Black</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>45769185</td>\n",
" <td>3607</td>\n",
" <td>862</td>\n",
" <td>FORD</td>\n",
" <td>Escape</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Hybrid</td>\n",
" <td>2.5</td>\n",
" <td>168966 km</td>\n",
" <td>4.0</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>White</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>45809263</td>\n",
" <td>11726</td>\n",
" <td>446</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2014</td>\n",
" <td>Hatchback</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>91901 km</td>\n",
" <td>4.0</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 ID Price Levy Manufacturer Model ProdYear \\\n",
"0 0 45654403 13328 1399 LEXUS RX 450 2010 \n",
"1 1 44731507 16621 1018 CHEVROLET Equinox 2011 \n",
"2 2 45774419 8467 - HONDA FIT 2006 \n",
"3 3 45769185 3607 862 FORD Escape 2011 \n",
"4 4 45809263 11726 446 HONDA FIT 2014 \n",
"\n",
" Category Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
"0 Jeep Yes Hybrid 3.5 186005 km 6.0 \n",
"1 Jeep No Petrol 3 192000 km 6.0 \n",
"2 Hatchback No Petrol 1.3 200000 km 4.0 \n",
"3 Jeep Yes Hybrid 2.5 168966 km 4.0 \n",
"4 Hatchback Yes Petrol 1.3 91901 km 4.0 \n",
"\n",
" Gear box type Drive wheels Doors Wheel Color Airbags \\\n",
"0 Automatic 4x4 04-May Left wheel Silver 12 \n",
"1 Tiptronic 4x4 04-May Left wheel Black 8 \n",
"2 Variator Front 04-May Right-hand drive Black 2 \n",
"3 Automatic 4x4 04-May Left wheel White 0 \n",
"4 Automatic Front 04-May Left wheel Silver 4 \n",
"\n",
" LeatherInterior1 \n",
"0 1 \n",
"1 0 \n",
"2 0 \n",
"3 1 \n",
"4 1 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the car dataset, keeping pandas' default row index.\n",
"# NOTE(review): the file carries an \"Unnamed: 0\" column, which looks like a\n",
"# previously saved index - confirm whether it should be dropped.\n",
"car = pd.read_csv(\n",
"    \"data/car_price.csv\",\n",
")\n",
"\n",
"# Schema overview (dtypes, non-null counts), then the (rows, columns) shape.\n",
"car.info()\n",
"display(car.shape)\n",
"\n",
"# The last expression is rendered as the cell output: the first five rows.\n",
"car.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 3299 entries, 0 to 3298\n",
"Data columns (total 21 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 3299 non-null int64 \n",
" 1 date 3299 non-null object \n",
" 2 price 3299 non-null float64\n",
" 3 bedrooms 3299 non-null int64 \n",
" 4 bathrooms 3299 non-null float64\n",
" 5 sqft_living 3299 non-null int64 \n",
" 6 sqft_lot 3299 non-null int64 \n",
" 7 floors 3299 non-null float64\n",
" 8 waterfront 3299 non-null int64 \n",
" 9 view 3299 non-null int64 \n",
" 10 condition 3299 non-null int64 \n",
" 11 grade 3299 non-null int64 \n",
" 12 sqft_above 3299 non-null int64 \n",
" 13 sqft_basement 3299 non-null int64 \n",
" 14 yr_built 3299 non-null int64 \n",
" 15 yr_renovated 3299 non-null int64 \n",
" 16 zipcode 3299 non-null int64 \n",
" 17 lat 3299 non-null float64\n",
" 18 long 3299 non-null float64\n",
" 19 sqft_living15 3299 non-null int64 \n",
" 20 sqft_lot15 3299 non-null int64 \n",
"dtypes: float64(5), int64(15), object(1)\n",
"memory usage: 541.4+ KB\n"
]
},
{
"data": {
"text/plain": [
"(3299, 21)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>grade</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7129300520</td>\n",
" <td>20141013T000000</td>\n",
" <td>221900.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1180</td>\n",
" <td>5650</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>1180</td>\n",
" <td>0</td>\n",
" <td>1955</td>\n",
" <td>0</td>\n",
" <td>98178</td>\n",
" <td>47.5112</td>\n",
" <td>-122.257</td>\n",
" <td>1340</td>\n",
" <td>5650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6414100192</td>\n",
" <td>20141209T000000</td>\n",
" <td>538000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>2570</td>\n",
" <td>7242</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>2170</td>\n",
" <td>400</td>\n",
" <td>1951</td>\n",
" <td>1991</td>\n",
" <td>98125</td>\n",
" <td>47.7210</td>\n",
" <td>-122.319</td>\n",
" <td>1690</td>\n",
" <td>7639</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5631500400</td>\n",
" <td>20150225T000000</td>\n",
" <td>180000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>770</td>\n",
" <td>10000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>770</td>\n",
" <td>0</td>\n",
" <td>1933</td>\n",
" <td>0</td>\n",
" <td>98028</td>\n",
" <td>47.7379</td>\n",
" <td>-122.233</td>\n",
" <td>2720</td>\n",
" <td>8062</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2487200875</td>\n",
" <td>20141209T000000</td>\n",
" <td>604000.0</td>\n",
" <td>4</td>\n",
" <td>3.00</td>\n",
" <td>1960</td>\n",
" <td>5000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>1050</td>\n",
" <td>910</td>\n",
" <td>1965</td>\n",
" <td>0</td>\n",
" <td>98136</td>\n",
" <td>47.5208</td>\n",
" <td>-122.393</td>\n",
" <td>1360</td>\n",
" <td>5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1954400510</td>\n",
" <td>20150218T000000</td>\n",
" <td>510000.0</td>\n",
" <td>3</td>\n",
" <td>2.00</td>\n",
" <td>1680</td>\n",
" <td>8080</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>8</td>\n",
" <td>1680</td>\n",
" <td>0</td>\n",
" <td>1987</td>\n",
" <td>0</td>\n",
" <td>98074</td>\n",
" <td>47.6168</td>\n",
" <td>-122.045</td>\n",
" <td>1800</td>\n",
" <td>7503</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms sqft_living \\\n",
"0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
"1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
"2 5631500400 20150225T000000 180000.0 2 1.00 770 \n",
"3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
"4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
"\n",
" sqft_lot floors waterfront view ... grade sqft_above sqft_basement \\\n",
"0 5650 1.0 0 0 ... 7 1180 0 \n",
"1 7242 2.0 0 0 ... 7 2170 400 \n",
"2 10000 1.0 0 0 ... 6 770 0 \n",
"3 5000 1.0 0 0 ... 7 1050 910 \n",
"4 8080 1.0 0 0 ... 8 1680 0 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"0 1955 0 98178 47.5112 -122.257 1340 \n",
"1 1951 1991 98125 47.7210 -122.319 1690 \n",
"2 1933 0 98028 47.7379 -122.233 2720 \n",
"3 1965 0 98136 47.5208 -122.393 1360 \n",
"4 1987 0 98074 47.6168 -122.045 1800 \n",
"\n",
" sqft_lot15 \n",
"0 5650 \n",
"1 7639 \n",
"2 8062 \n",
"3 5000 \n",
"4 7503 \n",
"\n",
"[5 rows x 21 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the house dataset, keeping pandas' default row index.\n",
"house = pd.read_csv(\n",
"    \"data/house_data.csv\",\n",
")\n",
"\n",
"# Schema overview (dtypes, non-null counts), then the (rows, columns) shape.\n",
"house.info()\n",
"display(house.shape)\n",
"\n",
"# The last expression is rendered as the cell output: the first five rows.\n",
"house.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Name_phone 0\n",
"Rating 0\n",
"Spec_score 0\n",
"No_of_sim 0\n",
"Ram 0\n",
"Battery 0\n",
"Display 0\n",
"Camera 0\n",
"External_Memory 0\n",
"Android_version 440\n",
"Price 0\n",
"company 0\n",
"Inbuilt_memory 19\n",
"fast_charging 87\n",
"Screen_resolution 1\n",
"Processor 27\n",
"Processor_name 0\n",
"Class 0\n",
"dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Name_phone False\n",
"Rating False\n",
"Spec_score False\n",
"No_of_sim False\n",
"Ram False\n",
"Battery False\n",
"Display False\n",
"Camera False\n",
"External_Memory False\n",
"Android_version True\n",
"Price False\n",
"company False\n",
"Inbuilt_memory True\n",
"fast_charging True\n",
"Screen_resolution True\n",
"Processor True\n",
"Processor_name False\n",
"Class False\n",
"dtype: bool"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Android_version процент пустых значений: %32.50'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Inbuilt_memory процент пустых значений: %1.40'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'fast_charging процент пустых значений: %6.43'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Screen_resolution процент пустых значений: %0.07'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Processor процент пустых значений: %1.99'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Number of missing values in each phone feature (computed once and reused).\n",
"phone_null_counts = phone.isnull().sum()\n",
"display(phone_null_counts)\n",
"\n",
"# Whether each phone feature has at least one missing value.\n",
"display(phone_null_counts > 0)\n",
"\n",
"# Percentage of missing values, reported only for features that have any.\n",
"phone_null_rates = phone_null_counts / len(phone) * 100\n",
"for column, null_rate in phone_null_rates[phone_null_rates > 0].items():\n",
"    display(f\"{column} процент пустых значений: %{null_rate:.2f}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Unnamed: 0 0\n",
"ID 0\n",
"Price 0\n",
"Levy 0\n",
"Manufacturer 0\n",
"Model 0\n",
"ProdYear 0\n",
"Category 0\n",
"Leather interior 0\n",
"Fuel type 0\n",
"Engine volume 0\n",
"Mileage 0\n",
"Cylinders 0\n",
"Gear box type 0\n",
"Drive wheels 0\n",
"Doors 0\n",
"Wheel 0\n",
"Color 0\n",
"Airbags 0\n",
"LeatherInterior1 0\n",
"dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Unnamed: 0 False\n",
"ID False\n",
"Price False\n",
"Levy False\n",
"Manufacturer False\n",
"Model False\n",
"ProdYear False\n",
"Category False\n",
"Leather interior False\n",
"Fuel type False\n",
"Engine volume False\n",
"Mileage False\n",
"Cylinders False\n",
"Gear box type False\n",
"Drive wheels False\n",
"Doors False\n",
"Wheel False\n",
"Color False\n",
"Airbags False\n",
"LeatherInterior1 False\n",
"dtype: bool"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Number of missing values in each car feature (computed once and reused).\n",
"car_null_counts = car.isnull().sum()\n",
"display(car_null_counts)\n",
"\n",
"# Whether each car feature has at least one missing value.\n",
"display(car_null_counts > 0)\n",
"\n",
"# Percentage of missing values, reported only for features that have any.\n",
"car_null_rates = car_null_counts / len(car) * 100\n",
"for column, null_rate in car_null_rates[car_null_rates > 0].items():\n",
"    display(f\"{column} процент пустых значений: %{null_rate:.2f}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id 0\n",
"date 0\n",
"price 0\n",
"bedrooms 0\n",
"bathrooms 0\n",
"sqft_living 0\n",
"sqft_lot 0\n",
"floors 0\n",
"waterfront 0\n",
"view 0\n",
"condition 0\n",
"grade 0\n",
"sqft_above 0\n",
"sqft_basement 0\n",
"yr_built 0\n",
"yr_renovated 0\n",
"zipcode 0\n",
"lat 0\n",
"long 0\n",
"sqft_living15 0\n",
"sqft_lot15 0\n",
"dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"id False\n",
"date False\n",
"price False\n",
"bedrooms False\n",
"bathrooms False\n",
"sqft_living False\n",
"sqft_lot False\n",
"floors False\n",
"waterfront False\n",
"view False\n",
"condition False\n",
"grade False\n",
"sqft_above False\n",
"sqft_basement False\n",
"yr_built False\n",
"yr_renovated False\n",
"zipcode False\n",
"lat False\n",
"long False\n",
"sqft_living15 False\n",
"sqft_lot15 False\n",
"dtype: bool"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Number of missing values in each house feature (computed once and reused).\n",
"house_null_counts = house.isnull().sum()\n",
"display(house_null_counts)\n",
"\n",
"# Whether each house feature has at least one missing value.\n",
"display(house_null_counts > 0)\n",
"\n",
"# Percentage of missing values, reported only for features that have any.\n",
"house_null_rates = house_null_counts / len(house) * 100\n",
"for column, null_rate in house_null_rates[house_null_rates > 0].items():\n",
"    display(f\"{column} процент пустых значений: %{null_rate:.2f}\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"В первом наборе данных (phone) имеются пустые значения, в остальных они отсутствуют.\n",
"Заполним пустые значения в первом датасете."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(805, 18)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Name_phone False\n",
"Rating False\n",
"Spec_score False\n",
"No_of_sim False\n",
"Ram False\n",
"Battery False\n",
"Display False\n",
"Camera False\n",
"External_Memory False\n",
"Android_version False\n",
"Price False\n",
"company False\n",
"Inbuilt_memory False\n",
"fast_charging False\n",
"Screen_resolution False\n",
"Processor False\n",
"Processor_name False\n",
"Class False\n",
"dtype: bool"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name_phone</th>\n",
" <th>Rating</th>\n",
" <th>Spec_score</th>\n",
" <th>No_of_sim</th>\n",
" <th>Ram</th>\n",
" <th>Battery</th>\n",
" <th>Display</th>\n",
" <th>Camera</th>\n",
" <th>External_Memory</th>\n",
" <th>Android_version</th>\n",
" <th>Price</th>\n",
" <th>company</th>\n",
" <th>Inbuilt_memory</th>\n",
" <th>fast_charging</th>\n",
" <th>Screen_resolution</th>\n",
" <th>Processor</th>\n",
" <th>Processor_name</th>\n",
" <th>Class</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1365</th>\n",
" <td>TCL 40R</td>\n",
" <td>4.05</td>\n",
" <td>75</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>4 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp;amp; 8 MP Fro...</td>\n",
" <td>Memory Card (Hybrid)</td>\n",
" <td>12</td>\n",
" <td>18,999</td>\n",
" <td>TCL</td>\n",
" <td>64 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1612 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 700 5G</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1366</th>\n",
" <td>TCL 50 XL NxtPaper 5G</td>\n",
" <td>4.10</td>\n",
" <td>80</td>\n",
" <td>Dual Sim, 3G, 4G, VoLTE,</td>\n",
" <td>8 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.8 inches</td>\n",
" <td>50 MP + 2 MP Dual Rear &amp;amp; 16 MP Front Camera</td>\n",
" <td>Memory Card (Hybrid)</td>\n",
" <td>14</td>\n",
" <td>24,990</td>\n",
" <td>TCL</td>\n",
" <td>128 GB inbuilt</td>\n",
" <td>33W Fast Charging</td>\n",
" <td>1200 x 2400 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 7050</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1367</th>\n",
" <td>TCL 50 XE NxtPaper 5G</td>\n",
" <td>4.00</td>\n",
" <td>80</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>6 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP Dual Rear &amp;amp; 16 MP Front Camera</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>13</td>\n",
" <td>23,990</td>\n",
" <td>TCL</td>\n",
" <td>256 GB inbuilt</td>\n",
" <td>18W Fast Charging</td>\n",
" <td>720 x 1612 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 6080</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1368</th>\n",
" <td>TCL 40 NxtPaper 5G</td>\n",
" <td>4.50</td>\n",
" <td>79</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE,</td>\n",
" <td>6 GB RAM</td>\n",
" <td>5000 mAh Battery</td>\n",
" <td>6.6 inches</td>\n",
" <td>50 MP + 2 MP + 2 MP Triple Rear &amp;amp; 8 MP Fro...</td>\n",
" <td>Memory Card Supported, upto 1 TB</td>\n",
" <td>13</td>\n",
" <td>22,499</td>\n",
" <td>TCL</td>\n",
" <td>256 GB inbuilt</td>\n",
" <td>15W Fast Charging</td>\n",
" <td>720 x 1612 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Dimensity 6020</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1369</th>\n",
" <td>TCL Trifold</td>\n",
" <td>4.65</td>\n",
" <td>93</td>\n",
" <td>Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G,</td>\n",
" <td>12 GB RAM</td>\n",
" <td>4600 mAh Battery</td>\n",
" <td>10 inches</td>\n",
" <td>Foldable Display, Dual Display</td>\n",
" <td>50 MP + 48 MP + 8 MP Triple Rear &amp;amp; 32 MP F...</td>\n",
" <td>13</td>\n",
" <td>1,19,990</td>\n",
" <td>TCL</td>\n",
" <td>256 GB inbuilt</td>\n",
" <td>67W Fast Charging</td>\n",
" <td>1916 x 2160 px</td>\n",
" <td>Octa Core</td>\n",
" <td>Snapdragon 8 Gen2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name_phone Rating Spec_score \\\n",
"Id \n",
"1365 TCL 40R 4.05 75 \n",
"1366 TCL 50 XL NxtPaper 5G 4.10 80 \n",
"1367 TCL 50 XE NxtPaper 5G 4.00 80 \n",
"1368 TCL 40 NxtPaper 5G 4.50 79 \n",
"1369 TCL Trifold 4.65 93 \n",
"\n",
" No_of_sim Ram Battery \\\n",
"Id \n",
"1365 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery \n",
"1366 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery \n",
"1367 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
"1368 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery \n",
"1369 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery \n",
"\n",
" Display Camera \\\n",
"Id \n",
"1365 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... \n",
"1366 6.8 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera \n",
"1367 6.6 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera \n",
"1368 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... \n",
"1369 10 inches Foldable Display, Dual Display \n",
"\n",
" External_Memory Android_version \\\n",
"Id \n",
"1365 Memory Card (Hybrid) 12 \n",
"1366 Memory Card (Hybrid) 14 \n",
"1367 Memory Card Supported, upto 1 TB 13 \n",
"1368 Memory Card Supported, upto 1 TB 13 \n",
"1369 50 MP + 48 MP + 8 MP Triple Rear &amp; 32 MP F... 13 \n",
"\n",
" Price company Inbuilt_memory fast_charging Screen_resolution \\\n",
"Id \n",
"1365 18,999 TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
"1366 24,990 TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px \n",
"1367 23,990 TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px \n",
"1368 22,499 TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px \n",
"1369 1,19,990 TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px \n",
"\n",
" Processor Processor_name Class \n",
"Id \n",
"1365 Octa Core Dimensity 700 5G 2 \n",
"1366 Octa Core Dimensity 7050 1 \n",
"1367 Octa Core Dimensity 6080 2 \n",
"1368 Octa Core Dimensity 6020 2 \n",
"1369 Octa Core Snapdragon 8 Gen2 2 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Replace missing values with the column mode (the most frequent value).\n",
"# Series.mode() returns a Series because several values can tie, so take its\n",
"# first element: passing the Series itself to fillna aligns it on the row\n",
"# index and leaves almost every missing value untouched.\n",
"mode_Android = phone[\"Android_version\"].mode().iloc[0]\n",
"Inbuilt_memory = phone[\"Inbuilt_memory\"].mode().iloc[0]\n",
"\n",
"# Rebind instead of mutating in place so the fill is explicit.\n",
"phone = phone.fillna(\n",
"    {\"Android_version\": mode_Android, \"Inbuilt_memory\": Inbuilt_memory}\n",
")\n",
"\n",
"# Two alternative treatments of the gaps left in the other columns:\n",
"# replace them with 0, or drop every row that still has a missing value.\n",
"fillna_df = phone.fillna(0)\n",
"dropna_df = phone.dropna()\n",
"\n",
"display(dropna_df.shape)\n",
"display(fillna_df.isnull().any())\n",
"\n",
"# The last expression is rendered as the cell output: the last rows of phone.\n",
"phone.tail()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Class\n",
"1 718\n",
"2 636\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Обучающая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(812, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Class\n",
"1 431\n",
"2 381\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Контрольная выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(271, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Class\n",
"1 143\n",
"2 128\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Тестовая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(271, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Class\n",
"1 144\n",
"2 127\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Вывод распределения количества наблюдений по меткам (классам)\n",
"from src.utils import split_stratified_into_train_val_test\n",
"\n",
"\n",
"display(phone.Class.value_counts())\n",
"display()\n",
"\n",
"data = phone[[\"Class\", \"Spec_score\", \"Rating\"]].copy()\n",
"\n",
"df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
" data, stratify_colname=\"Class\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
")\n",
"\n",
"display(\"Обучающая выборка: \", df_train.shape)\n",
"display(df_train.Class.value_counts())\n",
"\n",
"display(\"Контрольная выборка: \", df_val.shape)\n",
"display(df_val.Class.value_counts())\n",
"\n",
"display(\"Тестовая выборка: \", df_test.shape)\n",
"display(df_test.Class.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Обучающая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(812, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Class\n",
"1 431\n",
"2 381\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Обучающая выборка после oversampling: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(856, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Class\n",
"1 431\n",
"2 425\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Class</th>\n",
" <th>Spec_score</th>\n",
" <th>Rating</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>90</td>\n",
" <td>4.550000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>80</td>\n",
" <td>4.400000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>80</td>\n",
" <td>4.650000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>75</td>\n",
" <td>4.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>92</td>\n",
" <td>4.650000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>851</th>\n",
" <td>2</td>\n",
" <td>96</td>\n",
" <td>4.061287</td>\n",
" </tr>\n",
" <tr>\n",
" <th>852</th>\n",
" <td>2</td>\n",
" <td>95</td>\n",
" <td>4.191740</td>\n",
" </tr>\n",
" <tr>\n",
" <th>853</th>\n",
" <td>2</td>\n",
" <td>73</td>\n",
" <td>4.612413</td>\n",
" </tr>\n",
" <tr>\n",
" <th>854</th>\n",
" <td>2</td>\n",
" <td>94</td>\n",
" <td>4.193295</td>\n",
" </tr>\n",
" <tr>\n",
" <th>855</th>\n",
" <td>2</td>\n",
" <td>92</td>\n",
" <td>4.650000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>856 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Class Spec_score Rating\n",
"0 1 90 4.550000\n",
"1 1 80 4.400000\n",
"2 1 80 4.650000\n",
"3 1 75 4.500000\n",
"4 2 92 4.650000\n",
".. ... ... ...\n",
"851 2 96 4.061287\n",
"852 2 95 4.191740\n",
"853 2 73 4.612413\n",
"854 2 94 4.193295\n",
"855 2 92 4.650000\n",
"\n",
"[856 rows x 3 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"ada = ADASYN()\n",
"\n",
"display(\"Обучающая выборка: \", df_train.shape)\n",
"display(df_train.Class.value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"Class\"]) # type: ignore\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"display(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"display(df_train_adasyn.Class.value_counts())\n",
"\n",
"df_train_adasyn"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Обучающая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(812, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Class\n",
"1 431\n",
"2 381\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Обучающая выборка после undersampling: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(762, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Class\n",
"1 381\n",
"2 381\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Spec_score</th>\n",
" <th>Rating</th>\n",
" <th>Class</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>468</th>\n",
" <td>69</td>\n",
" <td>4.45</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1093</th>\n",
" <td>86</td>\n",
" <td>4.45</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1102</th>\n",
" <td>89</td>\n",
" <td>4.00</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>422</th>\n",
" <td>70</td>\n",
" <td>4.15</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1192</th>\n",
" <td>81</td>\n",
" <td>4.00</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>298</th>\n",
" <td>75</td>\n",
" <td>4.70</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>249</th>\n",
" <td>75</td>\n",
" <td>4.25</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1341</th>\n",
" <td>66</td>\n",
" <td>4.25</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>923</th>\n",
" <td>88</td>\n",
" <td>4.15</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>301</th>\n",
" <td>80</td>\n",
" <td>4.50</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>762 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Spec_score Rating Class\n",
"Id \n",
"468 69 4.45 1\n",
"1093 86 4.45 1\n",
"1102 89 4.00 1\n",
"422 70 4.15 1\n",
"1192 81 4.00 1\n",
"... ... ... ...\n",
"298 75 4.70 2\n",
"249 75 4.25 2\n",
"1341 66 4.25 2\n",
"923 88 4.15 2\n",
"301 80 4.50 2\n",
"\n",
"[762 rows x 3 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from imblearn.under_sampling import RandomUnderSampler\n",
"import pandas as pd\n",
"\n",
"# Создание экземпляра RandomUnderSampler\n",
"rus = RandomUnderSampler()\n",
"\n",
"# Проверка исходной обучающей выборки\n",
"display(\"Обучающая выборка: \", df_train.shape)\n",
"display(df_train.Class.value_counts())\n",
"\n",
"# Выполнение undersampling\n",
"X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=[\"Class\"]), df_train[\"Class\"]) # type: ignore\n",
"df_train_undersampled = pd.DataFrame(X_resampled)\n",
"\n",
"# Добавление целевой переменной обратно в выборку\n",
"df_train_undersampled[\"Class\"] = y_resampled\n",
"\n",
"# Проверка обучающей выборки после undersampling\n",
"display(\"Обучающая выборка после undersampling: \", df_train_undersampled.shape)\n",
"display(df_train_undersampled.Class.value_counts())\n",
"\n",
"df_train_undersampled\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LeatherInterior1\n",
"1 13949\n",
"0 5278\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Обучающая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(11536, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"LeatherInterior1\n",
"1 8369\n",
"0 3167\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Контрольная выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(3845, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"LeatherInterior1\n",
"1 2790\n",
"0 1055\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Тестовая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(3846, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"LeatherInterior1\n",
"1 2790\n",
"0 1056\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Вывод распределения количества наблюдений по меткам (классам)\n",
"from src.utils import split_stratified_into_train_val_test\n",
"\n",
"\n",
"display(car.LeatherInterior1.value_counts())\n",
"display()\n",
"\n",
"data = car[[\"LeatherInterior1\", \"Airbags\", \"ProdYear\"]].copy()\n",
"\n",
"df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
" data, stratify_colname=\"LeatherInterior1\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
")\n",
"\n",
"display(\"Обучающая выборка: \", df_train.shape)\n",
"display(df_train.LeatherInterior1.value_counts())\n",
"\n",
"display(\"Контрольная выборка: \", df_val.shape)\n",
"display(df_val.LeatherInterior1.value_counts())\n",
"\n",
"display(\"Тестовая выборка: \", df_test.shape)\n",
"display(df_test.LeatherInterior1.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Обучающая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(11536, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"LeatherInterior1\n",
"1 8369\n",
"0 3167\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Обучающая выборка после oversampling: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(16728, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"LeatherInterior1\n",
"1 8369\n",
"0 8359\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LeatherInterior1</th>\n",
" <th>Airbags</th>\n",
" <th>ProdYear</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>16</td>\n",
" <td>2011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>2017</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>2010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16723</th>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2004</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16724</th>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2004</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16725</th>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2004</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16726</th>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2004</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16727</th>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>2003</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>16728 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" LeatherInterior1 Airbags ProdYear\n",
"0 0 16 2011\n",
"1 1 10 2017\n",
"2 1 4 2018\n",
"3 1 0 2011\n",
"4 1 6 2010\n",
"... ... ... ...\n",
"16723 0 12 2004\n",
"16724 0 12 2004\n",
"16725 0 12 2004\n",
"16726 0 12 2004\n",
"16727 0 12 2003\n",
"\n",
"[16728 rows x 3 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"ada = ADASYN()\n",
"\n",
"display(\"Обучающая выборка: \", df_train.shape)\n",
"display(df_train.LeatherInterior1.value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"LeatherInterior1\"]) # type: ignore\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"display(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"display(df_train_adasyn.LeatherInterior1.value_counts())\n",
"\n",
"df_train_adasyn"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Обучающая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(11536, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"LeatherInterior1\n",
"1 8369\n",
"0 3167\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Обучающая выборка после undersampling: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(6334, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"LeatherInterior1\n",
"0 3167\n",
"1 3167\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Airbags</th>\n",
" <th>ProdYear</th>\n",
" <th>LeatherInterior1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1315</th>\n",
" <td>16</td>\n",
" <td>2011</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1569</th>\n",
" <td>0</td>\n",
" <td>2014</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6317</th>\n",
" <td>6</td>\n",
" <td>2006</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>626</th>\n",
" <td>4</td>\n",
" <td>2000</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12439</th>\n",
" <td>12</td>\n",
" <td>2012</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3361</th>\n",
" <td>12</td>\n",
" <td>2012</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3286</th>\n",
" <td>4</td>\n",
" <td>2017</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17666</th>\n",
" <td>4</td>\n",
" <td>2015</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4902</th>\n",
" <td>0</td>\n",
" <td>2012</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4613</th>\n",
" <td>4</td>\n",
" <td>2014</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6334 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Airbags ProdYear LeatherInterior1\n",
"1315 16 2011 0\n",
"1569 0 2014 0\n",
"6317 6 2006 0\n",
"626 4 2000 0\n",
"12439 12 2012 0\n",
"... ... ... ...\n",
"3361 12 2012 1\n",
"3286 4 2017 1\n",
"17666 4 2015 1\n",
"4902 0 2012 1\n",
"4613 4 2014 1\n",
"\n",
"[6334 rows x 3 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from imblearn.under_sampling import RandomUnderSampler\n",
"import pandas as pd\n",
"\n",
"# Создание экземпляра RandomUnderSampler\n",
"rus = RandomUnderSampler()\n",
"\n",
"# Проверка исходной обучающей выборки\n",
"display(\"Обучающая выборка: \", df_train.shape)\n",
"display(df_train.LeatherInterior1.value_counts())\n",
"\n",
"# Выполнение undersampling\n",
"X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=[\"LeatherInterior1\"]), df_train[\"LeatherInterior1\"]) # type: ignore\n",
"df_train_undersampled = pd.DataFrame(X_resampled)\n",
"\n",
"# Добавление целевой переменной обратно в выборку\n",
"df_train_undersampled[\"LeatherInterior1\"] = y_resampled\n",
"\n",
"# Проверка обучающей выборки после undersampling\n",
"display(\"Обучающая выборка после undersampling: \", df_train_undersampled.shape)\n",
"display(df_train_undersampled.LeatherInterior1.value_counts())\n",
"\n",
"df_train_undersampled"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"bedrooms\n",
"3 1518\n",
"4 1058\n",
"2 404\n",
"5 234\n",
"6 55\n",
"1 30\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Обучающая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(1979, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"bedrooms\n",
"3 911\n",
"4 635\n",
"2 242\n",
"5 140\n",
"6 33\n",
"1 18\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Контрольная выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(660, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"bedrooms\n",
"3 304\n",
"4 211\n",
"2 81\n",
"5 47\n",
"6 11\n",
"1 6\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Тестовая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(660, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"bedrooms\n",
"3 303\n",
"4 212\n",
"2 81\n",
"5 47\n",
"6 11\n",
"1 6\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Вывод распределения количества наблюдений по меткам (классам)\n",
"from src.utils import split_stratified_into_train_val_test\n",
"\n",
"\n",
"display(house.bedrooms.value_counts())\n",
"display()\n",
"\n",
"data = house[[\"bedrooms\", \"sqft_living\", \"sqft_lot\"]].copy()\n",
"\n",
"df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
" data, stratify_colname=\"bedrooms\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
")\n",
"\n",
"display(\"Обучающая выборка: \", df_train.shape)\n",
"display(df_train.bedrooms.value_counts())\n",
"\n",
"display(\"Контрольная выборка: \", df_val.shape)\n",
"display(df_val.bedrooms.value_counts())\n",
"\n",
"display(\"Тестовая выборка: \", df_test.shape)\n",
"display(df_test.bedrooms.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Обучающая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(1979, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"bedrooms\n",
"3 911\n",
"4 635\n",
"2 242\n",
"5 140\n",
"6 33\n",
"1 18\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Обучающая выборка после oversampling: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(5380, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"bedrooms\n",
"2 932\n",
"1 914\n",
"3 911\n",
"6 907\n",
"5 888\n",
"4 828\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bedrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>1940</td>\n",
" <td>10035</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4</td>\n",
" <td>1920</td>\n",
" <td>4862</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4</td>\n",
" <td>2340</td>\n",
" <td>3784</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>3450</td>\n",
" <td>33460</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>2230</td>\n",
" <td>26989</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5375</th>\n",
" <td>6</td>\n",
" <td>2575</td>\n",
" <td>6858</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5376</th>\n",
" <td>6</td>\n",
" <td>2635</td>\n",
" <td>8286</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5377</th>\n",
" <td>6</td>\n",
" <td>2815</td>\n",
" <td>7930</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5378</th>\n",
" <td>6</td>\n",
" <td>2857</td>\n",
" <td>7735</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5379</th>\n",
" <td>6</td>\n",
" <td>2923</td>\n",
" <td>7315</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5380 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" bedrooms sqft_living sqft_lot\n",
"0 3 1940 10035\n",
"1 4 1920 4862\n",
"2 4 2340 3784\n",
"3 4 3450 33460\n",
"4 4 2230 26989\n",
"... ... ... ...\n",
"5375 6 2575 6858\n",
"5376 6 2635 8286\n",
"5377 6 2815 7930\n",
"5378 6 2857 7735\n",
"5379 6 2923 7315\n",
"\n",
"[5380 rows x 3 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from imblearn.over_sampling import ADASYN\n",
"\n",
"ada = ADASYN()\n",
"\n",
"display(\"Обучающая выборка: \", df_train.shape)\n",
"display(df_train.bedrooms.value_counts())\n",
"\n",
"X_resampled, y_resampled = ada.fit_resample(df_train, df_train[\"bedrooms\"]) # type: ignore\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"\n",
"display(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
"display(df_train_adasyn.bedrooms.value_counts())\n",
"\n",
"df_train_adasyn"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Обучающая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(1979, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"bedrooms\n",
"3 911\n",
"4 635\n",
"2 242\n",
"5 140\n",
"6 33\n",
"1 18\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Обучающая выборка после undersampling: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(108, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"bedrooms\n",
"1 18\n",
"2 18\n",
"3 18\n",
"4 18\n",
"5 18\n",
"6 18\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>bedrooms</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2003</th>\n",
" <td>1090</td>\n",
" <td>8750</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2922</th>\n",
" <td>780</td>\n",
" <td>10235</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>350</th>\n",
" <td>560</td>\n",
" <td>12120</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2682</th>\n",
" <td>710</td>\n",
" <td>6000</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3037</th>\n",
" <td>890</td>\n",
" <td>211576</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>832</th>\n",
" <td>2450</td>\n",
" <td>25600</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>783</th>\n",
" <td>3610</td>\n",
" <td>10003</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>486</th>\n",
" <td>4860</td>\n",
" <td>11793</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1499</th>\n",
" <td>3840</td>\n",
" <td>14040</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1302</th>\n",
" <td>3010</td>\n",
" <td>17864</td>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>108 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" sqft_living sqft_lot bedrooms\n",
"2003 1090 8750 1\n",
"2922 780 10235 1\n",
"350 560 12120 1\n",
"2682 710 6000 1\n",
"3037 890 211576 1\n",
"... ... ... ...\n",
"832 2450 25600 6\n",
"783 3610 10003 6\n",
"486 4860 11793 6\n",
"1499 3840 14040 6\n",
"1302 3010 17864 6\n",
"\n",
"[108 rows x 3 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from imblearn.under_sampling import RandomUnderSampler\n",
"import pandas as pd\n",
"\n",
"# Создание экземпляра RandomUnderSampler\n",
"rus = RandomUnderSampler()\n",
"\n",
"# Проверка исходной обучающей выборки\n",
"display(\"Обучающая выборка: \", df_train.shape)\n",
"display(df_train.bedrooms.value_counts())\n",
"\n",
"# Выполнение undersampling\n",
"X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=[\"bedrooms\"]), df_train[\"bedrooms\"]) # type: ignore\n",
"df_train_undersampled = pd.DataFrame(X_resampled)\n",
"\n",
"# Добавление целевой переменной обратно в выборку\n",
"df_train_undersampled[\"bedrooms\"] = y_resampled\n",
"\n",
"# Проверка обучающей выборки после undersampling\n",
"display(\"Обучающая выборка после undersampling: \", df_train_undersampled.shape)\n",
"display(df_train_undersampled.bedrooms.value_counts())\n",
"\n",
"df_train_undersampled"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}