PredictiveAnalytics/lab2.ipynb

91 KiB
Raw Permalink Blame History

In [1]:
import pandas as pd

phone = pd.read_csv("data/phone_price.csv", index_col="Id")

phone.info()

display(phone.shape)

phone.head()
<class 'pandas.core.frame.DataFrame'>
Index: 1354 entries, 0 to 1369
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name_phone         1354 non-null   object 
 1   Rating             1354 non-null   float64
 2   Spec_score         1354 non-null   int64  
 3   No_of_sim          1354 non-null   object 
 4   Ram                1354 non-null   object 
 5   Battery            1354 non-null   object 
 6   Display            1354 non-null   object 
 7   Camera             1354 non-null   object 
 8   External_Memory    1354 non-null   object 
 9   Android_version    914 non-null    object 
 10  Price              1354 non-null   object 
 11  company            1354 non-null   object 
 12  Inbuilt_memory     1335 non-null   object 
 13  fast_charging      1267 non-null   object 
 14  Screen_resolution  1353 non-null   object 
 15  Processor          1327 non-null   object 
 16  Processor_name     1354 non-null   object 
 17  Class              1354 non-null   int64  
dtypes: float64(1), int64(2), object(15)
memory usage: 201.0+ KB
(1354, 18)
Out[1]:
Name_phone Rating Spec_score No_of_sim Ram Battery Display Camera External_Memory Android_version Price company Inbuilt_memory fast_charging Screen_resolution Processor Processor_name Class
Id
0 Samsung Galaxy F14 5G 4.65 68 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 6000 mAh Battery 6.6 inches 50 MP + 2 MP Dual Rear &amp; 13 MP Front Camera Memory Card Supported, upto 1 TB 13 9,999 Samsung 128 GB inbuilt 25W Fast Charging 2408 x 1080 px Display with Water Drop Notch Octa Core Processor Exynos 1330 2
1 Samsung Galaxy A11 4.20 63 Dual Sim, 3G, 4G, VoLTE, 2 GB RAM 4000 mAh Battery 6.4 inches 13 MP + 5 MP + 2 MP Triple Rear &amp; 8 MP Fro... Memory Card Supported, upto 512 GB 10 9,990 Samsung 32 GB inbuilt 15W Fast Charging 720 x 1560 px Display with Punch Hole 1.8 GHz Processor Octa Core 2
3 Samsung Galaxy F23 4.10 73 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 6000 mAh Battery 6.4 inches 48 MP Quad Rear &amp; 13 MP Front Camera Memory Card Supported, upto 1 TB 12 11,999 Samsung 64 GB inbuilt NaN 720 x 1600 px Octa Core Helio G88 1
4 Samsung Galaxy A03s (4GB RAM + 64GB) 4.10 69 Dual Sim, 3G, 4G, VoLTE, 4 GB RAM 5000 mAh Battery 6.5 inches 13 MP + 2 MP + 2 MP Triple Rear &amp; 5 MP Fro... Memory Card Supported, upto 1 TB 11 11,999 Samsung 64 GB inbuilt 15W Fast Charging 720 x 1600 px Display with Water Drop Notch Octa Core Helio P35 2
5 Samsung Galaxy M13 5G 4.40 75 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.5 inches 50 MP + 2 MP Dual Rear &amp; 5 MP Front Camera Memory Card Supported, upto 1 TB 12 11,990 Samsung 128 GB inbuilt 15W Fast Charging 720 x 1600 px Octa Core Dimensity 700 1
In [2]:
car = pd.read_csv("data/car_price.csv")

car.info()

display(car.shape)

car.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19227 entries, 0 to 19226
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        19227 non-null  int64  
 1   ID                19227 non-null  int64  
 2   Price             19227 non-null  int64  
 3   Levy              19227 non-null  object 
 4   Manufacturer      19227 non-null  object 
 5   Model             19227 non-null  object 
 6   ProdYear          19227 non-null  int64  
 7   Category          19227 non-null  object 
 8   Leather interior  19227 non-null  object 
 9   Fuel type         19227 non-null  object 
 10  Engine volume     19227 non-null  object 
 11  Mileage           19227 non-null  object 
 12  Cylinders         19227 non-null  float64
 13  Gear box type     19227 non-null  object 
 14  Drive wheels      19227 non-null  object 
 15  Doors             19227 non-null  object 
 16  Wheel             19227 non-null  object 
 17  Color             19227 non-null  object 
 18  Airbags           19227 non-null  int64  
 19  LeatherInterior1  19227 non-null  int64  
dtypes: float64(1), int64(6), object(13)
memory usage: 2.9+ MB
(19227, 20)
Out[2]:
Unnamed: 0 ID Price Levy Manufacturer Model ProdYear Category Leather interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags LeatherInterior1
0 0 45654403 13328 1399 LEXUS RX 450 2010 Jeep Yes Hybrid 3.5 186005 km 6.0 Automatic 4x4 04-May Left wheel Silver 12 1
1 1 44731507 16621 1018 CHEVROLET Equinox 2011 Jeep No Petrol 3 192000 km 6.0 Tiptronic 4x4 04-May Left wheel Black 8 0
2 2 45774419 8467 - HONDA FIT 2006 Hatchback No Petrol 1.3 200000 km 4.0 Variator Front 04-May Right-hand drive Black 2 0
3 3 45769185 3607 862 FORD Escape 2011 Jeep Yes Hybrid 2.5 168966 km 4.0 Automatic 4x4 04-May Left wheel White 0 1
4 4 45809263 11726 446 HONDA FIT 2014 Hatchback Yes Petrol 1.3 91901 km 4.0 Automatic Front 04-May Left wheel Silver 4 1
In [3]:
house = pd.read_csv("data/house_data.csv")

house.info()

display(house.shape)

house.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3299 entries, 0 to 3298
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             3299 non-null   int64  
 1   date           3299 non-null   object 
 2   price          3299 non-null   float64
 3   bedrooms       3299 non-null   int64  
 4   bathrooms      3299 non-null   float64
 5   sqft_living    3299 non-null   int64  
 6   sqft_lot       3299 non-null   int64  
 7   floors         3299 non-null   float64
 8   waterfront     3299 non-null   int64  
 9   view           3299 non-null   int64  
 10  condition      3299 non-null   int64  
 11  grade          3299 non-null   int64  
 12  sqft_above     3299 non-null   int64  
 13  sqft_basement  3299 non-null   int64  
 14  yr_built       3299 non-null   int64  
 15  yr_renovated   3299 non-null   int64  
 16  zipcode        3299 non-null   int64  
 17  lat            3299 non-null   float64
 18  long           3299 non-null   float64
 19  sqft_living15  3299 non-null   int64  
 20  sqft_lot15     3299 non-null   int64  
dtypes: float64(5), int64(15), object(1)
memory usage: 541.4+ KB
(3299, 21)
Out[3]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

5 rows × 21 columns

In [4]:
# Количество пустых значений признаков
display(phone.isnull().sum())
display()

# Есть ли пустые значения признаков
display(phone.isnull().any())
display()

# Процент пустых значений признаков
for i in phone.columns:
    null_rate = phone[i].isnull().sum() / len(phone) * 100
    if null_rate > 0:
        display(f"{i} процент пустых значений: %{null_rate:.2f}")
Name_phone             0
Rating                 0
Spec_score             0
No_of_sim              0
Ram                    0
Battery                0
Display                0
Camera                 0
External_Memory        0
Android_version      440
Price                  0
company                0
Inbuilt_memory        19
fast_charging         87
Screen_resolution      1
Processor             27
Processor_name         0
Class                  0
dtype: int64
Name_phone           False
Rating               False
Spec_score           False
No_of_sim            False
Ram                  False
Battery              False
Display              False
Camera               False
External_Memory      False
Android_version       True
Price                False
company              False
Inbuilt_memory        True
fast_charging         True
Screen_resolution     True
Processor             True
Processor_name       False
Class                False
dtype: bool
'Android_version процент пустых значений: %32.50'
'Inbuilt_memory процент пустых значений: %1.40'
'fast_charging процент пустых значений: %6.43'
'Screen_resolution процент пустых значений: %0.07'
'Processor процент пустых значений: %1.99'
In [5]:
# Количество пустых значений признаков
display(car.isnull().sum())
display()

# Есть ли пустые значения признаков
display(car.isnull().any())
display()

# Процент пустых значений признаков
for i in car.columns:
    null_rate = car[i].isnull().sum() / len(car) * 100
    if null_rate > 0:
        display(f"{i} процент пустых значений: %{null_rate:.2f}")
Unnamed: 0          0
ID                  0
Price               0
Levy                0
Manufacturer        0
Model               0
ProdYear            0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
LeatherInterior1    0
dtype: int64
Unnamed: 0          False
ID                  False
Price               False
Levy                False
Manufacturer        False
Model               False
ProdYear            False
Category            False
Leather interior    False
Fuel type           False
Engine volume       False
Mileage             False
Cylinders           False
Gear box type       False
Drive wheels        False
Doors               False
Wheel               False
Color               False
Airbags             False
LeatherInterior1    False
dtype: bool
In [6]:
# Количество пустых значений признаков
display(house.isnull().sum())
display()

# Есть ли пустые значения признаков
display(house.isnull().any())
display()

# Процент пустых значений признаков
for i in house.columns:
    null_rate = house[i].isnull().sum() / len(house) * 100
    if null_rate > 0:
        display(f"{i} процент пустых значений: %{null_rate:.2f}")
id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64
id               False
date             False
price            False
bedrooms         False
bathrooms        False
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
dtype: bool

В 1 наборе данных phone имеются пустые значения, в остальных они отсутствуют. Заполним пустые значения из 1 датасета.

In [7]:
# Замена пустых данных на моду (часто встречающееся значение)
mode_Android = phone['Android_version'].mode()  
phone.fillna({'Android_version':mode_Android}, inplace=True)

Inbuilt_memory = phone['Inbuilt_memory'].mode()  
phone.fillna({'Inbuilt_memory':Inbuilt_memory}, inplace=True)

fillna_df = phone.fillna(0)

dropna_df = phone.dropna()

display(dropna_df.shape)

display(fillna_df.isnull().any())

phone.tail()
(805, 18)
Name_phone           False
Rating               False
Spec_score           False
No_of_sim            False
Ram                  False
Battery              False
Display              False
Camera               False
External_Memory      False
Android_version      False
Price                False
company              False
Inbuilt_memory       False
fast_charging        False
Screen_resolution    False
Processor            False
Processor_name       False
Class                False
dtype: bool
Out[7]:
Name_phone Rating Spec_score No_of_sim Ram Battery Display Camera External_Memory Android_version Price company Inbuilt_memory fast_charging Screen_resolution Processor Processor_name Class
Id
1365 TCL 40R 4.05 75 Dual Sim, 3G, 4G, 5G, VoLTE, 4 GB RAM 5000 mAh Battery 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... Memory Card (Hybrid) 12 18,999 TCL 64 GB inbuilt 15W Fast Charging 720 x 1612 px Octa Core Dimensity 700 5G 2
1366 TCL 50 XL NxtPaper 5G 4.10 80 Dual Sim, 3G, 4G, VoLTE, 8 GB RAM 5000 mAh Battery 6.8 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera Memory Card (Hybrid) 14 24,990 TCL 128 GB inbuilt 33W Fast Charging 1200 x 2400 px Octa Core Dimensity 7050 1
1367 TCL 50 XE NxtPaper 5G 4.00 80 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.6 inches 50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera Memory Card Supported, upto 1 TB 13 23,990 TCL 256 GB inbuilt 18W Fast Charging 720 x 1612 px Octa Core Dimensity 6080 2
1368 TCL 40 NxtPaper 5G 4.50 79 Dual Sim, 3G, 4G, 5G, VoLTE, 6 GB RAM 5000 mAh Battery 6.6 inches 50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro... Memory Card Supported, upto 1 TB 13 22,499 TCL 256 GB inbuilt 15W Fast Charging 720 x 1612 px Octa Core Dimensity 6020 2
1369 TCL Trifold 4.65 93 Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, 12 GB RAM 4600 mAh Battery 10 inches Foldable Display, Dual Display 50 MP + 48 MP + 8 MP Triple Rear &amp; 32 MP F... 13 1,19,990 TCL 256 GB inbuilt 67W Fast Charging 1916 x 2160 px Octa Core Snapdragon 8 Gen2 2
In [8]:
# Вывод распределения количества наблюдений по меткам (классам)
from src.utils import split_stratified_into_train_val_test


display(phone.Class.value_counts())
display()

data = phone[["Class", "Spec_score", "Rating"]].copy()

df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
   data, stratify_colname="Class", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

display("Обучающая выборка: ", df_train.shape)
display(df_train.Class.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.Class.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.Class.value_counts())
Class
1    718
2    636
Name: count, dtype: int64
'Обучающая выборка: '
(812, 3)
Class
1    431
2    381
Name: count, dtype: int64
'Контрольная выборка: '
(271, 3)
Class
1    143
2    128
Name: count, dtype: int64
'Тестовая выборка: '
(271, 3)
Class
1    144
2    127
Name: count, dtype: int64
In [9]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

display("Обучающая выборка: ", df_train.shape)
display(df_train.Class.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["Class"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

display("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
display(df_train_adasyn.Class.value_counts())

df_train_adasyn
'Обучающая выборка: '
(812, 3)
Class
1    431
2    381
Name: count, dtype: int64
'Обучающая выборка после oversampling: '
(856, 3)
Class
1    431
2    425
Name: count, dtype: int64
Out[9]:
Class Spec_score Rating
0 1 90 4.550000
1 1 80 4.400000
2 1 80 4.650000
3 1 75 4.500000
4 2 92 4.650000
... ... ... ...
851 2 96 4.061287
852 2 95 4.191740
853 2 73 4.612413
854 2 94 4.193295
855 2 92 4.650000

856 rows × 3 columns

In [10]:
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Создание экземпляра RandomUnderSampler
rus = RandomUnderSampler()

# Проверка исходной обучающей выборки
display("Обучающая выборка: ", df_train.shape)
display(df_train.Class.value_counts())

# Выполнение undersampling
X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=["Class"]), df_train["Class"])  # type: ignore
df_train_undersampled = pd.DataFrame(X_resampled)

# Добавление целевой переменной обратно в выборку
df_train_undersampled["Class"] = y_resampled

# Проверка обучающей выборки после undersampling
display("Обучающая выборка после undersampling: ", df_train_undersampled.shape)
display(df_train_undersampled.Class.value_counts())

df_train_undersampled
'Обучающая выборка: '
(812, 3)
Class
1    431
2    381
Name: count, dtype: int64
'Обучающая выборка после undersampling: '
(762, 3)
Class
1    381
2    381
Name: count, dtype: int64
Out[10]:
Spec_score Rating Class
Id
468 69 4.45 1
1093 86 4.45 1
1102 89 4.00 1
422 70 4.15 1
1192 81 4.00 1
... ... ... ...
298 75 4.70 2
249 75 4.25 2
1341 66 4.25 2
923 88 4.15 2
301 80 4.50 2

762 rows × 3 columns

In [11]:
# Вывод распределения количества наблюдений по меткам (классам)
from src.utils import split_stratified_into_train_val_test


display(car.LeatherInterior1.value_counts())
display()

data = car[["LeatherInterior1", "Airbags", "ProdYear"]].copy()

df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
   data, stratify_colname="LeatherInterior1", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

display("Обучающая выборка: ", df_train.shape)
display(df_train.LeatherInterior1.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.LeatherInterior1.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.LeatherInterior1.value_counts())
LeatherInterior1
1    13949
0     5278
Name: count, dtype: int64
'Обучающая выборка: '
(11536, 3)
LeatherInterior1
1    8369
0    3167
Name: count, dtype: int64
'Контрольная выборка: '
(3845, 3)
LeatherInterior1
1    2790
0    1055
Name: count, dtype: int64
'Тестовая выборка: '
(3846, 3)
LeatherInterior1
1    2790
0    1056
Name: count, dtype: int64
In [12]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

display("Обучающая выборка: ", df_train.shape)
display(df_train.LeatherInterior1.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["LeatherInterior1"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

display("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
display(df_train_adasyn.LeatherInterior1.value_counts())

df_train_adasyn
'Обучающая выборка: '
(11536, 3)
LeatherInterior1
1    8369
0    3167
Name: count, dtype: int64
'Обучающая выборка после oversampling: '
(16728, 3)
LeatherInterior1
1    8369
0    8359
Name: count, dtype: int64
Out[12]:
LeatherInterior1 Airbags ProdYear
0 0 16 2011
1 1 10 2017
2 1 4 2018
3 1 0 2011
4 1 6 2010
... ... ... ...
16723 0 12 2004
16724 0 12 2004
16725 0 12 2004
16726 0 12 2004
16727 0 12 2003

16728 rows × 3 columns

In [13]:
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Создание экземпляра RandomUnderSampler
rus = RandomUnderSampler()

# Проверка исходной обучающей выборки
display("Обучающая выборка: ", df_train.shape)
display(df_train.LeatherInterior1.value_counts())

# Выполнение undersampling
X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=["LeatherInterior1"]), df_train["LeatherInterior1"])  # type: ignore
df_train_undersampled = pd.DataFrame(X_resampled)

# Добавление целевой переменной обратно в выборку
df_train_undersampled["LeatherInterior1"] = y_resampled

# Проверка обучающей выборки после undersampling
display("Обучающая выборка после undersampling: ", df_train_undersampled.shape)
display(df_train_undersampled.LeatherInterior1.value_counts())

df_train_undersampled
'Обучающая выборка: '
(11536, 3)
LeatherInterior1
1    8369
0    3167
Name: count, dtype: int64
'Обучающая выборка после undersampling: '
(6334, 3)
LeatherInterior1
0    3167
1    3167
Name: count, dtype: int64
Out[13]:
Airbags ProdYear LeatherInterior1
1315 16 2011 0
1569 0 2014 0
6317 6 2006 0
626 4 2000 0
12439 12 2012 0
... ... ... ...
3361 12 2012 1
3286 4 2017 1
17666 4 2015 1
4902 0 2012 1
4613 4 2014 1

6334 rows × 3 columns

In [14]:
# Вывод распределения количества наблюдений по меткам (классам)
from src.utils import split_stratified_into_train_val_test


display(house.bedrooms.value_counts())
display()

data = house[["bedrooms", "sqft_living", "sqft_lot"]].copy()

df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
   data, stratify_colname="bedrooms", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

display("Обучающая выборка: ", df_train.shape)
display(df_train.bedrooms.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.bedrooms.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.bedrooms.value_counts())
bedrooms
3    1518
4    1058
2     404
5     234
6      55
1      30
Name: count, dtype: int64
'Обучающая выборка: '
(1979, 3)
bedrooms
3    911
4    635
2    242
5    140
6     33
1     18
Name: count, dtype: int64
'Контрольная выборка: '
(660, 3)
bedrooms
3    304
4    211
2     81
5     47
6     11
1      6
Name: count, dtype: int64
'Тестовая выборка: '
(660, 3)
bedrooms
3    303
4    212
2     81
5     47
6     11
1      6
Name: count, dtype: int64
In [15]:
from imblearn.over_sampling import ADASYN

ada = ADASYN()

display("Обучающая выборка: ", df_train.shape)
display(df_train.bedrooms.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["bedrooms"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

display("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
display(df_train_adasyn.bedrooms.value_counts())

df_train_adasyn
'Обучающая выборка: '
(1979, 3)
bedrooms
3    911
4    635
2    242
5    140
6     33
1     18
Name: count, dtype: int64
'Обучающая выборка после oversampling: '
(5380, 3)
bedrooms
2    932
1    914
3    911
6    907
5    888
4    828
Name: count, dtype: int64
Out[15]:
bedrooms sqft_living sqft_lot
0 3 1940 10035
1 4 1920 4862
2 4 2340 3784
3 4 3450 33460
4 4 2230 26989
... ... ... ...
5375 6 2575 6858
5376 6 2635 8286
5377 6 2815 7930
5378 6 2857 7735
5379 6 2923 7315

5380 rows × 3 columns

In [16]:
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Создание экземпляра RandomUnderSampler
rus = RandomUnderSampler()

# Проверка исходной обучающей выборки
display("Обучающая выборка: ", df_train.shape)
display(df_train.bedrooms.value_counts())

# Выполнение undersampling
X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=["bedrooms"]), df_train["bedrooms"])  # type: ignore
df_train_undersampled = pd.DataFrame(X_resampled)

# Добавление целевой переменной обратно в выборку
df_train_undersampled["bedrooms"] = y_resampled

# Проверка обучающей выборки после undersampling
display("Обучающая выборка после undersampling: ", df_train_undersampled.shape)
display(df_train_undersampled.bedrooms.value_counts())

df_train_undersampled
'Обучающая выборка: '
(1979, 3)
bedrooms
3    911
4    635
2    242
5    140
6     33
1     18
Name: count, dtype: int64
'Обучающая выборка после undersampling: '
(108, 3)
bedrooms
1    18
2    18
3    18
4    18
5    18
6    18
Name: count, dtype: int64
Out[16]:
sqft_living sqft_lot bedrooms
2003 1090 8750 1
2922 780 10235 1
350 560 12120 1
2682 710 6000 1
3037 890 211576 1
... ... ... ...
832 2450 25600 6
783 3610 10003 6
486 4860 11793 6
1499 3840 14040 6
1302 3010 17864 6

108 rows × 3 columns