PredictiveAnalytics/lab2.ipynb at main

Тукаева Альфия 595a505c62 fix lab2, lab3

2025-01-08 18:21:45 +04:00

91 KiB

Raw Permalink Blame History

In [1]:

import pandas as pd

phone = pd.read_csv("data/phone_price.csv", index_col="Id")

phone.info()

display(phone.shape)

phone.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1354 entries, 0 to 1369
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name_phone         1354 non-null   object 
 1   Rating             1354 non-null   float64
 2   Spec_score         1354 non-null   int64  
 3   No_of_sim          1354 non-null   object 
 4   Ram                1354 non-null   object 
 5   Battery            1354 non-null   object 
 6   Display            1354 non-null   object 
 7   Camera             1354 non-null   object 
 8   External_Memory    1354 non-null   object 
 9   Android_version    914 non-null    object 
 10  Price              1354 non-null   object 
 11  company            1354 non-null   object 
 12  Inbuilt_memory     1335 non-null   object 
 13  fast_charging      1267 non-null   object 
 14  Screen_resolution  1353 non-null   object 
 15  Processor          1327 non-null   object 
 16  Processor_name     1354 non-null   object 
 17  Class              1354 non-null   int64  
dtypes: float64(1), int64(2), object(15)
memory usage: 201.0+ KB

(1354, 18)

Out[1]:

	Name_phone	Rating	Spec_score	No_of_sim	Ram	Battery	Display	Camera	External_Memory	Android_version	Price	company	Inbuilt_memory	fast_charging	Screen_resolution	Processor	Processor_name	Class
Id
0	Samsung Galaxy F14 5G	4.65	68	Dual Sim, 3G, 4G, 5G, VoLTE,	4 GB RAM	6000 mAh Battery	6.6 inches	50 MP + 2 MP Dual Rear & 13 MP Front Camera	Memory Card Supported, upto 1 TB	13	9,999	Samsung	128 GB inbuilt	25W Fast Charging	2408 x 1080 px Display with Water Drop Notch	Octa Core Processor	Exynos 1330	2
1	Samsung Galaxy A11	4.20	63	Dual Sim, 3G, 4G, VoLTE,	2 GB RAM	4000 mAh Battery	6.4 inches	13 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro...	Memory Card Supported, upto 512 GB	10	9,990	Samsung	32 GB inbuilt	15W Fast Charging	720 x 1560 px Display with Punch Hole	1.8 GHz Processor	Octa Core	2
3	Samsung Galaxy F23	4.10	73	Dual Sim, 3G, 4G, VoLTE,	4 GB RAM	6000 mAh Battery	6.4 inches	48 MP Quad Rear & 13 MP Front Camera	Memory Card Supported, upto 1 TB	12	11,999	Samsung	64 GB inbuilt	NaN	720 x 1600 px	Octa Core	Helio G88	1
4	Samsung Galaxy A03s (4GB RAM + 64GB)	4.10	69	Dual Sim, 3G, 4G, VoLTE,	4 GB RAM	5000 mAh Battery	6.5 inches	13 MP + 2 MP + 2 MP Triple Rear & 5 MP Fro...	Memory Card Supported, upto 1 TB	11	11,999	Samsung	64 GB inbuilt	15W Fast Charging	720 x 1600 px Display with Water Drop Notch	Octa Core	Helio P35	2
5	Samsung Galaxy M13 5G	4.40	75	Dual Sim, 3G, 4G, 5G, VoLTE,	6 GB RAM	5000 mAh Battery	6.5 inches	50 MP + 2 MP Dual Rear & 5 MP Front Camera	Memory Card Supported, upto 1 TB	12	11,990	Samsung	128 GB inbuilt	15W Fast Charging	720 x 1600 px	Octa Core	Dimensity 700	1

In [2]:

car = pd.read_csv("data/car_price.csv")

car.info()

display(car.shape)

car.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19227 entries, 0 to 19226
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        19227 non-null  int64  
 1   ID                19227 non-null  int64  
 2   Price             19227 non-null  int64  
 3   Levy              19227 non-null  object 
 4   Manufacturer      19227 non-null  object 
 5   Model             19227 non-null  object 
 6   ProdYear          19227 non-null  int64  
 7   Category          19227 non-null  object 
 8   Leather interior  19227 non-null  object 
 9   Fuel type         19227 non-null  object 
 10  Engine volume     19227 non-null  object 
 11  Mileage           19227 non-null  object 
 12  Cylinders         19227 non-null  float64
 13  Gear box type     19227 non-null  object 
 14  Drive wheels      19227 non-null  object 
 15  Doors             19227 non-null  object 
 16  Wheel             19227 non-null  object 
 17  Color             19227 non-null  object 
 18  Airbags           19227 non-null  int64  
 19  LeatherInterior1  19227 non-null  int64  
dtypes: float64(1), int64(6), object(13)
memory usage: 2.9+ MB

(19227, 20)

Out[2]:

	Unnamed: 0	ID	Price	Levy	Manufacturer	Model	ProdYear	Category	Leather interior	Fuel type	Engine volume	Mileage	Cylinders	Gear box type	Drive wheels	Doors	Wheel	Color	Airbags	LeatherInterior1
0	0	45654403	13328	1399	LEXUS	RX 450	2010	Jeep	Yes	Hybrid	3.5	186005 km	6.0	Automatic	4x4	04-May	Left wheel	Silver	12	1
1	1	44731507	16621	1018	CHEVROLET	Equinox	2011	Jeep	No	Petrol	3	192000 km	6.0	Tiptronic	4x4	04-May	Left wheel	Black	8	0
2	2	45774419	8467	-	HONDA	FIT	2006	Hatchback	No	Petrol	1.3	200000 km	4.0	Variator	Front	04-May	Right-hand drive	Black	2	0
3	3	45769185	3607	862	FORD	Escape	2011	Jeep	Yes	Hybrid	2.5	168966 km	4.0	Automatic	4x4	04-May	Left wheel	White	0	1
4	4	45809263	11726	446	HONDA	FIT	2014	Hatchback	Yes	Petrol	1.3	91901 km	4.0	Automatic	Front	04-May	Left wheel	Silver	4	1

In [3]:

house = pd.read_csv("data/house_data.csv")

house.info()

display(house.shape)

house.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3299 entries, 0 to 3298
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             3299 non-null   int64  
 1   date           3299 non-null   object 
 2   price          3299 non-null   float64
 3   bedrooms       3299 non-null   int64  
 4   bathrooms      3299 non-null   float64
 5   sqft_living    3299 non-null   int64  
 6   sqft_lot       3299 non-null   int64  
 7   floors         3299 non-null   float64
 8   waterfront     3299 non-null   int64  
 9   view           3299 non-null   int64  
 10  condition      3299 non-null   int64  
 11  grade          3299 non-null   int64  
 12  sqft_above     3299 non-null   int64  
 13  sqft_basement  3299 non-null   int64  
 14  yr_built       3299 non-null   int64  
 15  yr_renovated   3299 non-null   int64  
 16  zipcode        3299 non-null   int64  
 17  lat            3299 non-null   float64
 18  long           3299 non-null   float64
 19  sqft_living15  3299 non-null   int64  
 20  sqft_lot15     3299 non-null   int64  
dtypes: float64(5), int64(15), object(1)
memory usage: 541.4+ KB

(3299, 21)

Out[3]:

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	...	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	7129300520	20141013T000000	221900.0	3	1.00	1180	5650	1.0	...	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	...	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
2	5631500400	20150225T000000	180000.0	2	1.00	770	10000	1.0	...	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	...	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	...	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503

5 rows × 21 columns

In [4]:

# Количество пустых значений признаков
display(phone.isnull().sum())
display()

# Есть ли пустые значения признаков
display(phone.isnull().any())
display()

# Процент пустых значений признаков
for i in phone.columns:
    null_rate = phone[i].isnull().sum() / len(phone) * 100
    if null_rate > 0:
        display(f"{i} процент пустых значений: %{null_rate:.2f}")

Name_phone             0
Rating                 0
Spec_score             0
No_of_sim              0
Ram                    0
Battery                0
Display                0
Camera                 0
External_Memory        0
Android_version      440
Price                  0
company                0
Inbuilt_memory        19
fast_charging         87
Screen_resolution      1
Processor             27
Processor_name         0
Class                  0
dtype: int64

Name_phone           False
Rating               False
Spec_score           False
No_of_sim            False
Ram                  False
Battery              False
Display              False
Camera               False
External_Memory      False
Android_version       True
Price                False
company              False
Inbuilt_memory        True
fast_charging         True
Screen_resolution     True
Processor             True
Processor_name       False
Class                False
dtype: bool

'Android_version процент пустых значений: %32.50'

'Inbuilt_memory процент пустых значений: %1.40'

'fast_charging процент пустых значений: %6.43'

'Screen_resolution процент пустых значений: %0.07'

'Processor процент пустых значений: %1.99'

In [5]:

# Количество пустых значений признаков
display(car.isnull().sum())
display()

# Есть ли пустые значения признаков
display(car.isnull().any())
display()

# Процент пустых значений признаков
for i in car.columns:
    null_rate = car[i].isnull().sum() / len(car) * 100
    if null_rate > 0:
        display(f"{i} процент пустых значений: %{null_rate:.2f}")

Unnamed: 0          0
ID                  0
Price               0
Levy                0
Manufacturer        0
Model               0
ProdYear            0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
LeatherInterior1    0
dtype: int64

Unnamed: 0          False
ID                  False
Price               False
Levy                False
Manufacturer        False
Model               False
ProdYear            False
Category            False
Leather interior    False
Fuel type           False
Engine volume       False
Mileage             False
Cylinders           False
Gear box type       False
Drive wheels        False
Doors               False
Wheel               False
Color               False
Airbags             False
LeatherInterior1    False
dtype: bool

In [6]:

# Количество пустых значений признаков
display(house.isnull().sum())
display()

# Есть ли пустые значения признаков
display(house.isnull().any())
display()

# Процент пустых значений признаков
for i in house.columns:
    null_rate = house[i].isnull().sum() / len(house) * 100
    if null_rate > 0:
        display(f"{i} процент пустых значений: %{null_rate:.2f}")

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

id               False
date             False
price            False
bedrooms         False
bathrooms        False
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
dtype: bool

В 1 наборе данных phone имеются пустые значения, в остальных они отсутствуют. Заполним пустые значения из 1 датасета.

In [7]:

# Замена пустых данных на моду (часто встречающееся значение)
mode_Android = phone['Android_version'].mode()  
phone.fillna({'Android_version':mode_Android}, inplace=True)

Inbuilt_memory = phone['Inbuilt_memory'].mode()  
phone.fillna({'Inbuilt_memory':Inbuilt_memory}, inplace=True)

fillna_df = phone.fillna(0)

dropna_df = phone.dropna()

display(dropna_df.shape)

display(fillna_df.isnull().any())

phone.tail()

(805, 18)

Name_phone           False
Rating               False
Spec_score           False
No_of_sim            False
Ram                  False
Battery              False
Display              False
Camera               False
External_Memory      False
Android_version      False
Price                False
company              False
Inbuilt_memory       False
fast_charging        False
Screen_resolution    False
Processor            False
Processor_name       False
Class                False
dtype: bool

Out[7]:

	Name_phone	Rating	Spec_score	No_of_sim	Ram	Battery	Display	Camera	External_Memory	Android_version	Price	company	Inbuilt_memory	fast_charging	Screen_resolution	Processor	Processor_name	Class
Id
1365	TCL 40R	4.05	75	Dual Sim, 3G, 4G, 5G, VoLTE,	4 GB RAM	5000 mAh Battery	6.6 inches	50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro...	Memory Card (Hybrid)	12	18,999	TCL	64 GB inbuilt	15W Fast Charging	720 x 1612 px	Octa Core	Dimensity 700 5G	2
1366	TCL 50 XL NxtPaper 5G	4.10	80	Dual Sim, 3G, 4G, VoLTE,	8 GB RAM	5000 mAh Battery	6.8 inches	50 MP + 2 MP Dual Rear & 16 MP Front Camera	Memory Card (Hybrid)	14	24,990	TCL	128 GB inbuilt	33W Fast Charging	1200 x 2400 px	Octa Core	Dimensity 7050	1
1367	TCL 50 XE NxtPaper 5G	4.00	80	Dual Sim, 3G, 4G, 5G, VoLTE,	6 GB RAM	5000 mAh Battery	6.6 inches	50 MP + 2 MP Dual Rear & 16 MP Front Camera	Memory Card Supported, upto 1 TB	13	23,990	TCL	256 GB inbuilt	18W Fast Charging	720 x 1612 px	Octa Core	Dimensity 6080	2
1368	TCL 40 NxtPaper 5G	4.50	79	Dual Sim, 3G, 4G, 5G, VoLTE,	6 GB RAM	5000 mAh Battery	6.6 inches	50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro...	Memory Card Supported, upto 1 TB	13	22,499	TCL	256 GB inbuilt	15W Fast Charging	720 x 1612 px	Octa Core	Dimensity 6020	2
1369	TCL Trifold	4.65	93	Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G,	12 GB RAM	4600 mAh Battery	10 inches	Foldable Display, Dual Display	50 MP + 48 MP + 8 MP Triple Rear & 32 MP F...	13	1,19,990	TCL	256 GB inbuilt	67W Fast Charging	1916 x 2160 px	Octa Core	Snapdragon 8 Gen2	2

In [8]:

# Вывод распределения количества наблюдений по меткам (классам)
from src.utils import split_stratified_into_train_val_test


display(phone.Class.value_counts())
display()

data = phone[["Class", "Spec_score", "Rating"]].copy()

df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
   data, stratify_colname="Class", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

display("Обучающая выборка: ", df_train.shape)
display(df_train.Class.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.Class.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.Class.value_counts())

Class
1    718
2    636
Name: count, dtype: int64

'Обучающая выборка: '

(812, 3)

Class
1    431
2    381
Name: count, dtype: int64

'Контрольная выборка: '

(271, 3)

Class
1    143
2    128
Name: count, dtype: int64

'Тестовая выборка: '

(271, 3)

Class
1    144
2    127
Name: count, dtype: int64

In [9]:

from imblearn.over_sampling import ADASYN

ada = ADASYN()

display("Обучающая выборка: ", df_train.shape)
display(df_train.Class.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["Class"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

display("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
display(df_train_adasyn.Class.value_counts())

df_train_adasyn

'Обучающая выборка: '

(812, 3)

Class
1    431
2    381
Name: count, dtype: int64

'Обучающая выборка после oversampling: '

(856, 3)

Class
1    431
2    425
Name: count, dtype: int64

Out[9]:

	Class	Spec_score	Rating
0	1	90	4.550000
1	1	80	4.400000
2	1	80	4.650000
3	1	75	4.500000
4	2	92	4.650000
...	...	...	...
851	2	96	4.061287
852	2	95	4.191740
853	2	73	4.612413
854	2	94	4.193295
855	2	92	4.650000

856 rows × 3 columns

In [10]:

from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Создание экземпляра RandomUnderSampler
rus = RandomUnderSampler()

# Проверка исходной обучающей выборки
display("Обучающая выборка: ", df_train.shape)
display(df_train.Class.value_counts())

# Выполнение undersampling
X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=["Class"]), df_train["Class"])  # type: ignore
df_train_undersampled = pd.DataFrame(X_resampled)

# Добавление целевой переменной обратно в выборку
df_train_undersampled["Class"] = y_resampled

# Проверка обучающей выборки после undersampling
display("Обучающая выборка после undersampling: ", df_train_undersampled.shape)
display(df_train_undersampled.Class.value_counts())

df_train_undersampled

'Обучающая выборка: '

(812, 3)

Class
1    431
2    381
Name: count, dtype: int64

'Обучающая выборка после undersampling: '

(762, 3)

Class
1    381
2    381
Name: count, dtype: int64

Out[10]:

	Spec_score	Rating	Class
Id
468	69	4.45	1
1093	86	4.45	1
1102	89	4.00	1
422	70	4.15	1
1192	81	4.00	1
...	...	...	...
298	75	4.70	2
249	75	4.25	2
1341	66	4.25	2
923	88	4.15	2
301	80	4.50	2

762 rows × 3 columns

In [11]:

# Вывод распределения количества наблюдений по меткам (классам)
from src.utils import split_stratified_into_train_val_test


display(car.LeatherInterior1.value_counts())
display()

data = car[["LeatherInterior1", "Airbags", "ProdYear"]].copy()

df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
   data, stratify_colname="LeatherInterior1", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

display("Обучающая выборка: ", df_train.shape)
display(df_train.LeatherInterior1.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.LeatherInterior1.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.LeatherInterior1.value_counts())

LeatherInterior1
1    13949
0     5278
Name: count, dtype: int64

'Обучающая выборка: '

(11536, 3)

LeatherInterior1
1    8369
0    3167
Name: count, dtype: int64

'Контрольная выборка: '

(3845, 3)

LeatherInterior1
1    2790
0    1055
Name: count, dtype: int64

'Тестовая выборка: '

(3846, 3)

LeatherInterior1
1    2790
0    1056
Name: count, dtype: int64

In [12]:

from imblearn.over_sampling import ADASYN

ada = ADASYN()

display("Обучающая выборка: ", df_train.shape)
display(df_train.LeatherInterior1.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["LeatherInterior1"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

display("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
display(df_train_adasyn.LeatherInterior1.value_counts())

df_train_adasyn

'Обучающая выборка: '

(11536, 3)

LeatherInterior1
1    8369
0    3167
Name: count, dtype: int64

'Обучающая выборка после oversampling: '

(16728, 3)

LeatherInterior1
1    8369
0    8359
Name: count, dtype: int64

Out[12]:

	LeatherInterior1	Airbags	ProdYear
0	0	16	2011
1	1	10	2017
2	1	4	2018
3	1	0	2011
4	1	6	2010
...	...	...	...
16723	0	12	2004
16724	0	12	2004
16725	0	12	2004
16726	0	12	2004
16727	0	12	2003

16728 rows × 3 columns

In [13]:

from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Создание экземпляра RandomUnderSampler
rus = RandomUnderSampler()

# Проверка исходной обучающей выборки
display("Обучающая выборка: ", df_train.shape)
display(df_train.LeatherInterior1.value_counts())

# Выполнение undersampling
X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=["LeatherInterior1"]), df_train["LeatherInterior1"])  # type: ignore
df_train_undersampled = pd.DataFrame(X_resampled)

# Добавление целевой переменной обратно в выборку
df_train_undersampled["LeatherInterior1"] = y_resampled

# Проверка обучающей выборки после undersampling
display("Обучающая выборка после undersampling: ", df_train_undersampled.shape)
display(df_train_undersampled.LeatherInterior1.value_counts())

df_train_undersampled

'Обучающая выборка: '

(11536, 3)

LeatherInterior1
1    8369
0    3167
Name: count, dtype: int64

'Обучающая выборка после undersampling: '

(6334, 3)

LeatherInterior1
0    3167
1    3167
Name: count, dtype: int64

Out[13]:

	Airbags	ProdYear	LeatherInterior1
1315	16	2011	0
1569	0	2014	0
6317	6	2006	0
626	4	2000	0
12439	12	2012	0
...	...	...	...
3361	12	2012	1
3286	4	2017	1
17666	4	2015	1
4902	0	2012	1
4613	4	2014	1

6334 rows × 3 columns

In [14]:

# Вывод распределения количества наблюдений по меткам (классам)
from src.utils import split_stratified_into_train_val_test


display(house.bedrooms.value_counts())
display()

data = house[["bedrooms", "sqft_living", "sqft_lot"]].copy()

df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
   data, stratify_colname="bedrooms", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

display("Обучающая выборка: ", df_train.shape)
display(df_train.bedrooms.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.bedrooms.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.bedrooms.value_counts())

bedrooms
3    1518
4    1058
2     404
5     234
6      55
1      30
Name: count, dtype: int64

'Обучающая выборка: '

(1979, 3)

bedrooms
3    911
4    635
2    242
5    140
6     33
1     18
Name: count, dtype: int64

'Контрольная выборка: '

(660, 3)

bedrooms
3    304
4    211
2     81
5     47
6     11
1      6
Name: count, dtype: int64

'Тестовая выборка: '

(660, 3)

bedrooms
3    303
4    212
2     81
5     47
6     11
1      6
Name: count, dtype: int64

In [15]:

from imblearn.over_sampling import ADASYN

ada = ADASYN()

display("Обучающая выборка: ", df_train.shape)
display(df_train.bedrooms.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["bedrooms"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

display("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
display(df_train_adasyn.bedrooms.value_counts())

df_train_adasyn

'Обучающая выборка: '

(1979, 3)

bedrooms
3    911
4    635
2    242
5    140
6     33
1     18
Name: count, dtype: int64

'Обучающая выборка после oversampling: '

(5380, 3)

bedrooms
2    932
1    914
3    911
6    907
5    888
4    828
Name: count, dtype: int64

Out[15]:

	bedrooms	sqft_living	sqft_lot
0	3	1940	10035
1	4	1920	4862
2	4	2340	3784
3	4	3450	33460
4	4	2230	26989
...	...	...	...
5375	6	2575	6858
5376	6	2635	8286
5377	6	2815	7930
5378	6	2857	7735
5379	6	2923	7315

5380 rows × 3 columns

In [16]:

from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Создание экземпляра RandomUnderSampler
rus = RandomUnderSampler()

# Проверка исходной обучающей выборки
display("Обучающая выборка: ", df_train.shape)
display(df_train.bedrooms.value_counts())

# Выполнение undersampling
X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=["bedrooms"]), df_train["bedrooms"])  # type: ignore
df_train_undersampled = pd.DataFrame(X_resampled)

# Добавление целевой переменной обратно в выборку
df_train_undersampled["bedrooms"] = y_resampled

# Проверка обучающей выборки после undersampling
display("Обучающая выборка после undersampling: ", df_train_undersampled.shape)
display(df_train_undersampled.bedrooms.value_counts())

df_train_undersampled

'Обучающая выборка: '

(1979, 3)

bedrooms
3    911
4    635
2    242
5    140
6     33
1     18
Name: count, dtype: int64

'Обучающая выборка после undersampling: '

(108, 3)

bedrooms
1    18
2    18
3    18
4    18
5    18
6    18
Name: count, dtype: int64

Out[16]:

	sqft_living	sqft_lot	bedrooms
2003	1090	8750	1
2922	780	10235	1
350	560	12120	1
2682	710	6000	1
3037	890	211576	1
...	...	...	...
832	2450	25600	6
783	3610	10003	6
486	4860	11793	6
1499	3840	14040	6
1302	3010	17864	6

108 rows × 3 columns

91 KiB Raw Permalink Blame History Unescape Escape

91 KiB

Raw Permalink Blame History