PredictiveAnalytics/lab2.ipynb at cc642627db633388859bd1ea1dc7a80b2e7ebded

2025-01-06 23:37:51 +04:00

96 KiB

Raw Blame History

In [228]:

import pandas as pd

phone = pd.read_csv("data/phone_price.csv", index_col="Id")

phone.info()

display(phone.shape)

phone.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1354 entries, 0 to 1369
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name_phone         1354 non-null   object 
 1   Rating             1354 non-null   float64
 2   Spec_score         1354 non-null   int64  
 3   No_of_sim          1354 non-null   object 
 4   Ram                1354 non-null   object 
 5   Battery            1354 non-null   object 
 6   Display            1354 non-null   object 
 7   Camera             1354 non-null   object 
 8   External_Memory    1354 non-null   object 
 9   Android_version    914 non-null    object 
 10  Price              1354 non-null   object 
 11  company            1354 non-null   object 
 12  Inbuilt_memory     1335 non-null   object 
 13  fast_charging      1267 non-null   object 
 14  Screen_resolution  1353 non-null   object 
 15  Processor          1327 non-null   object 
 16  Processor_name     1354 non-null   object 
 17  Class              1354 non-null   int64  
dtypes: float64(1), int64(2), object(15)
memory usage: 201.0+ KB

(1354, 18)

Out[228]:

	Name_phone	Rating	Spec_score	No_of_sim	Ram	Battery	Display	Camera	External_Memory	Android_version	Price	company	Inbuilt_memory	fast_charging	Screen_resolution	Processor	Processor_name	Class
Id
0	Samsung Galaxy F14 5G	4.65	68	Dual Sim, 3G, 4G, 5G, VoLTE,	4 GB RAM	6000 mAh Battery	6.6 inches	50 MP + 2 MP Dual Rear & 13 MP Front Camera	Memory Card Supported, upto 1 TB	13	9,999	Samsung	128 GB inbuilt	25W Fast Charging	2408 x 1080 px Display with Water Drop Notch	Octa Core Processor	Exynos 1330	2
1	Samsung Galaxy A11	4.20	63	Dual Sim, 3G, 4G, VoLTE,	2 GB RAM	4000 mAh Battery	6.4 inches	13 MP + 5 MP + 2 MP Triple Rear & 8 MP Fro...	Memory Card Supported, upto 512 GB	10	9,990	Samsung	32 GB inbuilt	15W Fast Charging	720 x 1560 px Display with Punch Hole	1.8 GHz Processor	Octa Core	2
3	Samsung Galaxy F23	4.10	73	Dual Sim, 3G, 4G, VoLTE,	4 GB RAM	6000 mAh Battery	6.4 inches	48 MP Quad Rear & 13 MP Front Camera	Memory Card Supported, upto 1 TB	12	11,999	Samsung	64 GB inbuilt	NaN	720 x 1600 px	Octa Core	Helio G88	1
4	Samsung Galaxy A03s (4GB RAM + 64GB)	4.10	69	Dual Sim, 3G, 4G, VoLTE,	4 GB RAM	5000 mAh Battery	6.5 inches	13 MP + 2 MP + 2 MP Triple Rear & 5 MP Fro...	Memory Card Supported, upto 1 TB	11	11,999	Samsung	64 GB inbuilt	15W Fast Charging	720 x 1600 px Display with Water Drop Notch	Octa Core	Helio P35	2
5	Samsung Galaxy M13 5G	4.40	75	Dual Sim, 3G, 4G, 5G, VoLTE,	6 GB RAM	5000 mAh Battery	6.5 inches	50 MP + 2 MP Dual Rear & 5 MP Front Camera	Memory Card Supported, upto 1 TB	12	11,990	Samsung	128 GB inbuilt	15W Fast Charging	720 x 1600 px	Octa Core	Dimensity 700	1

In [229]:

car = pd.read_csv("data/car_price.csv")

car.info()

display(car.shape)

car.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19227 entries, 0 to 19226
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        19227 non-null  int64  
 1   ID                19227 non-null  int64  
 2   Price             19227 non-null  int64  
 3   Levy              19227 non-null  object 
 4   Manufacturer      19227 non-null  object 
 5   Model             19227 non-null  object 
 6   ProdYear          19227 non-null  int64  
 7   Category          19227 non-null  object 
 8   Leather interior  19227 non-null  object 
 9   Fuel type         19227 non-null  object 
 10  Engine volume     19227 non-null  object 
 11  Mileage           19227 non-null  object 
 12  Cylinders         19227 non-null  float64
 13  Gear box type     19227 non-null  object 
 14  Drive wheels      19227 non-null  object 
 15  Doors             19227 non-null  object 
 16  Wheel             19227 non-null  object 
 17  Color             19227 non-null  object 
 18  Airbags           19227 non-null  int64  
 19  LeatherInterior1  19227 non-null  int64  
dtypes: float64(1), int64(6), object(13)
memory usage: 2.9+ MB

(19227, 20)

Out[229]:

	Unnamed: 0	ID	Price	Levy	Manufacturer	Model	ProdYear	Category	Leather interior	Fuel type	Engine volume	Mileage	Cylinders	Gear box type	Drive wheels	Doors	Wheel	Color	Airbags	LeatherInterior1
0	0	45654403	13328	1399	LEXUS	RX 450	2010	Jeep	Yes	Hybrid	3.5	186005 km	6.0	Automatic	4x4	04-May	Left wheel	Silver	12	1
1	1	44731507	16621	1018	CHEVROLET	Equinox	2011	Jeep	No	Petrol	3	192000 km	6.0	Tiptronic	4x4	04-May	Left wheel	Black	8	0
2	2	45774419	8467	-	HONDA	FIT	2006	Hatchback	No	Petrol	1.3	200000 km	4.0	Variator	Front	04-May	Right-hand drive	Black	2	0
3	3	45769185	3607	862	FORD	Escape	2011	Jeep	Yes	Hybrid	2.5	168966 km	4.0	Automatic	4x4	04-May	Left wheel	White	0	1
4	4	45809263	11726	446	HONDA	FIT	2014	Hatchback	Yes	Petrol	1.3	91901 km	4.0	Automatic	Front	04-May	Left wheel	Silver	4	1

In [230]:

house = pd.read_csv("data/house_data.csv")

house.info()

display(house.shape)

house.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3299 entries, 0 to 3298
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             3299 non-null   int64  
 1   date           3299 non-null   object 
 2   price          3299 non-null   float64
 3   bedrooms       3299 non-null   int64  
 4   bathrooms      3299 non-null   float64
 5   sqft_living    3299 non-null   int64  
 6   sqft_lot       3299 non-null   int64  
 7   floors         3299 non-null   float64
 8   waterfront     3299 non-null   int64  
 9   view           3299 non-null   int64  
 10  condition      3299 non-null   int64  
 11  grade          3299 non-null   int64  
 12  sqft_above     3299 non-null   int64  
 13  sqft_basement  3299 non-null   int64  
 14  yr_built       3299 non-null   int64  
 15  yr_renovated   3299 non-null   int64  
 16  zipcode        3299 non-null   int64  
 17  lat            3299 non-null   float64
 18  long           3299 non-null   float64
 19  sqft_living15  3299 non-null   int64  
 20  sqft_lot15     3299 non-null   int64  
dtypes: float64(5), int64(15), object(1)
memory usage: 541.4+ KB

(3299, 21)

Out[230]:

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	...	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	7129300520	20141013T000000	221900.0	3	1.00	1180	5650	1.0	...	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	...	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
2	5631500400	20150225T000000	180000.0	2	1.00	770	10000	1.0	...	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	...	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	...	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503

5 rows × 21 columns

Продажа домов

Цель: Определение рыночной стоимости домов для оптимизации ценовой политики.

Эффект: Улучшение конкурентоспособности на рынке недвижимости и увеличение объема продаж за счет правильного ценообразования. Возможность точного прогнозирования цен и повышения удовлетворенности клиентов.
Цены на автомобили

Цель: Оценка цен на автомобили в зависимости от различных факторов (например, пробег, марка, модель, год выпуска).

Эффект: Увеличение привлекательности предложения за счет адекватного ценообразования и минимизации убытков от продажи автомобилей. Позволяет более точно прогнозировать потребности клиентов и улучшить управление запасами.
Цены на мобильные устройства

Цель: Определение причин изменения цен на мобильные устройства и оценка влияния сезонных скидок на спрос.

Эффект: Оптимизация маркетинговых стратегий и акций, что может привести к увеличению объемов продаж. Улучшение прогнозирования спроса и управление ассортиментом на основе ценовых изменений.

Продажа домов

Цель технического проекта: Разработка системы оценки рыночной стоимости недвижимости на основе анализа данных о продажах.

Вход: Набор данных о прошедших продажах домов (цена, площадь, местоположение, количество комнат, год постройки и т.д.).

Целевой признак: Рыночная цена продажи домов.
Цены на автомобили

Цель технического проекта: Создание алгоритма, который предсказывает цены на автомобили с учетом различных характеристик и рыночных тенденций.

Вход: Данные о характеристиках автомобилей (марка, модель, год выпуска, пробег, состояние) и цены на аналогичные автомобили.

Целевой признак: Цена автомобиля.
Цены на мобильные устройства

Цель технического проекта: Разработка системы, анализирующей влияние маркетинговых акций на цены и спрос на мобильные устройства.

Вход: Данные о ценах на мобильные устройства (с учетом скидок), история продаж, информация о маркетинговых акциях и сезонные колебания.

Целевой признак: Объем продаж мобильных устройств.

In [231]:

# Количество пустых значений признаков
display(phone.isnull().sum())
display()

# Есть ли пустые значения признаков
display(phone.isnull().any())
display()

# Процент пустых значений признаков
for i in phone.columns:
    null_rate = phone[i].isnull().sum() / len(phone) * 100
    if null_rate > 0:
        display(f"{i} процент пустых значений: %{null_rate:.2f}")

Name_phone             0
Rating                 0
Spec_score             0
No_of_sim              0
Ram                    0
Battery                0
Display                0
Camera                 0
External_Memory        0
Android_version      440
Price                  0
company                0
Inbuilt_memory        19
fast_charging         87
Screen_resolution      1
Processor             27
Processor_name         0
Class                  0
dtype: int64

Name_phone           False
Rating               False
Spec_score           False
No_of_sim            False
Ram                  False
Battery              False
Display              False
Camera               False
External_Memory      False
Android_version       True
Price                False
company              False
Inbuilt_memory        True
fast_charging         True
Screen_resolution     True
Processor             True
Processor_name       False
Class                False
dtype: bool

'Android_version процент пустых значений: %32.50'

'Inbuilt_memory процент пустых значений: %1.40'

'fast_charging процент пустых значений: %6.43'

'Screen_resolution процент пустых значений: %0.07'

'Processor процент пустых значений: %1.99'

In [232]:

# Количество пустых значений признаков
display(car.isnull().sum())
display()

# Есть ли пустые значения признаков
display(car.isnull().any())
display()

# Процент пустых значений признаков
for i in car.columns:
    null_rate = car[i].isnull().sum() / len(car) * 100
    if null_rate > 0:
        display(f"{i} процент пустых значений: %{null_rate:.2f}")

Unnamed: 0          0
ID                  0
Price               0
Levy                0
Manufacturer        0
Model               0
ProdYear            0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
LeatherInterior1    0
dtype: int64

Unnamed: 0          False
ID                  False
Price               False
Levy                False
Manufacturer        False
Model               False
ProdYear            False
Category            False
Leather interior    False
Fuel type           False
Engine volume       False
Mileage             False
Cylinders           False
Gear box type       False
Drive wheels        False
Doors               False
Wheel               False
Color               False
Airbags             False
LeatherInterior1    False
dtype: bool

In [233]:

# Количество пустых значений признаков
display(house.isnull().sum())
display()

# Есть ли пустые значения признаков
display(house.isnull().any())
display()

# Процент пустых значений признаков
for i in house.columns:
    null_rate = house[i].isnull().sum() / len(house) * 100
    if null_rate > 0:
        display(f"{i} процент пустых значений: %{null_rate:.2f}")

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

id               False
date             False
price            False
bedrooms         False
bathrooms        False
sqft_living      False
sqft_lot         False
floors           False
waterfront       False
view             False
condition        False
grade            False
sqft_above       False
sqft_basement    False
yr_built         False
yr_renovated     False
zipcode          False
lat              False
long             False
sqft_living15    False
sqft_lot15       False
dtype: bool

В 1 наборе данных phone имеются пустые значения, в остальных они отсутствуют. Заполним пустые значения из 1 датасета.

In [234]:

# Замена пустых данных на моду (часто встречающееся значение)
mode_Android = phone['Android_version'].mode()  
phone.fillna({'Android_version':mode_Android}, inplace=True)

Inbuilt_memory = phone['Inbuilt_memory'].mode()  
phone.fillna({'Inbuilt_memory':Inbuilt_memory}, inplace=True)

fillna_df = phone.fillna(0)

dropna_df = phone.dropna()

display(dropna_df.shape)

display(fillna_df.isnull().any())

phone.tail()

(805, 18)

Name_phone           False
Rating               False
Spec_score           False
No_of_sim            False
Ram                  False
Battery              False
Display              False
Camera               False
External_Memory      False
Android_version      False
Price                False
company              False
Inbuilt_memory       False
fast_charging        False
Screen_resolution    False
Processor            False
Processor_name       False
Class                False
dtype: bool

Out[234]:

	Name_phone	Rating	Spec_score	No_of_sim	Ram	Battery	Display	Camera	External_Memory	Android_version	Price	company	Inbuilt_memory	fast_charging	Screen_resolution	Processor	Processor_name	Class
Id
1365	TCL 40R	4.05	75	Dual Sim, 3G, 4G, 5G, VoLTE,	4 GB RAM	5000 mAh Battery	6.6 inches	50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro...	Memory Card (Hybrid)	12	18,999	TCL	64 GB inbuilt	15W Fast Charging	720 x 1612 px	Octa Core	Dimensity 700 5G	2
1366	TCL 50 XL NxtPaper 5G	4.10	80	Dual Sim, 3G, 4G, VoLTE,	8 GB RAM	5000 mAh Battery	6.8 inches	50 MP + 2 MP Dual Rear & 16 MP Front Camera	Memory Card (Hybrid)	14	24,990	TCL	128 GB inbuilt	33W Fast Charging	1200 x 2400 px	Octa Core	Dimensity 7050	1
1367	TCL 50 XE NxtPaper 5G	4.00	80	Dual Sim, 3G, 4G, 5G, VoLTE,	6 GB RAM	5000 mAh Battery	6.6 inches	50 MP + 2 MP Dual Rear & 16 MP Front Camera	Memory Card Supported, upto 1 TB	13	23,990	TCL	256 GB inbuilt	18W Fast Charging	720 x 1612 px	Octa Core	Dimensity 6080	2
1368	TCL 40 NxtPaper 5G	4.50	79	Dual Sim, 3G, 4G, 5G, VoLTE,	6 GB RAM	5000 mAh Battery	6.6 inches	50 MP + 2 MP + 2 MP Triple Rear & 8 MP Fro...	Memory Card Supported, upto 1 TB	13	22,499	TCL	256 GB inbuilt	15W Fast Charging	720 x 1612 px	Octa Core	Dimensity 6020	2
1369	TCL Trifold	4.65	93	Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G,	12 GB RAM	4600 mAh Battery	10 inches	Foldable Display, Dual Display	50 MP + 48 MP + 8 MP Triple Rear & 32 MP F...	13	1,19,990	TCL	256 GB inbuilt	67W Fast Charging	1916 x 2160 px	Octa Core	Snapdragon 8 Gen2	2

In [235]:

# Вывод распределения количества наблюдений по меткам (классам)
from src.utils import split_stratified_into_train_val_test


display(phone.Class.value_counts())
display()

data = phone[["Class", "Spec_score", "Rating"]].copy()

df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
   data, stratify_colname="Class", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

display("Обучающая выборка: ", df_train.shape)
display(df_train.Class.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.Class.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.Class.value_counts())

Class
1    718
2    636
Name: count, dtype: int64

'Обучающая выборка: '

(812, 3)

Class
1    431
2    381
Name: count, dtype: int64

'Контрольная выборка: '

(271, 3)

Class
1    144
2    127
Name: count, dtype: int64

'Тестовая выборка: '

(271, 3)

Class
1    143
2    128
Name: count, dtype: int64

In [236]:

from imblearn.over_sampling import ADASYN

ada = ADASYN()

display("Обучающая выборка: ", df_train.shape)
display(df_train.Class.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["Class"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

display("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
display(df_train_adasyn.Class.value_counts())

df_train_adasyn

'Обучающая выборка: '

(812, 3)

Class
1    431
2    381
Name: count, dtype: int64

'Обучающая выборка после oversampling: '

(856, 3)

Class
1    431
2    425
Name: count, dtype: int64

Out[236]:

	Class	Spec_score	Rating
0	1	89	4.600000
1	2	95	4.350000
2	2	84	4.200000
3	1	92	4.100000
4	2	69	4.150000
...	...	...	...
851	2	53	4.441967
852	2	56	4.295098
853	2	60	4.489467
854	2	60	4.511445
855	2	72	4.207826

856 rows × 3 columns

In [237]:

from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Создание экземпляра RandomUnderSampler
rus = RandomUnderSampler()

# Проверка исходной обучающей выборки
display("Обучающая выборка: ", df_train.shape)
display(df_train.Class.value_counts())

# Выполнение undersampling
X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=["Class"]), df_train["Class"])  # type: ignore
df_train_undersampled = pd.DataFrame(X_resampled)

# Добавление целевой переменной обратно в выборку
df_train_undersampled["Class"] = y_resampled

# Проверка обучающей выборки после undersampling
display("Обучающая выборка после undersampling: ", df_train_undersampled.shape)
display(df_train_undersampled.Class.value_counts())

df_train_undersampled

'Обучающая выборка: '

(812, 3)

Class
1    431
2    381
Name: count, dtype: int64

'Обучающая выборка после undersampling: '

(762, 3)

Class
1    381
2    381
Name: count, dtype: int64

Out[237]:

	Spec_score	Rating	Class
Id
786	80	4.30	1
670	91	4.35	1
96	72	4.10	1
1207	79	4.75	1
1165	80	4.60	1
...	...	...	...
875	75	4.25	2
544	87	4.65	2
787	84	4.15	2
893	83	4.00	2
519	81	4.30	2

762 rows × 3 columns

In [238]:

# Вывод распределения количества наблюдений по меткам (классам)
from src.utils import split_stratified_into_train_val_test


display(car.LeatherInterior1.value_counts())
display()

data = car[["LeatherInterior1", "Airbags", "ProdYear"]].copy()

df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
   data, stratify_colname="LeatherInterior1", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

display("Обучающая выборка: ", df_train.shape)
display(df_train.LeatherInterior1.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.LeatherInterior1.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.LeatherInterior1.value_counts())

LeatherInterior1
1    13949
0     5278
Name: count, dtype: int64

'Обучающая выборка: '

(11536, 3)

LeatherInterior1
1    8369
0    3167
Name: count, dtype: int64

'Контрольная выборка: '

(3845, 3)

LeatherInterior1
1    2790
0    1055
Name: count, dtype: int64

'Тестовая выборка: '

(3846, 3)

LeatherInterior1
1    2790
0    1056
Name: count, dtype: int64

In [239]:

from imblearn.over_sampling import ADASYN

ada = ADASYN()

display("Обучающая выборка: ", df_train.shape)
display(df_train.LeatherInterior1.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["LeatherInterior1"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

display("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
display(df_train_adasyn.LeatherInterior1.value_counts())

df_train_adasyn

'Обучающая выборка: '

(11536, 3)

LeatherInterior1
1    8369
0    3167
Name: count, dtype: int64

'Обучающая выборка после oversampling: '

(16720, 3)

LeatherInterior1
1    8369
0    8351
Name: count, dtype: int64

Out[239]:

	LeatherInterior1	Airbags	ProdYear
0	0	2	2003
1	0	0	2010
2	0	2	1995
3	1	4	2010
4	1	4	2015
...	...	...	...
16715	0	12	1999
16716	0	12	2000
16717	0	11	1998
16718	0	12	2000
16719	0	11	1998

16720 rows × 3 columns

In [240]:

from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Создание экземпляра RandomUnderSampler
rus = RandomUnderSampler()

# Проверка исходной обучающей выборки
display("Обучающая выборка: ", df_train.shape)
display(df_train.LeatherInterior1.value_counts())

# Выполнение undersampling
X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=["LeatherInterior1"]), df_train["LeatherInterior1"])  # type: ignore
df_train_undersampled = pd.DataFrame(X_resampled)

# Добавление целевой переменной обратно в выборку
df_train_undersampled["LeatherInterior1"] = y_resampled

# Проверка обучающей выборки после undersampling
display("Обучающая выборка после undersampling: ", df_train_undersampled.shape)
display(df_train_undersampled.LeatherInterior1.value_counts())

df_train_undersampled

'Обучающая выборка: '

(11536, 3)

LeatherInterior1
1    8369
0    3167
Name: count, dtype: int64

'Обучающая выборка после undersampling: '

(6334, 3)

LeatherInterior1
0    3167
1    3167
Name: count, dtype: int64

Out[240]:

	Airbags	ProdYear	LeatherInterior1
1271	2	2003	0
9524	0	2010	0
9985	2	1995	0
12285	8	2012	0
14259	12	2014	0
...	...	...	...
1243	6	2006	1
10022	10	2010	1
18753	12	2018	1
14272	4	2013	1
16792	4	2010	1

6334 rows × 3 columns

In [241]:

# Вывод распределения количества наблюдений по меткам (классам)
from src.utils import split_stratified_into_train_val_test


display(house.bedrooms.value_counts())
display()

data = house[["bedrooms", "sqft_living", "sqft_lot"]].copy()

df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
   data, stratify_colname="bedrooms", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

display("Обучающая выборка: ", df_train.shape)
display(df_train.bedrooms.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.bedrooms.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.bedrooms.value_counts())

bedrooms
3    1518
4    1058
2     404
5     234
6      55
1      30
Name: count, dtype: int64

'Обучающая выборка: '

(1979, 3)

bedrooms
3    911
4    635
2    242
5    140
6     33
1     18
Name: count, dtype: int64

'Контрольная выборка: '

(660, 3)

bedrooms
3    303
4    212
2     81
5     47
6     11
1      6
Name: count, dtype: int64

'Тестовая выборка: '

(660, 3)

bedrooms
3    304
4    211
2     81
5     47
6     11
1      6
Name: count, dtype: int64

In [242]:

from imblearn.over_sampling import ADASYN

ada = ADASYN()

display("Обучающая выборка: ", df_train.shape)
display(df_train.bedrooms.value_counts())

X_resampled, y_resampled = ada.fit_resample(df_train, df_train["bedrooms"]) # type: ignore
df_train_adasyn = pd.DataFrame(X_resampled)

display("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
display(df_train_adasyn.bedrooms.value_counts())

df_train_adasyn

'Обучающая выборка: '

(1979, 3)

bedrooms
3    911
4    635
2    242
5    140
6     33
1     18
Name: count, dtype: int64

'Обучающая выборка после oversampling: '

(5332, 3)

bedrooms
3    911
6    907
1    907
5    892
2    887
4    828
Name: count, dtype: int64

Out[242]:

	bedrooms	sqft_living	sqft_lot
0	2	620	4760
1	2	1760	2275
2	6	3840	14040
3	4	2230	6791
4	4	1850	5040
...	...	...	...
5327	6	3272	5227
5328	6	3007	4422
5329	6	3147	4818
5330	6	3021	4954
5331	6	3010	5088

5332 rows × 3 columns

In [243]:

from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Создание экземпляра RandomUnderSampler
rus = RandomUnderSampler()

# Проверка исходной обучающей выборки
display("Обучающая выборка: ", df_train.shape)
display(df_train.bedrooms.value_counts())

# Выполнение undersampling
X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=["bedrooms"]), df_train["bedrooms"])  # type: ignore
df_train_undersampled = pd.DataFrame(X_resampled)

# Добавление целевой переменной обратно в выборку
df_train_undersampled["bedrooms"] = y_resampled

# Проверка обучающей выборки после undersampling
display("Обучающая выборка после undersampling: ", df_train_undersampled.shape)
display(df_train_undersampled.bedrooms.value_counts())

df_train_undersampled

'Обучающая выборка: '

(1979, 3)

bedrooms
3    911
4    635
2    242
5    140
6     33
1     18
Name: count, dtype: int64

'Обучающая выборка после undersampling: '

(108, 3)

bedrooms
1    18
2    18
3    18
4    18
5    18
6    18
Name: count, dtype: int64

Out[243]:

	sqft_living	sqft_lot	bedrooms
1729	680	1638	1
1453	660	2600	1
1649	3000	204732	1
2151	1010	5750	1
154	700	5100	1
...	...	...	...
2069	2410	6000	6
1192	2560	8320	6
2065	3240	5750	6
2492	2400	9373	6
232	2940	7350	6

108 rows × 3 columns

96 KiB Raw Blame History Unescape Escape

96 KiB

Raw Blame History