263 KiB
263 KiB
Загрузка набора данных¶
In [30]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import featuretools as ft
import re
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
# Read the raw table; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/mobile phone price prediction.csv")
# Remove the "Unnamed: 0" column by rebinding instead of mutating in place.
df = df.drop(columns=["Unnamed: 0"])
# Strip the commas from the textual price before converting it to float.
df["Price"] = df["Price"].str.replace(",", "").astype(float)

numerical_features = [
    "Ram",
    "Battery",
    "Display",
    "Camera",
]

# Compiled once so every cell is scanned a single time.
_FIRST_DIGITS = re.compile(r"\d+")


def _first_int(value):
    """Return the first run of digits in ``value`` as an int.

    Returns None when the cell is missing (NaN/None) or holds no digits.
    Missing cells are checked first because ``re`` raises ``TypeError`` on
    non-string input.
    """
    if pd.isna(value):
        return None
    match = _FIRST_DIGITS.search(value)
    return int(match.group()) if match else None


for feature in numerical_features:
    # NOTE(review): only the leading integer is kept, so a value such as
    # "6.5 inches" would become 6 - confirm this is intended for "Display".
    df[feature] = df[feature].apply(_first_int)

df
Out[30]:
Бизнес-цели¶
- Классифицировать мобильные устройства по ценовым категориям (например, бюджетные, средний класс, флагманы).
- Определить, какие характеристики мобильных устройств наиболее сильно влияют на их рейтинг.
Проверка на пропущенные значения¶
In [31]:
# Heading followed by the per-column count of missing cells.
print("Пропущенные данные по каждому столбцу:", df.isna().sum(), sep="\n")
In [32]:
# Keep only the rows without any missing value, then print the per-column
# null counts again as a check.
df = df.dropna()
print(df.isna().sum())
Проверка на выбросы¶
In [33]:
# The two columns inspected for outliers; the cleaning code further down in
# this cell reads the same names.
column1, column2 = "Spec_score", "Price"

# Scatter of the two columns before any rows are filtered out.
plt.figure(figsize=(10, 6))
plt.scatter(x=df[column1], y=df[column2], alpha=0.5)
plt.gca().set(
    xlabel=column1,
    ylabel=column2,
    title=f"Scatter Plot of {column1} vs {column2} (Before Removing Outliers)",
)
plt.show()
def remove_outliers(df, column, factor=1.5):
    """Drop rows whose ``column`` value lies outside the IQR fences.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the numeric column to filter on.
        factor: Multiplier applied to the interquartile range when placing
            the fences. Defaults to 1.5, the value previously hard-coded.

    Returns:
        The rows of ``df`` for which
        ``Q1 - factor * IQR <= value <= Q3 + factor * IQR``. Rows whose value
        is NaN are dropped, because the range check is False for NaN.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    spread = q3 - q1
    # ``between`` is inclusive on both ends by default, matching >= / <=.
    within_fences = df[column].between(q1 - factor * spread, q3 + factor * spread)
    return df[within_fences]
# Start from a copy and filter on each inspected column in turn, so the second
# pass computes its quartiles on rows that survived the first.
df_cleaned = df.copy()
for column in (column1, column2):
    df_cleaned = remove_outliers(df_cleaned, column)

# Same scatter as before, now on the filtered rows.
plt.figure(figsize=(10, 6))
plt.scatter(x=df_cleaned[column1], y=df_cleaned[column2], alpha=0.5)
plt.gca().set(
    xlabel=column1,
    ylabel=column2,
    title=f"Scatter Plot of {column1} vs {column2} (After Removing Outliers)",
)
plt.show()

print(
    f"Количество строк до удаления выбросов: {len(df)}",
    f"Количество строк после удаления выбросов: {len(df_cleaned)}",
    sep="\n",
)

# `df` is rebound to the filtered frame, so re-running this cell filters the
# already cleaned data again.
df = df_cleaned
Разбиение данных на выборки.¶
In [34]:
# The whole frame (target column included) is passed as X, so every resulting
# split still carries its own "company" column.
X, y = df, df["company"]

# 60 % train, then the remaining 40 % is halved into validation and test.
train_df, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
val_df, test_df, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Размеры выборок:")

print(f"Обучающая выборка: {len(train_df)} записей")
print(train_df["company"].value_counts())

print(f"Контрольная выборка: {len(val_df)} записей")
print(val_df["company"].value_counts())

print(f"Тестовая выборка: {len(test_df)} записей")
print(test_df["company"].value_counts())
Oversampling¶
In [35]:
def oversample(df):
    """Resample ``df`` with ``RandomOverSampler`` using "company" as the target.

    Args:
        df: DataFrame that contains a "company" column.

    Returns:
        A DataFrame made of the resampled feature columns followed by the
        resampled "company" column. The sampler is seeded with 42.
    """
    features, target = df.drop(columns="company"), df["company"]
    sampler = RandomOverSampler(random_state=42)
    features_resampled, target_resampled = sampler.fit_resample(features, target)
    return pd.concat([features_resampled, target_resampled], axis=1)
# Balance classes in the training split only. Duplicating rows in the
# validation and test splits would change their class mix and repeat held-out
# rows, which distorts any metric computed on them.
train_df_overs = oversample(train_df)

# The held-out splits keep their original rows. They are only re-ordered
# ("company" last, as `oversample` returns it) and re-indexed from 0. The
# `_overs` names are kept because the following cells read them.
held_out_columns = [name for name in train_df.columns if name != "company"] + ["company"]
val_df_overs = val_df[held_out_columns].reset_index(drop=True)
test_df_overs = test_df[held_out_columns].reset_index(drop=True)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df_overs.shape[0]} записей")
print(train_df_overs.company.value_counts())
print(f"Контрольная выборка: {val_df_overs.shape[0]} записей")
print(val_df_overs.company.value_counts())
print(f"Тестовая выборка: {test_df_overs.shape[0]} записей")
print(test_df_overs.company.value_counts())
Дискретизация числовых признаков¶
In [36]:
# Columns that receive a discretized "<name>_bin" companion column.
numerical_features = [
    "Spec_score",
    "Battery",
    "Ram",
    "Camera",
]
def discretize_features(df, features, bins=3, labels=False):
    """Return a copy of ``df`` with a ``<feature>_bin`` column per feature.

    Args:
        df: Source DataFrame. It is left untouched; the binned columns are
            added to a copy.
        features: Iterable of column names to discretize.
        bins: Forwarded to ``pd.cut`` as ``bins``. Defaults to 3.
        labels: Forwarded to ``pd.cut`` as ``labels``. Defaults to False.

    Returns:
        A new DataFrame holding the original columns plus one
        ``<feature>_bin`` column for every feature that could be binned.
    """
    binned = df.copy()
    for feature in features:
        try:
            binned[f"{feature}_bin"] = pd.cut(binned[feature], bins=bins, labels=labels)
        except Exception as e:
            # Best effort: report the failing feature and continue with the
            # remaining ones; no "<feature>_bin" column is added for it.
            print(f"Ошибка при дискретизации признака {feature}: {e}")
    return binned
# NOTE(review): each call bins a frame on its own; with an integer `bins`,
# pd.cut presumably derives the edges from that frame's value range, so the
# same bin code may cover different ranges in different splits - confirm.
train_df_disc, val_df_disc, test_df_disc = (
    discretize_features(split, numerical_features)
    for split in (train_df_overs, val_df_overs, test_df_overs)
)
train_df_disc
Out[36]:
Унитарное кодирование категориальных признаков¶
In [37]:
categorical_features = ["Spec_score_bin", "Battery_bin", "Ram_bin", "Camera_bin"]

train_df_enc = pd.get_dummies(train_df_disc, columns=categorical_features)
val_df_enc = pd.get_dummies(val_df_disc, columns=categorical_features)
test_df_enc = pd.get_dummies(test_df_disc, columns=categorical_features)

# Each frame is encoded separately, so a split that lacks one of the bin codes
# would end up without that indicator column. Align all three frames to the
# union of columns (training order first), filling absent indicators with 0.
# This is a no-op when the column sets already match.
encoded_columns = train_df_enc.columns.union(val_df_enc.columns, sort=False).union(
    test_df_enc.columns, sort=False
)
train_df_enc = train_df_enc.reindex(columns=encoded_columns, fill_value=0)
val_df_enc = val_df_enc.reindex(columns=encoded_columns, fill_value=0)
test_df_enc = test_df_enc.reindex(columns=encoded_columns, fill_value=0)

train_df_enc
Out[37]:
Ручной синтез признаков.¶
In [38]:
# Hand-made feature: the "Camera" value divided by the "Display" value,
# added to every split under the same column name.
for encoded_split in (train_df_enc, val_df_enc, test_df_enc):
    encoded_split["Camera_to_Display_Ratio"] = encoded_split["Camera"].div(
        encoded_split["Display"]
    )
train_df_enc
Out[38]:
Масштабирование признаков¶
In [39]:
# Columns to standardize.
numerical_features = ["Ram", "Battery", "Display", "Camera"]

# The scaler is fitted on the training frame only; validation and test are
# transformed with the statistics learned there.
scaler = StandardScaler()
train_df_enc[numerical_features] = scaler.fit_transform(
    train_df_enc[numerical_features]
)
for held_out_split in (val_df_enc, test_df_enc):
    held_out_split[numerical_features] = scaler.transform(
        held_out_split[numerical_features]
    )
train_df_enc
Out[39]:
Конструирование признаков с помощью Featuretools¶
In [40]:
# Build an EntitySet holding the encoded training frame as its only dataframe.
# NOTE(review): none of the visible cells creates an "id" column; presumably
# Featuretools generates it when the named index is absent - confirm.
es = ft.EntitySet(id="mobile_data").add_dataframe(
    dataframe_name="train",
    dataframe=train_df_enc,
    index="id",
)

# Deep Feature Synthesis over the "train" dataframe, up to depth 2.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="train",
    max_depth=2,
)
feature_defs
Out[40]: