Loading the dataset¶
In [91]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import featuretools as ft
import re
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
df = pd.read_csv("../data/car_price_prediction.csv")
df = df.drop(columns=["ID"])
df
Out[91]:
Dataset analysis and data cleaning¶
In [92]:
df.dtypes
Out[92]:
In [93]:
df["Engine volume"] = df["Engine volume"].str.replace("Turbo", "")
df["Engine volume"] = pd.to_numeric(df["Engine volume"])
df["Engine volume"].unique()
Out[93]:
In [94]:
df["Mileage"] = df["Mileage"].str.replace("km", "")
df["Mileage"] = df["Mileage"].astype("int64")
df["Mileage"].unique()
Out[94]:
In [95]:
df["Levy"] = df["Levy"].replace("-", "0")
df["Levy"] = df["Levy"].astype("int64")
df["Levy"].unique()
Out[95]:
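The "-" marker is treated here as a zero levy. A slightly more defensive variant (a sketch, not part of the original pipeline) would coerce any non-numeric residue to NaN and then fill it:
In [ ]:
# Hedged alternative: coerce unparsable Levy values to NaN, then treat them as zero
levy_numeric = pd.to_numeric(df["Levy"], errors="coerce").fillna(0).astype("int64")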
In [96]:
df["Cylinders"] = df["Cylinders"].astype("int64")
df["Cylinders"].unique()
Out[96]:
In [97]:
df["Doors"].unique()
Out[97]:
In [98]:
df["Doors"] = df["Doors"].map(
{"02-Mar": "Двухдверный", "04-May": "Четырехдверный", ">5": "Многодверный"}
)
df["Doors"].unique()
Out[98]:
In [99]:
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[99]:
In [100]:
print(f"Количество строк до удаления некорректных значений: {len(df)}")
df = df[df["Price"] >= 500]
print(f"Количество строк после удаления некорректных значений: {len(df)}")
In [101]:
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[101]:
In [102]:
sorted_df = df.sort_values(by="Prod. year")
sorted_df["Prod. year"].unique()
Out[102]:
In [103]:
df
Out[103]:
Removing duplicates and missing values¶
In [104]:
df.duplicated().sum()
Out[104]:
In [105]:
df.drop_duplicates(inplace=True)
In [106]:
df.isna().sum()
Out[106]:
Removing outliers¶
In [107]:
df.dtypes
Out[107]:
In [108]:
numeric_features_with_outliers = [
    "Price",
    "Levy",
    "Mileage",
    "Prod. year",
]
# One tall figure with a boxplot per numeric feature
plt.figure(figsize=(4, 30))
for i, col in enumerate(numeric_features_with_outliers, start=1):
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
In [109]:
def remove_outliers(df, column):
    # Interquartile-range rule: keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
print(f"Row count before outlier removal: {len(df)}")
for column in numeric_features_with_outliers:
    df = remove_outliers(df, column)
print(f"Row count after outlier removal: {len(df)}")
In [110]:
# Re-draw the boxplots after outlier removal
plt.figure(figsize=(4, 30))
for i, col in enumerate(numeric_features_with_outliers, start=1):
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
Splitting into train and test sets¶
In [112]:
X = df
y = df["Category"]
train_df, test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print("Sample sizes:")
print(f"Training set: {train_df.shape[0]} rows")
print(train_df.Category.value_counts())
print(f"Test set: {test_df.shape[0]} rows")
print(test_df.Category.value_counts())
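Category later serves as the classification target, so a stratified split (a sketch using the standard stratify parameter of train_test_split, not what this notebook does) would keep the class proportions equal in both parts:
In [ ]:
# Hypothetical variant: preserve Category class ratios in train and test
train_df, test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)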
Oversampling¶
In [113]:
def oversample(df):
    # Balance the Category classes by randomly duplicating minority-class rows
    X = df.drop("Category", axis=1)
    y = df["Category"]
    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)  # type: ignore
    resampled_df = pd.concat([X_resampled, y_resampled], axis=1)
    return resampled_df
train_df_overs = oversample(train_df)
test_df_overs = oversample(test_df)
print("Sample sizes:")
print(f"Training set: {train_df_overs.shape[0]} rows")
print(train_df_overs.Category.value_counts())
print(f"Test set: {test_df_overs.shape[0]} rows")
print(test_df_overs.Category.value_counts())
Manual feature synthesis¶
In [114]:
def age_create(df):
    # Age of the car counted from 2020
    df["Age"] = 2020 - df["Prod. year"]
    df = df.drop("Prod. year", axis=1)
    return df
train_df = age_create(train_df)
test_df = age_create(test_df)
In [115]:
sorted_df = train_df.sort_values(by="Age")
sorted_df["Age"].unique()
Out[115]:
In [116]:
plt.figure(figsize=(4, 4))
train_df.boxplot(column="Age")
Out[116]:
Discretization of numeric features¶
In [117]:
train_df.dtypes
Out[117]:
In [118]:
numeric_features_for_discretization = ["Age"]
def discretize_features(
    df, features, bins=4, labels=["New", "Average", "Old", "Very old"]
):
    # Split each numeric feature into equal-width bins with human-readable labels
    for feature in features:
        try:
            df[f"{feature}_bin"] = pd.cut(df[feature], bins=bins, labels=labels)  # type: ignore
        except Exception as e:
            print(f"Error while discretizing feature {feature}: {e}")
    return df
train_df = discretize_features(train_df, numeric_features_for_discretization)
test_df = discretize_features(test_df, numeric_features_for_discretization)
train_df
train_df
Out[118]:
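To verify that the four bins cover the Age range as intended, the per-bin counts can be inspected (a small check, not in the original notebook):
In [ ]:
# Number of rows per Age bin in the training set
print(train_df["Age_bin"].value_counts())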
One-hot encoding of categorical features¶
In [119]:
train_df.dtypes
Out[119]:
In [120]:
categorical_features_for_encoding = [
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Doors",
    "Wheel",
    "Age_bin",
]
train_df = pd.get_dummies(train_df, columns=categorical_features_for_encoding)
test_df = pd.get_dummies(test_df, columns=categorical_features_for_encoding)
train_df
Out[120]:
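pd.get_dummies is applied to the train and test frames independently, so a category that appears in only one of them yields mismatched column sets. A common guard (a sketch, not part of the original notebook) is to align the test frame to the training columns:
In [ ]:
# Align test columns to the training columns; categories missing from test become all-zero columns
train_df, test_df = train_df.align(test_df, join="left", axis=1, fill_value=0)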
Feature scaling¶
In [121]:
train_df.dtypes
Out[121]:
In [122]:
scaler = StandardScaler()
numeric_features_for_standardization = [
    "Price",
    "Levy",
    "Engine volume",
    "Mileage",
    "Cylinders",
    "Airbags",
    "Age",
]
train_df[numeric_features_for_standardization] = scaler.fit_transform(
    train_df[numeric_features_for_standardization]
)
test_df[numeric_features_for_standardization] = scaler.transform(
    test_df[numeric_features_for_standardization]
)
train_df
train_df
Out[122]:
Feature engineering with Featuretools¶
In [123]:
es = ft.EntitySet(id="car_data")
# The frame has no "id" column after dropping "ID", so let featuretools create one
es = es.add_dataframe(
    dataframe_name="train", dataframe=train_df, index="id", make_index=True
)
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="train",
    max_depth=1,
)
In [124]:
feature_defs
Out[124]:
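The generated feature definitions can also be applied to the test split by registering it under the same dataframe name and calling calculate_feature_matrix (a sketch of a possible follow-up, assuming test_df should receive the same features):
In [ ]:
# Hypothetical follow-up: compute the same features for the test split
es_test = ft.EntitySet(id="car_data_test")
es_test = es_test.add_dataframe(
    dataframe_name="train", dataframe=test_df, index="id", make_index=True
)
test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es_test)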