205 KiB
205 KiB
Загрузка набора данных¶
In [971]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import featuretools as ft
import re
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
# Read the raw CSV and discard the "ID" column in a single chained expression.
df = pd.read_csv("../data/car_price_prediction.csv").drop(columns=["ID"])
df
Out[971]:
Анализ датасета и очистка данных¶
In [972]:
# Show the dtype pandas assigned to each column of df.
df.dtypes
Out[972]:
In [973]:
# Remove the "Turbo" marker from the text so the remainder parses as a number.
engine_volume_text = df["Engine volume"].str.replace("Turbo", "")
df["Engine volume"] = pd.to_numeric(engine_volume_text)
df["Engine volume"].unique()
Out[973]:
In [974]:
# Remove the "km" unit suffix from the text, then cast the column to int64.
mileage_text = df["Mileage"].str.replace("km", "")
df["Mileage"] = mileage_text.astype("int64")
df["Mileage"].unique()
Out[974]:
In [975]:
# Cells that are exactly "-" are replaced by "0" before the column is cast to int64.
levy_text = df["Levy"].replace("-", "0")
df["Levy"] = levy_text.astype("int64")
df["Levy"].unique()
Out[975]:
In [976]:
# Cast the cylinder count to int64 through a column -> dtype mapping.
df = df.astype({"Cylinders": "int64"})
df["Cylinders"].unique()
Out[976]:
In [977]:
# List the distinct raw values stored in the "Doors" column.
df["Doors"].unique()
Out[977]:
In [978]:
# Lookup table from the raw "Doors" codes to readable category labels.
# Series.map turns any value missing from this table into NaN.
doors_labels = {
    "02-Mar": "Двухдверный",
    "04-May": "Четырехдверный",
    ">5": "Многодверный",
}
df["Doors"] = df["Doors"].map(doors_labels)
df["Doors"].unique()
Out[978]:
In [979]:
# Distinct prices in ascending order (unique() keeps the sorted order of appearance).
sorted_df = df.sort_values("Price")
pd.unique(sorted_df["Price"])
Out[979]:
In [980]:
# Rows priced below this threshold are treated as invalid and dropped.
MIN_VALID_PRICE = 500

print(f"Количество строк до удаления некорректных значений: {len(df)}")
# .copy() makes the filtered frame independent of the original, so later
# column assignments on df do not write into a slice (SettingWithCopyWarning).
df = df[df["Price"] >= MIN_VALID_PRICE].copy()
print(f"Количество строк после удаления некорректных значений: {len(df)}")
In [981]:
# Distinct prices in ascending order, to check the result of the price filter.
sorted_df = df.sort_values("Price")
pd.unique(sorted_df["Price"])
Out[981]:
In [982]:
# Distinct production years in ascending order.
sorted_df = df.sort_values("Prod. year")
pd.unique(sorted_df["Prod. year"])
Out[982]:
Ручной синтез признаков¶
In [983]:
# Age is the distance of the production year from 2020; rsub(2020) computes 2020 - year.
df["Age"] = df["Prod. year"].rsub(2020)
# The production year is dropped once Age has been derived from it.
df = df.drop(columns=["Prod. year"])
sorted_df = df.sort_values("Age")
pd.unique(sorted_df["Age"])
Out[983]:
In [984]:
# Display the current contents of df.
df
Out[984]:
Удаление дубликатов и проверка пропущенных значений¶
In [985]:
# Count the rows flagged by DataFrame.duplicated() (all columns compared).
df.duplicated().sum()
Out[985]:
In [986]:
# Drop fully duplicated rows, rebinding df to the de-duplicated frame.
df = df.drop_duplicates()
In [987]:
# Per-column count of missing values.
df.isna().sum()
Out[987]:
Очистка выбросов¶
In [988]:
# Show the column dtypes of df at this point in the notebook.
df.dtypes
Out[988]:
In [989]:
# Numeric columns inspected for outliers (and filtered by the IQR rule later on).
numeric_features_with_outliers = [
    "Price",
    "Levy",
    "Mileage",
    "Age",
]

# One figure with one row per column. The figure is created once, outside the
# loop; creating it inside the loop produced a separate, mostly empty figure
# for every column.
fig, axes = plt.subplots(
    nrows=len(numeric_features_with_outliers),
    ncols=1,
    figsize=(4, 5 * len(numeric_features_with_outliers)),
    squeeze=False,  # always a 2-D array of axes, even for a single column
)
for ax, col in zip(axes[:, 0], numeric_features_with_outliers):
    df.boxplot(column=col, ax=ax)
fig.tight_layout()
In [990]:
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of ``df[column]``.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the numeric column to filter on.

    Returns:
        The subset of ``df`` with values inside the fences.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)
    inside_fences = values.between(
        first_quartile - fence_width, third_quartile + fence_width
    )
    return df[inside_fences]
rows_before_outlier_removal = len(df)
print(f"Количество строк до удаления выбросов: {rows_before_outlier_removal}")
# Filters are applied one after another, so each column's fences are computed
# on the rows that survived the previous columns.
for column in numeric_features_with_outliers:
    df = remove_outliers(df, column)
rows_after_outlier_removal = len(df)
print(f"Количество строк после удаления выбросов: {rows_after_outlier_removal}")
In [991]:
# Re-draw the boxplots after outlier removal: one figure, one row per column.
# The figure is created once, outside the loop; creating it inside the loop
# produced a separate, mostly empty figure for every column.
fig, axes = plt.subplots(
    nrows=len(numeric_features_with_outliers),
    ncols=1,
    figsize=(4, 5 * len(numeric_features_with_outliers)),
    squeeze=False,  # always a 2-D array of axes, even for a single column
)
for ax, col in zip(axes[:, 0], numeric_features_with_outliers):
    df.boxplot(column=col, ax=ax)
fig.tight_layout()
Разбиение на выборки¶
In [992]:
# 80/20 split with a fixed seed so the partition is reproducible.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_rows, test_rows = train_df.shape[0], test_df.shape[0]

print("Размеры выборок:")
print(f"Обучающая выборка: {train_rows} записей")
print(f"Тестовая выборка: {test_rows} записей")
Дискретизация числовых признаков¶
In [993]:
# Show the column dtypes of the training split.
train_df.dtypes
Out[993]:
In [994]:
numeric_features_for_discritization = ["Age"]


def discretize_features(df, features, bins=4, labels=("Новый", "Средний", "Старый", "Очень старый")):
    """Add a ``<feature>_bin`` categorical column for every listed feature.

    Args:
        df: Source DataFrame. It is not modified; a copy is returned.
        features: Iterable of numeric column names to discretize.
        bins: Passed to ``pandas.cut``: a number of equal-width bins or an
            explicit sequence of bin edges.
        labels: Bin labels passed to ``pandas.cut`` (``None`` or ``False`` are
            forwarded unchanged). The default is a tuple, so no mutable
            object is shared between calls.

    Returns:
        A copy of ``df`` with the added ``*_bin`` columns. If binning a
        feature fails, the error is printed and that feature is skipped.
    """
    # Work on a copy so that a frame that is itself a slice (for example a
    # train/test split) is not mutated through the caller's reference.
    df = df.copy()
    cut_labels = labels if labels is None or labels is False else list(labels)
    for feature in features:
        try:
            df[f"{feature}_bin"] = pd.cut(df[feature], bins=bins, labels=cut_labels)  # type: ignore
        except Exception as e:
            # Best effort: report the failure and continue with the next feature.
            print(f"Ошибка при дискретизации признака {feature}: {e}")
    return df
# Bin edges are derived from the training split only and reused for the test
# split. Cutting each split independently with bins=4 would give each split
# its own edges, so the same label would cover different value ranges.
for feature in numeric_features_for_discritization:
    _, train_edges = pd.cut(train_df[feature], bins=4, retbins=True)
    # Open the outer edges so test values outside the training range still
    # fall into the first or last bin instead of becoming NaN.
    shared_edges = np.concatenate(([-np.inf], train_edges[1:-1], [np.inf]))
    train_df = discretize_features(train_df, [feature], bins=shared_edges)
    test_df = discretize_features(test_df, [feature], bins=shared_edges)
train_df
Out[994]:
Унитарное кодирование категориальных признаков¶
In [995]:
# Show the column dtypes of the training split.
train_df.dtypes
Out[995]:
In [996]:
# Categorical columns that are expanded into one-hot indicator columns.
categorical_features_for_encoding = [
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Doors",
    "Wheel",
    "Age_bin",
]
train_df = pd.get_dummies(train_df, columns=categorical_features_for_encoding)
test_df = pd.get_dummies(test_df, columns=categorical_features_for_encoding)
# The two splits are encoded independently, so a category present in only one
# of them would leave the frames with different columns. Align the test frame
# to the training layout: indicators missing from test are filled with 0, and
# indicators for categories never seen in training are dropped.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)
train_df
Out[996]:
Масштабирование признаков¶
In [997]:
# Show the column dtypes of the training split.
train_df.dtypes
Out[997]:
In [998]:
# Numeric columns that are standardized.
numeric_features_for_stardartization = [
    "Price",
    "Levy",
    "Engine volume",
    "Mileage",
    "Cylinders",
    "Airbags",
    "Age",
]

# The scaler is fitted on the training rows only; the same fitted scaler is
# then applied to both splits.
scaler = StandardScaler().fit(train_df[numeric_features_for_stardartization])
train_df[numeric_features_for_stardartization] = scaler.transform(
    train_df[numeric_features_for_stardartization]
)
test_df[numeric_features_for_stardartization] = scaler.transform(
    test_df[numeric_features_for_stardartization]
)
train_df
Out[998]:
Конструирование признаков с помощью Featuretools¶
In [999]:
es = ft.EntitySet(id="car_data")

# Featuretools needs a unique index column. Without one it falls back to the
# first column of the frame, which here is the scaled "Price" (not unique, and
# also the prediction target). Expose the row labels as an explicit "car_id"
# column and use that as the index instead.
train_indexed = train_df.rename_axis("car_id").reset_index()
es = es.add_dataframe(
    dataframe_name="train",
    dataframe=train_indexed,
    index="car_id",
)

# max_depth=1 limits deep feature synthesis to a single level of primitives.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="train",
    max_depth=1,
)
feature_defs