211 KiB
211 KiB
Загрузка набора данных¶
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
from pandas import DataFrame
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn import linear_model, tree, neighbors, ensemble
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.pipeline import make_pipeline
# Read the raw car-price table and discard the "ID" column in one step.
df = pd.read_csv("../data/car_price_prediction.csv").drop(columns=["ID"])
df
Out[1]:
Анализ датасета и очистка данных¶
In [2]:
# Show the dtype of every column of the freshly loaded frame.
df.dtypes
Out[2]:
In [3]:
# Remove the "Turbo" marker from the text values and parse what remains as numbers.
df["Engine volume"] = pd.to_numeric(df["Engine volume"].str.replace("Turbo", ""))
df["Engine volume"].unique()
Out[3]:
In [4]:
# Remove the "km" unit text, then store the mileage as 64-bit integers.
df["Mileage"] = df["Mileage"].str.replace("km", "").astype("int64")
df["Mileage"].unique()
Out[4]:
In [5]:
# Entries equal to "-" are replaced by "0" before the column is cast to int64.
df["Levy"] = df["Levy"].replace("-", "0").astype("int64")
df["Levy"].unique()
Out[5]:
In [6]:
# Cast the cylinder count to int64, leaving all other columns as they are.
df = df.astype({"Cylinders": "int64"})
df["Cylinders"].unique()
Out[6]:
In [7]:
# List the distinct raw values of the "Doors" column.
df["Doors"].unique()
Out[7]:
In [8]:
# Raw door codes as they appear in the file -> readable labels.
door_labels = {
    "02-Mar": "Двухдверный",
    "04-May": "Четырехдверный",
    ">5": "Многодверный",
}
df["Doors"] = df["Doors"].map(door_labels)
df["Doors"].unique()
Out[8]:
In [9]:
# Sort by price so that the distinct price values are listed in ascending order.
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[9]:
In [10]:
# Keep only rows whose price is at least 500 and report the row count on both sides.
print(f"Количество строк до удаления некорректных значений: {len(df)}")
df = df.loc[df["Price"].ge(500)]
print(f"Количество строк после удаления некорректных значений: {len(df)}")
In [11]:
# Re-check the distinct prices (ascending) after the price filter.
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[11]:
In [12]:
# List the distinct production years in ascending order.
sorted_df = df.sort_values(by="Prod. year")
sorted_df["Prod. year"].unique()
Out[12]:
In [13]:
# Display the frame after the type conversions and the price filter.
df
Out[13]:
Очистка дубликатов и пропущенных значений¶
In [14]:
# Count the rows flagged as duplicates of another row.
df.duplicated().sum()
Out[14]:
In [15]:
# Rebind instead of using inplace=True: `df` was produced by a boolean filter,
# so modifying it in place can raise SettingWithCopyWarning; rebinding also
# keeps the cell safe to re-run.
df = df.drop_duplicates()
In [16]:
# Count missing values per column.
df.isna().sum()
Out[16]:
Очистка выбросов¶
In [17]:
# Show column dtypes before choosing the numeric columns to check for outliers.
df.dtypes
Out[17]:
In [18]:
# Numeric columns whose distributions are inspected (and later filtered) for outliers.
numeric_features_with_outliers = [
    "Price",
    "Levy",
    "Mileage",
    "Prod. year",
]

# One figure holding a stacked panel per column. Creating the figure inside the
# loop produced a separate, mostly empty figure for every column.
fig = plt.figure(figsize=(4, 5 * len(numeric_features_with_outliers)))
i = 1  # 1-based panel index; a later cell still reads `i`, so the counter is kept
for col in numeric_features_with_outliers:
    ax = fig.add_subplot(len(numeric_features_with_outliers), 1, i)
    df.boxplot(column=col, ax=ax)
    i += 1
fig.tight_layout()
In [19]:
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``column``.

    Args:
        df: frame to filter; it is not modified.
        column: name of the numeric column to test.
        factor: fence multiplier; the default 1.5 matches the previous
            hard-coded value.

    Returns:
        A filtered frame with the same columns as ``df``.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    return df[df[column].between(lower_bound, upper_bound)]
# Filter the columns one after another: each pass computes its quartiles on the
# rows that survived the previous passes, so the column order matters.
print(f"Количество строк до удаления выбросов: {len(df)}")
for column in numeric_features_with_outliers:
    df = remove_outliers(df, column)
print(f"Количество строк после удаления выбросов: {len(df)}")
In [20]:
# Redraw the box plots after outlier removal: one figure, one stacked panel per
# column. Creating the figure inside the loop produced a separate, mostly empty
# figure for every column.
fig = plt.figure(figsize=(4, 5 * len(numeric_features_with_outliers)))
i = 1  # 1-based panel index; a later cell still reads `i`, so the counter is kept
for col in numeric_features_with_outliers:
    ax = fig.add_subplot(len(numeric_features_with_outliers), 1, i)
    df.boxplot(column=col, ax=ax)
    i += 1
fig.tight_layout()
Разбиение на выборки¶
In [21]:
# 70/30 split of the full frame; the "Category" column is passed as `y` so it is
# split along with the rows (the frames themselves still contain it).
X = df
y = X["Category"]
train_df, test_df, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df.shape[0]} записей")
print(train_df["Category"].value_counts())
print(f"Тестовая выборка: {test_df.shape[0]} записей")
print(test_df["Category"].value_counts())
Oversampling¶
In [22]:
def oversample(df):
    """Resample ``df`` with ``RandomOverSampler`` using "Category" as the class label.

    The sampler is created with ``random_state=42``. The returned frame holds
    the resampled feature columns followed by the resampled "Category" column.
    """
    features = df.drop(columns="Category")
    labels = df["Category"]
    sampler = RandomOverSampler(random_state=42)
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)  # type: ignore
    return pd.concat([features_resampled, labels_resampled], axis=1)
# Balance both splits by "Category" and report the resulting class counts.
# NOTE(review): the test split is resampled here as well; evaluation data is
# normally left untouched - confirm this is intended before using test_df_overs.
train_df_overs, test_df_overs = oversample(train_df), oversample(test_df)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df_overs.shape[0]} записей")
print(train_df_overs["Category"].value_counts())
print(f"Тестовая выборка: {test_df_overs.shape[0]} записей")
print(test_df_overs["Category"].value_counts())
In [23]:
# Separate the regression target ("Price") from the feature frames.
price_y_train, price_y_test = train_df["Price"], test_df["Price"]
train_df = train_df.drop("Price", axis=1)
test_df = test_df.drop("Price", axis=1)
Ручной синтез признаков.¶
In [24]:
def age_create(df, reference_year=2020):
    """Return a copy of ``df`` with "Prod. year" replaced by an "Age" column.

    ``Age`` is ``reference_year - "Prod. year"``. The input frame is left
    untouched; the previous version wrote the "Age" column into the caller's
    frame before dropping "Prod. year" on a copy.

    Args:
        df: frame containing a "Prod. year" column.
        reference_year: year the age is measured from; the default 2020
            matches the previous hard-coded value.

    Returns:
        A new frame without "Prod. year" and with "Age" added.
    """
    age = reference_year - df["Prod. year"]
    return df.drop(columns=["Prod. year"]).assign(Age=age)
# Derive the "Age" feature for both splits.
train_df = train_df.pipe(age_create)
test_df = test_df.pipe(age_create)
In [25]:
# List the distinct values of the derived "Age" column in ascending order.
sorted_df = train_df.sort_values(by="Age")
sorted_df["Age"].unique()
Out[25]:
In [26]:
# Draw the "Age" box plot on its own axes. The previous version placed it with
# plt.subplot(6, 1, i), relying on whatever value the counter `i` was left at by
# an earlier plotting cell.
fig, ax = plt.subplots(figsize=(4, 5))
train_df.boxplot(column="Age", ax=ax)
Out[26]:
Унитарное кодирование категориальных признаков¶
In [27]:
# Show the training-frame dtypes before encoding the categorical columns.
train_df.dtypes
Out[27]:
In [28]:
# Categorical columns passed to the one-hot encoder.
categorical_features_for_encoding = [
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Doors",
    "Wheel",
]


def onehot_encode(df):
    """Return the one-hot encoding of the categorical columns of ``df``.

    Only the columns named in ``categorical_features_for_encoding`` are encoded
    (dense output, first level of each feature dropped). ``df`` itself is not
    modified and the encoded columns are not merged back into it.
    """
    encoder = OneHotEncoder(sparse_output=False, drop="first")
    return encoder.fit_transform(df[categorical_features_for_encoding])


# The encoded matrix is only displayed here for inspection.
temp = onehot_encode(train_df)
temp
Out[28]:
Масштабирование признаков¶
In [29]:
# Show the training-frame dtypes before scaling the numeric columns.
train_df.dtypes
Out[29]:
In [30]:
# Numeric columns to standardize.
numeric_features_for_stardartization = [
    "Levy",
    "Engine volume",
    "Mileage",
    "Cylinders",
    "Airbags",
    "Age",
]

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
scaler = StandardScaler().fit(train_df[numeric_features_for_stardartization])
train_df[numeric_features_for_stardartization] = scaler.transform(
    train_df[numeric_features_for_stardartization]
)
test_df[numeric_features_for_stardartization] = scaler.transform(
    test_df[numeric_features_for_stardartization]
)
train_df
Out[30]:
Числовое кодирование категориальных признаков¶
In [31]:
# Show the training-frame dtypes before label-encoding the categorical columns.
train_df.dtypes
Out[31]:
In [32]:
# Categorical columns that are replaced by integer codes.
categorical_features_for_encoding = [
    "Model",
    "Manufacturer",
    "Color",
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Doors",
    "Wheel",
]


def label_encode(df, categories=None):
    """Replace the categorical columns of ``df`` with integer codes, in place.

    Args:
        df: frame to modify; only columns listed in
            ``categorical_features_for_encoding`` are touched.
        categories: optional dict mapping column name -> ordered category
            values, as returned by an earlier call. Columns missing from it
            are learned from ``df`` (sorted distinct values) and added to it.

    Returns:
        The dict of categories used, so the same value -> code mapping can be
        applied to another frame. Values absent from the known categories are
        encoded as -1.
    """
    if categories is None:
        categories = {}
    for column in df.columns:
        if column not in categorical_features_for_encoding:
            continue
        if column not in categories:
            categories[column] = pd.Index(df[column].dropna().unique()).sort_values()
        codes = pd.Categorical(df[column], categories=categories[column]).codes
        df[column] = codes.astype("int64")
    return categories


# Learn the codes on the training split and reuse them for the test split.
# Fitting a separate encoder per split gave the same category different codes
# in train and test, which makes the test features meaningless to the models.
train_categories = label_encode(train_df)
label_encode(test_df, train_categories)
train_df
Out[32]:
Формирование набора моделей¶
In [33]:
# Final dtype check of the training features before model fitting.
train_df.dtypes
Out[33]:
In [34]:
# Final dtype check of the test features before model fitting.
test_df.dtypes
Out[34]:
In [35]:
# Seed shared by the tree-based estimators.
random_state = 9
# Make scikit-learn transformers return pandas objects.
set_config(transform_output="pandas")

# Registry of candidate regressors: name -> {"model": estimator}. The training
# cell adds the fitted model, predictions and metrics to each entry.
models = {
    name: {"model": estimator}
    for name, estimator in (
        ("linear", linear_model.LinearRegression(n_jobs=-1)),
        (
            "linear_poly",
            make_pipeline(
                PolynomialFeatures(degree=2),
                linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
            ),
        ),
        (
            "linear_interact",
            make_pipeline(
                PolynomialFeatures(interaction_only=True),
                linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
            ),
        ),
        ("ridge", linear_model.RidgeCV()),
        (
            "decision_tree",
            tree.DecisionTreeRegressor(max_depth=7, random_state=random_state),
        ),
        ("knn", neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)),
        (
            "random_forest",
            ensemble.RandomForestRegressor(
                max_depth=7, random_state=random_state, n_jobs=-1
            ),
        ),
    )
}
Обучение и оценка моделей с помощью различных алгоритмов¶
In [36]:
# Fit every registered model on the training split and store the fitted model,
# its predictions and its error metrics back into the registry entry.
for model_name, entry in models.items():
    print(f"Model: {model_name}")
    model = entry["model"]
    # Fit on the DataFrame itself (not `.values`) so the input seen at fit time
    # matches the DataFrames passed to predict() below; mixing the two makes
    # scikit-learn warn about missing feature names.
    fitted_model = model.fit(train_df, price_y_train.values.ravel())
    y_train_pred = fitted_model.predict(train_df)
    y_test_pred = fitted_model.predict(test_df)
    entry["fitted"] = fitted_model
    entry["train_preds"] = y_train_pred
    entry["preds"] = y_test_pred
    entry["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(price_y_train, y_train_pred)
    )
    entry["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(price_y_test, y_test_pred)
    )
    # NOTE(review): this is the square root of the mean absolute error, which is
    # not a standard metric - confirm it is intended rather than plain MAE.
    entry["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(price_y_test, y_test_pred)
    )
    entry["R2_test"] = metrics.r2_score(price_y_test, y_test_pred)
Вывод результатов оценки¶
In [37]:
# Collect the per-model metrics into a table (one row per model) and display it
# sorted by test RMSE, with colour gradients on the error and score columns.
reg_metrics = pd.DataFrame.from_dict(models, orient="index").loc[
    :, ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
(
    reg_metrics.sort_values(by="RMSE_test")
    .style.background_gradient(
        cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
    )
    .background_gradient(
        cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"]
    )
)
Out[37]: