219 KiB
219 KiB
Загрузка набора данных¶
In [533]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn import linear_model, tree, neighbors, ensemble
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.pipeline import make_pipeline
# Notebook-wide settings: the seed reused by the estimators below, and pandas
# DataFrames as the output type of scikit-learn transformers.
random_state = 9
set_config(transform_output="pandas")

# Read the raw table and discard the "ID" column straight away.
df = pd.read_csv("../data/car_price_prediction.csv").drop(columns=["ID"])
df
Out[533]:
Анализ датасета и очистка данных¶
In [534]:
# Inspect the dtype pandas inferred for each column of the raw table.
df.dtypes
Out[534]:
In [535]:
# Strip the "Turbo" marker from the text values so the column can be parsed
# as numbers (the turbo information itself is not kept anywhere).
engine_volume_text = df["Engine volume"].str.replace("Turbo", "")
df["Engine volume"] = pd.to_numeric(engine_volume_text)
df["Engine volume"].unique()
Out[535]:
In [536]:
# Remove the "km" unit suffix from the text values, then store the mileage as integers.
mileage_text = df["Mileage"].str.replace("km", "")
df["Mileage"] = mileage_text.astype("int64")
df["Mileage"].unique()
Out[536]:
In [537]:
# Values that are exactly "-" are replaced with "0" before the integer cast.
levy_text = df["Levy"].replace("-", "0")
df["Levy"] = levy_text.astype("int64")
df["Levy"].unique()
Out[537]:
In [538]:
# Cast the cylinder count to 64-bit integers, then list the distinct values.
df["Cylinders"] = df["Cylinders"].astype("int64")
df["Cylinders"].unique()
Out[538]:
In [539]:
# List the raw "Doors" labels before they are remapped.
df["Doors"].unique()
Out[539]:
In [540]:
# Raw door labels -> readable category names. Any label that is not a key of
# this mapping becomes NaN after .map().
door_labels = {
    "02-Mar": "Двухдверный",
    "04-May": "Четырехдверный",
    ">5": "Многодверный",
}
df["Doors"] = df["Doors"].map(door_labels)
df["Doors"].unique()
Out[540]:
In [541]:
# Unique prices in ascending order, to look for implausible values.
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[541]:
In [542]:
print(f"Количество строк до удаления некорректных значений: {len(df)}")
# Rows priced below 500 are treated as incorrect records and removed.
has_plausible_price = df["Price"].ge(500)
df = df[has_plausible_price]
print(f"Количество строк после удаления некорректных значений: {len(df)}")
In [543]:
# Unique prices in ascending order after the rows with Price below 500 were removed.
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[543]:
In [544]:
# Unique production years in ascending order.
sorted_df = df.sort_values(by="Prod. year")
sorted_df["Prod. year"].unique()
Out[544]:
In [545]:
# Display the frame after the type conversions and the price filter.
df
Out[545]:
Очистка дубликатов и пропущенных значений¶
In [546]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()
Out[546]:
In [547]:
# Drop exact duplicate rows (the first occurrence is kept).
# Reassign instead of using inplace=True: `df` is the result of a boolean
# filter, and mutating such a frame in place can raise SettingWithCopyWarning.
df = df.drop_duplicates()
In [548]:
# Count missing values per column.
df.isna().sum()
Out[548]:
Очистка выбросов¶
In [549]:
# Re-check the column dtypes before looking for outliers.
df.dtypes
Out[549]:
In [550]:
# Numeric columns that are inspected for outliers and later filtered by IQR.
numeric_features_with_outliers = [
    "Price",
    "Levy",
    "Mileage",
    "Prod. year",
]

# Draw every boxplot into ONE figure, one row per column. Creating the figure
# inside the loop would produce a separate, mostly empty figure per column.
fig, axes = plt.subplots(
    nrows=len(numeric_features_with_outliers),
    ncols=1,
    figsize=(4, 5 * len(numeric_features_with_outliers)),
    squeeze=False,  # keep `axes` two-dimensional even for a single column
)
for ax, col in zip(axes.ravel(), numeric_features_with_outliers):
    df.boxplot(column=col, ax=ax)
fig.tight_layout()
In [551]:
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where the quartiles are computed from ``df[column]``.
    Rows whose value is missing fail both comparisons and are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall within the fences, with their original index.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = factor * (q3 - q1)
    within_fences = (values >= q1 - margin) & (values <= q3 + margin)
    return df[within_fences]
print(f"Количество строк до удаления выбросов: {len(df)}")
# The columns are filtered one after another, so each column's quartiles are
# computed on the rows that survived the previous filters.
for column in numeric_features_with_outliers:
    df = remove_outliers(df=df, column=column)
print(f"Количество строк после удаления выбросов: {len(df)}")
In [552]:
# Boxplots of the same columns after filtering, drawn into ONE figure with
# one row per column. Creating the figure inside the loop would produce a
# separate, mostly empty figure per column.
fig, axes = plt.subplots(
    nrows=len(numeric_features_with_outliers),
    ncols=1,
    figsize=(4, 5 * len(numeric_features_with_outliers)),
    squeeze=False,  # keep `axes` two-dimensional even for a single column
)
for ax, col in zip(axes.ravel(), numeric_features_with_outliers):
    df.boxplot(column=col, ax=ax)
fig.tight_layout()
In [553]:
# Save the cleaned frame. The index is written as well because index=False is
# not passed. Note the hyphenated file name: it differs from the
# underscore-named input file read at the top of the notebook.
df.to_csv("../data/car-price-prediction.csv")
Формирование конвейера для предобработки данных¶
In [554]:
# Column dtypes right before the preprocessing pipeline is defined.
df.dtypes
Out[554]:
In [555]:
# Columns excluded from the model input; the last pipeline step removes them.
columns_to_drop = ["Model", "Color", "Doors", "Cylinders", "Mileage"]

# Split the remaining columns by dtype: object columns are categorical,
# everything else is numeric.
# NOTE(review): "Price" is not in columns_to_drop, so (being numeric) it ends
# up in num_columns and is imputed and standardised together with the
# features — confirm that scaling the target is intended.
kept_columns = [column for column in df.columns if column not in columns_to_drop]
num_columns = [column for column in kept_columns if df[column].dtype != "object"]
cat_columns = [column for column in kept_columns if df[column].dtype == "object"]

# Categorical columns routed to one-hot encoding / ordinal encoding.
cat_cols_for_one_hot_enc = [
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Wheel",
    "Manufacturer",
]
cat_cols_for_num_enc = []

# Numeric branch: median imputation followed by standardisation.
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    steps=[("imputer", num_imputer), ("scaler", num_scaler)]
)

# Categorical branches: fill gaps with the constant "unknown", then encode.
cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_one_hot_encoder = OneHotEncoder(
    drop="first", handle_unknown="ignore", sparse_output=False
)
preprocessing_one_hot = Pipeline(
    steps=[("imputer", cat_imputer), ("encoder", cat_one_hot_encoder)]
)
preprocessing_label_enc = Pipeline(
    steps=[("imputer", cat_imputer), ("encoder", OrdinalEncoder())]
)

# Step 1: encode/scale the selected columns; every other column passes through unchanged.
features_preprocessing = ColumnTransformer(
    transformers=[
        ("prepocessing_one_hot", preprocessing_one_hot, cat_cols_for_one_hot_enc),
        ("prepocessing_label_enc", preprocessing_label_enc, cat_cols_for_num_enc),
        ("prepocessing_num", preprocessing_num, num_columns),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Step 2: drop the excluded columns that step 1 passed through.
drop_columns = ColumnTransformer(
    transformers=[("drop_columns", "drop", columns_to_drop)],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

pipeline_end = Pipeline(
    steps=[
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
    ]
)
Демонстрация работы конвейера для предобработки данных¶
In [556]:
# Fit the preprocessing pipeline on the cleaned frame and replace `df` with
# the transformed table.
# NOTE(review): the pipeline is fitted on the full frame, i.e. before the
# train/test split performed further down.
preprocessing_result = pipeline_end.fit_transform(df)
feature_names = pipeline_end.get_feature_names_out()
df = pd.DataFrame(preprocessing_result, columns=feature_names)
df
Out[556]:
Разбиение на выборки¶
In [557]:
# 70/30 hold-out split.
# NOTE(review): the seed here is the literal 42, not the notebook-level `random_state`.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.3)

print("Размеры выборок:")
print(f"Обучающая выборка: {len(train_df)} записей")
print(f"Тестовая выборка: {len(test_df)} записей")
In [558]:
# Separate the target column from the feature tables.
price_y_train = train_df["Price"]
price_y_test = test_df["Price"]

# Reassign rather than drop in place: the split results are derived from `df`,
# and mutating them with inplace=True can raise SettingWithCopyWarning.
train_df = train_df.drop(columns=["Price"])
test_df = test_df.drop(columns=["Price"])
train_df
Out[558]:
Формирование набора моделей¶
In [559]:
# Candidate regressors, keyed by the name used in the results table.
estimators = {
    "linear": linear_model.LinearRegression(n_jobs=-1),
    "ridge": linear_model.RidgeCV(),
    "decision_tree": tree.DecisionTreeRegressor(random_state=random_state),
    "knn": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1),
    "random_forest": ensemble.RandomForestRegressor(
        random_state=random_state, n_jobs=-1
    ),
}

# Each entry starts with just the estimator; the training loop adds the fitted
# model, predictions and metrics under further keys.
models = {name: {"model": estimator} for name, estimator in estimators.items()}
Обучение и оценка моделей с помощью различных алгоритмов¶
In [560]:
# Fit every candidate model and store its predictions and metrics next to it.
for model_name, entry in models.items():
    print(f"Model: {model_name}")
    model = entry["model"]

    # Fit on the DataFrame itself (not on .values) so the estimator sees the
    # same feature names at fit and predict time; mixing a bare array for fit
    # with DataFrames for predict triggers scikit-learn's feature-name warning.
    fitted_model = model.fit(train_df, price_y_train)
    y_train_pred = fitted_model.predict(train_df)
    y_test_pred = fitted_model.predict(test_df)

    entry["fitted"] = fitted_model
    entry["train_preds"] = y_train_pred
    entry["preds"] = y_test_pred

    entry["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(price_y_train, y_train_pred)
    )
    entry["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(price_y_test, y_test_pred)
    )
    # NOTE(review): this is the square root of the mean absolute error, so it
    # is not in the units of the target — confirm this metric is intended.
    entry["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(price_y_test, y_test_pred)
    )
    entry["R2_test"] = metrics.r2_score(price_y_test, y_test_pred)
Вывод результатов оценки¶
In [561]:
# One row per model, restricted to the metric columns.
metric_columns = ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
reg_metrics = pd.DataFrame.from_dict(models, "index")[metric_columns]

# Show the table ordered by test RMSE, with a colour gradient per metric group.
styled_metrics = reg_metrics.sort_values(by="RMSE_test").style
styled_metrics = styled_metrics.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
)
styled_metrics.background_gradient(
    cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"]
)
Out[561]:
In [562]:
# The best model is the first row once the table is ordered by test RMSE.
ranked_by_test_rmse = reg_metrics.sort_values(by="RMSE_test")
best_model = str(ranked_by_test_rmse.index[0])
display(best_model)
Вывод предсказаний для выборок¶
In [563]:
# Training features, true price and the best model's prediction side by side.
train_price_pred = pd.Series(
    models[best_model]["train_preds"],
    index=price_y_train.index,
    name="PricePred",
)
pd.concat([train_df, price_y_train, train_price_pred], axis=1).head(5)
Out[563]:
In [564]:
# Test features, true price and the best model's prediction side by side.
test_price_pred = pd.Series(
    models[best_model]["preds"],
    index=price_y_test.index,
    name="PricePred",
)
pd.concat([test_df, price_y_test, test_price_pred], axis=1).head(5)
Out[564]: