Loading the dataset¶
In [69]:
import pandas as pd
import numpy as np
import math
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn import linear_model, tree, neighbors, ensemble
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.pipeline import make_pipeline
# CarsFeatures is a custom transformer from the local transformers.py module
from transformers import CarsFeatures

# Make every transformer return pandas DataFrames instead of numpy arrays
set_config(transform_output="pandas")

random_state = 9

df = pd.read_csv("../data/car-price-prediction.csv")
df = df.drop(columns=["Unnamed: 0"])  # drop the index column carried over from the CSV
df
Out[69]:
In [70]:
df.dtypes
Out[70]:
Splitting into training and test sets¶
In [71]:
# The full DataFrame is split; the Category column stays in train_df/test_df
# and is used here only to inspect the class balance
X = df
y = df["Category"]

train_df, test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Subset sizes:")
print(f"Training set: {train_df.shape[0]} rows")
print(train_df.Category.value_counts())
print(f"Test set: {test_df.shape[0]} rows")
print(test_df.Category.value_counts())
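Note that the split above is not stratified by Category, so the class ratios can differ slightly between the two subsets; the oversampling step below then rebalances each subset separately. A minimal alternative sketch, reusing the same X, y and split parameters with stratification added (the variable names are illustrative):

# Hypothetical stratified split: keeps the Category class ratios equal in both subsets
train_df_strat, test_df_strat, y_train_strat, y_test_strat = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)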
Oversampling¶
In [72]:
def oversample(df):
    """Balance the Category classes by randomly duplicating minority-class rows."""
    X = df.drop("Category", axis=1)
    y = df["Category"]

    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)  # type: ignore

    resampled_df = pd.concat([X_resampled, y_resampled], axis=1)
    return resampled_df


train_df_overs = oversample(train_df)
test_df_overs = oversample(test_df)

print("Subset sizes:")
print(f"Training set: {train_df_overs.shape[0]} rows")
print(train_df_overs.Category.value_counts())
print(f"Test set: {test_df_overs.shape[0]} rows")
print(test_df_overs.Category.value_counts())
In [73]:
# Regression targets: the models below predict the car price
price_y_train = train_df["Price"]
price_y_test = test_df["Price"]
Building the data preprocessing pipeline¶
In [74]:
# Columns excluded from the feature set: the target (Price) and
# high-cardinality categorical columns
columns_to_drop = ["Price", "Color", "Model", "Manufacturer"]

num_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and df[column].dtype != "object"
]
cat_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and df[column].dtype == "object"
]

# Numeric features: impute missing values with the median, then standardize
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

# Categorical features: impute missing values with a constant, then one-hot encode
cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("preprocessing_num", preprocessing_num, num_columns),
        ("preprocessing_cat", preprocessing_cat, cat_columns),
        # ("preprocessing_features", num_imputer, ["Prod. year"]),
    ],
    remainder="passthrough",
)

# Feature engineering with the custom CarsFeatures transformer
# (defined here but not included in pipeline_end below)
features_engineering = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("add_features", CarsFeatures(), ["Prod. year"]),
    ],
    remainder="passthrough",
)

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

# Postprocessing for the engineered "Age" feature
# (also not included in pipeline_end below)
features_postprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("preprocessing_num", preprocessing_num, ["Age"]),
    ],
    remainder="passthrough",
)

pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
    ]
)
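The CarsFeatures transformer used in features_engineering is imported from the local transformers module, whose source is not shown in this notebook. The sketch below is a hypothetical reconstruction only, assuming it derives a car's Age from the Prod. year column (which would explain why features_postprocessing standardizes an "Age" feature); the current_year default is an illustrative guess:

from sklearn.base import BaseEstimator, TransformerMixin

class CarsFeatures(BaseEstimator, TransformerMixin):
    # Hypothetical reconstruction -- the real transformers.py may differ
    def __init__(self, current_year=2024):
        self.current_year = current_year

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X["Age"] = self.current_year - X["Prod. year"]  # age of the car in years
        return X

    def get_feature_names_out(self, input_features=None):
        # required because set_config(transform_output="pandas") is active
        return np.asarray(["Prod. year", "Age"], dtype=object)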
Demonstrating the data preprocessing pipeline¶
In [75]:
preprocessing_result = pipeline_end.fit_transform(train_df)

# With transform_output="pandas" the result is already a DataFrame;
# re-wrapping it simply makes the output column names explicit
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)
preprocessed_df
Out[75]:
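A quick sanity check (not part of the original flow) can confirm that imputation and encoding left no missing values and that every remaining column is numeric:

# Illustrative check: the preprocessed frame should be NaN-free and fully numeric
print(preprocessed_df.isna().sum().sum())
print(preprocessed_df.dtypes.value_counts())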
Building the set of models¶
In [76]:
models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
}
Training and evaluating the models with different algorithms¶
In [77]:
for model_name in models:
    print(f"Model: {model_name}")
    model = models[model_name]["model"]

    # Chain the preprocessing pipeline with the estimator and fit on raw rows
    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
    model_pipeline = model_pipeline.fit(train_df, price_y_train.values.ravel())

    y_train_pred = model_pipeline.predict(train_df)
    y_test_pred = model_pipeline.predict(test_df)

    models[model_name]["fitted"] = model_pipeline
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(price_y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(price_y_test, y_test_pred)
    )
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(price_y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(price_y_test, y_test_pred)
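Each fitted entry in models is a complete pipeline that accepts raw DataFrame rows, so it can also be used for ad-hoc predictions. A small usage sketch (the model key and the number of rows are arbitrary choices for illustration):

# Predict prices for a few raw test rows with one of the fitted pipelines
sample_preds = models["random_forest"]["fitted"].predict(test_df.head(3))
print(sample_preds)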
Displaying the evaluation results¶
In [78]:
reg_metrics = pd.DataFrame.from_dict(models, "index")[
    ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
Out[78]: