129 KiB
129 KiB
Загрузка данных¶
In [94]:
import pandas as pd
from sklearn import set_config

# Keep transformer outputs as pandas DataFrames so column names survive the pipeline.
set_config(transform_output="pandas")

random_state = 9

# Load the raw dataset; the "ID" column becomes the index.
df = pd.read_csv("data/car_price_prediction.csv", index_col="ID")

# "Mileage" is a string like "123456 km" -> numeric kilometres.
df["distance_nokm"] = pd.to_numeric(df["Mileage"].str.replace(" km", "", regex=False))

# Cap extreme mileage outliers at 350,000 km.
df["distance_norm"] = df["distance_nokm"].clip(0, 350000)
df.boxplot(column="distance_norm")

# "Levy" uses "-" as a missing-value marker; map it to 0 before conversion.
# NOTE(review): assumes "-" only appears as the entire cell value — confirm
# against the raw data (a negative number like "-5" would become "05").
df["Levy"] = pd.to_numeric(df["Levy"].str.replace("-", "0", regex=False))
df
Out[94]:
In [95]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# Fixed: import StandardScaler from its canonical module, sklearn.preprocessing.
# It was previously pulled from sklearn.discriminant_analysis, which only
# works via an accidental internal re-export.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from transformers import TitanicFeatures  # unused in this cell; kept in case other cells rely on it

# Raw/redundant columns removed at the end of the pipeline
# (remainder="passthrough" keeps everything else, including the "Price" target).
columns_to_drop = [
    "Doors",
    "Mileage",
    "Levy",
    "Fuel type",
    "Drive wheels",
    "Engine volume",
    "Wheel",
    "distance_nokm",
    "Model",
    "Cylinders",
]

# Numeric features: median-impute, then standardize.
num_columns = [
    "Airbags",
    "distance_norm",
]

# Categorical features: constant-impute, then one-hot encode.
cat_columns = [
    "Color",
    "Gear box type",
    "Leather_interior",
    "Manufacturer",
    "Category",
]

num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

# drop="first" avoids collinear dummy columns; handle_unknown="ignore" keeps
# transform() from failing on categories unseen at fit time.
cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

# Fixed typo in transformer names: "prepocessing" -> "preprocessing"
# (internal step names only; output column names are unaffected because
# verbose_feature_names_out=False).
features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("preprocessing_num", preprocessing_num, num_columns),
        ("preprocessing_cat", preprocessing_cat, cat_columns),
    ],
    remainder="passthrough",
)

# Drop the raw columns after the engineered features have been produced.
drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
    ]
)
In [96]:
# Fit the full preprocessing pipeline on the data and materialise the
# transformed frame under the same name the downstream cells expect.
preprocessing_result = pipeline_end.fit_transform(df)
transformed_columns = pipeline_end.get_feature_names_out()
df = pd.DataFrame(preprocessing_result, columns=transformed_columns)
Формирование выборок¶
In [97]:
from utils import split_stratified_into_train_val_test
from utils import split_stratified_into_train_val_test

# 80/0/20 train/val/test split, stratified on "Airbags".
# NOTE(review): "Airbags" was standardized upstream — confirm that stratifying
# on its scaled (but still discrete) values is intended.
X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df,
    target_colname="Price",
    stratify_colname="Airbags",
    frac_train=0.80,
    frac_val=0,
    frac_test=0.20,
    random_state=random_state,
)

# Show each split with its label.
for split_label, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    display(split_label, split_data)
In [98]:
X_test  # preview the held-out feature frame
Out[98]:
In [99]:
X_train  # preview the training feature frame
Out[99]:
In [100]:
y_test  # preview the held-out target values
Out[100]:
In [101]:
y_train  # preview the training target values
Out[101]:
Определение перечня алгоритмов решения задачи аппроксимации (регрессии)¶
In [102]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network

random_state = 9  # re-declared so this cell runs standalone; same value as the load cell

# Candidate regressors keyed by a short name. The training loop below adds
# "fitted", prediction arrays, and metric entries to each dict.
models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            # fit_intercept=False: PolynomialFeatures already emits a bias column.
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
}
Определение функции для стандартизации значений в столбце "distance_norm" (пробег) для MLP¶
In [103]:
from pandas import DataFrame
from sklearn import preprocessing
import numpy as np
# Shared scaler instance used by the std_temp* helpers below.
stndart_scaler = preprocessing.StandardScaler()

def std_temp(df: DataFrame) -> DataFrame:
    """Standardize the "distance_norm" column in place and return df.

    Fixed: pandas Series lost its ``.reshape`` method in pandas 1.0, so the
    original ``df["distance_norm"].reshape(-1, 1)`` raised AttributeError.
    The values must be taken out as a NumPy array before reshaping.
    """
    column_2d = df["distance_norm"].to_numpy().reshape(-1, 1)
    # np.array(...) unwraps the scaler output (a DataFrame under the global
    # transform_output="pandas" config) back to a flat array for assignment.
    df["distance_norm"] = np.array(stndart_scaler.fit_transform(column_2d)).reshape(
        df["distance_norm"].shape
    )
    return df
def std_temp2(df: DataFrame) -> DataFrame:
    """Alternate standardizer for "distance_norm" (appears unused in this notebook).

    NOTE(review): with the global ``transform_output="pandas"`` sklearn config,
    ``fit_transform`` returns a one-column DataFrame; assigning that directly to
    a single column may not behave as intended — confirm before using. The
    pattern in std_temp/std_temp3 is the safer one.
    """
    # Convert the column to a 2-D NumPy array as required by the scaler.
    df["distance_norm"] = stndart_scaler.fit_transform(
        df["distance_norm"].values.reshape(-1, 1)  # reshape to (n_samples, 1)
    )
    return df
def std_temp3(df: DataFrame) -> DataFrame:
    """Standardize "distance_norm" in place, filling NaNs with the column mean first.

    Fixed: ``df["col"].fillna(..., inplace=True)`` is chained-assignment —
    deprecated in pandas 2.x and a silent no-op under copy-on-write. Assign
    the filled Series back to the frame instead.
    """
    # Fill missing values with the column mean before scaling.
    df["distance_norm"] = df["distance_norm"].fillna(df["distance_norm"].mean())
    # Scaler expects a 2-D (n_samples, 1) array.
    df["distance_norm"] = stndart_scaler.fit_transform(
        df["distance_norm"].values.reshape(-1, 1)
    )
    return df
In [104]:
# Sanity check: preprocessing should have left no missing values in either split.
for split_frame in (X_train, X_test):
    print(split_frame.isnull().sum())
Обучение и оценка моделей с помощью различных алгоритмов¶
In [105]:
import math
from pandas import DataFrame
from sklearn import metrics

# Train every candidate model and record train/test metrics in `models`.
# Fixed: the loop previously rebound the GLOBAL X_train/X_test on every
# iteration (a hidden-state hazard across cells); it now works on loop-local
# copies and leaves the shared splits untouched.
for model_name in models:
    print(f"Model: {model_name}")

    features_train: DataFrame = X_train.copy()
    features_test: DataFrame = X_test.copy()

    # The MLP (when enabled) needs standardized inputs; other models use the
    # features as-is.
    if model_name == "mlp":
        features_train = std_temp(features_train)
        features_test = std_temp(features_test)

    fitted_model = models[model_name]["model"].fit(
        features_train.values, y_train.values.ravel()
    )
    y_train_pred = fitted_model.predict(features_train.values)
    y_test_pred = fitted_model.predict(features_test.values)

    models[model_name]["fitted"] = fitted_model
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(y_test, y_test_pred)
    )
    # "RMAE" = sqrt(MAE): unconventional, kept as-is for comparability with
    # the notebook's earlier runs.
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(y_test, y_test_pred)
Вывод результатов оценки¶
In [106]:
# Collect the recorded metrics into a frame, sort by test RMSE, and
# colour-code the table for quick visual comparison.
metric_columns = ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
reg_metrics = pd.DataFrame.from_dict(models, "index")[metric_columns]
styled_metrics = reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
)
styled_metrics.background_gradient(
    cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"]
)
Out[106]:
Вывод реального и "спрогнозированного" результата для обучающей и тестовой выборок¶
Получение лучшей модели
In [107]:
# Name of the model with the lowest test RMSE (first row label after sorting).
best_model = str(reg_metrics.sort_values(by="RMSE_test").index[0])
display(best_model)
Вывод для обучающей выборки
In [ ]:
# Side-by-side view of features, actual price, and predicted price
# for the training split (first five rows).
train_predictions = pd.Series(
    models[best_model]["train_preds"],
    index=y_train.index,
    name="PricePred",
)
pd.concat([X_train, y_train, train_predictions], axis=1).head(5)
Out[ ]:
Вывод для тестовой выборки
In [ ]:
# Side-by-side view of features, actual price, and predicted price
# for the test split (first five rows).
test_predictions = pd.Series(
    models[best_model]["preds"],
    index=y_test.index,
    name="PricePred",
)
pd.concat([X_test, y_test, test_predictions], axis=1).head(5)
Out[ ]: