295 KiB
Классификация¶
Загрузка набора данных¶
import pandas as pd
from sklearn import set_config
set_config(transform_output="pandas")
random_state=9
df = pd.read_csv("data/titanic.csv", index_col="PassengerId")
df
Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации¶
Целевой признак -- Survived
from src.utils import split_stratified_into_train_val_test
X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
df, stratify_colname="Survived", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=random_state
)
display("X_train", X_train)
display("y_train", y_train)
display("X_test", X_test)
display("y_test", y_test)
Формирование конвейера для классификации данных¶
preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация
preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование
features_preprocessing -- трансформер для предобработки признаков
features_engineering -- трансформер для конструирования признаков
drop_columns -- трансформер для удаления колонок
features_postprocessing -- трансформер для унитарного кодирования новых признаков
pipeline_end -- основной конвейер предобработки данных и конструирования признаков
Конвейер выполняется последовательно.
Трансформер выполняет параллельно для указанного набора колонок.
Документация:
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from src.transformers import TitanicFeatures
columns_to_drop = ["Survived", "Name", "Cabin", "Ticket", "Embarked", "Parch", "Fare"]
num_columns = [
column
for column in df.columns
if column not in columns_to_drop and df[column].dtype != "object"
]
cat_columns = [
column
for column in df.columns
if column not in columns_to_drop and df[column].dtype == "object"
]
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
[
("imputer", num_imputer),
("scaler", num_scaler),
]
)
cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
[
("imputer", cat_imputer),
("encoder", cat_encoder),
]
)
features_preprocessing = ColumnTransformer(
verbose_feature_names_out=False,
transformers=[
("prepocessing_num", preprocessing_num, num_columns),
("prepocessing_cat", preprocessing_cat, cat_columns),
("prepocessing_features", cat_imputer, ["Name", "Cabin"]),
],
remainder="passthrough"
)
features_engineering = ColumnTransformer(
verbose_feature_names_out=False,
transformers=[
("add_features", TitanicFeatures(), ["Name", "Cabin"]),
],
remainder="passthrough",
)
drop_columns = ColumnTransformer(
verbose_feature_names_out=False,
transformers=[
("drop_columns", "drop", columns_to_drop),
],
remainder="passthrough",
)
features_postprocessing = ColumnTransformer(
verbose_feature_names_out=False,
transformers=[
("prepocessing_cat", preprocessing_cat, ["Cabin_type"]),
],
remainder="passthrough",
)
pipeline_end = Pipeline(
[
("features_preprocessing", features_preprocessing),
("features_engineering", features_engineering),
("drop_columns", drop_columns),
("features_postprocessing", features_postprocessing),
]
)
Демонстрация работы конвейера для предобработки данных при классификации¶
preprocessing_result = pipeline_end.fit_transform(X_train)
preprocessed_df = pd.DataFrame(
preprocessing_result,
columns=pipeline_end.get_feature_names_out(),
)
preprocessed_df
Формирование набора моделей для классификации¶
logistic -- логистическая регрессия
ridge -- гребневая регрессия
decision_tree -- дерево решений
knn -- k-ближайших соседей
naive_bayes -- наивный Байесовский классификатор
gradient_boosting -- метод градиентного бустинга (набор деревьев решений)
random_forest -- метод случайного леса (набор деревьев решений)
mlp -- многослойный персептрон (нейронная сеть)
Документация: https://scikit-learn.org/1.5/supervised_learning.html
from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree
class_models = {
"logistic": {"model": linear_model.LogisticRegression()},
# "ridge": {"model": linear_model.RidgeClassifierCV(cv=5, class_weight="balanced")},
"ridge": {"model": linear_model.LogisticRegression(penalty="l2", class_weight="balanced")},
"decision_tree": {
"model": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)
},
"knn": {"model": neighbors.KNeighborsClassifier(n_neighbors=7)},
"naive_bayes": {"model": naive_bayes.GaussianNB()},
"gradient_boosting": {
"model": ensemble.GradientBoostingClassifier(n_estimators=210)
},
"random_forest": {
"model": ensemble.RandomForestClassifier(
max_depth=11, class_weight="balanced", random_state=random_state
)
},
"mlp": {
"model": neural_network.MLPClassifier(
hidden_layer_sizes=(7,),
max_iter=500,
early_stopping=True,
random_state=random_state,
)
},
}
Обучение моделей на обучающем наборе данных и оценка на тестовом¶
from src.utils import run_classification
for model_name in class_models.keys():
print(f"Model: {model_name}")
model = class_models[model_name]["model"]
pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)]).fit(
X_train, y_train.values.ravel()
)
class_models[model_name] = run_classification(
pipeline, X_train, X_test, y_train, y_test
)
Сводная таблица оценок качества для использованных моделей классификации¶
Документация: https://scikit-learn.org/1.5/modules/model_evaluation.html
Матрица неточностей
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)
for index, key in enumerate(class_models.keys()):
c_matrix = class_models[key]["Confusion_matrix"]
disp = ConfusionMatrixDisplay(
confusion_matrix=c_matrix, display_labels=["Died", "Sirvived"]
).plot(ax=ax.flat[index])
disp.ax_.set_title(key)
plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)
plt.show()
Точность, полнота, верность (аккуратность), F-мера
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
[
"Precision_train",
"Precision_test",
"Recall_train",
"Recall_test",
"Accuracy_train",
"Accuracy_test",
"F1_train",
"F1_test",
]
]
class_metrics.sort_values(
by="Accuracy_test", ascending=False
).style.background_gradient(
cmap="plasma",
low=0.3,
high=1,
subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
cmap="viridis",
low=1,
high=0.3,
subset=[
"Precision_train",
"Precision_test",
"Recall_train",
"Recall_test",
],
)
ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
[
"Accuracy_test",
"F1_test",
"ROC_AUC_test",
"Cohen_kappa_test",
"MCC_test",
]
]
class_metrics.sort_values(by="ROC_AUC_test", ascending=False).style.background_gradient(
cmap="plasma",
low=0.3,
high=1,
subset=[
"ROC_AUC_test",
"MCC_test",
"Cohen_kappa_test",
],
).background_gradient(
cmap="viridis",
low=1,
high=0.3,
subset=[
"Accuracy_test",
"F1_test",
],
)
best_model = str(class_metrics.sort_values(by="MCC_test", ascending=False).iloc[0].name)
display(best_model)
Вывод данных с ошибкой предсказания для оценки¶
preprocessing_result = pipeline_end.transform(X_test)
preprocessed_df = pd.DataFrame(
preprocessing_result,
columns=pipeline_end.get_feature_names_out(),
)
y_pred = class_models[best_model]["preds"]
error_index = y_test[y_test["Survived"] != y_pred].index.tolist()
display(f"Error items count: {len(error_index)}")
error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]
error_df = X_test.loc[error_index].copy()
error_df.insert(loc=1, column="Predicted", value=error_predicted)
error_df.sort_index()
Пример использования обученной модели (конвейера) для предсказания¶
model = class_models[best_model]["pipeline"]
example_id = 450
test = pd.DataFrame(X_test.loc[example_id, :]).T
test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T
display(test)
display(test_preprocessed)
result_proba = model.predict_proba(test)[0]
result = model.predict(test)[0]
real = int(y_test.loc[example_id].values[0])
display(f"predicted: {result} (proba: {result_proba})")
display(f"real: {real}")
Подбор гиперпараметров методом поиска по сетке¶
https://www.kaggle.com/code/sociopath00/random-forest-using-gridsearchcv
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
from sklearn.model_selection import GridSearchCV
optimized_model_type = "random_forest"
random_forest_model = class_models[optimized_model_type]["pipeline"]
param_grid = {
"model__n_estimators": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],
"model__max_features": ["sqrt", "log2", 2],
"model__max_depth": [2, 3, 4, 5, 6, 7, 8, 9 ,10],
"model__criterion": ["gini", "entropy", "log_loss"],
}
gs_optomizer = GridSearchCV(
estimator=random_forest_model, param_grid=param_grid, n_jobs=-1
)
gs_optomizer.fit(X_train, y_train.values.ravel())
gs_optomizer.best_params_
Обучение модели с новыми гиперпараметрами
pipeline = gs_optomizer.best_estimator_.fit(X_train, y_train.values.ravel())
result = run_classification(pipeline, X_train, X_test, y_train, y_test)
Формирование данных для оценки старой и новой версии модели
optimized_metrics = pd.DataFrame(columns=list(result.keys()))
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
data=class_models[optimized_model_type]
)
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
data=result
)
optimized_metrics.insert(loc=0, column="Name", value=["Old", "New"])
optimized_metrics = optimized_metrics.set_index("Name")
Оценка параметров старой и новой модели
optimized_metrics[
[
"Precision_train",
"Precision_test",
"Recall_train",
"Recall_test",
"Accuracy_train",
"Accuracy_test",
"F1_train",
"F1_test",
]
].style.background_gradient(
cmap="plasma",
low=0.3,
high=1,
subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
cmap="viridis",
low=1,
high=0.3,
subset=[
"Precision_train",
"Precision_test",
"Recall_train",
"Recall_test",
],
)
optimized_metrics[
[
"Accuracy_test",
"F1_test",
"ROC_AUC_test",
"Cohen_kappa_test",
"MCC_test",
]
].style.background_gradient(
cmap="plasma",
low=0.3,
high=1,
subset=[
"ROC_AUC_test",
"MCC_test",
"Cohen_kappa_test",
],
).background_gradient(
cmap="viridis",
low=1,
high=0.3,
subset=[
"Accuracy_test",
"F1_test",
],
)
_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False
)
for index in range(0, len(optimized_metrics)):
c_matrix = optimized_metrics.iloc[index]["Confusion_matrix"]
disp = ConfusionMatrixDisplay(
confusion_matrix=c_matrix, display_labels=["Died", "Sirvived"]
).plot(ax=ax.flat[index])
plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)
plt.show()
Регрессия¶
Загрузка данных¶
import pandas as pd
density_train = pd.read_csv("data/density/density_train.csv", sep=";", decimal=",")
density_test = pd.read_csv("data/density/density_test.csv", sep=";", decimal=",")
display(density_train.head(3))
display(density_test.head(3))
Формирование выборок¶
density_y_train = pd.DataFrame(density_train["Density"], columns=["Density"])
density_train = density_train.drop(["Density"], axis=1)
display(density_train.head(3))
display(density_y_train.head(3))
density_y_test = pd.DataFrame(density_test["Density"], columns=["Density"])
density_test = density_test.drop(["Density"], axis=1)
display(density_test.head(3))
display(density_y_test.head(3))
Определение перечня алгоритмов решения задачи аппроксимации (регрессии)¶
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network
random_state = 9
models = {
"linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
"linear_poly": {
"model": make_pipeline(
PolynomialFeatures(degree=2),
linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
)
},
"linear_interact": {
"model": make_pipeline(
PolynomialFeatures(interaction_only=True),
linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
)
},
"ridge": {"model": linear_model.RidgeCV()},
"decision_tree": {
"model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
},
"knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
"random_forest": {
"model": ensemble.RandomForestRegressor(
max_depth=7, random_state=random_state, n_jobs=-1
)
},
"mlp": {
"model": neural_network.MLPRegressor(
activation="tanh",
hidden_layer_sizes=(3,),
max_iter=500,
early_stopping=True,
random_state=random_state,
)
},
}
Обучение и оценка моделей с помощью различных алгоритмов¶
from src.utils import run_regression
for model_name in models.keys():
print(f"Model: {model_name}")
X_train = density_train
X_test = density_test
y_train = density_y_train
y_test = density_y_test
model = models[model_name]["model"]
fitted_model = model.fit(
X_train.values, density_y_train.values.ravel()
)
models[model_name] = run_regression(fitted_model, X_train, X_test, y_train, y_test)
Вывод результатов оценки¶
reg_metrics = pd.DataFrame.from_dict(models, "index")[
["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
Вывод реального и "спрогнозированного" результата для обучающей и тестовой выборок¶
Получение лучшей модели
best_model = str(reg_metrics.sort_values(by="RMSE_test").iloc[0].name)
display(best_model)
Вывод для обучающей выборки
pd.concat(
[
density_train,
density_y_train,
pd.Series(
models[best_model]["train_preds"],
index=density_y_train.index,
name="DensityPred",
),
],
axis=1,
).head(5)
Вывод для тестовой выборки
pd.concat(
[
density_test,
density_y_test,
pd.Series(
models[best_model]["preds"],
index=density_y_test.index,
name="DensityPred",
),
],
axis=1,
).head(5)