Алексей Крюков 55d4ecc0de lab_done
2024-12-14 11:56:36 +04:00

299 KiB
Raw Permalink Blame History

Вариант: Список людей.

ссылка на датасет: https://www.kaggle.com/datasets/imoore/age-dataset

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,
    matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score

# Функция для применения oversampling
def apply_oversampling(X, y):
    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    return X_resampled, y_resampled

# Функция для применения undersampling
def apply_undersampling(X, y):
    undersampler = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = undersampler.fit_resample(X, y)
    return X_resampled, y_resampled

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test


df = pd.read_csv("../static/csv/AgeDataset-V1.csv", nrows=1000)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Id                 1000 non-null   object 
 1   Name               1000 non-null   object 
 2   Short description  1000 non-null   object 
 3   Gender             995 non-null    object 
 4   Country            962 non-null    object 
 5   Occupation         998 non-null    object 
 6   Birth year         1000 non-null   int64  
 7   Death year         999 non-null    float64
 8   Manner of death    372 non-null    object 
 9   Age of death       999 non-null    float64
dtypes: float64(2), int64(1), object(7)
memory usage: 78.3+ KB

Как бизнес-цели выделим следующие 2 варианта: 1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. 2) Классификация людей по возрастным группам, что может быть полезно для рекламных целей

Выполним подготовку данных

In [6]:
df.fillna({"Gender": "NaN", "Country": "NaN", "Occupation" : "NaN", "Manner of death" : "NaN"}, inplace=True)
df = df.dropna()
df['Country'] = df['Country'].str.split('; ')
df = df.explode('Country')
data = df.copy()


value_counts = data["Country"].value_counts()
rare = value_counts[value_counts < 100].index
data = data[~data["Country"].isin(rare)]

data.drop(data[~data['Gender'].isin(['Male', 'Female'])].index, inplace=True)

data1 = pd.get_dummies(data, columns=['Gender', 'Country', 'Occupation'], drop_first=True)

Определить достижимый уровень качества модели для каждой задачи. На основе имеющихся данных уровень качества моделей не будет высоким, поскольку все таки длительность жизни лишь примерная и точно ее угадать невозможно.

Выберем ориентиры для наших 2х задач: 1)Регрессии - средний возраст человека 2)Классификации - аиболее часто встречающаяся возрастная группа

Построим конвейер.

In [7]:
print(data.columns)
Index(['Id', 'Name', 'Short description', 'Gender', 'Country', 'Occupation',
       'Birth year', 'Death year', 'Manner of death', 'Age of death'],
      dtype='object')
In [8]:
X_reg = data1.drop(['Id', 'Name', 'Age of death', 'Short description', 'Manner of death'], axis=1)
y_reg = data1['Age of death']

# Разделение данных
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Выбор моделей для регрессии
models_reg = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

# Создание конвейера для регрессии
pipelines_reg = {}
for name, model in models_reg.items():
    pipelines_reg[name] = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

# Определение сетки гиперпараметров для регрессии
param_grids_reg = {
    'Linear Regression': {},
    'Random Forest Regressor': {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [None, 10, 20, 30]
    },
    'Gradient Boosting Regressor': {
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    }
}

# Настройка гиперпараметров для регрессии
best_models_reg = {}
for name, pipeline in pipelines_reg.items():
    grid_search = GridSearchCV(pipeline, param_grids_reg[name], cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_reg, y_train_reg)
    best_models_reg[name] = grid_search.best_estimator_
    print(f'Best parameters for {name}: {grid_search.best_params_}')
Best parameters for Linear Regression: {}
Best parameters for Random Forest Regressor: {'model__max_depth': None, 'model__n_estimators': 200}
Best parameters for Gradient Boosting Regressor: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 300}
In [9]:
# Обучение моделей и оценка качества
results_reg = {}

for model_name in best_models_reg.keys():
    print(f"Model: {model_name}")
    model_pipeline = best_models_reg[model_name]

    y_train_predict = model_pipeline.predict(X_train_reg)
    y_test_predict = model_pipeline.predict(X_test_reg)

    results_reg[model_name] = {
        "pipeline": model_pipeline,
        "preds_train": y_train_predict,
        "preds_test": y_test_predict,
        "MSE_train": mean_squared_error(y_train_reg, y_train_predict),
        "MSE_test": mean_squared_error(y_test_reg, y_test_predict),
        "R2_train": r2_score(y_train_reg, y_train_predict),
        "R2_test": r2_score(y_test_reg, y_test_predict),
        "MAE_train": mean_absolute_error(y_train_reg, y_train_predict),
        "MAE_test": mean_absolute_error(y_test_reg, y_test_predict),
    }

# Теперь результаты каждой модели находятся в results_reg
print(results_reg)
Model: Linear Regression
Model: Random Forest Regressor
Model: Gradient Boosting Regressor
{'Linear Regression': {'pipeline': Pipeline(steps=[('scaler', StandardScaler()), ('model', LinearRegression())]), 'preds_train': array([29., 53., 82., 55., 62., 42., 21., 75., 52., 32., 36., 73., 37.,
       46., 72., 27., 71., 83., 55., 22., 65., 47., 57., 78., 91., 60.,
       72., 67., 61., 88., 60., 88., 91., 52., 84., 93., 44., 58., 90.,
       94., 59., 92., 84., 64., 51., 93., 79., 81., 72., 78., 74., 27.,
       92., 46., 72., 78., 96., 95., 77., 49., 87., 90., 88., 88., 65.,
       42., 74., 87., 41., 77., 92., 69., 94., 49., 85., 79., 73., 43.,
       55., 73., 51., 79., 52., 63., 60., 68., 70., 39., 59., 79., 27.,
       57., 78., 80., 19., 69., 68., 39., 60., 67., 68., 87., 81., 62.,
       73., 23., 69., 67., 47., 45., 90., 82., 87., 70., 77., 67., 66.,
       84., 64., 54., 46., 75., 84., 36., 72., 42., 52., 48., 55., 89.,
       64., 80., 28., 52., 81., 63., 74., 68., 53., 66., 36., 80., 75.,
       78., 34., 68., 71., 78., 83., 93., 68., 63., 79., 51., 68., 65.,
       87., 66., 52., 63., 88., 83., 85., 39., 78., 80., 65., 71., 76.,
       84., 51., 86., 65., 66., 80., 56., 67., 23., 49., 89., 69., 92.,
       63., 83., 62., 82., 34., 73., 59., 79., 60., 44., 53., 75., 71.,
       64., 60., 62., 85., 95., 90., 56., 94., 99., 51., 59., 88., 21.,
       65., 97., 67., 42., 93., 54., 56., 56., 68., 95., 55., 69., 65.,
       59., 56., 72., 28., 75., 56., 65., 74., 75., 80., 82.]), 'preds_test': array([56.        , 22.        , 87.        , 88.        , 25.        ,
       54.        , 87.        , 85.        , 60.        , 88.        ,
       42.        , 72.        , 69.        , 82.        , 81.        ,
       48.        , 94.        , 56.        , 65.        , 86.        ,
       74.        , 62.        , 99.        , 66.        , 74.        ,
       59.        , 60.        , 64.        , 64.        , 62.        ,
       71.        , 72.        , 77.        , 85.        , 81.        ,
       81.        , 55.84443686, 40.        , 69.        , 66.        ,
       95.        , 40.        , 81.        , 75.        , 91.        ,
       82.        , 76.        , 66.        , 54.        , 59.        ,
       80.        , 45.        , 44.        , 92.        , 67.        ,
       86.        , 89.        , 89.        , 53.        ]), 'MSE_train': np.float64(7.202572085669638e-26), 'MSE_test': np.float64(0.0004101676300062632), 'R2_train': 1.0, 'R2_test': 0.9999986319859576, 'MAE_train': np.float64(2.193188159818137e-13), 'MAE_test': np.float64(0.0026366633706367913)}, 'Random Forest Regressor': {'pipeline': Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 RandomForestRegressor(n_estimators=200, random_state=42))]), 'preds_train': array([32.415, 54.05 , 80.21 , 53.42 , 61.825, 48.38 , 31.21 , 75.14 ,
       53.66 , 44.475, 37.86 , 69.925, 40.7  , 49.45 , 70.985, 31.605,
       71.845, 77.67 , 55.85 , 23.27 , 64.665, 51.965, 58.55 , 67.885,
       91.005, 60.08 , 72.905, 68.86 , 63.055, 87.615, 58.84 , 87.08 ,
       90.725, 52.17 , 80.965, 91.69 , 48.03 , 58.71 , 89.26 , 92.89 ,
       59.015, 91.75 , 82.57 , 63.895, 55.675, 92.27 , 79.905, 78.265,
       72.795, 76.885, 73.87 , 33.285, 91.75 , 47.07 , 72.47 , 76.92 ,
       94.385, 90.31 , 74.365, 50.7  , 85.73 , 83.985, 87.175, 86.815,
       62.32 , 46.985, 67.69 , 88.02 , 41.14 , 76.32 , 87.4  , 66.825,
       92.305, 51.86 , 83.51 , 80.005, 70.49 , 44.39 , 56.58 , 73.695,
       52.235, 79.01 , 50.495, 65.565, 62.43 , 66.77 , 69.69 , 40.495,
       63.64 , 79.755, 28.875, 59.86 , 79.155, 81.925, 33.975, 69.73 ,
       71.19 , 47.59 , 54.73 , 67.71 , 69.41 , 84.08 , 80.425, 60.615,
       73.29 , 24.925, 68.7  , 66.365, 46.64 , 50.88 , 89.61 , 78.34 ,
       85.715, 67.64 , 79.165, 64.275, 65.885, 81.285, 64.4  , 57.835,
       49.395, 75.095, 84.47 , 40.765, 62.5  , 44.53 , 56.76 , 47.27 ,
       53.965, 87.91 , 64.765, 71.42 , 28.32 , 55.25 , 80.665, 60.975,
       73.21 , 67.89 , 54.7  , 67.895, 44.685, 79.79 , 75.025, 76.505,
       38.565, 65.68 , 72.485, 77.19 , 84.71 , 91.46 , 67.54 , 61.705,
       77.92 , 53.51 , 69.705, 66.27 , 87.135, 65.785, 57.23 , 65.945,
       88.88 , 81.18 , 82.655, 40.31 , 77.985, 80.435, 62.81 , 73.05 ,
       75.88 , 82.215, 58.5  , 81.83 , 61.155, 65.455, 76.965, 57.17 ,
       66.44 , 34.205, 48.505, 87.855, 64.585, 91.78 , 63.915, 77.265,
       64.48 , 81.42 , 40.195, 71.515, 57.38 , 74.945, 57.52 , 47.795,
       51.94 , 74.025, 68.09 , 64.38 , 62.535, 59.58 , 80.365, 93.635,
       89.98 , 56.94 , 92.79 , 97.075, 53.34 , 58.425, 87.805, 24.685,
       64.205, 94.825, 65.02 , 43.075, 91.215, 56.39 , 56.38 , 57.195,
       68.27 , 90.75 , 56.04 , 68.77 , 65.135, 58.49 , 55.54 , 73.21 ,
       40.095, 75.28 , 55.98 , 63.535, 74.15 , 74.775, 76.405, 78.46 ]), 'preds_test': array([56.47 , 42.345, 87.025, 86.56 , 40.42 , 49.335, 86.02 , 81.75 ,
       62.32 , 90.2  , 63.72 , 74.3  , 67.43 , 58.9  , 83.06 , 46.655,
       92.365, 32.505, 71.02 , 89.43 , 63.06 , 63.645, 92.385, 53.625,
       71.25 , 68.73 , 66.38 , 70.14 , 62.755, 65.02 , 72.21 , 73.205,
       74.06 , 87.985, 83.44 , 78.265, 53.98 , 52.355, 61.145, 69.6  ,
       89.645, 55.83 , 77.695, 59.03 , 89.61 , 83.235, 70.58 , 71.92 ,
       69.175, 58.48 , 75.345, 59.55 , 52.395, 85.715, 65.425, 87.95 ,
       83.12 , 87.76 , 55.63 ]), 'MSE_train': np.float64(10.585386853448275), 'MSE_test': np.float64(73.41657415254235), 'R2_train': 0.9680762587778711, 'R2_test': 0.7551369317321679, 'MAE_train': np.float64(2.189698275862069), 'MAE_test': np.float64(6.021271186440678)}, 'Gradient Boosting Regressor': {'pipeline': Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 GradientBoostingRegressor(max_depth=5, n_estimators=300,
                                           random_state=42))]), 'preds_train': array([28.72956041, 53.12127389, 82.08536004, 55.09719521, 61.75388192,
       41.97235916, 21.14883789, 74.54323397, 52.25364062, 32.10924489,
       36.08384782, 72.70845527, 37.11384401, 46.04284373, 72.05464788,
       27.19660712, 71.03059415, 82.78900252, 54.82581543, 22.08471572,
       64.95177947, 46.91864608, 57.0614107 , 77.77389579, 90.91849183,
       60.00063443, 72.32250587, 67.19480682, 61.21037107, 87.9493728 ,
       59.76167757, 87.9242189 , 91.11672076, 51.981465  , 83.97286576,
       92.99512162, 44.2265744 , 57.97309623, 89.8580269 , 93.93278779,
       58.9790766 , 92.10213846, 83.92831871, 64.01048318, 50.85907853,
       93.03022066, 79.35509757, 80.97385883, 72.26475898, 78.0822317 ,
       73.74605417, 26.94997048, 91.93353737, 46.11073777, 71.86943063,
       78.22666513, 95.97811062, 94.90309836, 76.79483994, 49.04234743,
       87.10113854, 90.00164369, 88.15604432, 88.06107202, 64.86758165,
       42.0662194 , 73.86206285, 87.06076311, 40.77837315, 76.93677631,
       91.80841172, 68.86009114, 93.99977552, 49.01611104, 85.1215977 ,
       79.20236795, 72.81006079, 42.88133804, 55.07471142, 73.08367579,
       51.10101262, 79.26235085, 51.93996986, 63.1400842 , 60.16031868,
       67.74505892, 69.92474149, 39.10249238, 59.19318532, 79.00162184,
       27.13287068, 57.14727171, 78.13131855, 80.04141944, 19.21578015,
       69.08228133, 68.34019354, 39.23243336, 59.69048347, 66.68312364,
       67.90455008, 86.87414348, 80.96263263, 62.01895029, 72.84596009,
       22.9688195 , 69.07247695, 67.07765118, 46.8756752 , 45.26635382,
       89.94753457, 82.01807007, 86.98465799, 70.20029179, 77.11751998,
       66.85572827, 65.88490085, 83.94892282, 64.05271048, 54.31302531,
       46.21858535, 74.71421415, 84.18057805, 35.93101794, 71.74544023,
       41.87496037, 51.97305771, 47.95325267, 54.99051089, 88.93282021,
       63.9305233 , 79.93395234, 27.90266723, 51.99225237, 80.96691129,
       62.96957561, 74.2341931 , 68.04553945, 53.25614425, 66.06230994,
       36.07177583, 79.73194835, 74.81165201, 77.92313651, 34.03449993,
       67.979729  , 71.21728118, 77.76422978, 83.48678921, 92.88231457,
       68.07190528, 63.0157021 , 78.64975048, 51.06214065, 68.08719803,
       65.07261616, 87.04733437, 66.02343628, 52.0736837 , 63.22318294,
       88.04851864, 82.82539568, 84.97439652, 39.24799156, 78.25738675,
       79.93756933, 64.4750149 , 71.56468737, 76.18401232, 84.06330088,
       51.02264414, 85.98802018, 64.92800866, 66.16320124, 79.94939849,
       56.07374628, 66.98345294, 23.02540478, 49.17449175, 88.88588133,
       68.9329792 , 92.03345878, 63.07777892, 82.73557105, 61.78437332,
       81.8909867 , 34.21616731, 72.87348414, 58.98687689, 78.8140383 ,
       59.9574234 , 44.19210735, 52.71369582, 75.20218936, 70.59615384,
       63.54886587, 60.49279846, 61.78645898, 84.87971032, 94.81801802,
       89.90842136, 55.66192951, 93.90927911, 98.9415322 , 51.01506961,
       58.55722323, 87.77450912, 20.9321725 , 64.89912387, 97.0158939 ,
       67.06399678, 41.91876756, 92.91632536, 54.03711532, 56.10247109,
       55.84819722, 67.98653048, 95.00209989, 54.94376476, 69.02145146,
       65.17895584, 59.02771118, 55.92396986, 72.8440164 , 28.29625663,
       75.01157336, 56.0700562 , 64.97176071, 74.08213306, 74.93307889,
       79.9003625 , 81.82188841]), 'preds_test': array([66.503057  , 38.21021974, 89.04970956, 87.89189709, 36.73554665,
       45.73843308, 87.79291212, 82.13464953, 60.57314255, 91.22929864,
       62.23036581, 71.58664491, 66.17112665, 62.51658214, 81.60921548,
       38.83018398, 91.30064235, 24.90175483, 69.16155336, 87.4365223 ,
       71.56622022, 63.57230002, 93.97558163, 49.45397887, 68.85601209,
       68.60673528, 63.94743518, 68.42632232, 65.30704897, 66.74142159,
       68.75949485, 74.90532442, 73.25421167, 89.6482385 , 82.66649342,
       78.86658868, 56.09338908, 61.2786305 , 60.68340277, 71.36372731,
       90.85782508, 52.24020316, 83.95183498, 62.00353481, 89.95327108,
       86.00387125, 71.50207355, 76.51105405, 67.41310326, 58.59170399,
       76.96828297, 62.60133656, 46.93230456, 87.0082761 , 68.74473539,
       88.07943744, 83.14111532, 87.20969454, 53.47940315]), 'MSE_train': np.float64(0.03170555378857528), 'MSE_test': np.float64(78.66069437097792), 'R2_train': 0.999904381397821, 'R2_test': 0.7376464483927592, 'MAE_train': np.float64(0.13311180849830512), 'MAE_test': np.float64(6.141062895444746)}}
In [10]:
data2 = data.drop(['Short description', 'Manner of death', 'Gender', 'Country', 'Occupation'], axis=1)
In [11]:
# Создание возрастных групп
bins = [0, 18, 30, 50, 70, 100]
labels = ['0-18', '19-30', '31-50', '51-70', '71+']
data['Age Group'] = pd.cut(data['Age of death'], bins=bins, labels=labels)
data2['Age Group'] = pd.cut(data2['Age of death'], bins=bins, labels=labels)

# Выбор признаков и целевой переменной для классификации
X_class = data2.drop(['Id', 'Name', 'Age of death', 'Age Group'], axis=1)
y_class = data['Age Group']  
print(X_class.columns)
# Разделение данных
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

# Выбор моделей для классификации
models_class = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=5000, solver='liblinear'),
    'Random Forest Classifier': RandomForestClassifier(random_state=42),
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42)
}

# Создание конвейера для классификации
pipelines_class = {}
for name, model in models_class.items():
    pipelines_class[name] = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

# Определение сетки гиперпараметров для классификации

param_grids_class = {
    'Logistic Regression': {
        'model__C': [0.1, 1, 10],
        'model__solver': ['lbfgs', 'liblinear']
    },
    'Random Forest Classifier': {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [None, 10, 20, 30]
    },
    'Gradient Boosting Classifier': {
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    }
}
# Убрал определение параметров поскольку уже был предподсчет данных, но вылетела ошибка. Сохранил лучшие параметры

param_grids_class = {
    'Logistic Regression': {
        'model__C': [10],
        'model__solver': ['lbfgs']
    },
    'Random Forest Classifier': {
        'model__n_estimators': [200],
        'model__max_depth': [ 30]
    },
    'Gradient Boosting Classifier': {
        'model__n_estimators': [200],
        'model__learning_rate': [0.1],
        'model__max_depth': [7]
    }
}

# Настройка гиперпараметров для классификации
best_models_class = {}
for name, pipeline in pipelines_class.items():
    grid_search = GridSearchCV(pipeline, param_grids_class[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_class, y_train_class)
    best_models_class[name] = {"model": grid_search.best_estimator_}
    print(f'Best parameters for {name}: {grid_search.best_params_}')

# Обучение моделей и оценка качества
for model_name in best_models_class.keys():
    print(f"Model: {model_name}")
    model = best_models_class[model_name]["model"]

    model_pipeline = Pipeline([("scaler", StandardScaler()), ("model", model)])
    model_pipeline = model_pipeline.fit(X_train_class, y_train_class)

    y_train_predict = model_pipeline.predict(X_train_class)
    y_test_probs = model_pipeline.predict_proba(X_test_class)
    y_test_predict = model_pipeline.predict(X_test_class)

    best_models_class[model_name]["pipeline"] = model_pipeline
    best_models_class[model_name]["probs"] = y_test_probs
    best_models_class[model_name]["preds"] = y_test_predict

    best_models_class[model_name]["Precision_train"] = precision_score(y_train_class, y_train_predict, average='weighted')
    best_models_class[model_name]["Precision_test"] = precision_score(y_test_class, y_test_predict, average='weighted')
    best_models_class[model_name]["Recall_train"] = recall_score(y_train_class, y_train_predict, average='weighted')
    best_models_class[model_name]["Recall_test"] = recall_score(y_test_class, y_test_predict, average='weighted')
    best_models_class[model_name]["Accuracy_train"] = accuracy_score(y_train_class, y_train_predict)
    best_models_class[model_name]["Accuracy_test"] = accuracy_score(y_test_class, y_test_predict)
    best_models_class[model_name]["ROC_AUC_test"] = roc_auc_score(y_test_class, y_test_probs, multi_class='ovr')
    best_models_class[model_name]["F1_train"] = f1_score(y_train_class, y_train_predict, average='weighted')
    best_models_class[model_name]["F1_test"] = f1_score(y_test_class, y_test_predict, average='weighted')
    best_models_class[model_name]["MCC_test"] = matthews_corrcoef(y_test_class, y_test_predict)
    best_models_class[model_name]["Cohen_kappa_test"] = cohen_kappa_score(y_test_class, y_test_predict)
    best_models_class[model_name]["Confusion_matrix"] = confusion_matrix(y_test_class, y_test_predict)
Index(['Birth year', 'Death year'], dtype='object')
Best parameters for Logistic Regression: {'model__C': 10, 'model__solver': 'lbfgs'}
Best parameters for Random Forest Classifier: {'model__max_depth': 30, 'model__n_estimators': 200}
Best parameters for Gradient Boosting Classifier: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200}
Model: Logistic Regression
Model: Random Forest Classifier
c:\Users\alexk\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
c:\Users\alexk\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Model: Gradient Boosting Classifier
In [12]:
num_models = len(best_models_class)
fig, ax = plt.subplots(num_models, 1, figsize=(12, 10), sharex=False, sharey=False)

for index, key in enumerate(best_models_class.keys()):
    c_matrix = best_models_class[key]["Confusion_matrix"]
    
    # Получаем метки классов из матрицы ошибок
    num_classes = c_matrix.shape[0]
    actual_labels = ["0-18", "19-30", "31-50", "51-70", "71+"][:num_classes]
    
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=actual_labels
    ).plot(ax=ax.flat[index])
    disp.ax_.set_title(key)

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)
plt.show()
No description has been provided for this image
In [13]:
_, ax = plt.subplots(3, 2, figsize=(12, 10), sharex=False, sharey=False)
ax = ax.flatten()

for index, (name, model) in enumerate(best_models_reg.items()):
    y_pred_reg = model.predict(X_test_reg)

    # График фактических значений против предсказанных значений
    ax[index * 2].scatter(y_test_reg, y_pred_reg, alpha=0.5)
    ax[index * 2].plot([min(y_test_reg), max(y_test_reg)], [min(y_test_reg), max(y_test_reg)], color='red', linestyle='--')
    ax[index * 2].set_xlabel('Actual Values')
    ax[index * 2].set_ylabel('Predicted Values')
    ax[index * 2].set_title(f'{name}: Actual vs Predicted')

    # График остатков
    residuals = y_test_reg - y_pred_reg
    ax[index * 2 + 1].scatter(y_pred_reg, residuals, alpha=0.5)
    ax[index * 2 + 1].axhline(y=0, color='red', linestyle='--')
    ax[index * 2 + 1].set_xlabel('Predicted Values')
    ax[index * 2 + 1].set_ylabel('Residuals')
    ax[index * 2 + 1].set_title(f'{name}: Residuals vs Predicted')


plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)
plt.show()
No description has been provided for this image