490 KiB
Raw Blame History

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("C://Users//annal//aim//static//csv//Forbes_Billionaires.csv")
print(df.columns)
Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')

Определим бизнес цели:

1- Прогнозирование возраста миллиардера(классификация)

2- Прогнозирование состояния миллиардера(регрессия)

Подготовим данные: категоризируем колонку age

In [2]:
print(df.isnull().sum())

print()

# Есть ли пустые значения признаков
print(df.isnull().any())

print()

# Процент пустых значений признаков
for i in df.columns:
    null_rate = df[i].isnull().sum() / len(df) * 100
    if null_rate > 0:
        print(f"{i} процент пустых значений: %{null_rate:.2f}")
Rank        0
Name        0
Networth    0
Age         0
Country     0
Source      0
Industry    0
dtype: int64

Rank        False
Name        False
Networth    False
Age         False
Country     False
Source      False
Industry    False
dtype: bool

In [2]:
bins = [0, 30, 40, 50, 60, 70, 80, 101]  # границы для возрастных категорий
labels = ['Under 30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+']  # метки для категорий

df["Age_category"] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)
# Удаляем оригинальные колонки 'country', 'industry' и 'source' из исходного DataFrame
df.drop(columns=['Age'], inplace=True)

# Просмотр результата
print(df.head())
   Rank                        Name  Networth        Country  \
0      1                 Elon Musk      219.0  United States   
1      2                Jeff Bezos      171.0  United States   
2      3  Bernard Arnault & family      158.0         France   
3      4                Bill Gates      129.0  United States   
4      5            Warren Buffett      118.0  United States   

               Source                Industry Age_category  
0       Tesla, SpaceX             Automotive         50-60  
1              Amazon             Technology         50-60  
2                LVMH       Fashion & Retail         70-80  
3           Microsoft             Technology         60-70  
4  Berkshire Hathaway  Finance & Investments           80+  
In [27]:
from utils import split_stratified_into_train_val_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="Age_category", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=9
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
'X_train'
Rank Name Networth Country Source Industry Age_category
1909 1818 Tran Ba Duong & family 1.6 Vietnam automotive Automotive 60-70
2099 2076 Mark Dixon 1.4 United Kingdom office real estate Real Estate 60-70
1392 1341 Yingzhuo Xu 2.3 China agribusiness Food & Beverage 50-60
627 622 Bruce Flatt 4.6 Canada money management Finance & Investments 50-60
527 523 Li Liangbin 5.2 China lithium Manufacturing 50-60
... ... ... ... ... ... ... ...
84 85 Theo Albrecht, Jr. & family 18.7 Germany Aldi, Trader Joe's Fashion & Retail 70-80
633 622 Tony Tamer 4.6 United States private equity Finance & Investments 60-70
922 913 Bob Gaglardi 3.3 Canada hotels Real Estate 80+
2178 2076 Eugene Wu 1.4 Taiwan finance Finance & Investments 70-80
415 411 Leonard Stern 6.2 United States real estate Real Estate 80+

2080 rows × 7 columns

'y_train'
Age_category
1909 60-70
2099 60-70
1392 50-60
627 50-60
527 50-60
... ...
84 70-80
633 60-70
922 80+
2178 70-80
415 80+

2080 rows × 1 columns

'X_test'
Rank Name Networth Country Source Industry Age_category
2075 2076 Radhe Shyam Agarwal 1.4 India consumer goods Fashion & Retail 70-80
1529 1513 Robert Duggan 2.0 United States pharmaceuticals Healthcare 70-80
1803 1729 Yao Kuizhang 1.7 China beverages Food & Beverage 50-60
425 424 Alexei Kuzmichev 6.0 Russia oil, banking, telecom Energy 50-60
2597 2578 Ramesh Genomal 1.0 Philippines apparel Fashion & Retail 70-80
... ... ... ... ... ... ... ...
935 913 Alfred Oetker 3.3 Germany consumer goods Fashion & Retail 50-60
1541 1513 Thomas Lee 2.0 United States private equity Finance & Investments 70-80
1646 1645 Roberto Angelini Rossi 1.8 Chile forestry, mining diversified 70-80
376 375 Patrick Drahi 6.6 France telecom Telecom 50-60
1894 1818 Gerald Schwartz 1.6 Canada finance Finance & Investments 80+

520 rows × 7 columns

'y_test'
Age_category
2075 70-80
1529 70-80
1803 50-60
425 50-60
2597 70-80
... ...
935 50-60
1541 70-80
1646 70-80
376 50-60
1894 80+

520 rows × 1 columns

Формирование конвейера для классификации данных

preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация

preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование

features_preprocessing -- трансформер для предобработки признаков

features_engineering -- трансформер для конструирования признаков

drop_columns -- трансформер для удаления колонок

pipeline_end -- основной конвейер предобработки данных и конструирования признаков

In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd

# Исправляем ColumnTransformer с сохранением имен колонок
columns_to_drop = ["Age_category", "Rank ", "Name"]

num_columns = [
    column
    for column in X_train.columns
    if column not in columns_to_drop and X_train[column].dtype != "object"
]
cat_columns = [
    column
    for column in X_train.columns
    if column not in columns_to_drop and X_train[column].dtype == "object"
]

# Предобработка числовых данных
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

# Предобработка категориальных данных
cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

# Общая предобработка признаков
features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=True,  # Сохраняем имена колонок
    transformers=[
        ("prepocessing_num", preprocessing_num, num_columns),
        ("prepocessing_cat", preprocessing_cat, cat_columns),
    ],
    remainder="drop"  # Убираем неиспользуемые столбцы
)

# Итоговый конвейер
pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
    ]
)

# Преобразуем данные
preprocessing_result = pipeline_end.fit_transform(X_train)

# Создаем DataFrame с правильными именами колонок
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
    index=X_train.index,  # Сохраняем индексы
)

preprocessed_df
Out[37]:
prepocessing_num__Networth prepocessing_cat__Country_Argentina prepocessing_cat__Country_Australia prepocessing_cat__Country_Austria prepocessing_cat__Country_Barbados prepocessing_cat__Country_Belgium prepocessing_cat__Country_Belize prepocessing_cat__Country_Brazil prepocessing_cat__Country_Bulgaria prepocessing_cat__Country_Canada ... prepocessing_cat__Industry_Logistics prepocessing_cat__Industry_Manufacturing prepocessing_cat__Industry_Media & Entertainment prepocessing_cat__Industry_Metals & Mining prepocessing_cat__Industry_Real Estate prepocessing_cat__Industry_Service prepocessing_cat__Industry_Sports prepocessing_cat__Industry_Technology prepocessing_cat__Industry_Telecom prepocessing_cat__Industry_diversified
1909 -0.309917 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2099 -0.329245 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
1392 -0.242268 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
627 -0.019995 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
527 0.037990 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
84 1.342637 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
633 -0.019995 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
922 -0.145628 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
2178 -0.329245 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
415 0.134630 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0

2080 rows × 860 columns

Формирование набора моделей для классификации

logistic -- логистическая регрессия

ridge -- гребневая регрессия

decision_tree -- дерево решений

knn -- k-ближайших соседей

naive_bayes -- наивный Байесовский классификатор

gradient_boosting -- метод градиентного бустинга (набор деревьев решений)

random_forest -- метод случайного леса (набор деревьев решений)

mlp -- многослойный персептрон (нейронная сеть)

In [38]:
from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree

class_models = {
    "logistic": {"model": linear_model.LogisticRegression()},
    # "ridge": {"model": linear_model.RidgeClassifierCV(cv=5, class_weight="balanced")},
    "ridge": {"model": linear_model.LogisticRegression(penalty="l2", class_weight="balanced")},
    "decision_tree": {
        "model": tree.DecisionTreeClassifier(max_depth=7, random_state=9)
    },
    "knn": {"model": neighbors.KNeighborsClassifier(n_neighbors=7)},
    "naive_bayes": {"model": naive_bayes.GaussianNB()},
    "gradient_boosting": {
        "model": ensemble.GradientBoostingClassifier(n_estimators=210)
    },
    "random_forest": {
        "model": ensemble.RandomForestClassifier(
            max_depth=11, class_weight="balanced", random_state=9
        )
    },
    "mlp": {
        "model": neural_network.MLPClassifier(
            hidden_layer_sizes=(7,),
            max_iter=500,
            early_stopping=True,
            random_state=9,
        )
    },
}

Обучение моделей на обучающем наборе данных и оценка на тестовом

In [43]:
y_train['Age_category'] = y_train['Age_category'].cat.codes
y_test['Age_category'] = y_test['Age_category'].cat.codes
In [44]:
import numpy as np
from sklearn import metrics

for model_name in class_models.keys():
    print(f"Model: {model_name}")
    model = class_models[model_name]["model"]

    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
    model_pipeline = model_pipeline.fit(X_train, y_train)

    y_train_predict = model_pipeline.predict(X_train)
    y_test_probs = model_pipeline.predict_proba(X_test)
    y_test_predict = np.argmax(y_test_probs, axis=1)

    class_models[model_name]["pipeline"] = model_pipeline
    class_models[model_name]["probs"] = y_test_probs
    class_models[model_name]["preds"] = y_test_predict

    # Метрики
    class_models[model_name]["Precision_train"] = metrics.precision_score(
        y_train, y_train_predict, average="macro"
    )
    class_models[model_name]["Precision_test"] = metrics.precision_score(
        y_test, y_test_predict, average="macro"
    )
    class_models[model_name]["Recall_train"] = metrics.recall_score(
        y_train, y_train_predict, average="macro"
    )
    class_models[model_name]["Recall_test"] = metrics.recall_score(
        y_test, y_test_predict, average="macro"
    )
    class_models[model_name]["Accuracy_train"] = metrics.accuracy_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Accuracy_test"] = metrics.accuracy_score(
        y_test, y_test_predict
    )
    class_models[model_name]["ROC_AUC_test"] = metrics.roc_auc_score(
        y_test, y_test_probs, multi_class="ovr"
    )
    class_models[model_name]["F1_train"] = metrics.f1_score(
        y_train, y_train_predict, average="macro"
    )
    class_models[model_name]["F1_test"] = metrics.f1_score(
        y_test, y_test_predict, average="macro"
    )
    class_models[model_name]["MCC_test"] = metrics.matthews_corrcoef(
        y_test, y_test_predict
    )
    class_models[model_name]["Cohen_kappa_test"] = metrics.cohen_kappa_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Confusion_matrix"] = metrics.confusion_matrix(
        y_test, y_test_predict
    )
Model: logistic
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\utils\validation.py:1339: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\utils\validation.py:1339: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Model: ridge
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\neighbors\_classification.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  return self._fit(X, y)
Model: decision_tree
Model: knn
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\utils\validation.py:1339: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Model: naive_bayes
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Model: gradient_boosting
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  return fit_method(estimator, *args, **kwargs)
Model: random_forest
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:1105: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Model: mlp
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Сводная таблица оценок качества для использованных моделей классификации

In [49]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(17, 17), sharex=False, sharey=False)
for index, key in enumerate(class_models.keys()):
    c_matrix = class_models[key]["Confusion_matrix"]
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=["Under 30", "30-40", "40-50", "50-60", "60-70", "70-80", "80+"]
    ).plot(ax=ax.flat[index])
    disp.ax_.set_title(key)

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)
plt.show()
No description has been provided for this image

Точность, полнота, верность (аккуратность), F-мера

In [52]:
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
]
class_metrics.sort_values(
    by="Accuracy_test", ascending=False
).style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
    ],
)
Out[52]:
  Precision_train Precision_test Recall_train Recall_test Accuracy_train Accuracy_test F1_train F1_test
logistic 0.567471 0.278074 0.444883 0.232769 0.618269 0.353846 0.465219 0.237759
gradient_boosting 0.836061 0.287405 0.725411 0.235795 0.689904 0.344231 0.760847 0.240251
knn 0.477783 0.221788 0.460090 0.214239 0.497115 0.328846 0.456182 0.211556
decision_tree 0.618281 0.163157 0.244223 0.184231 0.387981 0.325000 0.227570 0.146479
random_forest 0.581578 0.236539 0.735419 0.246556 0.627404 0.288462 0.599765 0.231541
ridge 0.518033 0.238462 0.695673 0.247678 0.556250 0.284615 0.553233 0.226955
mlp 0.035714 0.035714 0.142857 0.142857 0.250000 0.250000 0.057143 0.057143
naive_bayes 0.524162 0.239277 0.664585 0.202308 0.494231 0.176923 0.465319 0.151713

значения далеки от идела, датасет так себе...

In [51]:
pip install Jinja2
Collecting Jinja2
  Downloading jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting MarkupSafe>=2.0 (from Jinja2)
  Downloading MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Downloading jinja2-3.1.4-py3-none-any.whl (133 kB)
Downloading MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl (15 kB)
Installing collected packages: MarkupSafe, Jinja2
Successfully installed Jinja2-3.1.4 MarkupSafe-3.0.2
Note: you may need to restart the kernel to use updated packages.
[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip

ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса

In [53]:
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
]
class_metrics.sort_values(by="ROC_AUC_test", ascending=False).style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=[
        "ROC_AUC_test",
        "MCC_test",
        "Cohen_kappa_test",
    ],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Accuracy_test",
        "F1_test",
    ],
)
Out[53]:
  Accuracy_test F1_test ROC_AUC_test Cohen_kappa_test MCC_test
gradient_boosting 0.344231 0.240251 0.649816 0.131708 0.138628
logistic 0.353846 0.237759 0.615478 0.160238 0.161282
ridge 0.284615 0.226955 0.612260 0.129672 0.133551
knn 0.328846 0.211556 0.602333 0.128794 0.130205
random_forest 0.288462 0.231541 0.599541 0.126828 0.129917
decision_tree 0.325000 0.146479 0.581718 0.078698 0.098279
naive_bayes 0.176923 0.151713 0.562024 0.071080 0.079232
mlp 0.250000 0.057143 0.554978 0.000000 0.000000
In [54]:
best_model = str(class_metrics.sort_values(by="MCC_test", ascending=False).iloc[0].name)

display(best_model)
'logistic'

Вывод данных с ошибкой предсказания для оценки

In [56]:
preprocessing_result = pipeline_end.transform(X_test)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

y_pred = class_models[best_model]["preds"]

error_index = y_test[y_test["Age_category"] != y_pred].index.tolist()
display(f"Error items count: {len(error_index)}")

error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]
error_df = X_test.loc[error_index].copy()
error_df.insert(loc=1, column="Predicted", value=error_predicted)
error_df.sort_index()
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
'Error items count: 336'
Out[56]:
Rank Predicted Name Networth Country Source Industry Age_category
6 7 4 Sergey Brin 107.0 United States Google Technology 40-50
8 9 3 Steve Ballmer 91.4 United States Microsoft Technology 60-70
12 13 3 Carlos Slim Helu & family 81.2 Mexico telecom Telecom 80+
14 15 3 Mark Zuckerberg 67.3 United States Facebook Technology 30-40
22 23 5 Amancio Ortega 59.6 Spain Zara Fashion & Retail 80+
... ... ... ... ... ... ... ... ...
2586 2578 3 Roy Chi Ping Chung 1.0 Hong Kong manufacturing Manufacturing 60-70
2588 2578 3 Ronald Clarke 1.0 United States payments technology Technology 60-70
2591 2578 5 Sefik Yilmaz Dizdar 1.0 Turkey fashion retail Fashion & Retail 80+
2593 2578 6 Larry Fink 1.0 United States money management Finance & Investments 60-70
2596 2578 5 Nari Genomal 1.0 Philippines apparel Fashion & Retail 80+

336 rows × 8 columns

Многовато...

Пример использования обученной модели (конвейера) для предсказания

In [57]:
model = class_models[best_model]["pipeline"]

example_id = 450
test = pd.DataFrame(X_test.loc[example_id, :]).T
test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T
display(test)
display(test_preprocessed)
result_proba = model.predict_proba(test)[0]
result = model.predict(test)[0]
real = int(y_test.loc[example_id].values[0])
display(f"predicted: {result} (proba: {result_proba})")
display(f"real: {real}")
Rank Name Networth Country Source Industry Age_category
450 438 Ruan Liping 5.8 Hong Kong power strips Manufacturing 50-60
prepocessing_num__Networth prepocessing_cat__Country_Argentina prepocessing_cat__Country_Australia prepocessing_cat__Country_Austria prepocessing_cat__Country_Barbados prepocessing_cat__Country_Belgium prepocessing_cat__Country_Belize prepocessing_cat__Country_Brazil prepocessing_cat__Country_Bulgaria prepocessing_cat__Country_Canada ... prepocessing_cat__Industry_Logistics prepocessing_cat__Industry_Manufacturing prepocessing_cat__Industry_Media & Entertainment prepocessing_cat__Industry_Metals & Mining prepocessing_cat__Industry_Real Estate prepocessing_cat__Industry_Service prepocessing_cat__Industry_Sports prepocessing_cat__Industry_Technology prepocessing_cat__Industry_Telecom prepocessing_cat__Industry_diversified
450 0.289255 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1 rows × 860 columns

c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
'predicted: 3 (proba: [0.00172036 0.04303104 0.02714323 0.36848158 0.19524859 0.2037863\n 0.1605889 ])'
'real: 3'

Подбор гиперпараметров методом поиска по сетке

In [60]:
from sklearn.model_selection import GridSearchCV

optimized_model_type = "random_forest"

random_forest_model = class_models[optimized_model_type]["pipeline"]

param_grid = {
    "model__n_estimators": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],
    "model__max_features": ["sqrt", "log2", 2],
    "model__max_depth": [2, 3, 4, 5, 6, 7, 8, 9 ,10],
    "model__criterion": ["gini", "entropy", "log_loss"],
}

gs_optomizer = GridSearchCV(
    estimator=random_forest_model, param_grid=param_grid, n_jobs=-1
)
gs_optomizer.fit(X_train, y_train.values.ravel())
gs_optomizer.best_params_
c:\Users\annal\aim\.venv\Lib\site-packages\numpy\ma\core.py:2881: RuntimeWarning: invalid value encountered in cast
  _data = np.array(data, dtype=dtype, copy=copy,
Out[60]:
{'model__criterion': 'gini',
 'model__max_depth': 10,
 'model__max_features': 2,
 'model__n_estimators': 250}

Обучение модели с новыми гиперпараметрами

In [69]:
optimized_model = ensemble.RandomForestClassifier(
    random_state=9,
    criterion="gini",
    max_depth=10,
    max_features=2,
    n_estimators=250,
)

result = {}

result["pipeline"] = Pipeline([("pipeline", pipeline_end), ("model", optimized_model)]).fit(X_train, y_train.values.ravel())
result["train_preds"] = result["pipeline"].predict(X_train)
result["probs"] = result["pipeline"].predict_proba(X_test)
result["preds"] = np.argmax(y_test_probs, axis=1)

result["Precision_train"] = metrics.precision_score(y_train, result["train_preds"],average="macro")
result["Precision_test"] = metrics.precision_score(y_test, result["preds"], average="macro")
result["Recall_train"] = metrics.recall_score(y_train, result["train_preds"], average="macro")
result["Recall_test"] = metrics.recall_score(y_test, result["preds"], average="macro")
result["Accuracy_train"] = metrics.accuracy_score(y_train, result["train_preds"])
result["Accuracy_test"] = metrics.accuracy_score(y_test, result["preds"])
result["ROC_AUC_test"] = metrics.roc_auc_score(y_test, result["probs"], multi_class="ovr")
result["F1_train"] = metrics.f1_score(y_train, result["train_preds"], average="macro")
result["F1_test"] = metrics.f1_score(y_test, result["preds"], average="macro")
result["MCC_test"] = metrics.matthews_corrcoef(y_test, result["preds"])
result["Cohen_kappa_test"] = metrics.cohen_kappa_score(y_test, result["preds"])
result["Confusion_matrix"] = metrics.confusion_matrix(y_test, result["preds"])
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0, 1] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Формирование данных для оценки старой и новой версии модели

In [70]:
optimized_metrics = pd.DataFrame(columns=list(result.keys()))
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=class_models[optimized_model_type]
)
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=result
)
optimized_metrics.insert(loc=0, column="Name", value=["Old", "New"])
optimized_metrics = optimized_metrics.set_index("Name")

Оценка параметров старой и новой модели

In [71]:
optimized_metrics[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
].style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
    ],
)
Out[71]:
  Precision_train Precision_test Recall_train Recall_test Accuracy_train Accuracy_test F1_train F1_test
Name                
Old 0.581578 0.236539 0.735419 0.246556 0.627404 0.288462 0.599765 0.231541
New 0.181388 0.035714 0.157692 0.142857 0.306250 0.250000 0.090702 0.057143
In [72]:
optimized_metrics[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
].style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=[
        "ROC_AUC_test",
        "MCC_test",
        "Cohen_kappa_test",
    ],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Accuracy_test",
        "F1_test",
    ],
)
Out[72]:
  Accuracy_test F1_test ROC_AUC_test Cohen_kappa_test MCC_test
Name          
Old 0.288462 0.231541 0.599541 0.126828 0.129917
New 0.250000 0.057143 0.605446 0.000000 0.000000
In [74]:
_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False
)

for index in range(0, len(optimized_metrics)):
    c_matrix = optimized_metrics.iloc[index]["Confusion_matrix"]
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=["Under 30", "30-40", "40-50", "50-60", "60-70", "70-80", "80+"]
    ).plot(ax=ax.flat[index])

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)
plt.show()
No description has been provided for this image

Задача регрессии

In [3]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['Networth','Rank ', 'Name'])  # Признаки
y = df['Networth']  # Целевая переменная для регрессии

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd

# Исправляем ColumnTransformer с сохранением имен колонок
columns_to_drop = []

num_columns = [
    column
    for column in X_train.columns
    if column not in columns_to_drop and X_train[column].dtype != "object"
]
cat_columns = [
    column
    for column in X_train.columns
    if column not in columns_to_drop and X_train[column].dtype == "object"
]

# Предобработка числовых данных
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

# Предобработка категориальных данных
cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

# Общая предобработка признаков
features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=True,  # Сохраняем имена колонок
    transformers=[
        ("prepocessing_num", preprocessing_num, num_columns),
        ("prepocessing_cat", preprocessing_cat, cat_columns),
    ],
    remainder="drop"  # Убираем неиспользуемые столбцы
)

# Итоговый конвейер
pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
    ]
)

# Преобразуем данные
preprocessing_result = pipeline_end.fit_transform(X_train)

# Создаем DataFrame с правильными именами колонок
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
    index=X_train.index,  # Сохраняем индексы
)

preprocessed_df
Out[4]:
prepocessing_num__Age prepocessing_cat__Country_Argentina prepocessing_cat__Country_Australia prepocessing_cat__Country_Austria prepocessing_cat__Country_Barbados prepocessing_cat__Country_Belgium prepocessing_cat__Country_Belize prepocessing_cat__Country_Brazil prepocessing_cat__Country_Bulgaria prepocessing_cat__Country_Canada ... prepocessing_cat__Industry_Logistics prepocessing_cat__Industry_Manufacturing prepocessing_cat__Industry_Media & Entertainment prepocessing_cat__Industry_Metals & Mining prepocessing_cat__Industry_Real Estate prepocessing_cat__Industry_Service prepocessing_cat__Industry_Sports prepocessing_cat__Industry_Technology prepocessing_cat__Industry_Telecom prepocessing_cat__Industry_diversified
582 -0.109934 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
48 1.079079 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1772 1.004766 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
964 -0.407187 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2213 1.302019 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1638 1.227706 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1095 0.856139 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1130 0.781826 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
1294 0.335946 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
860 0.558886 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

2080 rows × 855 columns

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score


# Модели и параметры
models_classification = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForestClassifier": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

param_grids_classification = {
    "LogisticRegression": {
        'model__C': [0.1, 1, 10]
    },
    "RandomForestClassifier": {
        "model__n_estimators": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],
        "model__max_features": ["sqrt", "log2", 2],
        "model__max_depth": [2, 3, 4, 5, 6, 7, 8, 9 ,10, 20],
        "model__criterion": ["gini", "entropy", "log_loss"],
    },
    "KNN": {
        'model__n_neighbors': [3, 5, 7, 9, 11],
        'model__weights': ['uniform', 'distance']
    }
}

# Результаты
results_classification = {}

# Перебор моделей
for name, model in models_classification.items():
    print(f"Training {name}...")
    pipeline = Pipeline(steps=[
        ('features_preprocessing', features_preprocessing),
        ('model', model)
    ])
    
    param_grid = param_grids_classification[name]
    grid_search = RandomizedSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Лучшая модель
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Метрики
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Вычисление матрицы ошибок
    c_matrix = confusion_matrix(y_test, y_pred)

    # Сохранение результатов
    results_classification[name] = {
        "Best Params": grid_search.best_params_,
        "Accuracy": acc,
        "F1 Score": f1,
        "Confusion_matrix": c_matrix
    }

# Печать результатов
for name, metrics in results_classification.items():
    print(f"\nModel: {name}")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
Training LogisticRegression...
c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\model_selection\_search.py:320: UserWarning: The total space of parameters 3 is smaller than n_iter=10. Running 3 iterations. For exhaustive searches, use GridSearchCV.
  warnings.warn(
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[7], line 44
     42 param_grid = param_grids_classification[name]
     43 grid_search = RandomizedSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
---> 44 grid_search.fit(X_train, y_train)
     46 # Лучшая модель
     47 best_model = grid_search.best_estimator_

File c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\model_selection\_search.py:1019, in BaseSearchCV.fit(self, X, y, **params)
   1013     results = self._format_results(
   1014         all_candidate_params, n_splits, all_out, all_more_results
   1015     )
   1017     return results
-> 1019 self._run_search(evaluate_candidates)
   1021 # multimetric is determined here because in the case of a callable
   1022 # self.scoring the return type is only known after calling
   1023 first_test_score = all_out[0]["test_scores"]

File c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\model_selection\_search.py:1960, in RandomizedSearchCV._run_search(self, evaluate_candidates)
   1958 def _run_search(self, evaluate_candidates):
   1959     """Search n_iter candidates from param_distributions"""
-> 1960     evaluate_candidates(
   1961         ParameterSampler(
   1962             self.param_distributions, self.n_iter, random_state=self.random_state
   1963         )
   1964     )

File c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\model_selection\_search.py:996, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
    989 elif len(out) != n_candidates * n_splits:
    990     raise ValueError(
    991         "cv.split and cv.get_n_splits returned "
    992         "inconsistent results. Expected {} "
    993         "splits, got {}".format(n_splits, len(out) // n_candidates)
    994     )
--> 996 _warn_or_raise_about_fit_failures(out, self.error_score)
    998 # For callable self.scoring, the return type is only know after
    999 # calling. If the return type is a dictionary, the error scores
   1000 # can now be inserted with the correct key. The type checking
   1001 # of out will be done in `_insert_error_scores`.
   1002 if callable(self.scoring):

File c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\model_selection\_validation.py:529, in _warn_or_raise_about_fit_failures(results, error_score)
    522 if num_failed_fits == num_fits:
    523     all_fits_failed_message = (
    524         f"\nAll the {num_fits} fits failed.\n"
    525         "It is very likely that your model is misconfigured.\n"
    526         "You can try to debug the error by setting error_score='raise'.\n\n"
    527         f"Below are more details about the failures:\n{fit_errors_summary}"
    528     )
--> 529     raise ValueError(all_fits_failed_message)
    531 else:
    532     some_fits_failed_message = (
    533         f"\n{num_failed_fits} fits failed out of a total of {num_fits}.\n"
    534         "The score on these train-test partitions for these parameters"
   (...)
    538         f"Below are more details about the failures:\n{fit_errors_summary}"
    539     )

ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1231, in fit
    check_classification_targets(y)
  File "c:\Users\annal\aim\.venv\Lib\site-packages\sklearn\utils\multiclass.py", line 219, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.