Mii_Kislitsa_Egor_Pibd_33/lec4.ipynb at 454ecc4e4c9cb8975fc5e6719fdae3125be53bfc

goblinrf 0804611fd3 Никому не советую!!!!

2024-11-16 00:11:46 +04:00

308 KiB

Raw Blame History

Загрузка набора данных¶

In [2]:

import pandas as pd

from sklearn import set_config

set_config(transform_output="pandas")

random_state=9

df = pd.read_csv("data/kc_house_data.csv", index_col="id")

df

Out[2]:

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
id
7129300520	20141013T000000	221900.0	3	1.00	1180	5650	1.0	0	0	3	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	0	0	3	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
5631500400	20150225T000000	180000.0	2	1.00	770	10000	1.0	0	0	3	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	0	0	5	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	0	0	3	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
263000018	20140521T000000	360000.0	3	2.50	1530	1131	3.0	0	0	3	8	1530	0	2009	0	98103	47.6993	-122.346	1530	1509
6600060120	20150223T000000	400000.0	4	2.50	2310	5813	2.0	0	0	3	8	2310	0	2014	0	98146	47.5107	-122.362	1830	7200
1523300141	20140623T000000	402101.0	2	0.75	1020	1350	2.0	0	0	3	7	1020	0	2009	0	98144	47.5944	-122.299	1020	2007
291310100	20150116T000000	400000.0	3	2.50	1600	2388	2.0	0	0	3	8	1600	0	2004	0	98027	47.5345	-122.069	1410	1287
1523300157	20141015T000000	325000.0	2	0.75	1020	1076	2.0	0	0	3	7	1020	0	2008	0	98144	47.5941	-122.299	1020	1357

21613 rows × 20 columns

Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации¶

Целевой признак -- waterfront

In [3]:

from utils import split_stratified_into_train_val_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df,
    stratify_colname="waterfront",
    frac_train=0.80,
    frac_val=0,
    frac_test=0.20,
    random_state=random_state,
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)

'X_train'

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
id
3046200125	20150406T000000	202000.0	2	1.00	740	6550	1.0	0	0	4	5	740	0	1946	0	98168	47.4807	-122.332	1080	8515
1853000030	20150416T000000	775000.0	3	2.50	3550	32807	2.0	0	0	3	9	3550	0	1989	0	98077	47.7292	-122.082	3270	35001
1825079005	20140609T000000	739000.0	4	2.50	2800	246114	2.0	0	0	3	9	2800	0	1999	0	98014	47.6586	-121.962	2750	60351
2523039315	20141022T000000	481000.0	3	2.00	2580	15653	1.5	0	0	3	9	2580	0	1990	0	98166	47.4561	-122.361	1920	9840
6623400246	20140523T000000	200000.0	4	1.00	1350	11507	1.0	0	0	3	7	1350	0	1966	0	98055	47.4269	-122.197	1320	25675
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2523069134	20150406T000000	495000.0	4	2.50	2480	91911	1.0	0	2	4	7	1470	1010	1973	0	98027	47.4579	-121.981	2540	91911
1931300412	20150416T000000	475000.0	3	2.25	1190	1200	3.0	0	0	3	8	1190	0	2008	0	98103	47.6542	-122.346	1180	1224
4331000400	20150220T000000	252000.0	3	1.50	1150	13200	1.0	0	0	3	7	1150	0	1956	0	98166	47.4752	-122.345	1220	13066
9212900180	20140625T000000	760000.0	4	2.50	2760	6000	2.0	0	0	5	7	2230	530	1942	0	98115	47.6877	-122.295	1600	6000
7000100775	20140721T000000	625000.0	3	2.00	1730	12219	1.0	0	0	4	7	1730	0	1986	0	98004	47.5825	-122.189	2470	13594

17290 rows × 20 columns

'y_train'

	waterfront
id
3046200125	0
1853000030	0
1825079005	0
2523039315	0
6623400246	0
...	...
2523069134	0
1931300412	0
4331000400	0
9212900180	0
7000100775	0

17290 rows × 1 columns

'X_test'

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
id
1775950100	20150113T000000	357823.0	3	1.50	1240	9196	1.0	0	0	3	8	1240	0	1968	0	98072	47.7562	-122.094	1690	10800
3550800040	20141114T000000	223000.0	3	1.00	940	7980	1.0	0	0	3	6	940	0	1961	0	98146	47.5107	-122.345	1050	7980
1454600256	20141013T000000	710000.0	5	2.50	2570	9600	1.0	0	2	3	8	1620	950	1956	0	98125	47.7216	-122.282	2680	9900
1467400095	20150224T000000	545000.0	4	1.75	2040	53578	1.0	0	0	5	7	1160	880	1959	0	98038	47.3844	-122.000	2040	53578
624069003	20150102T000000	829000.0	4	2.75	2970	59677	1.0	0	2	4	8	1610	1360	1973	0	98075	47.5953	-122.080	2930	42489
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3500100189	20140630T000000	300000.0	2	1.00	960	8153	1.0	0	0	3	6	960	0	1947	0	98155	47.7341	-122.300	1160	8199
952001495	20150306T000000	588000.0	4	1.75	2170	5750	1.0	0	2	3	7	1370	800	1975	0	98116	47.5668	-122.383	1450	5750
6072300800	20150505T000000	595000.0	4	1.75	2510	8989	1.0	0	0	4	8	1680	830	1964	0	98006	47.5569	-122.172	2510	8931
2944010240	20140908T000000	988000.0	4	3.00	4040	19700	2.0	0	0	3	11	4040	0	1987	0	98052	47.7205	-122.127	3930	21887
7893802670	20150424T000000	279900.0	3	3.25	2240	5000	2.0	0	0	3	9	1540	700	1989	0	98198	47.4114	-122.334	1800	7500

4323 rows × 20 columns

'y_test'

	waterfront
id
1775950100	0
3550800040	0
1454600256	0
1467400095	0
624069003	0
...	...
3500100189	0
952001495	0
6072300800	0
2944010240	0
7893802670	0

4323 rows × 1 columns

Формирование конвейера для классификации данных¶

preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация

preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование

features_preprocessing -- трансформер для предобработки признаков

features_engineering -- трансформер для конструирования признаков

drop_columns -- трансформер для удаления колонок

features_postprocessing -- трансформер для унитарного кодирования новых признаков

pipeline_end -- основной конвейер предобработки данных и конструирования признаков

Конвейер выполняется последовательно.

Трансформер выполняет параллельно для указанного набора колонок.

Документация:

https://scikit-learn.org/1.5/api/sklearn.pipeline.html

https://scikit-learn.org/1.5/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer

In [26]:

from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from custom_transformers import HouseFeatures


columns_to_drop = ["waterfront", "yr_built", "zipcode"]
num_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and df[column].dtype != "object"
]
cat_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and df[column].dtype == "object"
]

num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

cat_imputer = SimpleImputer(strategy="constant", fill_value=-1)
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num, num_columns),
        ("prepocessing_cat", preprocessing_cat, cat_columns),
        ("prepocessing_features", cat_imputer, ["yr_built", "zipcode"]),
    ],
    remainder="passthrough",
)

features_engineering = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("add_features", HouseFeatures(), ["yr_built", "zipcode"]),
    ],
    remainder="passthrough",
)

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

features_postprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_cat", preprocessing_cat, ["Region"]),
    ],
    remainder="passthrough",
)

pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("features_engineering", features_engineering),
        ("drop_columns", drop_columns),
        ("features_postprocessing", features_postprocessing),
    ]
)

Демонстрация работы конвейера для предобработки данных при классификации¶

In [27]:

preprocessing_result = pipeline_end.fit_transform(X_train)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

preprocessed_df

Out[27]:

	Region_north	House_age	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	view	condition	...	date_20150506T000000	date_20150507T000000	date_20150508T000000	date_20150509T000000	date_20150510T000000	date_20150511T000000	date_20150512T000000	date_20150513T000000	date_20150514T000000	date_20150515T000000
id
3046200125	0.0	78	-0.945119	-1.468373	-1.448400	-1.462069	-0.205788	-0.918509	-0.305883	0.909775	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1853000030	1.0	35	0.667867	-0.393286	0.503345	1.605653	0.405288	0.935992	-0.305883	-0.628763	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1825079005	1.0	25	0.566528	0.681800	0.503345	0.786866	5.369556	0.935992	-0.305883	-0.628763	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2523039315	0.0	34	-0.159739	-0.393286	-0.147237	0.546688	0.006065	0.008742	-0.305883	-0.628763	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
6623400246	1.0	58	-0.950749	0.681800	-1.448400	-0.796122	-0.090424	-0.918509	-0.305883	-0.628763	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2523069134	1.0	51	-0.120329	0.681800	0.503345	0.437517	1.780808	-0.918509	2.308411	0.909775	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1931300412	1.0	16	-0.176628	-0.393286	0.178054	-0.970797	-0.330298	2.790494	-0.305883	-0.628763	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4331000400	0.0	68	-0.804370	-0.393286	-0.797819	-1.014465	-0.051023	-0.918509	-0.305883	-0.628763	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
9212900180	1.0	82	0.625642	0.681800	0.503345	0.743197	-0.218588	0.935992	-0.305883	2.448313	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
7000100775	1.0	38	0.245619	-0.393286	-0.147237	-0.381270	-0.073854	-0.918509	-0.305883	0.909775	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

17290 rows × 384 columns

Формирование набора моделей для классификации¶

logistic -- логистическая регрессия

ridge -- гребневая регрессия

decision_tree -- дерево решений

knn -- k-ближайших соседей

naive_bayes -- наивный Байесовский классификатор

gradient_boosting -- метод градиентного бустинга (набор деревьев решений)

random_forest -- метод случайного леса (набор деревьев решений)

mlp -- многослойный персептрон (нейронная сеть)

Документация: https://scikit-learn.org/1.5/supervised_learning.html

In [28]:

from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree

class_models = {
    "logistic": {"model": linear_model.LogisticRegression()},
    # "ridge": {"model": linear_model.RidgeClassifierCV(cv=5, class_weight="balanced")},
    "ridge": {"model": linear_model.LogisticRegression(penalty="l2", class_weight="balanced")},
    "decision_tree": {
        "model": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsClassifier(n_neighbors=7)},
    "naive_bayes": {"model": naive_bayes.GaussianNB()},
    "gradient_boosting": {
        "model": ensemble.GradientBoostingClassifier(n_estimators=210)
    },
    "random_forest": {
        "model": ensemble.RandomForestClassifier(
            max_depth=11, class_weight="balanced", random_state=random_state
        )
    },
    "mlp": {
        "model": neural_network.MLPClassifier(
            hidden_layer_sizes=(7,),
            max_iter=500,
            early_stopping=True,
            random_state=random_state,
        )
    },
}

Обучение моделей на обучающем наборе данных и оценка на тестовом¶

In [29]:

import numpy as np
from sklearn import metrics

for model_name in class_models.keys():
    print(f"Model: {model_name}")
    model = class_models[model_name]["model"]

    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
    model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())

    y_train_predict = model_pipeline.predict(X_train)
    y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]
    y_test_predict = np.where(y_test_probs > 0.5, 1, 0)

    class_models[model_name]["pipeline"] = model_pipeline
    class_models[model_name]["probs"] = y_test_probs
    class_models[model_name]["preds"] = y_test_predict

    class_models[model_name]["Precision_train"] = metrics.precision_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Precision_test"] = metrics.precision_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Recall_train"] = metrics.recall_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Recall_test"] = metrics.recall_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Accuracy_train"] = metrics.accuracy_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Accuracy_test"] = metrics.accuracy_score(
        y_test, y_test_predict
    )
    class_models[model_name]["ROC_AUC_test"] = metrics.roc_auc_score(
        y_test, y_test_probs
    )
    class_models[model_name]["F1_train"] = metrics.f1_score(y_train, y_train_predict)
    class_models[model_name]["F1_test"] = metrics.f1_score(y_test, y_test_predict)
    class_models[model_name]["MCC_test"] = metrics.matthews_corrcoef(
        y_test, y_test_predict
    )
    class_models[model_name]["Cohen_kappa_test"] = metrics.cohen_kappa_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Confusion_matrix"] = metrics.confusion_matrix(
        y_test, y_test_predict
    )

Model: logistic

c:\Users\ogoro\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\ogoro\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

Model: ridge

c:\Users\ogoro\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\ogoro\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

Model: decision_tree

c:\Users\ogoro\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

Model: knn

c:\Users\ogoro\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

Model: naive_bayes

c:\Users\ogoro\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

Model: gradient_boosting

c:\Users\ogoro\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

Model: random_forest

c:\Users\ogoro\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

Model: mlp

c:\Users\ogoro\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

Сводная таблица оценок качества для использованных моделей классификации¶

Документация: https://scikit-learn.org/1.5/modules/model_evaluation.html

Матрица неточностей

In [32]:

from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)
for index, key in enumerate(class_models.keys()):
    c_matrix = class_models[key]["Confusion_matrix"]
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=["no water", "water"]
    ).plot(ax=ax.flat[index])
    disp.ax_.set_title(key)

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)
plt.show()

No description has been provided for this image

Точность, полнота, верность (аккуратность), F-мера

In [34]:

class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
]
class_metrics.sort_values(
    by="Accuracy_test", ascending=False
).style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
    ],
)

Out[34]:

	Precision_train	Precision_test	Recall_train	Recall_test	Accuracy_train	Accuracy_test	F1_train	F1_test
logistic	0.813725	0.676471	0.638462	0.696970	0.996183	0.995142	0.715517	0.686567
decision_tree	0.934307	0.678571	0.984615	0.575758	0.999364	0.994680	0.958801	0.622951
gradient_boosting	1.000000	0.612903	1.000000	0.575758	1.000000	0.993986	1.000000	0.593750
mlp	0.789474	0.586207	0.576923	0.515152	0.995662	0.993523	0.666667	0.548387
knn	0.950000	1.000000	0.146154	0.060606	0.993522	0.992829	0.253333	0.114286
random_forest	0.372493	0.333333	1.000000	0.878788	0.987334	0.985658	0.542797	0.483333
ridge	0.343915	0.300971	1.000000	0.939394	0.985656	0.982882	0.511811	0.455882
naive_bayes	0.018619	0.006916	1.000000	0.363636	0.603702	0.596576	0.036558	0.013575

ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса

In [35]:

class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
]
class_metrics.sort_values(by="ROC_AUC_test", ascending=False).style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=[
        "ROC_AUC_test",
        "MCC_test",
        "Cohen_kappa_test",
    ],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Accuracy_test",
        "F1_test",
    ],
)

Out[35]:

	Accuracy_test	F1_test	ROC_AUC_test	Cohen_kappa_test	MCC_test
logistic	0.995142	0.686567	0.996073	0.684120	0.684197
ridge	0.982882	0.455882	0.995416	0.449517	0.526537
mlp	0.993523	0.548387	0.994420	0.545139	0.546293
gradient_boosting	0.993986	0.593750	0.994137	0.590723	0.591016
random_forest	0.985658	0.483333	0.992880	0.477550	0.536289
knn	0.992829	0.114286	0.844971	0.113512	0.245298
decision_tree	0.994680	0.622951	0.786180	0.620290	0.622414
naive_bayes	0.596576	0.013575	0.481002	-0.001429	-0.006747

In [36]:

best_model = str(class_metrics.sort_values(by="MCC_test", ascending=False).iloc[0].name)

display(best_model)

'logistic'

Вывод данных с ошибкой предсказания для оценки¶

In [38]:

preprocessing_result = pipeline_end.transform(X_test)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

y_pred = class_models[best_model]["preds"]

error_index = y_test[y_test["waterfront"] != y_pred].index.tolist()
display(f"Error items count: {len(error_index)}")

error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]
error_df = X_test.loc[error_index].copy()
error_df.insert(loc=1, column="Predicted", value=error_predicted)
error_df.sort_index()

c:\Users\ogoro\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

'Error items count: 21'

Out[38]:

	date	Predicted	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	...	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
id
121039042	20150313T000000	0	425000.0	3	2.75	3610	107386	1.5	1	3	...	8	3130	480	1918	1962	98023	47.3351	-122.362	2630	42126
624069108	20140812T000000	0	3200000.0	4	3.25	7000	28206	1.0	1	4	...	12	3500	3500	1991	0	98075	47.5928	-122.086	4913	14663
1025039086	20140916T000000	0	1875000.0	3	2.50	3280	29111	2.0	1	3	...	11	3280	0	1925	0	98199	47.6699	-122.416	3530	21074
1732800780	20150212T000000	1	3065000.0	5	3.00	4150	7500	2.5	0	4	...	11	3510	640	1909	0	98119	47.6303	-122.362	2250	4050
2122039094	20141126T000000	0	705000.0	3	3.00	1970	20978	2.0	1	3	...	9	1770	200	1980	0	98070	47.3844	-122.438	2280	75396
2923039243	20141113T000000	0	340000.0	4	1.00	1200	11834	1.0	1	3	...	6	1200	0	1972	0	98070	47.4557	-122.443	1670	47462
3024059014	20150325T000000	0	1900000.0	4	2.25	3020	11489	1.5	1	3	...	10	2110	910	1916	1988	98040	47.5395	-122.210	3890	11489
3222049024	20140522T000000	1	361000.0	3	1.00	1100	4046	1.5	0	4	...	6	1100	0	1922	0	98198	47.3440	-122.331	2550	7847
3426049284	20140819T000000	0	2300000.0	4	3.25	4110	15929	2.0	1	4	...	12	2720	1390	2001	0	98115	47.6934	-122.271	2640	15929
3741600020	20140915T000000	1	540000.0	3	2.25	2100	20018	1.0	0	4	...	8	1470	630	1948	0	98166	47.4544	-122.366	2410	17196
3760500336	20141126T000000	1	2125000.0	4	2.75	3190	19513	2.0	0	4	...	10	3190	0	1982	0	98034	47.6991	-122.235	2750	13496
3867400175	20150224T000000	1	850000.0	2	1.50	1800	4144	1.0	0	4	...	7	900	900	1962	0	98116	47.5934	-122.390	2090	4173
6329000050	20150310T000000	0	641500.0	1	1.00	1000	9084	1.0	1	3	...	7	1000	0	1950	0	98146	47.5007	-122.382	1090	6536
6762700020	20141013T000000	1	7700000.0	6	8.00	12050	27600	2.5	0	3	...	13	8570	3480	1910	1987	98102	47.6298	-122.323	3940	8800
7278100515	20140821T000000	0	1295000.0	2	2.50	2910	19449	2.0	1	4	...	9	1940	970	1985	0	98177	47.7729	-122.393	2540	23598
7490000040	20140718T000000	1	2535000.0	5	3.25	3730	10626	1.0	0	4	...	10	3730	0	1963	0	98004	47.6240	-122.221	4180	19110
7631200292	20140626T000000	1	669000.0	2	1.75	1950	10766	1.0	0	3	...	6	1160	790	1952	0	98166	47.4504	-122.377	1780	11721
7636800041	20140625T000000	0	995000.0	3	4.50	4380	47044	2.0	1	3	...	9	3720	660	1968	1990	98166	47.4734	-122.365	2460	18512
8907500070	20150413T000000	1	5350000.0	5	5.00	8000	23985	2.0	0	4	...	12	6720	1280	2009	0	98004	47.6232	-122.220	4600	21750
8964800890	20150109T000000	1	3200000.0	3	3.25	4560	13363	1.0	0	4	...	11	2760	1800	1995	0	98004	47.6205	-122.214	4060	13362
9208900037	20140919T000000	1	6885000.0	6	7.75	9890	31374	2.0	0	4	...	13	8860	1030	2001	0	98039	47.6305	-122.240	4540	42730

21 rows × 21 columns

Пример использования обученной модели (конвейера) для предсказания¶

In [44]:

model = class_models[best_model]["pipeline"]

example_id = 624069108
test = pd.DataFrame(X_test.loc[example_id, :]).T
test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T
display(test)
display(test_preprocessed)
result_proba = model.predict_proba(test)[0]
result = model.predict(test)[0]
real = int(y_test.loc[example_id].values[0])
display(f"predicted: {result} (proba: {result_proba})")
display(f"real: {real}")

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
624069108	20140812T000000	3200000.0	4	3.25	7000	28206	1.0	1	4	4	12	3500	3500	1991	0	98075	47.5928	-122.086	4913	14663

	Region_north	House_age	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	view	condition	...	date_20150506T000000	date_20150507T000000	date_20150508T000000	date_20150509T000000	date_20150510T000000	date_20150511T000000	date_20150512T000000	date_20150513T000000	date_20150514T000000	date_20150515T000000
624069108	1.0	33.0	7.494206	0.6818	1.479217	5.372072	0.29821	-0.918509	4.922704	0.909775	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

1 rows × 384 columns

'predicted: 0 (proba: [0.8437713 0.1562287])'

'real: 1'

Подбор гиперпараметров методом поиска по сетке¶

https://www.kaggle.com/code/sociopath00/random-forest-using-gridsearchcv

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [ ]:

from sklearn.model_selection import GridSearchCV

optimized_model_type = "random_forest"

random_forest_model = class_models[optimized_model_type]["pipeline"]

param_grid = {
    "model__n_estimators": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],
    "model__max_features": ["sqrt", "log2", 2],
    "model__max_depth": [2, 3, 4, 5, 6, 7, 8, 9 ,10],
    "model__criterion": ["gini", "entropy", "log_loss"],
}

gs_optomizer = GridSearchCV(
    estimator=random_forest_model, param_grid=param_grid, n_jobs=-1
)
gs_optomizer.fit(X_train, y_train.values.ravel())
gs_optomizer.best_params_

Обучение модели с новыми гиперпараметрами

In [90]:

optimized_model = ensemble.RandomForestClassifier(
    random_state=random_state,
    criterion="gini",
    max_depth=7,
    max_features="sqrt",
    n_estimators=30,
)

result = {}

result["pipeline"] = Pipeline([("pipeline", pipeline_end), ("model", optimized_model)]).fit(X_train, y_train.values.ravel())
result["train_preds"] = result["pipeline"].predict(X_train)
result["probs"] = result["pipeline"].predict_proba(X_test)[:, 1]
result["preds"] = np.where(result["probs"] > 0.5, 1, 0)

result["Precision_train"] = metrics.precision_score(y_train, result["train_preds"])
result["Precision_test"] = metrics.precision_score(y_test, result["preds"])
result["Recall_train"] = metrics.recall_score(y_train, result["train_preds"])
result["Recall_test"] = metrics.recall_score(y_test, result["preds"])
result["Accuracy_train"] = metrics.accuracy_score(y_train, result["train_preds"])
result["Accuracy_test"] = metrics.accuracy_score(y_test, result["preds"])
result["ROC_AUC_test"] = metrics.roc_auc_score(y_test, result["probs"])
result["F1_train"] = metrics.f1_score(y_train, result["train_preds"])
result["F1_test"] = metrics.f1_score(y_test, result["preds"])
result["MCC_test"] = metrics.matthews_corrcoef(y_test, result["preds"])
result["Cohen_kappa_test"] = metrics.cohen_kappa_score(y_test, result["preds"])
result["Confusion_matrix"] = metrics.confusion_matrix(y_test, result["preds"])

Формирование данных для оценки старой и новой версии модели

In [98]:

optimized_metrics = pd.DataFrame(columns=list(result.keys()))
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=class_models[optimized_model_type]
)
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=result
)
optimized_metrics.insert(loc=0, column="Name", value=["Old", "New"])
optimized_metrics = optimized_metrics.set_index("Name")

Оценка параметров старой и новой модели

In [99]:

optimized_metrics[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
].style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
    ],
)

Out[99]:

	Precision_train	Precision_test	Recall_train	Recall_test	Accuracy_train	Accuracy_test	F1_train	F1_test
Name
Old	0.894340	0.794118	0.868132	0.782609	0.910112	0.837989	0.881041	0.788321
New	0.867220	0.822581	0.765568	0.739130	0.865169	0.837989	0.813230	0.778626

In [100]:

optimized_metrics[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
].style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=[
        "ROC_AUC_test",
        "MCC_test",
        "Cohen_kappa_test",
    ],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Accuracy_test",
        "F1_test",
    ],
)

Out[100]:

	Accuracy_test	F1_test	ROC_AUC_test	Cohen_kappa_test	MCC_test
Name
Old	0.837989	0.788321	0.858893	0.657111	0.657157
New	0.837989	0.778626	0.859750	0.651447	0.653765