AIM-PIbd-31-Yakovlev-M-G/lab_4/lab_4.ipynb


In [337]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import set_config

df = pd.read_csv("data/house_data.csv", sep=",", nrows=10000)
df = df.dropna()  # drop rows with missing values (assign back, otherwise the result is discarded)
df
Out[337]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9995 322059264 20140926T000000 279000.0 2 1.00 1020 47044 1.0 0 0 ... 7 1020 0 1904 1958 98042 47.4206 -122.155 1930 12139
9996 5557500270 20150209T000000 262000.0 3 1.50 1700 9579 1.0 0 0 ... 7 1100 600 1962 0 98023 47.3209 -122.338 1700 9628
9997 9164100125 20140807T000000 533000.0 4 1.00 1550 4750 1.5 0 0 ... 7 1550 0 1919 0 98117 47.6824 -122.389 1320 4750
9998 7370600045 20150402T000000 640000.0 3 1.75 1680 8100 1.0 0 2 ... 8 1680 0 1950 0 98177 47.7212 -122.364 1880 7750
9999 8594400060 20140609T000000 285000.0 3 2.25 1680 35127 2.0 0 0 ... 7 1680 0 1987 0 98092 47.3025 -122.067 1820 35166

10000 rows × 21 columns

Removing outliers in the price column and adding a price-category column

In [338]:
q1 = df['price'].quantile(0.25)  # first quartile (Q1)
q3 = df['price'].quantile(0.75)  # third quartile (Q3)
iqr = q3 - q1  # interquartile range (IQR)

# Outlier bounds
lower_bound = q1 - 1.5 * iqr  # lower bound
upper_bound = q3 + 1.5 * iqr  # upper bound

# Remove outliers: clamp values below the lower bound to the bound itself,
# and values above the upper bound to the upper bound
df['price'] = df['price'].clip(lower=lower_bound, upper=upper_bound)

# Add a column with price categories
df['price_category'] = pd.cut(
    df['price'],
    bins=[75000, 338750, 602750, 866750, 1130750],
    labels=['low', 'middle', 'high', 'very_high'],
    include_lowest=True,
)
df.tail(20)
Out[338]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 price_category
9980 6840700036 20140728T000000 497000.0 2 1.00 770 3325 1.0 0 0 ... 770 0 1918 0 98122 47.6102 -122.299 960 4800 middle
9981 1824069083 20150429T000000 835000.0 3 1.00 3060 30166 1.0 0 0 ... 3060 0 1959 0 98027 47.5656 -122.093 1880 19602 high
9982 1836980240 20141015T000000 730000.0 4 2.75 2920 4500 2.0 0 0 ... 2920 0 1999 0 98006 47.5646 -122.124 2920 4505 high
9983 3528900160 20141001T000000 655000.0 3 1.00 1370 5250 1.0 0 0 ... 1070 300 1939 0 98109 47.6421 -122.348 2410 4200 high
9984 1442800060 20141120T000000 205000.0 3 2.50 1870 3118 2.0 0 0 ... 1870 0 1993 0 98038 47.3739 -122.056 1580 3601 low
9985 8722100030 20150407T000000 632750.0 4 2.00 1800 4800 1.5 0 0 ... 1800 0 1918 0 98112 47.6388 -122.302 1950 4800 high
9986 1723049624 20140512T000000 330000.0 5 3.00 2100 7715 1.0 0 0 ... 1250 850 2013 0 98168 47.4866 -122.319 2100 7959 low
9987 4040400200 20141007T000000 527500.0 5 2.25 2530 8250 2.0 0 0 ... 2530 0 1961 0 98007 47.6117 -122.134 2020 8250 middle
9988 8691391090 20140508T000000 716500.0 4 2.50 3290 6465 2.0 0 0 ... 3290 0 2002 0 98075 47.5981 -121.976 3100 5929 high
9989 7853302190 20141217T000000 388500.0 4 2.50 1890 5395 2.0 0 0 ... 1890 0 2006 0 98065 47.5415 -121.883 2060 5395 middle
9990 3260000700 20140904T000000 530000.0 3 1.75 1680 7770 1.0 0 0 ... 1680 0 1967 0 98005 47.6028 -122.167 1880 7770 middle
9991 5126300510 20150108T000000 419000.0 3 2.50 2170 4517 2.0 0 0 ... 2170 0 2002 0 98059 47.4819 -122.140 2610 4770 middle
9992 7199330370 20150309T000000 385000.0 3 1.75 1200 7360 1.0 0 0 ... 1200 0 1978 0 98052 47.6979 -122.130 1200 7500 middle
9993 1854900240 20140528T000000 655000.0 4 2.50 2990 5669 2.0 0 0 ... 2990 0 2003 0 98074 47.6119 -122.011 3110 5058 high
9994 6738700335 20140701T000000 1127312.5 4 2.75 3770 10900 2.0 0 2 ... 3070 700 1924 0 98144 47.5849 -122.290 3000 5000 very_high
9995 322059264 20140926T000000 279000.0 2 1.00 1020 47044 1.0 0 0 ... 1020 0 1904 1958 98042 47.4206 -122.155 1930 12139 low
9996 5557500270 20150209T000000 262000.0 3 1.50 1700 9579 1.0 0 0 ... 1100 600 1962 0 98023 47.3209 -122.338 1700 9628 low
9997 9164100125 20140807T000000 533000.0 4 1.00 1550 4750 1.5 0 0 ... 1550 0 1919 0 98117 47.6824 -122.389 1320 4750 middle
9998 7370600045 20150402T000000 640000.0 3 1.75 1680 8100 1.0 0 2 ... 1680 0 1950 0 98177 47.7212 -122.364 1880 7750 high
9999 8594400060 20140609T000000 285000.0 3 2.25 1680 35127 2.0 0 0 ... 1680 0 1987 0 98092 47.3025 -122.067 1820 35166 low

20 rows × 22 columns
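Note that pd.cut assigns NaN to values falling outside the outermost bin edges, so it is worth confirming that every clipped price actually received a category. A minimal sketch using the columns above:

# pd.cut marks values outside [75000, 1130750] as NaN;
# check that every clipped price received a category
print("uncategorized prices:", df['price_category'].isna().sum())
print(df['price_category'].value_counts())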

Business goals

  1. Predicting the price class of a property (classification)
  2. Assessing the condition of a property (regression)

Determining the achievable level of model quality for the first task

Splitting the dataset into training and test sets (80/20) for the classification task (target feature: price_category)

In [339]:
from typing import Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
   
    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )
    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.
    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )
    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp
    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="price_category", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
'X_train'
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 price_category
9843 3260000340 20140622T000000 732600.0 4 2.50 2130 7300 1.0 0 0 ... 1230 900 1963 0 98005 47.6050 -122.167 2130 7560 high
9623 9828702055 20140508T000000 358000.0 2 1.50 960 1808 2.0 0 0 ... 960 0 1993 0 98122 47.6183 -122.298 1290 1668 middle
3095 3438500625 20140519T000000 210000.0 3 1.00 1080 21043 1.0 0 0 ... 1080 0 1942 0 98106 47.5515 -122.357 1380 7620 low
411 2422029094 20140716T000000 517534.0 2 1.00 833 143947 1.0 0 0 ... 833 0 2006 0 98070 47.3889 -122.482 1380 143947 middle
3060 7462900015 20150108T000000 387000.0 3 2.25 1760 45133 2.0 0 0 ... 1760 0 1984 0 98065 47.5124 -121.866 1910 51773 middle
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1750 2787720140 20150407T000000 416000.0 3 2.50 1790 11542 1.0 0 0 ... 1190 600 1969 0 98059 47.5124 -122.160 1790 9131 middle
2354 6192400400 20140728T000000 775000.0 4 2.50 3090 7112 2.0 0 0 ... 3090 0 2001 0 98052 47.7050 -122.118 3050 6000 high
857 2296500036 20150310T000000 450000.0 4 2.75 2980 13260 1.0 0 0 ... 1800 1180 1979 0 98056 47.5152 -122.197 1920 10731 middle
6181 2787310130 20141212T000000 289950.0 4 1.75 2090 7416 1.0 0 0 ... 1050 1040 1970 0 98031 47.4107 -122.179 1710 7527 low
3141 8567300110 20140604T000000 485000.0 3 2.50 2340 59058 1.0 0 0 ... 2340 0 1985 0 98038 47.4052 -122.028 2700 37263 middle

8000 rows × 22 columns

'y_train'
price_category
9843 high
9623 middle
3095 low
411 middle
3060 middle
... ...
1750 middle
2354 high
857 middle
6181 low
3141 middle

8000 rows × 1 columns

'X_test'
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 price_category
5341 6632900574 20150225T000000 595000.0 5 3.00 2980 10064 1.0 0 0 ... 1680 1300 1940 0 98155 47.7372 -122.316 1590 7800 middle
4384 2423029245 20140617T000000 550000.0 3 1.75 2240 78225 2.0 0 0 ... 2240 0 1976 0 98070 47.4638 -122.484 2030 202554 middle
5795 2473370050 20140604T000000 327500.0 4 1.75 1650 7800 1.0 0 0 ... 1650 0 1968 0 98058 47.4507 -122.139 1750 10400 low
4956 9528104985 20141104T000000 611000.0 2 1.00 1270 5100 1.0 0 0 ... 1100 170 1900 0 98115 47.6771 -122.328 1670 3900 high
7723 3972900025 20150313T000000 499000.0 6 1.75 2400 7500 1.5 0 0 ... 1400 1000 1975 0 98155 47.7661 -122.313 1980 7500 middle
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8517 3876600120 20150422T000000 265000.0 3 1.50 1780 10196 1.0 0 0 ... 1270 510 1967 0 98001 47.3375 -122.291 1320 7875 low
6914 6821600005 20150403T000000 710000.0 4 1.75 2120 5400 1.0 0 0 ... 1060 1060 1941 0 98199 47.6501 -122.395 2052 6000 high
4499 2767603931 20140818T000000 469000.0 3 3.25 1370 1194 3.0 0 0 ... 1370 0 2004 0 98107 47.6718 -122.388 1800 2678 middle
8651 8802400411 20140619T000000 249000.0 3 1.00 1050 8498 1.0 0 0 ... 1050 0 1959 0 98031 47.4043 -122.202 1050 8498 low
4234 5452800735 20140722T000000 780000.0 4 2.50 2270 13449 1.0 0 0 ... 1310 960 1975 0 98040 47.5416 -122.232 2810 13475 high

2000 rows × 22 columns

'y_test'
price_category
5341 middle
4384 middle
5795 low
4956 high
7723 middle
... ...
8517 low
6914 high
4499 middle
8651 low
4234 high

2000 rows × 1 columns
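As a quick sanity check, stratification should keep the class proportions nearly identical across the full set and both splits. A minimal sketch using the variables above:

# Compare class proportions: the full set vs. the train and test splits;
# stratified splitting should keep them (almost) identical
proportions = pd.concat(
    [
        df["price_category"].value_counts(normalize=True).rename("full"),
        y_train["price_category"].value_counts(normalize=True).rename("train"),
        y_test["price_category"].value_counts(normalize=True).rename("test"),
    ],
    axis=1,
)
print(proportions)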

Building the pipeline

preprocessing_num -- pipeline for numeric features: missing-value imputation and standardization

preprocessing_cat -- pipeline for categorical features: missing-value imputation and one-hot encoding

features_preprocessing -- transformer for feature preprocessing

drop_columns -- transformer for dropping columns

pipeline_end -- the main preprocessing and feature-engineering pipeline

In [340]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

random_state = 42

# Columns to drop and columns to process
columns_to_drop = ["date", "view", "waterfront"]
num_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and df[column].dtype != "object" and df[column].dtype != "category"
]
cat_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and (df[column].dtype == "object" or df[column].dtype == "category")
]

# Preprocessing for numeric features
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

# Preprocessing for categorical features
cat_imputer = SimpleImputer(strategy="constant")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("preprocessing_num", preprocessing_num, num_columns),
        ("preprocessing_cat", preprocessing_cat, cat_columns),
        ("preprocessing_features", cat_imputer, ["price_category"]),
    ],
    remainder="passthrough",
)

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

# price_category is already one-hot encoded by features_preprocessing,
# so re-encoding it here would duplicate those columns; instead the
# final step just drops the remaining raw (imputed) column
features_postprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_price_category", "drop", ["price_category"]),
    ],
    remainder="passthrough",
)

pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
        ("features_postprocessing", features_postprocessing),
    ]
)
# preprocessing_result = pipeline_end.fit_transform(X_train)
# Run the stages one by one to inspect the intermediate result; the one-hot
# columns come out in alphabetical category order: high, low, middle, very_high
cols = ['price_h', 'price_l', 'price_m', 'price_vh']
preprocessing_result = features_preprocessing.fit_transform(X_train)
# remainder columns pass through in their original df order: date, waterfront, view
preprocessing_result = pd.DataFrame(
    preprocessing_result,
    columns=num_columns + cols + cat_columns + ['date', 'waterfront', 'view'],
)

preprocessing_result = drop_columns.fit_transform(preprocessing_result)
preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cols + cat_columns)

preprocessing_result = preprocessing_result.drop(columns=["price_category"])
preprocessing_result.head(20)
Out[340]:
id price bedrooms bathrooms sqft_living sqft_lot floors condition grade sqft_above ... yr_renovated zipcode lat long sqft_living15 sqft_lot15 price_h price_l price_m price_vh
0 -0.451103 0.916381 0.700559 0.573416 0.081706 -0.187493 -0.838739 0.839159 -0.512647 -0.638064 ... -0.2158 -1.349962 0.32254 0.340593 0.223199 -0.210584 1.0 0.0 0.0 0.0
1 1.845014 -0.589326 -1.49426 -0.72971 -1.191326 -0.302999 1.120073 -0.666734 -0.512647 -0.969739 ... -0.2158 0.820656 0.417588 -0.601419 -1.022503 -0.421966 0.0 0.0 1.0 0.0
2 -0.388708 -1.184213 -0.396851 -1.381273 -1.060759 0.101544 -0.838739 -0.666734 -1.369558 -0.822328 ... -0.2158 0.523819 -0.059795 -1.025683 -0.889035 -0.208431 0.0 1.0 0.0 0.0
3 -0.74402 0.051922 -1.49426 -1.381273 -1.32951 2.686416 -0.838739 -0.666734 -2.22647 -1.125749 ... -0.2158 -0.144063 -1.221808 -1.924549 -0.889035 4.682444 0.0 0.0 1.0 0.0
4 1.018038 -0.47276 -0.396851 0.247635 -0.320877 0.608196 1.120073 -0.666734 -0.512647 0.013003 ... -0.2158 -0.236825 -0.339221 2.505062 -0.103056 1.375604 0.0 0.0 1.0 0.0
5 -0.083826 -0.492858 -0.396851 1.550761 -0.701698 -0.314672 3.078884 -0.666734 0.344264 -0.416947 ... -0.2158 0.468162 0.987875 -0.903438 -0.844546 -0.436854 0.0 0.0 1.0 0.0
6 0.301277 -0.953091 -0.396851 0.573416 -0.712579 -0.180574 -0.838739 -0.666734 -0.512647 -0.773191 ... -0.2158 -0.886155 -1.293987 0.254302 -0.666588 -0.205992 0.0 1.0 0.0 0.0
7 -0.086798 -1.148038 -1.49426 -1.381273 -1.25661 -0.232501 -0.838739 -0.666734 -1.369558 -1.043445 ... -0.2158 0.523819 -0.249176 -1.018493 -1.600865 -0.296686 0.0 1.0 0.0 0.0
8 -0.824567 -1.148038 -1.49426 -1.381273 -1.0934 -0.15174 0.140667 0.839159 -0.512647 -0.859181 ... -0.2158 -1.387066 -1.937882 -0.60861 -0.636929 -0.137397 0.0 1.0 0.0 0.0
9 1.647935 -0.762165 2.895378 0.899198 0.963036 -0.186442 -0.838739 -0.666734 0.344264 0.037571 ... -0.2158 -1.016021 -1.783519 -0.896247 0.208369 -0.186332 0.0 1.0 0.0 0.0
10 -1.159614 -0.581287 -1.49426 -1.381273 -1.321893 -0.185096 -0.838739 0.839159 -1.369558 -1.11715 ... -0.2158 -0.830498 0.837799 0.304638 -0.355163 -0.130796 0.0 0.0 1.0 0.0
11 -1.329183 -0.681775 -1.49426 -1.381273 -1.071639 -0.200575 -0.838739 0.839159 -0.512647 -0.834612 ... -0.2158 1.024731 1.226566 -1.025683 -0.444141 -0.202404 0.0 1.0 0.0 0.0
12 0.377864 0.286926 0.700559 0.573416 0.419005 0.256379 1.120073 -0.666734 0.344264 0.848334 ... -0.2158 -0.923259 1.277306 -0.169963 0.742242 -0.071779 0.0 0.0 1.0 0.0
13 0.289882 -0.88677 -0.396851 0.573416 0.103467 -0.143853 -0.838739 -0.666734 0.344264 -0.244967 ... -0.2158 2.045107 -0.729417 -0.428836 -0.043737 -0.155335 0.0 1.0 0.0 0.0
14 1.613049 0.282907 -0.396851 -0.078147 0.103467 -0.259422 -0.838739 -0.666734 0.344264 -0.822328 ... -0.2158 0.727894 0.868529 -1.277366 0.223199 -0.338303 0.0 0.0 1.0 0.0
15 -0.962885 0.285118 0.700559 0.573416 0.005542 -0.183813 -0.838739 -0.666734 0.344264 -0.380094 ... -0.2158 -0.478004 1.195837 0.78643 0.445646 -0.180592 0.0 0.0 1.0 0.0
16 1.722145 -0.259726 -0.396851 -0.403928 -0.571131 -0.18865 -0.838739 0.839159 -0.512647 -0.269535 ... -0.2158 -0.811945 1.222993 0.168011 -0.666588 -0.213095 0.0 0.0 1.0 0.0
17 0.740562 1.589247 0.700559 1.550761 2.878025 0.466843 1.120073 -0.666734 2.058087 2.052192 ... -0.2158 -1.349962 0.604825 0.340593 2.462498 0.79434 0.0 0.0 0.0 1.0
18 -1.555659 -0.922945 -0.396851 -1.381273 -0.799624 -0.107784 -0.838739 -0.666734 -0.512647 -0.527505 ... -0.2158 1.432881 1.536008 -0.644564 -0.978014 -0.183354 0.0 1.0 0.0 0.0
19 -0.953738 0.142224 2.895378 1.224979 0.886872 4.00146 1.120073 -0.666734 -0.512647 0.713207 ... 4.605736 -0.663527 -1.135335 0.85834 0.593944 1.659169 0.0 0.0 1.0 0.0

20 rows × 22 columns
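The hand-maintained column lists above are fragile. Since scikit-learn 1.2, transformers can emit DataFrames directly, with names generated by get_feature_names_out(). A minimal sketch using the features_preprocessing transformer defined above (named_result is illustrative only):

from sklearn import set_config

set_config(transform_output="pandas")  # transformers now return DataFrames
named_result = features_preprocessing.fit_transform(X_train)
print(named_result.columns.tolist())  # e.g. 'price_category_high' instead of 'price_h'
set_config(transform_output="default")  # restore the default ndarray output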

Building the set of classification models

logistic -- logistic regression

ridge -- ridge (L2-regularized) logistic regression

decision_tree -- decision tree

knn -- k-nearest neighbors

naive_bayes -- naive Bayes classifier

gradient_boosting -- gradient boosting (an ensemble of decision trees)

random_forest -- random forest (an ensemble of decision trees)

mlp -- multilayer perceptron (a neural network)

In [341]:
from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree

class_models = {
    "logistic": {"model": linear_model.LogisticRegression()},
    # "ridge": {"model": linear_model.RidgeClassifierCV(cv=5, class_weight="balanced")},
    "ridge": {"model": linear_model.LogisticRegression(penalty="l2", class_weight="balanced")},
    "decision_tree": {
        "model": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsClassifier(n_neighbors=7)},
    "naive_bayes": {"model": naive_bayes.GaussianNB()},
    "gradient_boosting": {
        "model": ensemble.GradientBoostingClassifier(n_estimators=210)
    },
    "random_forest": {
        "model": ensemble.RandomForestClassifier(
            max_depth=11, class_weight="balanced", random_state=random_state
        )
    },
    "mlp": {
        "model": neural_network.MLPClassifier(
            hidden_layer_sizes=(7,),
            max_iter=500,
            early_stopping=True,
            random_state=random_state,
        )
    },
}
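
The evaluation below relies on predict_proba, which RidgeClassifierCV does not expose — likely why an L2-regularized logistic regression stands in for it. A quick check, as a sketch:

# Every model in the dictionary must expose predict_proba for the
# probability-based metrics computed in the next cell
for name, spec in class_models.items():
    assert hasattr(spec["model"], "predict_proba"), f"{name} lacks predict_proba"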

Training the models on the training set and evaluating them on the test set

In [343]:
import numpy as np
from sklearn import metrics, set_config

# The chained ColumnTransformers select columns by name, so every
# intermediate result must stay a DataFrame rather than an ndarray
# (passing X_train.values here is what caused the original
# "Specifying the columns using strings is only supported for dataframes" error)
set_config(transform_output="pandas")

y_train_true = y_train.values.ravel()
y_test_true = y_test.values.ravel()

for model_name in class_models.keys():
    print(f"Model: {model_name}")
    model = class_models[model_name]["model"]

    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
    model_pipeline = model_pipeline.fit(X_train, y_train_true)

    y_train_predict = model_pipeline.predict(X_train)
    y_test_predict = model_pipeline.predict(X_test)
    # predict_proba returns one column per class; keep the full matrix
    # for the multiclass ROC AUC below
    y_test_probs = model_pipeline.predict_proba(X_test)

    class_models[model_name]["pipeline"] = model_pipeline
    class_models[model_name]["probs"] = y_test_probs
    class_models[model_name]["preds"] = y_test_predict

    # price_category has four classes, so precision/recall/F1 need an
    # explicit averaging strategy; macro averaging weights all classes equally
    class_models[model_name]["Precision_train"] = metrics.precision_score(
        y_train_true, y_train_predict, average="macro", zero_division=0
    )
    class_models[model_name]["Precision_test"] = metrics.precision_score(
        y_test_true, y_test_predict, average="macro", zero_division=0
    )
    class_models[model_name]["Recall_train"] = metrics.recall_score(
        y_train_true, y_train_predict, average="macro", zero_division=0
    )
    class_models[model_name]["Recall_test"] = metrics.recall_score(
        y_test_true, y_test_predict, average="macro", zero_division=0
    )
    class_models[model_name]["Accuracy_train"] = metrics.accuracy_score(
        y_train_true, y_train_predict
    )
    class_models[model_name]["Accuracy_test"] = metrics.accuracy_score(
        y_test_true, y_test_predict
    )
    class_models[model_name]["ROC_AUC_test"] = metrics.roc_auc_score(
        y_test_true, y_test_probs, multi_class="ovr"
    )
    class_models[model_name]["F1_train"] = metrics.f1_score(
        y_train_true, y_train_predict, average="macro"
    )
    class_models[model_name]["F1_test"] = metrics.f1_score(
        y_test_true, y_test_predict, average="macro"
    )
    class_models[model_name]["MCC_test"] = metrics.matthews_corrcoef(
        y_test_true, y_test_predict
    )
    class_models[model_name]["Cohen_kappa_test"] = metrics.cohen_kappa_score(
        y_test_true, y_test_predict
    )
    class_models[model_name]["Confusion_matrix"] = metrics.confusion_matrix(
        y_test_true, y_test_predict
    )
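Once the loop completes, the collected metrics can be compared side by side. A minimal sketch over the class_models dictionary populated above (the results name is illustrative only):

# Collect the per-model metrics into a single comparison table,
# sorted by test accuracy
results = pd.DataFrame.from_dict(class_models, orient="index")[
    ["Accuracy_train", "Accuracy_test", "F1_train", "F1_test", "ROC_AUC_test"]
].sort_values(by="Accuracy_test", ascending=False)
print(results)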