132 KiB
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import set_config
df = pd.read_csv("data/house_data.csv", sep=",", nrows=10000)
df.dropna()
Устраняем выбросы в колонке цены и добавляем колонку с категориями цены¶
q1 = df['price'].quantile(0.25) # Находим 1-й квартиль (Q1)
q3 = df['price'].quantile(0.75) # Находим 3-й квартиль (Q3)
iqr = q3 - q1 # Вычисляем межквартильный размах (IQR)
# Определяем границы для выбросов
lower_bound = q1 - 1.5 * iqr # Нижняя граница
upper_bound = q3 + 1.5 * iqr # Верхняя граница
# Устраняем выбросы: заменяем значения ниже нижней границы на саму нижнюю границу, а выше верхней — на верхнюю
df['price'] = df['price'].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)
# Добавляем столбец с категорями цены
df['price_category'] = pd.cut(df['price'], bins=[75000,338750,602750,866750,1130750], labels=['low','middle','high','very_high'], include_lowest=True)
df.tail(20)
from typing import Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split
def split_stratified_into_train_val_test(
df_input,
stratify_colname="y",
frac_train=0.6,
frac_val=0.15,
frac_test=0.25,
random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
if frac_train + frac_val + frac_test != 1.0:
raise ValueError(
"fractions %f, %f, %f do not add up to 1.0"
% (frac_train, frac_val, frac_test)
)
if stratify_colname not in df_input.columns:
raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
X = df_input # Contains all columns.
y = df_input[
[stratify_colname]
] # Dataframe of just the column on which to stratify.
# Split original dataframe into train and temp dataframes.
df_train, df_temp, y_train, y_temp = train_test_split(
X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
)
if frac_val <= 0:
assert len(df_input) == len(df_train) + len(df_temp)
return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp
# Split the temp dataframe into val and test dataframes.
relative_frac_test = frac_test / (frac_val + frac_test)
df_val, df_test, y_val, y_test = train_test_split(
df_temp,
y_temp,
stratify=y_temp,
test_size=relative_frac_test,
random_state=random_state,
)
assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
return df_train, df_val, df_test, y_train, y_val, y_test
X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
df, stratify_colname="price_category", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=42
)
display("X_train", X_train)
display("y_train", y_train)
display("X_test", X_test)
display("y_test", y_test)
Формирование конвейера¶
preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация
preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование
features_preprocessing -- трансформер для предобработки признаков
drop_columns -- трансформер для удаления колонок
pipeline_end -- основной конвейер предобработки данных и конструирования признаков
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor # Пример регрессионной модели
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
random_state = 42
# Указываем столбцы, которые нужно удалить и обрабатывать
columns_to_drop = ["date", "view", "waterfront"]
num_columns = [
column
for column in df.columns
if column not in columns_to_drop and df[column].dtype != "object" and df[column].dtype != "category"
]
cat_columns = [
column
for column in df.columns
if column not in columns_to_drop and df[column].dtype == "object" or df[column].dtype == "category"
]
# Определяем предобработку для численных данных
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
[
("imputer", num_imputer),
("scaler", num_scaler),
]
)
# Определяем предобработку для категориальных данных
cat_imputer = SimpleImputer(strategy="constant")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
preprocessing_cat = Pipeline(
[
("imputer", cat_imputer),
("encoder", cat_encoder),
]
)
features_preprocessing = ColumnTransformer(
verbose_feature_names_out=False,
transformers=[
("prepocessing_num", preprocessing_num, num_columns),
("prepocessing_cat", preprocessing_cat, cat_columns),
("prepocessing_features", cat_imputer, ["price_category"]),
],
remainder="passthrough"
)
drop_columns = ColumnTransformer(
verbose_feature_names_out=False,
transformers=[
("drop_columns", "drop", columns_to_drop),
],
remainder="passthrough",
)
features_postprocessing = ColumnTransformer(
verbose_feature_names_out=False,
transformers=[
("prepocessing_cat", preprocessing_cat, ["price_category"]),
],
remainder="passthrough",
)
pipeline_end = Pipeline(
[
("features_preprocessing", features_preprocessing),
("drop_columns", drop_columns),
("features_postprocessing", features_postprocessing),
]
)
cols = ['price_h', 'price_l', 'price_m', 'price_vh']
preprocessing_result = features_preprocessing.fit_transform(X_train)
preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cat_columns + cols + columns_to_drop)
preprocessing_result = drop_columns.fit_transform(preprocessing_result)
preprocessing_result = pd.DataFrame(preprocessing_result, columns=num_columns + cols + cat_columns)
preprocessing_result = preprocessing_result.drop(columns=["price_category"])
preprocessing_result.head(20)
Формирование набора моделей для классификаци趶
logistic -- логистическая регрессия
ridge -- гребневая регрессия
decision_tree -- дерево решений
knn -- k-ближайших соседей
naive_bayes -- наивный Байесовский классификатор
gradient_boosting -- метод градиентного бустинга (набор деревьев решений)
random_forest -- метод случайного леса (набор деревьев решений)
mlp -- многослойный персептрон (нейронная сеть)
from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree
class_models = {
"logistic": {"model": linear_model.LogisticRegression()},
# "ridge": {"model": linear_model.RidgeClassifierCV(cv=5, class_weight="balanced")},
"ridge": {"model": linear_model.LogisticRegression(penalty="l2", class_weight="balanced")},
"decision_tree": {
"model": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)
},
"knn": {"model": neighbors.KNeighborsClassifier(n_neighbors=7)},
"naive_bayes": {"model": naive_bayes.GaussianNB()},
"gradient_boosting": {
"model": ensemble.GradientBoostingClassifier(n_estimators=210)
},
"random_forest": {
"model": ensemble.RandomForestClassifier(
max_depth=11, class_weight="balanced", random_state=random_state
)
},
"mlp": {
"model": neural_network.MLPClassifier(
hidden_layer_sizes=(7,),
max_iter=500,
early_stopping=True,
random_state=random_state,
)
},
}
Обучение моделей на обучающем наборе данных и оценка на тестовом¶
import numpy as np
from sklearn import metrics
for model_name in class_models.keys():
print(f"Model: {model_name}")
model = class_models[model_name]["model"]
model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())
y_train_predict = model_pipeline.predict(X_train)
y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]
y_test_predict = np.where(y_test_probs > 0.5, 1, 0)
class_models[model_name]["pipeline"] = model_pipeline
class_models[model_name]["probs"] = y_test_probs
class_models[model_name]["preds"] = y_test_predict
class_models[model_name]["Precision_train"] = metrics.precision_score(
y_train, y_train_predict
)
class_models[model_name]["Precision_test"] = metrics.precision_score(
y_test, y_test_predict
)
class_models[model_name]["Recall_train"] = metrics.recall_score(
y_train, y_train_predict
)
class_models[model_name]["Recall_test"] = metrics.recall_score(
y_test, y_test_predict
)
class_models[model_name]["Accuracy_train"] = metrics.accuracy_score(
y_train, y_train_predict
)
class_models[model_name]["Accuracy_test"] = metrics.accuracy_score(
y_test, y_test_predict
)
class_models[model_name]["ROC_AUC_test"] = metrics.roc_auc_score(
y_test, y_test_probs
)
class_models[model_name]["F1_train"] = metrics.f1_score(y_train, y_train_predict)
class_models[model_name]["F1_test"] = metrics.f1_score(y_test, y_test_predict)
class_models[model_name]["MCC_test"] = metrics.matthews_corrcoef(
y_test, y_test_predict
)
class_models[model_name]["Cohen_kappa_test"] = metrics.cohen_kappa_score(
y_test, y_test_predict
)
class_models[model_name]["Confusion_matrix"] = metrics.confusion_matrix(
y_test, y_test_predict
)