AIM-PIbd-32-Borovkov-M-V/Lab_4/lab4.ipynb
2024-12-24 22:38:45 +04:00

76 KiB
Raw Permalink Blame History

In [1]:
from sklearn.utils import resample
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,
    matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import featuretools as ft
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("healthcare-dataset-stroke-data.csv")

# Обработка пропущенных значений
df["bmi"] = df["bmi"].fillna(df["bmi"].median())

# Удаление или замена неизвестных категорий
# Например, замена 'Other' на 'Unknown' в столбце 'gender'
df['gender'] = df['gender'].replace('Other', 'Unknown')

# Разделяем классы
stroke_0 = df[df['stroke'] == 0]
stroke_1 = df[df['stroke'] == 1]

# Увеличиваем выборку для 1
stroke_1 = resample(stroke_1, replace=True, n_samples=len(stroke_0), random_state=42)

# Объединяем классы
df = pd.concat([stroke_0, stroke_1])

df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 9722 entries, 249 to 134
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 9722 non-null   int64  
 1   gender             9722 non-null   object 
 2   age                9722 non-null   float64
 3   hypertension       9722 non-null   int64  
 4   heart_disease      9722 non-null   int64  
 5   ever_married       9722 non-null   object 
 6   work_type          9722 non-null   object 
 7   Residence_type     9722 non-null   object 
 8   avg_glucose_level  9722 non-null   float64
 9   bmi                9722 non-null   float64
 10  smoking_status     9722 non-null   object 
 11  stroke             9722 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 987.4+ KB

Классификация: Предсказать вероятность инсульта на основе данных пациента.

Регрессия: Предсказать уровень глюкозы в крови на основе данных пациента.

In [2]:
# Определение целевых переменных
X = df.drop('stroke', axis=1)
y_class = df['stroke']  # Задача классификации
y_reg = df['avg_glucose_level']  # Задача регрессии

# Преобразование категориальных переменных
categorical_features = ['gender', 'ever_married', 'smoking_status']
numerical_features = ['age', 'avg_glucose_level', 'bmi']

# Создание ColumnTransformer с обработкой неизвестных категорий
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])  # Используем handle_unknown='ignore'

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42) 

def estimate_bias_variance(model, X, y):
    predictions = np.array([model.fit(X, y).predict(X) for _ in range(1000)])
    bias = np.mean((y - np.mean(predictions, axis=0)) ** 2)
    variance = np.mean(np.var(predictions, axis=0))
    return bias, variance

Классификация

In [3]:
# Задача классификации
class_pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))])

class_pipeline_sgd = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log_loss', penalty='l2', random_state=42, max_iter=2000))]) 

# Настройка гиперпараметров
param_grid_class_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]}

param_grid_class_sgd = {
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive'],
    'classifier__eta0': [0.01, 0.1]}

# Поиск гиперпараметров
grid_search_class_rf = GridSearchCV(class_pipeline_rf, param_grid_class_rf, cv=5, scoring='accuracy')
grid_search_class_rf.fit(X_train, y_class_train)

grid_search_class_sgd = GridSearchCV(class_pipeline_sgd, param_grid_class_sgd, cv=5, scoring='accuracy')
grid_search_class_sgd.fit(X_train, y_class_train)
Out[3]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['age',
                                                                          'avg_glucose_level',
                                                                          'bmi']),
                                                                        ('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['gender',
                                                                          'ever_married',
                                                                          'smoking_status'])])),
                                       ('classifier',
                                        SGDClassifier(loss='log_loss',
                                                      max_iter=2000,
                                                      random_state=42))]),
             param_grid={'classifier__alpha': [0.0001, 0.001, 0.01],
                         'classifier__eta0': [0.01, 0.1],
                         'classifier__learning_rate': ['constant', 'adaptive']},
             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['age',
                                                                          'avg_glucose_level',
                                                                          'bmi']),
                                                                        ('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['gender',
                                                                          'ever_married',
                                                                          'smoking_status'])])),
                                       ('classifier',
                                        SGDClassifier(loss='log_loss',
                                                      max_iter=2000,
                                                      random_state=42))]),
             param_grid={'classifier__alpha': [0.0001, 0.001, 0.01],
                         'classifier__eta0': [0.01, 0.1],
                         'classifier__learning_rate': ['constant', 'adaptive']},
             scoring='accuracy')
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['age', 'avg_glucose_level',
                                                   'bmi']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['gender', 'ever_married',
                                                   'smoking_status'])])),
                ('classifier',
                 SGDClassifier(alpha=0.001, eta0=0.01, learning_rate='constant',
                               loss='log_loss', max_iter=2000,
                               random_state=42))])
ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['age', 'avg_glucose_level', 'bmi']),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['gender', 'ever_married', 'smoking_status'])])
['age', 'avg_glucose_level', 'bmi']
StandardScaler()
['gender', 'ever_married', 'smoking_status']
OneHotEncoder(handle_unknown='ignore')
SGDClassifier(alpha=0.001, eta0=0.01, learning_rate='constant', loss='log_loss',
              max_iter=2000, random_state=42)
In [4]:
# Оценка моделей
y_class_pred_rf = grid_search_class_rf.predict(X_test)
y_class_pred_sgd = grid_search_class_sgd.predict(X_test)

print("Classification Report for Random Forest:")
print(classification_report(y_class_test, y_class_pred_rf))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_class_test, y_class_pred_rf))

print("Classification Report for SGD:")
print(classification_report(y_class_test, y_class_pred_sgd))
print("Confusion Matrix for SGD:")
print(confusion_matrix(y_class_test, y_class_pred_sgd))
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       993
           1       0.97      1.00      0.98       952

    accuracy                           0.98      1945
   macro avg       0.98      0.98      0.98      1945
weighted avg       0.99      0.98      0.98      1945

Confusion Matrix for Random Forest:
[[963  30]
 [  0 952]]
Classification Report for SGD:
              precision    recall  f1-score   support

           0       0.84      0.73      0.78       993
           1       0.75      0.85      0.80       952

    accuracy                           0.79      1945
   macro avg       0.80      0.79      0.79      1945
weighted avg       0.80      0.79      0.79      1945

Confusion Matrix for SGD:
[[728 265]
 [141 811]]
In [5]:
# Оценка смещения и дисперсии
bias_class_rf, variance_class_rf = estimate_bias_variance(grid_search_class_rf.best_estimator_, X_train, y_class_train)
bias_class_sgd, variance_class_sgd = estimate_bias_variance(grid_search_class_sgd.best_estimator_, X_train, y_class_train)
Classification Bias (Random Forest): 0.0
Classification Variance (Random Forest): 0.0
Classification Bias (SGD): 0.229008615147229
Classification Variance (SGD): 0.0
In [5]:
from sklearn.utils import resample
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,
    matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import featuretools as ft
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("healthcare-dataset-stroke-data.csv")

# Обработка пропущенных значений
df["bmi"] = df["bmi"].fillna(df["bmi"].median())

# Удаление или замена неизвестных категорий
# Например, замена 'Other' на 'Unknown' в столбце 'gender'
df['gender'] = df['gender'].replace('Other', 'Unknown')

# Разделяем классы
stroke_0 = df[df['stroke'] == 0]
stroke_1 = df[df['stroke'] == 1]

# Увеличиваем выборку для 1
stroke_1 = resample(stroke_1, replace=True, n_samples=len(stroke_0), random_state=42)

# Объединяем классы
df = pd.concat([stroke_0, stroke_1])

df = df.sample(frac=0.4)

df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 3889 entries, 4615 to 2797
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 3889 non-null   int64  
 1   gender             3889 non-null   object 
 2   age                3889 non-null   float64
 3   hypertension       3889 non-null   int64  
 4   heart_disease      3889 non-null   int64  
 5   ever_married       3889 non-null   object 
 6   work_type          3889 non-null   object 
 7   Residence_type     3889 non-null   object 
 8   avg_glucose_level  3889 non-null   float64
 9   bmi                3889 non-null   float64
 10  smoking_status     3889 non-null   object 
 11  stroke             3889 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 395.0+ KB
In [6]:
# Определение целевых переменных
X = df.drop('stroke', axis=1)
y_class = df['stroke']  # Задача классификации
y_reg = df['avg_glucose_level']  # Задача регрессии

# Преобразование категориальных переменных
categorical_features = ['gender', 'ever_married', 'smoking_status']
numerical_features = ['age', 'avg_glucose_level', 'bmi']

# Создание ColumnTransformer с обработкой неизвестных категорий
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])  # Используем handle_unknown='ignore'

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42) 

def estimate_bias_variance(model, X, y):
    predictions = np.array([model.fit(X, y).predict(X) for _ in range(1000)])
    bias = np.mean((y - np.mean(predictions, axis=0)) ** 2)
    variance = np.mean(np.var(predictions, axis=0))
    return bias, variance
In [7]:
# Задача регрессии
reg_pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))])

reg_pipeline_sgd = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SGDRegressor(loss='squared_error', penalty='l2', random_state=42, max_iter=2000))])

# Настройка гиперпараметров для регрессии
param_grid_reg_rf = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20]}

param_grid_reg_sgd = {
    'regressor__alpha': [0.0001, 0.001, 0.01],
    'regressor__learning_rate': ['constant', 'adaptive'],
    'regressor__eta0': [0.01, 0.1]}

# Поиск гиперпараметров
grid_search_reg_rf = GridSearchCV(reg_pipeline_rf, param_grid_reg_rf, cv=5, scoring='r2')
grid_search_reg_rf.fit(X_train, y_reg_train)

grid_search_reg_sgd = GridSearchCV(reg_pipeline_sgd, param_grid_reg_sgd, cv=5, scoring='r2')
grid_search_reg_sgd.fit(X_train, y_reg_train)
Out[7]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['age',
                                                                          'avg_glucose_level',
                                                                          'bmi']),
                                                                        ('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['gender',
                                                                          'ever_married',
                                                                          'smoking_status'])])),
                                       ('regressor',
                                        SGDRegressor(max_iter=2000,
                                                     random_state=42))]),
             param_grid={'regressor__alpha': [0.0001, 0.001, 0.01],
                         'regressor__eta0': [0.01, 0.1],
                         'regressor__learning_rate': ['constant', 'adaptive']},
             scoring='r2')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['age',
                                                                          'avg_glucose_level',
                                                                          'bmi']),
                                                                        ('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['gender',
                                                                          'ever_married',
                                                                          'smoking_status'])])),
                                       ('regressor',
                                        SGDRegressor(max_iter=2000,
                                                     random_state=42))]),
             param_grid={'regressor__alpha': [0.0001, 0.001, 0.01],
                         'regressor__eta0': [0.01, 0.1],
                         'regressor__learning_rate': ['constant', 'adaptive']},
             scoring='r2')
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['age', 'avg_glucose_level',
                                                   'bmi']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['gender', 'ever_married',
                                                   'smoking_status'])])),
                ('regressor',
                 SGDRegressor(eta0=0.1, learning_rate='adaptive', max_iter=2000,
                              random_state=42))])
ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['age', 'avg_glucose_level', 'bmi']),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['gender', 'ever_married', 'smoking_status'])])
['age', 'avg_glucose_level', 'bmi']
StandardScaler()
['gender', 'ever_married', 'smoking_status']
OneHotEncoder(handle_unknown='ignore')
SGDRegressor(eta0=0.1, learning_rate='adaptive', max_iter=2000, random_state=42)
In [8]:
# Оценка моделей
y_reg_pred_rf = grid_search_reg_rf.predict(X_test)
y_reg_pred_sgd = grid_search_reg_sgd.predict(X_test)

print("Regression Metrics for Random Forest:")
print("Mean Squared Error:", mean_squared_error(y_reg_test, y_reg_pred_rf))
print("R2 Score:", r2_score(y_reg_test, y_reg_pred_rf))

print("Regression Metrics for SGD:")
print("Mean Squared Error:", mean_squared_error(y_reg_test, y_reg_pred_sgd))
print("R2 Score:", r2_score(y_reg_test, y_reg_pred_sgd))
Regression Metrics for Random Forest:
Mean Squared Error: 0.020541479842551013
R2 Score: 0.9999932939237861
Regression Metrics for SGD:
Mean Squared Error: 4.648382893338219e-05
R2 Score: 0.9999999848246522
In [9]:
# Оценка смещения и дисперсии
bias_reg_rf, variance_reg_rf = estimate_bias_variance(grid_search_reg_rf.best_estimator_, X_train, y_reg_train)
bias_reg_sgd, variance_reg_sgd = estimate_bias_variance(grid_search_reg_sgd.best_estimator_, X_train, y_reg_train)

print("Regression Bias (Random Forest):", bias_reg_rf)
print("Regression Variance (Random Forest):", variance_reg_rf)
print("Regression Bias (SGD):", bias_reg_sgd)
print("Regression Variance (SGD):", variance_reg_sgd)
Regression Bias (Random Forest): 0.0022602929210874885
Regression Variance (Random Forest): 3.608883047891326e-24
Regression Bias (SGD): 4.682701837803326e-05
Regression Variance (SGD): 3.2443460449162085e-24