76 KiB
76 KiB
In [1]:
from sklearn.utils import resample
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import (
precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,
matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import featuretools as ft
from sklearn.metrics import accuracy_score, classification_report
# Load the raw stroke dataset.
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

# Missing values: fill absent BMI entries with the column median.
df = df.fillna({"bmi": df["bmi"].median()})

# Unknown categories: relabel the 'Other' gender value as 'Unknown'.
df["gender"] = df["gender"].replace({"Other": "Unknown"})

# Separate the two target classes.
stroke_0 = df.loc[df["stroke"] == 0]
stroke_1 = df.loc[df["stroke"] == 1]

# Oversample the stroke == 1 rows (with replacement) up to the size of the
# stroke == 0 group.
# NOTE(review): this happens before the train/test split done later in the
# notebook, so copies of the same stroke row can land on both sides of it.
stroke_1 = resample(
    stroke_1,
    replace=True,
    n_samples=stroke_0.shape[0],
    random_state=42,
)

# Recombine both classes into a single frame and summarise it.
df = pd.concat([stroke_0, stroke_1])
df.info()
Классификация: Предсказать вероятность инсульта на основе данных пациента.
Регрессия: Предсказать уровень глюкозы в крови на основе данных пациента.
In [2]:
# Columns fed to the models, grouped by how they are preprocessed.
categorical_features = ['gender', 'ever_married', 'smoking_status']
numerical_features = ['age', 'avg_glucose_level', 'bmi']

# Feature frame and the two targets.
X = df.drop(columns='stroke')
y_class, y_reg = df['stroke'], df['avg_glucose_level']  # classification / regression

# Scale the numeric columns and one-hot encode the categorical ones;
# handle_unknown='ignore' lets the encoder accept categories unseen during fit.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# 80/20 train/test split applied consistently to the features and both targets.
(
    X_train, X_test,
    y_class_train, y_class_test,
    y_reg_train, y_reg_test,
) = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42)
def estimate_bias_variance(model, X, y, n_rounds=1000, random_state=42):
    """Estimate squared bias and variance of ``model`` via bootstrap refits.

    Each round draws ``len(y)`` rows from ``(X, y)`` with replacement, fits a
    fresh deep copy of ``model`` on that sample and predicts on the full ``X``.
    Refitting on the *same* data every round (the previous behaviour) only
    measures the estimator's internal randomness, which is zero for a model
    with a fixed seed.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` returning the fitted estimator and
        ``predict(X)``. It is deep-copied each round and never modified.
    X : DataFrame or array-like
        Feature rows.
    y : Series or array-like
        Numeric targets aligned with ``X`` by position.
    n_rounds : int, default 1000
        Number of bootstrap refits.
    random_state : int or None, default 42
        Seed for the bootstrap row draws.

    Returns
    -------
    (bias, variance) : tuple of float
        ``bias`` is the mean squared difference between ``y`` and the
        per-row average prediction; ``variance`` is the mean per-row
        variance of the predictions across rounds. Both are in-sample
        figures because predictions are made on ``X`` itself.

    Raises
    ------
    ValueError
        If ``n_rounds`` is smaller than 1 or ``y`` is empty.
    """
    import copy  # function-scope import keeps this cell self-contained

    if n_rounds < 1:
        raise ValueError("n_rounds must be a positive integer")

    y_true = np.asarray(y)
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("y must contain at least one sample")

    def _take_rows(data, positions):
        # Select by position: pandas inputs may carry non-default or
        # duplicated index labels, so label-based lookup is unsafe.
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    rng = np.random.default_rng(random_state)
    predictions = np.empty((n_rounds, n_samples), dtype=float)
    for round_no in range(n_rounds):
        positions = rng.integers(0, n_samples, size=n_samples)
        fitted = copy.deepcopy(model).fit(_take_rows(X, positions), _take_rows(y, positions))
        predictions[round_no] = fitted.predict(X)

    bias = np.mean((y_true - predictions.mean(axis=0)) ** 2)
    variance = np.mean(predictions.var(axis=0))
    return bias, variance
Классификация
In [3]:
# Classification task: two candidate pipelines sharing the same preprocessing.
class_pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])
class_pipeline_sgd = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log_loss', penalty='l2', max_iter=2000, random_state=42)),
])

# Hyperparameter grids (keys address the 'classifier' pipeline step).
param_grid_class_rf = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
)
param_grid_class_sgd = dict(
    classifier__alpha=[0.0001, 0.001, 0.01],
    classifier__learning_rate=['constant', 'adaptive'],
    classifier__eta0=[0.01, 0.1],
)

# 5-fold grid searches scored by accuracy; fit() returns the search object.
grid_search_class_rf = GridSearchCV(
    estimator=class_pipeline_rf,
    param_grid=param_grid_class_rf,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_class_train)

grid_search_class_sgd = GridSearchCV(
    estimator=class_pipeline_sgd,
    param_grid=param_grid_class_sgd,
    scoring='accuracy',
    cv=5,
)
# Left as the final expression so the fitted search is shown as the cell output.
grid_search_class_sgd.fit(X_train, y_class_train)
Out[3]:
In [4]:
# Model evaluation on the held-out test split.
y_class_pred_rf = grid_search_class_rf.predict(X_test)
y_class_pred_sgd = grid_search_class_sgd.predict(X_test)

# Same report layout for each model: classification report, then confusion matrix.
for report_header, matrix_header, predicted in (
    ("Classification Report for Random Forest:", "Confusion Matrix for Random Forest:", y_class_pred_rf),
    ("Classification Report for SGD:", "Confusion Matrix for SGD:", y_class_pred_sgd),
):
    print(report_header)
    print(classification_report(y_class_test, predicted))
    print(matrix_header)
    print(confusion_matrix(y_class_test, predicted))
In [5]:
# Bias/variance estimates for the tuned classifiers on the training split.
bias_class_rf, variance_class_rf = estimate_bias_variance(grid_search_class_rf.best_estimator_, X_train, y_class_train)
bias_class_sgd, variance_class_sgd = estimate_bias_variance(grid_search_class_sgd.best_estimator_, X_train, y_class_train)

# Report the estimates; previously they were computed but never displayed,
# unlike the regression estimates later in the notebook.
print("Classification Bias (Random Forest):", bias_class_rf)
print("Classification Variance (Random Forest):", variance_class_rf)
print("Classification Bias (SGD):", bias_class_sgd)
print("Classification Variance (SGD):", variance_class_sgd)
In [5]:
from sklearn.utils import resample
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import (
precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,
matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import featuretools as ft
from sklearn.metrics import accuracy_score, classification_report
# Load the raw stroke dataset.
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

# Missing values: fill absent BMI entries with the column median.
df["bmi"] = df["bmi"].fillna(df["bmi"].median())

# Unknown categories: relabel the 'Other' gender value as 'Unknown'.
df['gender'] = df['gender'].replace('Other', 'Unknown')

# Separate the two target classes.
stroke_0 = df[df['stroke'] == 0]
stroke_1 = df[df['stroke'] == 1]

# Oversample the stroke == 1 rows (with replacement) up to the size of the
# stroke == 0 group.
stroke_1 = resample(stroke_1, replace=True, n_samples=len(stroke_0), random_state=42)

# Recombine both classes.
df = pd.concat([stroke_0, stroke_1])

# Keep a random 40% of the rows (presumably to shorten the regression runs --
# TODO confirm). The seed makes the subset, and every metric computed from it,
# reproducible across kernel restarts.
df = df.sample(frac=0.4, random_state=42)
df.info()
In [6]:
# Targets and feature frame.
X = df.drop('stroke', axis=1)
y_class = df['stroke']  # classification target
y_reg = df['avg_glucose_level']  # regression target

# Columns handed to the models. 'avg_glucose_level' is deliberately NOT listed:
# it is the regression target (y_reg), and the regression pipelines below use
# this preprocessor, so keeping it as an input would leak the answer into the
# features. It is still a column of X, but ColumnTransformer drops unlisted
# columns (remainder='drop' is the default).
categorical_features = ['gender', 'ever_married', 'smoking_status']
numerical_features = ['age', 'bmi']

# Scale the numeric columns and one-hot encode the categorical ones;
# handle_unknown='ignore' lets the encoder accept categories unseen during fit.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# 80/20 train/test split applied consistently to the features and both targets.
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42)
def estimate_bias_variance(model, X, y, n_rounds=1000, random_state=42):
    """Estimate squared bias and variance of ``model`` via bootstrap refits.

    Each round draws ``len(y)`` rows from ``(X, y)`` with replacement, fits a
    fresh deep copy of ``model`` on that sample and predicts on the full ``X``.
    Refitting on the *same* data every round (the previous behaviour) only
    measures the estimator's internal randomness, which is zero for a model
    with a fixed seed.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` returning the fitted estimator and
        ``predict(X)``. It is deep-copied each round and never modified.
    X : DataFrame or array-like
        Feature rows.
    y : Series or array-like
        Numeric targets aligned with ``X`` by position.
    n_rounds : int, default 1000
        Number of bootstrap refits.
    random_state : int or None, default 42
        Seed for the bootstrap row draws.

    Returns
    -------
    (bias, variance) : tuple of float
        ``bias`` is the mean squared difference between ``y`` and the
        per-row average prediction; ``variance`` is the mean per-row
        variance of the predictions across rounds. Both are in-sample
        figures because predictions are made on ``X`` itself.

    Raises
    ------
    ValueError
        If ``n_rounds`` is smaller than 1 or ``y`` is empty.
    """
    import copy  # function-scope import keeps this cell self-contained

    if n_rounds < 1:
        raise ValueError("n_rounds must be a positive integer")

    y_true = np.asarray(y)
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("y must contain at least one sample")

    def _take_rows(data, positions):
        # Select by position: pandas inputs may carry non-default or
        # duplicated index labels, so label-based lookup is unsafe.
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    rng = np.random.default_rng(random_state)
    predictions = np.empty((n_rounds, n_samples), dtype=float)
    for round_no in range(n_rounds):
        positions = rng.integers(0, n_samples, size=n_samples)
        fitted = copy.deepcopy(model).fit(_take_rows(X, positions), _take_rows(y, positions))
        predictions[round_no] = fitted.predict(X)

    bias = np.mean((y_true - predictions.mean(axis=0)) ** 2)
    variance = np.mean(predictions.var(axis=0))
    return bias, variance
In [7]:
# Regression task: two candidate pipelines sharing the same preprocessing.
reg_pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])
reg_pipeline_sgd = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SGDRegressor(loss='squared_error', penalty='l2', max_iter=2000, random_state=42)),
])

# Hyperparameter grids (keys address the 'regressor' pipeline step).
param_grid_reg_rf = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[None, 10, 20],
)
param_grid_reg_sgd = dict(
    regressor__alpha=[0.0001, 0.001, 0.01],
    regressor__learning_rate=['constant', 'adaptive'],
    regressor__eta0=[0.01, 0.1],
)

# 5-fold grid searches scored by R^2; fit() returns the search object.
grid_search_reg_rf = GridSearchCV(
    estimator=reg_pipeline_rf,
    param_grid=param_grid_reg_rf,
    scoring='r2',
    cv=5,
).fit(X_train, y_reg_train)

grid_search_reg_sgd = GridSearchCV(
    estimator=reg_pipeline_sgd,
    param_grid=param_grid_reg_sgd,
    scoring='r2',
    cv=5,
)
# Left as the final expression so the fitted search is shown as the cell output.
grid_search_reg_sgd.fit(X_train, y_reg_train)
Out[7]:
In [8]:
# Model evaluation on the held-out test split.
y_reg_pred_rf = grid_search_reg_rf.predict(X_test)
y_reg_pred_sgd = grid_search_reg_sgd.predict(X_test)

# Same report layout for each model: header, MSE, then R^2.
for metrics_header, predicted in (
    ("Regression Metrics for Random Forest:", y_reg_pred_rf),
    ("Regression Metrics for SGD:", y_reg_pred_sgd),
):
    print(metrics_header)
    print("Mean Squared Error:", mean_squared_error(y_reg_test, predicted))
    print("R2 Score:", r2_score(y_reg_test, predicted))
In [9]:
# Bias/variance estimates for the tuned regressors on the training split.
bias_reg_rf, variance_reg_rf = estimate_bias_variance(
    grid_search_reg_rf.best_estimator_, X_train, y_reg_train
)
bias_reg_sgd, variance_reg_sgd = estimate_bias_variance(
    grid_search_reg_sgd.best_estimator_, X_train, y_reg_train
)

# Print each estimate next to its caption.
for caption, estimate in (
    ("Regression Bias (Random Forest):", bias_reg_rf),
    ("Regression Variance (Random Forest):", variance_reg_rf),
    ("Regression Bias (SGD):", bias_reg_sgd),
    ("Regression Variance (SGD):", variance_reg_sgd),
):
    print(caption, estimate)