102 KiB
102 KiB
Регрессия.
- Прогнозирование вероятности обучения на IT-направлении. Цель: используя такие параметры, как уровень образования, тип учебного заведения, финансовое положение, возраст и уровень гибкости, предсказать, обучается ли студент на IT-направлении.
Классификация.
- Распределение студентов по типам учебных заведений. Цель: распределить студентов по различным типам учреждений (например, государственные/частные университеты), используя данные об их образовании, возрасте, месте проживания и финансовых возможностях.
In [38]:
import pandas as pd
from sklearn import set_config
set_config(transform_output="pandas")
# Seed value kept at module level (the cells visible here pass literal seeds instead).
random_state = 9

# Load the students dataset from the CSV file.
df = pd.read_csv("data/students_education.csv")
def Institution_Type(value):
    """Encode an institution-type label as an integer code.

    Returns 1 for "Private", 0 for "Public", and None for any other value.
    """
    if value == "Private":
        return 1
    return 0 if value == "Public" else None
# Replace the textual institution type with its numeric code
# (labels other than "Private"/"Public" get no code from Institution_Type).
institution_codes = df['Institution Type'].map(Institution_Type)
df['Institution Type'] = institution_codes
df
Out[38]:
In [90]:
from sklearn.utils import resample
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import (
precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,
matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import featuretools as ft
from sklearn.metrics import accuracy_score, classification_report
# Targets: the classification task predicts the institution type,
# the regression task predicts the IT-student column.
y_class = df['Institution Type']
y_reg = df['IT Student']

# Feature matrix: every column except the classification target.
# NOTE(review): 'IT Student' stays in X although it is also the regression target.
X = df.drop('Institution Type', axis=1)

# Column groups consumed by the preprocessing step.
numerical_features = ['Age']
categorical_features = [
    'Education Level', 'Gender', 'Device', 'IT Student', 'Location',
    'Financial Condition', 'Internet Type', 'Network Type', 'Flexibility Level',
]

# Scale the numeric column and one-hot encode the categorical ones;
# handle_unknown='ignore' tolerates categories unseen during fitting.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Hold out 20% of the rows; features and both targets share the same row split.
(X_train, X_test,
 y_class_train, y_class_test,
 y_reg_train, y_reg_test) = train_test_split(
    X, y_class, y_reg, test_size=0.2, random_state=42
)
def estimate_bias_variance(model, X, y, n_iterations=1000, random_state=42):
    """Estimate the squared bias and the variance of ``model`` with bootstrap refits.

    Refitting on identical data gives identical predictions for any seeded
    estimator, i.e. a variance of exactly zero. Each iteration therefore fits
    on a bootstrap resample (rows drawn with replacement) of ``(X, y)`` and
    predicts on the full ``X``; the spread of those predictions measures how
    sensitive the model is to the training sample.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is deep-copied,
        so the estimator passed in is left untouched.
    X : pandas.DataFrame or array-like
        Feature rows; row ``i`` corresponds to ``y[i]``.
    y : pandas.Series or array-like
        Numeric targets, one per row of ``X``.
    n_iterations : int, default 1000
        Number of bootstrap refits.
    random_state : int, numpy.random.Generator or None, default 42
        Seed for the resampling, so repeated calls give the same estimate.

    Returns
    -------
    tuple of float
        ``(bias, variance)``: mean squared difference between ``y`` and the
        average prediction, and mean per-row variance of the predictions.

    Raises
    ------
    ValueError
        If ``n_iterations`` is smaller than 1 or ``y`` is empty.

    Notes
    -----
    A resample may miss rare categories, so the model must be able to predict
    rows containing categories it did not see during fitting.
    """
    import copy

    if n_iterations < 1:
        raise ValueError("n_iterations must be a positive integer")

    y_true = np.asarray(y)
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("y must contain at least one sample")

    def _take(data, rows):
        # Positional row selection that works for pandas objects and arrays alike.
        return data.iloc[rows] if hasattr(data, "iloc") else np.asarray(data)[rows]

    rng = np.random.default_rng(random_state)
    worker = copy.deepcopy(model)  # keep the caller's fitted estimator intact
    predictions = np.empty((n_iterations, n_samples), dtype=float)

    for iteration in range(n_iterations):
        rows = rng.integers(0, n_samples, size=n_samples)
        worker.fit(_take(X, rows), _take(y, rows))
        predictions[iteration] = worker.predict(X)

    mean_prediction = predictions.mean(axis=0)
    bias = float(np.mean((y_true - mean_prediction) ** 2))
    variance = float(np.mean(predictions.var(axis=0)))
    return bias, variance
# Show the train/test partitions used for the classification task.
for caption, part in (
    ("X_train", X_train),
    ("y_class_train", y_class_train),
    ("X_test", X_test),
    ("y_class_test", y_class_test),
):
    display(caption, part)
In [92]:
# Classification task: predict 'Institution Type'.

# One-hot encode the categorical columns with dense output.
# handle_unknown='ignore' keeps a category that is missing from a CV training
# fold (or from the training split) from raising at transform/predict time.
categorical_preprocessor = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Standardize the numeric column: SGD-based models are sensitive to feature
# scale, while the tree ensemble is unaffected by it.
numerical_preprocessor = StandardScaler()

# Shared preprocessor; columns outside the two feature lists are dropped.
preprocessor = ColumnTransformer([
    ("cat", categorical_preprocessor, categorical_features),
    ("num", numerical_preprocessor, numerical_features),
], remainder="drop")

# Pipelines: preprocessing followed by the estimator.
class_pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

class_pipeline_sgd = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log_loss', penalty='l2', random_state=42, max_iter=2000)),
])

# Hyperparameter grids.
param_grid_class_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
}

param_grid_class_sgd = {
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive'],
    'classifier__eta0': [0.01, 0.1],
}

# 5-fold grid search scored by accuracy.
grid_search_class_rf = GridSearchCV(class_pipeline_rf, param_grid_class_rf, cv=5, scoring='accuracy')
grid_search_class_rf.fit(X_train, y_class_train)

grid_search_class_sgd = GridSearchCV(class_pipeline_sgd, param_grid_class_sgd, cv=5, scoring='accuracy')
grid_search_class_sgd.fit(X_train, y_class_train)
Out[92]:
In [93]:
# Evaluate the tuned classifiers on the held-out test split.
y_class_pred_rf = grid_search_class_rf.predict(X_test)
y_class_pred_sgd = grid_search_class_sgd.predict(X_test)

for report_header, matrix_header, y_pred in (
    ("Classification Report for Random Forest:", "Confusion Matrix for Random Forest:", y_class_pred_rf),
    ("Classification Report for SGD:", "Confusion Matrix for SGD:", y_class_pred_sgd),
):
    print(report_header)
    print(classification_report(y_class_test, y_pred))
    print(matrix_header)
    print(confusion_matrix(y_class_test, y_pred))
In [99]:
# Bias/variance estimates for the tuned classifiers, computed on the training split.
best_class_rf = grid_search_class_rf.best_estimator_
best_class_sgd = grid_search_class_sgd.best_estimator_

bias_class_rf, variance_class_rf = estimate_bias_variance(best_class_rf, X_train, y_class_train)
bias_class_sgd, variance_class_sgd = estimate_bias_variance(best_class_sgd, X_train, y_class_train)

print("Classification Bias (Random Forest):", bias_class_rf)
print("Classification Variance (Random Forest):", variance_class_rf)
print("Classification Bias (SGD):", bias_class_sgd)
print("Classification Variance (SGD):", variance_class_sgd)
In [96]:
# Regression task: predict 'IT Student'.
# assumes y_reg_train holds numeric values — TODO confirm against the dataset.

# 'IT Student' is this task's target but is also listed in categorical_features;
# encoding it as an input would leak the answer into the model. Build a
# task-specific preprocessor that leaves the target out of the inputs.
reg_categorical_features = [col for col in categorical_features if col != 'IT Student']

reg_preprocessor = ColumnTransformer([
    # Dense one-hot encoding; unseen categories are ignored instead of raising.
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), reg_categorical_features),
    # SGD-based models are sensitive to feature scale, so standardize the numeric column.
    ("num", StandardScaler(), numerical_features),
], remainder="drop")

# Pipelines: preprocessing followed by the estimator.
reg_pipeline_rf = Pipeline(steps=[
    ('preprocessor', reg_preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

reg_pipeline_sgd = Pipeline(steps=[
    ('preprocessor', reg_preprocessor),
    ('regressor', SGDRegressor(loss='squared_error', penalty='l2', random_state=42, max_iter=2000)),
])

# Hyperparameter grids for the regressors.
param_grid_reg_rf = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20],
}

param_grid_reg_sgd = {
    'regressor__alpha': [0.0001, 0.001, 0.01],
    'regressor__learning_rate': ['constant', 'adaptive'],
    'regressor__eta0': [0.01, 0.1],
}

# 5-fold grid search scored by R^2.
grid_search_reg_rf = GridSearchCV(reg_pipeline_rf, param_grid_reg_rf, cv=5, scoring='r2')
grid_search_reg_rf.fit(X_train, y_reg_train)

grid_search_reg_sgd = GridSearchCV(reg_pipeline_sgd, param_grid_reg_sgd, cv=5, scoring='r2')
grid_search_reg_sgd.fit(X_train, y_reg_train)
Out[96]:
In [97]:
# Evaluate the tuned regressors on the held-out test split.
y_reg_pred_rf = grid_search_reg_rf.predict(X_test)
y_reg_pred_sgd = grid_search_reg_sgd.predict(X_test)

for metrics_header, y_pred in (
    ("Regression Metrics for Random Forest:", y_reg_pred_rf),
    ("Regression Metrics for SGD:", y_reg_pred_sgd),
):
    print(metrics_header)
    print("Mean Squared Error:", mean_squared_error(y_reg_test, y_pred))
    print("R2 Score:", r2_score(y_reg_test, y_pred))
In [98]:
# Bias/variance estimates for the tuned regressors, computed on the training split.
best_reg_rf = grid_search_reg_rf.best_estimator_
best_reg_sgd = grid_search_reg_sgd.best_estimator_

bias_reg_rf, variance_reg_rf = estimate_bias_variance(best_reg_rf, X_train, y_reg_train)
bias_reg_sgd, variance_reg_sgd = estimate_bias_variance(best_reg_sgd, X_train, y_reg_train)

print("Regression Bias (Random Forest):", bias_reg_rf)
print("Regression Variance (Random Forest):", variance_reg_rf)
print("Regression Bias (SGD):", bias_reg_sgd)
print("Regression Variance (SGD):", variance_reg_sgd)