PredictiveAnalytics/lab4.ipynb
2025-01-08 18:22:03 +04:00

102 KiB
Raw Blame History

Регрессия.

  • Прогнозирование вероятности IT-направления: Цель: Используя такие параметры, как уровень образования, тип учебного заведения, финансовое положение, возраст и уровень гибкости, можно предсказать занятие IT-направления.

Классификация.

  • Распределение студентов по типам учебных заведений Цель: распределить студентов по различным типам учреждений (например, государственные/частные университеты), используя данные об их образовании, возрасте, месте проживания и финансовых возможностях.
In [38]:
import pandas as pd

from sklearn import set_config

set_config(transform_output="pandas")

random_state=9

df = pd.read_csv("data/students_education.csv")
def Institution_Type(value):
    if value == "Private":
        return 1
    elif value == "Public":
        return 0

df['Institution Type'] = df['Institution Type'].map(Institution_Type)

df
Out[38]:
Education Level Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level
0 University 1 Male 23 Tab No Town Mid Wifi 4G Moderate
1 University 1 Female 23 Mobile No Town Mid Mobile Data 4G Moderate
2 College 0 Female 18 Mobile No Town Mid Wifi 4G Moderate
3 School 1 Female 11 Mobile No Town Mid Mobile Data 4G Moderate
4 School 1 Female 18 Mobile No Town Poor Mobile Data 3G Low
... ... ... ... ... ... ... ... ... ... ... ...
1200 College 1 Female 18 Mobile No Town Mid Wifi 4G Low
1201 College 1 Female 18 Mobile No Rural Mid Wifi 4G Moderate
1202 School 1 Male 11 Mobile No Town Mid Mobile Data 3G Moderate
1203 College 1 Female 18 Mobile No Rural Mid Wifi 4G Low
1204 School 1 Female 11 Mobile No Town Poor Mobile Data 3G Moderate

1205 rows × 11 columns

In [90]:
from sklearn.utils import resample
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,
    matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import featuretools as ft
from sklearn.metrics import accuracy_score, classification_report

# Определение целевых переменных
X = df.drop('Institution Type', axis=1)
y_class = df['Institution Type']  # Задача классификации
y_reg = df['IT Student']  # Задача регрессии

# Преобразование категориальных переменных
categorical_features = ['Education Level', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition',
                        'Internet Type', 'Network Type', 'Flexibility Level']
numerical_features = ['Age']

# Создание ColumnTransformer с обработкой неизвестных категорий
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])  # Используем handle_unknown='ignore'

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42) 

def estimate_bias_variance(model, X, y):
    predictions = np.array([model.fit(X, y).predict(X) for _ in range(1000)])
    bias = np.mean((y - np.mean(predictions, axis=0)) ** 2)
    variance = np.mean(np.var(predictions, axis=0))
    return bias, variance

display("X_train", X_train)
display("y_class_train", y_class_train)

display("X_test", X_test)
display("y_class_test", y_class_test)
'X_train'
Education Level Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level
294 1 0 9 1 0 1 2 0 2 1
876 1 1 11 1 0 1 0 0 1 2
382 1 1 11 1 0 1 0 0 1 1
634 2 0 23 1 0 1 0 1 1 1
906 1 0 11 1 0 1 0 1 1 1
... ... ... ... ... ... ... ... ... ... ...
1044 0 0 18 1 0 1 0 1 2 2
1095 2 0 23 0 1 1 2 1 2 0
1130 1 1 11 1 0 1 1 1 2 1
860 2 1 23 1 0 1 0 0 2 1
1126 2 1 23 0 1 0 0 0 1 1

964 rows × 10 columns

'y_class_train'
294     0
876     1
382     1
634     0
906     0
       ..
1044    1
1095    1
1130    1
860     1
1126    1
Name: Institution Type, Length: 964, dtype: int64
'X_test'
Education Level Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level
101 1 0 11 0 0 1 0 1 2 2
946 0 1 18 1 0 1 0 1 2 2
306 0 1 18 2 1 1 0 1 2 2
109 2 0 23 1 0 1 0 1 1 0
1061 2 1 23 0 1 0 0 0 1 2
... ... ... ... ... ... ... ... ... ... ...
908 1 1 10 1 0 1 2 1 2 2
1135 2 0 18 0 1 1 0 1 2 2
894 1 0 10 1 0 1 1 0 1 1
866 1 1 11 1 0 1 0 0 1 1
1006 2 0 23 0 0 1 2 1 2 2

241 rows × 10 columns

'y_class_test'
101     1
946     1
306     0
109     1
1061    1
       ..
908     1
1135    1
894     1
866     1
1006    1
Name: Institution Type, Length: 241, dtype: int64
In [92]:
# Задача классификации
categorical_preprocessor = OneHotEncoder(sparse_output=False)

# Создаем общий preprocessor
preprocessor = ColumnTransformer([
    ("cat", categorical_preprocessor, categorical_features),
    ("num", "passthrough", numerical_features),  
], remainder="drop")  

# Построение пайплайнов
class_pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))])

class_pipeline_sgd = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log_loss', penalty='l2', random_state=42, max_iter=2000))])

# Настройки гиперпараметров
param_grid_class_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]}

param_grid_class_sgd = {
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive'],
    'classifier__eta0': [0.01, 0.1]}

# Поиск гиперпараметров
grid_search_class_rf = GridSearchCV(class_pipeline_rf, param_grid_class_rf, cv=5, scoring='accuracy')
grid_search_class_rf.fit(X_train, y_class_train)

grid_search_class_sgd = GridSearchCV(class_pipeline_sgd, param_grid_class_sgd, cv=5, scoring='accuracy')
grid_search_class_sgd.fit(X_train, y_class_train)
Out[92]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('classifier',
                                        SGDClassifier(loss='log_loss',
                                                      max_iter=2000,
                                                      random_state=42))]),
             param_grid={'classifier__alpha': [0.0001, 0.001, 0.01],
                         'classifier__eta0': [0.01, 0.1],
                         'classifier__learning_rate': ['constant', 'adaptive']},
             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('classifier',
                                        SGDClassifier(loss='log_loss',
                                                      max_iter=2000,
                                                      random_state=42))]),
             param_grid={'classifier__alpha': [0.0001, 0.001, 0.01],
                         'classifier__eta0': [0.01, 0.1],
                         'classifier__learning_rate': ['constant', 'adaptive']},
             scoring='accuracy')
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['Education Level', 'Gender',
                                                   'Device', 'IT Student',
                                                   'Location',
                                                   'Financial Condition',
                                                   'Internet Type',
                                                   'Network Type',
                                                   'Flexibility Level']),
                                                 ('num', 'passthrough',
                                                  ['Age'])])),
                ('classifier',
                 SGDClassifier(eta0=0.1, learning_rate='adaptive',
                               loss='log_loss', max_iter=2000,
                               random_state=42))])
ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse_output=False),
                                 ['Education Level', 'Gender', 'Device',
                                  'IT Student', 'Location',
                                  'Financial Condition', 'Internet Type',
                                  'Network Type', 'Flexibility Level']),
                                ('num', 'passthrough', ['Age'])])
['Education Level', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition', 'Internet Type', 'Network Type', 'Flexibility Level']
OneHotEncoder(sparse_output=False)
['Age']
passthrough
SGDClassifier(eta0=0.1, learning_rate='adaptive', loss='log_loss',
              max_iter=2000, random_state=42)
In [93]:
# Оценка моделей
y_class_pred_rf = grid_search_class_rf.predict(X_test)
y_class_pred_sgd = grid_search_class_sgd.predict(X_test)

print("Classification Report for Random Forest:")
print(classification_report(y_class_test, y_class_pred_rf))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_class_test, y_class_pred_rf))

print("Classification Report for SGD:")
print(classification_report(y_class_test, y_class_pred_sgd))
print("Confusion Matrix for SGD:")
print(confusion_matrix(y_class_test, y_class_pred_sgd))
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.87      0.86      0.86        76
           1       0.93      0.94      0.94       165

    accuracy                           0.91       241
   macro avg       0.90      0.90      0.90       241
weighted avg       0.91      0.91      0.91       241

Confusion Matrix for Random Forest:
[[ 65  11]
 [ 10 155]]
Classification Report for SGD:
              precision    recall  f1-score   support

           0       0.72      0.58      0.64        76
           1       0.82      0.90      0.86       165

    accuracy                           0.80       241
   macro avg       0.77      0.74      0.75       241
weighted avg       0.79      0.80      0.79       241

Confusion Matrix for SGD:
[[ 44  32]
 [ 17 148]]
In [99]:
# Оценка смещения и дисперсии
bias_class_rf, variance_class_rf = estimate_bias_variance(grid_search_class_rf.best_estimator_, X_train, y_class_train)
bias_class_sgd, variance_class_sgd = estimate_bias_variance(grid_search_class_sgd.best_estimator_, X_train, y_class_train)

print("Classification Bias (Random Forest):", bias_class_rf)
print("Classification Variance (Random Forest):", variance_class_rf)
print("Classification Bias (SGD):", bias_class_sgd)
print("Classification Variance (SGD):", variance_class_sgd)
Classification Bias (Random Forest): 0.05186721991701245
Classification Variance (Random Forest): 0.0
Classification Bias (SGD): 0.19398340248962656
Classification Variance (SGD): 0.0
In [96]:
# Задача регрессии
reg_pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))])

reg_pipeline_sgd = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SGDRegressor(loss='squared_error', penalty='l2', random_state=42, max_iter=2000))])

# Настройка гиперпараметров для регрессии
param_grid_reg_rf = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20]}

param_grid_reg_sgd = {
    'regressor__alpha': [0.0001, 0.001, 0.01],
    'regressor__learning_rate': ['constant', 'adaptive'],
    'regressor__eta0': [0.01, 0.1]}

# Поиск гиперпараметров
grid_search_reg_rf = GridSearchCV(reg_pipeline_rf, param_grid_reg_rf, cv=5, scoring='r2')
grid_search_reg_rf.fit(X_train, y_reg_train)

grid_search_reg_sgd = GridSearchCV(reg_pipeline_sgd, param_grid_reg_sgd, cv=5, scoring='r2')
grid_search_reg_sgd.fit(X_train, y_reg_train)
c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
Out[96]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('regressor',
                                        SGDRegressor(max_iter=2000,
                                                     random_state=42))]),
             param_grid={'regressor__alpha': [0.0001, 0.001, 0.01],
                         'regressor__eta0': [0.01, 0.1],
                         'regressor__learning_rate': ['constant', 'adaptive']},
             scoring='r2')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('regressor',
                                        SGDRegressor(max_iter=2000,
                                                     random_state=42))]),
             param_grid={'regressor__alpha': [0.0001, 0.001, 0.01],
                         'regressor__eta0': [0.01, 0.1],
                         'regressor__learning_rate': ['constant', 'adaptive']},
             scoring='r2')
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['Education Level', 'Gender',
                                                   'Device', 'IT Student',
                                                   'Location',
                                                   'Financial Condition',
                                                   'Internet Type',
                                                   'Network Type',
                                                   'Flexibility Level']),
                                                 ('num', 'passthrough',
                                                  ['Age'])])),
                ('regressor',
                 SGDRegressor(alpha=0.001, learning_rate='adaptive',
                              max_iter=2000, random_state=42))])
ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse_output=False),
                                 ['Education Level', 'Gender', 'Device',
                                  'IT Student', 'Location',
                                  'Financial Condition', 'Internet Type',
                                  'Network Type', 'Flexibility Level']),
                                ('num', 'passthrough', ['Age'])])
['Education Level', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition', 'Internet Type', 'Network Type', 'Flexibility Level']
OneHotEncoder(sparse_output=False)
['Age']
passthrough
SGDRegressor(alpha=0.001, learning_rate='adaptive', max_iter=2000,
             random_state=42)
In [97]:
# Оценка моделей
y_reg_pred_rf = grid_search_reg_rf.predict(X_test)
y_reg_pred_sgd = grid_search_reg_sgd.predict(X_test)

print("Regression Metrics for Random Forest:")
print("Mean Squared Error:", mean_squared_error(y_reg_test, y_reg_pred_rf))
print("R2 Score:", r2_score(y_reg_test, y_reg_pred_rf))

print("Regression Metrics for SGD:")
print("Mean Squared Error:", mean_squared_error(y_reg_test, y_reg_pred_sgd))
print("R2 Score:", r2_score(y_reg_test, y_reg_pred_sgd))
Regression Metrics for Random Forest:
Mean Squared Error: 0.0
R2 Score: 1.0
Regression Metrics for SGD:
Mean Squared Error: 4011897878459.718
R2 Score: -20174462396433.535
In [98]:
# Оценка смещения и дисперсии
bias_reg_rf, variance_reg_rf = estimate_bias_variance(grid_search_reg_rf.best_estimator_, X_train, y_reg_train)
bias_reg_sgd, variance_reg_sgd = estimate_bias_variance(grid_search_reg_sgd.best_estimator_, X_train, y_reg_train)

print("Regression Bias (Random Forest):", bias_reg_rf)
print("Regression Variance (Random Forest):", variance_reg_rf)
print("Regression Bias (SGD):", bias_reg_sgd)
print("Regression Variance (SGD):", variance_reg_sgd)
Regression Bias (Random Forest): 0.0
Regression Variance (Random Forest): 0.0
Regression Bias (SGD): 4382665100501.0005
Regression Variance (SGD): 1.1089741676829076e-15