pred_analytics/Lab4.ipynb
2025-01-14 12:31:07 +04:00

111 KiB
Raw Permalink Blame History

Регрессия.

  • Прогнозирование вероятности IT-направления: Цель: Используя такие параметры, как уровень образования, тип учебного заведения, финансовое положение, возраст и уровень гибкости, можно предсказать занятие IT-направления.

Классификация.

  • Распределение студентов по типам учебных заведений Цель: распределить студентов по различным типам учреждений (например, государственные/частные университеты), используя данные об их образовании, возрасте, месте проживания и финансовых возможностях.
In [31]:
import pandas as pd

from sklearn import set_config

set_config(transform_output="pandas")

random_state=9

df = pd.read_csv("students_education.csv")
def Institution_Type(value):
    if value == "Private":
        return 1
    elif value == "Public":
        return 0

df['Institution Type'] = df['Institution Type'].map(Institution_Type)

df
Out[31]:
Education Level Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level
0 University 1 Male 23 Tab 0 Town Mid Wifi 4G Moderate
1 University 1 Female 23 Mobile 0 Town Mid Mobile Data 4G Moderate
2 College 0 Female 18 Mobile 0 Town Mid Wifi 4G Moderate
3 School 1 Female 11 Mobile 0 Town Mid Mobile Data 4G Moderate
4 School 1 Female 18 Mobile 0 Town Poor Mobile Data 3G Low
... ... ... ... ... ... ... ... ... ... ... ...
1200 College 1 Female 18 Mobile 0 Town Mid Wifi 4G Low
1201 College 1 Female 18 Mobile 0 Rural Mid Wifi 4G Moderate
1202 School 1 Male 11 Mobile 0 Town Mid Mobile Data 3G Moderate
1203 College 1 Female 18 Mobile 0 Rural Mid Wifi 4G Low
1204 School 1 Female 11 Mobile 0 Town Poor Mobile Data 3G Moderate

1205 rows × 11 columns

In [27]:
pd.read_csv("students_education.csv")
Out[27]:
Education Level Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level
0 University Private Male 23 Tab No Town Mid Wifi 4G Moderate
1 University Private Female 23 Mobile No Town Mid Mobile Data 4G Moderate
2 College Public Female 18 Mobile No Town Mid Wifi 4G Moderate
3 School Private Female 11 Mobile No Town Mid Mobile Data 4G Moderate
4 School Private Female 18 Mobile No Town Poor Mobile Data 3G Low
... ... ... ... ... ... ... ... ... ... ... ...
1200 College Private Female 18 Mobile No Town Mid Wifi 4G Low
1201 College Private Female 18 Mobile No Rural Mid Wifi 4G Moderate
1202 School Private Male 11 Mobile No Town Mid Mobile Data 3G Moderate
1203 College Private Female 18 Mobile No Rural Mid Wifi 4G Low
1204 School Private Female 11 Mobile No Town Poor Mobile Data 3G Moderate

1205 rows × 11 columns

In [32]:
from sklearn.utils import resample
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,
    matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import featuretools as ft
from sklearn.metrics import accuracy_score, classification_report

# Определение целевых переменных
X = df.drop('Institution Type', axis=1)
y_class = df['Institution Type']  # Задача классификации
y_reg = df['IT Student']  # Задача регрессии

# Преобразование категориальных переменных
categorical_features = ['Education Level', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition',
                        'Internet Type', 'Network Type', 'Flexibility Level']
numerical_features = ['Age']

# Создание ColumnTransformer с обработкой неизвестных категорий
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])  # Используем handle_unknown='ignore'

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42) 

def estimate_bias_variance(model, X, y):
    predictions = np.array([model.fit(X, y).predict(X) for _ in range(1000)])
    bias = np.mean((y - np.mean(predictions, axis=0)) ** 2)
    variance = np.mean(np.var(predictions, axis=0))
    return bias, variance

display("X_train", X_train)
display("y_class_train", y_class_train)

display("X_test", X_test)
display("y_class_test", y_class_test)
'X_train'
Education Level Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level
294 School Female 9 Mobile 0 Town Rich Mobile Data 4G Low
876 School Male 11 Mobile 0 Town Mid Mobile Data 3G Moderate
382 School Male 11 Mobile 0 Town Mid Mobile Data 3G Low
634 University Female 23 Mobile 0 Town Mid Wifi 3G Low
906 School Female 11 Mobile 0 Town Mid Wifi 3G Low
... ... ... ... ... ... ... ... ... ... ...
1044 College Female 18 Mobile 0 Town Mid Wifi 4G Moderate
1095 University Female 23 Computer 1 Town Rich Wifi 4G High
1130 School Male 11 Mobile 0 Town Poor Wifi 4G Low
860 University Male 23 Mobile 0 Town Mid Mobile Data 4G Low
1126 University Male 23 Computer 1 Rural Mid Mobile Data 3G Low

964 rows × 10 columns

'y_class_train'
294     0
876     1
382     1
634     0
906     0
       ..
1044    1
1095    1
1130    1
860     1
1126    1
Name: Institution Type, Length: 964, dtype: int64
'X_test'
Education Level Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level
101 School Female 11 Computer 0 Town Mid Wifi 4G Moderate
946 College Male 18 Mobile 0 Town Mid Wifi 4G Moderate
306 College Male 18 Tab 1 Town Mid Wifi 4G Moderate
109 University Female 23 Mobile 0 Town Mid Wifi 3G High
1061 University Male 23 Computer 1 Rural Mid Mobile Data 3G Moderate
... ... ... ... ... ... ... ... ... ... ...
908 School Male 10 Mobile 0 Town Rich Wifi 4G Moderate
1135 University Female 18 Computer 1 Town Mid Wifi 4G Moderate
894 School Female 10 Mobile 0 Town Poor Mobile Data 3G Low
866 School Male 11 Mobile 0 Town Mid Mobile Data 3G Low
1006 University Female 23 Computer 0 Town Rich Wifi 4G Moderate

241 rows × 10 columns

'y_class_test'
101     1
946     1
306     0
109     1
1061    1
       ..
908     1
1135    1
894     1
866     1
1006    1
Name: Institution Type, Length: 241, dtype: int64
In [33]:
# Задача классификации
categorical_preprocessor = OneHotEncoder(sparse_output=False)

# Создаем общий preprocessor
preprocessor = ColumnTransformer([
    ("cat", categorical_preprocessor, categorical_features),
    ("num", "passthrough", numerical_features),  
], remainder="drop")  

# Построение пайплайнов
class_pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))])

class_pipeline_sgd = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log_loss', penalty='l2', random_state=42, max_iter=2000))])

# Настройки гиперпараметров
param_grid_class_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]}

param_grid_class_sgd = {
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive'],
    'classifier__eta0': [0.01, 0.1]}

# Поиск гиперпараметров
grid_search_class_rf = GridSearchCV(class_pipeline_rf, param_grid_class_rf, cv=5, scoring='accuracy')
grid_search_class_rf.fit(X_train, y_class_train)

grid_search_class_sgd = GridSearchCV(class_pipeline_sgd, param_grid_class_sgd, cv=5, scoring='accuracy')
grid_search_class_sgd.fit(X_train, y_class_train)
Out[33]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('classifier',
                                        SGDClassifier(loss='log_loss',
                                                      max_iter=2000,
                                                      random_state=42))]),
             param_grid={'classifier__alpha': [0.0001, 0.001, 0.01],
                         'classifier__eta0': [0.01, 0.1],
                         'classifier__learning_rate': ['constant', 'adaptive']},
             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('classifier',
                                        SGDClassifier(loss='log_loss',
                                                      max_iter=2000,
                                                      random_state=42))]),
             param_grid={'classifier__alpha': [0.0001, 0.001, 0.01],
                         'classifier__eta0': [0.01, 0.1],
                         'classifier__learning_rate': ['constant', 'adaptive']},
             scoring='accuracy')
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['Education Level', 'Gender',
                                                   'Device', 'IT Student',
                                                   'Location',
                                                   'Financial Condition',
                                                   'Internet Type',
                                                   'Network Type',
                                                   'Flexibility Level']),
                                                 ('num', 'passthrough',
                                                  ['Age'])])),
                ('classifier',
                 SGDClassifier(eta0=0.1, learning_rate='adaptive',
                               loss='log_loss', max_iter=2000,
                               random_state=42))])
ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse_output=False),
                                 ['Education Level', 'Gender', 'Device',
                                  'IT Student', 'Location',
                                  'Financial Condition', 'Internet Type',
                                  'Network Type', 'Flexibility Level']),
                                ('num', 'passthrough', ['Age'])])
['Education Level', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition', 'Internet Type', 'Network Type', 'Flexibility Level']
OneHotEncoder(sparse_output=False)
['Age']
passthrough
SGDClassifier(eta0=0.1, learning_rate='adaptive', loss='log_loss',
              max_iter=2000, random_state=42)
In [34]:
# Оценка моделей
y_class_pred_rf = grid_search_class_rf.predict(X_test)
y_class_pred_sgd = grid_search_class_sgd.predict(X_test)

print("Classification Report for Random Forest:")
print(classification_report(y_class_test, y_class_pred_rf))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_class_test, y_class_pred_rf))

print("Classification Report for SGD:")
print(classification_report(y_class_test, y_class_pred_sgd))
print("Confusion Matrix for SGD:")
print(confusion_matrix(y_class_test, y_class_pred_sgd))
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.87      0.86      0.86        76
           1       0.93      0.94      0.94       165

    accuracy                           0.91       241
   macro avg       0.90      0.90      0.90       241
weighted avg       0.91      0.91      0.91       241

Confusion Matrix for Random Forest:
[[ 65  11]
 [ 10 155]]
Classification Report for SGD:
              precision    recall  f1-score   support

           0       0.72      0.58      0.64        76
           1       0.82      0.90      0.86       165

    accuracy                           0.80       241
   macro avg       0.77      0.74      0.75       241
weighted avg       0.79      0.80      0.79       241

Confusion Matrix for SGD:
[[ 44  32]
 [ 17 148]]
In [35]:
# Оценка смещения и дисперсии
bias_class_rf, variance_class_rf = estimate_bias_variance(grid_search_class_rf.best_estimator_, X_train, y_class_train)
bias_class_sgd, variance_class_sgd = estimate_bias_variance(grid_search_class_sgd.best_estimator_, X_train, y_class_train)

print("Classification Bias (Random Forest):", bias_class_rf)
print("Classification Variance (Random Forest):", variance_class_rf)
print("Classification Bias (SGD):", bias_class_sgd)
print("Classification Variance (SGD):", variance_class_sgd)
Classification Bias (Random Forest): 0.05186721991701245
Classification Variance (Random Forest): 0.0
Classification Bias (SGD): 0.19398340248962656
Classification Variance (SGD): 0.0
In [36]:
# Задача регрессии
reg_pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))])

reg_pipeline_sgd = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SGDRegressor(loss='squared_error', penalty='l2', random_state=42, max_iter=2000))])

# Настройка гиперпараметров для регрессии
param_grid_reg_rf = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20]}

param_grid_reg_sgd = {
    'regressor__alpha': [0.0001, 0.001, 0.01],
    'regressor__learning_rate': ['constant', 'adaptive'],
    'regressor__eta0': [0.01, 0.1]}

# Поиск гиперпараметров
grid_search_reg_rf = GridSearchCV(reg_pipeline_rf, param_grid_reg_rf, cv=5, scoring='r2')
grid_search_reg_rf.fit(X_train, y_reg_train)

grid_search_reg_sgd = GridSearchCV(reg_pipeline_sgd, param_grid_reg_sgd, cv=5, scoring='r2')
grid_search_reg_sgd.fit(X_train, y_reg_train)
c:\Users\novop\Downloads\ckmai-main\ckmai\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
c:\Users\novop\Downloads\ckmai-main\ckmai\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
c:\Users\novop\Downloads\ckmai-main\ckmai\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
c:\Users\novop\Downloads\ckmai-main\ckmai\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
c:\Users\novop\Downloads\ckmai-main\ckmai\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
c:\Users\novop\Downloads\ckmai-main\ckmai\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
Out[36]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('regressor',
                                        SGDRegressor(max_iter=2000,
                                                     random_state=42))]),
             param_grid={'regressor__alpha': [0.0001, 0.001, 0.01],
                         'regressor__eta0': [0.01, 0.1],
                         'regressor__learning_rate': ['constant', 'adaptive']},
             scoring='r2')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('regressor',
                                        SGDRegressor(max_iter=2000,
                                                     random_state=42))]),
             param_grid={'regressor__alpha': [0.0001, 0.001, 0.01],
                         'regressor__eta0': [0.01, 0.1],
                         'regressor__learning_rate': ['constant', 'adaptive']},
             scoring='r2')
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['Education Level', 'Gender',
                                                   'Device', 'IT Student',
                                                   'Location',
                                                   'Financial Condition',
                                                   'Internet Type',
                                                   'Network Type',
                                                   'Flexibility Level']),
                                                 ('num', 'passthrough',
                                                  ['Age'])])),
                ('regressor',
                 SGDRegressor(alpha=0.001, learning_rate='adaptive',
                              max_iter=2000, random_state=42))])
ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse_output=False),
                                 ['Education Level', 'Gender', 'Device',
                                  'IT Student', 'Location',
                                  'Financial Condition', 'Internet Type',
                                  'Network Type', 'Flexibility Level']),
                                ('num', 'passthrough', ['Age'])])
['Education Level', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition', 'Internet Type', 'Network Type', 'Flexibility Level']
OneHotEncoder(sparse_output=False)
['Age']
passthrough
SGDRegressor(alpha=0.001, learning_rate='adaptive', max_iter=2000,
             random_state=42)
In [37]:
# Оценка моделей
y_reg_pred_rf = grid_search_reg_rf.predict(X_test)
y_reg_pred_sgd = grid_search_reg_sgd.predict(X_test)

print("Regression Metrics for Random Forest:")
print("Mean Squared Error:", mean_squared_error(y_reg_test, y_reg_pred_rf))
print("R2 Score:", r2_score(y_reg_test, y_reg_pred_rf))

print("Regression Metrics for SGD:")
print("Mean Squared Error:", mean_squared_error(y_reg_test, y_reg_pred_sgd))
print("R2 Score:", r2_score(y_reg_test, y_reg_pred_sgd))
Regression Metrics for Random Forest:
Mean Squared Error: 0.0
R2 Score: 1.0
Regression Metrics for SGD:
Mean Squared Error: 4011897878459.718
R2 Score: -20174462396433.535
In [38]:
# Оценка смещения и дисперсии
bias_reg_rf, variance_reg_rf = estimate_bias_variance(grid_search_reg_rf.best_estimator_, X_train, y_reg_train)
bias_reg_sgd, variance_reg_sgd = estimate_bias_variance(grid_search_reg_sgd.best_estimator_, X_train, y_reg_train)

print("Regression Bias (Random Forest):", bias_reg_rf)
print("Regression Variance (Random Forest):", variance_reg_rf)
print("Regression Bias (SGD):", bias_reg_sgd)
print("Regression Variance (SGD):", variance_reg_sgd)
Regression Bias (Random Forest): 0.0
Regression Variance (Random Forest): 0.0
Regression Bias (SGD): 4382665100501.0005
Regression Variance (SGD): 1.1089741676829076e-15