PredictiveAnalytics/lab4.ipynb at 7330d543e59d971a2f2874e9e7411535376f8c64

Alfiya/PredictiveAnalytics

Fork 0

Тукаева Альфия 8c645b301b lab4

2025-01-08 18:22:03 +04:00

102 KiB

Raw Blame History

Регрессия.

Прогнозирование вероятности IT-направления: Цель: Используя такие параметры, как уровень образования, тип учебного заведения, финансовое положение, возраст и уровень гибкости, можно предсказать занятие IT-направления.

Классификация.

Распределение студентов по типам учебных заведений Цель: распределить студентов по различным типам учреждений (например, государственные/частные университеты), используя данные об их образовании, возрасте, месте проживания и финансовых возможностях.

In [38]:

import pandas as pd

from sklearn import set_config

set_config(transform_output="pandas")

random_state=9

df = pd.read_csv("data/students_education.csv")
def Institution_Type(value):
    if value == "Private":
        return 1
    elif value == "Public":
        return 0

df['Institution Type'] = df['Institution Type'].map(Institution_Type)

df

Out[38]:

	Education Level	Institution Type	Gender	Age	Device	IT Student	Location	Financial Condition	Internet Type	Network Type	Flexibility Level
0	University	1	Male	23	Tab	No	Town	Mid	Wifi	4G	Moderate
1	University	1	Female	23	Mobile	No	Town	Mid	Mobile Data	4G	Moderate
2	College	0	Female	18	Mobile	No	Town	Mid	Wifi	4G	Moderate
3	School	1	Female	11	Mobile	No	Town	Mid	Mobile Data	4G	Moderate
4	School	1	Female	18	Mobile	No	Town	Poor	Mobile Data	3G	Low
...	...	...	...	...	...	...	...	...	...	...	...
1200	College	1	Female	18	Mobile	No	Town	Mid	Wifi	4G	Low
1201	College	1	Female	18	Mobile	No	Rural	Mid	Wifi	4G	Moderate
1202	School	1	Male	11	Mobile	No	Town	Mid	Mobile Data	3G	Moderate
1203	College	1	Female	18	Mobile	No	Rural	Mid	Wifi	4G	Low
1204	School	1	Female	11	Mobile	No	Town	Poor	Mobile Data	3G	Moderate

1205 rows × 11 columns

In [90]:

from sklearn.utils import resample
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,
    matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import featuretools as ft
from sklearn.metrics import accuracy_score, classification_report

# Определение целевых переменных
X = df.drop('Institution Type', axis=1)
y_class = df['Institution Type']  # Задача классификации
y_reg = df['IT Student']  # Задача регрессии

# Преобразование категориальных переменных
categorical_features = ['Education Level', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition',
                        'Internet Type', 'Network Type', 'Flexibility Level']
numerical_features = ['Age']

# Создание ColumnTransformer с обработкой неизвестных категорий
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])  # Используем handle_unknown='ignore'

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42) 

def estimate_bias_variance(model, X, y):
    predictions = np.array([model.fit(X, y).predict(X) for _ in range(1000)])
    bias = np.mean((y - np.mean(predictions, axis=0)) ** 2)
    variance = np.mean(np.var(predictions, axis=0))
    return bias, variance

display("X_train", X_train)
display("y_class_train", y_class_train)

display("X_test", X_test)
display("y_class_test", y_class_test)

'X_train'

	Education Level	Gender	Age	Device	IT Student	Location	Financial Condition	Internet Type	Network Type	Flexibility Level
294	1	0	9	1	0	1	2	0	2	1
876	1	1	11	1	0	1	0	0	1	2
382	1	1	11	1	0	1	0	0	1	1
634	2	0	23	1	0	1	0	1	1	1
906	1	0	11	1	0	1	0	1	1	1
...	...	...	...	...	...	...	...	...	...	...
1044	0	0	18	1	0	1	0	1	2	2
1095	2	0	23	0	1	1	2	1	2	0
1130	1	1	11	1	0	1	1	1	2	1
860	2	1	23	1	0	1	0	0	2	1
1126	2	1	23	0	1	0	0	0	1	1

964 rows × 10 columns

'y_class_train'

294     0
876     1
382     1
634     0
906     0
       ..
1044    1
1095    1
1130    1
860     1
1126    1
Name: Institution Type, Length: 964, dtype: int64

'X_test'

	Education Level	Gender	Age	Device	IT Student	Location	Financial Condition	Internet Type	Network Type	Flexibility Level
101	1	0	11	0	0	1	0	1	2	2
946	0	1	18	1	0	1	0	1	2	2
306	0	1	18	2	1	1	0	1	2	2
109	2	0	23	1	0	1	0	1	1	0
1061	2	1	23	0	1	0	0	0	1	2
...	...	...	...	...	...	...	...	...	...	...
908	1	1	10	1	0	1	2	1	2	2
1135	2	0	18	0	1	1	0	1	2	2
894	1	0	10	1	0	1	1	0	1	1
866	1	1	11	1	0	1	0	0	1	1
1006	2	0	23	0	0	1	2	1	2	2

241 rows × 10 columns

'y_class_test'

101     1
946     1
306     0
109     1
1061    1
       ..
908     1
1135    1
894     1
866     1
1006    1
Name: Institution Type, Length: 241, dtype: int64

In [92]:

# Задача классификации
categorical_preprocessor = OneHotEncoder(sparse_output=False)

# Создаем общий preprocessor
preprocessor = ColumnTransformer([
    ("cat", categorical_preprocessor, categorical_features),
    ("num", "passthrough", numerical_features),  
], remainder="drop")  

# Построение пайплайнов
class_pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))])

class_pipeline_sgd = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log_loss', penalty='l2', random_state=42, max_iter=2000))])

# Настройки гиперпараметров
param_grid_class_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]}

param_grid_class_sgd = {
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive'],
    'classifier__eta0': [0.01, 0.1]}

# Поиск гиперпараметров
grid_search_class_rf = GridSearchCV(class_pipeline_rf, param_grid_class_rf, cv=5, scoring='accuracy')
grid_search_class_rf.fit(X_train, y_class_train)

grid_search_class_sgd = GridSearchCV(class_pipeline_sgd, param_grid_class_sgd, cv=5, scoring='accuracy')
grid_search_class_sgd.fit(X_train, y_class_train)

Out[92]:

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('classifier',
                                        SGDClassifier(loss='log_loss',
                                                      max_iter=2000,
                                                      random_state=42))]),
             param_grid={'classifier__alpha': [0.0001, 0.001, 0.01],
                         'classifier__eta0': [0.01, 0.1],
                         'classifier__learning_rate': ['constant', 'adaptive']},
             scoring='accuracy')

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

GridSearchCV?Documentation for GridSearchCViFitted

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('classifier',
                                        SGDClassifier(loss='log_loss',
                                                      max_iter=2000,
                                                      random_state=42))]),
             param_grid={'classifier__alpha': [0.0001, 0.001, 0.01],
                         'classifier__eta0': [0.01, 0.1],
                         'classifier__learning_rate': ['constant', 'adaptive']},
             scoring='accuracy')

best_estimator_: Pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['Education Level', 'Gender',
                                                   'Device', 'IT Student',
                                                   'Location',
                                                   'Financial Condition',
                                                   'Internet Type',
                                                   'Network Type',
                                                   'Flexibility Level']),
                                                 ('num', 'passthrough',
                                                  ['Age'])])),
                ('classifier',
                 SGDClassifier(eta0=0.1, learning_rate='adaptive',
                               loss='log_loss', max_iter=2000,
                               random_state=42))])

preprocessor: ColumnTransformer?Documentation for preprocessor: ColumnTransformer

ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse_output=False),
                                 ['Education Level', 'Gender', 'Device',
                                  'IT Student', 'Location',
                                  'Financial Condition', 'Internet Type',
                                  'Network Type', 'Flexibility Level']),
                                ('num', 'passthrough', ['Age'])])

cat

['Education Level', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition', 'Internet Type', 'Network Type', 'Flexibility Level']

OneHotEncoder?Documentation for OneHotEncoder

OneHotEncoder(sparse_output=False)

num

['Age']

passthrough

passthrough

SGDClassifier?Documentation for SGDClassifier

SGDClassifier(eta0=0.1, learning_rate='adaptive', loss='log_loss',
              max_iter=2000, random_state=42)

In [93]:

# Оценка моделей
y_class_pred_rf = grid_search_class_rf.predict(X_test)
y_class_pred_sgd = grid_search_class_sgd.predict(X_test)

print("Classification Report for Random Forest:")
print(classification_report(y_class_test, y_class_pred_rf))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_class_test, y_class_pred_rf))

print("Classification Report for SGD:")
print(classification_report(y_class_test, y_class_pred_sgd))
print("Confusion Matrix for SGD:")
print(confusion_matrix(y_class_test, y_class_pred_sgd))

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.87      0.86      0.86        76
           1       0.93      0.94      0.94       165

    accuracy                           0.91       241
   macro avg       0.90      0.90      0.90       241
weighted avg       0.91      0.91      0.91       241

Confusion Matrix for Random Forest:
[[ 65  11]
 [ 10 155]]
Classification Report for SGD:
              precision    recall  f1-score   support

           0       0.72      0.58      0.64        76
           1       0.82      0.90      0.86       165

    accuracy                           0.80       241
   macro avg       0.77      0.74      0.75       241
weighted avg       0.79      0.80      0.79       241

Confusion Matrix for SGD:
[[ 44  32]
 [ 17 148]]

In [99]:

# Оценка смещения и дисперсии
bias_class_rf, variance_class_rf = estimate_bias_variance(grid_search_class_rf.best_estimator_, X_train, y_class_train)
bias_class_sgd, variance_class_sgd = estimate_bias_variance(grid_search_class_sgd.best_estimator_, X_train, y_class_train)

print("Classification Bias (Random Forest):", bias_class_rf)
print("Classification Variance (Random Forest):", variance_class_rf)
print("Classification Bias (SGD):", bias_class_sgd)
print("Classification Variance (SGD):", variance_class_sgd)

Classification Bias (Random Forest): 0.05186721991701245
Classification Variance (Random Forest): 0.0
Classification Bias (SGD): 0.19398340248962656
Classification Variance (SGD): 0.0

In [96]:

# Задача регрессии
reg_pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))])

reg_pipeline_sgd = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SGDRegressor(loss='squared_error', penalty='l2', random_state=42, max_iter=2000))])

# Настройка гиперпараметров для регрессии
param_grid_reg_rf = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20]}

param_grid_reg_sgd = {
    'regressor__alpha': [0.0001, 0.001, 0.01],
    'regressor__learning_rate': ['constant', 'adaptive'],
    'regressor__eta0': [0.01, 0.1]}

# Поиск гиперпараметров
grid_search_reg_rf = GridSearchCV(reg_pipeline_rf, param_grid_reg_rf, cv=5, scoring='r2')
grid_search_reg_rf.fit(X_train, y_reg_train)

grid_search_reg_sgd = GridSearchCV(reg_pipeline_sgd, param_grid_reg_sgd, cv=5, scoring='r2')
grid_search_reg_sgd.fit(X_train, y_reg_train)

c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
warnings.warn(
c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
warnings.warn(
c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
warnings.warn(
c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
warnings.warn(
c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
warnings.warn(
c:\Users\Альфия\PycharmProjects\Curse\.venv\Lib\site-packages\sklearn\linear_model\_stochastic_gradient.py:1616: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
warnings.warn(

Out[96]:

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('regressor',
                                        SGDRegressor(max_iter=2000,
                                                     random_state=42))]),
             param_grid={'regressor__alpha': [0.0001, 0.001, 0.01],
                         'regressor__eta0': [0.01, 0.1],
                         'regressor__learning_rate': ['constant', 'adaptive']},
             scoring='r2')

GridSearchCV?Documentation for GridSearchCViFitted

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['Education '
                                                                          'Level',
                                                                          'Gender',
                                                                          'Device',
                                                                          'IT '
                                                                          'Student',
                                                                          'Location',
                                                                          'Financial '
                                                                          'Condition',
                                                                          'Internet '
                                                                          'Type',
                                                                          'Network '
                                                                          'Type',
                                                                          'Flexibility '
                                                                          'Level']),
                                                                        ('num',
                                                                         'passthrough',
                                                                         ['Age'])])),
                                       ('regressor',
                                        SGDRegressor(max_iter=2000,
                                                     random_state=42))]),
             param_grid={'regressor__alpha': [0.0001, 0.001, 0.01],
                         'regressor__eta0': [0.01, 0.1],
                         'regressor__learning_rate': ['constant', 'adaptive']},
             scoring='r2')

best_estimator_: Pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['Education Level', 'Gender',
                                                   'Device', 'IT Student',
                                                   'Location',
                                                   'Financial Condition',
                                                   'Internet Type',
                                                   'Network Type',
                                                   'Flexibility Level']),
                                                 ('num', 'passthrough',
                                                  ['Age'])])),
                ('regressor',
                 SGDRegressor(alpha=0.001, learning_rate='adaptive',
                              max_iter=2000, random_state=42))])

preprocessor: ColumnTransformer?Documentation for preprocessor: ColumnTransformer

ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse_output=False),
                                 ['Education Level', 'Gender', 'Device',
                                  'IT Student', 'Location',
                                  'Financial Condition', 'Internet Type',
                                  'Network Type', 'Flexibility Level']),
                                ('num', 'passthrough', ['Age'])])

cat

['Education Level', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition', 'Internet Type', 'Network Type', 'Flexibility Level']

OneHotEncoder?Documentation for OneHotEncoder

OneHotEncoder(sparse_output=False)

num

['Age']

passthrough

passthrough

SGDRegressor?Documentation for SGDRegressor

SGDRegressor(alpha=0.001, learning_rate='adaptive', max_iter=2000,
             random_state=42)

In [97]:

# Оценка моделей
y_reg_pred_rf = grid_search_reg_rf.predict(X_test)
y_reg_pred_sgd = grid_search_reg_sgd.predict(X_test)

print("Regression Metrics for Random Forest:")
print("Mean Squared Error:", mean_squared_error(y_reg_test, y_reg_pred_rf))
print("R2 Score:", r2_score(y_reg_test, y_reg_pred_rf))

print("Regression Metrics for SGD:")
print("Mean Squared Error:", mean_squared_error(y_reg_test, y_reg_pred_sgd))
print("R2 Score:", r2_score(y_reg_test, y_reg_pred_sgd))

Regression Metrics for Random Forest:
Mean Squared Error: 0.0
R2 Score: 1.0
Regression Metrics for SGD:
Mean Squared Error: 4011897878459.718
R2 Score: -20174462396433.535

In [98]:

# Оценка смещения и дисперсии
bias_reg_rf, variance_reg_rf = estimate_bias_variance(grid_search_reg_rf.best_estimator_, X_train, y_reg_train)
bias_reg_sgd, variance_reg_sgd = estimate_bias_variance(grid_search_reg_sgd.best_estimator_, X_train, y_reg_train)

print("Regression Bias (Random Forest):", bias_reg_rf)
print("Regression Variance (Random Forest):", variance_reg_rf)
print("Regression Bias (SGD):", bias_reg_sgd)
print("Regression Variance (SGD):", variance_reg_sgd)

Regression Bias (Random Forest): 0.0
Regression Variance (Random Forest): 0.0
Regression Bias (SGD): 4382665100501.0005
Regression Variance (SGD): 1.1089741676829076e-15

102 KiB Raw Blame History Unescape Escape

102 KiB

Raw Blame History