AIM-PIbd-31-Makarov-DV/lab_4/lab4.ipynb
2024-11-15 00:44:23 +04:00

207 KiB
Raw Blame History

Лабораторная 4

Датасет: Информация об онлайн обучении учеников

Бизнес-цель 1: Улучшение доступа к онлайн-образованию для учеников с низким уровнем финансового обеспечения.

In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple
from pandas import DataFrame
from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree, metrics, set_config
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

set_config(transform_output="pandas")
df = pd.read_csv("..\\static\\csv\\students_adaptability_level_online_education.csv")
print(df.columns)

map_flexibility_to_int = {'Low': 0, 'Moderate': 1, 'High': 2}

df['Flexibility Level'] = df['Flexibility Level'].map(map_flexibility_to_int).astype('int32')
Index(['Education Level', 'Institution Type', 'Gender', 'Age', 'Device',
       'IT Student', 'Location', 'Financial Condition', 'Internet Type',
       'Network Type', 'Flexibility Level'],
      dtype='object')

Предварительно создадим колонку для работы с ней (ключевой фактор)

In [105]:
fincond_mapping = {'Poor': 2, 'Mid': 1, 'Rich': 0}
internet_type_mapping = {'Mobile Data': 1, 'Wifi': 0}
device_mapping = {'Mobile': 1, 'Computer': 0}
network_type = {'2G': 2, '3G': 1, '4G': 0}

df['Financial Score'] = df['Financial Condition'].map(fincond_mapping)
df['Internet Score'] = df['Internet Type'].map(internet_type_mapping)
df['Device Score'] = df['Device'].map(device_mapping)
df['Network Score'] = df['Network Type'].map(network_type)

df['Access Difficulty Score'] = df['Financial Score'] + df['Internet Score'] + df['Device Score'] + df['Network Score']

df['Access Difficulty'] = (df['Access Difficulty Score'] >= 3).astype(int)
df.drop(columns=['Financial Score', 'Device Score', 'Internet Score', 'Network Score', 'Access Difficulty Score'], inplace=True)

Формируем выборки

In [106]:
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
   
    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )
    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
    X = df_input
    y = df_input[
        [stratify_colname]
    ]
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )
    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp
    
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="Access Difficulty", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=9
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
'X_train'
Education Level Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level Access Difficulty
649 School Public Male 18 Mobile No Town Mid Wifi 4G 1 0
637 School Private Female 9 Mobile No Town Mid Mobile Data 4G 1 1
68 School Public Female 11 Mobile No Town Mid Wifi 4G 0 0
276 University Private Female 18 Mobile Yes Town Mid Mobile Data 3G 0 1
547 School Public Male 11 Mobile No Town Mid Wifi 4G 1 0
... ... ... ... ... ... ... ... ... ... ... ... ...
1097 University Private Male 23 Mobile Yes Town Rich Wifi 4G 0 0
854 School Public Female 18 Mobile No Town Mid Mobile Data 4G 0 1
756 University Public Male 18 Computer No Town Mid Wifi 3G 1 0
133 College Public Male 18 Mobile No Town Poor Mobile Data 4G 0 1
53 University Public Male 27 Mobile Yes Rural Poor Mobile Data 4G 1 1

964 rows × 12 columns

'y_train'
Access Difficulty
649 0
637 1
68 0
276 1
547 0
... ...
1097 0
854 1
756 0
133 1
53 1

964 rows × 1 columns

'X_test'
Education Level Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level Access Difficulty
265 School Private Female 9 Mobile No Town Poor Wifi 4G 1 1
358 School Private Female 10 Mobile No Town Mid Mobile Data 3G 1 1
316 University Private Male 23 Tab No Town Mid Wifi 4G 1 0
907 School Private Female 9 Mobile No Town Poor Mobile Data 4G 1 1
1042 University Private Male 23 Mobile No Town Mid Mobile Data 3G 1 1
... ... ... ... ... ... ... ... ... ... ... ... ...
421 School Private Female 10 Mobile No Town Mid Mobile Data 3G 1 1
936 University Private Male 23 Tab No Town Rich Wifi 4G 2 0
722 University Private Male 23 Mobile Yes Rural Poor Mobile Data 3G 1 1
1075 University Private Male 23 Computer Yes Town Mid Wifi 4G 0 0
577 University Private Male 23 Mobile Yes Town Mid Wifi 4G 0 0

241 rows × 12 columns

'y_test'
Access Difficulty
265 1
358 1
316 0
907 1
1042 1
... ...
421 1
936 0
722 1
1075 0
577 0

241 rows × 1 columns

In [107]:
null_values = df.isnull().sum()
print("Пропущенные значения по столбцам:")
print(null_values)

stat_summary = df.describe()
print("\nСтатистический обзор данных:")
print(stat_summary)
Пропущенные значения по столбцам:
Education Level        0
Institution Type       0
Gender                 0
Age                    0
Device                 0
IT Student             0
Location               0
Financial Condition    0
Internet Type          0
Network Type           0
Flexibility Level      0
Access Difficulty      0
dtype: int64

Статистический обзор данных:
               Age  Flexibility Level  Access Difficulty
count  1205.000000        1205.000000        1205.000000
mean     17.065560           0.684647           0.624896
std       5.830369           0.618221           0.484351
min       9.000000           0.000000           0.000000
25%      11.000000           0.000000           0.000000
50%      18.000000           1.000000           1.000000
75%      23.000000           1.000000           1.000000
max      27.000000           2.000000           1.000000

Формируем конвеер для классификации данных и проверка конвеера

In [108]:
columns_to_drop = ['Age', 'Education Level', 'Gender', 'IT Student', 'Flexibility Level']
num_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and df[column].dtype != "object"
]
cat_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and df[column].dtype == "object"
]

num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num, num_columns),
        ("prepocessing_cat", preprocessing_cat, cat_columns),
    ],
    remainder="passthrough"
)

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)


pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
    ]
)

preprocessing_result = pipeline_end.fit_transform(X_train)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

preprocessed_df
Out[108]:
Access Difficulty Institution Type_Public Device_Mobile Device_Tab Location_Town Financial Condition_Poor Financial Condition_Rich Internet Type_Wifi Network Type_3G Network Type_4G
649 -1.289567 1.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0
637 0.775454 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
68 -1.289567 1.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0
276 0.775454 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
547 -1.289567 1.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ...
1097 -1.289567 0.0 1.0 0.0 1.0 0.0 1.0 1.0 0.0 1.0
854 0.775454 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
756 -1.289567 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0
133 0.775454 1.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0
53 0.775454 1.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0

964 rows × 10 columns

Формируем набор моделей

In [109]:
class_models = {
    "logistic": {"model": linear_model.LogisticRegression()},
    "ridge": {"model": linear_model.LogisticRegression(penalty="l2", class_weight="balanced")},
    "decision_tree": {
        "model": tree.DecisionTreeClassifier(max_depth=7, random_state=9)
    },
    "knn": {"model": neighbors.KNeighborsClassifier(n_neighbors=7)},
    "naive_bayes": {"model": naive_bayes.GaussianNB()},
    "gradient_boosting": {
        "model": ensemble.GradientBoostingClassifier(n_estimators=210)
    },
    "random_forest": {
        "model": ensemble.RandomForestClassifier(
            max_depth=11, class_weight="balanced", random_state=9
        )
    },
    "mlp": {
        "model": neural_network.MLPClassifier(
            hidden_layer_sizes=(7,),
            max_iter=500,
            early_stopping=True,
            random_state=9,
        )
    },
}

Обучаем модели и тестируем их

In [110]:
for model_name in class_models.keys():
    print(f"Model: {model_name}")
    model = class_models[model_name]["model"]

    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
    model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())

    y_train_predict = model_pipeline.predict(X_train)
    y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]
    y_test_predict = np.where(y_test_probs > 0.5, 1, 0)

    class_models[model_name]["pipeline"] = model_pipeline
    class_models[model_name]["probs"] = y_test_probs
    class_models[model_name]["preds"] = y_test_predict

    class_models[model_name]["Precision_train"] = metrics.precision_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Precision_test"] = metrics.precision_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Recall_train"] = metrics.recall_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Recall_test"] = metrics.recall_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Accuracy_train"] = metrics.accuracy_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Accuracy_test"] = metrics.accuracy_score(
        y_test, y_test_predict
    )
    class_models[model_name]["ROC_AUC_test"] = metrics.roc_auc_score(
        y_test, y_test_probs
    )
    class_models[model_name]["F1_train"] = metrics.f1_score(y_train, y_train_predict, average=None)
    class_models[model_name]["F1_test"] = metrics.f1_score(y_test, y_test_predict, average=None)
    class_models[model_name]["MCC_test"] = metrics.matthews_corrcoef(
        y_test, y_test_predict
    )
    class_models[model_name]["Cohen_kappa_test"] = metrics.cohen_kappa_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Confusion_matrix"] = metrics.confusion_matrix(
        y_test, y_test_predict
    )
Model: logistic
Model: ridge
Model: decision_tree
Model: knn
Model: naive_bayes
Model: gradient_boosting
Model: random_forest
Model: mlp
d:\ulstu\cr3\sem1\MAI\AIM-PIbd-31-Makarov-DV\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
d:\ulstu\cr3\sem1\MAI\AIM-PIbd-31-Makarov-DV\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
In [111]:
_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)
for index, key in enumerate(class_models.keys()):
    c_matrix = class_models[key]["Confusion_matrix"]
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=["Low dif-ty", "High dif-ty"]
    ).plot(ax=ax.flat[index])
    disp.ax_.set_title(key)

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)
plt.show()
No description has been provided for this image

Точность, полнота, верность (аккуратность), F-мера

In [112]:
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
]
class_metrics.sort_values(
    by="Accuracy_test", ascending=False
)
Out[112]:
Precision_train Precision_test Recall_train Recall_test Accuracy_train Accuracy_test F1_train F1_test
logistic 1.0 1.0 1.0 1.0 1.000000 1.000000 [1.0, 1.0] [1.0, 1.0]
ridge 1.0 1.0 1.0 1.0 1.000000 1.000000 [1.0, 1.0] [1.0, 1.0]
decision_tree 1.0 1.0 1.0 1.0 1.000000 1.000000 [1.0, 1.0] [1.0, 1.0]
knn 1.0 1.0 1.0 1.0 1.000000 1.000000 [1.0, 1.0] [1.0, 1.0]
naive_bayes 1.0 1.0 1.0 1.0 1.000000 1.000000 [1.0, 1.0] [1.0, 1.0]
gradient_boosting 1.0 1.0 1.0 1.0 1.000000 1.000000 [1.0, 1.0] [1.0, 1.0]
random_forest 1.0 1.0 1.0 1.0 1.000000 1.000000 [1.0, 1.0] [1.0, 1.0]
mlp 0.0 0.0 0.0 0.0 0.375519 0.373444 [0.5460030165912518, 0.0] [0.5438066465256798, 0.0]

ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса

In [113]:
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
]
class_metrics.sort_values(by="ROC_AUC_test", ascending=False)
Out[113]:
Accuracy_test F1_test ROC_AUC_test Cohen_kappa_test MCC_test
logistic 1.000000 [1.0, 1.0] 1.000000 1.0 1.0
ridge 1.000000 [1.0, 1.0] 1.000000 1.0 1.0
decision_tree 1.000000 [1.0, 1.0] 1.000000 1.0 1.0
knn 1.000000 [1.0, 1.0] 1.000000 1.0 1.0
naive_bayes 1.000000 [1.0, 1.0] 1.000000 1.0 1.0
gradient_boosting 1.000000 [1.0, 1.0] 1.000000 1.0 1.0
random_forest 1.000000 [1.0, 1.0] 1.000000 1.0 1.0
mlp 0.373444 [0.5438066465256798, 0.0] 0.068065 0.0 0.0

Лучшая модель

In [114]:
best_model = str(class_metrics.sort_values(by="MCC_test", ascending=False).iloc[0].name)

display(best_model)
'logistic'

Находим ошибки

In [115]:
preprocessing_result = pipeline_end.transform(X_test)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

y_pred = class_models[best_model]["preds"]

error_index = y_test[y_test["Access Difficulty"] != y_pred].index.tolist()
display(f"Error items count: {len(error_index)}")

error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]
error_df = X_test.loc[error_index].copy()
error_df.insert(loc=1, column="Predicted", value=error_predicted)
error_df.sort_index()
'Error items count: 0'
Out[115]:
Education Level Predicted Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level Access Difficulty

Пример использования модели (конвейера) для предсказания

In [116]:
model = class_models[best_model]["pipeline"]

example_id = 450
test = pd.DataFrame(X_test.loc[example_id, :]).T
test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T
display(test)
display(test_preprocessed)
result_proba = model.predict_proba(test)[0]
result = model.predict(test)[0]
real = int(y_test.loc[example_id].values[0])
display(f"predicted: {result} (proba: {result_proba})")
display(f"real: {real}")
Education Level Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level Access Difficulty
450 School Private Female 11 Mobile No Town Poor Mobile Data 4G 1 1
Access Difficulty Institution Type_Public Device_Mobile Device_Tab Location_Town Financial Condition_Poor Financial Condition_Rich Internet Type_Wifi Network Type_3G Network Type_4G
450 0.775454 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0
'predicted: 1 (proba: [0.00310819 0.99689181])'
'real: 1'

Создаем гиперпараметры методом поиска по сетке.

In [121]:
optimized_model_type = 'random_forest'
random_state = 9

random_forest_model = class_models[optimized_model_type]["pipeline"]

param_grid = {
    "model__n_estimators": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],
    "model__max_features": ["sqrt", "log2", 2],
    "model__max_depth": [2, 3, 4, 5, 6, 7, 8, 9 ,10],
    "model__criterion": ["gini", "entropy", "log_loss"],
}

gs_optomizer = GridSearchCV(
    estimator=random_forest_model, param_grid=param_grid, n_jobs=-1
)
gs_optomizer.fit(X_train, y_train.values.ravel())
gs_optomizer.best_params_
Out[121]:
{'model__criterion': 'gini',
 'model__max_depth': 2,
 'model__max_features': 'sqrt',
 'model__n_estimators': 10}

Обучение модели с новыми гиперпараметрами

In [122]:
optimized_model = ensemble.RandomForestClassifier(
    random_state=random_state,
    criterion="gini",
    max_depth=2,
    max_features="sqrt",
    n_estimators=10,
)

result = {}

result["pipeline"] = Pipeline([("pipeline", pipeline_end), ("model", optimized_model)]).fit(X_train, y_train.values.ravel())
result["train_preds"] = result["pipeline"].predict(X_train)
result["probs"] = result["pipeline"].predict_proba(X_test)[:, 1]
result["preds"] = np.where(result["probs"] > 0.5, 1, 0)

result["Precision_train"] = metrics.precision_score(y_train, result["train_preds"])
result["Precision_test"] = metrics.precision_score(y_test, result["preds"])
result["Recall_train"] = metrics.recall_score(y_train, result["train_preds"])
result["Recall_test"] = metrics.recall_score(y_test, result["preds"])
result["Accuracy_train"] = metrics.accuracy_score(y_train, result["train_preds"])
result["Accuracy_test"] = metrics.accuracy_score(y_test, result["preds"])
result["ROC_AUC_test"] = metrics.roc_auc_score(y_test, result["probs"])
result["F1_train"] = metrics.f1_score(y_train, result["train_preds"])
result["F1_test"] = metrics.f1_score(y_test, result["preds"])
result["MCC_test"] = metrics.matthews_corrcoef(y_test, result["preds"])
result["Cohen_kappa_test"] = metrics.cohen_kappa_score(y_test, result["preds"])
result["Confusion_matrix"] = metrics.confusion_matrix(y_test, result["preds"])

Формирование данных для оценки старой и новой версии модели и сама оценка данных

In [124]:
optimized_metrics = pd.DataFrame(columns=list(result.keys()))
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=class_models[optimized_model_type]
)
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=result
)
optimized_metrics.insert(loc=0, column="Name", value=["Old", "New"])
optimized_metrics = optimized_metrics.set_index("Name")

optimized_metrics[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
]
Out[124]:
Precision_train Precision_test Recall_train Recall_test Accuracy_train Accuracy_test F1_train F1_test
Name
Old 1.0 1.0 1.0 1.0 1.0 1.0 [1.0, 1.0] [1.0, 1.0]
New 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
In [125]:
optimized_metrics[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
]
Out[125]:
Accuracy_test F1_test ROC_AUC_test Cohen_kappa_test MCC_test
Name
Old 1.0 [1.0, 1.0] 1.0 1.0 1.0
New 1.0 1.0 1.0 1.0 1.0
In [127]:
_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False
)

for index in range(0, len(optimized_metrics)):
    c_matrix = optimized_metrics.iloc[index]["Confusion_matrix"]
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=["Low dif-ty", "High dif-ty"]
    ).plot(ax=ax.flat[index])

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)
plt.show()
No description has been provided for this image

Модель идеально классифицировала объекты, которые относятся к "High difficulty" и "Low difficulty".