AIM-PIbd-31-Makarov-DV/lab_3/lab3.ipynb
2024-11-08 21:42:42 +04:00

1547 lines
129 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Лабораторная 3\n",
"\n",
"Датасет: Информация об онлайн обучении учеников"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Education Level', 'Institution Type', 'Gender', 'Age', 'Device',\n",
" 'IT Student', 'Location', 'Financial Condition', 'Internet Type',\n",
" 'Network Type', 'Flexibility Level'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import featuretools as ft\n",
"import time\n",
"import math\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
"\n",
"df = pd.read_csv(\"..\\\\static\\\\csv\\\\students_adaptability_level_online_education.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Столбцы:\n",
"\n",
"Education Level - уровень образования\\\n",
"Institution Type - тип учреждения\\\n",
"Gender - пол\\\n",
"Age - возраст\\\n",
"Device - устройство\\\n",
"IT Student - ученик IT направления или нет\\\n",
"Location - локация\\\n",
"Financial Condition - финансовое состояние\\\n",
"Internet Type - тип доступа к сети\\\n",
"Network Type - уровень сети\\\n",
"Flexibility Level - уровень приспособления"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1205 entries, 0 to 1204\n",
"Data columns (total 11 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Education Level 1205 non-null object\n",
" 1 Institution Type 1205 non-null object\n",
" 2 Gender 1205 non-null object\n",
" 3 Age 1205 non-null int64 \n",
" 4 Device 1205 non-null object\n",
" 5 IT Student 1205 non-null object\n",
" 6 Location 1205 non-null object\n",
" 7 Financial Condition 1205 non-null object\n",
" 8 Internet Type 1205 non-null object\n",
" 9 Network Type 1205 non-null object\n",
" 10 Flexibility Level 1205 non-null object\n",
"dtypes: int64(1), object(10)\n",
"memory usage: 103.7+ KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Education Level</th>\n",
" <th>Institution Type</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Device</th>\n",
" <th>IT Student</th>\n",
" <th>Location</th>\n",
" <th>Financial Condition</th>\n",
" <th>Internet Type</th>\n",
" <th>Network Type</th>\n",
" <th>Flexibility Level</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>University</td>\n",
" <td>Private</td>\n",
" <td>Male</td>\n",
" <td>23</td>\n",
" <td>Tab</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>University</td>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>23</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>College</td>\n",
" <td>Public</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>School</td>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>School</td>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Poor</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Education Level Institution Type Gender Age Device IT Student Location \\\n",
"0 University Private Male 23 Tab No Town \n",
"1 University Private Female 23 Mobile No Town \n",
"2 College Public Female 18 Mobile No Town \n",
"3 School Private Female 11 Mobile No Town \n",
"4 School Private Female 18 Mobile No Town \n",
"\n",
" Financial Condition Internet Type Network Type Flexibility Level \n",
"0 Mid Wifi 4G Moderate \n",
"1 Mid Mobile Data 4G Moderate \n",
"2 Mid Wifi 4G Moderate \n",
"3 Mid Mobile Data 4G Moderate \n",
"4 Poor Mobile Data 3G Low "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.info()\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Примеры бизнес-целей для датасета:\n",
"1. Улучшение доступа к онлайн-образованию для учеников с низким уровнем финансового обеспечения.\n",
"2. Повышение удовлетворенности учеников онлайн-обучением на основе их устройств, типу соединения, местоположения."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Цели технического проекта:\n",
"\n",
"1. Провести анализ зависимости учеников от уровня интернет-соединения и устройств\n",
"2. Провести анализ влияния различных факторов (тип устройства, интернет-соединение, финансовое положение) на уровень приспособленности."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверяем на выбросы."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пустые значения по столбцам:\n",
"Education Level 0\n",
"Institution Type 0\n",
"Gender 0\n",
"Age 0\n",
"Device 0\n",
"IT Student 0\n",
"Location 0\n",
"Financial Condition 0\n",
"Internet Type 0\n",
"Network Type 0\n",
"Flexibility Level 0\n",
"dtype: int64\n",
"\n",
"Количество дубликатов: 980\n",
"\n",
"Статистический обзор данных:\n",
"\n",
"Коэффициент асимметрии для столбца 'Age': 0.024342017300169792\n"
]
}
],
"source": [
"null_values = df.isnull().sum()\n",
"print(\"Пустые значения по столбцам:\")\n",
"print(null_values)\n",
"\n",
"duplicates = df.duplicated().sum()\n",
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
"\n",
"print(\"\\nСтатистический обзор данных:\")\n",
"df.describe()\n",
"\n",
"for column in df.select_dtypes(include=[np.number]).columns:\n",
" skewness = df[column].skew()\n",
" print(f\"\\nКоэффициент асимметрии для столбца '{column}': {skewness}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выбросы незначительны, дубликаты есть. Удаляем дубликаты и очищаем от шумов."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Шумы в датасете:\n",
"Empty DataFrame\n",
"Columns: [Education Level, Institution Type, Gender, Age, Device, IT Student, Location, Financial Condition, Internet Type, Network Type, Flexibility Level]\n",
"Index: []\n"
]
}
],
"source": [
"cleaned_df = df.drop_duplicates()\n",
"\n",
"Q1 = df[\"Age\"].quantile(0.25)\n",
"Q3 = df[\"Age\"].quantile(0.75)\n",
"\n",
"IQR = Q3 - Q1\n",
"\n",
"threshold = 1.5 * IQR\n",
"lower_bound = Q1 - threshold\n",
"upper_bound = Q3 + threshold\n",
"\n",
"outliers = (df[\"Age\"] < lower_bound) | (df[\"Age\"] > upper_bound)\n",
"\n",
"print(\"Шумы в датасете:\")\n",
"print(df[outliers])\n",
"\n",
"median_score = df[\"Age\"].median()\n",
"df.loc[outliers, \"Age\"] = median_score"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Преобразуем строковые значение в столбце \"Уровень приспособления\" в числовые значения. Это понадобится для расчёта качества набора признаков."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"map_flexibility_to_int = {'Low': 0, 'Moderate': 1, 'High': 2}\n",
"\n",
"df['Flexibility Level'] = df['Flexibility Level'].map(map_flexibility_to_int).astype('int32')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Шумов в датасете нет. Разбиваем датасет на три выборки: обучающую, контрольную и тестовую."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: (723, 10)\n",
"Размер контрольной выборки: (241, 10)\n",
"Размер тестовой выборки: (241, 10)\n"
]
}
],
"source": [
"X = df.drop(columns=['Flexibility Level'])\n",
"Y = df['Flexibility Level']\n",
"\n",
"X_train_df, X_test_df, Y_train_df, Y_test_df = train_test_split(X, Y, test_size=0.2, random_state=42)\n",
"\n",
"X_train_df, X_val_df, Y_train_df, Y_val_df = train_test_split(X_train_df, Y_train_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", X_train_df.shape)\n",
"print(\"Размер контрольной выборки:\",X_val_df.shape)\n",
"print(\"Размер тестовой выборки:\", X_test_df.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверка сбалансированности данных."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в обучающей выборке:\n",
"Flexibility Level\n",
"1 0.531120\n",
"0 0.385892\n",
"2 0.082988\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в контрольной выборке:\n",
"Flexibility Level\n",
"1 0.522822\n",
"0 0.406639\n",
"2 0.070539\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в тестовой выборке:\n",
"Flexibility Level\n",
"1 0.477178\n",
"0 0.427386\n",
"2 0.095436\n",
"Name: proportion, dtype: float64\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1800x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"def analyze_balance(y_train, y_val, y_test, y_name):\n",
" print(\"Распределение классов в обучающей выборке:\")\n",
" print(y_train.value_counts(normalize=True))\n",
" \n",
" print(\"\\nРаспределение классов в контрольной выборке:\")\n",
" print(y_val.value_counts(normalize=True))\n",
" \n",
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
" print(y_test.value_counts(normalize=True))\n",
"\n",
" fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
" fig.suptitle('Распределение в различных выборках')\n",
"\n",
" sns.barplot(x=y_train.value_counts().index, y=y_train.value_counts(normalize=True), ax=axes[0])\n",
" axes[0].set_title('Обучающая выборка')\n",
" axes[0].set_xlabel(y_name)\n",
" axes[0].set_ylabel('Доля')\n",
"\n",
" sns.barplot(x=y_val.value_counts().index, y=y_val.value_counts(normalize=True), ax=axes[1])\n",
" axes[1].set_title('Контрольная выборка')\n",
" axes[1].set_xlabel(y_name)\n",
"\n",
" sns.barplot(x=y_test.value_counts().index, y=y_test.value_counts(normalize=True), ax=axes[2])\n",
" axes[2].set_title('Тестовая выборка')\n",
" axes[2].set_xlabel(y_name)\n",
"\n",
" plt.show()\n",
"\n",
"analyze_balance(Y_train_df, Y_val_df, Y_test_df, 'Flexibility Level')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполним оверсемплинг для балансировки."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в обучающей выборке:\n",
"Flexibility Level\n",
"2 0.333333\n",
"0 0.333333\n",
"1 0.333333\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в контрольной выборке:\n",
"Flexibility Level\n",
"1 0.333333\n",
"0 0.333333\n",
"2 0.333333\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в тестовой выборке:\n",
"Flexibility Level\n",
"1 0.477178\n",
"0 0.427386\n",
"2 0.095436\n",
"Name: proportion, dtype: float64\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1800x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"X_train_resampled, Y_train_resampled = ros.fit_resample(X_train_df, Y_train_df)\n",
"X_val_resampled, Y_val_resampled = ros.fit_resample(X_val_df, Y_val_df)\n",
"\n",
"analyze_balance(Y_train_resampled, Y_val_resampled, Y_test_df, 'Flexibility Level')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Конструирование признаков. Для начала применим унитарное кодирование категориальных признаков (one-hot encoding), переведя их в бинарные вектора."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Education Level_School</th>\n",
" <th>Education Level_University</th>\n",
" <th>Institution Type_Public</th>\n",
" <th>Gender_Male</th>\n",
" <th>Device_Mobile</th>\n",
" <th>Device_Tab</th>\n",
" <th>IT Student_Yes</th>\n",
" <th>Location_Town</th>\n",
" <th>Financial Condition_Poor</th>\n",
" <th>Financial Condition_Rich</th>\n",
" <th>Internet Type_Wifi</th>\n",
" <th>Network Type_3G</th>\n",
" <th>Network Type_4G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>18</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>23</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>18</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>23</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age Education Level_School Education Level_University \\\n",
"0 10 True False \n",
"1 18 False False \n",
"2 23 False True \n",
"3 18 True False \n",
"4 23 False True \n",
"\n",
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
"0 False True True False \n",
"1 False False True False \n",
"2 False True True False \n",
"3 True True True False \n",
"4 False False True False \n",
"\n",
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
"0 False True False \n",
"1 False True False \n",
"2 False True False \n",
"3 False True False \n",
"4 False False False \n",
"\n",
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
"0 True True False \n",
"1 False True False \n",
"2 False True False \n",
"3 True False False \n",
"4 False True False \n",
"\n",
" Network Type_4G \n",
"0 True \n",
"1 True \n",
"2 True \n",
"3 True \n",
"4 True "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_features = ['Education Level', 'Institution Type', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition', 'Internet Type', 'Network Type']\n",
"\n",
"train_encoded = pd.get_dummies(X_train_resampled, columns=cat_features, drop_first=True)\n",
"val_encoded = pd.get_dummies(X_val_resampled, columns=cat_features, drop_first=True)\n",
"test_encoded = pd.get_dummies(X_test_df, columns=cat_features, drop_first=True)\n",
"\n",
"train_encoded.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Применим дискретизацию к числовым признакам."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Education Level_School</th>\n",
" <th>Education Level_University</th>\n",
" <th>Institution Type_Public</th>\n",
" <th>Gender_Male</th>\n",
" <th>Device_Mobile</th>\n",
" <th>Device_Tab</th>\n",
" <th>IT Student_Yes</th>\n",
" <th>Location_Town</th>\n",
" <th>Financial Condition_Poor</th>\n",
" <th>Financial Condition_Rich</th>\n",
" <th>Internet Type_Wifi</th>\n",
" <th>Network Type_3G</th>\n",
" <th>Network Type_4G</th>\n",
" <th>Age_Bin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Education Level_School Education Level_University \\\n",
"0 True False \n",
"1 False False \n",
"2 False True \n",
"3 True False \n",
"4 False True \n",
"\n",
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
"0 False True True False \n",
"1 False False True False \n",
"2 False True True False \n",
"3 True True True False \n",
"4 False False True False \n",
"\n",
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
"0 False True False \n",
"1 False True False \n",
"2 False True False \n",
"3 False True False \n",
"4 False False False \n",
"\n",
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
"0 True True False \n",
"1 False True False \n",
"2 False True False \n",
"3 True False False \n",
"4 False True False \n",
"\n",
" Network Type_4G Age_Bin \n",
"0 True young \n",
"1 True young \n",
"2 True young \n",
"3 True young \n",
"4 True young "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num_features = ['Age']\n",
"\n",
"def discretize_features(df, features, bins, labels):\n",
" for feature in features:\n",
" df[f'{feature}_Bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
" df.drop(columns=[feature], inplace=True)\n",
" return df\n",
"\n",
"age_bins = [0, 25, 55, 100]\n",
"age_labels = [\"young\", \"middle-aged\", \"old\"]\n",
"\n",
"train_encoded = discretize_features(train_encoded, num_features, bins=age_bins, labels=age_labels)\n",
"val_encoded = discretize_features(val_encoded, num_features, bins=age_bins, labels=age_labels)\n",
"test_encoded = discretize_features(test_encoded, num_features, bins=age_bins, labels=age_labels)\n",
"\n",
"train_encoded.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Применим ручной синтез признаков. К примеру, для этого датасета, сделаем признак \"соотвествие устройства для обучения\". Мобильные устройства часто менее удобны для учебы по сравнению с планшетами."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Education Level_School</th>\n",
" <th>Education Level_University</th>\n",
" <th>Institution Type_Public</th>\n",
" <th>Gender_Male</th>\n",
" <th>Device_Mobile</th>\n",
" <th>Device_Tab</th>\n",
" <th>IT Student_Yes</th>\n",
" <th>Location_Town</th>\n",
" <th>Financial Condition_Poor</th>\n",
" <th>Financial Condition_Rich</th>\n",
" <th>Internet Type_Wifi</th>\n",
" <th>Network Type_3G</th>\n",
" <th>Network Type_4G</th>\n",
" <th>Age_Bin</th>\n",
" <th>Device Suitability</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Education Level_School Education Level_University \\\n",
"0 True False \n",
"1 False False \n",
"2 False True \n",
"3 True False \n",
"4 False True \n",
"\n",
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
"0 False True True False \n",
"1 False False True False \n",
"2 False True True False \n",
"3 True True True False \n",
"4 False False True False \n",
"\n",
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
"0 False True False \n",
"1 False True False \n",
"2 False True False \n",
"3 False True False \n",
"4 False False False \n",
"\n",
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
"0 True True False \n",
"1 False True False \n",
"2 False True False \n",
"3 True False False \n",
"4 False True False \n",
"\n",
" Network Type_4G Age_Bin Device Suitability \n",
"0 True young Low \n",
"1 True young Low \n",
"2 True young Low \n",
"3 True young Low \n",
"4 True young Low "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_encoded['Device Suitability'] = train_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
"val_encoded['Device Suitability'] = val_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
"test_encoded['Device Suitability'] = test_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
"\n",
"train_encoded.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Конструирование признаков с помощью фреймворка Featuretools."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Education Level_School</th>\n",
" <th>Education Level_University</th>\n",
" <th>Institution Type_Public</th>\n",
" <th>Gender_Male</th>\n",
" <th>Device_Mobile</th>\n",
" <th>Device_Tab</th>\n",
" <th>IT Student_Yes</th>\n",
" <th>Location_Town</th>\n",
" <th>Financial Condition_Poor</th>\n",
" <th>Financial Condition_Rich</th>\n",
" <th>Internet Type_Wifi</th>\n",
" <th>Network Type_3G</th>\n",
" <th>Network Type_4G</th>\n",
" <th>Age_Bin</th>\n",
" <th>Device Suitability</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Education Level_School Education Level_University \\\n",
"id \n",
"0 True False \n",
"1 False False \n",
"2 False True \n",
"3 True False \n",
"4 False True \n",
"\n",
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
"id \n",
"0 False True True False \n",
"1 False False True False \n",
"2 False True True False \n",
"3 True True True False \n",
"4 False False True False \n",
"\n",
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
"id \n",
"0 False True False \n",
"1 False True False \n",
"2 False True False \n",
"3 False True False \n",
"4 False False False \n",
"\n",
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
"id \n",
"0 True True False \n",
"1 False True False \n",
"2 False True False \n",
"3 True False False \n",
"4 False True False \n",
"\n",
" Network Type_4G Age_Bin Device Suitability \n",
"id \n",
"0 True young Low \n",
"1 True young Low \n",
"2 True young Low \n",
"3 True young Low \n",
"4 True young Low "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Work on a copy so building the entity set cannot modify the encoded training frame.\n",
"ft_data = train_encoded.copy()\n",
"\n",
"# A single-dataframe entity set; make_index=True has featuretools create the `id` index column.\n",
"es = ft.EntitySet(id=\"students\").add_dataframe(\n",
"    dataframe_name=\"students_data\",\n",
"    dataframe=ft_data,\n",
"    index=\"id\",\n",
"    make_index=True,\n",
")\n",
"\n",
"# Deep Feature Synthesis over that dataframe, limited to depth 1.\n",
"feature_matrix, feature_defs = ft.dfs(\n",
"    target_dataframe_name=\"students_data\",\n",
"    entityset=es,\n",
"    max_depth=1,\n",
")\n",
"\n",
"feature_matrix.head()"
]
},
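{
"cell_type": "markdown",
"metadata": {},
"source": [
"Featuretools не смог сгенерировать новые признаки: в наборе всего одна таблица, поэтому агрегирующие примитивы неприменимы, а преобразующие примитивы (`trans_primitives`) явно не указаны.\n",
"\n",
"Оценка качества набора признаков."
]
},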
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Время обучения модели: 0.11 секунд\n"
]
}
],
"source": [
"# One-hot encode the training split; its columns define the feature layout for every split.\n",
"train_encoded = pd.get_dummies(train_encoded, drop_first=True)\n",
"cols = train_encoded.columns\n",
"\n",
"# Validation/test are encoded WITHOUT drop_first and then aligned to the training columns.\n",
"# Applying drop_first to each split independently can drop a different baseline category\n",
"# when a split lacks some category, which silently miscodes those rows after reindexing.\n",
"# Dummy columns absent from a split are filled with False (no row has that category).\n",
"val_encoded = pd.get_dummies(val_encoded).reindex(columns=cols, fill_value=False)\n",
"test_encoded = pd.get_dummies(test_encoded).reindex(columns=cols, fill_value=False)\n",
"\n",
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"\n",
"# perf_counter is the clock intended for measuring elapsed intervals.\n",
"start = time.perf_counter()\n",
"model.fit(train_encoded, Y_train_resampled)\n",
"train_time = time.perf_counter() - start\n",
"\n",
"print(f'Время обучения модели: {train_time:.2f} секунд')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature Importance:\n",
" feature importance\n",
"9 Financial Condition_Rich 0.184028\n",
"3 Gender_Male 0.108992\n",
"8 Financial Condition_Poor 0.107030\n",
"2 Institution Type_Public 0.095663\n",
"10 Internet Type_Wifi 0.089925\n",
"7 Location_Town 0.078658\n",
"0 Education Level_School 0.061961\n",
"6 IT Student_Yes 0.055048\n",
"1 Education Level_University 0.049695\n",
"12 Network Type_4G 0.044837\n",
"4 Device_Mobile 0.042086\n",
"11 Network Type_3G 0.038541\n",
"13 Age_Bin_middle-aged 0.034876\n",
"15 Device Suitability_Low 0.004611\n",
"5 Device_Tab 0.004049\n",
"14 Age_Bin_old 0.000000\n"
]
}
],
"source": [
"# Pair every training column with the importance reported by the fitted model.\n",
"feature_names = train_encoded.columns\n",
"importances = model.feature_importances_\n",
"\n",
"# Build the table and rank it from most to least important in one chained expression.\n",
"feature_importance = pd.DataFrame(\n",
"    {'feature': feature_names, 'importance': importances}\n",
").sort_values(by='importance', ascending=False)\n",
"\n",
"print(\"Feature Importance:\", feature_importance, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RMSE: 0.5652451456569942\n",
"R²: 0.22569473420679287\n",
"MAE: 0.2697095435684647 \n",
"\n",
"Кросс-валидация RMSE: 0.5705060311373475 \n",
"\n",
"Train RMSE: 0.5237418787490223\n",
"Train R²: 0.5885416666666667\n",
"Train MAE: 0.19791666666666666\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
}
],
"source": [
"y_pred = model.predict(test_encoded)\n",
"\n",
"# NOTE(review): the model is a classifier, yet regression metrics are reported below;\n",
"# this presumably relies on the target being integer-encoded ordered classes - confirm.\n",
"# RMSE is computed as sqrt(MSE) because the `squared=False` argument of\n",
"# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6.\n",
"rmse = math.sqrt(mean_squared_error(Y_test_df, y_pred))\n",
"r2 = r2_score(Y_test_df, y_pred)\n",
"mae = mean_absolute_error(Y_test_df, y_pred)\n",
"\n",
"print()\n",
"print(f\"RMSE: {rmse}\")\n",
"print(f\"R²: {r2}\")\n",
"print(f\"MAE: {mae} \\n\")\n",
"\n",
"# Cross-validation: the scorer returns negated MSE, so flip the sign before taking the root.\n",
"scores = cross_val_score(model, train_encoded, Y_train_resampled, cv=5, scoring='neg_mean_squared_error')\n",
"rmse_cv = math.sqrt((-scores.mean()))\n",
"print(f\"Кросс-валидация RMSE: {rmse_cv} \\n\")\n",
"\n",
"# Overfitting check: the same metrics on the data the model was fitted on.\n",
"y_train_pred = model.predict(train_encoded)\n",
"\n",
"rmse_train = math.sqrt(mean_squared_error(Y_train_resampled, y_train_pred))\n",
"r2_train = r2_score(Y_train_resampled, y_train_pred)\n",
"mae_train = mean_absolute_error(Y_train_resampled, y_train_pred)\n",
"\n",
"print(f\"Train RMSE: {rmse_train}\")\n",
"print(f\"Train R²: {r2_train}\")\n",
"print(f\"Train MAE: {mae_train}\")\n",
"print()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}