diff --git a/lab_3/lab3.ipynb b/lab_3/lab3.ipynb new file mode 100644 index 0000000..d98dc6a --- /dev/null +++ b/lab_3/lab3.ipynb @@ -0,0 +1,1546 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Лабораторная 3\n", + "\n", + "Датасет: Информация об онлайн обучении учеников" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['Education Level', 'Institution Type', 'Gender', 'Age', 'Device',\n", + " 'IT Student', 'Location', 'Financial Condition', 'Internet Type',\n", + " 'Network Type', 'Flexibility Level'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import featuretools as ft\n", + "import time\n", + "import math\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n", + "\n", + "df = pd.read_csv(\"..\\\\static\\\\csv\\\\students_adaptability_level_online_education.csv\")\n", + "print(df.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Столбцы:\n", + "\n", + "Education Level - уровень образования\\\n", + "Institution Type - тип учреждения\\\n", + "Gender - пол\\\n", + "Age - возраст\\\n", + "Device - устройство\\\n", + "IT Student - ученик IT направления или нет\\\n", + "Location - локация\\\n", + "Financial Condition - финансовое состояние\\\n", + "Internet Type - тип доступа к сети\\\n", + "Network Type - уровень сети\\\n", + "Flexibility Level - уровень приспособления" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1205 entries, 0 to 1204\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Education Level 1205 non-null object\n", + " 1 Institution Type 1205 non-null object\n", + " 2 Gender 1205 non-null object\n", + " 3 Age 1205 non-null int64 \n", + " 4 Device 1205 non-null object\n", + " 5 IT Student 1205 non-null object\n", + " 6 Location 1205 non-null object\n", + " 7 Financial Condition 1205 non-null object\n", + " 8 Internet Type 1205 non-null object\n", + " 9 Network Type 1205 non-null object\n", + " 10 Flexibility Level 1205 non-null object\n", + "dtypes: int64(1), object(10)\n", + "memory usage: 103.7+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Education LevelInstitution TypeGenderAgeDeviceIT StudentLocationFinancial ConditionInternet TypeNetwork TypeFlexibility Level
0UniversityPrivateMale23TabNoTownMidWifi4GModerate
1UniversityPrivateFemale23MobileNoTownMidMobile Data4GModerate
2CollegePublicFemale18MobileNoTownMidWifi4GModerate
3SchoolPrivateFemale11MobileNoTownMidMobile Data4GModerate
4SchoolPrivateFemale18MobileNoTownPoorMobile Data3GLow
\n", + "
" + ], + "text/plain": [ + " Education Level Institution Type Gender Age Device IT Student Location \\\n", + "0 University Private Male 23 Tab No Town \n", + "1 University Private Female 23 Mobile No Town \n", + "2 College Public Female 18 Mobile No Town \n", + "3 School Private Female 11 Mobile No Town \n", + "4 School Private Female 18 Mobile No Town \n", + "\n", + " Financial Condition Internet Type Network Type Flexibility Level \n", + "0 Mid Wifi 4G Moderate \n", + "1 Mid Mobile Data 4G Moderate \n", + "2 Mid Wifi 4G Moderate \n", + "3 Mid Mobile Data 4G Moderate \n", + "4 Poor Mobile Data 3G Low " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.info()\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Примеры бизнес-целей для датасета:\n", + "1. Улучшение доступа к онлайн-образованию для учеников с низким уровнем финансового обеспечения.\n", + "2. Повышение удовлетворенности учеников онлайн-обучением на основе их устройств, типу соединения, местоположения." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Цели технического проекта:\n", + "\n", + "1. Провести анализ зависимости учеников от уровня интернет-соединения и устройств\n", + "2. Провести анализ влияния различных факторов (тип устройства, интернет-соединение, финансовое положение) на уровень приспособленности." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проверяем на выбросы." 
# %% Data-quality overview: missing values, duplicate rows, summary statistics, skewness
null_values = df.isnull().sum()
print("Пустые значения по столбцам:")
print(null_values)

duplicates = df.duplicated().sum()
print(f"\nКоличество дубликатов: {duplicates}")

print("\nСтатистический обзор данных:")
# Only the last expression of a cell is rendered automatically, so the summary
# must be printed explicitly; a bare `df.describe()` here produced no output.
print(df.describe())

for column in df.select_dtypes(include=[np.number]).columns:
    skewness = df[column].skew()
    print(f"\nКоэффициент асимметрии для столбца '{column}': {skewness}")

# %% Drop duplicate rows, then replace IQR outliers in "Age" with the median
# The de-duplicated frame is what the rest of the notebook works on from here.
# Previously it was stored in `cleaned_df` and never used, so the duplicates
# stayed in `df` despite the narrative saying they were removed.
cleaned_df = df.drop_duplicates().reset_index(drop=True)
df = cleaned_df

Q1 = df["Age"].quantile(0.25)
Q3 = df["Age"].quantile(0.75)

IQR = Q3 - Q1

# Tukey fences: anything further than 1.5 * IQR from the quartiles is an outlier.
threshold = 1.5 * IQR
lower_bound = Q1 - threshold
upper_bound = Q3 + threshold

outliers = (df["Age"] < lower_bound) | (df["Age"] > upper_bound)

print("Шумы в датасете:")
print(df[outliers])

median_score = df["Age"].median()
df.loc[outliers, "Age"] = median_score

# %% Encode the ordinal target as integers (needed by the quality metrics below)
map_flexibility_to_int = {'Low': 0, 'Moderate': 1, 'High': 2}

# NOTE: this cell is not idempotent - running it a second time maps the already
# numeric labels to NaN and the int32 cast fails. Re-run from the data load.
df['Flexibility Level'] = df['Flexibility Level'].map(map_flexibility_to_int).astype('int32')
# %% Split into train / validation / test
X = df.drop(columns=['Flexibility Level'])
Y = df['Flexibility Level']

# 20% is held out for testing, then 25% of the remainder for validation,
# which gives a 60/20/20 split overall.
X_train_df, X_test_df, Y_train_df, Y_test_df = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train_df, X_val_df, Y_train_df, Y_val_df = train_test_split(X_train_df, Y_train_df, test_size=0.25, random_state=42)

for caption, split in (
    ("Размер обучающей выборки:", X_train_df),
    ("Размер контрольной выборки:", X_val_df),
    ("Размер тестовой выборки:", X_test_df),
):
    print(caption, split.shape)


# %% Class-balance check
def analyze_balance(y_train, y_val, y_test, y_name):
    """Print and plot the class shares of the target in the three splits.

    Args:
        y_train: target values of the training split.
        y_val: target values of the validation split.
        y_test: target values of the test split.
        y_name: label used for the x axis of every panel.

    Prints the normalised value counts of each split and shows one figure
    with three bar charts that share the y axis.
    """
    panels = (
        ("Распределение классов в обучающей выборке:", 'Обучающая выборка', y_train),
        ("\nРаспределение классов в контрольной выборке:", 'Контрольная выборка', y_val),
        ("\nРаспределение классов в тестовой выборке:", 'Тестовая выборка', y_test),
    )

    for header, _, target in panels:
        print(header)
        print(target.value_counts(normalize=True))

    fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)
    fig.suptitle('Распределение в различных выборках')

    for position, (ax, (_, title, target)) in enumerate(zip(axes, panels)):
        sns.barplot(x=target.value_counts().index, y=target.value_counts(normalize=True), ax=ax)
        ax.set_title(title)
        ax.set_xlabel(y_name)
        if position == 0:
            # Only the leftmost panel gets an explicit y-axis label.
            ax.set_ylabel('Доля')

    plt.show()


analyze_balance(Y_train_df, Y_val_df, Y_test_df, 'Flexibility Level')
# %% Balance the classes with random oversampling
ros = RandomOverSampler(random_state=42)

X_train_resampled, Y_train_resampled = ros.fit_resample(X_train_df, Y_train_df)
# NOTE(review): the validation split is oversampled as well, so its class
# shares no longer reflect the original distribution - confirm this is intended.
X_val_resampled, Y_val_resampled = ros.fit_resample(X_val_df, Y_val_df)

analyze_balance(Y_train_resampled, Y_val_resampled, Y_test_df, 'Flexibility Level')

# %% One-hot encode the categorical features
cat_features = [
    'Education Level',
    'Institution Type',
    'Gender',
    'Device',
    'IT Student',
    'Location',
    'Financial Condition',
    'Internet Type',
    'Network Type',
]

# Each split is encoded independently; the columns are aligned to the training
# columns later, right before the model is fitted.
train_encoded, val_encoded, test_encoded = (
    pd.get_dummies(split, columns=cat_features, drop_first=True)
    for split in (X_train_resampled, X_val_resampled, X_test_df)
)

train_encoded.head()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Education Level_SchoolEducation Level_UniversityInstitution Type_PublicGender_MaleDevice_MobileDevice_TabIT Student_YesLocation_TownFinancial Condition_PoorFinancial Condition_RichInternet Type_WifiNetwork Type_3GNetwork Type_4GAge_Bin
0TrueFalseFalseTrueTrueFalseFalseTrueFalseTrueTrueFalseTrueyoung
1FalseFalseFalseFalseTrueFalseFalseTrueFalseFalseTrueFalseTrueyoung
2FalseTrueFalseTrueTrueFalseFalseTrueFalseFalseTrueFalseTrueyoung
3TrueFalseTrueTrueTrueFalseFalseTrueFalseTrueFalseFalseTrueyoung
4FalseTrueFalseFalseTrueFalseFalseFalseFalseFalseTrueFalseTrueyoung
\n", + "
" + ], + "text/plain": [ + " Education Level_School Education Level_University \\\n", + "0 True False \n", + "1 False False \n", + "2 False True \n", + "3 True False \n", + "4 False True \n", + "\n", + " Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n", + "0 False True True False \n", + "1 False False True False \n", + "2 False True True False \n", + "3 True True True False \n", + "4 False False True False \n", + "\n", + " IT Student_Yes Location_Town Financial Condition_Poor \\\n", + "0 False True False \n", + "1 False True False \n", + "2 False True False \n", + "3 False True False \n", + "4 False False False \n", + "\n", + " Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n", + "0 True True False \n", + "1 False True False \n", + "2 False True False \n", + "3 True False False \n", + "4 False True False \n", + "\n", + " Network Type_4G Age_Bin \n", + "0 True young \n", + "1 True young \n", + "2 True young \n", + "3 True young \n", + "4 True young " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_features = ['Age']\n", + "\n", + "def discretize_features(df, features, bins, labels):\n", + " for feature in features:\n", + " df[f'{feature}_Bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n", + " df.drop(columns=[feature], inplace=True)\n", + " return df\n", + "\n", + "age_bins = [0, 25, 55, 100]\n", + "age_labels = [\"young\", \"middle-aged\", \"old\"]\n", + "\n", + "train_encoded = discretize_features(train_encoded, num_features, bins=age_bins, labels=age_labels)\n", + "val_encoded = discretize_features(val_encoded, num_features, bins=age_bins, labels=age_labels)\n", + "test_encoded = discretize_features(test_encoded, num_features, bins=age_bins, labels=age_labels)\n", + "\n", + "train_encoded.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Применим ручной синтез признаков. 
К примеру, для этого датасета сделаем признак \"соответствие устройства для обучения\". Мобильные устройства часто менее удобны для учебы по сравнению с планшетами." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Education Level_SchoolEducation Level_UniversityInstitution Type_PublicGender_MaleDevice_MobileDevice_TabIT Student_YesLocation_TownFinancial Condition_PoorFinancial Condition_RichInternet Type_WifiNetwork Type_3GNetwork Type_4GAge_BinDevice Suitability
0TrueFalseFalseTrueTrueFalseFalseTrueFalseTrueTrueFalseTrueyoungLow
1FalseFalseFalseFalseTrueFalseFalseTrueFalseFalseTrueFalseTrueyoungLow
2FalseTrueFalseTrueTrueFalseFalseTrueFalseFalseTrueFalseTrueyoungLow
3TrueFalseTrueTrueTrueFalseFalseTrueFalseTrueFalseFalseTrueyoungLow
4FalseTrueFalseFalseTrueFalseFalseFalseFalseFalseTrueFalseTrueyoungLow
\n", + "
" + ], + "text/plain": [ + " Education Level_School Education Level_University \\\n", + "0 True False \n", + "1 False False \n", + "2 False True \n", + "3 True False \n", + "4 False True \n", + "\n", + " Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n", + "0 False True True False \n", + "1 False False True False \n", + "2 False True True False \n", + "3 True True True False \n", + "4 False False True False \n", + "\n", + " IT Student_Yes Location_Town Financial Condition_Poor \\\n", + "0 False True False \n", + "1 False True False \n", + "2 False True False \n", + "3 False True False \n", + "4 False False False \n", + "\n", + " Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n", + "0 True True False \n", + "1 False True False \n", + "2 False True False \n", + "3 True False False \n", + "4 False True False \n", + "\n", + " Network Type_4G Age_Bin Device Suitability \n", + "0 True young Low \n", + "1 True young Low \n", + "2 True young Low \n", + "3 True young Low \n", + "4 True young Low " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_encoded['Device Suitability'] = train_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n", + "val_encoded['Device Suitability'] = val_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n", + "test_encoded['Device Suitability'] = test_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n", + "\n", + "train_encoded.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Конструирование признаков с помощью фреймворка Featuretools." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n", + "d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " pd.to_datetime(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Education Level_SchoolEducation Level_UniversityInstitution Type_PublicGender_MaleDevice_MobileDevice_TabIT Student_YesLocation_TownFinancial Condition_PoorFinancial Condition_RichInternet Type_WifiNetwork Type_3GNetwork Type_4GAge_BinDevice Suitability
id
0TrueFalseFalseTrueTrueFalseFalseTrueFalseTrueTrueFalseTrueyoungLow
1FalseFalseFalseFalseTrueFalseFalseTrueFalseFalseTrueFalseTrueyoungLow
2FalseTrueFalseTrueTrueFalseFalseTrueFalseFalseTrueFalseTrueyoungLow
3TrueFalseTrueTrueTrueFalseFalseTrueFalseTrueFalseFalseTrueyoungLow
4FalseTrueFalseFalseTrueFalseFalseFalseFalseFalseTrueFalseTrueyoungLow
\n", + "
" + ], + "text/plain": [ + " Education Level_School Education Level_University \\\n", + "id \n", + "0 True False \n", + "1 False False \n", + "2 False True \n", + "3 True False \n", + "4 False True \n", + "\n", + " Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n", + "id \n", + "0 False True True False \n", + "1 False False True False \n", + "2 False True True False \n", + "3 True True True False \n", + "4 False False True False \n", + "\n", + " IT Student_Yes Location_Town Financial Condition_Poor \\\n", + "id \n", + "0 False True False \n", + "1 False True False \n", + "2 False True False \n", + "3 False True False \n", + "4 False False False \n", + "\n", + " Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n", + "id \n", + "0 True True False \n", + "1 False True False \n", + "2 False True False \n", + "3 True False False \n", + "4 False True False \n", + "\n", + " Network Type_4G Age_Bin Device Suitability \n", + "id \n", + "0 True young Low \n", + "1 True young Low \n", + "2 True young Low \n", + "3 True young Low \n", + "4 True young Low " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ft_data = train_encoded.copy()\n", + "\n", + "es = ft.EntitySet(id=\"students\")\n", + "es = es.add_dataframe(dataframe_name=\"students_data\", dataframe=ft_data, index=\"id\", make_index=True)\n", + "\n", + "feature_matrix, feature_defs = ft.dfs(\n", + " entityset=es, \n", + " target_dataframe_name=\"students_data\",\n", + " max_depth=1\n", + ")\n", + "\n", + "feature_matrix.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Featuretools не смог сделать новые признаки.\n", + "\n", + "Оценка качества набора признаков." 
# %% Final encoding and model training
# One-hot encode the remaining non-numeric columns (Age_Bin, Device Suitability).
train_encoded = pd.get_dummies(train_encoded, drop_first=True)
val_encoded = pd.get_dummies(val_encoded, drop_first=True)
test_encoded = pd.get_dummies(test_encoded, drop_first=True)

# Align every split to the training columns; columns missing from a split are
# filled with 0 and columns unknown to the training set are dropped.
cols = train_encoded.columns

train_encoded = train_encoded.reindex(columns=cols, fill_value=0)
val_encoded = val_encoded.reindex(columns=cols, fill_value=0)
test_encoded = test_encoded.reindex(columns=cols, fill_value=0)

model = RandomForestClassifier(n_estimators=100, random_state=42)

start = time.time()
model.fit(train_encoded, Y_train_resampled)
train_time = time.time() - start

print(f'Время обучения модели: {train_time:.2f} секунд')

# %% Feature importance
importances = model.feature_importances_
feature_names = train_encoded.columns

feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

# %% Evaluation on the test split, cross-validation and over-fitting check
y_pred = model.predict(test_encoded)

feature_importances = model.feature_importances_
feature_names = train_encoded.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# `mean_squared_error(..., squared=False)` is deprecated (the recorded
# FutureWarning announces its removal in scikit-learn 1.6), so the root is
# taken explicitly, the same way the cross-validated RMSE is computed below.
rmse = math.sqrt(mean_squared_error(Y_test_df, y_pred))
r2 = r2_score(Y_test_df, y_pred)
mae = mean_absolute_error(Y_test_df, y_pred)

print()
print(f"RMSE: {rmse}")
print(f"R²: {r2}")
print(f"MAE: {mae} \n")

# The model is a classifier, so report classification quality as well; the
# regression metrics above treat the ordinal class codes 0/1/2 as numbers.
accuracy = accuracy_score(Y_test_df, y_pred)
f1_macro = f1_score(Y_test_df, y_pred, average='macro')
print(f"Accuracy: {accuracy}")
print(f"F1 (macro): {f1_macro} \n")

# Cross-validation on the (oversampled) training data
scores = cross_val_score(model, train_encoded, Y_train_resampled, cv=5, scoring='neg_mean_squared_error')
rmse_cv = math.sqrt((-scores.mean()))
print(f"Кросс-валидация RMSE: {rmse_cv} \n")

# Over-fitting check: the same metrics on the data the model was trained on
y_train_pred = model.predict(train_encoded)

rmse_train = math.sqrt(mean_squared_error(Y_train_resampled, y_train_pred))
r2_train = r2_score(Y_train_resampled, y_train_pred)
mae_train = mean_absolute_error(Y_train_resampled, y_train_pred)

print(f"Train RMSE: {rmse_train}")
print(f"Train R²: {r2_train}")
print(f"Train MAE: {mae_train}")
print()
new file mode 100644 index 0000000..f9e243b Binary files /dev/null and b/lab_3/requirements.txt differ