AIM-PIbd-31-Makarov-DV/lab_3/lab3.ipynb

1567 lines
142 KiB
Plaintext
Raw Normal View History

2024-11-08 19:32:49 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Лабораторная 3\n",
"\n",
"Датасет: Информация об онлайн обучении учеников"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Education Level', 'Institution Type', 'Gender', 'Age', 'Device',\n",
" 'IT Student', 'Location', 'Financial Condition', 'Internet Type',\n",
" 'Network Type', 'Flexibility Level'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import featuretools as ft\n",
"import time\n",
"import math\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
"\n",
"# Forward slashes keep this relative path portable; a backslash-separated\n",
"# path only resolves on Windows.\n",
"df = pd.read_csv(\"../static/csv/students_adaptability_level_online_education.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Столбцы:\n",
"\n",
"Education Level - уровень образования\\\n",
"Institution Type - тип учреждения\\\n",
"Gender - пол\\\n",
"Age - возраст\\\n",
"Device - устройство\\\n",
"IT Student - ученик IT направления или нет\\\n",
"Location - локация\\\n",
"Financial Condition - финансовое состояние\\\n",
"Internet Type - тип доступа к сети\\\n",
"Network Type - уровень сети\\\n",
"Flexibility Level - уровень приспособления"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1205 entries, 0 to 1204\n",
"Data columns (total 11 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Education Level 1205 non-null object\n",
" 1 Institution Type 1205 non-null object\n",
" 2 Gender 1205 non-null object\n",
" 3 Age 1205 non-null int64 \n",
" 4 Device 1205 non-null object\n",
" 5 IT Student 1205 non-null object\n",
" 6 Location 1205 non-null object\n",
" 7 Financial Condition 1205 non-null object\n",
" 8 Internet Type 1205 non-null object\n",
" 9 Network Type 1205 non-null object\n",
" 10 Flexibility Level 1205 non-null object\n",
"dtypes: int64(1), object(10)\n",
"memory usage: 103.7+ KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Education Level</th>\n",
" <th>Institution Type</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Device</th>\n",
" <th>IT Student</th>\n",
" <th>Location</th>\n",
" <th>Financial Condition</th>\n",
" <th>Internet Type</th>\n",
" <th>Network Type</th>\n",
" <th>Flexibility Level</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>University</td>\n",
" <td>Private</td>\n",
" <td>Male</td>\n",
" <td>23</td>\n",
" <td>Tab</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>University</td>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>23</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>College</td>\n",
" <td>Public</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>School</td>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>School</td>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Poor</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Education Level Institution Type Gender Age Device IT Student Location \\\n",
"0 University Private Male 23 Tab No Town \n",
"1 University Private Female 23 Mobile No Town \n",
"2 College Public Female 18 Mobile No Town \n",
"3 School Private Female 11 Mobile No Town \n",
"4 School Private Female 18 Mobile No Town \n",
"\n",
" Financial Condition Internet Type Network Type Flexibility Level \n",
"0 Mid Wifi 4G Moderate \n",
"1 Mid Mobile Data 4G Moderate \n",
"2 Mid Wifi 4G Moderate \n",
"3 Mid Mobile Data 4G Moderate \n",
"4 Poor Mobile Data 3G Low "
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column dtypes and non-null counts first, then a preview of the first five rows.\n",
"df.info()\n",
"df.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Примеры бизнес-целей для датасета:\n",
"1. Улучшение доступа к онлайн-образованию для учеников с низким уровнем финансового обеспечения.\n",
"2. Повышение удовлетворенности учеников онлайн-обучением на основе их устройств, типа соединения и местоположения."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Цели технического проекта:\n",
"\n",
"1. Провести анализ зависимости учеников от уровня интернет-соединения и устройств\n",
"2. Провести анализ влияния различных факторов (тип устройства, интернет-соединение, финансовое положение) на уровень приспособленности."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверяем данные на пропуски, дубликаты и выбросы (по коэффициенту асимметрии)."
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пустые значения по столбцам:\n",
"Education Level 0\n",
"Institution Type 0\n",
"Gender 0\n",
"Age 0\n",
"Device 0\n",
"IT Student 0\n",
"Location 0\n",
"Financial Condition 0\n",
"Internet Type 0\n",
"Network Type 0\n",
"Flexibility Level 0\n",
"dtype: int64\n",
"\n",
"Количество дубликатов: 980\n",
"\n",
"Статистический обзор данных:\n",
"\n",
"Коэффициент асимметрии для столбца 'Age': 0.024342017300169792\n"
]
}
],
"source": [
"# Missing values per column.\n",
"null_values = df.isnull().sum()\n",
"print(\"Пустые значения по столбцам:\")\n",
"print(null_values)\n",
"\n",
"# Number of rows that exactly repeat an earlier row.\n",
"duplicates = df.duplicated().sum()\n",
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
"\n",
"print(\"\\nСтатистический обзор данных:\")\n",
"# Only the last expression of a cell is displayed automatically, so the\n",
"# summary has to be printed explicitly to appear under its heading.\n",
"print(df.describe())\n",
"\n",
"# Skewness of every numeric column.\n",
"for column in df.select_dtypes(include=[np.number]).columns:\n",
"    skewness = df[column].skew()\n",
"    print(f\"\\nКоэффициент асимметрии для столбца '{column}': {skewness}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выбросы незначительны, дубликаты есть. Удаляем дубликаты и очищаем от шумов."
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Шумы в датасете:\n",
"Empty DataFrame\n",
"Columns: [Education Level, Institution Type, Gender, Age, Device, IT Student, Location, Financial Condition, Internet Type, Network Type, Flexibility Level]\n",
"Index: []\n"
]
}
],
"source": [
"# NOTE(review): cleaned_df is not used by the rest of this cell, which keeps\n",
"# working on df - confirm whether the de-duplicated frame was meant to be used.\n",
"cleaned_df = df.drop_duplicates()\n",
"\n",
"# Quartiles of \"Age\" and the 1.5 * IQR fences around them.\n",
"Q1, Q3 = (df[\"Age\"].quantile(q) for q in (0.25, 0.75))\n",
"IQR = Q3 - Q1\n",
"threshold = 1.5 * IQR\n",
"lower_bound, upper_bound = Q1 - threshold, Q3 + threshold\n",
"\n",
"# Boolean mask of the rows whose age lies outside the fences.\n",
"outliers = df[\"Age\"].lt(lower_bound) | df[\"Age\"].gt(upper_bound)\n",
"\n",
"print(\"Шумы в датасете:\")\n",
"print(df[outliers])\n",
"\n",
"# Flagged ages are overwritten in place with the column median.\n",
"median_score = df[\"Age\"].median()\n",
"df.loc[outliers, \"Age\"] = median_score"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Преобразуем строковые значения в столбце \"Уровень приспособления\" в числовые. Это понадобится для расчёта качества набора признаков."
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"ename": "IntCastingNaNError",
"evalue": "Cannot convert non-finite values (NA or inf) to integer",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mIntCastingNaNError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[149], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m map_flexibility_to_int \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mLow\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mModerate\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mHigh\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m2\u001b[39m}\n\u001b[1;32m----> 3\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFlexibility Level\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mFlexibility Level\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmap_flexibility_to_int\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mint32\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:6643\u001b[0m, in \u001b[0;36mNDFrame.astype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 6637\u001b[0m results \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 6638\u001b[0m ser\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy, errors\u001b[38;5;241m=\u001b[39merrors) \u001b[38;5;28;01mfor\u001b[39;00m _, ser \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems()\n\u001b[0;32m 6639\u001b[0m ]\n\u001b[0;32m 6641\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 6642\u001b[0m \u001b[38;5;66;03m# else, only a single dtype is given\u001b[39;00m\n\u001b[1;32m-> 6643\u001b[0m new_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mgr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 6644\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_constructor_from_mgr(new_data, axes\u001b[38;5;241m=\u001b[39mnew_data\u001b[38;5;241m.\u001b[39maxes)\n\u001b[0;32m 6645\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mastype\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:430\u001b[0m, in \u001b[0;36mBaseBlockManager.astype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 427\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m using_copy_on_write():\n\u001b[0;32m 428\u001b[0m copy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m--> 430\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mastype\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 435\u001b[0m \u001b[43m \u001b[49m\u001b[43musing_cow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43musing_copy_on_write\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 436\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:363\u001b[0m, in \u001b[0;36mBaseBlockManager.apply\u001b[1;34m(self, f, align_keys, **kwargs)\u001b[0m\n\u001b[0;32m 361\u001b[0m applied \u001b[38;5;241m=\u001b[39m b\u001b[38;5;241m.\u001b[39mapply(f, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 362\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 363\u001b[0m applied \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 364\u001b[0m result_blocks \u001b[38;5;241m=\u001b[39m extend_blocks(applied, result_blocks)\n\u001b[0;32m 366\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mfrom_blocks(result_blocks, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maxes)\n",
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\blocks.py:758\u001b[0m, in \u001b[0;36mBlock.astype\u001b[1;34m(self, dtype, copy, errors, using_cow, squeeze)\u001b[0m\n\u001b[0;32m 755\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCan not squeeze with more than one column.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 756\u001b[0m values \u001b[38;5;241m=\u001b[39m values[\u001b[38;5;241m0\u001b[39m, :] \u001b[38;5;66;03m# type: ignore[call-overload]\u001b[39;00m\n\u001b[1;32m--> 758\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array_safe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 760\u001b[0m new_values \u001b[38;5;241m=\u001b[39m maybe_coerce_values(new_values)\n\u001b[0;32m 762\u001b[0m refs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:237\u001b[0m, in \u001b[0;36mastype_array_safe\u001b[1;34m(values, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 234\u001b[0m dtype \u001b[38;5;241m=\u001b[39m dtype\u001b[38;5;241m.\u001b[39mnumpy_dtype\n\u001b[0;32m 236\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 237\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 238\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[0;32m 239\u001b[0m \u001b[38;5;66;03m# e.g. _astype_nansafe can fail on object-dtype of strings\u001b[39;00m\n\u001b[0;32m 240\u001b[0m \u001b[38;5;66;03m# trying to convert to float\u001b[39;00m\n\u001b[0;32m 241\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:182\u001b[0m, in \u001b[0;36mastype_array\u001b[1;34m(values, dtype, copy)\u001b[0m\n\u001b[0;32m 179\u001b[0m values \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[0;32m 181\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 182\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43m_astype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 184\u001b[0m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[0;32m 185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n",
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:101\u001b[0m, in \u001b[0;36m_astype_nansafe\u001b[1;34m(arr, dtype, copy, skipna)\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mensure_string_array(\n\u001b[0;32m 97\u001b[0m arr, skipna\u001b[38;5;241m=\u001b[39mskipna, convert_na_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m 98\u001b[0m )\u001b[38;5;241m.\u001b[39mreshape(shape)\n\u001b[0;32m 100\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m np\u001b[38;5;241m.\u001b[39missubdtype(arr\u001b[38;5;241m.\u001b[39mdtype, np\u001b[38;5;241m.\u001b[39mfloating) \u001b[38;5;129;01mand\u001b[39;00m dtype\u001b[38;5;241m.\u001b[39mkind \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miu\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m--> 101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_astype_float_to_int_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 103\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[0;32m 104\u001b[0m \u001b[38;5;66;03m# if we have a datetime/timedelta array of objects\u001b[39;00m\n\u001b[0;32m 105\u001b[0m \u001b[38;5;66;03m# then coerce to datetime64[ns] and use DatetimeArray.astype\u001b[39;00m\n\u001b[0;32m 107\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mis_np_dtype(dtype, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mM\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:145\u001b[0m, in \u001b[0;36m_astype_float_to_int_nansafe\u001b[1;34m(values, dtype, copy)\u001b[0m\n\u001b[0;32m 141\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 142\u001b[0m \u001b[38;5;124;03mastype with a check preventing converting NaN to an meaningless integer value.\u001b[39;00m\n\u001b[0;32m 143\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 144\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39misfinite(values)\u001b[38;5;241m.\u001b[39mall():\n\u001b[1;32m--> 145\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m IntCastingNaNError(\n\u001b[0;32m 146\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot convert non-finite values (NA or inf) to integer\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 147\u001b[0m )\n\u001b[0;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype\u001b[38;5;241m.\u001b[39mkind \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mu\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 149\u001b[0m \u001b[38;5;66;03m# GH#45151\u001b[39;00m\n\u001b[0;32m 150\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (values \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mall():\n",
"\u001b[1;31mIntCastingNaNError\u001b[0m: Cannot convert non-finite values (NA or inf) to integer"
]
}
],
"source": [
"# Ordinal encoding of the target labels.\n",
"map_flexibility_to_int = {'Low': 0, 'Moderate': 1, 'High': 2}\n",
"\n",
"# Keep the cell safe to re-run: mapping an already encoded column finds no\n",
"# matching keys, yields NaN everywhere and makes the int32 cast raise\n",
"# IntCastingNaNError. Encode only while the column is not yet integer-typed.\n",
"if not pd.api.types.is_integer_dtype(df['Flexibility Level']):\n",
"    df['Flexibility Level'] = df['Flexibility Level'].map(map_flexibility_to_int).astype('int32')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Шумов в датасете нет. Разбиваем датасет на три выборки: обучающую, контрольную и тестовую."
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: (723, 10)\n",
"Размер контрольной выборки: (241, 10)\n",
"Размер тестовой выборки: (241, 10)\n"
]
}
],
"source": [
"# Target column and the remaining columns as features.\n",
"Y = df['Flexibility Level']\n",
"X = df.drop(columns=['Flexibility Level'])\n",
"\n",
"# First hold out 20% for testing, then take 25% of the remainder for validation.\n",
"X_train_df, X_test_df, Y_train_df, Y_test_df = train_test_split(\n",
"    X, Y, test_size=0.2, random_state=42\n",
")\n",
"X_train_df, X_val_df, Y_train_df, Y_val_df = train_test_split(\n",
"    X_train_df, Y_train_df, test_size=0.25, random_state=42\n",
")\n",
"\n",
"for caption, subset in (\n",
"    (\"Размер обучающей выборки:\", X_train_df),\n",
"    (\"Размер контрольной выборки:\", X_val_df),\n",
"    (\"Размер тестовой выборки:\", X_test_df),\n",
"):\n",
"    print(caption, subset.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверка сбалансированности данных."
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в обучающей выборке:\n",
"Flexibility Level\n",
"1 0.531120\n",
"0 0.385892\n",
"2 0.082988\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в контрольной выборке:\n",
"Flexibility Level\n",
"1 0.522822\n",
"0 0.406639\n",
"2 0.070539\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в тестовой выборке:\n",
"Flexibility Level\n",
"1 0.477178\n",
"0 0.427386\n",
"2 0.095436\n",
"Name: proportion, dtype: float64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABboAAAHyCAYAAAAtJXgGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABuX0lEQVR4nO3de3zO9f/H8ec2dnaIzeYwm5wPOTSH0JwaU4QUkW9mCoWEUl8qQ4cl5RDKoaxEEZV+neQQheacnCUmp2yGHIaNXe/fH267vi7XNjOHa5887rfbdbu5Ptf78/m8Ptd12eu6ntfnel9uxhgjAAAAAAAAAAAsyt3VBQAAAAAAAAAAcD0IugEAAAAAAAAAlkbQDQAAAAAAAACwNIJuAAAAAAAAAIClEXQDAAAAAAAAACyNoBsAAAAAAAAAYGkE3QAAAAAAAAAASyPoBgAAAAAAAABYGkE3AAAAAOSBzWZTSkqK9u7d6+pSAAAAbnsE3QAAAACQS0eOHNHAgQMVGhoqT09PBQYGqlq1ajp16pSrSwMAALitFXB1AQAAADfaRx99pJiYGPt1Ly8vlS1bVq1atdIrr7yioKAgF1YHwKr+/PNPNW/eXBcuXNCAAQN09913q0CBAvLx8ZGfn5+rywMAALitEXQDAIB/rVGjRqlcuXI6f/68Vq5cqffff1/ff/+9tm7dKl9fX1eXB8Bi+vTpI09PT61evVqlS5d2dTkAAAC4DEE3AAD417r//vtVt25dSdKTTz6p4sWLa+zYsfr666/VtWtXF1cHwEo2bNign376SYsWLSLkBgAAyIeYoxsAANw2WrRoIUlKTEyUJB0/flzPP/+87rrrLvn7+6tw4cK6//779fvvvzute/78eY0YMUKVKlWSt7e3SpYsqY4dO2rPnj2SpH379snNzS3bS7NmzezbWr58udzc3DR37lwNGzZMwcHB8vPzU7t27XTgwAGnfa9Zs0atW7dWkSJF5Ovrq6ZNm2rVqlVZHmOzZs2y3P+IESOcxs6aNUvh4eHy8fFRsWLF1KVLlyz3n9OxXc5ms2n8+PGqXr26vL29FRQUpD59+ujEiRMO48LCwtS2bVun/fTv399pm1nVPmbMGKf7VJLS0tIUGxurChUqyMvLSyEhIXrhhReUlpaW5X11uSvvt4CAALVp00Zbt27N1bo1atTQhg0b1KhRI/n4+KhcuXKaMmWKw7j09HQNHz5c4eHhKlKkiPz8/BQREaFly5Y5jNu1a5datGih4OBg+3E89dRTOn78uNO+e/TocdXHu0ePHgoLC3NY78CBA/Lx8ZGbm5v27dsn6X+P80cffeQwdsSIEVk+Lv3793eqp23btg77ytzm22+/nc2957z9+Ph4ubm5acaMGQ7j3njjDbm5uen777/PdlvSpedX5v3g7u6u4OBgPfroo9q/f/911bV69Wp5e3trz549ql69ury8vBQcHKw+ffpk+djMmzfP/v8rICBA//nPf3To0CGHMT169JC/v7/27t2rqKgo+fn5qVSpUho1apSMMU71Xv7YnD59WuHh4SpXrpz+/vtv+/K3335bjRo1UvHixeXj46Pw8HDNnz/fYb/Xex8DAADkR5zRDQAAbhuZoXTx4sUlSXv37tWCBQvUqVMnlStXTklJSZo6daqaNm2q7du3q1SpUpKkjIwMtW3bVkuXLlWXLl307LPP6vTp01q8eLG2bt2q8uXL2/fRtWtXPfDAAw77HTp0aJb1vP7663Jzc9OLL76o5ORkjR8/XpGRkdq0aZN8fHwkST/99JPuv/9+hYeHKzY2Vu7u7oqPj1eLFi20YsUK1a9f32m7ZcqUUVxcnCTpzJkzevrpp7Pc9yuvvKLOnTvrySef1NGjRzVx4kQ1adJEv/32m4oWLeq0Tu/evRURESFJ+vLLL/XVV1853N6nTx/7/OgDBgxQYmKiJk2apN9++02rVq1SwYIFs7wfrsU///xjP7bL2Ww2tWvXTi
tXrlTv3r1VtWpVbdmyRePGjdMff/yhBQsWXHXbVapU0UsvvSRjjPbs2aOxY8fqgQcecAhIs3PixAk98MAD6ty5s7p27arPP/9cTz/9tDw9PdWzZ09J0qlTp/TBBx+oa9eu6tWrl06fPq0PP/xQUVFRWrt2rWrXri1JSk1NVZkyZfTggw+qcOHC2rp1qyZPnqxDhw7pm2++cdp3QECAxo0bZ7/++OOPX7Xe4cOH6/z581cd5woxMTH68ssvNXjwYLVs2VIhISHasmWLRo4cqSeeeMLp/1dWIiIi1Lt3b9lsNm3dulXjx4/X4cOHtWLFijzXdezYMZ0/f15PP/20WrRooaeeekp79uzR5MmTtWbNGq1Zs0ZeXl6S/vc7AfXq1VNcXJySkpI0YcIErVq1yun/V0ZGhlq3bq177rlHb731lhYuXKjY2FhdvHhRo0aNyrKWCxcu6OGHH9b+/fu1atUqlSxZ0n7bhAkT1K5dO3Xr1k3p6emaM2eOOnXqpG+//VZt2rS5YfcxAABAvmMAAAD+ZeLj440ks2TJEnP06FFz4MABM2fOHFO8eHHj4+NjDh48aIwx5vz58yYjI8Nh3cTEROPl5WVGjRplXzZjxgwjyYwdO9ZpXzabzb6eJDNmzBinMdWrVzdNmza1X1+2bJmRZEqXLm1OnTplX/75558bSWbChAn2bVesWNFERUXZ92OMMWfPnjXlypUzLVu2dNpXo0aNTI0aNezXjx49aiSZ2NhY+7J9+/YZDw8P8/rrrzusu2XLFlOgQAGn5bt37zaSzMcff2xfFhsbay5/KblixQojycyePdth3YULFzotDw0NNW3atHGqvV+/fubKl6dX1v7CCy+YEiVKmPDwcIf79JNPPjHu7u5mxYoVDutPmTLFSDKrVq1y2t/lmjZt6rA9Y4wZNmyYkWSSk5Ovuq4k884779iXpaWlmdq1a5sSJUqY9PR0Y4wxFy9eNGlpaQ7rnjhxwgQFBZmePXvmuI++ffsaf39/p+XdunUz5cqVc1h25X0WHR1tQkND7de3bt1q3N3dzf33328kmcTERGOMMX/99ZeRZGbMmOGwvSsf68x99OvXz6meNm3aOOwrp/8XOW3/77//NsWKFTMtW7Y0aWlppk6dOqZs2bLm5MmT2W4nU2hoqImOjnZY9thjjxlfX9/rqivz+n333WcuXrxoX57592bixInGGGPS09NNiRIlTI0aNcy5c+fs47799lsjyQwfPty+LDo62kgyzzzzjH2ZzWYzbdq0MZ6enubo0aMO9cbHxxubzWa6detmfH19zZo1a5zqPnv2rMP19PR0U6NGDdOiRQuH5ddzHwMAAORHTF0CAAD+tSIjIxUYGKiQkBB16dJF/v7++uqrr+zz63p5ecnd/dLLoYyMDB07dkz+/v6qXLmyNm7caN/OF198oYCAAD3zzDNO+7hySodr0b17dxUqVMh+/ZFHHlHJkiXt0wZs2rRJu3fv1mOPPaZjx44pJSVFKSkpSk1N1X333adffvlFNpvNYZvnz5+Xt7d3jvv98ssvZbPZ1LlzZ/s2U1JSFBwcrIoVKzpNpZGeni5J9rNVszJv3jwVKVJELVu2dNhmeHi4/P39nbZ54cIFh3EpKSlXPcP40KFDmjhxol555RX5+/s77b9q1aqqUqWKwzYzp6u5cv9Zyazp6NGjSkhI0FdffaWaNWsqICDgqusWKFBAffr0sV/39PRUnz59lJycrA0bNkiSPDw85OnpKenSGejHjx/XxYsXVbduXYfnW6aTJ08qKSlJS5cu1XfffacmTZo4jUlPT8/xccnK0KFDdffdd6tTp04OywMDAyVJBw8ezNV2zp8/7/QYXrhwIcuxZ8+eVUpKik6cOOEwJUd2goODNXnyZC1evFgRERHatGmTZsyYocKFC+eqtrS0NKWkpCg5OVmLFy/WTz/9pPvuu++665KkwYMHy8PDw3798ccfV1BQkL777jtJ0vr165WcnKy+ffs6/F
9s06aNqlSpYh93ucungcmcFiY9PV1LlixxGjtkyBDNnj1bn3/+eZbf6Mj8Noh06ZsGJ0+eVEREhNNz7HrvYwAAgPy
"text/plain": [
"<Figure size 1800x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"def analyze_balance(y_train, y_val, y_test, y_name):\n",
"    \"\"\"Print and plot the class shares of the train, validation and test targets.\n",
"\n",
"    y_train, y_val, y_test: targets of the three splits; each must support\n",
"        value_counts() (pandas Series in this notebook).\n",
"    y_name: text used as the x-axis label of every panel.\n",
"    \"\"\"\n",
"    splits = (y_train, y_val, y_test)\n",
"\n",
"    # Text report first: normalized class frequencies per split.\n",
"    headings = (\n",
"        \"Распределение классов в обучающей выборке:\",\n",
"        \"\\nРаспределение классов в контрольной выборке:\",\n",
"        \"\\nРаспределение классов в тестовой выборке:\",\n",
"    )\n",
"    for heading, target in zip(headings, splits):\n",
"        print(heading)\n",
"        print(target.value_counts(normalize=True))\n",
"\n",
"    # Then one bar chart per split, side by side on a shared y axis.\n",
"    fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
"    fig.suptitle('Распределение в различных выборках')\n",
"\n",
"    titles = ('Обучающая выборка', 'Контрольная выборка', 'Тестовая выборка')\n",
"    for ax, title, target in zip(axes, titles, splits):\n",
"        sns.barplot(x=target.value_counts().index, y=target.value_counts(normalize=True), ax=ax)\n",
"        ax.set_title(title)\n",
"        ax.set_xlabel(y_name)\n",
"    # Only the first panel gets the custom y label.\n",
"    axes[0].set_ylabel('Доля')\n",
"\n",
"    plt.show()\n",
"\n",
"analyze_balance(Y_train_df, Y_val_df, Y_test_df, 'Flexibility Level')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполним оверсемплинг для балансировки."
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в обучающей выборке:\n",
"Flexibility Level\n",
"2 0.333333\n",
"0 0.333333\n",
"1 0.333333\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в контрольной выборке:\n",
"Flexibility Level\n",
"1 0.333333\n",
"0 0.333333\n",
"2 0.333333\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в тестовой выборке:\n",
"Flexibility Level\n",
"1 0.477178\n",
"0 0.427386\n",
"2 0.095436\n",
"Name: proportion, dtype: float64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABboAAAHyCAYAAAAtJXgGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABt6klEQVR4nO3deXhMZ//H8U8SsltKSCyRqH2ppbEUjTVEG0W1lHoqokWLKlrtQ1tBl1S1StFaWulCS2mrv25qKS0ae9WuaqeViF2QkLl/f7gyjzFJRITJqffruua6zJn7nPM9MyPfmc+cucfNGGMEAAAAAAAAAIBFubu6AAAAAAAAAAAAbgRBNwAAAAAAAADA0gi6AQAAAAAAAACWRtANAAAAAAAAALA0gm4AAAAAAAAAgKURdAMAAAAAAAAALI2gGwAAAAAAAABgaQTdAAAAAAAAAABLI+gGAAAAgFyw2WxKTk7Wnj17XF0KAADAbY+gGwAAAABy6MiRIxo0aJBCQkLk6empEiVKqHr16jp9+rSrSwMAALitFXB1AQAAAHnto48+UkxMjP26l5eXypUrpzZt2ujll19WYGCgC6sDYFV//fWXWrRooYsXL2rgwIG6++67VaBAAfn4+MjPz8/V5QEAANzWCLoBAMC/1ujRo1W+fHlduHBBK1as0Pvvv68ffvhBW7Zska+vr6vLA2Axffv2laenp1atWqUyZcq4uhwAAABcgaAbAAD8a913332qV6+eJOmJJ55Q8eLFNW7cOH3zzTfq1q2bi6sDYCXr16/Xzz//rIULFxJyAwAA5EPM0Q0AAG4bLVu2lCTt3btXknT8+HE999xzuuuuu+Tv76/ChQvrvvvu0x9//OG07oULFzRy5EhVrlxZ3t7eKlWqlDp16qTdu3dLkvbt2yc3N7csL82bN7dva9myZXJzc9OcOXM0fPhwBQUFyc/PT+3bt9fBgwed9r169Wq1bdtWRYoUka+vr5o1a6aVK1dmeozNmzfPdP8jR450Gjtz5kyFhYXJx8dHxYoVU9euXTPdf3bHdiWbzabx48erRo0a8vb2VmBgoPr27asTJ044jAsNDVW7du2c9jNgwACnbWZW+9ixY53uU0lKTU1VbGysKlasKC8vLwUHB+v5559XampqpvfVla6+3wICAhQVFaUtW7bkaN2aNWtq/fr1aty4sXx8fFS+fHlNmTLFYVxaWppGjBihsLAwFSlSRH5+fgoPD9fSpUsdxu3cuVMtW7ZUUFCQ/TiefPJJHT9+3GnfPXv2vObj3bNnT4WGhjqsd/DgQfn4+MjNzU379u2T9L/H+aOPPnIYO3LkyEwflwEDBjjV065dO4d9ZWzzrbfeyuLec95+fHy83NzcNGPGDIdxr7/+utzc3PTDDz9kuS3p8vMr435wd3dXUFCQHnnkER04cOCG6lq1apW8vb21e/du1ahRQ15eXgoKClLfvn0zfWzmzp1r//8VEBCg//znPzp8+LDDmJ49e8rf31979uxRZGSk/Pz8VLp0aY0ePVrGGKd6r3xszpw5o7CwMJUvX17//POPfflbb72lxo0bq3jx4vLx8VFYWJjmzZvnsN8bvY8BAADyI87oBgAAt42MULp48eKSpD179mj+/Pnq3Lmzypcvr8TERE2dOlXNmjXTtm3bVLp0aUlSenq62rVrpyVLlqhr16565plndObMGS1atEhbtmxRhQoV7Pvo1q2b7r//fof9Dhs2LNN6XnvtNbm5uemFF15QUlKSxo8fr4iICG3cuFE+Pj6SpJ9//ln33XefwsLCFBsbK3d3d8XHx6tly5Zavny5GjRo4LTdsmXLKi4uTpJ09uxZPfXUU5nu++WXX1aXLl30xBNP6OjRo5o4caKaNm2q33//XUWLFnVap0+fPgoPD5ckffXVV/r6668dbu/bt699fvSBAwdq7969mjRpkn7//XetXLlSBQsWzPR+uB4nT560H9uVbDab2rdvrxUrVq
hPnz6qVq2aNm/erHfeeUd//vmn5s+ff81tV61aVS+++KKMMdq9e7fGjRun+++/3yEgzcqJEyd0//33q0uXLurWrZu++OILPfXUU/L09FSvXr0kSadPn9YHH3ygbt26qXfv3jpz5ow+/PBDRUZGas2aNapTp44kKSUlRWXLltUDDzygwoULa8uWLZo8ebIOHz6sb7/91mnfAQEBeuedd+zXH3vssWvWO2LECF24cOGa41whJiZGX331lYYMGaLWrVsrODhYmzdv1qhRo/T44487/f/KTHh4uPr06SObzaYtW7Zo/Pjx+vvvv7V8+fJc13Xs2DFduHBBTz31lFq2bKknn3xSu3fv1uTJk7V69WqtXr1aXl5ekv73OwH169dXXFycEhMTNWHCBK1cudLp/1d6erratm2re+65R2+++aYWLFig2NhYXbp0SaNHj860losXL+qhhx7SgQMHtHLlSpUqVcp+24QJE9S+fXt1795daWlpmj17tjp37qzvvvtOUVFReXYfAwAA5DsGAADgXyY+Pt5IMosXLzZHjx41Bw8eNLNnzzbFixc3Pj4+5tChQ8YYYy5cuGDS09Md1t27d6/x8vIyo0ePti+bMWOGkWTGjRvntC+bzWZfT5IZO3as05gaNWqYZs2a2a8vXbrUSDJlypQxp0+fti//4osvjCQzYcIE+7YrVapkIiMj7fsxxphz586Z8uXLm9atWzvtq3HjxqZmzZr260ePHjWSTGxsrH3Zvn37jIeHh3nttdcc1t28ebMpUKCA0/Jdu3YZSebjjz+2L4uNjTVXvpRcvny5kWRmzZrlsO6CBQucloeEhJioqCin2vv372+ufnl6de3PP/+8KVmypAkLC3O4Tz/99FPj7u5uli9f7rD+lClTjCSzcuVKp/1dqVmzZg7bM8aY4cOHG0kmKSnpmutKMm+//bZ9WWpqqqlTp44pWbKkSUtLM8YYc+nSJZOamuqw7okTJ0xgYKDp1atXtvvo16+f8ff3d1revXt3U758eYdlV99n0dHRJiQkxH59y5Ytxt3d3dx3331Gktm7d68xxpj9+/cbSWbGjBkO27v6sc7YR//+/Z3qiYqKcthXdv8vstv+P//8Y4oVK2Zat25tUlNTTd26dU25cuXMqVOnstxOhpCQEBMdHe2w7NFHHzW+vr43VFfG9VatWplLly7Zl2f8vZk4caIxxpi0tDRTsmRJU7NmTXP+/Hn7uO+++85IMiNGjLAvi46ONpLM008/bV9ms9lMVFSU8fT0NEePHnWoNz4+3thsNtO9e3fj6+trVq9e7VT3uXPnHK6npaWZmjVrmpYtWzosv5H7GAAAID9i6hIAAPCvFRERoRIlSig4OFhdu3aVv7+/vv76a/v8ul5eXnJ3v/xyKD09XceOHZO/v7+qVKmiDRs22Lfz5ZdfKiAgQE8//bTTPq6e0uF69OjRQ4UKFbJff/jhh1WqVCn7tAEbN27Url279Oijj+rYsWNKTk5WcnKyUlJS1KpVK/3666+y2WwO27xw4YK8vb2z3e9XX30lm82mLl262LeZnJysoKAgVapUyWkqjbS0NEmyn62amblz56pIkSJq3bq1wzbDwsLk7+/vtM2LFy86jEtOTr7mGcaHDx/WxIkT9fLLL8vf399p/9WqVVPVqlUdtpkxXc3V+89MRk1Hjx5VQkKCvv76a9WqVUsBAQHXXLdAgQLq27ev/bqnp6f69u2rpKQkrV+/XpLk4eEhT09PSZfPQD9+/LguXbqkevXqOTzfMpw6dUqJiYlasmSJvv/+ezVt2tRpTFpaWraPS2aGDRumu+++W507d3ZYXqJECUnSoUOHcrSdCxcuOD2GFy9ezHTsuXPnlJycrBMnTjhMyZGVoKAgTZ48WYsWLVJ4eLg2btyoGTNmqHDhwjmqLTU1VcnJyUpKStKiRYv0888/q1WrVjdclyQNGTJEHh4e9uuPPfaYAgMD9f3330uS1q1bp6SkJPXr18/h/2JUVJ
SqVq1qH3elK6eByZgWJi0tTYsXL3YaO3ToUM2aNUtffPFFpt/oyPg2iHT5mwanTp1SeHi403PsRu9jAACA/IapSwA
"text/plain": [
"<Figure size 1800x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"X_train_resampled, Y_train_resampled = ros.fit_resample(X_train_df, Y_train_df)\n",
"X_val_resampled, Y_val_resampled = ros.fit_resample(X_val_df, Y_val_df)\n",
"\n",
"analyze_balance(Y_train_resampled, Y_val_resampled, Y_test_df, 'Flexibility Level')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "Конструирование признаков. Для начала применим унитарное кодирование категориальных признаков (one-hot encoding), переведя их в бинарные векторы."
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Education Level_School</th>\n",
" <th>Education Level_University</th>\n",
" <th>Institution Type_Public</th>\n",
" <th>Gender_Male</th>\n",
" <th>Device_Mobile</th>\n",
" <th>Device_Tab</th>\n",
" <th>IT Student_Yes</th>\n",
" <th>Location_Town</th>\n",
" <th>Financial Condition_Poor</th>\n",
" <th>Financial Condition_Rich</th>\n",
" <th>Internet Type_Wifi</th>\n",
" <th>Network Type_3G</th>\n",
" <th>Network Type_4G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>18</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>23</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>18</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>23</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age Education Level_School Education Level_University \\\n",
"0 10 True False \n",
"1 18 False False \n",
"2 23 False True \n",
"3 18 True False \n",
"4 23 False True \n",
"\n",
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
"0 False True True False \n",
"1 False False True False \n",
"2 False True True False \n",
"3 True True True False \n",
"4 False False True False \n",
"\n",
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
"0 False True False \n",
"1 False True False \n",
"2 False True False \n",
"3 False True False \n",
"4 False False False \n",
"\n",
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
"0 True True False \n",
"1 False True False \n",
"2 False True False \n",
"3 True False False \n",
"4 False True False \n",
"\n",
" Network Type_4G \n",
"0 True \n",
"1 True \n",
"2 True \n",
"3 True \n",
"4 True "
]
},
"execution_count": 140,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_features = ['Education Level', 'Institution Type', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition', 'Internet Type', 'Network Type']\n",
"\n",
"train_encoded = pd.get_dummies(X_train_resampled, columns=cat_features, drop_first=True)\n",
"val_encoded = pd.get_dummies(X_val_resampled, columns=cat_features, drop_first=True)\n",
"test_encoded = pd.get_dummies(X_test_df, columns=cat_features, drop_first=True)\n",
"\n",
"train_encoded.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Применим дискретизацию к числовым признакам."
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Education Level_School</th>\n",
" <th>Education Level_University</th>\n",
" <th>Institution Type_Public</th>\n",
" <th>Gender_Male</th>\n",
" <th>Device_Mobile</th>\n",
" <th>Device_Tab</th>\n",
" <th>IT Student_Yes</th>\n",
" <th>Location_Town</th>\n",
" <th>Financial Condition_Poor</th>\n",
" <th>Financial Condition_Rich</th>\n",
" <th>Internet Type_Wifi</th>\n",
" <th>Network Type_3G</th>\n",
" <th>Network Type_4G</th>\n",
" <th>Age_Bin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Education Level_School Education Level_University \\\n",
"0 True False \n",
"1 False False \n",
"2 False True \n",
"3 True False \n",
"4 False True \n",
"\n",
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
"0 False True True False \n",
"1 False False True False \n",
"2 False True True False \n",
"3 True True True False \n",
"4 False False True False \n",
"\n",
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
"0 False True False \n",
"1 False True False \n",
"2 False True False \n",
"3 False True False \n",
"4 False False False \n",
"\n",
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
"0 True True False \n",
"1 False True False \n",
"2 False True False \n",
"3 True False False \n",
"4 False True False \n",
"\n",
" Network Type_4G Age_Bin \n",
"0 True young \n",
"1 True young \n",
"2 True young \n",
"3 True young \n",
"4 True young "
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num_features = ['Age']\n",
"\n",
"def discretize_features(df, features, bins, labels):\n",
" for feature in features:\n",
" df[f'{feature}_Bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
" df.drop(columns=[feature], inplace=True)\n",
" return df\n",
"\n",
"age_bins = [0, 25, 55, 100]\n",
"age_labels = [\"young\", \"middle-aged\", \"old\"]\n",
"\n",
"train_encoded = discretize_features(train_encoded, num_features, bins=age_bins, labels=age_labels)\n",
"val_encoded = discretize_features(val_encoded, num_features, bins=age_bins, labels=age_labels)\n",
"test_encoded = discretize_features(test_encoded, num_features, bins=age_bins, labels=age_labels)\n",
"\n",
"train_encoded.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "Применим ручной синтез признаков. К примеру, для этого датасета сделаем признак \"соответствие устройства для обучения\". Мобильные устройства часто менее удобны для учебы по сравнению с планшетами."
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Education Level_School</th>\n",
" <th>Education Level_University</th>\n",
" <th>Institution Type_Public</th>\n",
" <th>Gender_Male</th>\n",
" <th>Device_Mobile</th>\n",
" <th>Device_Tab</th>\n",
" <th>IT Student_Yes</th>\n",
" <th>Location_Town</th>\n",
" <th>Financial Condition_Poor</th>\n",
" <th>Financial Condition_Rich</th>\n",
" <th>Internet Type_Wifi</th>\n",
" <th>Network Type_3G</th>\n",
" <th>Network Type_4G</th>\n",
" <th>Age_Bin</th>\n",
" <th>Device Suitability</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Education Level_School Education Level_University \\\n",
"0 True False \n",
"1 False False \n",
"2 False True \n",
"3 True False \n",
"4 False True \n",
"\n",
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
"0 False True True False \n",
"1 False False True False \n",
"2 False True True False \n",
"3 True True True False \n",
"4 False False True False \n",
"\n",
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
"0 False True False \n",
"1 False True False \n",
"2 False True False \n",
"3 False True False \n",
"4 False False False \n",
"\n",
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
"0 True True False \n",
"1 False True False \n",
"2 False True False \n",
"3 True False False \n",
"4 False True False \n",
"\n",
" Network Type_4G Age_Bin Device Suitability \n",
"0 True young Low \n",
"1 True young Low \n",
"2 True young Low \n",
"3 True young Low \n",
"4 True young Low "
]
},
"execution_count": 142,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_encoded['Device Suitability'] = train_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
"val_encoded['Device Suitability'] = val_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
"test_encoded['Device Suitability'] = test_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
"\n",
"train_encoded.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Конструирование признаков с помощью фреймворка Featuretools."
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Education Level_School</th>\n",
" <th>Education Level_University</th>\n",
" <th>Institution Type_Public</th>\n",
" <th>Gender_Male</th>\n",
" <th>Device_Mobile</th>\n",
" <th>Device_Tab</th>\n",
" <th>IT Student_Yes</th>\n",
" <th>Location_Town</th>\n",
" <th>Financial Condition_Poor</th>\n",
" <th>Financial Condition_Rich</th>\n",
" <th>Internet Type_Wifi</th>\n",
" <th>Network Type_3G</th>\n",
" <th>Network Type_4G</th>\n",
" <th>Age_Bin</th>\n",
" <th>Device Suitability</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>young</td>\n",
" <td>Low</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Education Level_School Education Level_University \\\n",
"id \n",
"0 True False \n",
"1 False False \n",
"2 False True \n",
"3 True False \n",
"4 False True \n",
"\n",
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
"id \n",
"0 False True True False \n",
"1 False False True False \n",
"2 False True True False \n",
"3 True True True False \n",
"4 False False True False \n",
"\n",
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
"id \n",
"0 False True False \n",
"1 False True False \n",
"2 False True False \n",
"3 False True False \n",
"4 False False False \n",
"\n",
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
"id \n",
"0 True True False \n",
"1 False True False \n",
"2 False True False \n",
"3 True False False \n",
"4 False True False \n",
"\n",
" Network Type_4G Age_Bin Device Suitability \n",
"id \n",
"0 True young Low \n",
"1 True young Low \n",
"2 True young Low \n",
"3 True young Low \n",
"4 True young Low "
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ft_data = train_encoded.copy()\n",
"\n",
"es = ft.EntitySet(id=\"students\")\n",
"es = es.add_dataframe(dataframe_name=\"students_data\", dataframe=ft_data, index=\"id\", make_index=True)\n",
"\n",
"feature_matrix, feature_defs = ft.dfs(\n",
" entityset=es, \n",
" target_dataframe_name=\"students_data\",\n",
" max_depth=1\n",
")\n",
"\n",
"feature_matrix.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Featuretools не смог сделать новые признаки.\n",
"\n",
"Оценка качества набора признаков."
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Время обучения модели: 0.22 секунд\n"
]
}
],
"source": [
"train_encoded = pd.get_dummies(train_encoded, drop_first=True)\n",
"val_encoded = pd.get_dummies(val_encoded, drop_first=True)\n",
"test_encoded = pd.get_dummies(test_encoded, drop_first=True)\n",
"\n",
"cols = train_encoded.columns\n",
"\n",
"train_encoded = train_encoded.reindex(columns=cols, fill_value=0)\n",
"val_encoded = val_encoded.reindex(columns=cols, fill_value=0)\n",
"test_encoded = test_encoded.reindex(columns=cols, fill_value=0)\n",
"\n",
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"\n",
"start = time.time()\n",
"model.fit(train_encoded, Y_train_resampled)\n",
"train_time = time.time() - start\n",
"\n",
"print(f'Время обучения модели: {train_time:.2f} секунд')"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature Importance:\n",
" feature importance\n",
"9 Financial Condition_Rich 0.184028\n",
"3 Gender_Male 0.108992\n",
"8 Financial Condition_Poor 0.107030\n",
"2 Institution Type_Public 0.095663\n",
"10 Internet Type_Wifi 0.089925\n",
"7 Location_Town 0.078658\n",
"0 Education Level_School 0.061961\n",
"6 IT Student_Yes 0.055048\n",
"1 Education Level_University 0.049695\n",
"12 Network Type_4G 0.044837\n",
"4 Device_Mobile 0.042086\n",
"11 Network Type_3G 0.038541\n",
"13 Age_Bin_middle-aged 0.034876\n",
"15 Device Suitability_Low 0.004611\n",
"5 Device_Tab 0.004049\n",
"14 Age_Bin_old 0.000000\n"
]
}
],
"source": [
"# Получение важности признаков\n",
"importances = model.feature_importances_\n",
"feature_names = train_encoded.columns\n",
"\n",
"# Сортировка признаков по важности\n",
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
"\n",
"print(\"Feature Importance:\")\n",
"print(feature_importance)"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RMSE: 0.5652451456569942\n",
"R²: 0.22569473420679287\n",
"MAE: 0.2697095435684647 \n",
"\n",
"Кросс-валидация RMSE: 0.5705060311373475 \n",
"\n",
"Train RMSE: 0.5237418787490223\n",
"Train R²: 0.5885416666666667\n",
"Train MAE: 0.19791666666666666\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
}
],
"source": [
"y_pred = model.predict(test_encoded)\n",
"\n",
"# Анализ важности признаков\n",
"feature_importances = model.feature_importances_\n",
"feature_names = train_encoded.columns\n",
"\n",
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
"\n",
"rmse = mean_squared_error(Y_test_df, y_pred, squared=False)\n",
"r2 = r2_score(Y_test_df, y_pred)\n",
"mae = mean_absolute_error(Y_test_df, y_pred)\n",
"\n",
"print()\n",
"print(f\"RMSE: {rmse}\")\n",
"print(f\"R²: {r2}\")\n",
"print(f\"MAE: {mae} \\n\")\n",
"\n",
"# Кросс-валидация\n",
"scores = cross_val_score(model, train_encoded, Y_train_resampled, cv=5, scoring='neg_mean_squared_error')\n",
"rmse_cv = math.sqrt((-scores.mean()))\n",
"print(f\"Кросс-валидация RMSE: {rmse_cv} \\n\")\n",
"\n",
"# Проверка на переобучение\n",
"y_train_pred = model.predict(train_encoded)\n",
"\n",
"rmse_train = mean_squared_error(Y_train_resampled, y_train_pred, squared=False)\n",
"r2_train = r2_score(Y_train_resampled, y_train_pred)\n",
"mae_train = mean_absolute_error(Y_train_resampled, y_train_pred)\n",
"\n",
"print(f\"Train RMSE: {rmse_train}\")\n",
"print(f\"Train R²: {r2_train}\")\n",
"print(f\"Train MAE: {mae_train}\")\n",
"print()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}