1569 lines
142 KiB
Plaintext
1569 lines
142 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Лабораторная 3\n",
|
|||
|
"\n",
|
|||
|
"Датасет: Информация об онлайн обучении учеников"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 121,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Education Level', 'Institution Type', 'Gender', 'Age', 'Device',\n",
|
|||
|
" 'IT Student', 'Location', 'Financial Condition', 'Internet Type',\n",
|
|||
|
" 'Network Type', 'Flexibility Level'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"import time\n",
|
|||
|
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
|
|||
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
"from sklearn.model_selection import cross_val_score\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"..\\\\static\\\\csv\\\\students_adaptability_level_online_education.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Столбцы:\n",
|
|||
|
"\n",
|
|||
|
"Education Level - уровень образования\\\n",
|
|||
|
"Institution Type - тип учреждения\\\n",
|
|||
|
"Gender - пол\\\n",
|
|||
|
"Age - возраст\\\n",
|
|||
|
"Device - устройство\\\n",
|
|||
|
"IT Student - ученик IT направления или нет\\\n",
|
|||
|
"Location - локация\\\n",
|
|||
|
"Financial Condition - финансовое состояние\\\n",
|
|||
|
"Internet Type - тип доступа к сети\\\n",
|
|||
|
"Network Type - уровень сети\\\n",
|
|||
|
"Flexibility Level - уровень приспособления"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 122,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 1205 entries, 0 to 1204\n",
|
|||
|
"Data columns (total 11 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 Education Level 1205 non-null object\n",
|
|||
|
" 1 Institution Type 1205 non-null object\n",
|
|||
|
" 2 Gender 1205 non-null object\n",
|
|||
|
" 3 Age 1205 non-null int64 \n",
|
|||
|
" 4 Device 1205 non-null object\n",
|
|||
|
" 5 IT Student 1205 non-null object\n",
|
|||
|
" 6 Location 1205 non-null object\n",
|
|||
|
" 7 Financial Condition 1205 non-null object\n",
|
|||
|
" 8 Internet Type 1205 non-null object\n",
|
|||
|
" 9 Network Type 1205 non-null object\n",
|
|||
|
" 10 Flexibility Level 1205 non-null object\n",
|
|||
|
"dtypes: int64(1), object(10)\n",
|
|||
|
"memory usage: 103.7+ KB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Education Level</th>\n",
|
|||
|
" <th>Institution Type</th>\n",
|
|||
|
" <th>Gender</th>\n",
|
|||
|
" <th>Age</th>\n",
|
|||
|
" <th>Device</th>\n",
|
|||
|
" <th>IT Student</th>\n",
|
|||
|
" <th>Location</th>\n",
|
|||
|
" <th>Financial Condition</th>\n",
|
|||
|
" <th>Internet Type</th>\n",
|
|||
|
" <th>Network Type</th>\n",
|
|||
|
" <th>Flexibility Level</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>University</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>23</td>\n",
|
|||
|
" <td>Tab</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Town</td>\n",
|
|||
|
" <td>Mid</td>\n",
|
|||
|
" <td>Wifi</td>\n",
|
|||
|
" <td>4G</td>\n",
|
|||
|
" <td>Moderate</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>University</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>23</td>\n",
|
|||
|
" <td>Mobile</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Town</td>\n",
|
|||
|
" <td>Mid</td>\n",
|
|||
|
" <td>Mobile Data</td>\n",
|
|||
|
" <td>4G</td>\n",
|
|||
|
" <td>Moderate</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>College</td>\n",
|
|||
|
" <td>Public</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>18</td>\n",
|
|||
|
" <td>Mobile</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Town</td>\n",
|
|||
|
" <td>Mid</td>\n",
|
|||
|
" <td>Wifi</td>\n",
|
|||
|
" <td>4G</td>\n",
|
|||
|
" <td>Moderate</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>School</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>11</td>\n",
|
|||
|
" <td>Mobile</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Town</td>\n",
|
|||
|
" <td>Mid</td>\n",
|
|||
|
" <td>Mobile Data</td>\n",
|
|||
|
" <td>4G</td>\n",
|
|||
|
" <td>Moderate</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>School</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>18</td>\n",
|
|||
|
" <td>Mobile</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Town</td>\n",
|
|||
|
" <td>Poor</td>\n",
|
|||
|
" <td>Mobile Data</td>\n",
|
|||
|
" <td>3G</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Education Level Institution Type Gender Age Device IT Student Location \\\n",
|
|||
|
"0 University Private Male 23 Tab No Town \n",
|
|||
|
"1 University Private Female 23 Mobile No Town \n",
|
|||
|
"2 College Public Female 18 Mobile No Town \n",
|
|||
|
"3 School Private Female 11 Mobile No Town \n",
|
|||
|
"4 School Private Female 18 Mobile No Town \n",
|
|||
|
"\n",
|
|||
|
" Financial Condition Internet Type Network Type Flexibility Level \n",
|
|||
|
"0 Mid Wifi 4G Moderate \n",
|
|||
|
"1 Mid Mobile Data 4G Moderate \n",
|
|||
|
"2 Mid Wifi 4G Moderate \n",
|
|||
|
"3 Mid Mobile Data 4G Moderate \n",
|
|||
|
"4 Poor Mobile Data 3G Low "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 122,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.info()\n",
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Примеры бизнес-целей для датасета:\n",
|
|||
|
"1. Улучшение доступа к онлайн-образованию для учеников с низким уровнем финансового обеспечения.\n",
|
|||
|
"2. Повышение удовлетворенности учеников онлайн-обучением на основе их устройств, типу соединения, местоположения."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Цели технического проекта:\n",
|
|||
|
"\n",
|
|||
|
"1. Провести анализ зависимости учеников от уровня интернет-соединения и устройств\n",
|
|||
|
"2. Провести анализ влияния различных факторов (тип устройства, интернет-соединение, финансовое положение) на уровень приспособленности."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверяем на выбросы."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 123,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Пустые значения по столбцам:\n",
|
|||
|
"Education Level 0\n",
|
|||
|
"Institution Type 0\n",
|
|||
|
"Gender 0\n",
|
|||
|
"Age 0\n",
|
|||
|
"Device 0\n",
|
|||
|
"IT Student 0\n",
|
|||
|
"Location 0\n",
|
|||
|
"Financial Condition 0\n",
|
|||
|
"Internet Type 0\n",
|
|||
|
"Network Type 0\n",
|
|||
|
"Flexibility Level 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Количество дубликатов: 980\n",
|
|||
|
"\n",
|
|||
|
"Статистический обзор данных:\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'Age': 0.024342017300169792\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"null_values = df.isnull().sum()\n",
|
|||
|
"print(\"Пустые значения по столбцам:\")\n",
|
|||
|
"print(null_values)\n",
|
|||
|
"\n",
|
|||
|
"duplicates = df.duplicated().sum()\n",
|
|||
|
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"\\nСтатистический обзор данных:\")\n",
|
|||
|
"df.describe()\n",
|
|||
|
"\n",
|
|||
|
"for column in df.select_dtypes(include=[np.number]).columns:\n",
|
|||
|
" skewness = df[column].skew()\n",
|
|||
|
" print(f\"\\nКоэффициент асимметрии для столбца '{column}': {skewness}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выбросы незначительны, дубликаты есть. Удаляем дубликаты и очищаем от шумов."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 148,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Шумы в датасете:\n",
|
|||
|
"Empty DataFrame\n",
|
|||
|
"Columns: [Education Level, Institution Type, Gender, Age, Device, IT Student, Location, Financial Condition, Internet Type, Network Type, Flexibility Level]\n",
|
|||
|
"Index: []\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"cleaned_df = df.drop_duplicates()\n",
|
|||
|
"\n",
|
|||
|
"Q1 = df[\"Age\"].quantile(0.25)\n",
|
|||
|
"Q3 = df[\"Age\"].quantile(0.75)\n",
|
|||
|
"\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"lower_bound = Q1 - threshold\n",
|
|||
|
"upper_bound = Q3 + threshold\n",
|
|||
|
"\n",
|
|||
|
"outliers = (df[\"Age\"] < lower_bound) | (df[\"Age\"] > upper_bound)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Шумы в датасете:\")\n",
|
|||
|
"print(df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"median_score = df[\"Age\"].median()\n",
|
|||
|
"df.loc[outliers, \"Age\"] = median_score"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Преобразуем строковые значение в столбце \"Уровень приспособления\" в числовые значения. Это понадобится для расчёта качества набора признаков."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 149,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"ename": "IntCastingNaNError",
|
|||
|
"evalue": "Cannot convert non-finite values (NA or inf) to integer",
|
|||
|
"output_type": "error",
|
|||
|
"traceback": [
|
|||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
"\u001b[1;31mIntCastingNaNError\u001b[0m Traceback (most recent call last)",
|
|||
|
"Cell \u001b[1;32mIn[149], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m map_flexibility_to_int \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mLow\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mModerate\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mHigh\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m2\u001b[39m}\n\u001b[1;32m----> 3\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFlexibility Level\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mFlexibility Level\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmap_flexibility_to_int\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mint32\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
|||
|
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:6643\u001b[0m, in \u001b[0;36mNDFrame.astype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 6637\u001b[0m results \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 6638\u001b[0m ser\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy, errors\u001b[38;5;241m=\u001b[39merrors) \u001b[38;5;28;01mfor\u001b[39;00m _, ser \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems()\n\u001b[0;32m 6639\u001b[0m ]\n\u001b[0;32m 6641\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 6642\u001b[0m \u001b[38;5;66;03m# else, only a single dtype is given\u001b[39;00m\n\u001b[1;32m-> 6643\u001b[0m new_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mgr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 6644\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_constructor_from_mgr(new_data, axes\u001b[38;5;241m=\u001b[39mnew_data\u001b[38;5;241m.\u001b[39maxes)\n\u001b[0;32m 6645\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mastype\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
|||
|
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:430\u001b[0m, in \u001b[0;36mBaseBlockManager.astype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 427\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m using_copy_on_write():\n\u001b[0;32m 428\u001b[0m copy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m--> 430\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mastype\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 435\u001b[0m \u001b[43m \u001b[49m\u001b[43musing_cow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43musing_copy_on_write\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 436\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:363\u001b[0m, in \u001b[0;36mBaseBlockManager.apply\u001b[1;34m(self, f, align_keys, **kwargs)\u001b[0m\n\u001b[0;32m 361\u001b[0m applied \u001b[38;5;241m=\u001b[39m b\u001b[38;5;241m.\u001b[39mapply(f, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 362\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 363\u001b[0m applied \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 364\u001b[0m result_blocks \u001b[38;5;241m=\u001b[39m extend_blocks(applied, result_blocks)\n\u001b[0;32m 366\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mfrom_blocks(result_blocks, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maxes)\n",
|
|||
|
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\blocks.py:758\u001b[0m, in \u001b[0;36mBlock.astype\u001b[1;34m(self, dtype, copy, errors, using_cow, squeeze)\u001b[0m\n\u001b[0;32m 755\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCan not squeeze with more than one column.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 756\u001b[0m values \u001b[38;5;241m=\u001b[39m values[\u001b[38;5;241m0\u001b[39m, :] \u001b[38;5;66;03m# type: ignore[call-overload]\u001b[39;00m\n\u001b[1;32m--> 758\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array_safe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 760\u001b[0m new_values \u001b[38;5;241m=\u001b[39m maybe_coerce_values(new_values)\n\u001b[0;32m 762\u001b[0m refs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
|||
|
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:237\u001b[0m, in \u001b[0;36mastype_array_safe\u001b[1;34m(values, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 234\u001b[0m dtype \u001b[38;5;241m=\u001b[39m dtype\u001b[38;5;241m.\u001b[39mnumpy_dtype\n\u001b[0;32m 236\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 237\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 238\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[0;32m 239\u001b[0m \u001b[38;5;66;03m# e.g. _astype_nansafe can fail on object-dtype of strings\u001b[39;00m\n\u001b[0;32m 240\u001b[0m \u001b[38;5;66;03m# trying to convert to float\u001b[39;00m\n\u001b[0;32m 241\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
|
|||
|
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:182\u001b[0m, in \u001b[0;36mastype_array\u001b[1;34m(values, dtype, copy)\u001b[0m\n\u001b[0;32m 179\u001b[0m values \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[0;32m 181\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 182\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43m_astype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 184\u001b[0m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[0;32m 185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n",
|
|||
|
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:101\u001b[0m, in \u001b[0;36m_astype_nansafe\u001b[1;34m(arr, dtype, copy, skipna)\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mensure_string_array(\n\u001b[0;32m 97\u001b[0m arr, skipna\u001b[38;5;241m=\u001b[39mskipna, convert_na_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m 98\u001b[0m )\u001b[38;5;241m.\u001b[39mreshape(shape)\n\u001b[0;32m 100\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m np\u001b[38;5;241m.\u001b[39missubdtype(arr\u001b[38;5;241m.\u001b[39mdtype, np\u001b[38;5;241m.\u001b[39mfloating) \u001b[38;5;129;01mand\u001b[39;00m dtype\u001b[38;5;241m.\u001b[39mkind \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miu\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m--> 101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_astype_float_to_int_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 103\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[0;32m 104\u001b[0m \u001b[38;5;66;03m# if we have a datetime/timedelta array of objects\u001b[39;00m\n\u001b[0;32m 105\u001b[0m \u001b[38;5;66;03m# then coerce to datetime64[ns] and use DatetimeArray.astype\u001b[39;00m\n\u001b[0;32m 107\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mis_np_dtype(dtype, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mM\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
|
|||
|
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:145\u001b[0m, in \u001b[0;36m_astype_float_to_int_nansafe\u001b[1;34m(values, dtype, copy)\u001b[0m\n\u001b[0;32m 141\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 142\u001b[0m \u001b[38;5;124;03mastype with a check preventing converting NaN to an meaningless integer value.\u001b[39;00m\n\u001b[0;32m 143\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 144\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39misfinite(values)\u001b[38;5;241m.\u001b[39mall():\n\u001b[1;32m--> 145\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m IntCastingNaNError(\n\u001b[0;32m 146\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot convert non-finite values (NA or inf) to integer\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 147\u001b[0m )\n\u001b[0;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype\u001b[38;5;241m.\u001b[39mkind \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mu\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 149\u001b[0m \u001b[38;5;66;03m# GH#45151\u001b[39;00m\n\u001b[0;32m 150\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (values \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mall():\n",
|
|||
|
"\u001b[1;31mIntCastingNaNError\u001b[0m: Cannot convert non-finite values (NA or inf) to integer"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"map_flexibility_to_int = {'Low': 0, 'Moderate': 1, 'High': 2}\n",
|
|||
|
"\n",
|
|||
|
"df['Flexibility Level'] = df['Flexibility Level'].map(map_flexibility_to_int).astype('int32')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Шумов в датасете нет. Разбиваем датасет на три выборки: обучающую, контрольную и тестовую."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 137,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: (723, 10)\n",
|
|||
|
"Размер контрольной выборки: (241, 10)\n",
|
|||
|
"Размер тестовой выборки: (241, 10)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"X = df.drop(columns=['Flexibility Level'])\n",
|
|||
|
"Y = df['Flexibility Level']\n",
|
|||
|
"\n",
|
|||
|
"X_train_df, X_test_df, Y_train_df, Y_test_df = train_test_split(X, Y, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"X_train_df, X_val_df, Y_train_df, Y_val_df = train_test_split(X_train_df, Y_train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", X_train_df.shape)\n",
|
|||
|
"print(\"Размер контрольной выборки:\",X_val_df.shape)\n",
|
|||
|
"print(\"Размер тестовой выборки:\", X_test_df.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверка сбалансированности данных."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 138,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в обучающей выборке:\n",
|
|||
|
"Flexibility Level\n",
|
|||
|
"1 0.531120\n",
|
|||
|
"0 0.385892\n",
|
|||
|
"2 0.082988\n",
|
|||
|
"Name: proportion, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в контрольной выборке:\n",
|
|||
|
"Flexibility Level\n",
|
|||
|
"1 0.522822\n",
|
|||
|
"0 0.406639\n",
|
|||
|
"2 0.070539\n",
|
|||
|
"Name: proportion, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в тестовой выборке:\n",
|
|||
|
"Flexibility Level\n",
|
|||
|
"1 0.477178\n",
|
|||
|
"0 0.427386\n",
|
|||
|
"2 0.095436\n",
|
|||
|
"Name: proportion, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABboAAAHyCAYAAAAtJXgGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABuX0lEQVR4nO3de3zO9f/H8ec2dnaIzeYwm5wPOTSH0JwaU4QUkW9mCoWEUl8qQ4cl5RDKoaxEEZV+neQQheacnCUmp2yGHIaNXe/fH267vi7XNjOHa5887rfbdbu5Ptf78/m8Ptd12eu6ntfnel9uxhgjAAAAAAAAAAAsyt3VBQAAAAAAAAAAcD0IugEAAAAAAAAAlkbQDQAAAAAAAACwNIJuAAAAAAAAAIClEXQDAAAAAAAAACyNoBsAAAAAAAAAYGkE3QAAAAAAAAAASyPoBgAAAAAAAABYGkE3AAAAAOSBzWZTSkqK9u7d6+pSAAAAbnsE3QAAAACQS0eOHNHAgQMVGhoqT09PBQYGqlq1ajp16pSrSwMAALitFXB1AQAAADfaRx99pJiYGPt1Ly8vlS1bVq1atdIrr7yioKAgF1YHwKr+/PNPNW/eXBcuXNCAAQN09913q0CBAvLx8ZGfn5+rywMAALitEXQDAIB/rVGjRqlcuXI6f/68Vq5cqffff1/ff/+9tm7dKl9fX1eXB8Bi+vTpI09PT61evVqlS5d2dTkAAAC4DEE3AAD417r//vtVt25dSdKTTz6p4sWLa+zYsfr666/VtWtXF1cHwEo2bNign376SYsWLSLkBgAAyIeYoxsAANw2WrRoIUlKTEyUJB0/flzPP/+87rrrLvn7+6tw4cK6//779fvvvzute/78eY0YMUKVKlWSt7e3SpYsqY4dO2rPnj2SpH379snNzS3bS7NmzezbWr58udzc3DR37lwNGzZMwcHB8vPzU7t27XTgwAGnfa9Zs0atW7dWkSJF5Ovrq6ZNm2rVqlVZHmOzZs2y3P+IESOcxs6aNUvh4eHy8fFRsWLF1KVLlyz3n9OxXc5ms2n8+PGqXr26vL29FRQUpD59+ujEiRMO48LCwtS2bVun/fTv399pm1nVPmbMGKf7VJLS0tIUGxurChUqyMvLSyEhIXrhhReUlpaW5X11uSvvt4CAALVp00Zbt27N1bo1atTQhg0b1KhRI/n4+KhcuXKaMmWKw7j09HQNHz5c4eHhKlKkiPz8/BQREaFly5Y5jNu1a5datGih4OBg+3E89dRTOn78uNO+e/TocdXHu0ePHgoLC3NY78CBA/Lx8ZGbm5v27dsn6X+P80cffeQwdsSIEVk+Lv3793eqp23btg77ytzm22+/nc2957z9+Ph4ubm5acaMGQ7j3njjDbm5uen777/PdlvSpedX5v3g7u6u4OBgPfroo9q/f/911bV69Wp5e3trz549ql69ury8vBQcHKw+ffpk+djMmzfP/v8rICBA//nPf3To0CGHMT169JC/v7/27t2rqKgo+fn5qVSpUho1apSMMU71Xv7YnD59WuHh4SpXrpz+/vtv+/K3335bjRo1UvHixeXj46Pw8HDNnz/fYb/Xex8DAADkR5zRDQAAbhuZoXTx4sUlSXv37tWCBQvUqVMnlStXTklJSZo6daqaNm2q7du3q1SpUpKkjIwMtW3bVkuXLlWXLl307LPP6vTp01q8eLG2bt2q8uXL2/fRtWtXPfDAAw77HTp0aJb1vP7663Jzc9OLL76o5ORkjR8/XpGRkdq0aZN8fHwkST/99JPuv/9+hYeHKzY2Vu7u7oqPj1eLFi20YsUK1a9f32m7ZcqUUVxcnCTpzJkzevrpp7Pc9yuvvKLOnTvrySef1NGjRzVx4kQ1adJEv/32m4oWLeq0Tu/evRURESFJ+vLLL/XVV1853N6nTx/7/OgDBgxQYmKiJk2apN9++02rVq1SwYIFs7wfrsU///xjP7bL2Ww2tWvXTi
tXrlTv3r1VtWpVbdmyRePGjdMff/yhBQsWXHXbVapU0UsvvSRjjPbs2aOxY8fqgQcecAhIs3PixAk98MAD6ty5s7p27arPP/9cTz/9tDw9PdWzZ09J0qlTp/TBBx+oa9eu6tWrl06fPq0PP/xQUVFRWrt2rWrXri1JSk1NVZkyZfTggw+qcOHC2rp1qyZPnqxDhw7pm2++cdp3QECAxo0bZ7/++OOPX7Xe4cOH6/z581cd5woxMTH68ssvNXjwYLVs2VIhISHasmWLRo4cqSeeeMLp/1dWIiIi1Lt3b9lsNm3dulXjx4/X4cOHtWLFijzXdezYMZ0/f15PP/20WrRooaeeekp79uzR5MmTtWbNGq1Zs0ZeXl6S/vc7AfXq1VNcXJySkpI0YcIErVq1yun/V0ZGhlq3bq177rlHb731lhYuXKjY2FhdvHhRo0aNyrKWCxcu6OGHH9b+/fu1atUqlSxZ0n7bhAkT1K5dO3Xr1k3p6emaM2eOOnXqpG+//VZt2rS5YfcxAABAvmMAAAD+ZeLj440ks2TJEnP06FFz4MABM2fOHFO8eHHj4+NjDh48aIwx5vz58yYjI8Nh3cTEROPl5WVGjRplXzZjxgwjyYwdO9ZpXzabzb6eJDNmzBinMdWrVzdNmza1X1+2bJmRZEqXLm1OnTplX/75558bSWbChAn2bVesWNFERUXZ92OMMWfPnjXlypUzLVu2dNpXo0aNTI0aNezXjx49aiSZ2NhY+7J9+/YZDw8P8/rrrzusu2XLFlOgQAGn5bt37zaSzMcff2xfFhsbay5/KblixQojycyePdth3YULFzotDw0NNW3atHGqvV+/fubKl6dX1v7CCy+YEiVKmPDwcIf79JNPPjHu7u5mxYoVDutPmTLFSDKrVq1y2t/lmjZt6rA9Y4wZNmyYkWSSk5Ovuq4k884779iXpaWlmdq1a5sSJUqY9PR0Y4wxFy9eNGlpaQ7rnjhxwgQFBZmePXvmuI++ffsaf39/p+XdunUz5cqVc1h25X0WHR1tQkND7de3bt1q3N3dzf33328kmcTERGOMMX/99ZeRZGbMmOGwvSsf68x99OvXz6meNm3aOOwrp/8XOW3/77//NsWKFTMtW7Y0aWlppk6dOqZs2bLm5MmT2W4nU2hoqImOjnZY9thjjxlfX9/rqivz+n333WcuXrxoX57592bixInGGGPS09NNiRIlTI0aNcy5c+fs47799lsjyQwfPty+LDo62kgyzzzzjH2ZzWYzbdq0MZ6enubo0aMO9cbHxxubzWa6detmfH19zZo1a5zqPnv2rMP19PR0U6NGDdOiRQuH5ddzHwMAAORHTF0CAAD+tSIjIxUYGKiQkBB16dJF/v7++uqrr+zz63p5ecnd/dLLoYyMDB07dkz+/v6qXLmyNm7caN/OF198oYCAAD3zzDNO+7hySodr0b17dxUqVMh+/ZFHHlHJkiXt0wZs2rRJu3fv1mOPPaZjx44pJSVFKSkpSk1N1X333adffvlFNpvNYZvnz5+Xt7d3jvv98ssvZbPZ1LlzZ/s2U1JSFBwcrIoVKzpNpZGeni5J9rNVszJv3jwVKVJELVu2dNhmeHi4/P39nbZ54cIFh3EpKSlXPcP40KFDmjhxol555RX5+/s77b9q1aqqUqWKwzYzp6u5cv9Zyazp6NGjSkhI0FdffaWaNWsqICDgqusWKFBAffr0sV/39PRUnz59lJycrA0bNkiSPDw85OnpKenSGejHjx/XxYsXVbduXYfnW6aTJ08qKSlJS5cu1XfffacmTZo4jUlPT8/xccnK0KFDdffdd6tTp04OywMDAyVJBw8ezNV2zp8/7/QYXrhwIcuxZ8+eVUpKik6cOOEwJUd2goODNXnyZC1evFgRERHatGmTZsyYocKFC+eqtrS0NKWkpCg5OVmLFy/WTz/9pPvuu++665KkwYMHy8PDw3798ccfV1BQkL777jtJ0vr165WcnKy+ffs6/F
9s06aNqlSpYh93ucungcmcFiY9PV1LlixxGjtkyBDNnj1bn3/+eZbf6Mj8Noh06ZsGJ0+eVEREhNNz7HrvYwAAgPy
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1800x500 with 3 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"\n",
|
|||
|
"def analyze_balance(y_train, y_val, y_test, y_name):\n",
|
|||
|
" print(\"Распределение классов в обучающей выборке:\")\n",
|
|||
|
" print(y_train.value_counts(normalize=True))\n",
|
|||
|
" \n",
|
|||
|
" print(\"\\nРаспределение классов в контрольной выборке:\")\n",
|
|||
|
" print(y_val.value_counts(normalize=True))\n",
|
|||
|
" \n",
|
|||
|
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
|
|||
|
" print(y_test.value_counts(normalize=True))\n",
|
|||
|
"\n",
|
|||
|
" fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
|
|||
|
" fig.suptitle('Распределение в различных выборках')\n",
|
|||
|
"\n",
|
|||
|
" sns.barplot(x=y_train.value_counts().index, y=y_train.value_counts(normalize=True), ax=axes[0])\n",
|
|||
|
" axes[0].set_title('Обучающая выборка')\n",
|
|||
|
" axes[0].set_xlabel(y_name)\n",
|
|||
|
" axes[0].set_ylabel('Доля')\n",
|
|||
|
"\n",
|
|||
|
" sns.barplot(x=y_val.value_counts().index, y=y_val.value_counts(normalize=True), ax=axes[1])\n",
|
|||
|
" axes[1].set_title('Контрольная выборка')\n",
|
|||
|
" axes[1].set_xlabel(y_name)\n",
|
|||
|
"\n",
|
|||
|
" sns.barplot(x=y_test.value_counts().index, y=y_test.value_counts(normalize=True), ax=axes[2])\n",
|
|||
|
" axes[2].set_title('Тестовая выборка')\n",
|
|||
|
" axes[2].set_xlabel(y_name)\n",
|
|||
|
"\n",
|
|||
|
" plt.show()\n",
|
|||
|
"\n",
|
|||
|
"analyze_balance(Y_train_df, Y_val_df, Y_test_df, 'Flexibility Level')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
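"Выполним оверсемплинг (RandomOverSampler) обучающей и контрольной выборок для балансировки классов; тестовая выборка остаётся без изменений."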
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 139,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в обучающей выборке:\n",
|
|||
|
"Flexibility Level\n",
|
|||
|
"2 0.333333\n",
|
|||
|
"0 0.333333\n",
|
|||
|
"1 0.333333\n",
|
|||
|
"Name: proportion, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в контрольной выборке:\n",
|
|||
|
"Flexibility Level\n",
|
|||
|
"1 0.333333\n",
|
|||
|
"0 0.333333\n",
|
|||
|
"2 0.333333\n",
|
|||
|
"Name: proportion, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в тестовой выборке:\n",
|
|||
|
"Flexibility Level\n",
|
|||
|
"1 0.477178\n",
|
|||
|
"0 0.427386\n",
|
|||
|
"2 0.095436\n",
|
|||
|
"Name: proportion, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABboAAAHyCAYAAAAtJXgGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABt6klEQVR4nO3deXhMZ//H8U8SsltKSCyRqH2ppbEUjTVEG0W1lHoqokWLKlrtQ1tBl1S1StFaWulCS2mrv25qKS0ae9WuaqeViF2QkLl/f7gyjzFJRITJqffruua6zJn7nPM9MyPfmc+cucfNGGMEAAAAAAAAAIBFubu6AAAAAAAAAAAAbgRBNwAAAAAAAADA0gi6AQAAAAAAAACWRtANAAAAAAAAALA0gm4AAAAAAAAAgKURdAMAAAAAAAAALI2gGwAAAAAAAABgaQTdAAAAAAAAAABLI+gGAAAAgFyw2WxKTk7Wnj17XF0KAADAbY+gGwAAAABy6MiRIxo0aJBCQkLk6empEiVKqHr16jp9+rSrSwMAALitFXB1AQAAAHnto48+UkxMjP26l5eXypUrpzZt2ujll19WYGCgC6sDYFV//fWXWrRooYsXL2rgwIG6++67VaBAAfn4+MjPz8/V5QEAANzWCLoBAMC/1ujRo1W+fHlduHBBK1as0Pvvv68ffvhBW7Zska+vr6vLA2Axffv2laenp1atWqUyZcq4uhwAAABcgaAbAAD8a913332qV6+eJOmJJ55Q8eLFNW7cOH3zzTfq1q2bi6sDYCXr16/Xzz//rIULFxJyAwAA5EPM0Q0AAG4bLVu2lCTt3btXknT8+HE999xzuuuuu+Tv76/ChQvrvvvu0x9//OG07oULFzRy5EhVrlxZ3t7eKlWqlDp16qTdu3dLkvbt2yc3N7csL82bN7dva9myZXJzc9OcOXM0fPhwBQUFyc/PT+3bt9fBgwed9r169Wq1bdtWRYoUka+vr5o1a6aVK1dmeozNmzfPdP8jR450Gjtz5kyFhYXJx8dHxYoVU9euXTPdf3bHdiWbzabx48erRo0a8vb2VmBgoPr27asTJ044jAsNDVW7du2c9jNgwACnbWZW+9ixY53uU0lKTU1VbGysKlasKC8vLwUHB+v5559XampqpvfVla6+3wICAhQVFaUtW7bkaN2aNWtq/fr1aty4sXx8fFS+fHlNmTLFYVxaWppGjBihsLAwFSlSRH5+fgoPD9fSpUsdxu3cuVMtW7ZUUFCQ/TiefPJJHT9+3GnfPXv2vObj3bNnT4WGhjqsd/DgQfn4+MjNzU379u2T9L/H+aOPPnIYO3LkyEwflwEDBjjV065dO4d9ZWzzrbfeyuLec95+fHy83NzcNGPGDIdxr7/+utzc3PTDDz9kuS3p8vMr435wd3dXUFCQHnnkER04cOCG6lq1apW8vb21e/du1ahRQ15eXgoKClLfvn0zfWzmzp1r//8VEBCg//znPzp8+LDDmJ49e8rf31979uxRZGSk/Pz8VLp0aY0ePVrGGKd6r3xszpw5o7CwMJUvX17//POPfflbb72lxo0bq3jx4vLx8VFYWJjmzZvnsN8bvY8BAADyI87oBgAAt42MULp48eKSpD179mj+/Pnq3Lmzypcvr8TERE2dOlXNmjXTtm3bVLp0aUlSenq62rVrpyVLlqhr16565plndObMGS1atEhbtmxRhQoV7Pvo1q2b7r//fof9Dhs2LNN6XnvtNbm5uemFF15QUlKSxo8fr4iICG3cuFE+Pj6SpJ9//ln33XefwsLCFBsbK3d3d8XHx6tly5Zavny5GjRo4LTdsmXLKi4uTpJ09uxZPfXUU5nu++WXX1aXLl30xBNP6OjRo5o4caKaNm2q33//XUWLFnVap0+fPgoPD5ckffXVV/r6668dbu/bt699fvSBAwdq7969mjRpkn7//XetXLlSBQsWzPR+uB4nT560H9uVbDab2rdvrxUrVq
hPnz6qVq2aNm/erHfeeUd//vmn5s+ff81tV61aVS+++KKMMdq9e7fGjRun+++/3yEgzcqJEyd0//33q0uXLurWrZu++OILPfXUU/L09FSvXr0kSadPn9YHH3ygbt26qXfv3jpz5ow+/PBDRUZGas2aNapTp44kKSUlRWXLltUDDzygwoULa8uWLZo8ebIOHz6sb7/91mnfAQEBeuedd+zXH3vssWvWO2LECF24cOGa41whJiZGX331lYYMGaLWrVsrODhYmzdv1qhRo/T44487/f/KTHh4uPr06SObzaYtW7Zo/Pjx+vvvv7V8+fJc13Xs2DFduHBBTz31lFq2bKknn3xSu3fv1uTJk7V69WqtXr1aXl5ekv73OwH169dXXFycEhMTNWHCBK1cudLp/1d6erratm2re+65R2+++aYWLFig2NhYXbp0SaNHj860losXL+qhhx7SgQMHtHLlSpUqVcp+24QJE9S+fXt1795daWlpmj17tjp37qzvvvtOUVFReXYfAwAA5DsGAADgXyY+Pt5IMosXLzZHjx41Bw8eNLNnzzbFixc3Pj4+5tChQ8YYYy5cuGDS09Md1t27d6/x8vIyo0ePti+bMWOGkWTGjRvntC+bzWZfT5IZO3as05gaNWqYZs2a2a8vXbrUSDJlypQxp0+fti//4osvjCQzYcIE+7YrVapkIiMj7fsxxphz586Z8uXLm9atWzvtq3HjxqZmzZr260ePHjWSTGxsrH3Zvn37jIeHh3nttdcc1t28ebMpUKCA0/Jdu3YZSebjjz+2L4uNjTVXvpRcvny5kWRmzZrlsO6CBQucloeEhJioqCin2vv372+ufnl6de3PP/+8KVmypAkLC3O4Tz/99FPj7u5uli9f7rD+lClTjCSzcuVKp/1dqVmzZg7bM8aY4cOHG0kmKSnpmutKMm+//bZ9WWpqqqlTp44pWbKkSUtLM8YYc+nSJZOamuqw7okTJ0xgYKDp1atXtvvo16+f8ff3d1revXt3U758eYdlV99n0dHRJiQkxH59y5Ytxt3d3dx3331Gktm7d68xxpj9+/cbSWbGjBkO27v6sc7YR//+/Z3qiYqKcthXdv8vstv+P//8Y4oVK2Zat25tUlNTTd26dU25cuXMqVOnstxOhpCQEBMdHe2w7NFHHzW+vr43VFfG9VatWplLly7Zl2f8vZk4caIxxpi0tDRTsmRJU7NmTXP+/Hn7uO+++85IMiNGjLAvi46ONpLM008/bV9ms9lMVFSU8fT0NEePHnWoNz4+3thsNtO9e3fj6+trVq9e7VT3uXPnHK6npaWZmjVrmpYtWzosv5H7GAAAID9i6hIAAPCvFRERoRIlSig4OFhdu3aVv7+/vv76a/v8ul5eXnJ3v/xyKD09XceOHZO/v7+qVKmiDRs22Lfz5ZdfKiAgQE8//bTTPq6e0uF69OjRQ4UKFbJff/jhh1WqVCn7tAEbN27Url279Oijj+rYsWNKTk5WcnKyUlJS1KpVK/3666+y2WwO27xw4YK8vb2z3e9XX30lm82mLl262LeZnJysoKAgVapUyWkqjbS0NEmyn62amblz56pIkSJq3bq1wzbDwsLk7+/vtM2LFy86jEtOTr7mGcaHDx/WxIkT9fLLL8vf399p/9WqVVPVqlUdtpkxXc3V+89MRk1Hjx5VQkKCvv76a9WqVUsBAQHXXLdAgQLq27ev/bqnp6f69u2rpKQkrV+/XpLk4eEhT09PSZfPQD9+/LguXbqkevXqOTzfMpw6dUqJiYlasmSJvv/+ezVt2tRpTFpaWraPS2aGDRumu+++W507d3ZYXqJECUnSoUOHcrSdCxcuOD2GFy9ezHTsuXPnlJycrBMnTjhMyZGVoKAgTZ48WYsWLVJ4eLg2btyoGTNmqHDhwjmqLTU1VcnJyUpKStKiRYv0888/q1WrVjdclyQNGTJEHh4e9uuPPfaYAgMD9f3330uS1q1bp6SkJPXr18/h/2JUVJ
SqVq1qH3elK6eByZgWJi0tTYsXL3YaO3ToUM2aNUtffPFFpt/oyPg2iHT5mwanTp1SeHi403PsRu9jAACA/IapSwA
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1800x500 with 3 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"X_train_resampled, Y_train_resampled = ros.fit_resample(X_train_df, Y_train_df)\n",
|
|||
|
"X_val_resampled, Y_val_resampled = ros.fit_resample(X_val_df, Y_val_df)\n",
|
|||
|
"\n",
|
|||
|
"analyze_balance(Y_train_resampled, Y_val_resampled, Y_test_df, 'Flexibility Level')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Конструирование признаков. Для начала применим унитарное кодирование категориальных признаков (one-hot encoding), переведя их в бинарные векторы."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 140,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Age</th>\n",
|
|||
|
" <th>Education Level_School</th>\n",
|
|||
|
" <th>Education Level_University</th>\n",
|
|||
|
" <th>Institution Type_Public</th>\n",
|
|||
|
" <th>Gender_Male</th>\n",
|
|||
|
" <th>Device_Mobile</th>\n",
|
|||
|
" <th>Device_Tab</th>\n",
|
|||
|
" <th>IT Student_Yes</th>\n",
|
|||
|
" <th>Location_Town</th>\n",
|
|||
|
" <th>Financial Condition_Poor</th>\n",
|
|||
|
" <th>Financial Condition_Rich</th>\n",
|
|||
|
" <th>Internet Type_Wifi</th>\n",
|
|||
|
" <th>Network Type_3G</th>\n",
|
|||
|
" <th>Network Type_4G</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>18</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>23</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>18</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>23</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Age Education Level_School Education Level_University \\\n",
|
|||
|
"0 10 True False \n",
|
|||
|
"1 18 False False \n",
|
|||
|
"2 23 False True \n",
|
|||
|
"3 18 True False \n",
|
|||
|
"4 23 False True \n",
|
|||
|
"\n",
|
|||
|
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
|
|||
|
"0 False True True False \n",
|
|||
|
"1 False False True False \n",
|
|||
|
"2 False True True False \n",
|
|||
|
"3 True True True False \n",
|
|||
|
"4 False False True False \n",
|
|||
|
"\n",
|
|||
|
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
|
|||
|
"0 False True False \n",
|
|||
|
"1 False True False \n",
|
|||
|
"2 False True False \n",
|
|||
|
"3 False True False \n",
|
|||
|
"4 False False False \n",
|
|||
|
"\n",
|
|||
|
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
|
|||
|
"0 True True False \n",
|
|||
|
"1 False True False \n",
|
|||
|
"2 False True False \n",
|
|||
|
"3 True False False \n",
|
|||
|
"4 False True False \n",
|
|||
|
"\n",
|
|||
|
" Network Type_4G \n",
|
|||
|
"0 True \n",
|
|||
|
"1 True \n",
|
|||
|
"2 True \n",
|
|||
|
"3 True \n",
|
|||
|
"4 True "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 140,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"cat_features = ['Education Level', 'Institution Type', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition', 'Internet Type', 'Network Type']\n",
|
|||
|
"\n",
|
|||
|
"train_encoded = pd.get_dummies(X_train_resampled, columns=cat_features, drop_first=True)\n",
|
|||
|
"val_encoded = pd.get_dummies(X_val_resampled, columns=cat_features, drop_first=True)\n",
|
|||
|
"test_encoded = pd.get_dummies(X_test_df, columns=cat_features, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"train_encoded.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Применим дискретизацию к числовым признакам."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 141,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Education Level_School</th>\n",
|
|||
|
" <th>Education Level_University</th>\n",
|
|||
|
" <th>Institution Type_Public</th>\n",
|
|||
|
" <th>Gender_Male</th>\n",
|
|||
|
" <th>Device_Mobile</th>\n",
|
|||
|
" <th>Device_Tab</th>\n",
|
|||
|
" <th>IT Student_Yes</th>\n",
|
|||
|
" <th>Location_Town</th>\n",
|
|||
|
" <th>Financial Condition_Poor</th>\n",
|
|||
|
" <th>Financial Condition_Rich</th>\n",
|
|||
|
" <th>Internet Type_Wifi</th>\n",
|
|||
|
" <th>Network Type_3G</th>\n",
|
|||
|
" <th>Network Type_4G</th>\n",
|
|||
|
" <th>Age_Bin</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Education Level_School Education Level_University \\\n",
|
|||
|
"0 True False \n",
|
|||
|
"1 False False \n",
|
|||
|
"2 False True \n",
|
|||
|
"3 True False \n",
|
|||
|
"4 False True \n",
|
|||
|
"\n",
|
|||
|
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
|
|||
|
"0 False True True False \n",
|
|||
|
"1 False False True False \n",
|
|||
|
"2 False True True False \n",
|
|||
|
"3 True True True False \n",
|
|||
|
"4 False False True False \n",
|
|||
|
"\n",
|
|||
|
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
|
|||
|
"0 False True False \n",
|
|||
|
"1 False True False \n",
|
|||
|
"2 False True False \n",
|
|||
|
"3 False True False \n",
|
|||
|
"4 False False False \n",
|
|||
|
"\n",
|
|||
|
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
|
|||
|
"0 True True False \n",
|
|||
|
"1 False True False \n",
|
|||
|
"2 False True False \n",
|
|||
|
"3 True False False \n",
|
|||
|
"4 False True False \n",
|
|||
|
"\n",
|
|||
|
" Network Type_4G Age_Bin \n",
|
|||
|
"0 True young \n",
|
|||
|
"1 True young \n",
|
|||
|
"2 True young \n",
|
|||
|
"3 True young \n",
|
|||
|
"4 True young "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 141,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"num_features = ['Age']\n",
|
|||
|
"\n",
|
|||
|
"def discretize_features(df, features, bins, labels):\n",
|
|||
|
" for feature in features:\n",
|
|||
|
" df[f'{feature}_Bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
|
|||
|
" df.drop(columns=[feature], inplace=True)\n",
|
|||
|
" return df\n",
|
|||
|
"\n",
|
|||
|
"age_bins = [0, 25, 55, 100]\n",
|
|||
|
"age_labels = [\"young\", \"middle-aged\", \"old\"]\n",
|
|||
|
"\n",
|
|||
|
"train_encoded = discretize_features(train_encoded, num_features, bins=age_bins, labels=age_labels)\n",
|
|||
|
"val_encoded = discretize_features(val_encoded, num_features, bins=age_bins, labels=age_labels)\n",
|
|||
|
"test_encoded = discretize_features(test_encoded, num_features, bins=age_bins, labels=age_labels)\n",
|
|||
|
"\n",
|
|||
|
"train_encoded.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Применим ручной синтез признаков. К примеру, для этого датасета сделаем признак \"соответствие устройства для обучения\" (Device Suitability). Мобильные устройства часто менее удобны для учебы по сравнению с планшетами."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 142,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Education Level_School</th>\n",
|
|||
|
" <th>Education Level_University</th>\n",
|
|||
|
" <th>Institution Type_Public</th>\n",
|
|||
|
" <th>Gender_Male</th>\n",
|
|||
|
" <th>Device_Mobile</th>\n",
|
|||
|
" <th>Device_Tab</th>\n",
|
|||
|
" <th>IT Student_Yes</th>\n",
|
|||
|
" <th>Location_Town</th>\n",
|
|||
|
" <th>Financial Condition_Poor</th>\n",
|
|||
|
" <th>Financial Condition_Rich</th>\n",
|
|||
|
" <th>Internet Type_Wifi</th>\n",
|
|||
|
" <th>Network Type_3G</th>\n",
|
|||
|
" <th>Network Type_4G</th>\n",
|
|||
|
" <th>Age_Bin</th>\n",
|
|||
|
" <th>Device Suitability</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Education Level_School Education Level_University \\\n",
|
|||
|
"0 True False \n",
|
|||
|
"1 False False \n",
|
|||
|
"2 False True \n",
|
|||
|
"3 True False \n",
|
|||
|
"4 False True \n",
|
|||
|
"\n",
|
|||
|
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
|
|||
|
"0 False True True False \n",
|
|||
|
"1 False False True False \n",
|
|||
|
"2 False True True False \n",
|
|||
|
"3 True True True False \n",
|
|||
|
"4 False False True False \n",
|
|||
|
"\n",
|
|||
|
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
|
|||
|
"0 False True False \n",
|
|||
|
"1 False True False \n",
|
|||
|
"2 False True False \n",
|
|||
|
"3 False True False \n",
|
|||
|
"4 False False False \n",
|
|||
|
"\n",
|
|||
|
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
|
|||
|
"0 True True False \n",
|
|||
|
"1 False True False \n",
|
|||
|
"2 False True False \n",
|
|||
|
"3 True False False \n",
|
|||
|
"4 False True False \n",
|
|||
|
"\n",
|
|||
|
" Network Type_4G Age_Bin Device Suitability \n",
|
|||
|
"0 True young Low \n",
|
|||
|
"1 True young Low \n",
|
|||
|
"2 True young Low \n",
|
|||
|
"3 True young Low \n",
|
|||
|
"4 True young Low "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 142,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"train_encoded['Device Suitability'] = train_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
|
|||
|
"val_encoded['Device Suitability'] = val_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
|
|||
|
"test_encoded['Device Suitability'] = test_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
|
|||
|
"\n",
|
|||
|
"train_encoded.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Конструирование признаков с помощью фреймворка Featuretools."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 143,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Education Level_School</th>\n",
|
|||
|
" <th>Education Level_University</th>\n",
|
|||
|
" <th>Institution Type_Public</th>\n",
|
|||
|
" <th>Gender_Male</th>\n",
|
|||
|
" <th>Device_Mobile</th>\n",
|
|||
|
" <th>Device_Tab</th>\n",
|
|||
|
" <th>IT Student_Yes</th>\n",
|
|||
|
" <th>Location_Town</th>\n",
|
|||
|
" <th>Financial Condition_Poor</th>\n",
|
|||
|
" <th>Financial Condition_Rich</th>\n",
|
|||
|
" <th>Internet Type_Wifi</th>\n",
|
|||
|
" <th>Network Type_3G</th>\n",
|
|||
|
" <th>Network Type_4G</th>\n",
|
|||
|
" <th>Age_Bin</th>\n",
|
|||
|
" <th>Device Suitability</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>young</td>\n",
|
|||
|
" <td>Low</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Education Level_School Education Level_University \\\n",
|
|||
|
"id \n",
|
|||
|
"0 True False \n",
|
|||
|
"1 False False \n",
|
|||
|
"2 False True \n",
|
|||
|
"3 True False \n",
|
|||
|
"4 False True \n",
|
|||
|
"\n",
|
|||
|
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
|
|||
|
"id \n",
|
|||
|
"0 False True True False \n",
|
|||
|
"1 False False True False \n",
|
|||
|
"2 False True True False \n",
|
|||
|
"3 True True True False \n",
|
|||
|
"4 False False True False \n",
|
|||
|
"\n",
|
|||
|
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
|
|||
|
"id \n",
|
|||
|
"0 False True False \n",
|
|||
|
"1 False True False \n",
|
|||
|
"2 False True False \n",
|
|||
|
"3 False True False \n",
|
|||
|
"4 False False False \n",
|
|||
|
"\n",
|
|||
|
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
|
|||
|
"id \n",
|
|||
|
"0 True True False \n",
|
|||
|
"1 False True False \n",
|
|||
|
"2 False True False \n",
|
|||
|
"3 True False False \n",
|
|||
|
"4 False True False \n",
|
|||
|
"\n",
|
|||
|
" Network Type_4G Age_Bin Device Suitability \n",
|
|||
|
"id \n",
|
|||
|
"0 True young Low \n",
|
|||
|
"1 True young Low \n",
|
|||
|
"2 True young Low \n",
|
|||
|
"3 True young Low \n",
|
|||
|
"4 True young Low "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 143,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Work on a copy of the encoded training frame\n",
|
|||
|
"ft_data = train_encoded.copy()\n",
|
|||
|
"\n",
|
|||
|
"# Single-dataframe EntitySet; make_index=True is passed so the 'id' index column is created by featuretools\n",
|
|||
|
"es = ft.EntitySet(id=\"students\").add_dataframe(\n",
|
|||
|
"    dataframe_name=\"students_data\",\n",
|
|||
|
"    dataframe=ft_data,\n",
|
|||
|
"    index=\"id\",\n",
|
|||
|
"    make_index=True,\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Deep feature synthesis limited to depth 1 on the only dataframe in the set\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name=\"students_data\", max_depth=1)\n",
|
|||
|
"\n",
|
|||
|
"feature_matrix.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
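|
"Featuretools не смог сделать новые признаки: в EntitySet всего одна таблица без связей, а трансформирующие примитивы при вызове `ft.dfs` не заданы.\n",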
|
|||
|
"\n",
|
|||
|
"Оценка качества набора признаков."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 144,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Время обучения модели: 0.22 секунд\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# One-hot encode every split; drop_first=True drops the first level of each encoded column\n",
|
|||
|
"train_encoded = pd.get_dummies(train_encoded, drop_first=True)\n",
|
|||
|
"val_encoded = pd.get_dummies(val_encoded, drop_first=True)\n",
|
|||
|
"test_encoded = pd.get_dummies(test_encoded, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"# The training columns are the reference layout: reindexing adds dummy columns missing from a split\n",
|
|||
|
"# (filled with 0) and discards columns the training split does not have\n",
|
|||
|
"cols = train_encoded.columns\n",
|
|||
|
"\n",
|
|||
|
"train_encoded = train_encoded.reindex(columns=cols, fill_value=0)\n",
|
|||
|
"val_encoded = val_encoded.reindex(columns=cols, fill_value=0)\n",
|
|||
|
"test_encoded = test_encoded.reindex(columns=cols, fill_value=0)\n",
|
|||
|
"\n",
|
|||
|
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# perf_counter is a monotonic, high-resolution clock intended for timing short intervals,\n",
|
|||
|
"# unlike time.time(), which follows the adjustable wall clock\n",
|
|||
|
"start = time.perf_counter()\n",
|
|||
|
"model.fit(train_encoded, Y_train_resampled)\n",
|
|||
|
"train_time = time.perf_counter() - start\n",
|
|||
|
"\n",
|
|||
|
"print(f'Время обучения модели: {train_time:.2f} секунд')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 145,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Feature Importance:\n",
|
|||
|
" feature importance\n",
|
|||
|
"9 Financial Condition_Rich 0.184028\n",
|
|||
|
"3 Gender_Male 0.108992\n",
|
|||
|
"8 Financial Condition_Poor 0.107030\n",
|
|||
|
"2 Institution Type_Public 0.095663\n",
|
|||
|
"10 Internet Type_Wifi 0.089925\n",
|
|||
|
"7 Location_Town 0.078658\n",
|
|||
|
"0 Education Level_School 0.061961\n",
|
|||
|
"6 IT Student_Yes 0.055048\n",
|
|||
|
"1 Education Level_University 0.049695\n",
|
|||
|
"12 Network Type_4G 0.044837\n",
|
|||
|
"4 Device_Mobile 0.042086\n",
|
|||
|
"11 Network Type_3G 0.038541\n",
|
|||
|
"13 Age_Bin_middle-aged 0.034876\n",
|
|||
|
"15 Device Suitability_Low 0.004611\n",
|
|||
|
"5 Device_Tab 0.004049\n",
|
|||
|
"14 Age_Bin_old 0.000000\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Collect the fitted forest's per-feature importances together with the column names\n",
|
|||
|
"importances = model.feature_importances_\n",
|
|||
|
"feature_names = train_encoded.columns\n",
|
|||
|
"\n",
|
|||
|
"# Build the table and order it from most to least important in a single chain\n",
|
|||
|
"feature_importance = (\n",
|
|||
|
"    pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
|
|||
|
"    .sort_values(by='importance', ascending=False)\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"print(\"Feature Importance:\")\n",
|
|||
|
"print(feature_importance)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 154,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"RMSE: 0.5652451456569942\n",
|
|||
|
"R²: 0.22569473420679287\n",
|
|||
|
"MAE: 0.2697095435684647 \n",
|
|||
|
"\n",
|
|||
|
"Кросс-валидация RMSE: 0.5705060311373475 \n",
|
|||
|
"\n",
|
|||
|
"Train RMSE: 0.5237418787490223\n",
|
|||
|
"Train R²: 0.5885416666666667\n",
|
|||
|
"Train MAE: 0.19791666666666666\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
|
|||
|
"import math\n",
|
|||
|
"y_pred = model.predict(test_encoded)\n",
|
|||
|
"\n",
|
|||
|
"# Feature importance analysis\n",
|
|||
|
"feature_importances = model.feature_importances_\n",
|
|||
|
"feature_names = train_encoded.columns\n",
|
|||
|
"\n",
|
|||
|
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
|
|||
|
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
|
|||
|
"\n",
|
|||
|
"# RMSE is computed as sqrt(MSE): the squared=False argument is deprecated since scikit-learn 1.4\n",
|
|||
|
"# and scheduled for removal in 1.6, so it is no longer passed\n",
|
|||
|
"# NOTE(review): model is a RandomForestClassifier, so these regression metrics are computed on\n",
|
|||
|
"# predicted class labels - confirm this is intended rather than accuracy/F1\n",
|
|||
|
"rmse = math.sqrt(mean_squared_error(Y_test_df, y_pred))\n",
|
|||
|
"r2 = r2_score(Y_test_df, y_pred)\n",
|
|||
|
"mae = mean_absolute_error(Y_test_df, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"print(f\"RMSE: {rmse}\")\n",
|
|||
|
"print(f\"R²: {r2}\")\n",
|
|||
|
"print(f\"MAE: {mae} \\n\")\n",
|
|||
|
"\n",
|
|||
|
"# Cross-validation: scores are negated MSE values, so flip the sign of the mean before the square root\n",
|
|||
|
"scores = cross_val_score(model, train_encoded, Y_train_resampled, cv=5, scoring='neg_mean_squared_error')\n",
|
|||
|
"rmse_cv = math.sqrt((-scores.mean()))\n",
|
|||
|
"print(f\"Кросс-валидация RMSE: {rmse_cv} \\n\")\n",
|
|||
|
"\n",
|
|||
|
"# Overfitting check: the same metrics on the training data, for comparison with the test values\n",
|
|||
|
"y_train_pred = model.predict(train_encoded)\n",
|
|||
|
"\n",
|
|||
|
"rmse_train = math.sqrt(mean_squared_error(Y_train_resampled, y_train_pred))\n",
|
|||
|
"r2_train = r2_score(Y_train_resampled, y_train_pred)\n",
|
|||
|
"mae_train = mean_absolute_error(Y_train_resampled, y_train_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Train RMSE: {rmse_train}\")\n",
|
|||
|
"print(f\"Train R²: {r2_train}\")\n",
|
|||
|
"print(f\"Train MAE: {mae_train}\")\n",
|
|||
|
"print()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.13.0"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|