1569 lines
142 KiB
Plaintext
1569 lines
142 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Лабораторная 3\n",
|
||
"\n",
|
||
"Датасет: Информация об онлайн-обучении учеников"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 121,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['Education Level', 'Institution Type', 'Gender', 'Age', 'Device',\n",
|
||
" 'IT Student', 'Location', 'Financial Condition', 'Internet Type',\n",
|
||
" 'Network Type', 'Flexibility Level'],\n",
|
||
" dtype='object')\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import numpy as np\n",
|
||
"import seaborn as sns\n",
|
||
"import featuretools as ft\n",
|
||
"from imblearn.over_sampling import RandomOverSampler\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"import time\n",
|
||
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
|
||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||
"from sklearn.model_selection import cross_val_score\n",
|
||
"\n",
|
||
"\n",
|
||
"\n",
|
||
"df = pd.read_csv(\"..\\\\static\\\\csv\\\\students_adaptability_level_online_education.csv\")\n",
|
||
"print(df.columns)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Столбцы:\n",
|
||
"\n",
|
||
"Education Level - уровень образования\\\n",
|
||
"Institution Type - тип учреждения\\\n",
|
||
"Gender - пол\\\n",
|
||
"Age - возраст\\\n",
|
||
"Device - устройство\\\n",
|
||
"IT Student - ученик IT направления или нет\\\n",
|
||
"Location - локация\\\n",
|
||
"Financial Condition - финансовое состояние\\\n",
|
||
"Internet Type - тип доступа к сети\\\n",
|
||
"Network Type - уровень сети\\\n",
|
||
"Flexibility Level - уровень приспособления"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 122,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 1205 entries, 0 to 1204\n",
|
||
"Data columns (total 11 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 Education Level 1205 non-null object\n",
|
||
" 1 Institution Type 1205 non-null object\n",
|
||
" 2 Gender 1205 non-null object\n",
|
||
" 3 Age 1205 non-null int64 \n",
|
||
" 4 Device 1205 non-null object\n",
|
||
" 5 IT Student 1205 non-null object\n",
|
||
" 6 Location 1205 non-null object\n",
|
||
" 7 Financial Condition 1205 non-null object\n",
|
||
" 8 Internet Type 1205 non-null object\n",
|
||
" 9 Network Type 1205 non-null object\n",
|
||
" 10 Flexibility Level 1205 non-null object\n",
|
||
"dtypes: int64(1), object(10)\n",
|
||
"memory usage: 103.7+ KB\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Education Level</th>\n",
|
||
" <th>Institution Type</th>\n",
|
||
" <th>Gender</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Device</th>\n",
|
||
" <th>IT Student</th>\n",
|
||
" <th>Location</th>\n",
|
||
" <th>Financial Condition</th>\n",
|
||
" <th>Internet Type</th>\n",
|
||
" <th>Network Type</th>\n",
|
||
" <th>Flexibility Level</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>University</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>23</td>\n",
|
||
" <td>Tab</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>Town</td>\n",
|
||
" <td>Mid</td>\n",
|
||
" <td>Wifi</td>\n",
|
||
" <td>4G</td>\n",
|
||
" <td>Moderate</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>University</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>23</td>\n",
|
||
" <td>Mobile</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>Town</td>\n",
|
||
" <td>Mid</td>\n",
|
||
" <td>Mobile Data</td>\n",
|
||
" <td>4G</td>\n",
|
||
" <td>Moderate</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>College</td>\n",
|
||
" <td>Public</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>18</td>\n",
|
||
" <td>Mobile</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>Town</td>\n",
|
||
" <td>Mid</td>\n",
|
||
" <td>Wifi</td>\n",
|
||
" <td>4G</td>\n",
|
||
" <td>Moderate</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>School</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>Mobile</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>Town</td>\n",
|
||
" <td>Mid</td>\n",
|
||
" <td>Mobile Data</td>\n",
|
||
" <td>4G</td>\n",
|
||
" <td>Moderate</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>School</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>18</td>\n",
|
||
" <td>Mobile</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>Town</td>\n",
|
||
" <td>Poor</td>\n",
|
||
" <td>Mobile Data</td>\n",
|
||
" <td>3G</td>\n",
|
||
" <td>Low</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Education Level Institution Type Gender Age Device IT Student Location \\\n",
|
||
"0 University Private Male 23 Tab No Town \n",
|
||
"1 University Private Female 23 Mobile No Town \n",
|
||
"2 College Public Female 18 Mobile No Town \n",
|
||
"3 School Private Female 11 Mobile No Town \n",
|
||
"4 School Private Female 18 Mobile No Town \n",
|
||
"\n",
|
||
" Financial Condition Internet Type Network Type Flexibility Level \n",
|
||
"0 Mid Wifi 4G Moderate \n",
|
||
"1 Mid Mobile Data 4G Moderate \n",
|
||
"2 Mid Wifi 4G Moderate \n",
|
||
"3 Mid Mobile Data 4G Moderate \n",
|
||
"4 Poor Mobile Data 3G Low "
|
||
]
|
||
},
|
||
"execution_count": 122,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.info()\n",
|
||
"df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Примеры бизнес-целей для датасета:\n",
|
||
"1. Улучшение доступа к онлайн-образованию для учеников с низким уровнем финансового обеспечения.\n",
|
||
"2. Повышение удовлетворённости учеников онлайн-обучением с учётом их устройств, типа соединения и местоположения."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Цели технического проекта:\n",
|
||
"\n",
|
||
"1. Провести анализ зависимости уровня приспособленности учеников от уровня интернет-соединения и используемых устройств.\n",
|
||
"2. Провести анализ влияния различных факторов (тип устройства, интернет-соединение, финансовое положение) на уровень приспособленности."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Проверяем данные на пропуски, дубликаты и выбросы."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 123,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Пустые значения по столбцам:\n",
|
||
"Education Level 0\n",
|
||
"Institution Type 0\n",
|
||
"Gender 0\n",
|
||
"Age 0\n",
|
||
"Device 0\n",
|
||
"IT Student 0\n",
|
||
"Location 0\n",
|
||
"Financial Condition 0\n",
|
||
"Internet Type 0\n",
|
||
"Network Type 0\n",
|
||
"Flexibility Level 0\n",
|
||
"dtype: int64\n",
|
||
"\n",
|
||
"Количество дубликатов: 980\n",
|
||
"\n",
|
||
"Статистический обзор данных:\n",
|
||
"\n",
|
||
"Коэффициент асимметрии для столбца 'Age': 0.024342017300169792\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"null_values = df.isnull().sum()\n",
|
||
"print(\"Пустые значения по столбцам:\")\n",
|
||
"print(null_values)\n",
|
||
"\n",
|
||
"duplicates = df.duplicated().sum()\n",
|
||
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
|
||
"\n",
|
||
"print(\"\\nСтатистический обзор данных:\")\n",
|
||
"df.describe()\n",
|
||
"\n",
|
||
"for column in df.select_dtypes(include=[np.number]).columns:\n",
|
||
" skewness = df[column].skew()\n",
|
||
" print(f\"\\nКоэффициент асимметрии для столбца '{column}': {skewness}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выбросы незначительны, дубликаты есть. Удаляем дубликаты и очищаем от шумов."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 148,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Шумы в датасете:\n",
|
||
"Empty DataFrame\n",
|
||
"Columns: [Education Level, Institution Type, Gender, Age, Device, IT Student, Location, Financial Condition, Internet Type, Network Type, Flexibility Level]\n",
|
||
"Index: []\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"cleaned_df = df.drop_duplicates()\n",
|
||
"\n",
|
||
"Q1 = df[\"Age\"].quantile(0.25)\n",
|
||
"Q3 = df[\"Age\"].quantile(0.75)\n",
|
||
"\n",
|
||
"IQR = Q3 - Q1\n",
|
||
"\n",
|
||
"threshold = 1.5 * IQR\n",
|
||
"lower_bound = Q1 - threshold\n",
|
||
"upper_bound = Q3 + threshold\n",
|
||
"\n",
|
||
"outliers = (df[\"Age\"] < lower_bound) | (df[\"Age\"] > upper_bound)\n",
|
||
"\n",
|
||
"print(\"Шумы в датасете:\")\n",
|
||
"print(df[outliers])\n",
|
||
"\n",
|
||
"median_score = df[\"Age\"].median()\n",
|
||
"df.loc[outliers, \"Age\"] = median_score"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Преобразуем строковые значения в столбце \"Уровень приспособления\" в числовые. Это понадобится для расчёта качества набора признаков."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 149,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "IntCastingNaNError",
|
||
"evalue": "Cannot convert non-finite values (NA or inf) to integer",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[1;31mIntCastingNaNError\u001b[0m Traceback (most recent call last)",
|
||
"Cell \u001b[1;32mIn[149], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m map_flexibility_to_int \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mLow\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mModerate\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mHigh\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m2\u001b[39m}\n\u001b[1;32m----> 3\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFlexibility Level\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mFlexibility Level\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmap_flexibility_to_int\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mint32\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
||
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\generic.py:6643\u001b[0m, in \u001b[0;36mNDFrame.astype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 6637\u001b[0m results \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 6638\u001b[0m ser\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy, errors\u001b[38;5;241m=\u001b[39merrors) \u001b[38;5;28;01mfor\u001b[39;00m _, ser \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems()\n\u001b[0;32m 6639\u001b[0m ]\n\u001b[0;32m 6641\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 6642\u001b[0m \u001b[38;5;66;03m# else, only a single dtype is given\u001b[39;00m\n\u001b[1;32m-> 6643\u001b[0m new_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mgr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 6644\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_constructor_from_mgr(new_data, axes\u001b[38;5;241m=\u001b[39mnew_data\u001b[38;5;241m.\u001b[39maxes)\n\u001b[0;32m 6645\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mastype\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:430\u001b[0m, in \u001b[0;36mBaseBlockManager.astype\u001b[1;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 427\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m using_copy_on_write():\n\u001b[0;32m 428\u001b[0m copy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m--> 430\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mastype\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 435\u001b[0m \u001b[43m \u001b[49m\u001b[43musing_cow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43musing_copy_on_write\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 436\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\managers.py:363\u001b[0m, in \u001b[0;36mBaseBlockManager.apply\u001b[1;34m(self, f, align_keys, **kwargs)\u001b[0m\n\u001b[0;32m 361\u001b[0m applied \u001b[38;5;241m=\u001b[39m b\u001b[38;5;241m.\u001b[39mapply(f, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 362\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 363\u001b[0m applied \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 364\u001b[0m result_blocks \u001b[38;5;241m=\u001b[39m extend_blocks(applied, result_blocks)\n\u001b[0;32m 366\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mfrom_blocks(result_blocks, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maxes)\n",
|
||
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\internals\\blocks.py:758\u001b[0m, in \u001b[0;36mBlock.astype\u001b[1;34m(self, dtype, copy, errors, using_cow, squeeze)\u001b[0m\n\u001b[0;32m 755\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCan not squeeze with more than one column.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 756\u001b[0m values \u001b[38;5;241m=\u001b[39m values[\u001b[38;5;241m0\u001b[39m, :] \u001b[38;5;66;03m# type: ignore[call-overload]\u001b[39;00m\n\u001b[1;32m--> 758\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array_safe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 760\u001b[0m new_values \u001b[38;5;241m=\u001b[39m maybe_coerce_values(new_values)\n\u001b[0;32m 762\u001b[0m refs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
||
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:237\u001b[0m, in \u001b[0;36mastype_array_safe\u001b[1;34m(values, dtype, copy, errors)\u001b[0m\n\u001b[0;32m 234\u001b[0m dtype \u001b[38;5;241m=\u001b[39m dtype\u001b[38;5;241m.\u001b[39mnumpy_dtype\n\u001b[0;32m 236\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 237\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 238\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[0;32m 239\u001b[0m \u001b[38;5;66;03m# e.g. _astype_nansafe can fail on object-dtype of strings\u001b[39;00m\n\u001b[0;32m 240\u001b[0m \u001b[38;5;66;03m# trying to convert to float\u001b[39;00m\n\u001b[0;32m 241\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
|
||
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:182\u001b[0m, in \u001b[0;36mastype_array\u001b[1;34m(values, dtype, copy)\u001b[0m\n\u001b[0;32m 179\u001b[0m values \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[0;32m 181\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 182\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43m_astype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 184\u001b[0m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[0;32m 185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n",
|
||
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:101\u001b[0m, in \u001b[0;36m_astype_nansafe\u001b[1;34m(arr, dtype, copy, skipna)\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mensure_string_array(\n\u001b[0;32m 97\u001b[0m arr, skipna\u001b[38;5;241m=\u001b[39mskipna, convert_na_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m 98\u001b[0m )\u001b[38;5;241m.\u001b[39mreshape(shape)\n\u001b[0;32m 100\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m np\u001b[38;5;241m.\u001b[39missubdtype(arr\u001b[38;5;241m.\u001b[39mdtype, np\u001b[38;5;241m.\u001b[39mfloating) \u001b[38;5;129;01mand\u001b[39;00m dtype\u001b[38;5;241m.\u001b[39mkind \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miu\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m--> 101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_astype_float_to_int_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 103\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[0;32m 104\u001b[0m \u001b[38;5;66;03m# if we have a datetime/timedelta array of objects\u001b[39;00m\n\u001b[0;32m 105\u001b[0m \u001b[38;5;66;03m# then coerce to datetime64[ns] and use DatetimeArray.astype\u001b[39;00m\n\u001b[0;32m 107\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mis_np_dtype(dtype, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mM\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
|
||
"File \u001b[1;32md:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\pandas\\core\\dtypes\\astype.py:145\u001b[0m, in \u001b[0;36m_astype_float_to_int_nansafe\u001b[1;34m(values, dtype, copy)\u001b[0m\n\u001b[0;32m 141\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 142\u001b[0m \u001b[38;5;124;03mastype with a check preventing converting NaN to an meaningless integer value.\u001b[39;00m\n\u001b[0;32m 143\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 144\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39misfinite(values)\u001b[38;5;241m.\u001b[39mall():\n\u001b[1;32m--> 145\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m IntCastingNaNError(\n\u001b[0;32m 146\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot convert non-finite values (NA or inf) to integer\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 147\u001b[0m )\n\u001b[0;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype\u001b[38;5;241m.\u001b[39mkind \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mu\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 149\u001b[0m \u001b[38;5;66;03m# GH#45151\u001b[39;00m\n\u001b[0;32m 150\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (values \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mall():\n",
|
||
"\u001b[1;31mIntCastingNaNError\u001b[0m: Cannot convert non-finite values (NA or inf) to integer"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"map_flexibility_to_int = {'Low': 0, 'Moderate': 1, 'High': 2}\n",
|
||
"\n",
|
||
"df['Flexibility Level'] = df['Flexibility Level'].map(map_flexibility_to_int).astype('int32')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Шумов в датасете нет. Разбиваем датасет на три выборки: обучающую, контрольную и тестовую."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 137,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Размер обучающей выборки: (723, 10)\n",
|
||
"Размер контрольной выборки: (241, 10)\n",
|
||
"Размер тестовой выборки: (241, 10)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"X = df.drop(columns=['Flexibility Level'])\n",
|
||
"Y = df['Flexibility Level']\n",
|
||
"\n",
|
||
"X_train_df, X_test_df, Y_train_df, Y_test_df = train_test_split(X, Y, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"X_train_df, X_val_df, Y_train_df, Y_val_df = train_test_split(X_train_df, Y_train_df, test_size=0.25, random_state=42)\n",
|
||
"\n",
|
||
"print(\"Размер обучающей выборки:\", X_train_df.shape)\n",
|
||
"print(\"Размер контрольной выборки:\",X_val_df.shape)\n",
|
||
"print(\"Размер тестовой выборки:\", X_test_df.shape)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Проверка сбалансированности данных."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 138,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Распределение классов в обучающей выборке:\n",
|
||
"Flexibility Level\n",
|
||
"1 0.531120\n",
|
||
"0 0.385892\n",
|
||
"2 0.082988\n",
|
||
"Name: proportion, dtype: float64\n",
|
||
"\n",
|
||
"Распределение классов в контрольной выборке:\n",
|
||
"Flexibility Level\n",
|
||
"1 0.522822\n",
|
||
"0 0.406639\n",
|
||
"2 0.070539\n",
|
||
"Name: proportion, dtype: float64\n",
|
||
"\n",
|
||
"Распределение классов в тестовой выборке:\n",
|
||
"Flexibility Level\n",
|
||
"1 0.477178\n",
|
||
"0 0.427386\n",
|
||
"2 0.095436\n",
|
||
"Name: proportion, dtype: float64\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1800x500 with 3 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"\n",
|
||
"def analyze_balance(y_train, y_val, y_test, y_name):\n",
|
||
" print(\"Распределение классов в обучающей выборке:\")\n",
|
||
" print(y_train.value_counts(normalize=True))\n",
|
||
" \n",
|
||
" print(\"\\nРаспределение классов в контрольной выборке:\")\n",
|
||
" print(y_val.value_counts(normalize=True))\n",
|
||
" \n",
|
||
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
|
||
" print(y_test.value_counts(normalize=True))\n",
|
||
"\n",
|
||
" fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
|
||
" fig.suptitle('Распределение в различных выборках')\n",
|
||
"\n",
|
||
" sns.barplot(x=y_train.value_counts().index, y=y_train.value_counts(normalize=True), ax=axes[0])\n",
|
||
" axes[0].set_title('Обучающая выборка')\n",
|
||
" axes[0].set_xlabel(y_name)\n",
|
||
" axes[0].set_ylabel('Доля')\n",
|
||
"\n",
|
||
" sns.barplot(x=y_val.value_counts().index, y=y_val.value_counts(normalize=True), ax=axes[1])\n",
|
||
" axes[1].set_title('Контрольная выборка')\n",
|
||
" axes[1].set_xlabel(y_name)\n",
|
||
"\n",
|
||
" sns.barplot(x=y_test.value_counts().index, y=y_test.value_counts(normalize=True), ax=axes[2])\n",
|
||
" axes[2].set_title('Тестовая выборка')\n",
|
||
" axes[2].set_xlabel(y_name)\n",
|
||
"\n",
|
||
" plt.show()\n",
|
||
"\n",
|
||
"analyze_balance(Y_train_df, Y_val_df, Y_test_df, 'Flexibility Level')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выполним оверсемплинг для балансировки."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 139,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Распределение классов в обучающей выборке:\n",
|
||
"Flexibility Level\n",
|
||
"2 0.333333\n",
|
||
"0 0.333333\n",
|
||
"1 0.333333\n",
|
||
"Name: proportion, dtype: float64\n",
|
||
"\n",
|
||
"Распределение классов в контрольной выборке:\n",
|
||
"Flexibility Level\n",
|
||
"1 0.333333\n",
|
||
"0 0.333333\n",
|
||
"2 0.333333\n",
|
||
"Name: proportion, dtype: float64\n",
|
||
"\n",
|
||
"Распределение классов в тестовой выборке:\n",
|
||
"Flexibility Level\n",
|
||
"1 0.477178\n",
|
||
"0 0.427386\n",
|
||
"2 0.095436\n",
|
||
"Name: proportion, dtype: float64\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1800x500 with 3 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"ros = RandomOverSampler(random_state=42)\n",
|
||
"\n",
|
||
"X_train_resampled, Y_train_resampled = ros.fit_resample(X_train_df, Y_train_df)\n",
|
||
"X_val_resampled, Y_val_resampled = ros.fit_resample(X_val_df, Y_val_df)\n",
|
||
"\n",
|
||
"analyze_balance(Y_train_resampled, Y_val_resampled, Y_test_df, 'Flexibility Level')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Конструирование признаков. Для начала применим унитарное кодирование категориальных признаков (one-hot encoding), переведя их в бинарные вектора."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 140,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Education Level_School</th>\n",
|
||
" <th>Education Level_University</th>\n",
|
||
" <th>Institution Type_Public</th>\n",
|
||
" <th>Gender_Male</th>\n",
|
||
" <th>Device_Mobile</th>\n",
|
||
" <th>Device_Tab</th>\n",
|
||
" <th>IT Student_Yes</th>\n",
|
||
" <th>Location_Town</th>\n",
|
||
" <th>Financial Condition_Poor</th>\n",
|
||
" <th>Financial Condition_Rich</th>\n",
|
||
" <th>Internet Type_Wifi</th>\n",
|
||
" <th>Network Type_3G</th>\n",
|
||
" <th>Network Type_4G</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>10</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>18</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>23</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>18</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>23</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Age Education Level_School Education Level_University \\\n",
|
||
"0 10 True False \n",
|
||
"1 18 False False \n",
|
||
"2 23 False True \n",
|
||
"3 18 True False \n",
|
||
"4 23 False True \n",
|
||
"\n",
|
||
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
|
||
"0 False True True False \n",
|
||
"1 False False True False \n",
|
||
"2 False True True False \n",
|
||
"3 True True True False \n",
|
||
"4 False False True False \n",
|
||
"\n",
|
||
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
|
||
"0 False True False \n",
|
||
"1 False True False \n",
|
||
"2 False True False \n",
|
||
"3 False True False \n",
|
||
"4 False False False \n",
|
||
"\n",
|
||
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
|
||
"0 True True False \n",
|
||
"1 False True False \n",
|
||
"2 False True False \n",
|
||
"3 True False False \n",
|
||
"4 False True False \n",
|
||
"\n",
|
||
" Network Type_4G \n",
|
||
"0 True \n",
|
||
"1 True \n",
|
||
"2 True \n",
|
||
"3 True \n",
|
||
"4 True "
|
||
]
|
||
},
|
||
"execution_count": 140,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"cat_features = ['Education Level', 'Institution Type', 'Gender', 'Device', 'IT Student', 'Location', 'Financial Condition', 'Internet Type', 'Network Type']\n",
|
||
"\n",
|
||
"train_encoded = pd.get_dummies(X_train_resampled, columns=cat_features, drop_first=True)\n",
|
||
"val_encoded = pd.get_dummies(X_val_resampled, columns=cat_features, drop_first=True)\n",
|
||
"test_encoded = pd.get_dummies(X_test_df, columns=cat_features, drop_first=True)\n",
|
||
"\n",
|
||
"train_encoded.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Применим дискретизацию к числовым признакам."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 141,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Education Level_School</th>\n",
|
||
" <th>Education Level_University</th>\n",
|
||
" <th>Institution Type_Public</th>\n",
|
||
" <th>Gender_Male</th>\n",
|
||
" <th>Device_Mobile</th>\n",
|
||
" <th>Device_Tab</th>\n",
|
||
" <th>IT Student_Yes</th>\n",
|
||
" <th>Location_Town</th>\n",
|
||
" <th>Financial Condition_Poor</th>\n",
|
||
" <th>Financial Condition_Rich</th>\n",
|
||
" <th>Internet Type_Wifi</th>\n",
|
||
" <th>Network Type_3G</th>\n",
|
||
" <th>Network Type_4G</th>\n",
|
||
" <th>Age_Bin</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Education Level_School Education Level_University \\\n",
|
||
"0 True False \n",
|
||
"1 False False \n",
|
||
"2 False True \n",
|
||
"3 True False \n",
|
||
"4 False True \n",
|
||
"\n",
|
||
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
|
||
"0 False True True False \n",
|
||
"1 False False True False \n",
|
||
"2 False True True False \n",
|
||
"3 True True True False \n",
|
||
"4 False False True False \n",
|
||
"\n",
|
||
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
|
||
"0 False True False \n",
|
||
"1 False True False \n",
|
||
"2 False True False \n",
|
||
"3 False True False \n",
|
||
"4 False False False \n",
|
||
"\n",
|
||
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
|
||
"0 True True False \n",
|
||
"1 False True False \n",
|
||
"2 False True False \n",
|
||
"3 True False False \n",
|
||
"4 False True False \n",
|
||
"\n",
|
||
" Network Type_4G Age_Bin \n",
|
||
"0 True young \n",
|
||
"1 True young \n",
|
||
"2 True young \n",
|
||
"3 True young \n",
|
||
"4 True young "
|
||
]
|
||
},
|
||
"execution_count": 141,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"num_features = ['Age']\n",
|
||
"\n",
|
||
"def discretize_features(df, features, bins, labels):\n",
|
||
" for feature in features:\n",
|
||
" df[f'{feature}_Bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
|
||
" df.drop(columns=[feature], inplace=True)\n",
|
||
" return df\n",
|
||
"\n",
|
||
"age_bins = [0, 25, 55, 100]\n",
|
||
"age_labels = [\"young\", \"middle-aged\", \"old\"]\n",
|
||
"\n",
|
||
"train_encoded = discretize_features(train_encoded, num_features, bins=age_bins, labels=age_labels)\n",
|
||
"val_encoded = discretize_features(val_encoded, num_features, bins=age_bins, labels=age_labels)\n",
|
||
"test_encoded = discretize_features(test_encoded, num_features, bins=age_bins, labels=age_labels)\n",
|
||
"\n",
|
||
"train_encoded.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Применим ручной синтез признаков. К примеру, для этого датасета, сделаем признак \"соотвествие устройства для обучения\". Мобильные устройства часто менее удобны для учебы по сравнению с планшетами."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 142,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Education Level_School</th>\n",
|
||
" <th>Education Level_University</th>\n",
|
||
" <th>Institution Type_Public</th>\n",
|
||
" <th>Gender_Male</th>\n",
|
||
" <th>Device_Mobile</th>\n",
|
||
" <th>Device_Tab</th>\n",
|
||
" <th>IT Student_Yes</th>\n",
|
||
" <th>Location_Town</th>\n",
|
||
" <th>Financial Condition_Poor</th>\n",
|
||
" <th>Financial Condition_Rich</th>\n",
|
||
" <th>Internet Type_Wifi</th>\n",
|
||
" <th>Network Type_3G</th>\n",
|
||
" <th>Network Type_4G</th>\n",
|
||
" <th>Age_Bin</th>\n",
|
||
" <th>Device Suitability</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" <td>Low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" <td>Low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" <td>Low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" <td>Low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" <td>Low</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Education Level_School Education Level_University \\\n",
|
||
"0 True False \n",
|
||
"1 False False \n",
|
||
"2 False True \n",
|
||
"3 True False \n",
|
||
"4 False True \n",
|
||
"\n",
|
||
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
|
||
"0 False True True False \n",
|
||
"1 False False True False \n",
|
||
"2 False True True False \n",
|
||
"3 True True True False \n",
|
||
"4 False False True False \n",
|
||
"\n",
|
||
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
|
||
"0 False True False \n",
|
||
"1 False True False \n",
|
||
"2 False True False \n",
|
||
"3 False True False \n",
|
||
"4 False False False \n",
|
||
"\n",
|
||
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
|
||
"0 True True False \n",
|
||
"1 False True False \n",
|
||
"2 False True False \n",
|
||
"3 True False False \n",
|
||
"4 False True False \n",
|
||
"\n",
|
||
" Network Type_4G Age_Bin Device Suitability \n",
|
||
"0 True young Low \n",
|
||
"1 True young Low \n",
|
||
"2 True young Low \n",
|
||
"3 True young Low \n",
|
||
"4 True young Low "
|
||
]
|
||
},
|
||
"execution_count": 142,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"train_encoded['Device Suitability'] = train_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
|
||
"val_encoded['Device Suitability'] = val_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
|
||
"test_encoded['Device Suitability'] = test_encoded['Device_Tab'].apply(lambda x: \"High\" if x == True else \"Low\")\n",
|
||
"\n",
|
||
"train_encoded.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Конструирование признаков с помощью фреймворка Featuretools."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 143,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n",
|
||
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
||
" pd.to_datetime(\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Education Level_School</th>\n",
|
||
" <th>Education Level_University</th>\n",
|
||
" <th>Institution Type_Public</th>\n",
|
||
" <th>Gender_Male</th>\n",
|
||
" <th>Device_Mobile</th>\n",
|
||
" <th>Device_Tab</th>\n",
|
||
" <th>IT Student_Yes</th>\n",
|
||
" <th>Location_Town</th>\n",
|
||
" <th>Financial Condition_Poor</th>\n",
|
||
" <th>Financial Condition_Rich</th>\n",
|
||
" <th>Internet Type_Wifi</th>\n",
|
||
" <th>Network Type_3G</th>\n",
|
||
" <th>Network Type_4G</th>\n",
|
||
" <th>Age_Bin</th>\n",
|
||
" <th>Device Suitability</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>id</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" <td>Low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" <td>Low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" <td>Low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" <td>Low</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>young</td>\n",
|
||
" <td>Low</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Education Level_School Education Level_University \\\n",
|
||
"id \n",
|
||
"0 True False \n",
|
||
"1 False False \n",
|
||
"2 False True \n",
|
||
"3 True False \n",
|
||
"4 False True \n",
|
||
"\n",
|
||
" Institution Type_Public Gender_Male Device_Mobile Device_Tab \\\n",
|
||
"id \n",
|
||
"0 False True True False \n",
|
||
"1 False False True False \n",
|
||
"2 False True True False \n",
|
||
"3 True True True False \n",
|
||
"4 False False True False \n",
|
||
"\n",
|
||
" IT Student_Yes Location_Town Financial Condition_Poor \\\n",
|
||
"id \n",
|
||
"0 False True False \n",
|
||
"1 False True False \n",
|
||
"2 False True False \n",
|
||
"3 False True False \n",
|
||
"4 False False False \n",
|
||
"\n",
|
||
" Financial Condition_Rich Internet Type_Wifi Network Type_3G \\\n",
|
||
"id \n",
|
||
"0 True True False \n",
|
||
"1 False True False \n",
|
||
"2 False True False \n",
|
||
"3 True False False \n",
|
||
"4 False True False \n",
|
||
"\n",
|
||
" Network Type_4G Age_Bin Device Suitability \n",
|
||
"id \n",
|
||
"0 True young Low \n",
|
||
"1 True young Low \n",
|
||
"2 True young Low \n",
|
||
"3 True young Low \n",
|
||
"4 True young Low "
|
||
]
|
||
},
|
||
"execution_count": 143,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ft_data = train_encoded.copy()\n",
|
||
"\n",
|
||
"es = ft.EntitySet(id=\"students\")\n",
|
||
"es = es.add_dataframe(dataframe_name=\"students_data\", dataframe=ft_data, index=\"id\", make_index=True)\n",
|
||
"\n",
|
||
"feature_matrix, feature_defs = ft.dfs(\n",
|
||
" entityset=es, \n",
|
||
" target_dataframe_name=\"students_data\",\n",
|
||
" max_depth=1\n",
|
||
")\n",
|
||
"\n",
|
||
"feature_matrix.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Featuretools не смог сделать новые признаки.\n",
|
||
"\n",
|
||
"Оценка качества набора признаков."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 144,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Время обучения модели: 0.22 секунд\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"train_encoded = pd.get_dummies(train_encoded, drop_first=True)\n",
|
||
"val_encoded = pd.get_dummies(val_encoded, drop_first=True)\n",
|
||
"test_encoded = pd.get_dummies(test_encoded, drop_first=True)\n",
|
||
"\n",
|
||
"cols = train_encoded.columns\n",
|
||
"\n",
|
||
"train_encoded = train_encoded.reindex(columns=cols, fill_value=0)\n",
|
||
"val_encoded = val_encoded.reindex(columns=cols, fill_value=0)\n",
|
||
"test_encoded = test_encoded.reindex(columns=cols, fill_value=0)\n",
|
||
"\n",
|
||
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
|
||
"\n",
|
||
"start = time.time()\n",
|
||
"model.fit(train_encoded, Y_train_resampled)\n",
|
||
"train_time = time.time() - start\n",
|
||
"\n",
|
||
"print(f'Время обучения модели: {train_time:.2f} секунд')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 145,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Feature Importance:\n",
|
||
" feature importance\n",
|
||
"9 Financial Condition_Rich 0.184028\n",
|
||
"3 Gender_Male 0.108992\n",
|
||
"8 Financial Condition_Poor 0.107030\n",
|
||
"2 Institution Type_Public 0.095663\n",
|
||
"10 Internet Type_Wifi 0.089925\n",
|
||
"7 Location_Town 0.078658\n",
|
||
"0 Education Level_School 0.061961\n",
|
||
"6 IT Student_Yes 0.055048\n",
|
||
"1 Education Level_University 0.049695\n",
|
||
"12 Network Type_4G 0.044837\n",
|
||
"4 Device_Mobile 0.042086\n",
|
||
"11 Network Type_3G 0.038541\n",
|
||
"13 Age_Bin_middle-aged 0.034876\n",
|
||
"15 Device Suitability_Low 0.004611\n",
|
||
"5 Device_Tab 0.004049\n",
|
||
"14 Age_Bin_old 0.000000\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Получение важности признаков\n",
|
||
"importances = model.feature_importances_\n",
|
||
"feature_names = train_encoded.columns\n",
|
||
"\n",
|
||
"# Сортировка признаков по важности\n",
|
||
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
|
||
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
|
||
"\n",
|
||
"print(\"Feature Importance:\")\n",
|
||
"print(feature_importance)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 154,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"RMSE: 0.5652451456569942\n",
|
||
"R²: 0.22569473420679287\n",
|
||
"MAE: 0.2697095435684647 \n",
|
||
"\n",
|
||
"Кросс-валидация RMSE: 0.5705060311373475 \n",
|
||
"\n",
|
||
"Train RMSE: 0.5237418787490223\n",
|
||
"Train R²: 0.5885416666666667\n",
|
||
"Train MAE: 0.19791666666666666\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\ulstu\\cr3\\sem1\\MAI\\AIM-PIbd-31-Makarov-DV\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
|
||
"import math\n",
|
||
"y_pred = model.predict(test_encoded)\n",
|
||
"\n",
|
||
"# Анализ важности признаков\n",
|
||
"feature_importances = model.feature_importances_\n",
|
||
"feature_names = train_encoded.columns\n",
|
||
"\n",
|
||
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
|
||
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
|
||
"\n",
|
||
"rmse = mean_squared_error(Y_test_df, y_pred, squared=False)\n",
|
||
"r2 = r2_score(Y_test_df, y_pred)\n",
|
||
"mae = mean_absolute_error(Y_test_df, y_pred)\n",
|
||
"\n",
|
||
"print()\n",
|
||
"print(f\"RMSE: {rmse}\")\n",
|
||
"print(f\"R²: {r2}\")\n",
|
||
"print(f\"MAE: {mae} \\n\")\n",
|
||
"\n",
|
||
"# Кросс-валидация\n",
|
||
"scores = cross_val_score(model, train_encoded, Y_train_resampled, cv=5, scoring='neg_mean_squared_error')\n",
|
||
"rmse_cv = math.sqrt((-scores.mean()))\n",
|
||
"print(f\"Кросс-валидация RMSE: {rmse_cv} \\n\")\n",
|
||
"\n",
|
||
"# Проверка на переобучение\n",
|
||
"y_train_pred = model.predict(train_encoded)\n",
|
||
"\n",
|
||
"rmse_train = mean_squared_error(Y_train_resampled, y_train_pred, squared=False)\n",
|
||
"r2_train = r2_score(Y_train_resampled, y_train_pred)\n",
|
||
"mae_train = mean_absolute_error(Y_train_resampled, y_train_pred)\n",
|
||
"\n",
|
||
"print(f\"Train RMSE: {rmse_train}\")\n",
|
||
"print(f\"Train R²: {r2_train}\")\n",
|
||
"print(f\"Train MAE: {mae_train}\")\n",
|
||
"print()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.13.0"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|