PredictiveAnalytics/lab3.ipynb

1850 lines
64 KiB
Plaintext
Raw Permalink Normal View History

2025-01-07 00:58:14 +04:00
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Бизнес-цели для набора данных онлайн-обучения\n",
"\n",
2025-01-08 18:21:45 +04:00
"1. Определение уровня образования\n",
"2. Определение IT-направления\n"
2025-01-07 00:58:14 +04:00
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Age\n",
"23 374\n",
"11 353\n",
"18 278\n",
"9 81\n",
"27 68\n",
"10 51\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Обучающая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(723, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Age\n",
"23 224\n",
"11 212\n",
"18 167\n",
"9 48\n",
"27 41\n",
"10 31\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Контрольная выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(241, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Age\n",
"23 75\n",
"11 71\n",
"18 55\n",
"9 16\n",
"27 14\n",
"10 10\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Тестовая выборка: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(241, 3)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Age\n",
"23 75\n",
"11 70\n",
"18 56\n",
"9 17\n",
"27 13\n",
"10 10\n",
"Name: count, dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load the online-education dataset and split it into train / validation / test\n",
"# parts, stratified by the Age column.\n",
"import pandas as pd\n",
"\n",
"from src.utils import split_stratified_into_train_val_test\n",
"\n",
"df = pd.read_csv(\"data/students_education.csv\")\n",
"\n",
"# Number of observations per Age value in the full dataset\n",
"display(df.Age.value_counts())\n",
"\n",
"# Work on a copy of the three columns used for the split\n",
"data = df[[\"Age\", \"Device\", \"Education Level\"]].copy()\n",
"\n",
"# 60 / 20 / 20 split; the helper's result is unpacked into three frames and three y_* values\n",
"df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
"    data, stratify_colname=\"Age\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
")\n",
"\n",
"# Shape and per-Age counts of every part, to check the stratification\n",
"display(\"Обучающая выборка: \", df_train.shape)\n",
"display(df_train.Age.value_counts())\n",
"\n",
"display(\"Контрольная выборка: \", df_val.shape)\n",
"display(df_val.Age.value_counts())\n",
"\n",
"display(\"Тестовая выборка: \", df_test.shape)\n",
"display(df_test.Age.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Institution Type</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Device</th>\n",
" <th>IT Student</th>\n",
" <th>Location</th>\n",
" <th>Financial Condition</th>\n",
" <th>Internet Type</th>\n",
" <th>Network Type</th>\n",
" <th>Flexibility Level</th>\n",
" <th>education_College</th>\n",
" <th>education_School</th>\n",
" <th>education_University</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Private</td>\n",
" <td>Male</td>\n",
" <td>23</td>\n",
" <td>Tab</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>23</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Public</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Poor</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Low</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1200</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Low</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1201</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Rural</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1202</th>\n",
" <td>Private</td>\n",
" <td>Male</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1203</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Rural</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Low</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1204</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Poor</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1205 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" Institution Type Gender Age Device IT Student Location \\\n",
"0 Private Male 23 Tab No Town \n",
"1 Private Female 23 Mobile No Town \n",
"2 Public Female 18 Mobile No Town \n",
"3 Private Female 11 Mobile No Town \n",
"4 Private Female 18 Mobile No Town \n",
"... ... ... ... ... ... ... \n",
"1200 Private Female 18 Mobile No Town \n",
"1201 Private Female 18 Mobile No Rural \n",
"1202 Private Male 11 Mobile No Town \n",
"1203 Private Female 18 Mobile No Rural \n",
"1204 Private Female 11 Mobile No Town \n",
"\n",
" Financial Condition Internet Type Network Type Flexibility Level \\\n",
"0 Mid Wifi 4G Moderate \n",
"1 Mid Mobile Data 4G Moderate \n",
"2 Mid Wifi 4G Moderate \n",
"3 Mid Mobile Data 4G Moderate \n",
"4 Poor Mobile Data 3G Low \n",
"... ... ... ... ... \n",
"1200 Mid Wifi 4G Low \n",
"1201 Mid Wifi 4G Moderate \n",
"1202 Mid Mobile Data 3G Moderate \n",
"1203 Mid Wifi 4G Low \n",
"1204 Poor Mobile Data 3G Moderate \n",
"\n",
" education_College education_School education_University \n",
"0 False False True \n",
"1 False False True \n",
"2 True False False \n",
"3 False True False \n",
"4 False True False \n",
"... ... ... ... \n",
"1200 True False False \n",
"1201 True False False \n",
"1202 False True False \n",
"1203 True False False \n",
"1204 False True False \n",
"\n",
"[1205 rows x 13 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-hot encode 'Education Level' into education_* indicator columns\n",
"df_encoded = pd.get_dummies(\n",
"    data=df,\n",
"    prefix='education',\n",
"    columns=['Education Level'],\n",
")\n",
"\n",
"# Show the encoded frame\n",
"df_encoded"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Institution Type</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Device</th>\n",
" <th>IT Student</th>\n",
" <th>Location</th>\n",
" <th>Financial Condition</th>\n",
" <th>Internet Type</th>\n",
" <th>Network Type</th>\n",
" <th>Flexibility Level</th>\n",
" <th>education_College</th>\n",
" <th>education_School</th>\n",
" <th>education_University</th>\n",
" <th>age_group</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Private</td>\n",
" <td>Male</td>\n",
" <td>23</td>\n",
" <td>Tab</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>19-23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>23</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>19-23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Public</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Poor</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Low</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1200</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Low</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1201</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Rural</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1202</th>\n",
" <td>Private</td>\n",
" <td>Male</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1203</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Rural</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Low</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1204</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Poor</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1205 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" Institution Type Gender Age Device IT Student Location \\\n",
"0 Private Male 23 Tab No Town \n",
"1 Private Female 23 Mobile No Town \n",
"2 Public Female 18 Mobile No Town \n",
"3 Private Female 11 Mobile No Town \n",
"4 Private Female 18 Mobile No Town \n",
"... ... ... ... ... ... ... \n",
"1200 Private Female 18 Mobile No Town \n",
"1201 Private Female 18 Mobile No Rural \n",
"1202 Private Male 11 Mobile No Town \n",
"1203 Private Female 18 Mobile No Rural \n",
"1204 Private Female 11 Mobile No Town \n",
"\n",
" Financial Condition Internet Type Network Type Flexibility Level \\\n",
"0 Mid Wifi 4G Moderate \n",
"1 Mid Mobile Data 4G Moderate \n",
"2 Mid Wifi 4G Moderate \n",
"3 Mid Mobile Data 4G Moderate \n",
"4 Poor Mobile Data 3G Low \n",
"... ... ... ... ... \n",
"1200 Mid Wifi 4G Low \n",
"1201 Mid Wifi 4G Moderate \n",
"1202 Mid Mobile Data 3G Moderate \n",
"1203 Mid Wifi 4G Low \n",
"1204 Poor Mobile Data 3G Moderate \n",
"\n",
" education_College education_School education_University age_group \n",
"0 False False True 19-23 \n",
"1 False False True 19-23 \n",
"2 True False False 10-18 \n",
"3 False True False 10-18 \n",
"4 False True False 10-18 \n",
"... ... ... ... ... \n",
"1200 True False False 10-18 \n",
"1201 True False False 10-18 \n",
"1202 False True False 10-18 \n",
"1203 True False False 10-18 \n",
"1204 False True False 10-18 \n",
"\n",
"[1205 rows x 14 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Discretize 'Age' into right-closed bins: (0, 18], (18, 23], (23, 28]\n",
"bins = [0, 18, 23, 28]\n",
"# Build the labels from the bin edges so each label names the integer ages its bin\n",
"# really covers; a hand-written '10-18' would mislabel age 9, which occurs in the data.\n",
"labels = [f'{lower + 1}-{upper}' for lower, upper in zip(bins[:-1], bins[1:])]\n",
"# Read Age from df_encoded itself so the cell does not rely on df sharing its index\n",
"df_encoded['age_group'] = pd.cut(df_encoded['Age'], bins=bins, labels=labels)\n",
"\n",
"# Show the frame with the new column\n",
"df_encoded"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Institution Type</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Device</th>\n",
" <th>IT Student</th>\n",
" <th>Location</th>\n",
" <th>Financial Condition</th>\n",
" <th>Internet Type</th>\n",
" <th>Network Type</th>\n",
" <th>Flexibility Level</th>\n",
" <th>education_College</th>\n",
" <th>education_School</th>\n",
" <th>education_University</th>\n",
" <th>age_group</th>\n",
" <th>internet</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Private</td>\n",
" <td>Male</td>\n",
" <td>23</td>\n",
" <td>Tab</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>19-23</td>\n",
" <td>Wifi_4G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>23</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>19-23</td>\n",
" <td>Mobile Data_4G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Public</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Wifi_4G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Mobile Data_4G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Poor</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Low</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Mobile Data_3G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1200</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Low</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Wifi_4G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1201</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Rural</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Wifi_4G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1202</th>\n",
" <td>Private</td>\n",
" <td>Male</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Mobile Data_3G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1203</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Rural</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Low</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Wifi_4G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1204</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Poor</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Mobile Data_3G</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1205 rows × 15 columns</p>\n",
"</div>"
],
"text/plain": [
" Institution Type Gender Age Device IT Student Location \\\n",
"0 Private Male 23 Tab No Town \n",
"1 Private Female 23 Mobile No Town \n",
"2 Public Female 18 Mobile No Town \n",
"3 Private Female 11 Mobile No Town \n",
"4 Private Female 18 Mobile No Town \n",
"... ... ... ... ... ... ... \n",
"1200 Private Female 18 Mobile No Town \n",
"1201 Private Female 18 Mobile No Rural \n",
"1202 Private Male 11 Mobile No Town \n",
"1203 Private Female 18 Mobile No Rural \n",
"1204 Private Female 11 Mobile No Town \n",
"\n",
" Financial Condition Internet Type Network Type Flexibility Level \\\n",
"0 Mid Wifi 4G Moderate \n",
"1 Mid Mobile Data 4G Moderate \n",
"2 Mid Wifi 4G Moderate \n",
"3 Mid Mobile Data 4G Moderate \n",
"4 Poor Mobile Data 3G Low \n",
"... ... ... ... ... \n",
"1200 Mid Wifi 4G Low \n",
"1201 Mid Wifi 4G Moderate \n",
"1202 Mid Mobile Data 3G Moderate \n",
"1203 Mid Wifi 4G Low \n",
"1204 Poor Mobile Data 3G Moderate \n",
"\n",
" education_College education_School education_University age_group \\\n",
"0 False False True 19-23 \n",
"1 False False True 19-23 \n",
"2 True False False 10-18 \n",
"3 False True False 10-18 \n",
"4 False True False 10-18 \n",
"... ... ... ... ... \n",
"1200 True False False 10-18 \n",
"1201 True False False 10-18 \n",
"1202 False True False 10-18 \n",
"1203 True False False 10-18 \n",
"1204 False True False 10-18 \n",
"\n",
" internet \n",
"0 Wifi_4G \n",
"1 Mobile Data_4G \n",
"2 Wifi_4G \n",
"3 Mobile Data_4G \n",
"4 Mobile Data_3G \n",
"... ... \n",
"1200 Wifi_4G \n",
"1201 Wifi_4G \n",
"1202 Mobile Data_3G \n",
"1203 Wifi_4G \n",
"1204 Mobile Data_3G \n",
"\n",
"[1205 rows x 15 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Combined connectivity feature of the form '<Internet Type>_<Network Type>'\n",
"df_encoded['internet'] = df_encoded['Internet Type'].str.cat(df_encoded['Network Type'], sep='_')\n",
"df_encoded"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Institution Type</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Device</th>\n",
" <th>IT Student</th>\n",
" <th>Location</th>\n",
" <th>Financial Condition</th>\n",
" <th>Internet Type</th>\n",
" <th>Network Type</th>\n",
" <th>Flexibility Level</th>\n",
" <th>education_College</th>\n",
" <th>education_School</th>\n",
" <th>education_University</th>\n",
" <th>age_group</th>\n",
" <th>internet</th>\n",
" <th>age_normalized</th>\n",
" <th>age_standardized</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Private</td>\n",
" <td>Male</td>\n",
" <td>23</td>\n",
" <td>Tab</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>19-23</td>\n",
" <td>Wifi_4G</td>\n",
" <td>0.777778</td>\n",
" <td>1.018272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>23</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>19-23</td>\n",
" <td>Mobile Data_4G</td>\n",
" <td>0.777778</td>\n",
" <td>1.018272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Public</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Wifi_4G</td>\n",
" <td>0.500000</td>\n",
" <td>0.160338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Mobile Data_4G</td>\n",
" <td>0.111111</td>\n",
" <td>-1.040771</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Poor</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Low</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Mobile Data_3G</td>\n",
" <td>0.500000</td>\n",
" <td>0.160338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1200</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Low</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Wifi_4G</td>\n",
" <td>0.500000</td>\n",
" <td>0.160338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1201</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Rural</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Moderate</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Wifi_4G</td>\n",
" <td>0.500000</td>\n",
" <td>0.160338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1202</th>\n",
" <td>Private</td>\n",
" <td>Male</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Mid</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Mobile Data_3G</td>\n",
" <td>0.111111</td>\n",
" <td>-1.040771</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1203</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>18</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Rural</td>\n",
" <td>Mid</td>\n",
" <td>Wifi</td>\n",
" <td>4G</td>\n",
" <td>Low</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Wifi_4G</td>\n",
" <td>0.500000</td>\n",
" <td>0.160338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1204</th>\n",
" <td>Private</td>\n",
" <td>Female</td>\n",
" <td>11</td>\n",
" <td>Mobile</td>\n",
" <td>No</td>\n",
" <td>Town</td>\n",
" <td>Poor</td>\n",
" <td>Mobile Data</td>\n",
" <td>3G</td>\n",
" <td>Moderate</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10-18</td>\n",
" <td>Mobile Data_3G</td>\n",
" <td>0.111111</td>\n",
" <td>-1.040771</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1205 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" Institution Type Gender Age Device IT Student Location \\\n",
"0 Private Male 23 Tab No Town \n",
"1 Private Female 23 Mobile No Town \n",
"2 Public Female 18 Mobile No Town \n",
"3 Private Female 11 Mobile No Town \n",
"4 Private Female 18 Mobile No Town \n",
"... ... ... ... ... ... ... \n",
"1200 Private Female 18 Mobile No Town \n",
"1201 Private Female 18 Mobile No Rural \n",
"1202 Private Male 11 Mobile No Town \n",
"1203 Private Female 18 Mobile No Rural \n",
"1204 Private Female 11 Mobile No Town \n",
"\n",
" Financial Condition Internet Type Network Type Flexibility Level \\\n",
"0 Mid Wifi 4G Moderate \n",
"1 Mid Mobile Data 4G Moderate \n",
"2 Mid Wifi 4G Moderate \n",
"3 Mid Mobile Data 4G Moderate \n",
"4 Poor Mobile Data 3G Low \n",
"... ... ... ... ... \n",
"1200 Mid Wifi 4G Low \n",
"1201 Mid Wifi 4G Moderate \n",
"1202 Mid Mobile Data 3G Moderate \n",
"1203 Mid Wifi 4G Low \n",
"1204 Poor Mobile Data 3G Moderate \n",
"\n",
" education_College education_School education_University age_group \\\n",
"0 False False True 19-23 \n",
"1 False False True 19-23 \n",
"2 True False False 10-18 \n",
"3 False True False 10-18 \n",
"4 False True False 10-18 \n",
"... ... ... ... ... \n",
"1200 True False False 10-18 \n",
"1201 True False False 10-18 \n",
"1202 False True False 10-18 \n",
"1203 True False False 10-18 \n",
"1204 False True False 10-18 \n",
"\n",
" internet age_normalized age_standardized \n",
"0 Wifi_4G 0.777778 1.018272 \n",
"1 Mobile Data_4G 0.777778 1.018272 \n",
"2 Wifi_4G 0.500000 0.160338 \n",
"3 Mobile Data_4G 0.111111 -1.040771 \n",
"4 Mobile Data_3G 0.500000 0.160338 \n",
"... ... ... ... \n",
"1200 Wifi_4G 0.500000 0.160338 \n",
"1201 Wifi_4G 0.500000 0.160338 \n",
"1202 Mobile Data_3G 0.111111 -1.040771 \n",
"1203 Wifi_4G 0.500000 0.160338 \n",
"1204 Mobile Data_3G 0.111111 -1.040771 \n",
"\n",
"[1205 rows x 17 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"\n",
"# One scaler per rescaled copy of Age\n",
"minmax_scaler = MinMaxScaler()\n",
"standard_scaler = StandardScaler()\n",
"\n",
"# Both scalers expect a 2-D input, hence the double-bracket column selection\n",
"age_column = df_encoded[['Age']]\n",
"\n",
"# NOTE(review): both scalers are fitted on every row of df_encoded, not only on a\n",
"# training split - confirm that this is intended for this step.\n",
"# Min-max normalization\n",
"df_encoded['age_normalized'] = minmax_scaler.fit_transform(age_column)\n",
"\n",
"# Standardization\n",
"df_encoded['age_standardized'] = standard_scaler.fit_transform(age_column)\n",
"\n",
"df_encoded"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Institution Type</th>\n",
" <th>Gender</th>\n",
" <th>Age</th>\n",
" <th>Device</th>\n",
" <th>IT Student</th>\n",
" <th>Location</th>\n",
" <th>Financial Condition</th>\n",
" <th>Internet Type</th>\n",
" <th>Network Type</th>\n",
" <th>Flexibility Level</th>\n",
" <th>education_College</th>\n",
" <th>education_School</th>\n",
" <th>education_University</th>\n",
" <th>age_group</th>\n",
" <th>internet</th>\n",
" <th>age_normalized</th>\n",
" <th>age_standardized</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>23</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>0.777778</td>\n",
" <td>1.018272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0.777778</td>\n",
" <td>1.018272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>0.500000</td>\n",
" <td>0.160338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>11</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0.111111</td>\n",
" <td>-1.040771</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.500000</td>\n",
" <td>0.160338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1200</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>0.500000</td>\n",
" <td>0.160338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1201</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>0.500000</td>\n",
" <td>0.160338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1202</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>11</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.111111</td>\n",
" <td>-1.040771</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1203</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>0.500000</td>\n",
" <td>0.160338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1204</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>11</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.111111</td>\n",
" <td>-1.040771</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1205 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" Institution Type Gender Age Device IT Student Location \\\n",
"0 0 1 23 2 0 1 \n",
"1 0 0 23 1 0 1 \n",
"2 1 0 18 1 0 1 \n",
"3 0 0 11 1 0 1 \n",
"4 0 0 18 1 0 1 \n",
"... ... ... ... ... ... ... \n",
"1200 0 0 18 1 0 1 \n",
"1201 0 0 18 1 0 0 \n",
"1202 0 1 11 1 0 1 \n",
"1203 0 0 18 1 0 0 \n",
"1204 0 0 11 1 0 1 \n",
"\n",
" Financial Condition Internet Type Network Type Flexibility Level \\\n",
"0 0 1 2 2 \n",
"1 0 0 2 2 \n",
"2 0 1 2 2 \n",
"3 0 0 2 2 \n",
"4 1 0 1 1 \n",
"... ... ... ... ... \n",
"1200 0 1 2 1 \n",
"1201 0 1 2 2 \n",
"1202 0 0 1 2 \n",
"1203 0 1 2 1 \n",
"1204 1 0 1 2 \n",
"\n",
" education_College education_School education_University age_group \\\n",
"0 0 0 1 1 \n",
"1 0 0 1 1 \n",
"2 1 0 0 0 \n",
"3 0 1 0 0 \n",
"4 0 1 0 0 \n",
"... ... ... ... ... \n",
"1200 1 0 0 0 \n",
"1201 1 0 0 0 \n",
"1202 0 1 0 0 \n",
"1203 1 0 0 0 \n",
"1204 0 1 0 0 \n",
"\n",
" internet age_normalized age_standardized \n",
"0 5 0.777778 1.018272 \n",
"1 2 0.777778 1.018272 \n",
"2 5 0.500000 0.160338 \n",
"3 2 0.111111 -1.040771 \n",
"4 1 0.500000 0.160338 \n",
"... ... ... ... \n",
"1200 5 0.500000 0.160338 \n",
"1201 5 0.500000 0.160338 \n",
"1202 1 0.111111 -1.040771 \n",
"1203 5 0.500000 0.160338 \n",
"1204 1 0.111111 -1.040771 \n",
"\n",
"[1205 rows x 17 columns]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"# Replace the values of 'education_College' with integer class codes.\n",
"# LabelEncoder numbers the sorted distinct values starting from 0.\n",
"label_encoder = LabelEncoder()\n",
"college_column = df_encoded['education_College']\n",
"label_encoder.fit(college_column)\n",
"df_encoded['education_College'] = label_encoder.transform(college_column)\n",
"\n",
"# Show the frame after encoding\n",
"df_encoded"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Предсказательная способность': np.float64(0.941908713692946),\n",
" 'Скорость вычисления (с):': 1.1279711723327637,\n",
" 'Надежность': np.float64(0.005626647816237885),\n",
" 'Корреляция': array([0.01320147, 0.01701565, 0.11855808, 0.07147731, 0.00064522,\n",
" 0.01339934, 0.0080009 , 0.02973182, 0.00393624, 0. ,\n",
" 0.09713269, 0.14855715, 0.12622933, 0.03761528, 0.13008942,\n",
" 0.14516678]),\n",
" 'Цельность (%)': Education Level 0.0\n",
" Institution Type 0.0\n",
" Gender 0.0\n",
" Age 0.0\n",
" Device 0.0\n",
" IT Student 0.0\n",
" Location 0.0\n",
" Financial Condition 0.0\n",
" Internet Type 0.0\n",
" Network Type 0.0\n",
" Flexibility Level 0.0\n",
" age_group 0.0\n",
" age_normalized 0.0\n",
" age_standardized 0.0\n",
" dtype: float64}"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from time import time\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.feature_selection import mutual_info_classif\n",
"\n",
"\n",
"# Reproducibility knobs: every stochastic step below is seeded from RANDOM_STATE\n",
"RANDOM_STATE = 42\n",
"N_BOOTSTRAP = 100  # number of bootstrap resamples used for the reliability estimate\n",
"TARGET = 'IT Student'\n",
"\n",
"# Feature matrix and target variable\n",
"X = df_encoded.drop(TARGET, axis=1)\n",
"y = df_encoded[TARGET]\n",
"\n",
"# Predictive power: mean accuracy over 5-fold cross-validation.\n",
"# The wall-clock time of model construction + cross-validation is the 'speed' metric.\n",
"start_time = time()\n",
"model = RandomForestClassifier(random_state=RANDOM_STATE)\n",
"scores = cross_val_score(model, X, y, cv=5)\n",
"end_time = time()\n",
"\n",
"# Reliability: spread of accuracy across bootstrap resamples.\n",
"# Each model is fitted on a resample and scored on the rows that were NOT drawn\n",
"# into it (out-of-bag rows). Scoring on the training rows themselves would only\n",
"# measure memorisation and make the spread look artificially small.\n",
"bootstrap_scores = []\n",
"for iteration in range(N_BOOTSTRAP):\n",
"    sample = df_encoded.sample(frac=1, replace=True, random_state=RANDOM_STATE + iteration)\n",
"    held_out = df_encoded.loc[~df_encoded.index.isin(sample.index)]\n",
"    if held_out.empty:\n",
"        continue  # degenerate resample: nothing left to evaluate on\n",
"    model.fit(sample.drop(TARGET, axis=1), sample[TARGET])\n",
"    held_out_pred = model.predict(held_out.drop(TARGET, axis=1))\n",
"    bootstrap_scores.append(accuracy_score(held_out[TARGET], held_out_pred))\n",
"\n",
"# Feature relevance: mutual information between each feature and the target.\n",
"# With discrete_features='auto' a dense frame is treated as all-continuous, so the\n",
"# integer/boolean coded columns are flagged as discrete explicitly.\n",
"discrete_mask = [\n",
"    pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_bool_dtype(dtype)\n",
"    for dtype in X.dtypes\n",
"]\n",
"mutual_info = mutual_info_classif(\n",
"    X, y, discrete_features=discrete_mask, random_state=RANDOM_STATE\n",
")\n",
"# Label the scores with feature names so the output is readable\n",
"correlations = pd.Series(mutual_info, index=X.columns)\n",
"\n",
"# Completeness: percentage of missing values per column of the frame that is\n",
"# actually modelled (df_encoded), so engineered columns are covered as well\n",
"null_percent = df_encoded.isnull().mean() * 100\n",
"\n",
"# Collect all metrics in one place\n",
"quality_metrics = {\n",
"    'Предсказательная способность': scores.mean(),\n",
"    'Скорость вычисления (с):': end_time - start_time,\n",
"    'Надежность': np.std(bootstrap_scores),\n",
"    'Корреляция': correlations,\n",
"    'Цельность (%)': null_percent\n",
"}\n",
"\n",
"quality_metrics\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}