{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Бизнес-цели для набора данных онлайн обучения\n", "\n", "1. определение уровня образования\n", "2. определение It-направления\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Age\n", "23 374\n", "11 353\n", "18 278\n", "9 81\n", "27 68\n", "10 51\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Обучающая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(723, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Age\n", "23 224\n", "11 212\n", "18 167\n", "9 48\n", "27 41\n", "10 31\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Контрольная выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(241, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Age\n", "23 75\n", "11 71\n", "18 55\n", "9 16\n", "27 14\n", "10 10\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'Тестовая выборка: '" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(241, 3)" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Age\n", "23 75\n", "11 70\n", "18 56\n", "9 17\n", "27 13\n", "10 10\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "# загрузка данных\n", "df = pd.read_csv(\"data/students_education.csv\")\n", "\n", "# Вывод распределения количества наблюдений по меткам (классам)\n", "from src.utils import split_stratified_into_train_val_test\n", "\n", "\n", "display(df.Age.value_counts())\n", "display()\n", "\n", "data = df[[\"Age\", \"Device\", \"Education Level\"]].copy()\n", "\n", "df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n", " data, stratify_colname=\"Age\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n", ")\n", "\n", "display(\"Обучающая выборка: \", df_train.shape)\n", "display(df_train.Age.value_counts())\n", "\n", "display(\"Контрольная выборка: \", df_val.shape)\n", "display(df_val.Age.value_counts())\n", "\n", "display(\"Тестовая выборка: \", df_test.shape)\n", "display(df_test.Age.value_counts())" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Institution TypeGenderAgeDeviceIT StudentLocationFinancial ConditionInternet TypeNetwork TypeFlexibility Leveleducation_Collegeeducation_Schooleducation_University
0PrivateMale23TabNoTownMidWifi4GModerateFalseFalseTrue
1PrivateFemale23MobileNoTownMidMobile Data4GModerateFalseFalseTrue
2PublicFemale18MobileNoTownMidWifi4GModerateTrueFalseFalse
3PrivateFemale11MobileNoTownMidMobile Data4GModerateFalseTrueFalse
4PrivateFemale18MobileNoTownPoorMobile Data3GLowFalseTrueFalse
..........................................
1200PrivateFemale18MobileNoTownMidWifi4GLowTrueFalseFalse
1201PrivateFemale18MobileNoRuralMidWifi4GModerateTrueFalseFalse
1202PrivateMale11MobileNoTownMidMobile Data3GModerateFalseTrueFalse
1203PrivateFemale18MobileNoRuralMidWifi4GLowTrueFalseFalse
1204PrivateFemale11MobileNoTownPoorMobile Data3GModerateFalseTrueFalse
\n", "

1205 rows × 13 columns

\n", "
" ], "text/plain": [ " Institution Type Gender Age Device IT Student Location \\\n", "0 Private Male 23 Tab No Town \n", "1 Private Female 23 Mobile No Town \n", "2 Public Female 18 Mobile No Town \n", "3 Private Female 11 Mobile No Town \n", "4 Private Female 18 Mobile No Town \n", "... ... ... ... ... ... ... \n", "1200 Private Female 18 Mobile No Town \n", "1201 Private Female 18 Mobile No Rural \n", "1202 Private Male 11 Mobile No Town \n", "1203 Private Female 18 Mobile No Rural \n", "1204 Private Female 11 Mobile No Town \n", "\n", " Financial Condition Internet Type Network Type Flexibility Level \\\n", "0 Mid Wifi 4G Moderate \n", "1 Mid Mobile Data 4G Moderate \n", "2 Mid Wifi 4G Moderate \n", "3 Mid Mobile Data 4G Moderate \n", "4 Poor Mobile Data 3G Low \n", "... ... ... ... ... \n", "1200 Mid Wifi 4G Low \n", "1201 Mid Wifi 4G Moderate \n", "1202 Mid Mobile Data 3G Moderate \n", "1203 Mid Wifi 4G Low \n", "1204 Poor Mobile Data 3G Moderate \n", "\n", " education_College education_School education_University \n", "0 False False True \n", "1 False False True \n", "2 True False False \n", "3 False True False \n", "4 False True False \n", "... ... ... ... \n", "1200 True False False \n", "1201 True False False \n", "1202 False True False \n", "1203 True False False \n", "1204 False True False \n", "\n", "[1205 rows x 13 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Применение one-hot encoding\n", "df_encoded = pd.get_dummies(df, columns=['Education Level'], prefix='education')\n", "\n", "# Результат\n", "df_encoded" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Institution TypeGenderAgeDeviceIT StudentLocationFinancial ConditionInternet TypeNetwork TypeFlexibility Leveleducation_Collegeeducation_Schooleducation_Universityage_group
0PrivateMale23TabNoTownMidWifi4GModerateFalseFalseTrue19-23
1PrivateFemale23MobileNoTownMidMobile Data4GModerateFalseFalseTrue19-23
2PublicFemale18MobileNoTownMidWifi4GModerateTrueFalseFalse10-18
3PrivateFemale11MobileNoTownMidMobile Data4GModerateFalseTrueFalse10-18
4PrivateFemale18MobileNoTownPoorMobile Data3GLowFalseTrueFalse10-18
.............................................
1200PrivateFemale18MobileNoTownMidWifi4GLowTrueFalseFalse10-18
1201PrivateFemale18MobileNoRuralMidWifi4GModerateTrueFalseFalse10-18
1202PrivateMale11MobileNoTownMidMobile Data3GModerateFalseTrueFalse10-18
1203PrivateFemale18MobileNoRuralMidWifi4GLowTrueFalseFalse10-18
1204PrivateFemale11MobileNoTownPoorMobile Data3GModerateFalseTrueFalse10-18
\n", "

1205 rows × 14 columns

\n", "
" ], "text/plain": [ " Institution Type Gender Age Device IT Student Location \\\n", "0 Private Male 23 Tab No Town \n", "1 Private Female 23 Mobile No Town \n", "2 Public Female 18 Mobile No Town \n", "3 Private Female 11 Mobile No Town \n", "4 Private Female 18 Mobile No Town \n", "... ... ... ... ... ... ... \n", "1200 Private Female 18 Mobile No Town \n", "1201 Private Female 18 Mobile No Rural \n", "1202 Private Male 11 Mobile No Town \n", "1203 Private Female 18 Mobile No Rural \n", "1204 Private Female 11 Mobile No Town \n", "\n", " Financial Condition Internet Type Network Type Flexibility Level \\\n", "0 Mid Wifi 4G Moderate \n", "1 Mid Mobile Data 4G Moderate \n", "2 Mid Wifi 4G Moderate \n", "3 Mid Mobile Data 4G Moderate \n", "4 Poor Mobile Data 3G Low \n", "... ... ... ... ... \n", "1200 Mid Wifi 4G Low \n", "1201 Mid Wifi 4G Moderate \n", "1202 Mid Mobile Data 3G Moderate \n", "1203 Mid Wifi 4G Low \n", "1204 Poor Mobile Data 3G Moderate \n", "\n", " education_College education_School education_University age_group \n", "0 False False True 19-23 \n", "1 False False True 19-23 \n", "2 True False False 10-18 \n", "3 False True False 10-18 \n", "4 False True False 10-18 \n", "... ... ... ... ... \n", "1200 True False False 10-18 \n", "1201 True False False 10-18 \n", "1202 False True False 10-18 \n", "1203 True False False 10-18 \n", "1204 False True False 10-18 \n", "\n", "[1205 rows x 14 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Дискретизация признака 'age'\n", "bins = [0, 18, 23, 28]\n", "labels = ['10-18', '19-23', '24-28']\n", "df_encoded['age_group'] = pd.cut(df['Age'], bins=bins, labels=labels)\n", "\n", "# Результат\n", "df_encoded" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Institution TypeGenderAgeDeviceIT StudentLocationFinancial ConditionInternet TypeNetwork TypeFlexibility Leveleducation_Collegeeducation_Schooleducation_Universityage_groupinternet
0PrivateMale23TabNoTownMidWifi4GModerateFalseFalseTrue19-23Wifi_4G
1PrivateFemale23MobileNoTownMidMobile Data4GModerateFalseFalseTrue19-23Mobile Data_4G
2PublicFemale18MobileNoTownMidWifi4GModerateTrueFalseFalse10-18Wifi_4G
3PrivateFemale11MobileNoTownMidMobile Data4GModerateFalseTrueFalse10-18Mobile Data_4G
4PrivateFemale18MobileNoTownPoorMobile Data3GLowFalseTrueFalse10-18Mobile Data_3G
................................................
1200PrivateFemale18MobileNoTownMidWifi4GLowTrueFalseFalse10-18Wifi_4G
1201PrivateFemale18MobileNoRuralMidWifi4GModerateTrueFalseFalse10-18Wifi_4G
1202PrivateMale11MobileNoTownMidMobile Data3GModerateFalseTrueFalse10-18Mobile Data_3G
1203PrivateFemale18MobileNoRuralMidWifi4GLowTrueFalseFalse10-18Wifi_4G
1204PrivateFemale11MobileNoTownPoorMobile Data3GModerateFalseTrueFalse10-18Mobile Data_3G
\n", "

1205 rows × 15 columns

\n", "
" ], "text/plain": [ " Institution Type Gender Age Device IT Student Location \\\n", "0 Private Male 23 Tab No Town \n", "1 Private Female 23 Mobile No Town \n", "2 Public Female 18 Mobile No Town \n", "3 Private Female 11 Mobile No Town \n", "4 Private Female 18 Mobile No Town \n", "... ... ... ... ... ... ... \n", "1200 Private Female 18 Mobile No Town \n", "1201 Private Female 18 Mobile No Rural \n", "1202 Private Male 11 Mobile No Town \n", "1203 Private Female 18 Mobile No Rural \n", "1204 Private Female 11 Mobile No Town \n", "\n", " Financial Condition Internet Type Network Type Flexibility Level \\\n", "0 Mid Wifi 4G Moderate \n", "1 Mid Mobile Data 4G Moderate \n", "2 Mid Wifi 4G Moderate \n", "3 Mid Mobile Data 4G Moderate \n", "4 Poor Mobile Data 3G Low \n", "... ... ... ... ... \n", "1200 Mid Wifi 4G Low \n", "1201 Mid Wifi 4G Moderate \n", "1202 Mid Mobile Data 3G Moderate \n", "1203 Mid Wifi 4G Low \n", "1204 Poor Mobile Data 3G Moderate \n", "\n", " education_College education_School education_University age_group \\\n", "0 False False True 19-23 \n", "1 False False True 19-23 \n", "2 True False False 10-18 \n", "3 False True False 10-18 \n", "4 False True False 10-18 \n", "... ... ... ... ... \n", "1200 True False False 10-18 \n", "1201 True False False 10-18 \n", "1202 False True False 10-18 \n", "1203 True False False 10-18 \n", "1204 False True False 10-18 \n", "\n", " internet \n", "0 Wifi_4G \n", "1 Mobile Data_4G \n", "2 Wifi_4G \n", "3 Mobile Data_4G \n", "4 Mobile Data_3G \n", "... ... \n", "1200 Wifi_4G \n", "1201 Wifi_4G \n", "1202 Mobile Data_3G \n", "1203 Wifi_4G \n", "1204 Mobile Data_3G \n", "\n", "[1205 rows x 15 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Создание нового признака 'internet'\n", "df_encoded['internet'] = df_encoded['Internet Type'] + '_' + df_encoded['Network Type'] \n", "df_encoded" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Institution TypeGenderAgeDeviceIT StudentLocationFinancial ConditionInternet TypeNetwork TypeFlexibility Leveleducation_Collegeeducation_Schooleducation_Universityage_groupinternetage_normalizedage_standardized
0PrivateMale23TabNoTownMidWifi4GModerateFalseFalseTrue19-23Wifi_4G0.7777781.018272
1PrivateFemale23MobileNoTownMidMobile Data4GModerateFalseFalseTrue19-23Mobile Data_4G0.7777781.018272
2PublicFemale18MobileNoTownMidWifi4GModerateTrueFalseFalse10-18Wifi_4G0.5000000.160338
3PrivateFemale11MobileNoTownMidMobile Data4GModerateFalseTrueFalse10-18Mobile Data_4G0.111111-1.040771
4PrivateFemale18MobileNoTownPoorMobile Data3GLowFalseTrueFalse10-18Mobile Data_3G0.5000000.160338
......................................................
1200PrivateFemale18MobileNoTownMidWifi4GLowTrueFalseFalse10-18Wifi_4G0.5000000.160338
1201PrivateFemale18MobileNoRuralMidWifi4GModerateTrueFalseFalse10-18Wifi_4G0.5000000.160338
1202PrivateMale11MobileNoTownMidMobile Data3GModerateFalseTrueFalse10-18Mobile Data_3G0.111111-1.040771
1203PrivateFemale18MobileNoRuralMidWifi4GLowTrueFalseFalse10-18Wifi_4G0.5000000.160338
1204PrivateFemale11MobileNoTownPoorMobile Data3GModerateFalseTrueFalse10-18Mobile Data_3G0.111111-1.040771
\n", "

1205 rows × 17 columns

\n", "
" ], "text/plain": [ " Institution Type Gender Age Device IT Student Location \\\n", "0 Private Male 23 Tab No Town \n", "1 Private Female 23 Mobile No Town \n", "2 Public Female 18 Mobile No Town \n", "3 Private Female 11 Mobile No Town \n", "4 Private Female 18 Mobile No Town \n", "... ... ... ... ... ... ... \n", "1200 Private Female 18 Mobile No Town \n", "1201 Private Female 18 Mobile No Rural \n", "1202 Private Male 11 Mobile No Town \n", "1203 Private Female 18 Mobile No Rural \n", "1204 Private Female 11 Mobile No Town \n", "\n", " Financial Condition Internet Type Network Type Flexibility Level \\\n", "0 Mid Wifi 4G Moderate \n", "1 Mid Mobile Data 4G Moderate \n", "2 Mid Wifi 4G Moderate \n", "3 Mid Mobile Data 4G Moderate \n", "4 Poor Mobile Data 3G Low \n", "... ... ... ... ... \n", "1200 Mid Wifi 4G Low \n", "1201 Mid Wifi 4G Moderate \n", "1202 Mid Mobile Data 3G Moderate \n", "1203 Mid Wifi 4G Low \n", "1204 Poor Mobile Data 3G Moderate \n", "\n", " education_College education_School education_University age_group \\\n", "0 False False True 19-23 \n", "1 False False True 19-23 \n", "2 True False False 10-18 \n", "3 False True False 10-18 \n", "4 False True False 10-18 \n", "... ... ... ... ... \n", "1200 True False False 10-18 \n", "1201 True False False 10-18 \n", "1202 False True False 10-18 \n", "1203 True False False 10-18 \n", "1204 False True False 10-18 \n", "\n", " internet age_normalized age_standardized \n", "0 Wifi_4G 0.777778 1.018272 \n", "1 Mobile Data_4G 0.777778 1.018272 \n", "2 Wifi_4G 0.500000 0.160338 \n", "3 Mobile Data_4G 0.111111 -1.040771 \n", "4 Mobile Data_3G 0.500000 0.160338 \n", "... ... ... ... \n", "1200 Wifi_4G 0.500000 0.160338 \n", "1201 Wifi_4G 0.500000 0.160338 \n", "1202 Mobile Data_3G 0.111111 -1.040771 \n", "1203 Wifi_4G 0.500000 0.160338 \n", "1204 Mobile Data_3G 0.111111 -1.040771 \n", "\n", "[1205 rows x 17 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n", "\n", "# Создаем экземпляры масштабировщиков\n", "minmax_scaler = MinMaxScaler()\n", "standard_scaler = StandardScaler()\n", "\n", "# Нормировка\n", "df_encoded['age_normalized'] = minmax_scaler.fit_transform(df_encoded[['Age']])\n", "\n", "# Стандартизация\n", "df_encoded['age_standardized'] = standard_scaler.fit_transform(df_encoded[['Age']])\n", "\n", "df_encoded" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Institution TypeGenderAgeDeviceIT StudentLocationFinancial ConditionInternet TypeNetwork TypeFlexibility Leveleducation_Collegeeducation_Schooleducation_Universityage_groupinternetage_normalizedage_standardized
001232010122001150.7777781.018272
100231010022001120.7777781.018272
210181010122100050.5000000.160338
300111010022010020.111111-1.040771
400181011011010010.5000000.160338
......................................................
120000181010121100050.5000000.160338
120100181000122100050.5000000.160338
120201111010012010010.111111-1.040771
120300181000121100050.5000000.160338
120400111011012010010.111111-1.040771
\n", "

1205 rows × 17 columns

\n", "
" ], "text/plain": [ " Institution Type Gender Age Device IT Student Location \\\n", "0 0 1 23 2 0 1 \n", "1 0 0 23 1 0 1 \n", "2 1 0 18 1 0 1 \n", "3 0 0 11 1 0 1 \n", "4 0 0 18 1 0 1 \n", "... ... ... ... ... ... ... \n", "1200 0 0 18 1 0 1 \n", "1201 0 0 18 1 0 0 \n", "1202 0 1 11 1 0 1 \n", "1203 0 0 18 1 0 0 \n", "1204 0 0 11 1 0 1 \n", "\n", " Financial Condition Internet Type Network Type Flexibility Level \\\n", "0 0 1 2 2 \n", "1 0 0 2 2 \n", "2 0 1 2 2 \n", "3 0 0 2 2 \n", "4 1 0 1 1 \n", "... ... ... ... ... \n", "1200 0 1 2 1 \n", "1201 0 1 2 2 \n", "1202 0 0 1 2 \n", "1203 0 1 2 1 \n", "1204 1 0 1 2 \n", "\n", " education_College education_School education_University age_group \\\n", "0 0 0 1 1 \n", "1 0 0 1 1 \n", "2 1 0 0 0 \n", "3 0 1 0 0 \n", "4 0 1 0 0 \n", "... ... ... ... ... \n", "1200 1 0 0 0 \n", "1201 1 0 0 0 \n", "1202 0 1 0 0 \n", "1203 1 0 0 0 \n", "1204 0 1 0 0 \n", "\n", " internet age_normalized age_standardized \n", "0 5 0.777778 1.018272 \n", "1 2 0.777778 1.018272 \n", "2 5 0.500000 0.160338 \n", "3 2 0.111111 -1.040771 \n", "4 1 0.500000 0.160338 \n", "... ... ... ... \n", "1200 5 0.500000 0.160338 \n", "1201 5 0.500000 0.160338 \n", "1202 1 0.111111 -1.040771 \n", "1203 5 0.500000 0.160338 \n", "1204 1 0.111111 -1.040771 \n", "\n", "[1205 rows x 17 columns]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", "# Преобразование категориальных переменных в числовые \n", "label_encoder = LabelEncoder()\n", "df_encoded['education_College'] = label_encoder.fit_transform(df_encoded['education_College'])\n", "df_encoded" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Предсказательная способность': np.float64(0.941908713692946),\n", " 'Скорость вычисления (с):': 1.1279711723327637,\n", " 'Надежность': np.float64(0.005626647816237885),\n", " 'Корреляция': array([0.01320147, 0.01701565, 0.11855808, 0.07147731, 0.00064522,\n", " 0.01339934, 0.0080009 , 0.02973182, 0.00393624, 0. ,\n", " 0.09713269, 0.14855715, 0.12622933, 0.03761528, 0.13008942,\n", " 0.14516678]),\n", " 'Цельность (%)': Education Level 0.0\n", " Institution Type 0.0\n", " Gender 0.0\n", " Age 0.0\n", " Device 0.0\n", " IT Student 0.0\n", " Location 0.0\n", " Financial Condition 0.0\n", " Internet Type 0.0\n", " Network Type 0.0\n", " Flexibility Level 0.0\n", " age_group 0.0\n", " age_normalized 0.0\n", " age_standardized 0.0\n", " dtype: float64}" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from time import time\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.feature_selection import mutual_info_classif\n", "\n", "\n", "# Определение данных\n", "X = df_encoded.drop('IT Student', axis=1) # Набор признаков\n", "y = df_encoded['IT Student'] # Целевая переменная\n", "\n", "# Предсказательная способность\n", "start_time = time()\n", "model = RandomForestClassifier() \n", "scores = cross_val_score(model, X, y, cv=5)\n", "end_time = time()\n", "\n", "# Надежность \n", "bootstrap_scores = []\n", "for _ in range(100):\n", " sample = df_encoded.sample(frac=1, replace=True)\n", " sample_X = sample.drop('IT Student', axis=1)\n", " sample_y = sample['IT Student']\n", " bootstrap_score = accuracy_score(sample_y, model.fit(sample_X, sample_y).predict(sample_X))\n", " bootstrap_scores.append(bootstrap_score)\n", "\n", "# Корреляция\n", "correlations = mutual_info_classif(X, y, discrete_features='auto')\n", "\n", "# Цельность\n", "null_percent = df.isnull().mean() * 100\n", "\n", "# Сборка всех метрик\n", "quality_metrics = {\n", " 'Предсказательная способность': scores.mean(),\n", " 'Скорость вычисления (с):': end_time - start_time,\n", " 'Надежность': np.std(bootstrap_scores),\n", " 'Корреляция': correlations,\n", " 'Цельность (%)': null_percent\n", "}\n", "\n", "quality_metrics\n" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }