PredictiveAnalytics/lab3.ipynb at 7330d543e59d971a2f2874e9e7411535376f8c64

Бизнес-цели для набора данных онлайн обучения

определение уровня образования
определение It-направления

In [1]:

import pandas as pd
# загрузка данных
df = pd.read_csv("data/students_education.csv")

# Вывод распределения количества наблюдений по меткам (классам)
from src.utils import split_stratified_into_train_val_test


display(df.Age.value_counts())
display()

data = df[["Age", "Device", "Education Level"]].copy()

df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
   data, stratify_colname="Age", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

display("Обучающая выборка: ", df_train.shape)
display(df_train.Age.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.Age.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.Age.value_counts())

Age
23    374
11    353
18    278
9      81
27     68
10     51
Name: count, dtype: int64

'Обучающая выборка: '

(723, 3)

Age
23    224
11    212
18    167
9      48
27     41
10     31
Name: count, dtype: int64

'Контрольная выборка: '

(241, 3)

Age
23    75
11    71
18    55
9     16
27    14
10    10
Name: count, dtype: int64

'Тестовая выборка: '

(241, 3)

Age
23    75
11    70
18    56
9     17
27    13
10    10
Name: count, dtype: int64

In [3]:

# Применение one-hot encoding
df_encoded = pd.get_dummies(df, columns=['Education Level'], prefix='education')

# Результат
df_encoded

Out[3]:

	Institution Type	Gender	Age	Device	IT Student	Location	Financial Condition	Internet Type	Network Type	Flexibility Level	education_College	education_School	education_University
0	Private	Male	23	Tab	No	Town	Mid	Wifi	4G	Moderate	False	False	True
1	Private	Female	23	Mobile	No	Town	Mid	Mobile Data	4G	Moderate	False	False	True
2	Public	Female	18	Mobile	No	Town	Mid	Wifi	4G	Moderate	True	False	False
3	Private	Female	11	Mobile	No	Town	Mid	Mobile Data	4G	Moderate	False	True	False
4	Private	Female	18	Mobile	No	Town	Poor	Mobile Data	3G	Low	False	True	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...
1200	Private	Female	18	Mobile	No	Town	Mid	Wifi	4G	Low	True	False	False
1201	Private	Female	18	Mobile	No	Rural	Mid	Wifi	4G	Moderate	True	False	False
1202	Private	Male	11	Mobile	No	Town	Mid	Mobile Data	3G	Moderate	False	True	False
1203	Private	Female	18	Mobile	No	Rural	Mid	Wifi	4G	Low	True	False	False
1204	Private	Female	11	Mobile	No	Town	Poor	Mobile Data	3G	Moderate	False	True	False

1205 rows × 13 columns

In [5]:

# Дискретизация признака 'age'
bins = [0, 18, 23, 28]
labels = ['10-18', '19-23', '24-28']
df_encoded['age_group'] = pd.cut(df['Age'], bins=bins, labels=labels)

# Результат
df_encoded

Out[5]:

	Institution Type	Gender	Age	Device	IT Student	Location	Financial Condition	Internet Type	Network Type	Flexibility Level	education_College	education_School	education_University	age_group
0	Private	Male	23	Tab	No	Town	Mid	Wifi	4G	Moderate	False	False	True	19-23
1	Private	Female	23	Mobile	No	Town	Mid	Mobile Data	4G	Moderate	False	False	True	19-23
2	Public	Female	18	Mobile	No	Town	Mid	Wifi	4G	Moderate	True	False	False	10-18
3	Private	Female	11	Mobile	No	Town	Mid	Mobile Data	4G	Moderate	False	True	False	10-18
4	Private	Female	18	Mobile	No	Town	Poor	Mobile Data	3G	Low	False	True	False	10-18
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1200	Private	Female	18	Mobile	No	Town	Mid	Wifi	4G	Low	True	False	False	10-18
1201	Private	Female	18	Mobile	No	Rural	Mid	Wifi	4G	Moderate	True	False	False	10-18
1202	Private	Male	11	Mobile	No	Town	Mid	Mobile Data	3G	Moderate	False	True	False	10-18
1203	Private	Female	18	Mobile	No	Rural	Mid	Wifi	4G	Low	True	False	False	10-18
1204	Private	Female	11	Mobile	No	Town	Poor	Mobile Data	3G	Moderate	False	True	False	10-18

1205 rows × 14 columns

In [9]:

# Создание нового признака 'internet'
df_encoded['internet'] = df_encoded['Internet Type'] + '_' + df_encoded['Network Type']  
df_encoded

Out[9]:

	Institution Type	Gender	Age	Device	IT Student	Location	Financial Condition	Internet Type	Network Type	Flexibility Level	education_College	education_School	education_University	age_group	internet
0	Private	Male	23	Tab	No	Town	Mid	Wifi	4G	Moderate	False	False	True	19-23	Wifi_4G
1	Private	Female	23	Mobile	No	Town	Mid	Mobile Data	4G	Moderate	False	False	True	19-23	Mobile Data_4G
2	Public	Female	18	Mobile	No	Town	Mid	Wifi	4G	Moderate	True	False	False	10-18	Wifi_4G
3	Private	Female	11	Mobile	No	Town	Mid	Mobile Data	4G	Moderate	False	True	False	10-18	Mobile Data_4G
4	Private	Female	18	Mobile	No	Town	Poor	Mobile Data	3G	Low	False	True	False	10-18	Mobile Data_3G
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1200	Private	Female	18	Mobile	No	Town	Mid	Wifi	4G	Low	True	False	False	10-18	Wifi_4G
1201	Private	Female	18	Mobile	No	Rural	Mid	Wifi	4G	Moderate	True	False	False	10-18	Wifi_4G
1202	Private	Male	11	Mobile	No	Town	Mid	Mobile Data	3G	Moderate	False	True	False	10-18	Mobile Data_3G
1203	Private	Female	18	Mobile	No	Rural	Mid	Wifi	4G	Low	True	False	False	10-18	Wifi_4G
1204	Private	Female	11	Mobile	No	Town	Poor	Mobile Data	3G	Moderate	False	True	False	10-18	Mobile Data_3G

1205 rows × 15 columns

In [15]:

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Создаем экземпляры масштабировщиков
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Нормировка
df_encoded['age_normalized'] = minmax_scaler.fit_transform(df_encoded[['Age']])

# Стандартизация
df_encoded['age_standardized'] = standard_scaler.fit_transform(df_encoded[['Age']])

df_encoded

Out[15]:

	Institution Type	Gender	Age	Device	IT Student	Location	Financial Condition	Internet Type	Network Type	Flexibility Level	education_College	education_School	education_University	age_group	internet	age_normalized	age_standardized
0	Private	Male	23	Tab	No	Town	Mid	Wifi	4G	Moderate	False	False	True	19-23	Wifi_4G	0.777778	1.018272
1	Private	Female	23	Mobile	No	Town	Mid	Mobile Data	4G	Moderate	False	False	True	19-23	Mobile Data_4G	0.777778	1.018272
2	Public	Female	18	Mobile	No	Town	Mid	Wifi	4G	Moderate	True	False	False	10-18	Wifi_4G	0.500000	0.160338
3	Private	Female	11	Mobile	No	Town	Mid	Mobile Data	4G	Moderate	False	True	False	10-18	Mobile Data_4G	0.111111	-1.040771
4	Private	Female	18	Mobile	No	Town	Poor	Mobile Data	3G	Low	False	True	False	10-18	Mobile Data_3G	0.500000	0.160338
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1200	Private	Female	18	Mobile	No	Town	Mid	Wifi	4G	Low	True	False	False	10-18	Wifi_4G	0.500000	0.160338
1201	Private	Female	18	Mobile	No	Rural	Mid	Wifi	4G	Moderate	True	False	False	10-18	Wifi_4G	0.500000	0.160338
1202	Private	Male	11	Mobile	No	Town	Mid	Mobile Data	3G	Moderate	False	True	False	10-18	Mobile Data_3G	0.111111	-1.040771
1203	Private	Female	18	Mobile	No	Rural	Mid	Wifi	4G	Low	True	False	False	10-18	Wifi_4G	0.500000	0.160338
1204	Private	Female	11	Mobile	No	Town	Poor	Mobile Data	3G	Moderate	False	True	False	10-18	Mobile Data_3G	0.111111	-1.040771

1205 rows × 17 columns

In [39]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

#  Преобразование категориальных переменных в числовые 
label_encoder = LabelEncoder()
df_encoded['education_College'] = label_encoder.fit_transform(df_encoded['education_College'])
df_encoded

Out[39]:

	Institution Type	Gender	Age	Device	IT Student	Location	Financial Condition	Internet Type	Network Type	Flexibility Level	education_College	education_School	education_University	age_group	internet	age_normalized	age_standardized
0	0	1	23	2	0	1	0	1	2	2	0	0	1	1	5	0.777778	1.018272
1	0	0	23	1	0	1	0	0	2	2	0	0	1	1	2	0.777778	1.018272
2	1	0	18	1	0	1	0	1	2	2	1	0	0	0	5	0.500000	0.160338
3	0	0	11	1	0	1	0	0	2	2	0	1	0	0	2	0.111111	-1.040771
4	0	0	18	1	0	1	1	0	1	1	0	1	0	0	1	0.500000	0.160338
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1200	0	0	18	1	0	1	0	1	2	1	1	0	0	0	5	0.500000	0.160338
1201	0	0	18	1	0	0	0	1	2	2	1	0	0	0	5	0.500000	0.160338
1202	0	1	11	1	0	1	0	0	1	2	0	1	0	0	1	0.111111	-1.040771
1203	0	0	18	1	0	0	0	1	2	1	1	0	0	0	5	0.500000	0.160338
1204	0	0	11	1	0	1	1	0	1	2	0	1	0	0	1	0.111111	-1.040771

1205 rows × 17 columns

In [42]:

import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif


# Определение данных
X = df_encoded.drop('IT Student', axis=1)  # Набор признаков
y = df_encoded['IT Student']                # Целевая переменная

# Предсказательная способность
start_time = time()
model = RandomForestClassifier()  
scores = cross_val_score(model, X, y, cv=5)
end_time = time()

# Надежность 
bootstrap_scores = []
for _ in range(100):
    sample = df_encoded.sample(frac=1, replace=True)
    sample_X = sample.drop('IT Student', axis=1)
    sample_y = sample['IT Student']
    bootstrap_score = accuracy_score(sample_y, model.fit(sample_X, sample_y).predict(sample_X))
    bootstrap_scores.append(bootstrap_score)

#  Корреляция
correlations = mutual_info_classif(X, y, discrete_features='auto')

#  Цельность
null_percent = df.isnull().mean() * 100

# Сборка всех метрик
quality_metrics = {
    'Предсказательная способность': scores.mean(),
    'Скорость вычисления (с):': end_time - start_time,
    'Надежность': np.std(bootstrap_scores),
    'Корреляция': correlations,
    'Цельность (%)': null_percent
}

quality_metrics

Out[42]:

{'Предсказательная способность': np.float64(0.941908713692946),
 'Скорость вычисления (с):': 1.1279711723327637,
 'Надежность': np.float64(0.005626647816237885),
 'Корреляция': array([0.01320147, 0.01701565, 0.11855808, 0.07147731, 0.00064522,
        0.01339934, 0.0080009 , 0.02973182, 0.00393624, 0.        ,
        0.09713269, 0.14855715, 0.12622933, 0.03761528, 0.13008942,
        0.14516678]),
 'Цельность (%)': Education Level        0.0
 Institution Type       0.0
 Gender                 0.0
 Age                    0.0
 Device                 0.0
 IT Student             0.0
 Location               0.0
 Financial Condition    0.0
 Internet Type          0.0
 Network Type           0.0
 Flexibility Level      0.0
 age_group              0.0
 age_normalized         0.0
 age_standardized       0.0
 dtype: float64}

64 KiB Raw Blame History Unescape Escape

64 KiB

Raw Blame History