PredictiveAnalytics/lab3.ipynb

64 KiB
Raw Permalink Blame History

Бизнес-цели для набора данных онлайн обучения

  1. определение уровня образования
  2. определение It-направления
In [1]:
import pandas as pd
# загрузка данных
df = pd.read_csv("data/students_education.csv")

# Вывод распределения количества наблюдений по меткам (классам)
from src.utils import split_stratified_into_train_val_test


display(df.Age.value_counts())
display()

data = df[["Age", "Device", "Education Level"]].copy()

df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
   data, stratify_colname="Age", frac_train=0.60, frac_val=0.20, frac_test=0.20
)

display("Обучающая выборка: ", df_train.shape)
display(df_train.Age.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.Age.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.Age.value_counts())
Age
23    374
11    353
18    278
9      81
27     68
10     51
Name: count, dtype: int64
'Обучающая выборка: '
(723, 3)
Age
23    224
11    212
18    167
9      48
27     41
10     31
Name: count, dtype: int64
'Контрольная выборка: '
(241, 3)
Age
23    75
11    71
18    55
9     16
27    14
10    10
Name: count, dtype: int64
'Тестовая выборка: '
(241, 3)
Age
23    75
11    70
18    56
9     17
27    13
10    10
Name: count, dtype: int64
In [3]:
# Применение one-hot encoding
df_encoded = pd.get_dummies(df, columns=['Education Level'], prefix='education')

# Результат
df_encoded
Out[3]:
Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level education_College education_School education_University
0 Private Male 23 Tab No Town Mid Wifi 4G Moderate False False True
1 Private Female 23 Mobile No Town Mid Mobile Data 4G Moderate False False True
2 Public Female 18 Mobile No Town Mid Wifi 4G Moderate True False False
3 Private Female 11 Mobile No Town Mid Mobile Data 4G Moderate False True False
4 Private Female 18 Mobile No Town Poor Mobile Data 3G Low False True False
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1200 Private Female 18 Mobile No Town Mid Wifi 4G Low True False False
1201 Private Female 18 Mobile No Rural Mid Wifi 4G Moderate True False False
1202 Private Male 11 Mobile No Town Mid Mobile Data 3G Moderate False True False
1203 Private Female 18 Mobile No Rural Mid Wifi 4G Low True False False
1204 Private Female 11 Mobile No Town Poor Mobile Data 3G Moderate False True False

1205 rows × 13 columns

In [5]:
# Дискретизация признака 'age'
bins = [0, 18, 23, 28]
labels = ['10-18', '19-23', '24-28']
df_encoded['age_group'] = pd.cut(df['Age'], bins=bins, labels=labels)

# Результат
df_encoded
Out[5]:
Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level education_College education_School education_University age_group
0 Private Male 23 Tab No Town Mid Wifi 4G Moderate False False True 19-23
1 Private Female 23 Mobile No Town Mid Mobile Data 4G Moderate False False True 19-23
2 Public Female 18 Mobile No Town Mid Wifi 4G Moderate True False False 10-18
3 Private Female 11 Mobile No Town Mid Mobile Data 4G Moderate False True False 10-18
4 Private Female 18 Mobile No Town Poor Mobile Data 3G Low False True False 10-18
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1200 Private Female 18 Mobile No Town Mid Wifi 4G Low True False False 10-18
1201 Private Female 18 Mobile No Rural Mid Wifi 4G Moderate True False False 10-18
1202 Private Male 11 Mobile No Town Mid Mobile Data 3G Moderate False True False 10-18
1203 Private Female 18 Mobile No Rural Mid Wifi 4G Low True False False 10-18
1204 Private Female 11 Mobile No Town Poor Mobile Data 3G Moderate False True False 10-18

1205 rows × 14 columns

In [9]:
# Создание нового признака 'internet'
df_encoded['internet'] = df_encoded['Internet Type'] + '_' + df_encoded['Network Type']  
df_encoded
Out[9]:
Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level education_College education_School education_University age_group internet
0 Private Male 23 Tab No Town Mid Wifi 4G Moderate False False True 19-23 Wifi_4G
1 Private Female 23 Mobile No Town Mid Mobile Data 4G Moderate False False True 19-23 Mobile Data_4G
2 Public Female 18 Mobile No Town Mid Wifi 4G Moderate True False False 10-18 Wifi_4G
3 Private Female 11 Mobile No Town Mid Mobile Data 4G Moderate False True False 10-18 Mobile Data_4G
4 Private Female 18 Mobile No Town Poor Mobile Data 3G Low False True False 10-18 Mobile Data_3G
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1200 Private Female 18 Mobile No Town Mid Wifi 4G Low True False False 10-18 Wifi_4G
1201 Private Female 18 Mobile No Rural Mid Wifi 4G Moderate True False False 10-18 Wifi_4G
1202 Private Male 11 Mobile No Town Mid Mobile Data 3G Moderate False True False 10-18 Mobile Data_3G
1203 Private Female 18 Mobile No Rural Mid Wifi 4G Low True False False 10-18 Wifi_4G
1204 Private Female 11 Mobile No Town Poor Mobile Data 3G Moderate False True False 10-18 Mobile Data_3G

1205 rows × 15 columns

In [15]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Создаем экземпляры масштабировщиков
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Нормировка
df_encoded['age_normalized'] = minmax_scaler.fit_transform(df_encoded[['Age']])

# Стандартизация
df_encoded['age_standardized'] = standard_scaler.fit_transform(df_encoded[['Age']])

df_encoded
Out[15]:
Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level education_College education_School education_University age_group internet age_normalized age_standardized
0 Private Male 23 Tab No Town Mid Wifi 4G Moderate False False True 19-23 Wifi_4G 0.777778 1.018272
1 Private Female 23 Mobile No Town Mid Mobile Data 4G Moderate False False True 19-23 Mobile Data_4G 0.777778 1.018272
2 Public Female 18 Mobile No Town Mid Wifi 4G Moderate True False False 10-18 Wifi_4G 0.500000 0.160338
3 Private Female 11 Mobile No Town Mid Mobile Data 4G Moderate False True False 10-18 Mobile Data_4G 0.111111 -1.040771
4 Private Female 18 Mobile No Town Poor Mobile Data 3G Low False True False 10-18 Mobile Data_3G 0.500000 0.160338
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1200 Private Female 18 Mobile No Town Mid Wifi 4G Low True False False 10-18 Wifi_4G 0.500000 0.160338
1201 Private Female 18 Mobile No Rural Mid Wifi 4G Moderate True False False 10-18 Wifi_4G 0.500000 0.160338
1202 Private Male 11 Mobile No Town Mid Mobile Data 3G Moderate False True False 10-18 Mobile Data_3G 0.111111 -1.040771
1203 Private Female 18 Mobile No Rural Mid Wifi 4G Low True False False 10-18 Wifi_4G 0.500000 0.160338
1204 Private Female 11 Mobile No Town Poor Mobile Data 3G Moderate False True False 10-18 Mobile Data_3G 0.111111 -1.040771

1205 rows × 17 columns

In [39]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

#  Преобразование категориальных переменных в числовые 
label_encoder = LabelEncoder()
df_encoded['education_College'] = label_encoder.fit_transform(df_encoded['education_College'])
df_encoded
Out[39]:
Institution Type Gender Age Device IT Student Location Financial Condition Internet Type Network Type Flexibility Level education_College education_School education_University age_group internet age_normalized age_standardized
0 0 1 23 2 0 1 0 1 2 2 0 0 1 1 5 0.777778 1.018272
1 0 0 23 1 0 1 0 0 2 2 0 0 1 1 2 0.777778 1.018272
2 1 0 18 1 0 1 0 1 2 2 1 0 0 0 5 0.500000 0.160338
3 0 0 11 1 0 1 0 0 2 2 0 1 0 0 2 0.111111 -1.040771
4 0 0 18 1 0 1 1 0 1 1 0 1 0 0 1 0.500000 0.160338
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1200 0 0 18 1 0 1 0 1 2 1 1 0 0 0 5 0.500000 0.160338
1201 0 0 18 1 0 0 0 1 2 2 1 0 0 0 5 0.500000 0.160338
1202 0 1 11 1 0 1 0 0 1 2 0 1 0 0 1 0.111111 -1.040771
1203 0 0 18 1 0 0 0 1 2 1 1 0 0 0 5 0.500000 0.160338
1204 0 0 11 1 0 1 1 0 1 2 0 1 0 0 1 0.111111 -1.040771

1205 rows × 17 columns

In [42]:
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif


# Определение данных
X = df_encoded.drop('IT Student', axis=1)  # Набор признаков
y = df_encoded['IT Student']                # Целевая переменная

# Предсказательная способность
start_time = time()
model = RandomForestClassifier()  
scores = cross_val_score(model, X, y, cv=5)
end_time = time()

# Надежность 
bootstrap_scores = []
for _ in range(100):
    sample = df_encoded.sample(frac=1, replace=True)
    sample_X = sample.drop('IT Student', axis=1)
    sample_y = sample['IT Student']
    bootstrap_score = accuracy_score(sample_y, model.fit(sample_X, sample_y).predict(sample_X))
    bootstrap_scores.append(bootstrap_score)

#  Корреляция
correlations = mutual_info_classif(X, y, discrete_features='auto')

#  Цельность
null_percent = df.isnull().mean() * 100

# Сборка всех метрик
quality_metrics = {
    'Предсказательная способность': scores.mean(),
    'Скорость вычисления (с):': end_time - start_time,
    'Надежность': np.std(bootstrap_scores),
    'Корреляция': correlations,
    'Цельность (%)': null_percent
}

quality_metrics
Out[42]:
{'Предсказательная способность': np.float64(0.941908713692946),
 'Скорость вычисления (с):': 1.1279711723327637,
 'Надежность': np.float64(0.005626647816237885),
 'Корреляция': array([0.01320147, 0.01701565, 0.11855808, 0.07147731, 0.00064522,
        0.01339934, 0.0080009 , 0.02973182, 0.00393624, 0.        ,
        0.09713269, 0.14855715, 0.12622933, 0.03761528, 0.13008942,
        0.14516678]),
 'Цельность (%)': Education Level        0.0
 Institution Type       0.0
 Gender                 0.0
 Age                    0.0
 Device                 0.0
 IT Student             0.0
 Location               0.0
 Financial Condition    0.0
 Internet Type          0.0
 Network Type           0.0
 Flexibility Level      0.0
 age_group              0.0
 age_normalized         0.0
 age_standardized       0.0
 dtype: float64}