2494 lines
475 KiB
Plaintext
2494 lines
475 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Датасет №1 (Использование мобильных устройств и поведение пользователей)\n",
|
||
"Ссылка: https://www.kaggle.com/datasets/valakhorasani/mobile-device-usage-and-user-behavior-dataset\n",
|
||
"\n",
|
||
"Проблемная область: прогнозирование пользовательского поведения и сегментация пользователей для улучшения работы приложений, оптимизации потребления энергии, анализа пользовательского опыта или рекламы.\n",
|
||
"\n",
|
||
"Объекты наблюдения: пользователи мобильных устройств, чьи данные об использовании собираются и анализируются."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 195,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['User ID', 'Device Model', 'Operating System',\n",
|
||
" 'App Usage Time (min/day)', 'Screen On Time (hours/day)',\n",
|
||
" 'Battery Drain (mAh/day)', 'Number of Apps Installed',\n",
|
||
" 'Data Usage (MB/day)', 'Age', 'Gender', 'User Behavior Class'],\n",
|
||
" dtype='object')\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 700 entries, 0 to 699\n",
|
||
"Data columns (total 11 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 User ID 700 non-null int64 \n",
|
||
" 1 Device Model 700 non-null object \n",
|
||
" 2 Operating System 700 non-null object \n",
|
||
" 3 App Usage Time (min/day) 700 non-null int64 \n",
|
||
" 4 Screen On Time (hours/day) 700 non-null float64\n",
|
||
" 5 Battery Drain (mAh/day) 700 non-null int64 \n",
|
||
" 6 Number of Apps Installed 700 non-null int64 \n",
|
||
" 7 Data Usage (MB/day) 700 non-null int64 \n",
|
||
" 8 Age 700 non-null int64 \n",
|
||
" 9 Gender 700 non-null object \n",
|
||
" 10 User Behavior Class 700 non-null int64 \n",
|
||
"dtypes: float64(1), int64(7), object(3)\n",
|
||
"memory usage: 60.3+ KB\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>User ID</th>\n",
|
||
" <th>Device Model</th>\n",
|
||
" <th>Operating System</th>\n",
|
||
" <th>App Usage Time (min/day)</th>\n",
|
||
" <th>Screen On Time (hours/day)</th>\n",
|
||
" <th>Battery Drain (mAh/day)</th>\n",
|
||
" <th>Number of Apps Installed</th>\n",
|
||
" <th>Data Usage (MB/day)</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>Gender</th>\n",
|
||
" <th>User Behavior Class</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Google Pixel 5</td>\n",
|
||
" <td>Android</td>\n",
|
||
" <td>393</td>\n",
|
||
" <td>6.4</td>\n",
|
||
" <td>1872</td>\n",
|
||
" <td>67</td>\n",
|
||
" <td>1122</td>\n",
|
||
" <td>40</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>OnePlus 9</td>\n",
|
||
" <td>Android</td>\n",
|
||
" <td>268</td>\n",
|
||
" <td>4.7</td>\n",
|
||
" <td>1331</td>\n",
|
||
" <td>42</td>\n",
|
||
" <td>944</td>\n",
|
||
" <td>47</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>Xiaomi Mi 11</td>\n",
|
||
" <td>Android</td>\n",
|
||
" <td>154</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>761</td>\n",
|
||
" <td>32</td>\n",
|
||
" <td>322</td>\n",
|
||
" <td>42</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>4</td>\n",
|
||
" <td>Google Pixel 5</td>\n",
|
||
" <td>Android</td>\n",
|
||
" <td>239</td>\n",
|
||
" <td>4.8</td>\n",
|
||
" <td>1676</td>\n",
|
||
" <td>56</td>\n",
|
||
" <td>871</td>\n",
|
||
" <td>20</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>5</td>\n",
|
||
" <td>iPhone 12</td>\n",
|
||
" <td>iOS</td>\n",
|
||
" <td>187</td>\n",
|
||
" <td>4.3</td>\n",
|
||
" <td>1367</td>\n",
|
||
" <td>58</td>\n",
|
||
" <td>988</td>\n",
|
||
" <td>31</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>3</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" User ID Device Model Operating System App Usage Time (min/day) \\\n",
|
||
"0 1 Google Pixel 5 Android 393 \n",
|
||
"1 2 OnePlus 9 Android 268 \n",
|
||
"2 3 Xiaomi Mi 11 Android 154 \n",
|
||
"3 4 Google Pixel 5 Android 239 \n",
|
||
"4 5 iPhone 12 iOS 187 \n",
|
||
"\n",
|
||
" Screen On Time (hours/day) Battery Drain (mAh/day) \\\n",
|
||
"0 6.4 1872 \n",
|
||
"1 4.7 1331 \n",
|
||
"2 4.0 761 \n",
|
||
"3 4.8 1676 \n",
|
||
"4 4.3 1367 \n",
|
||
"\n",
|
||
" Number of Apps Installed Data Usage (MB/day) Age Gender \\\n",
|
||
"0 67 1122 40 Male \n",
|
||
"1 42 944 47 Female \n",
|
||
"2 32 322 42 Male \n",
|
||
"3 56 871 20 Male \n",
|
||
"4 58 988 31 Female \n",
|
||
"\n",
|
||
" User Behavior Class \n",
|
||
"0 4 \n",
|
||
"1 3 \n",
|
||
"2 2 \n",
|
||
"3 3 \n",
|
||
"4 3 "
|
||
]
|
||
},
|
||
"execution_count": 195,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"\n",
|
||
"df_mobiles = pd.read_csv(\".//static//csv//user_behavior_dataset.csv\")\n",
|
||
"print(df_mobiles.columns)\n",
|
||
"df_mobiles.info()\n",
|
||
"df_mobiles.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Атрибуты объектов:\n",
|
||
"1. User ID — уникальный идентификатор пользователя.\n",
|
||
"2. Device Model — модель устройства.\n",
|
||
"3. Operating System — операционная система устройства.\n",
|
||
"4. App Usage Time (min/day) — время использования приложений в минутах в день.\n",
|
||
"5. Screen On Time (hours/day) — время включенного экрана в часах в день.\n",
|
||
"6. Battery Drain (mAh/day) — потребление батареи в мАч в день.\n",
|
||
"7. Number of Apps Installed — количество установленных приложений.\n",
|
||
"8. Data Usage (MB/day) — объем данных в мегабайтах в день.\n",
|
||
"9. Age — возраст пользователя.\n",
|
||
"10. Gender — пол пользователя.\n",
|
||
"11. User Behavior Class — класс поведения пользователя (категория для классификации).\n",
|
||
"\n",
|
||
"Связи между объектами:\n",
|
||
"Атрибуты, такие как модель устройства, ОС и время использования приложений, могут быть связаны с классом поведения, представляя зависимости между действиями пользователя и его характеристиками.\n",
|
||
"\n",
|
||
"Примеры бизнес-целей и эффекты для бизнеса:\n",
|
||
"1. Оптимизация энергопотребления устройств:\n",
|
||
" - Бизнес-цель: Оптимизировать работу приложений для снижения расхода батареи, что увеличит время работы устройства и улучшит пользовательский опыт.\n",
|
||
" - Эффект: Повышение удовлетворенности клиентов и снижение вероятности перехода на конкурентные приложения.\n",
|
||
"\n",
|
||
"2. Сегментация пользователей для рекламы:\n",
|
||
" - Бизнес-цель: Создание таргетированной рекламы на основе поведения пользователей (классы поведения).\n",
|
||
" - Эффект: Увеличение конверсий и доходов от рекламных кампаний за счет более точной сегментации.\n",
|
||
"\n",
|
||
"Примеры целей технического проекта:\n",
|
||
"1. Цель: Построение модели для прогнозирования расхода батареи.\n",
|
||
" - Вход: Модель устройства, ОС, время использования приложений, количество приложений, возраст.\n",
|
||
" - Целевой признак: Battery Drain (mAh/day).\n",
|
||
"\n",
|
||
"2. Цель: Сегментация пользователей для рекламных кампаний.\n",
|
||
" - Вход: Время использования приложений, возраст, пол, объем данных.\n",
|
||
" - Целевой признак: User Behavior Class."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Проверка на пустые значения и дубликаты"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 196,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Пустые значения по столбцам:\n",
|
||
"User ID 0\n",
|
||
"Device Model 0\n",
|
||
"Operating System 0\n",
|
||
"App Usage Time (min/day) 0\n",
|
||
"Screen On Time (hours/day) 0\n",
|
||
"Battery Drain (mAh/day) 0\n",
|
||
"Number of Apps Installed 0\n",
|
||
"Data Usage (MB/day) 0\n",
|
||
"Age 0\n",
|
||
"Gender 0\n",
|
||
"User Behavior Class 0\n",
|
||
"dtype: int64\n",
|
||
"\n",
|
||
"Количество дубликатов: 0\n",
|
||
"\n",
|
||
"Статистический обзор данных:\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>User ID</th>\n",
|
||
" <th>App Usage Time (min/day)</th>\n",
|
||
" <th>Screen On Time (hours/day)</th>\n",
|
||
" <th>Battery Drain (mAh/day)</th>\n",
|
||
" <th>Number of Apps Installed</th>\n",
|
||
" <th>Data Usage (MB/day)</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>User Behavior Class</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>700.00000</td>\n",
|
||
" <td>700.000000</td>\n",
|
||
" <td>700.000000</td>\n",
|
||
" <td>700.000000</td>\n",
|
||
" <td>700.000000</td>\n",
|
||
" <td>700.000000</td>\n",
|
||
" <td>700.000000</td>\n",
|
||
" <td>700.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>350.50000</td>\n",
|
||
" <td>271.128571</td>\n",
|
||
" <td>5.272714</td>\n",
|
||
" <td>1525.158571</td>\n",
|
||
" <td>50.681429</td>\n",
|
||
" <td>929.742857</td>\n",
|
||
" <td>38.482857</td>\n",
|
||
" <td>2.990000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>202.21688</td>\n",
|
||
" <td>177.199484</td>\n",
|
||
" <td>3.068584</td>\n",
|
||
" <td>819.136414</td>\n",
|
||
" <td>26.943324</td>\n",
|
||
" <td>640.451729</td>\n",
|
||
" <td>12.012916</td>\n",
|
||
" <td>1.401476</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>1.00000</td>\n",
|
||
" <td>30.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>302.000000</td>\n",
|
||
" <td>10.000000</td>\n",
|
||
" <td>102.000000</td>\n",
|
||
" <td>18.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>175.75000</td>\n",
|
||
" <td>113.250000</td>\n",
|
||
" <td>2.500000</td>\n",
|
||
" <td>722.250000</td>\n",
|
||
" <td>26.000000</td>\n",
|
||
" <td>373.000000</td>\n",
|
||
" <td>28.000000</td>\n",
|
||
" <td>2.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>350.50000</td>\n",
|
||
" <td>227.500000</td>\n",
|
||
" <td>4.900000</td>\n",
|
||
" <td>1502.500000</td>\n",
|
||
" <td>49.000000</td>\n",
|
||
" <td>823.500000</td>\n",
|
||
" <td>38.000000</td>\n",
|
||
" <td>3.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>525.25000</td>\n",
|
||
" <td>434.250000</td>\n",
|
||
" <td>7.400000</td>\n",
|
||
" <td>2229.500000</td>\n",
|
||
" <td>74.000000</td>\n",
|
||
" <td>1341.000000</td>\n",
|
||
" <td>49.000000</td>\n",
|
||
" <td>4.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>700.00000</td>\n",
|
||
" <td>598.000000</td>\n",
|
||
" <td>12.000000</td>\n",
|
||
" <td>2993.000000</td>\n",
|
||
" <td>99.000000</td>\n",
|
||
" <td>2497.000000</td>\n",
|
||
" <td>59.000000</td>\n",
|
||
" <td>5.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" User ID App Usage Time (min/day) Screen On Time (hours/day) \\\n",
|
||
"count 700.00000 700.000000 700.000000 \n",
|
||
"mean 350.50000 271.128571 5.272714 \n",
|
||
"std 202.21688 177.199484 3.068584 \n",
|
||
"min 1.00000 30.000000 1.000000 \n",
|
||
"25% 175.75000 113.250000 2.500000 \n",
|
||
"50% 350.50000 227.500000 4.900000 \n",
|
||
"75% 525.25000 434.250000 7.400000 \n",
|
||
"max 700.00000 598.000000 12.000000 \n",
|
||
"\n",
|
||
" Battery Drain (mAh/day) Number of Apps Installed Data Usage (MB/day) \\\n",
|
||
"count 700.000000 700.000000 700.000000 \n",
|
||
"mean 1525.158571 50.681429 929.742857 \n",
|
||
"std 819.136414 26.943324 640.451729 \n",
|
||
"min 302.000000 10.000000 102.000000 \n",
|
||
"25% 722.250000 26.000000 373.000000 \n",
|
||
"50% 1502.500000 49.000000 823.500000 \n",
|
||
"75% 2229.500000 74.000000 1341.000000 \n",
|
||
"max 2993.000000 99.000000 2497.000000 \n",
|
||
"\n",
|
||
" Age User Behavior Class \n",
|
||
"count 700.000000 700.000000 \n",
|
||
"mean 38.482857 2.990000 \n",
|
||
"std 12.012916 1.401476 \n",
|
||
"min 18.000000 1.000000 \n",
|
||
"25% 28.000000 2.000000 \n",
|
||
"50% 38.000000 3.000000 \n",
|
||
"75% 49.000000 4.000000 \n",
|
||
"max 59.000000 5.000000 "
|
||
]
|
||
},
|
||
"execution_count": 196,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"null_values = df_mobiles.isnull().sum()\n",
|
||
"print(\"Пустые значения по столбцам:\")\n",
|
||
"print(null_values)\n",
|
||
"\n",
|
||
"duplicates = df_mobiles.duplicated().sum()\n",
|
||
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
|
||
"\n",
|
||
"print(\"\\nСтатистический обзор данных:\")\n",
|
||
"df_mobiles.describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Пустых значений и дубликатов нет, проверим на выбросы:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 197,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Количество выбросов в столбце 'App Usage Time (min/day)': 0\n",
|
||
"Количество выбросов в столбце 'Screen On Time (hours/day)': 0\n",
|
||
"Количество выбросов в столбце 'Battery Drain (mAh/day)': 0\n",
|
||
"Количество выбросов в столбце 'Number of Apps Installed': 0\n",
|
||
"Количество выбросов в столбце 'Data Usage (MB/day)': 0\n",
|
||
"Количество выбросов в столбце 'User Behavior Class': 0\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1500x1000 with 6 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Выбираем столбцы для анализа\n",
|
||
"columns_to_check = ['App Usage Time (min/day)', 'Screen On Time (hours/day)', 'Battery Drain (mAh/day)', 'Number of Apps Installed', 'Data Usage (MB/day)', 'User Behavior Class']\n",
|
||
"\n",
|
||
"# Функция для подсчета выбросов\n",
|
||
"def count_outliers(data, columns):\n",
|
||
" outliers_count = {}\n",
|
||
" for col in columns:\n",
|
||
" Q1 = data[col].quantile(0.25)\n",
|
||
" Q3 = data[col].quantile(0.75)\n",
|
||
" IQR = Q3 - Q1\n",
|
||
" lower_bound = Q1 - 1.5 * IQR\n",
|
||
" upper_bound = Q3 + 1.5 * IQR\n",
|
||
" \n",
|
||
" # Считаем количество выбросов\n",
|
||
" outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]\n",
|
||
" outliers_count[col] = len(outliers)\n",
|
||
" \n",
|
||
" return outliers_count\n",
|
||
"\n",
|
||
"# Подсчитываем выбросы\n",
|
||
"outliers_count = count_outliers(df_mobiles, columns_to_check)\n",
|
||
"\n",
|
||
"# Выводим количество выбросов для каждого столбца\n",
|
||
"for col, count in outliers_count.items():\n",
|
||
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
|
||
"\n",
|
||
"# Создаем диаграммы размахов\n",
|
||
"plt.figure(figsize=(15, 10))\n",
|
||
"for i, col in enumerate(columns_to_check, 1):\n",
|
||
" plt.subplot(2, 3, i)\n",
|
||
" sns.boxplot(x=df_mobiles[col])\n",
|
||
" plt.title(f'Box Plot of {col}')\n",
|
||
"plt.tight_layout()\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Выбросов нет"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 198,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Размер обучающей выборки: 420\n",
|
||
"Размер контрольной выборки: 140\n",
|
||
"Размер тестовой выборки: 140\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"train_df, test_df = train_test_split(df_mobiles, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
||
"\n",
|
||
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
||
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
||
"print(\"Размер тестовой выборки:\", len(test_df))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 199,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Распределение \"Класс поведения пользователя\" в обучающей выборке:\n",
|
||
"User Behavior Class\n",
|
||
"2 88\n",
|
||
"5 88\n",
|
||
"4 86\n",
|
||
"3 84\n",
|
||
"1 74\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Класс поведения пользователя\" в контрольной выборке:\n",
|
||
"User Behavior Class\n",
|
||
"1 35\n",
|
||
"2 29\n",
|
||
"4 26\n",
|
||
"5 25\n",
|
||
"3 25\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Класс поведения пользователя\" в тестовой выборке:\n",
|
||
"User Behavior Class\n",
|
||
"3 34\n",
|
||
"2 29\n",
|
||
"4 27\n",
|
||
"1 27\n",
|
||
"5 23\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"def check_balance(df, name):\n",
|
||
" counts = df['User Behavior Class'].value_counts()\n",
|
||
" print(f\"Распределение \\\"Класс поведения пользователя\\\" в {name}:\")\n",
|
||
" print(counts)\n",
|
||
" print()\n",
|
||
"\n",
|
||
"check_balance(train_df, \"обучающей выборке\")\n",
|
||
"check_balance(val_df, \"контрольной выборке\")\n",
|
||
"check_balance(test_df, \"тестовой выборке\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Оверсэмплинг и андерсэмплинг"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 200,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Оверсэмплинг:\n",
|
||
"Распределение \"Класс поведения пользователя\" в обучающей выборке:\n",
|
||
"User Behavior Class\n",
|
||
"1 88\n",
|
||
"2 88\n",
|
||
"5 88\n",
|
||
"4 88\n",
|
||
"3 88\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Класс поведения пользователя\" в контрольной выборке:\n",
|
||
"User Behavior Class\n",
|
||
"5 35\n",
|
||
"3 35\n",
|
||
"1 35\n",
|
||
"2 35\n",
|
||
"4 35\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Класс поведения пользователя\" в тестовой выборке:\n",
|
||
"User Behavior Class\n",
|
||
"4 34\n",
|
||
"1 34\n",
|
||
"2 34\n",
|
||
"3 34\n",
|
||
"5 34\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Андерсэмплинг:\n",
|
||
"Распределение \"Класс поведения пользователя\" в обучающей выборке:\n",
|
||
"User Behavior Class\n",
|
||
"1 74\n",
|
||
"2 74\n",
|
||
"3 74\n",
|
||
"4 74\n",
|
||
"5 74\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Класс поведения пользователя\" в контрольной выборке:\n",
|
||
"User Behavior Class\n",
|
||
"1 25\n",
|
||
"2 25\n",
|
||
"3 25\n",
|
||
"4 25\n",
|
||
"5 25\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Класс поведения пользователя\" в тестовой выборке:\n",
|
||
"User Behavior Class\n",
|
||
"1 23\n",
|
||
"2 23\n",
|
||
"3 23\n",
|
||
"4 23\n",
|
||
"5 23\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from imblearn.over_sampling import RandomOverSampler\n",
|
||
"from imblearn.under_sampling import RandomUnderSampler\n",
|
||
"\n",
|
||
"def oversample(df, target_column):\n",
|
||
" X = df.drop(target_column, axis=1)\n",
|
||
" y = df[target_column]\n",
|
||
" \n",
|
||
" oversampler = RandomOverSampler(random_state=42)\n",
|
||
" x_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
||
" \n",
|
||
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1) \n",
|
||
" return resampled_df\n",
|
||
"\n",
|
||
"def undersample(df, target_column):\n",
|
||
" X = df.drop(target_column, axis=1)\n",
|
||
" y = df[target_column]\n",
|
||
" \n",
|
||
" undersampler = RandomUnderSampler(random_state=42)\n",
|
||
" x_resampled, y_resampled = undersampler.fit_resample(X, y) # type: ignore\n",
|
||
" \n",
|
||
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1)\n",
|
||
" return resampled_df\n",
|
||
"\n",
|
||
"train_df_oversampled = oversample(train_df, 'User Behavior Class')\n",
|
||
"val_df_oversampled = oversample(val_df, 'User Behavior Class')\n",
|
||
"test_df_oversampled = oversample(test_df, 'User Behavior Class')\n",
|
||
"\n",
|
||
"train_df_undersampled = undersample(train_df, 'User Behavior Class')\n",
|
||
"val_df_undersampled = undersample(val_df, 'User Behavior Class')\n",
|
||
"test_df_undersampled = undersample(test_df, 'User Behavior Class')\n",
|
||
"\n",
|
||
"print(\"Оверсэмплинг:\")\n",
|
||
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
|
||
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
|
||
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
|
||
"\n",
|
||
"print(\"Андерсэмплинг:\")\n",
|
||
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
|
||
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
|
||
"check_balance(test_df_undersampled, \"тестовой выборке\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Датасет №2 (Характеристики автомобиля: данные об экономии топлива)\n",
|
||
"Ссылка: https://www.kaggle.com/datasets/arslaan5/explore-car-performance-fuel-efficiency-data\n",
|
||
"\n",
|
||
"Проблемная область: производительность и экономичность транспортных средств.\n",
|
||
"\n",
|
||
"Объекты наблюдения: автомобили, представленные набором характеристик."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 201,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['city_mpg', 'class', 'combination_mpg', 'cylinders', 'displacement',\n",
|
||
" 'drive', 'fuel_type', 'highway_mpg', 'make', 'model', 'transmission',\n",
|
||
" 'year'],\n",
|
||
" dtype='object')\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 550 entries, 0 to 549\n",
|
||
"Data columns (total 12 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 city_mpg 550 non-null int64 \n",
|
||
" 1 class 550 non-null object \n",
|
||
" 2 combination_mpg 550 non-null int64 \n",
|
||
" 3 cylinders 548 non-null float64\n",
|
||
" 4 displacement 548 non-null float64\n",
|
||
" 5 drive 550 non-null object \n",
|
||
" 6 fuel_type 550 non-null object \n",
|
||
" 7 highway_mpg 550 non-null int64 \n",
|
||
" 8 make 550 non-null object \n",
|
||
" 9 model 550 non-null object \n",
|
||
" 10 transmission 550 non-null object \n",
|
||
" 11 year 550 non-null int64 \n",
|
||
"dtypes: float64(2), int64(4), object(6)\n",
|
||
"memory usage: 51.7+ KB\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>city_mpg</th>\n",
|
||
" <th>class</th>\n",
|
||
" <th>combination_mpg</th>\n",
|
||
" <th>cylinders</th>\n",
|
||
" <th>displacement</th>\n",
|
||
" <th>drive</th>\n",
|
||
" <th>fuel_type</th>\n",
|
||
" <th>highway_mpg</th>\n",
|
||
" <th>make</th>\n",
|
||
" <th>model</th>\n",
|
||
" <th>transmission</th>\n",
|
||
" <th>year</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>25</td>\n",
|
||
" <td>midsize car</td>\n",
|
||
" <td>29</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>fwd</td>\n",
|
||
" <td>gas</td>\n",
|
||
" <td>36</td>\n",
|
||
" <td>mazda</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>m</td>\n",
|
||
" <td>2014</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>26</td>\n",
|
||
" <td>midsize car</td>\n",
|
||
" <td>30</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>fwd</td>\n",
|
||
" <td>gas</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>mazda</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>a</td>\n",
|
||
" <td>2014</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>25</td>\n",
|
||
" <td>small sport utility vehicle</td>\n",
|
||
" <td>27</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>2.5</td>\n",
|
||
" <td>fwd</td>\n",
|
||
" <td>gas</td>\n",
|
||
" <td>31</td>\n",
|
||
" <td>mazda</td>\n",
|
||
" <td>cx-5 2wd</td>\n",
|
||
" <td>a</td>\n",
|
||
" <td>2014</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>26</td>\n",
|
||
" <td>small sport utility vehicle</td>\n",
|
||
" <td>29</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>fwd</td>\n",
|
||
" <td>gas</td>\n",
|
||
" <td>34</td>\n",
|
||
" <td>mazda</td>\n",
|
||
" <td>cx-5 2wd</td>\n",
|
||
" <td>m</td>\n",
|
||
" <td>2014</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>26</td>\n",
|
||
" <td>small sport utility vehicle</td>\n",
|
||
" <td>28</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>fwd</td>\n",
|
||
" <td>gas</td>\n",
|
||
" <td>32</td>\n",
|
||
" <td>mazda</td>\n",
|
||
" <td>cx-5 2wd</td>\n",
|
||
" <td>a</td>\n",
|
||
" <td>2014</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" city_mpg class combination_mpg cylinders \\\n",
|
||
"0 25 midsize car 29 4.0 \n",
|
||
"1 26 midsize car 30 4.0 \n",
|
||
"2 25 small sport utility vehicle 27 4.0 \n",
|
||
"3 26 small sport utility vehicle 29 4.0 \n",
|
||
"4 26 small sport utility vehicle 28 4.0 \n",
|
||
"\n",
|
||
" displacement drive fuel_type highway_mpg make model transmission \\\n",
|
||
"0 2.5 fwd gas 36 mazda 6 m \n",
|
||
"1 2.5 fwd gas 37 mazda 6 a \n",
|
||
"2 2.5 fwd gas 31 mazda cx-5 2wd a \n",
|
||
"3 2.0 fwd gas 34 mazda cx-5 2wd m \n",
|
||
"4 2.0 fwd gas 32 mazda cx-5 2wd a \n",
|
||
"\n",
|
||
" year \n",
|
||
"0 2014 \n",
|
||
"1 2014 \n",
|
||
"2 2014 \n",
|
||
"3 2014 \n",
|
||
"4 2014 "
|
||
]
|
||
},
|
||
"execution_count": 201,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df_cars = pd.read_csv(\".//static//csv//car_data.csv\")\n",
|
||
"print(df_cars.columns)\n",
|
||
"df_cars.info()\n",
|
||
"df_cars.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Атрибуты объектов:\n",
|
||
"\n",
|
||
"1. city_mpg — топливная экономичность в городе (миль на галлон; чем выше значение, тем ниже расход топлива).\n",
|
||
"2. class — класс автомобиля (например, седан среднего размера, малый внедорожник).\n",
|
||
"3. combination_mpg — комбинированная топливная экономичность (миль на галлон; чем выше значение, тем ниже расход топлива).\n",
|
||
"4. cylinders — количество цилиндров.\n",
|
||
"5. displacement — объем двигателя (в литрах).\n",
|
||
"6. drive — тип привода (например, передний, полный).\n",
|
||
"7. fuel_type — тип топлива (бензин, дизель и др.).\n",
|
||
"8. highway_mpg — топливная экономичность на шоссе (миль на галлон; чем выше значение, тем ниже расход топлива).\n",
|
||
"9. make — марка автомобиля.\n",
|
||
"10. model — модель автомобиля.\n",
|
||
"11. transmission — тип трансмиссии (автоматическая, механическая).\n",
|
||
"12. year — год выпуска автомобиля.\n",
|
||
"\n",
|
||
"Связи между объектами:\n",
|
||
"Атрибуты, такие как объем двигателя, тип топлива, количество цилиндров и класс автомобиля, могут быть связаны с комбинированным расходом топлива (combination_mpg). Это позволяет выявлять зависимости между характеристиками автомобиля и его экономичностью.\n",
|
||
"\n",
|
||
"Примеры бизнес-целей и эффекты для бизнеса:\n",
|
||
"\n",
|
||
"1. Оптимизация ассортимента автомобилей:\n",
|
||
" - Бизнес-цель: Анализировать топливную экономичность различных моделей для оптимизации ассортимента, предлагать более популярные и экономичные модели.\n",
|
||
" - Эффект: Снижение затрат на производство низкоэффективных моделей и увеличение продаж популярных, экономичных автомобилей.\n",
|
||
"\n",
|
||
"2. Снижение углеродного следа:\n",
|
||
" - Бизнес-цель: Определение моделей с высоким расходом топлива для улучшения их эффективности и снижения выбросов.\n",
|
||
" - Эффект: Соответствие экологическим стандартам, улучшение репутации компании и соблюдение требований законодательства.\n",
|
||
"\n",
|
||
"Примеры целей технического проекта:\n",
|
||
"\n",
|
||
"1. Цель: Создание модели для прогнозирования топливной эффективности.\n",
|
||
" - Вход: Объем двигателя, тип топлива, количество цилиндров, класс, тип трансмиссии.\n",
|
||
" - Целевой признак: combination_mpg.\n",
|
||
"\n",
|
||
"2. Цель: Модель для предсказания углеродного следа автомобиля.\n",
|
||
" - Вход: Тип топлива, объем двигателя, класс автомобиля, тип привода.\n",
|
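||
" - Целевой признак: combination_mpg (как косвенный показатель выбросов: чем ниже значение, тем больше расход топлива и углеродный след)."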
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Проверка на пустые значения и дубликаты"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 202,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Пустые значения по столбцам:\n",
|
||
"city_mpg 0\n",
|
||
"class 0\n",
|
||
"combination_mpg 0\n",
|
||
"cylinders 2\n",
|
||
"displacement 2\n",
|
||
"drive 0\n",
|
||
"fuel_type 0\n",
|
||
"highway_mpg 0\n",
|
||
"make 0\n",
|
||
"model 0\n",
|
||
"transmission 0\n",
|
||
"year 0\n",
|
||
"dtype: int64\n",
|
||
"\n",
|
||
"Количество дубликатов: 2\n",
|
||
"\n",
|
||
"Статистический обзор данных:\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>city_mpg</th>\n",
|
||
" <th>combination_mpg</th>\n",
|
||
" <th>cylinders</th>\n",
|
||
" <th>displacement</th>\n",
|
||
" <th>highway_mpg</th>\n",
|
||
" <th>year</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>548.000000</td>\n",
|
||
" <td>548.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>21.460000</td>\n",
|
||
" <td>24.069091</td>\n",
|
||
" <td>5.315693</td>\n",
|
||
" <td>2.931752</td>\n",
|
||
" <td>28.609091</td>\n",
|
||
" <td>2019.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>8.147392</td>\n",
|
||
" <td>7.478369</td>\n",
|
||
" <td>1.759999</td>\n",
|
||
" <td>1.248419</td>\n",
|
||
" <td>6.832228</td>\n",
|
||
" <td>3.165156</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>11.000000</td>\n",
|
||
" <td>14.000000</td>\n",
|
||
" <td>3.000000</td>\n",
|
||
" <td>1.200000</td>\n",
|
||
" <td>18.000000</td>\n",
|
||
" <td>2014.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>17.000000</td>\n",
|
||
" <td>20.000000</td>\n",
|
||
" <td>4.000000</td>\n",
|
||
" <td>2.000000</td>\n",
|
||
" <td>24.000000</td>\n",
|
||
" <td>2016.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>20.000000</td>\n",
|
||
" <td>23.000000</td>\n",
|
||
" <td>4.000000</td>\n",
|
||
" <td>2.500000</td>\n",
|
||
" <td>28.000000</td>\n",
|
||
" <td>2019.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>24.000000</td>\n",
|
||
" <td>27.000000</td>\n",
|
||
" <td>6.000000</td>\n",
|
||
" <td>3.500000</td>\n",
|
||
" <td>32.000000</td>\n",
|
||
" <td>2022.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>126.000000</td>\n",
|
||
" <td>112.000000</td>\n",
|
||
" <td>12.000000</td>\n",
|
||
" <td>6.800000</td>\n",
|
||
" <td>102.000000</td>\n",
|
||
" <td>2024.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" city_mpg combination_mpg cylinders displacement highway_mpg \\\n",
|
||
"count 550.000000 550.000000 548.000000 548.000000 550.000000 \n",
|
||
"mean 21.460000 24.069091 5.315693 2.931752 28.609091 \n",
|
||
"std 8.147392 7.478369 1.759999 1.248419 6.832228 \n",
|
||
"min 11.000000 14.000000 3.000000 1.200000 18.000000 \n",
|
||
"25% 17.000000 20.000000 4.000000 2.000000 24.000000 \n",
|
||
"50% 20.000000 23.000000 4.000000 2.500000 28.000000 \n",
|
||
"75% 24.000000 27.000000 6.000000 3.500000 32.000000 \n",
|
||
"max 126.000000 112.000000 12.000000 6.800000 102.000000 \n",
|
||
"\n",
|
||
" year \n",
|
||
"count 550.000000 \n",
|
||
"mean 2019.000000 \n",
|
||
"std 3.165156 \n",
|
||
"min 2014.000000 \n",
|
||
"25% 2016.000000 \n",
|
||
"50% 2019.000000 \n",
|
||
"75% 2022.000000 \n",
|
||
"max 2024.000000 "
|
||
]
|
||
},
|
||
"execution_count": 202,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"null_values = df_cars.isnull().sum()\n",
|
||
"print(\"Пустые значения по столбцам:\")\n",
|
||
"print(null_values)\n",
|
||
"\n",
|
||
"duplicates = df_cars.duplicated().sum()\n",
|
||
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
|
||
"\n",
|
||
"print(\"\\nСтатистический обзор данных:\")\n",
|
||
"df_cars.describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Видим, что есть пустые значения и дубликаты. Удаляем их:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 203,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"В наборе данных 'Cars' было удалено 2 строк с пустыми значениями.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df_cars = df_cars.drop_duplicates()\n",
|
||
"\n",
|
||
"def drop_missing_values(dataframe, name):\n",
|
||
" before_shape = dataframe.shape \n",
|
||
" cleaned_dataframe = dataframe.dropna() \n",
|
||
" after_shape = cleaned_dataframe.shape \n",
|
||
" print(f\"В наборе данных '{name}' было удалено {before_shape[0] - after_shape[0]} строк с пустыми значениями.\")\n",
|
||
" return cleaned_dataframe\n",
|
||
"\n",
|
||
"df_cars = drop_missing_values(df_cars, \"Cars\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Проверка на выбросы:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 204,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Количество выбросов в столбце 'combination_mpg': 8\n",
|
||
"Количество выбросов в столбце 'cylinders': 10\n",
|
||
"Количество выбросов в столбце 'displacement': 21\n",
|
||
"Количество выбросов в столбце 'highway_mpg': 3\n",
|
||
"Количество выбросов в столбце 'city_mpg': 9\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1500x1000 with 5 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Выбираем столбцы для анализа\n",
|
||
"columns_to_check = ['combination_mpg', 'cylinders', 'displacement', 'highway_mpg', 'city_mpg']\n",
|
||
"\n",
|
||
"# Подсчитываем выбросы\n",
|
||
"outliers_count = count_outliers(df_cars, columns_to_check)\n",
|
||
"\n",
|
||
"# Выводим количество выбросов для каждого столбца\n",
|
||
"for col, count in outliers_count.items():\n",
|
||
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
|
||
"\n",
|
||
"# Создаем диаграммы размахов\n",
|
||
"plt.figure(figsize=(15, 10))\n",
|
||
"for i, col in enumerate(columns_to_check, 1):\n",
|
||
" plt.subplot(2, 3, i)\n",
|
||
" sns.boxplot(x=df_cars[col])\n",
|
||
" plt.title(f'Box Plot of {col}')\n",
|
||
"plt.tight_layout()\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"В каждом из выбранных столбцов присутствуют выбросы. Удалим строки с выбросами."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 205,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Количество удаленных строк: 36\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Figure size 1500x600 with 0 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1500x1000 with 5 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 0 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Выбираем столбцы для очистки\n",
|
||
"columns_to_clean = ['combination_mpg', 'cylinders', 'displacement', 'highway_mpg', 'city_mpg']\n",
|
||
"\n",
|
||
"# Функция для удаления выбросов\n",
|
||
"def remove_outliers(df, columns):\n",
|
||
" for col in columns:\n",
|
||
" Q1 = df[col].quantile(0.25)\n",
|
||
" Q3 = df[col].quantile(0.75)\n",
|
||
" IQR = Q3 - Q1\n",
|
||
" lower_bound = Q1 - 1.5 * IQR\n",
|
||
" upper_bound = Q3 + 1.5 * IQR\n",
|
||
" \n",
|
||
" # Удаляем строки, содержащие выбросы\n",
|
||
" df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
|
||
" \n",
|
||
" return df\n",
|
||
"\n",
|
||
"# Удаляем выбросы\n",
|
||
"df_cars_clean = remove_outliers(df_cars, columns_to_clean)\n",
|
||
"\n",
|
||
"# Выводим количество удаленных строк\n",
|
||
"print(f\"Количество удаленных строк: {len(df_cars) - len(df_cars_clean)}\")\n",
|
||
"\n",
|
||
"# Создаем диаграммы размаха для очищенных данных\n",
|
||
"plt.figure(figsize=(15, 6))\n",
|
||
"\n",
|
||
"# Создаем диаграммы размахов\n",
|
||
"plt.figure(figsize=(15, 10))\n",
|
||
"for i, col in enumerate(columns_to_clean, 1):\n",
|
||
" plt.subplot(2, 3, i)\n",
|
||
" sns.boxplot(x=df_cars_clean[col])\n",
|
||
" plt.title(f'Box Plot of {col}')\n",
|
||
"plt.tight_layout()\n",
|
||
"plt.show()\n",
|
||
"\n",
|
||
"plt.tight_layout()\n",
|
||
"plt.show()\n",
|
||
"\n",
|
||
"df_cars = df_cars_clean"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 206,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Размер обучающей выборки: 306\n",
|
||
"Размер контрольной выборки: 102\n",
|
||
"Размер тестовой выборки: 102\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"train_df, test_df = train_test_split(df_cars, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
||
"\n",
|
||
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
||
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
||
"print(\"Размер тестовой выборки:\", len(test_df))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 207,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Распределение \"Комбинированный расход топлива\" в обучающей выборке:\n",
|
||
"combination_mpg\n",
|
||
"23 32\n",
|
||
"22 29\n",
|
||
"24 23\n",
|
||
"25 22\n",
|
||
"27 22\n",
|
||
"18 21\n",
|
||
"19 19\n",
|
||
"29 18\n",
|
||
"21 18\n",
|
||
"26 17\n",
|
||
"31 16\n",
|
||
"28 14\n",
|
||
"20 13\n",
|
||
"32 12\n",
|
||
"17 11\n",
|
||
"30 10\n",
|
||
"16 3\n",
|
||
"34 3\n",
|
||
"36 1\n",
|
||
"33 1\n",
|
||
"14 1\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Комбинированный расход топлива\" в контрольной выборке:\n",
|
||
"combination_mpg\n",
|
||
"20 17\n",
|
||
"19 15\n",
|
||
"21 13\n",
|
||
"26 9\n",
|
||
"27 7\n",
|
||
"22 6\n",
|
||
"30 5\n",
|
||
"23 5\n",
|
||
"18 4\n",
|
||
"17 3\n",
|
||
"24 3\n",
|
||
"28 3\n",
|
||
"29 3\n",
|
||
"25 2\n",
|
||
"34 2\n",
|
||
"33 2\n",
|
||
"32 1\n",
|
||
"14 1\n",
|
||
"31 1\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Комбинированный расход топлива\" в тестовой выборке:\n",
|
||
"combination_mpg\n",
|
||
"21 14\n",
|
||
"18 13\n",
|
||
"22 12\n",
|
||
"27 12\n",
|
||
"23 10\n",
|
||
"31 5\n",
|
||
"20 5\n",
|
||
"26 5\n",
|
||
"24 4\n",
|
||
"29 4\n",
|
||
"28 4\n",
|
||
"19 4\n",
|
||
"25 3\n",
|
||
"32 3\n",
|
||
"17 3\n",
|
||
"30 1\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"def check_balance(df, name):\n",
|
||
" counts = df['combination_mpg'].value_counts()\n",
|
||
" print(f\"Распределение \\\"Комбинированный расход топлива\\\" в {name}:\")\n",
|
||
" print(counts)\n",
|
||
" print()\n",
|
||
"\n",
|
||
"check_balance(train_df, \"обучающей выборке\")\n",
|
||
"check_balance(val_df, \"контрольной выборке\")\n",
|
||
"check_balance(test_df, \"тестовой выборке\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Оверсэмплинг и андерсэмплинг"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 208,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Оверсэмплинг:\n",
|
||
"Распределение \"Комбинированный расход топлива\" в обучающей выборке:\n",
|
||
"combination_mpg\n",
|
||
"21 32\n",
|
||
"22 32\n",
|
||
"25 32\n",
|
||
"19 32\n",
|
||
"29 32\n",
|
||
"23 32\n",
|
||
"28 32\n",
|
||
"18 32\n",
|
||
"27 32\n",
|
||
"20 32\n",
|
||
"16 32\n",
|
||
"30 32\n",
|
||
"32 32\n",
|
||
"31 32\n",
|
||
"24 32\n",
|
||
"26 32\n",
|
||
"17 32\n",
|
||
"36 32\n",
|
||
"34 32\n",
|
||
"33 32\n",
|
||
"14 32\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Комбинированный расход топлива\" в контрольной выборке:\n",
|
||
"combination_mpg\n",
|
||
"20 17\n",
|
||
"19 17\n",
|
||
"17 17\n",
|
||
"27 17\n",
|
||
"22 17\n",
|
||
"26 17\n",
|
||
"24 17\n",
|
||
"32 17\n",
|
||
"21 17\n",
|
||
"18 17\n",
|
||
"30 17\n",
|
||
"23 17\n",
|
||
"29 17\n",
|
||
"28 17\n",
|
||
"34 17\n",
|
||
"25 17\n",
|
||
"14 17\n",
|
||
"33 17\n",
|
||
"31 17\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Комбинированный расход топлива\" в тестовой выборке:\n",
|
||
"combination_mpg\n",
|
||
"28 14\n",
|
||
"32 14\n",
|
||
"30 14\n",
|
||
"23 14\n",
|
||
"20 14\n",
|
||
"26 14\n",
|
||
"21 14\n",
|
||
"18 14\n",
|
||
"27 14\n",
|
||
"25 14\n",
|
||
"22 14\n",
|
||
"19 14\n",
|
||
"29 14\n",
|
||
"24 14\n",
|
||
"31 14\n",
|
||
"17 14\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Андерсэмплинг:\n",
|
||
"Распределение \"Комбинированный расход топлива\" в обучающей выборке:\n",
|
||
"combination_mpg\n",
|
||
"14 1\n",
|
||
"16 1\n",
|
||
"17 1\n",
|
||
"18 1\n",
|
||
"19 1\n",
|
||
"20 1\n",
|
||
"21 1\n",
|
||
"22 1\n",
|
||
"23 1\n",
|
||
"24 1\n",
|
||
"25 1\n",
|
||
"26 1\n",
|
||
"27 1\n",
|
||
"28 1\n",
|
||
"29 1\n",
|
||
"30 1\n",
|
||
"31 1\n",
|
||
"32 1\n",
|
||
"33 1\n",
|
||
"34 1\n",
|
||
"36 1\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Комбинированный расход топлива\" в контрольной выборке:\n",
|
||
"combination_mpg\n",
|
||
"14 1\n",
|
||
"17 1\n",
|
||
"18 1\n",
|
||
"19 1\n",
|
||
"20 1\n",
|
||
"21 1\n",
|
||
"22 1\n",
|
||
"23 1\n",
|
||
"24 1\n",
|
||
"25 1\n",
|
||
"26 1\n",
|
||
"27 1\n",
|
||
"28 1\n",
|
||
"29 1\n",
|
||
"30 1\n",
|
||
"31 1\n",
|
||
"32 1\n",
|
||
"33 1\n",
|
||
"34 1\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Комбинированный расход топлива\" в тестовой выборке:\n",
|
||
"combination_mpg\n",
|
||
"17 1\n",
|
||
"18 1\n",
|
||
"19 1\n",
|
||
"20 1\n",
|
||
"21 1\n",
|
||
"22 1\n",
|
||
"23 1\n",
|
||
"24 1\n",
|
||
"25 1\n",
|
||
"26 1\n",
|
||
"27 1\n",
|
||
"28 1\n",
|
||
"29 1\n",
|
||
"30 1\n",
|
||
"31 1\n",
|
||
"32 1\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"train_df_oversampled = oversample(train_df, 'combination_mpg')\n",
|
||
"val_df_oversampled = oversample(val_df, 'combination_mpg')\n",
|
||
"test_df_oversampled = oversample(test_df, 'combination_mpg')\n",
|
||
"\n",
|
||
"train_df_undersampled = undersample(train_df, 'combination_mpg')\n",
|
||
"val_df_undersampled = undersample(val_df, 'combination_mpg')\n",
|
||
"test_df_undersampled = undersample(test_df, 'combination_mpg')\n",
|
||
"\n",
|
||
"print(\"Оверсэмплинг:\")\n",
|
||
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
|
||
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
|
||
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
|
||
"\n",
|
||
"print(\"Андерсэмплинг:\")\n",
|
||
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
|
||
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
|
||
"check_balance(test_df_undersampled, \"тестовой выборке\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Датасет №3 (Экономика стран)\n",
|
||
"Ссылка: https://www.kaggle.com/datasets/pratik453609/economic-data-9-countries-19802020\n",
|
||
"\n",
|
||
"Проблемная область: экономический анализ и прогнозирование макроэкономических показателей.\n",
|
||
"\n",
|
||
"Объекты наблюдения: экономические индексы по странам за определённые годы."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 209,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['stock index', 'country', 'year', 'index price', 'log_indexprice',\n",
|
||
" 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n",
|
||
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n",
|
||
" 'tradebalance', 'USTreasury'],\n",
|
||
" dtype='object')\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 369 entries, 0 to 368\n",
|
||
"Data columns (total 14 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 stock index 369 non-null object \n",
|
||
" 1 country 369 non-null object \n",
|
||
" 2 year 369 non-null float64\n",
|
||
" 3 index price 317 non-null float64\n",
|
||
" 4 log_indexprice 369 non-null float64\n",
|
||
" 5 inflationrate 326 non-null float64\n",
|
||
" 6 oil prices 369 non-null float64\n",
|
||
" 7 exchange_rate 367 non-null float64\n",
|
||
" 8 gdppercent 350 non-null float64\n",
|
||
" 9 percapitaincome 368 non-null float64\n",
|
||
" 10 unemploymentrate 348 non-null float64\n",
|
||
" 11 manufacturingoutput 278 non-null float64\n",
|
||
" 12 tradebalance 365 non-null float64\n",
|
||
" 13 USTreasury 369 non-null float64\n",
|
||
"dtypes: float64(12), object(2)\n",
|
||
"memory usage: 40.5+ KB\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>stock index</th>\n",
|
||
" <th>country</th>\n",
|
||
" <th>year</th>\n",
|
||
" <th>index price</th>\n",
|
||
" <th>log_indexprice</th>\n",
|
||
" <th>inflationrate</th>\n",
|
||
" <th>oil prices</th>\n",
|
||
" <th>exchange_rate</th>\n",
|
||
" <th>gdppercent</th>\n",
|
||
" <th>percapitaincome</th>\n",
|
||
" <th>unemploymentrate</th>\n",
|
||
" <th>manufacturingoutput</th>\n",
|
||
" <th>tradebalance</th>\n",
|
||
" <th>USTreasury</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>NASDAQ</td>\n",
|
||
" <td>United States of America</td>\n",
|
||
" <td>1980.0</td>\n",
|
||
" <td>168.61</td>\n",
|
||
" <td>2.23</td>\n",
|
||
" <td>0.14</td>\n",
|
||
" <td>21.59</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>12575.0</td>\n",
|
||
" <td>0.07</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-13.06</td>\n",
|
||
" <td>0.11</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>NASDAQ</td>\n",
|
||
" <td>United States of America</td>\n",
|
||
" <td>1981.0</td>\n",
|
||
" <td>203.15</td>\n",
|
||
" <td>2.31</td>\n",
|
||
" <td>0.10</td>\n",
|
||
" <td>31.77</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.12</td>\n",
|
||
" <td>13976.0</td>\n",
|
||
" <td>0.08</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-12.52</td>\n",
|
||
" <td>0.14</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>NASDAQ</td>\n",
|
||
" <td>United States of America</td>\n",
|
||
" <td>1982.0</td>\n",
|
||
" <td>188.98</td>\n",
|
||
" <td>2.28</td>\n",
|
||
" <td>0.06</td>\n",
|
||
" <td>28.52</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.04</td>\n",
|
||
" <td>14434.0</td>\n",
|
||
" <td>0.10</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-19.97</td>\n",
|
||
" <td>0.13</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>NASDAQ</td>\n",
|
||
" <td>United States of America</td>\n",
|
||
" <td>1983.0</td>\n",
|
||
" <td>285.43</td>\n",
|
||
" <td>2.46</td>\n",
|
||
" <td>0.03</td>\n",
|
||
" <td>26.19</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>15544.0</td>\n",
|
||
" <td>0.10</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-51.64</td>\n",
|
||
" <td>0.11</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>NASDAQ</td>\n",
|
||
" <td>United States of America</td>\n",
|
||
" <td>1984.0</td>\n",
|
||
" <td>248.89</td>\n",
|
||
" <td>2.40</td>\n",
|
||
" <td>0.04</td>\n",
|
||
" <td>25.88</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.11</td>\n",
|
||
" <td>17121.0</td>\n",
|
||
" <td>0.08</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-102.73</td>\n",
|
||
" <td>0.12</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" stock index country year index price log_indexprice \\\n",
|
||
"0 NASDAQ United States of America 1980.0 168.61 2.23 \n",
|
||
"1 NASDAQ United States of America 1981.0 203.15 2.31 \n",
|
||
"2 NASDAQ United States of America 1982.0 188.98 2.28 \n",
|
||
"3 NASDAQ United States of America 1983.0 285.43 2.46 \n",
|
||
"4 NASDAQ United States of America 1984.0 248.89 2.40 \n",
|
||
"\n",
|
||
" inflationrate oil prices exchange_rate gdppercent percapitaincome \\\n",
|
||
"0 0.14 21.59 1.0 0.09 12575.0 \n",
|
||
"1 0.10 31.77 1.0 0.12 13976.0 \n",
|
||
"2 0.06 28.52 1.0 0.04 14434.0 \n",
|
||
"3 0.03 26.19 1.0 0.09 15544.0 \n",
|
||
"4 0.04 25.88 1.0 0.11 17121.0 \n",
|
||
"\n",
|
||
" unemploymentrate manufacturingoutput tradebalance USTreasury \n",
|
||
"0 0.07 NaN -13.06 0.11 \n",
|
||
"1 0.08 NaN -12.52 0.14 \n",
|
||
"2 0.10 NaN -19.97 0.13 \n",
|
||
"3 0.10 NaN -51.64 0.11 \n",
|
||
"4 0.08 NaN -102.73 0.12 "
|
||
]
|
||
},
|
||
"execution_count": 209,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df_countries = pd.read_csv(\".//static//csv//Economic Data - 9 Countries (1980-2020).csv\")\n",
|
||
"print(df_countries.columns)\n",
|
||
"df_countries.info()\n",
|
||
"df_countries.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Атрибуты объектов:\n",
|
||
"1. stock index — индекс акций.\n",
|
||
"2. country — страна.\n",
|
||
"3. year — год.\n",
|
||
"4. index price — цена индекса.\n",
|
||
"5. log_indexprice — логарифм цены индекса.\n",
|
||
"6. inflationrate — уровень инфляции.\n",
|
||
"7. oil prices — цены на нефть.\n",
|
||
"8. exchange_rate — валютный курс.\n",
|
||
"9. gdppercent — процент роста ВВП.\n",
|
||
"10. percapitaincome — доход на душу населения.\n",
|
||
"11. unemploymentrate — уровень безработицы.\n",
|
||
"12. manufacturingoutput — объём производства.\n",
|
||
"13. tradebalance — торговый баланс.\n",
|
||
"14. USTreasury — доходность казначейских облигаций США.\n",
|
||
"\n",
|
||
"Связи между объектами:\n",
|
||
"Некоторые атрибуты могут быть связаны друг с другом, например, уровень инфляции и процент роста ВВП могут коррелировать с ценами на нефть, уровнем безработицы и торговым балансом.\n",
|
||
"\n",
|
||
"Примеры бизнес-целей и эффект:\n",
|
||
"1. Прогнозирование экономического роста и планирование инвестиций:\n",
|
||
" - Бизнес-цель: Создать модель прогнозирования роста экономики для стран, чтобы принять стратегические инвестиционные решения.\n",
|
||
" - Эффект: Повышение точности экономических прогнозов и улучшение прибыльности инвестиционных стратегий.\n",
|
||
"\n",
|
||
"2. Анализ и оптимизация торговой политики:\n",
|
||
" - Бизнес-цель: Изучение влияния изменений торгового баланса и валютных курсов на экономику стран.\n",
|
||
" - Эффект: Улучшение торговых соглашений и политики, что приведёт к более устойчивому экономическому росту.\n",
|
||
"\n",
|
||
"Примеры целей технического проекта:\n",
|
||
"1. Цель: Построение модели для прогнозирования уровня инфляции.\n",
|
||
" - Вход: Уровень безработицы, ВВП, доход на душу населения, валютный курс, цены на нефть.\n",
|
||
" - Целевой признак: inflationrate.\n",
|
||
"\n",
|
||
"2. Цель: Построение модели для оценки экономического роста.\n",
|
||
" - Вход: Торговый баланс, доход на душу населения, валютный курс, инфляция.\n",
|
||
" - Целевой признак: gdppercent."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Проверка на пустые значения и дубликаты"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 210,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Пустые значения по столбцам:\n",
|
||
"stock index 0\n",
|
||
"country 0\n",
|
||
"year 0\n",
|
||
"index price 52\n",
|
||
"log_indexprice 0\n",
|
||
"inflationrate 43\n",
|
||
"oil prices 0\n",
|
||
"exchange_rate 2\n",
|
||
"gdppercent 19\n",
|
||
"percapitaincome 1\n",
|
||
"unemploymentrate 21\n",
|
||
"manufacturingoutput 91\n",
|
||
"tradebalance 4\n",
|
||
"USTreasury 0\n",
|
||
"dtype: int64\n",
|
||
"\n",
|
||
"Количество дубликатов: 0\n",
|
||
"\n",
|
||
"Статистический обзор данных:\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>year</th>\n",
|
||
" <th>index price</th>\n",
|
||
" <th>log_indexprice</th>\n",
|
||
" <th>inflationrate</th>\n",
|
||
" <th>oil prices</th>\n",
|
||
" <th>exchange_rate</th>\n",
|
||
" <th>gdppercent</th>\n",
|
||
" <th>percapitaincome</th>\n",
|
||
" <th>unemploymentrate</th>\n",
|
||
" <th>manufacturingoutput</th>\n",
|
||
" <th>tradebalance</th>\n",
|
||
" <th>USTreasury</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>369.000000</td>\n",
|
||
" <td>317.000000</td>\n",
|
||
" <td>369.000000</td>\n",
|
||
" <td>326.000000</td>\n",
|
||
" <td>369.000000</td>\n",
|
||
" <td>367.000000</td>\n",
|
||
" <td>350.000000</td>\n",
|
||
" <td>368.000000</td>\n",
|
||
" <td>348.000000</td>\n",
|
||
" <td>278.000000</td>\n",
|
||
" <td>365.000000</td>\n",
|
||
" <td>369.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>2000.000000</td>\n",
|
||
" <td>7898.648297</td>\n",
|
||
" <td>3.610542</td>\n",
|
||
" <td>0.041748</td>\n",
|
||
" <td>39.743171</td>\n",
|
||
" <td>27.897548</td>\n",
|
||
" <td>0.037114</td>\n",
|
||
" <td>20719.964674</td>\n",
|
||
" <td>0.068908</td>\n",
|
||
" <td>328.084820</td>\n",
|
||
" <td>-15.996384</td>\n",
|
||
" <td>0.059024</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>11.848225</td>\n",
|
||
" <td>7811.336862</td>\n",
|
||
" <td>0.482481</td>\n",
|
||
" <td>0.039579</td>\n",
|
||
" <td>25.452654</td>\n",
|
||
" <td>49.620521</td>\n",
|
||
" <td>0.037850</td>\n",
|
||
" <td>17435.037783</td>\n",
|
||
" <td>0.043207</td>\n",
|
||
" <td>622.395923</td>\n",
|
||
" <td>154.557170</td>\n",
|
||
" <td>0.033086</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>1980.000000</td>\n",
|
||
" <td>168.610000</td>\n",
|
||
" <td>2.230000</td>\n",
|
||
" <td>-0.040000</td>\n",
|
||
" <td>11.350000</td>\n",
|
||
" <td>0.900000</td>\n",
|
||
" <td>-0.110000</td>\n",
|
||
" <td>27.000000</td>\n",
|
||
" <td>0.020000</td>\n",
|
||
" <td>0.590000</td>\n",
|
||
" <td>-770.930000</td>\n",
|
||
" <td>0.010000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>1990.000000</td>\n",
|
||
" <td>2407.100000</td>\n",
|
||
" <td>3.320000</td>\n",
|
||
" <td>0.020000</td>\n",
|
||
" <td>19.410000</td>\n",
|
||
" <td>1.330000</td>\n",
|
||
" <td>0.020000</td>\n",
|
||
" <td>2090.250000</td>\n",
|
||
" <td>0.040000</td>\n",
|
||
" <td>80.380000</td>\n",
|
||
" <td>-25.370000</td>\n",
|
||
" <td>0.030000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>2000.000000</td>\n",
|
||
" <td>5160.100000</td>\n",
|
||
" <td>3.600000</td>\n",
|
||
" <td>0.030000</td>\n",
|
||
" <td>28.520000</td>\n",
|
||
" <td>5.440000</td>\n",
|
||
" <td>0.030000</td>\n",
|
||
" <td>19969.500000</td>\n",
|
||
" <td>0.060000</td>\n",
|
||
" <td>188.160000</td>\n",
|
||
" <td>-0.140000</td>\n",
|
||
" <td>0.050000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>2010.000000</td>\n",
|
||
" <td>10279.500000</td>\n",
|
||
" <td>3.980000</td>\n",
|
||
" <td>0.057500</td>\n",
|
||
" <td>57.880000</td>\n",
|
||
" <td>15.055000</td>\n",
|
||
" <td>0.060000</td>\n",
|
||
" <td>36384.000000</td>\n",
|
||
" <td>0.090000</td>\n",
|
||
" <td>271.977500</td>\n",
|
||
" <td>19.080000</td>\n",
|
||
" <td>0.080000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>2020.000000</td>\n",
|
||
" <td>47751.330000</td>\n",
|
||
" <td>4.680000</td>\n",
|
||
" <td>0.240000</td>\n",
|
||
" <td>98.560000</td>\n",
|
||
" <td>249.050000</td>\n",
|
||
" <td>0.150000</td>\n",
|
||
" <td>65280.000000</td>\n",
|
||
" <td>0.260000</td>\n",
|
||
" <td>3868.460000</td>\n",
|
||
" <td>366.140000</td>\n",
|
||
" <td>0.140000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" year index price log_indexprice inflationrate oil prices \\\n",
|
||
"count 369.000000 317.000000 369.000000 326.000000 369.000000 \n",
|
||
"mean 2000.000000 7898.648297 3.610542 0.041748 39.743171 \n",
|
||
"std 11.848225 7811.336862 0.482481 0.039579 25.452654 \n",
|
||
"min 1980.000000 168.610000 2.230000 -0.040000 11.350000 \n",
|
||
"25% 1990.000000 2407.100000 3.320000 0.020000 19.410000 \n",
|
||
"50% 2000.000000 5160.100000 3.600000 0.030000 28.520000 \n",
|
||
"75% 2010.000000 10279.500000 3.980000 0.057500 57.880000 \n",
|
||
"max 2020.000000 47751.330000 4.680000 0.240000 98.560000 \n",
|
||
"\n",
|
||
" exchange_rate gdppercent percapitaincome unemploymentrate \\\n",
|
||
"count 367.000000 350.000000 368.000000 348.000000 \n",
|
||
"mean 27.897548 0.037114 20719.964674 0.068908 \n",
|
||
"std 49.620521 0.037850 17435.037783 0.043207 \n",
|
||
"min 0.900000 -0.110000 27.000000 0.020000 \n",
|
||
"25% 1.330000 0.020000 2090.250000 0.040000 \n",
|
||
"50% 5.440000 0.030000 19969.500000 0.060000 \n",
|
||
"75% 15.055000 0.060000 36384.000000 0.090000 \n",
|
||
"max 249.050000 0.150000 65280.000000 0.260000 \n",
|
||
"\n",
|
||
" manufacturingoutput tradebalance USTreasury \n",
|
||
"count 278.000000 365.000000 369.000000 \n",
|
||
"mean 328.084820 -15.996384 0.059024 \n",
|
||
"std 622.395923 154.557170 0.033086 \n",
|
||
"min 0.590000 -770.930000 0.010000 \n",
|
||
"25% 80.380000 -25.370000 0.030000 \n",
|
||
"50% 188.160000 -0.140000 0.050000 \n",
|
||
"75% 271.977500 19.080000 0.080000 \n",
|
||
"max 3868.460000 366.140000 0.140000 "
|
||
]
|
||
},
|
||
"execution_count": 210,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"null_values = df_countries.isnull().sum()\n",
|
||
"print(\"Пустые значения по столбцам:\")\n",
|
||
"print(null_values)\n",
|
||
"\n",
|
||
"duplicates = df_countries.duplicated().sum()\n",
|
||
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
|
||
"\n",
|
||
"print(\"\\nСтатистический обзор данных:\")\n",
|
||
"df_countries.describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Видим, что есть пустые значения, но нет дубликатов. Удаляем строки с пустыми значениями:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 211,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"В наборе данных 'Countries' было удалено 150 строк с пустыми значениями.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df_countries = drop_missing_values(df_countries, \"Countries\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Проверка на выбросы:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 212,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Количество выбросов в столбце 'year': 0\n",
|
||
"Количество выбросов в столбце 'index price': 17\n",
|
||
"Количество выбросов в столбце 'log_indexprice': 1\n",
|
||
"Количество выбросов в столбце 'inflationrate': 35\n",
|
||
"Количество выбросов в столбце 'oil prices': 0\n",
|
||
"Количество выбросов в столбце 'exchange_rate': 53\n",
|
||
"Количество выбросов в столбце 'gdppercent': 13\n",
|
||
"Количество выбросов в столбце 'percapitaincome': 0\n",
|
||
"Количество выбросов в столбце 'unemploymentrate': 9\n",
|
||
"Количество выбросов в столбце 'manufacturingoutput': 29\n",
|
||
"Количество выбросов в столбце 'tradebalance': 47\n",
|
||
"Количество выбросов в столбце 'USTreasury': 9\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1500x1000 with 12 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Выбираем столбцы для анализа\n",
|
||
"columns_to_check = ['year', 'index price', 'log_indexprice',\n",
|
||
" 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n",
|
||
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n",
|
||
" 'tradebalance', 'USTreasury']\n",
|
||
"\n",
|
||
"# Подсчитываем выбросы\n",
|
||
"outliers_count = count_outliers(df_countries, columns_to_check)\n",
|
||
"\n",
|
||
"# Выводим количество выбросов для каждого столбца\n",
|
||
"for col, count in outliers_count.items():\n",
|
||
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
|
||
"\n",
|
||
"# Создаем диаграммы размахов\n",
|
||
"plt.figure(figsize=(15, 10))\n",
|
||
"for i, col in enumerate(columns_to_check, 1):\n",
|
||
" plt.subplot(3, 4, i)\n",
|
||
" sns.boxplot(x=df_countries[col])\n",
|
||
" plt.title(f'Box Plot of {col}')\n",
|
||
"plt.tight_layout()\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"В большинстве выбранных столбцов присутствуют выбросы. Удалим строки с выбросами."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 213,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Количество удаленных строк: 136\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Figure size 1500x600 with 0 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1500x1000 with 9 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 0 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Выбираем столбцы для очистки\n",
|
||
"columns_to_clean = ['index price', 'log_indexprice',\n",
|
||
" 'inflationrate', 'exchange_rate', 'gdppercent', 'unemploymentrate', 'manufacturingoutput',\n",
|
||
" 'tradebalance', 'USTreasury']\n",
|
||
"\n",
|
||
"# Удаляем выбросы\n",
|
||
"df_countries_clean = remove_outliers(df_countries, columns_to_clean)\n",
|
||
"\n",
|
||
"# Выводим количество удаленных строк\n",
|
||
"print(f\"Количество удаленных строк: {len(df_countries) - len(df_countries_clean)}\")\n",
|
||
"\n",
|
||
"# Создаем диаграммы размаха для очищенных данных\n",
|
||
"plt.figure(figsize=(15, 6))\n",
|
||
"\n",
|
||
"# Создаем диаграммы размахов\n",
|
||
"plt.figure(figsize=(15, 10))\n",
|
||
"for i, col in enumerate(columns_to_clean, 1):\n",
|
||
" plt.subplot(3, 3, i)\n",
|
||
" sns.boxplot(x=df_countries_clean[col])\n",
|
||
" plt.title(f'Box Plot of {col}')\n",
|
||
"plt.tight_layout()\n",
|
||
"plt.show()\n",
|
||
"\n",
|
||
"plt.tight_layout()\n",
|
||
"plt.show()\n",
|
||
"\n",
|
||
"df_countries = df_countries_clean"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 214,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Размер обучающей выборки: 49\n",
|
||
"Размер контрольной выборки: 17\n",
|
||
"Размер тестовой выборки: 17\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"train_df, test_df = train_test_split(df_countries, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
||
"\n",
|
||
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
||
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
||
"print(\"Размер тестовой выборки:\", len(test_df))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 215,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Распределение \"Уровень инфляции\" в обучающей выборке:\n",
|
||
"inflationrate\n",
|
||
"0.02 25\n",
|
||
"0.03 11\n",
|
||
"0.01 9\n",
|
||
"0.04 4\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Уровень инфляции\" в контрольной выборке:\n",
|
||
"inflationrate\n",
|
||
"0.03 6\n",
|
||
"0.01 6\n",
|
||
"0.02 5\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Уровень инфляции\" в тестовой выборке:\n",
|
||
"inflationrate\n",
|
||
"0.02 6\n",
|
||
"0.03 6\n",
|
||
"0.01 4\n",
|
||
"0.04 1\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"def check_balance(df, name):\n",
|
||
" counts = df['inflationrate'].value_counts()\n",
|
||
" print(f\"Распределение \\\"Уровень инфляции\\\" в {name}:\")\n",
|
||
" print(counts)\n",
|
||
" print()\n",
|
||
"\n",
|
||
"check_balance(train_df, \"обучающей выборке\")\n",
|
||
"check_balance(val_df, \"контрольной выборке\")\n",
|
||
"check_balance(test_df, \"тестовой выборке\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Оверсэмплинг и андерсэмплинг"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 216,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Оверсэмплинг:\n",
|
||
"Распределение \"Уровень инфляции\" в обучающей выборке:\n",
|
||
"inflationrate\n",
|
||
"0.03 26\n",
|
||
"0.02 25\n",
|
||
"0.01 9\n",
|
||
"0.04 8\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Уровень инфляции\" в контрольной выборке:\n",
|
||
"inflationrate\n",
|
||
"0.03 11\n",
|
||
"0.01 6\n",
|
||
"0.02 5\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Уровень инфляции\" в тестовой выборке:\n",
|
||
"inflationrate\n",
|
||
"0.03 8\n",
|
||
"0.02 6\n",
|
||
"0.01 4\n",
|
||
"0.04 2\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Андерсэмплинг:\n",
|
||
"Распределение \"Уровень инфляции\" в обучающей выборке:\n",
|
||
"inflationrate\n",
|
||
"0.03 11\n",
|
||
"0.02 10\n",
|
||
"0.01 5\n",
|
||
"0.04 4\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Уровень инфляции\" в контрольной выборке:\n",
|
||
"inflationrate\n",
|
||
"0.03 6\n",
|
||
"0.01 4\n",
|
||
"0.02 2\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"Распределение \"Уровень инфляции\" в тестовой выборке:\n",
|
||
"inflationrate\n",
|
||
"0.03 6\n",
|
||
"0.02 5\n",
|
||
"0.01 2\n",
|
||
"0.04 1\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"def binning(target, bins):\n",
|
||
" return pd.qcut(target, q=bins, labels=False)\n",
|
||
"\n",
|
||
"train_df['inflationrate_binned'] = binning(train_df['inflationrate'], bins=2)\n",
|
||
"val_df['inflationrate_binned'] = binning(val_df['inflationrate'], bins=2)\n",
|
||
"test_df['inflationrate_binned'] = binning(test_df['inflationrate'], bins=2)\n",
|
||
"\n",
|
||
"train_df_oversampled = oversample(train_df, 'inflationrate_binned')\n",
|
||
"val_df_oversampled = oversample(val_df, 'inflationrate_binned')\n",
|
||
"test_df_oversampled = oversample(test_df, 'inflationrate_binned')\n",
|
||
"\n",
|
||
"train_df_undersampled = undersample(train_df, 'inflationrate_binned')\n",
|
||
"val_df_undersampled = undersample(val_df, 'inflationrate_binned')\n",
|
||
"test_df_undersampled = undersample(test_df, 'inflationrate_binned')\n",
|
||
"\n",
|
||
"print(\"Оверсэмплинг:\")\n",
|
||
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
|
||
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
|
||
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
|
||
"\n",
|
||
"print(\"Андерсэмплинг:\")\n",
|
||
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
|
||
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
|
||
"check_balance(test_df_undersampled, \"тестовой выборке\")"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "aimvenv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|