2494 lines
475 KiB
Plaintext
2494 lines
475 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Датасет №1 (Использование мобильных устройств и поведение пользователей)\n",
|
|||
|
"Ссылка: https://www.kaggle.com/datasets/valakhorasani/mobile-device-usage-and-user-behavior-dataset\n",
|
|||
|
"\n",
|
|||
|
"Проблемная область: прогнозирование пользовательского поведения и сегментация пользователей для улучшения работы приложений, оптимизации потребления энергии, анализа пользовательского опыта или рекламы.\n",
|
|||
|
"\n",
|
|||
|
"Объекты наблюдения: пользователи мобильных устройств, чьи данные об использовании собираются и анализируются."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 195,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['User ID', 'Device Model', 'Operating System',\n",
|
|||
|
" 'App Usage Time (min/day)', 'Screen On Time (hours/day)',\n",
|
|||
|
" 'Battery Drain (mAh/day)', 'Number of Apps Installed',\n",
|
|||
|
" 'Data Usage (MB/day)', 'Age', 'Gender', 'User Behavior Class'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 700 entries, 0 to 699\n",
|
|||
|
"Data columns (total 11 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 User ID 700 non-null int64 \n",
|
|||
|
" 1 Device Model 700 non-null object \n",
|
|||
|
" 2 Operating System 700 non-null object \n",
|
|||
|
" 3 App Usage Time (min/day) 700 non-null int64 \n",
|
|||
|
" 4 Screen On Time (hours/day) 700 non-null float64\n",
|
|||
|
" 5 Battery Drain (mAh/day) 700 non-null int64 \n",
|
|||
|
" 6 Number of Apps Installed 700 non-null int64 \n",
|
|||
|
" 7 Data Usage (MB/day) 700 non-null int64 \n",
|
|||
|
" 8 Age 700 non-null int64 \n",
|
|||
|
" 9 Gender 700 non-null object \n",
|
|||
|
" 10 User Behavior Class 700 non-null int64 \n",
|
|||
|
"dtypes: float64(1), int64(7), object(3)\n",
|
|||
|
"memory usage: 60.3+ KB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>User ID</th>\n",
|
|||
|
" <th>Device Model</th>\n",
|
|||
|
" <th>Operating System</th>\n",
|
|||
|
" <th>App Usage Time (min/day)</th>\n",
|
|||
|
" <th>Screen On Time (hours/day)</th>\n",
|
|||
|
" <th>Battery Drain (mAh/day)</th>\n",
|
|||
|
" <th>Number of Apps Installed</th>\n",
|
|||
|
" <th>Data Usage (MB/day)</th>\n",
|
|||
|
" <th>Age</th>\n",
|
|||
|
" <th>Gender</th>\n",
|
|||
|
" <th>User Behavior Class</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>Google Pixel 5</td>\n",
|
|||
|
" <td>Android</td>\n",
|
|||
|
" <td>393</td>\n",
|
|||
|
" <td>6.4</td>\n",
|
|||
|
" <td>1872</td>\n",
|
|||
|
" <td>67</td>\n",
|
|||
|
" <td>1122</td>\n",
|
|||
|
" <td>40</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>OnePlus 9</td>\n",
|
|||
|
" <td>Android</td>\n",
|
|||
|
" <td>268</td>\n",
|
|||
|
" <td>4.7</td>\n",
|
|||
|
" <td>1331</td>\n",
|
|||
|
" <td>42</td>\n",
|
|||
|
" <td>944</td>\n",
|
|||
|
" <td>47</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Xiaomi Mi 11</td>\n",
|
|||
|
" <td>Android</td>\n",
|
|||
|
" <td>154</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>761</td>\n",
|
|||
|
" <td>32</td>\n",
|
|||
|
" <td>322</td>\n",
|
|||
|
" <td>42</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>Google Pixel 5</td>\n",
|
|||
|
" <td>Android</td>\n",
|
|||
|
" <td>239</td>\n",
|
|||
|
" <td>4.8</td>\n",
|
|||
|
" <td>1676</td>\n",
|
|||
|
" <td>56</td>\n",
|
|||
|
" <td>871</td>\n",
|
|||
|
" <td>20</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>iPhone 12</td>\n",
|
|||
|
" <td>iOS</td>\n",
|
|||
|
" <td>187</td>\n",
|
|||
|
" <td>4.3</td>\n",
|
|||
|
" <td>1367</td>\n",
|
|||
|
" <td>58</td>\n",
|
|||
|
" <td>988</td>\n",
|
|||
|
" <td>31</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" User ID Device Model Operating System App Usage Time (min/day) \\\n",
|
|||
|
"0 1 Google Pixel 5 Android 393 \n",
|
|||
|
"1 2 OnePlus 9 Android 268 \n",
|
|||
|
"2 3 Xiaomi Mi 11 Android 154 \n",
|
|||
|
"3 4 Google Pixel 5 Android 239 \n",
|
|||
|
"4 5 iPhone 12 iOS 187 \n",
|
|||
|
"\n",
|
|||
|
" Screen On Time (hours/day) Battery Drain (mAh/day) \\\n",
|
|||
|
"0 6.4 1872 \n",
|
|||
|
"1 4.7 1331 \n",
|
|||
|
"2 4.0 761 \n",
|
|||
|
"3 4.8 1676 \n",
|
|||
|
"4 4.3 1367 \n",
|
|||
|
"\n",
|
|||
|
" Number of Apps Installed Data Usage (MB/day) Age Gender \\\n",
|
|||
|
"0 67 1122 40 Male \n",
|
|||
|
"1 42 944 47 Female \n",
|
|||
|
"2 32 322 42 Male \n",
|
|||
|
"3 56 871 20 Male \n",
|
|||
|
"4 58 988 31 Female \n",
|
|||
|
"\n",
|
|||
|
" User Behavior Class \n",
|
|||
|
"0 4 \n",
|
|||
|
"1 3 \n",
|
|||
|
"2 2 \n",
|
|||
|
"3 3 \n",
|
|||
|
"4 3 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 195,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"df_mobiles = pd.read_csv(\".//static//csv//user_behavior_dataset.csv\")\n",
|
|||
|
"print(df_mobiles.columns)\n",
|
|||
|
"df_mobiles.info()\n",
|
|||
|
"df_mobiles.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Атрибуты объектов:\n",
|
|||
|
"1. User ID — уникальный идентификатор пользователя.\n",
|
|||
|
"2. Device Model — модель устройства.\n",
|
|||
|
"3. Operating System — операционная система устройства.\n",
|
|||
|
"4. App Usage Time (min/day) — время использования приложений в минутах в день.\n",
|
|||
|
"5. Data Usage (MB/day) — объем данных в мегабайтах в день.\n",
|
|||
|
"6. Battery Drain (mAh/day) — потребление батареи в мАч в день.\n",
|
|||
|
"7. Number of Apps Installed — количество установленных приложений.\n",
|
|||
|
"8. Screen On Time (hours/day) — время включенного экрана в часах в день.\n",
|
|||
|
"9. Age — возраст пользователя.\n",
|
|||
|
"10. Gender — пол пользователя.\n",
|
|||
|
"11. User Behavior Class — класс поведения пользователя (категория для классификации).\n",
|
|||
|
"\n",
|
|||
|
"Связи между объектами:\n",
|
|||
|
"Атрибуты, такие как модель устройства, ОС и время использования приложений, могут быть связаны с классом поведения, представляя зависимости между действиями пользователя и его характеристиками.\n",
|
|||
|
"\n",
|
|||
|
"Примеры бизнес-целей и эффекты для бизнеса:\n",
|
|||
|
"1. Оптимизация энергопотребления устройств:\n",
|
|||
|
" - Бизнес-цель: Оптимизировать работу приложений для снижения расхода батареи, что увеличит время работы устройства и улучшит пользовательский опыт.\n",
|
|||
|
" - Эффект: Повышение удовлетворенности клиентов и снижение вероятности перехода на конкурентные приложения.\n",
|
|||
|
"\n",
|
|||
|
"2. Сегментация пользователей для рекламы:\n",
|
|||
|
" - Бизнес-цель: Создание таргетированной рекламы на основе поведения пользователей (классы поведения).\n",
|
|||
|
" - Эффект: Увеличение конверсий и доходов от рекламных кампаний за счет более точной сегментации.\n",
|
|||
|
"\n",
|
|||
|
"Примеры целей технического проекта:\n",
|
|||
|
"1. Цель: Построение модели для прогнозирования расхода батареи.\n",
|
|||
|
" - Вход: Модель устройства, ОС, время использования приложений, количество приложений, возраст.\n",
|
|||
|
" - Целевой признак: Battery Drain (mAh/day).\n",
|
|||
|
"\n",
|
|||
|
"2. Цель: Сегментация пользователей для рекламных кампаний.\n",
|
|||
|
" - Вход: Время использования приложений, возраст, пол, объем данных.\n",
|
|||
|
" - Целевой признак: User Behavior Class."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверка на пустые значения и дубликаты"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 196,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Пустые значения по столбцам:\n",
|
|||
|
"User ID 0\n",
|
|||
|
"Device Model 0\n",
|
|||
|
"Operating System 0\n",
|
|||
|
"App Usage Time (min/day) 0\n",
|
|||
|
"Screen On Time (hours/day) 0\n",
|
|||
|
"Battery Drain (mAh/day) 0\n",
|
|||
|
"Number of Apps Installed 0\n",
|
|||
|
"Data Usage (MB/day) 0\n",
|
|||
|
"Age 0\n",
|
|||
|
"Gender 0\n",
|
|||
|
"User Behavior Class 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Количество дубликатов: 0\n",
|
|||
|
"\n",
|
|||
|
"Статистический обзор данных:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>User ID</th>\n",
|
|||
|
" <th>App Usage Time (min/day)</th>\n",
|
|||
|
" <th>Screen On Time (hours/day)</th>\n",
|
|||
|
" <th>Battery Drain (mAh/day)</th>\n",
|
|||
|
" <th>Number of Apps Installed</th>\n",
|
|||
|
" <th>Data Usage (MB/day)</th>\n",
|
|||
|
" <th>Age</th>\n",
|
|||
|
" <th>User Behavior Class</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>count</th>\n",
|
|||
|
" <td>700.00000</td>\n",
|
|||
|
" <td>700.000000</td>\n",
|
|||
|
" <td>700.000000</td>\n",
|
|||
|
" <td>700.000000</td>\n",
|
|||
|
" <td>700.000000</td>\n",
|
|||
|
" <td>700.000000</td>\n",
|
|||
|
" <td>700.000000</td>\n",
|
|||
|
" <td>700.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>mean</th>\n",
|
|||
|
" <td>350.50000</td>\n",
|
|||
|
" <td>271.128571</td>\n",
|
|||
|
" <td>5.272714</td>\n",
|
|||
|
" <td>1525.158571</td>\n",
|
|||
|
" <td>50.681429</td>\n",
|
|||
|
" <td>929.742857</td>\n",
|
|||
|
" <td>38.482857</td>\n",
|
|||
|
" <td>2.990000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>std</th>\n",
|
|||
|
" <td>202.21688</td>\n",
|
|||
|
" <td>177.199484</td>\n",
|
|||
|
" <td>3.068584</td>\n",
|
|||
|
" <td>819.136414</td>\n",
|
|||
|
" <td>26.943324</td>\n",
|
|||
|
" <td>640.451729</td>\n",
|
|||
|
" <td>12.012916</td>\n",
|
|||
|
" <td>1.401476</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>min</th>\n",
|
|||
|
" <td>1.00000</td>\n",
|
|||
|
" <td>30.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>302.000000</td>\n",
|
|||
|
" <td>10.000000</td>\n",
|
|||
|
" <td>102.000000</td>\n",
|
|||
|
" <td>18.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>25%</th>\n",
|
|||
|
" <td>175.75000</td>\n",
|
|||
|
" <td>113.250000</td>\n",
|
|||
|
" <td>2.500000</td>\n",
|
|||
|
" <td>722.250000</td>\n",
|
|||
|
" <td>26.000000</td>\n",
|
|||
|
" <td>373.000000</td>\n",
|
|||
|
" <td>28.000000</td>\n",
|
|||
|
" <td>2.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>50%</th>\n",
|
|||
|
" <td>350.50000</td>\n",
|
|||
|
" <td>227.500000</td>\n",
|
|||
|
" <td>4.900000</td>\n",
|
|||
|
" <td>1502.500000</td>\n",
|
|||
|
" <td>49.000000</td>\n",
|
|||
|
" <td>823.500000</td>\n",
|
|||
|
" <td>38.000000</td>\n",
|
|||
|
" <td>3.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>75%</th>\n",
|
|||
|
" <td>525.25000</td>\n",
|
|||
|
" <td>434.250000</td>\n",
|
|||
|
" <td>7.400000</td>\n",
|
|||
|
" <td>2229.500000</td>\n",
|
|||
|
" <td>74.000000</td>\n",
|
|||
|
" <td>1341.000000</td>\n",
|
|||
|
" <td>49.000000</td>\n",
|
|||
|
" <td>4.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>max</th>\n",
|
|||
|
" <td>700.00000</td>\n",
|
|||
|
" <td>598.000000</td>\n",
|
|||
|
" <td>12.000000</td>\n",
|
|||
|
" <td>2993.000000</td>\n",
|
|||
|
" <td>99.000000</td>\n",
|
|||
|
" <td>2497.000000</td>\n",
|
|||
|
" <td>59.000000</td>\n",
|
|||
|
" <td>5.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" User ID App Usage Time (min/day) Screen On Time (hours/day) \\\n",
|
|||
|
"count 700.00000 700.000000 700.000000 \n",
|
|||
|
"mean 350.50000 271.128571 5.272714 \n",
|
|||
|
"std 202.21688 177.199484 3.068584 \n",
|
|||
|
"min 1.00000 30.000000 1.000000 \n",
|
|||
|
"25% 175.75000 113.250000 2.500000 \n",
|
|||
|
"50% 350.50000 227.500000 4.900000 \n",
|
|||
|
"75% 525.25000 434.250000 7.400000 \n",
|
|||
|
"max 700.00000 598.000000 12.000000 \n",
|
|||
|
"\n",
|
|||
|
" Battery Drain (mAh/day) Number of Apps Installed Data Usage (MB/day) \\\n",
|
|||
|
"count 700.000000 700.000000 700.000000 \n",
|
|||
|
"mean 1525.158571 50.681429 929.742857 \n",
|
|||
|
"std 819.136414 26.943324 640.451729 \n",
|
|||
|
"min 302.000000 10.000000 102.000000 \n",
|
|||
|
"25% 722.250000 26.000000 373.000000 \n",
|
|||
|
"50% 1502.500000 49.000000 823.500000 \n",
|
|||
|
"75% 2229.500000 74.000000 1341.000000 \n",
|
|||
|
"max 2993.000000 99.000000 2497.000000 \n",
|
|||
|
"\n",
|
|||
|
" Age User Behavior Class \n",
|
|||
|
"count 700.000000 700.000000 \n",
|
|||
|
"mean 38.482857 2.990000 \n",
|
|||
|
"std 12.012916 1.401476 \n",
|
|||
|
"min 18.000000 1.000000 \n",
|
|||
|
"25% 28.000000 2.000000 \n",
|
|||
|
"50% 38.000000 3.000000 \n",
|
|||
|
"75% 49.000000 4.000000 \n",
|
|||
|
"max 59.000000 5.000000 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 196,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"null_values = df_mobiles.isnull().sum()\n",
|
|||
|
"print(\"Пустые значения по столбцам:\")\n",
|
|||
|
"print(null_values)\n",
|
|||
|
"\n",
|
|||
|
"duplicates = df_mobiles.duplicated().sum()\n",
|
|||
|
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"\\nСтатистический обзор данных:\")\n",
|
|||
|
"df_mobiles.describe()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Пустых значений и дубликатов нет, проверим на выбросы:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 197,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Количество выбросов в столбце 'App Usage Time (min/day)': 0\n",
|
|||
|
"Количество выбросов в столбце 'Screen On Time (hours/day)': 0\n",
|
|||
|
"Количество выбросов в столбце 'Battery Drain (mAh/day)': 0\n",
|
|||
|
"Количество выбросов в столбце 'Number of Apps Installed': 0\n",
|
|||
|
"Количество выбросов в столбце 'Data Usage (MB/day)': 0\n",
|
|||
|
"Количество выбросов в столбце 'User Behavior Class': 0\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABc8AAAPeCAYAAADatOK+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADskUlEQVR4nOzdd3QU5dvG8SuFdBJCAqEndKQJ0lF6FxFUpIgQsDewU2yAiIAo2FERQTGKIl2KFEEUUYrSfiA1ICpF6b0k9/sHZ/dlZ5OQhIRQvp9zOLozszPPPLuZe+baKT5mZgIAAAAAAAAAAG6+Od0AAAAAAAAAAAAuN4TnAAAAAAAAAAA4EJ4DAAAAAAAAAOBAeA4AAAAAAAAAgAPhOQAAAAAAAAAADoTnAAAAAAAAAAA4EJ4DAAAAAAAAAOBAeA4AAAAAAAAAgAPhOQAAAAAAAAAADoTnyLDt27fLx8dH48aNy+mmeJgzZ46qVKmioKAg+fj46ODBgzndJGRSXFycunfvntPN8PLII4+oWbNmWTrPS7WuixYtko+PjxYtWpRty+jUqZM6dOiQbfMHcGHUaGS1AQMGyMfHJ6eb4WXZsmUKCAjQjh073MPi4uJ0yy235GCrLo3s/kzOnDmjokWL6v3338+2ZQC4eNR8XKyGDRuqYcOGWT7f1157TeXKlVNycnKWzzs9XHXyv//+y9L5jhs3Tj4+Ptq+fXuWzvd8tWvXVu/evbNt/sgcwvMc5PrDO/9f/vz51ahRI82ePfuSt8cVrrn+5cqVSyVKlFC3bt20bdu2LFnGzz//rAEDBmR5Ad23b586dOig4OBgvffeexo/frxCQ0Mv+L73339fPj4+qlWrVpa2JzPSOuBbsWLFZbljlF7O71Za/y5XiYmJ+vjjj/Xcc8/ldFO8zJgxQ76+vtq9e3eOtqNPnz6aNGmSVq9enaPtALICNTrrZLRGr127Vu3bt1dsbKyCgoJUuHBhNWvWTO+8806WtutydOzYMQ0aNEiVK1dWSEiIIiIiVK9ePX322WcysyxbTlxcXLpq8uW83/H888+rc+fOio2NzemmZKmnn35a5cuXz9E25MqVS0899ZQGDx6skydP5mhbgEuBmp91MlLzs6vfX331VU2dOtVreHatc1bo3r27Rz+EhYWpRIkSat++vSZNmpRjIXRmHD58WMOGDVOfPn3k65s9kWPNmjXl4+OjUaNGZdk833nnHUVEROjMmTNZNs/M6NOnj957770cP7aHJ/+cbgCkl19+WcWLF5eZac+ePRo3bpxuvvlmzZgxI0fOnunVq5dq1KihM2fO6LffftNHH32kmTNnau3atSpUqNBFzfvnn3/WwIED1b17d+XJkydrGixp+fLlOnLkiAYNGqSmTZum+30JCQmKi4vTsmXLtGXLFpUqVSrL2oT/d91112n8+PEew/r166ewsDA9//zzXtNv3Lgx2wptZr311lsqXry4GjVqlKXzzYp1nTlzpqpVq6YCBQpkUasyp2rVqqpevbreeOMNffbZZznaFiCrUKMvXkZq9M8//6xGjRqpWLFiuv/++1WgQAHt3LlTv/zyi9566y317Nkzy9p1udmzZ4+aNGmiDRs2qFOnTnrsscd08uRJTZo0SfHx8Zo1a5YSEhLk5+d30ct68803dfToUffrWbNm6csvv9TIkSMVHR3tHl63bl3dfffd6tu370UvMyutWrVK8+fP188//5zTTclyM2fOVJs2bXK6GerRo4f69u2rL774Qvfcc09ONwe4JKj5Fy8zx+VZ3e+vvvqq2rdvr3bt2nkMz651ziqBgYH6+OOPJUknTpzQjh07NGPGDLVv314NGzbUtGnTFB4enqXLnDt3bpbOT5I++eQTnT17Vp07d87yeUvS5s2btXz5csXFxSkhIUEPP/xwlsx35syZat68uXLlypUl88ustm3bKjw8XO+//75efvnlHG0L/h/h+WWgVatWql69uvv1vf
feq5iYGH355Zc5UqTr1aun9u3bSzq341ymTBn16tVLn376qfr163fJ25Mee/fulaQMFcHExET9/PPPmjx5sh588EElJCSof//+2dTCa1tMTIzuvvtuj2FDhw5VdHS013Dp3I7D5eTMmTNKSEjQQw89lOXzzop1nTVr1mVzYNuhQwf1799f77//vsLCwnK6OcBFo0ZfvIzU6MGDBysiIkLLly/3mt41n4thZjp58qSCg4Mvel5ZLT4+Xhs2bNCUKVN06623uof36tVLzz77rF5//XVVrVpVffr0uehlOQOF3bt368svv1S7du0UFxfnNb2//+V1yDB27FgVK1ZMtWvXzummpOn48eMKCQlJ9/Tbtm3Txo0b9cEHH2Rjq9InT548at68ucaNG3fZ7GMA2Y2af/Eyc1x+ufV7RmV0W58af39/r2PjV155RUOHDlW/fv10//3366uvvkr1/ZnZxwkICMh0e1MzduxY3XrrrQoKCsryeUvS559/rvz58+uNN95Q+/bttX379hT3XTLi+PHj+uGHH7L0TPbM8vX1Vfv27fXZZ59p4MCBl/XV+deSy+vUTkg6V2iCg4O9DlSOHTump59+WkWLFlVgYKDKli2r119/3X0Z74kTJ1SuXDmVK1dOJ06ccL9v//79KliwoOrWraukpKQMt6dx48aSzoXNafn+++9Vr149hYaGKk+ePGrbtq02bNjgHj9gwAA9++yzkqTixYu7L0m60P2iJk6cqGrVqik4ONgdtv7999/u8Q0bNlR8fLwkqUaNGvLx8UnXPaQTEhIUGRmp1q1bq3379kpISPCaxnUfuddff10jR45UbGysgoOD1aBBA61bt85j2u7duyssLEzbtm1TixYtFBoaqkKFCunll1/O0kutXXbv3q0ePXqoSJEiCgwMVMGCBdW2bVuP/pw2bZpat26tQoUKKTAwUCVLltSgQYNS/B689957KlGihIKDg1WzZk39+OOPKd4D7dSpU+rfv79KlSqlwMBAFS1aVL1799apU6eybN2c9wF3XdL3008/qVevXsqXL5/y5MmjBx98UKdPn9bBgwfVrVs3RUZGKjIyUr179/bq8+TkZL355puqUKGCgoKCFBMTowcffFAHDhy4YHt++ukn/ffff15nT7guqfz66681cOBAFS5cWLlz51b79u116NAhnTp1Sk888YTy58+vsLAw9ejRw6ufUlvXJUuW6KmnnlK+fPkUGhqq2267Tf/++69X29auXaudO3eqdevW7mF//fWX2rVrp9DQUOXPn19PPvlkip/Pjz/+qDvvvFPFihVzf5ZPPvmkx/Zj7Nix8vHx0e+//+71/ldffVV+fn4ef4/NmjXTsWPHNG/evAv2K3AlokZ7yuoavXXrVlWoUCHFg+78+fN7Dfv8889Vs2ZNhYSEKDIyUvXr1/c4i8p1S7TvvvtO1atXV3BwsD788ENJ0sGDB/XEE0+4P7NSpUpp2LBhXpdGp7d+uJb1008/qWbNmgoKClKJEiXSdSXOL7/8ou+++07du3f3CM5dhgwZotKlS2vYsGHu78/5+ygfffSRSpYsqcDAQNWoUUPLly+/4DLTK6X7a/v4+Oixxx7TxIkTVb58eQUHB6tOnTpau3atJOnDDz9UqVKlFBQUpIYNG6b4Pfr111/VsmVLRUREKCQkRA0aNNCSJUvS1aapU6eqcePGqR5Qpucz2LZtm+68807lzZtXISEhql27tmbOnOkxTWr3Nk3pOSINGzZUxYoVtXLlStWvX18hISHuW72tWLFCLVq0UHR0tIKDg1W8ePEUA+mZM2cqIiJCN910k8e61KhRQ0FBQSpZsqT7++s0duxYNW7cWPnz51dgYKDKly/vFQLEx8crOjo6xUvSmzdvrrJly3oMa9asmX766Sft378/xWUCVztqvqfsOi53Sq3fX3/9ddWtW1dRUVEKDg5WtWrV9M0333hM4+Pjo2PHjunTTz91r1f37t3Ttc6ff/65e/3y5s
2rTp06aefOnR7zT21bn9Hta0b07dtXzZs318SJE7Vp0yb38LT2cdJTE1zrc/7x/vnHt4MHD1aRIkUUFBSkJk2aaMu
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x1000 with 6 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Выбираем столбцы для анализа\n",
|
|||
|
"columns_to_check = ['App Usage Time (min/day)', 'Screen On Time (hours/day)', 'Battery Drain (mAh/day)', 'Number of Apps Installed', 'Data Usage (MB/day)', 'User Behavior Class']\n",
|
|||
|
"\n",
|
|||
|
"# Функция для подсчета выбросов\n",
|
|||
|
"def count_outliers(data, columns):\n",
|
|||
|
" outliers_count = {}\n",
|
|||
|
" for col in columns:\n",
|
|||
|
" Q1 = data[col].quantile(0.25)\n",
|
|||
|
" Q3 = data[col].quantile(0.75)\n",
|
|||
|
" IQR = Q3 - Q1\n",
|
|||
|
" lower_bound = Q1 - 1.5 * IQR\n",
|
|||
|
" upper_bound = Q3 + 1.5 * IQR\n",
|
|||
|
" \n",
|
|||
|
" # Считаем количество выбросов\n",
|
|||
|
" outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]\n",
|
|||
|
" outliers_count[col] = len(outliers)\n",
|
|||
|
" \n",
|
|||
|
" return outliers_count\n",
|
|||
|
"\n",
|
|||
|
"# Подсчитываем выбросы\n",
|
|||
|
"outliers_count = count_outliers(df_mobiles, columns_to_check)\n",
|
|||
|
"\n",
|
|||
|
"# Выводим количество выбросов для каждого столбца\n",
|
|||
|
"for col, count in outliers_count.items():\n",
|
|||
|
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
|
|||
|
"\n",
|
|||
|
"# Создаем диаграммы размахов\n",
|
|||
|
"plt.figure(figsize=(15, 10))\n",
|
|||
|
"for i, col in enumerate(columns_to_check, 1):\n",
|
|||
|
" plt.subplot(2, 3, i)\n",
|
|||
|
" sns.boxplot(x=df_mobiles[col])\n",
|
|||
|
" plt.title(f'Box Plot of {col}')\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выбросов нет: ни в одном из проверенных столбцов значения не выходят за границы [Q1 − 1.5·IQR, Q3 + 1.5·IQR]."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 198,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 420\n",
|
|||
|
"Размер контрольной выборки: 140\n",
|
|||
|
"Размер тестовой выборки: 140\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"train_df, test_df = train_test_split(df_mobiles, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 199,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение \"Класс поведения пользователя\" в обучающей выборке:\n",
|
|||
|
"User Behavior Class\n",
|
|||
|
"2 88\n",
|
|||
|
"5 88\n",
|
|||
|
"4 86\n",
|
|||
|
"3 84\n",
|
|||
|
"1 74\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Класс поведения пользователя\" в контрольной выборке:\n",
|
|||
|
"User Behavior Class\n",
|
|||
|
"1 35\n",
|
|||
|
"2 29\n",
|
|||
|
"4 26\n",
|
|||
|
"5 25\n",
|
|||
|
"3 25\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Класс поведения пользователя\" в тестовой выборке:\n",
|
|||
|
"User Behavior Class\n",
|
|||
|
"3 34\n",
|
|||
|
"2 29\n",
|
|||
|
"4 27\n",
|
|||
|
"1 27\n",
|
|||
|
"5 23\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['User Behavior Class'].value_counts()\n",
|
|||
|
" print(f\"Распределение \\\"Класс поведения пользователя\\\" в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
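"Оверсэмплинг и андерсэмплинг: выравниваем число объектов в классах целевого признака User Behavior Class. Примечание: обычно балансируют только обучающую выборку, а контрольную и тестовую оставляют с исходным распределением классов."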
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 200,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оверсэмплинг:\n",
|
|||
|
"Распределение \"Класс поведения пользователя\" в обучающей выборке:\n",
|
|||
|
"User Behavior Class\n",
|
|||
|
"1 88\n",
|
|||
|
"2 88\n",
|
|||
|
"5 88\n",
|
|||
|
"4 88\n",
|
|||
|
"3 88\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Класс поведения пользователя\" в контрольной выборке:\n",
|
|||
|
"User Behavior Class\n",
|
|||
|
"5 35\n",
|
|||
|
"3 35\n",
|
|||
|
"1 35\n",
|
|||
|
"2 35\n",
|
|||
|
"4 35\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Класс поведения пользователя\" в тестовой выборке:\n",
|
|||
|
"User Behavior Class\n",
|
|||
|
"4 34\n",
|
|||
|
"1 34\n",
|
|||
|
"2 34\n",
|
|||
|
"3 34\n",
|
|||
|
"5 34\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Андерсэмплинг:\n",
|
|||
|
"Распределение \"Класс поведения пользователя\" в обучающей выборке:\n",
|
|||
|
"User Behavior Class\n",
|
|||
|
"1 74\n",
|
|||
|
"2 74\n",
|
|||
|
"3 74\n",
|
|||
|
"4 74\n",
|
|||
|
"5 74\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Класс поведения пользователя\" в контрольной выборке:\n",
|
|||
|
"User Behavior Class\n",
|
|||
|
"1 25\n",
|
|||
|
"2 25\n",
|
|||
|
"3 25\n",
|
|||
|
"4 25\n",
|
|||
|
"5 25\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Класс поведения пользователя\" в тестовой выборке:\n",
|
|||
|
"User Behavior Class\n",
|
|||
|
"1 23\n",
|
|||
|
"2 23\n",
|
|||
|
"3 23\n",
|
|||
|
"4 23\n",
|
|||
|
"5 23\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df, target_column):\n",
|
|||
|
" X = df.drop(target_column, axis=1)\n",
|
|||
|
" y = df[target_column]\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" x_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1) \n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"def undersample(df, target_column):\n",
|
|||
|
" X = df.drop(target_column, axis=1)\n",
|
|||
|
" y = df[target_column]\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" x_resampled, y_resampled = undersampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df, 'User Behavior Class')\n",
|
|||
|
"val_df_oversampled = oversample(val_df, 'User Behavior Class')\n",
|
|||
|
"test_df_oversampled = oversample(test_df, 'User Behavior Class')\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df, 'User Behavior Class')\n",
|
|||
|
"val_df_undersampled = undersample(val_df, 'User Behavior Class')\n",
|
|||
|
"test_df_undersampled = undersample(test_df, 'User Behavior Class')\n",
|
|||
|
"\n",
|
|||
|
"print(\"Оверсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"Андерсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Датасет №2 (Характеристики автомобиля: данные об экономии топлива)\n",
|
|||
|
"Ссылка: https://www.kaggle.com/datasets/arslaan5/explore-car-performance-fuel-efficiency-data\n",
|
|||
|
"\n",
|
|||
|
"Проблемная область: производительность и экономичность транспортных средств.\n",
|
|||
|
"\n",
|
|||
|
"Объекты наблюдения: автомобили, представленные набором характеристик."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 201,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['city_mpg', 'class', 'combination_mpg', 'cylinders', 'displacement',\n",
|
|||
|
" 'drive', 'fuel_type', 'highway_mpg', 'make', 'model', 'transmission',\n",
|
|||
|
" 'year'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 550 entries, 0 to 549\n",
|
|||
|
"Data columns (total 12 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 city_mpg 550 non-null int64 \n",
|
|||
|
" 1 class 550 non-null object \n",
|
|||
|
" 2 combination_mpg 550 non-null int64 \n",
|
|||
|
" 3 cylinders 548 non-null float64\n",
|
|||
|
" 4 displacement 548 non-null float64\n",
|
|||
|
" 5 drive 550 non-null object \n",
|
|||
|
" 6 fuel_type 550 non-null object \n",
|
|||
|
" 7 highway_mpg 550 non-null int64 \n",
|
|||
|
" 8 make 550 non-null object \n",
|
|||
|
" 9 model 550 non-null object \n",
|
|||
|
" 10 transmission 550 non-null object \n",
|
|||
|
" 11 year 550 non-null int64 \n",
|
|||
|
"dtypes: float64(2), int64(4), object(6)\n",
|
|||
|
"memory usage: 51.7+ KB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>city_mpg</th>\n",
|
|||
|
" <th>class</th>\n",
|
|||
|
" <th>combination_mpg</th>\n",
|
|||
|
" <th>cylinders</th>\n",
|
|||
|
" <th>displacement</th>\n",
|
|||
|
" <th>drive</th>\n",
|
|||
|
" <th>fuel_type</th>\n",
|
|||
|
" <th>highway_mpg</th>\n",
|
|||
|
" <th>make</th>\n",
|
|||
|
" <th>model</th>\n",
|
|||
|
" <th>transmission</th>\n",
|
|||
|
" <th>year</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>25</td>\n",
|
|||
|
" <td>midsize car</td>\n",
|
|||
|
" <td>29</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>2.5</td>\n",
|
|||
|
" <td>fwd</td>\n",
|
|||
|
" <td>gas</td>\n",
|
|||
|
" <td>36</td>\n",
|
|||
|
" <td>mazda</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>m</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>26</td>\n",
|
|||
|
" <td>midsize car</td>\n",
|
|||
|
" <td>30</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>2.5</td>\n",
|
|||
|
" <td>fwd</td>\n",
|
|||
|
" <td>gas</td>\n",
|
|||
|
" <td>37</td>\n",
|
|||
|
" <td>mazda</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>a</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>25</td>\n",
|
|||
|
" <td>small sport utility vehicle</td>\n",
|
|||
|
" <td>27</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>2.5</td>\n",
|
|||
|
" <td>fwd</td>\n",
|
|||
|
" <td>gas</td>\n",
|
|||
|
" <td>31</td>\n",
|
|||
|
" <td>mazda</td>\n",
|
|||
|
" <td>cx-5 2wd</td>\n",
|
|||
|
" <td>a</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>26</td>\n",
|
|||
|
" <td>small sport utility vehicle</td>\n",
|
|||
|
" <td>29</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>fwd</td>\n",
|
|||
|
" <td>gas</td>\n",
|
|||
|
" <td>34</td>\n",
|
|||
|
" <td>mazda</td>\n",
|
|||
|
" <td>cx-5 2wd</td>\n",
|
|||
|
" <td>m</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>26</td>\n",
|
|||
|
" <td>small sport utility vehicle</td>\n",
|
|||
|
" <td>28</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>fwd</td>\n",
|
|||
|
" <td>gas</td>\n",
|
|||
|
" <td>32</td>\n",
|
|||
|
" <td>mazda</td>\n",
|
|||
|
" <td>cx-5 2wd</td>\n",
|
|||
|
" <td>a</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" city_mpg class combination_mpg cylinders \\\n",
|
|||
|
"0 25 midsize car 29 4.0 \n",
|
|||
|
"1 26 midsize car 30 4.0 \n",
|
|||
|
"2 25 small sport utility vehicle 27 4.0 \n",
|
|||
|
"3 26 small sport utility vehicle 29 4.0 \n",
|
|||
|
"4 26 small sport utility vehicle 28 4.0 \n",
|
|||
|
"\n",
|
|||
|
" displacement drive fuel_type highway_mpg make model transmission \\\n",
|
|||
|
"0 2.5 fwd gas 36 mazda 6 m \n",
|
|||
|
"1 2.5 fwd gas 37 mazda 6 a \n",
|
|||
|
"2 2.5 fwd gas 31 mazda cx-5 2wd a \n",
|
|||
|
"3 2.0 fwd gas 34 mazda cx-5 2wd m \n",
|
|||
|
"4 2.0 fwd gas 32 mazda cx-5 2wd a \n",
|
|||
|
"\n",
|
|||
|
" year \n",
|
|||
|
"0 2014 \n",
|
|||
|
"1 2014 \n",
|
|||
|
"2 2014 \n",
|
|||
|
"3 2014 \n",
|
|||
|
"4 2014 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 201,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df_cars = pd.read_csv(\".//static//csv//car_data.csv\")\n",
|
|||
|
"print(df_cars.columns)\n",
|
|||
|
"df_cars.info()\n",
|
|||
|
"df_cars.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Атрибуты объектов:\n",
|
|||
|
"\n",
|
|||
|
"1. city_mpg — расход топлива в городе (миль на галлон; чем выше значение, тем экономичнее автомобиль).\n",
|
|||
|
"2. class — класс автомобиля (например, седан среднего размера, малый внедорожник).\n",
|
|||
|
"3. combination_mpg — комбинированный расход топлива (миль на галлон).\n",
|
|||
|
"4. cylinders — количество цилиндров.\n",
|
|||
|
"5. displacement — объем двигателя (в литрах).\n",
|
|||
|
"6. drive — тип привода (например, передний, полный).\n",
|
|||
|
"7. fuel_type — тип топлива (бензин, дизель и др.).\n",
|
|||
|
"8. highway_mpg — расход топлива на шоссе (миль на галлон).\n",
|
|||
|
"9. make — марка автомобиля.\n",
|
|||
|
"10. model — модель автомобиля.\n",
|
|||
|
"11. transmission — тип трансмиссии (автоматическая, механическая).\n",
|
|||
|
"12. year — год выпуска автомобиля.\n",
|
|||
|
"\n",
|
|||
|
"Связи между объектами:\n",
|
|||
|
"Атрибуты, такие как объем двигателя, тип топлива, количество цилиндров и класс автомобиля, могут быть связаны с комбинированным расходом топлива (combination_mpg). Это позволяет выявлять зависимости между характеристиками автомобиля и его экономичностью.\n",
|
|||
|
"\n",
|
|||
|
"Примеры бизнес-целей и эффекты для бизнеса:\n",
|
|||
|
"\n",
|
|||
|
"1. Оптимизация ассортимента автомобилей:\n",
|
|||
|
" - Бизнес-цель: Анализировать топливную экономичность различных моделей для оптимизации ассортимента, предлагать более популярные и экономичные модели.\n",
|
|||
|
" - Эффект: Снижение затрат на производство низкоэффективных моделей и увеличение продаж популярных, экономичных автомобилей.\n",
|
|||
|
"\n",
|
|||
|
"2. Снижение углеродного следа:\n",
|
|||
|
" - Бизнес-цель: Определение моделей с высоким расходом топлива для улучшения их эффективности и снижения выбросов.\n",
|
|||
|
" - Эффект: Соответствие экологическим стандартам, улучшение репутации компании и соблюдение требований законодательства.\n",
|
|||
|
"\n",
|
|||
|
"Примеры целей технического проекта:\n",
|
|||
|
"\n",
|
|||
|
"1. Цель: Создание модели для прогнозирования топливной эффективности.\n",
|
|||
|
" - Вход: Объем двигателя, тип топлива, количество цилиндров, класс, тип трансмиссии.\n",
|
|||
|
" - Целевой признак: combination_mpg.\n",
|
|||
|
"\n",
|
|||
|
"2. Цель: Модель для предсказания углеродного следа автомобиля.\n",
|
|||
|
" - Вход: Тип топлива, объем двигателя, класс автомобиля, тип привода.\n",
|
|||
|
"   - Целевой признак: combination_mpg (как косвенный показатель выбросов: в наборе данных нет отдельного столбца с выбросами)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверка на пустые значения и дубликаты"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 202,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Пустые значения по столбцам:\n",
|
|||
|
"city_mpg 0\n",
|
|||
|
"class 0\n",
|
|||
|
"combination_mpg 0\n",
|
|||
|
"cylinders 2\n",
|
|||
|
"displacement 2\n",
|
|||
|
"drive 0\n",
|
|||
|
"fuel_type 0\n",
|
|||
|
"highway_mpg 0\n",
|
|||
|
"make 0\n",
|
|||
|
"model 0\n",
|
|||
|
"transmission 0\n",
|
|||
|
"year 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Количество дубликатов: 2\n",
|
|||
|
"\n",
|
|||
|
"Статистический обзор данных:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>city_mpg</th>\n",
|
|||
|
" <th>combination_mpg</th>\n",
|
|||
|
" <th>cylinders</th>\n",
|
|||
|
" <th>displacement</th>\n",
|
|||
|
" <th>highway_mpg</th>\n",
|
|||
|
" <th>year</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>count</th>\n",
|
|||
|
" <td>550.000000</td>\n",
|
|||
|
" <td>550.000000</td>\n",
|
|||
|
" <td>548.000000</td>\n",
|
|||
|
" <td>548.000000</td>\n",
|
|||
|
" <td>550.000000</td>\n",
|
|||
|
" <td>550.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>mean</th>\n",
|
|||
|
" <td>21.460000</td>\n",
|
|||
|
" <td>24.069091</td>\n",
|
|||
|
" <td>5.315693</td>\n",
|
|||
|
" <td>2.931752</td>\n",
|
|||
|
" <td>28.609091</td>\n",
|
|||
|
" <td>2019.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>std</th>\n",
|
|||
|
" <td>8.147392</td>\n",
|
|||
|
" <td>7.478369</td>\n",
|
|||
|
" <td>1.759999</td>\n",
|
|||
|
" <td>1.248419</td>\n",
|
|||
|
" <td>6.832228</td>\n",
|
|||
|
" <td>3.165156</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>min</th>\n",
|
|||
|
" <td>11.000000</td>\n",
|
|||
|
" <td>14.000000</td>\n",
|
|||
|
" <td>3.000000</td>\n",
|
|||
|
" <td>1.200000</td>\n",
|
|||
|
" <td>18.000000</td>\n",
|
|||
|
" <td>2014.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>25%</th>\n",
|
|||
|
" <td>17.000000</td>\n",
|
|||
|
" <td>20.000000</td>\n",
|
|||
|
" <td>4.000000</td>\n",
|
|||
|
" <td>2.000000</td>\n",
|
|||
|
" <td>24.000000</td>\n",
|
|||
|
" <td>2016.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>50%</th>\n",
|
|||
|
" <td>20.000000</td>\n",
|
|||
|
" <td>23.000000</td>\n",
|
|||
|
" <td>4.000000</td>\n",
|
|||
|
" <td>2.500000</td>\n",
|
|||
|
" <td>28.000000</td>\n",
|
|||
|
" <td>2019.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>75%</th>\n",
|
|||
|
" <td>24.000000</td>\n",
|
|||
|
" <td>27.000000</td>\n",
|
|||
|
" <td>6.000000</td>\n",
|
|||
|
" <td>3.500000</td>\n",
|
|||
|
" <td>32.000000</td>\n",
|
|||
|
" <td>2022.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>max</th>\n",
|
|||
|
" <td>126.000000</td>\n",
|
|||
|
" <td>112.000000</td>\n",
|
|||
|
" <td>12.000000</td>\n",
|
|||
|
" <td>6.800000</td>\n",
|
|||
|
" <td>102.000000</td>\n",
|
|||
|
" <td>2024.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" city_mpg combination_mpg cylinders displacement highway_mpg \\\n",
|
|||
|
"count 550.000000 550.000000 548.000000 548.000000 550.000000 \n",
|
|||
|
"mean 21.460000 24.069091 5.315693 2.931752 28.609091 \n",
|
|||
|
"std 8.147392 7.478369 1.759999 1.248419 6.832228 \n",
|
|||
|
"min 11.000000 14.000000 3.000000 1.200000 18.000000 \n",
|
|||
|
"25% 17.000000 20.000000 4.000000 2.000000 24.000000 \n",
|
|||
|
"50% 20.000000 23.000000 4.000000 2.500000 28.000000 \n",
|
|||
|
"75% 24.000000 27.000000 6.000000 3.500000 32.000000 \n",
|
|||
|
"max 126.000000 112.000000 12.000000 6.800000 102.000000 \n",
|
|||
|
"\n",
|
|||
|
" year \n",
|
|||
|
"count 550.000000 \n",
|
|||
|
"mean 2019.000000 \n",
|
|||
|
"std 3.165156 \n",
|
|||
|
"min 2014.000000 \n",
|
|||
|
"25% 2016.000000 \n",
|
|||
|
"50% 2019.000000 \n",
|
|||
|
"75% 2022.000000 \n",
|
|||
|
"max 2024.000000 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 202,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"null_values = df_cars.isnull().sum()\n",
|
|||
|
"print(\"Пустые значения по столбцам:\")\n",
|
|||
|
"print(null_values)\n",
|
|||
|
"\n",
|
|||
|
"duplicates = df_cars.duplicated().sum()\n",
|
|||
|
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"\\nСтатистический обзор данных:\")\n",
|
|||
|
"df_cars.describe()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Видим, что есть пустые значения и дубликаты. Удаляем их:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 203,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"В наборе данных 'Cars' было удалено 2 строк с пустыми значениями.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df_cars = df_cars.drop_duplicates()\n",
|
|||
|
"\n",
|
|||
|
"def drop_missing_values(dataframe, name):\n",
|
|||
|
" before_shape = dataframe.shape \n",
|
|||
|
" cleaned_dataframe = dataframe.dropna() \n",
|
|||
|
" after_shape = cleaned_dataframe.shape \n",
|
|||
|
" print(f\"В наборе данных '{name}' было удалено {before_shape[0] - after_shape[0]} строк с пустыми значениями.\")\n",
|
|||
|
" return cleaned_dataframe\n",
|
|||
|
"\n",
|
|||
|
"df_cars = drop_missing_values(df_cars, \"Cars\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверка на выбросы:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 204,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Количество выбросов в столбце 'combination_mpg': 8\n",
|
|||
|
"Количество выбросов в столбце 'cylinders': 10\n",
|
|||
|
"Количество выбросов в столбце 'displacement': 21\n",
|
|||
|
"Количество выбросов в столбце 'highway_mpg': 3\n",
|
|||
|
"Количество выбросов в столбце 'city_mpg': 9\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdIAAAPeCAYAAAAI5OjmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACeg0lEQVR4nOzdd1xW9f//8ecFsgQRwYkD3Lg110dNcWauTFPLLHBlpuZoWOYnV45PmZnZcJVYrtJKUzNXrszKUlNzG2qpae694P37wx/n6yVwAAUuwMf9dvNW1/us1znngvd5PznXuRzGGCMAAAAAAAAAAJAgN1cXAAAAAAAAAABARkaQDgAAAAAAAACADYJ0AAAAAAAAAABsEKQDAAAAAAAAAGCDIB0AAAAAAAAAABsE6QAAAAAAAAAA2CBIBwAAAAAAAADABkE6AAAAAAAAAAA2CNIBAAAAAAAAALBBkI4s4eDBg3I4HIqKinJ1KU6+++47Va5cWd7e3nI4HDp79qyrS0oWh8OhPn36JDlfVFSUHA6HDh48mPZFJaBz584KDQ11ybYBAFkP1xP/J6E+vn79+qpfv36qbict1gkAyNiyan87bNgwORwOp7bQ0FB17tw59Yq8g8Ph0LBhw9Js/QCcEaTDSdyg6fZ/efPmVYMGDbR06dJ0r2fNmjVOtXh4eKhYsWKKiIjQn3/+mSrb+PHHHzVs2LBUH5SeOnVKHTp0kI+Pjz744AN99tln8vX1TdVt3A+OHj2qYcOGaevWra4uBQCQTFxPpB6uJwAAiaG/TT30t1nP6NGjtWDBAleXgSwmm6sLQMY0YsQIFS1aVMYYHT9+XFFRUWrevLkWLVqkli1bpns9ffv2VfXq1XXjxg1t3rxZU6ZM0ZIlS7R9+3YFBwff07p//PFHDR8+XJ07d1ZAQEDqFCxp06ZNunDhgt544w01btw41dabkTz99NN64okn5OXllWbbOHr0qIYPH67Q0FBVrlzZadrUqVMVGxubZtsGANwbrifuXUa6nli+fLlLtw8ASBj97b1Lq/52z549cnPjHlZXGD16tNq1a6dHH33U1aUgCyFIR4KaNWumatWqWa+7deumfPnyac6cOS7piOvWrat27dpJkrp06aJSpUqpb9++mjFjhgYNGpTu9STHiRMnJClVO/eMxt3dXe7u7i7bvoeHh8u2DQBIGtcT9y4jXU94enq6ugRbsbGxun79ury9vV1dCgCkK/rbe5dW/W1a3nQGIP3xZzEkS0BAgHx8fJQtm/PfXi5duqQXX3xRhQsXlpeXl0qXLq23335bxhhJ0pUrVxQWFqawsDBduXLFWu706dMqUKCAateurZiYmBTX07BhQ0lSdHS07Xzff/+96tatK19fXwUEBKh169batWuXNX3YsGF6+eWXJUlFixa1PoKW1DO/582bp6pVq8rHx0e5c+fWU089pSNHjljT69evr8jISElS9erV5XA4knwu2pEjR9StWzcFBwfLy8tLRYsW1XPPPafr169b8/z5559q3769AgMDlT17dv3nP//RkiVLnNYT93G6L774QsOHD1fBggWVI0cOtWvXTufOndO1a9fUv39/5c2bV35+furSpYuuXbuWYE2zZs1S6dKl5e3trapVq2rdunVO0xN6fmpoaKhatmypH374QTVq1JC3t7eKFSumTz/91GnZ06dP66WXXlKFChXk5+cnf39/NWvWTL///rvTvlSvXl3SrQuwuPMT9yy9hJ6RntR7Mk7cc+AXLFig8uXLy8vLS+XKldN3332X+ElKQNzz/d5++2198MEHKlasmLJnz66HHnpIf/31l4wxeuONN1SoUCH5+PiodevWOn36tNM64o7Z8uXLrWfylS1bVl999VW87W3btk3h4eHy8fFRoUKFNHLkSE2fPt2lz6oHgOTiesJZel9P/Pnnn3I4HBo/fny85X788Uc5HA7NmTMn0XXf+Tzz2685Ro0apUKFCsnb21
uNGjXS/v374y0/ZcoUFS9eXD4+PqpRo4bWr1+f4HauXbumoUOHqkSJEvLy8lLhwoU1cODAeNcrcX35rFmzVK5cOXl5eVn9+Ny5c1W1alXlyJFD/v7+qlChgiZMmGB77AAgq6C/dZYW/e0PP/yg6tWry9vbW8WLF9fkyZMTnO/OZ6TfuHFDw4cPV8mSJeXt7a2goCA9+OCDWrFihTVP586d5efnpz///FNNmzaVr6+vgoODNWLEiHjj2jsdOnRIvXr1UunSpeXj46OgoCC1b98+wWN09uxZDRgwQKGhofLy8lKhQoUUERGhkydPWvOktE+eN2+eypYtKx8fH9WqVUvbt2+XJE2ePFklSpSQt7e36tevn2A9P//8sx5++GHlzJlT2bNnV3h4uDZs2OA0T9xz6Pfv3299KiFnzpzq0qWLLl++7FTPpUuXNGPGDOt9kpbPqsf9gzvSkaBz587p5MmTMsboxIkTmjhxoi5evKinnnrKmscYo0ceeUSrV69Wt27dVLlyZS1btkwvv/yyjhw5ovHjx8vHx0czZsxQnTp1NHjwYL3zzjuSpN69e+vcuXOKioq6qzuaDxw4IEkKCgpKdJ6VK1eqWbNmKlasmIYNG6YrV65o4sSJqlOnjjZv3qzQ0FC1bdtWe/fu1Zw5czR+/Hjlzp1bkpQnT55E1xsVFaUuXbqoevXqGjNmjI4fP64JEyZow4YN2rJliwICAjR48GCVLl1aU6ZMsT5mV7x48UTXefToUdWoUUNnz55Vjx49FBYWpiNHjmj+/Pm6fPmyPD09dfz4cdWuXVuXL19W3759FRQUpBkzZuiRRx7R/Pnz1aZNG6d1jhkzRj4+Pnr11Ve1f/9+TZw4UR4eHnJzc9OZM2c0bNgw/fTTT4qKilLRokU1ZMgQp+XXrl2rzz//XH379pWXl5c+/PBDPfzww/rll19Uvnx52/Ozf/9+tWvXTt26dVNkZKQ++eQTde7cWVWrVlW5cuUk3fqjwIIFC9S+fXsVLVpUx48f1+TJkxUeHq6dO3cqODhYZcqU0YgRIzRkyBD16NFDdevWlSTVrl07we0m5z15ux9++EFfffWVevXqpRw5cui9997TY489psOHD9u+txIya9YsXb9+Xc8//7xOnz6tt956Sx06dFDDhg21Zs0avfLKK9Z5eOmll/TJJ584Lb9v3z49/vjj6tmzpyIjIzV9+nS1b99e3333nZo0aSLpVjjSoEEDORwODRo0SL6+vpo2bRp3OQDIsLieyFjXE8WKFVOdOnU0a9YsDRgwwGnZWbNmKUeOHGrdunVKDqEk6X//+5/c3Nz00ksv6dy5c3rrrbfUqVMn/fzzz9Y8H3/8sZ599lnVrl1b/fv3159//qlHHnlEgYGBKly4sDVfbGysHnnkEf3www/q0aOHypQpo+3bt2v8+PHau3dvvGedfv/99/riiy/Up08f5c6dW6GhoVqxYoU6duyoRo0a6c0335Qk7dq1Sxs2bFC/fv1SvH8AkNHR36Zvf7t9+3Y99NBDypMnj4YNG6abN29q6NChypcvX5LHYtiwYRozZoy6d++uGjVq6Pz58/r111+1efNma9wnSTExMXr44Yf1n//8R2+99Za+++47DR06VDdv3tSIESMSXf+mTZv0448/6oknnlChQoV08OBBffTRR6pfv7527typ7NmzS5IuXryounXrateuXerataseeOABnTx5Ut98843+/vtv5c6dO8V98vr16/XNN9+od+/ekm5lEi1bttTAgQP14YcfqlevXjpz5ozeeustde3aVd9//7217Pfff69mzZqpatWqGjp0qNzc3DR9+nQ1bNhQ69evV40aNZy21aFDBxUtWlRjxozR5s2bNW3aNOXNm9fq9z/77DPrGPfo0UOSbM8pkGwGuM306dONpHj/vLy8TFRUlNO8CxYsMJLMyJEjndrbtWtnHA6H2b9/v9U2aNAg4+bmZtatW2fmzZtnJJl33303yXpWr15tJJ
lPPvnE/Pvvv+bo0aNmyZIlJjQ01DgcDrNp0yZjjDHR0dFGkpk+fbq1bOXKlU3evHnNqVOnrLbff//duLm5mYiICKt
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x1000 with 5 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Выбираем столбцы для анализа\n",
|
|||
|
"columns_to_check = ['combination_mpg', 'cylinders', 'displacement', 'highway_mpg', 'city_mpg']\n",
|
|||
|
"\n",
|
|||
|
"# Подсчитываем выбросы\n",
|
|||
|
"outliers_count = count_outliers(df_cars, columns_to_check)\n",
|
|||
|
"\n",
|
|||
|
"# Выводим количество выбросов для каждого столбца\n",
|
|||
|
"for col, count in outliers_count.items():\n",
|
|||
|
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
|
|||
|
"\n",
|
|||
|
"# Создаем диаграммы размахов\n",
|
|||
|
"plt.figure(figsize=(15, 10))\n",
|
|||
|
"for i, col in enumerate(columns_to_check, 1):\n",
|
|||
|
" plt.subplot(2, 3, i)\n",
|
|||
|
" sns.boxplot(x=df_cars[col])\n",
|
|||
|
" plt.title(f'Box Plot of {col}')\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"В каждом из выбранных столбцов присутствуют выбросы. Очистим их."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 205,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Количество удаленных строк: 36\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x600 with 0 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAPeCAYAAADj01PlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACRRElEQVR4nOzdd5hU1f0/8M8usEWqVF2RpaiAImCwBBEBOyr2GhKqUSPW2GJMFHvUGGOMsQtGsaGxF2zYS2woRsWGqICgqDQRFM7vD387X8bl0mRZwNfreXh0zr1z72fundlz7ntmzhSklFIAAAAAAACVFFZ3AQAAAAAAsKoSogMAAAAAQAYhOgAAAAAAZBCiAwAAAABABiE6AAAAAABkEKIDAAAAAEAGIToAAAAAAGQQogMAAAAAQAYhOgAAAAAAZBCis9r76KOPoqCgIIYPH17dpeR56KGHonPnzlFSUhIFBQXx9ddfV3dJS6WgoCCOPPLIJa43fPjwKCgoiI8++qjqi1qEAQMGRMuWLatl3wCsmYwp/s+i+vmePXtGz549V+h+qmKbAKz61tQ+d+jQoVFQUJDX1rJlyxgwYMCKK/JHCgoKYujQoVW2feAHQnRyKi6WFv7XtGnT6NWrVzz44IMrvZ4nnngir5ZatWpF69ato1+/fvHhhx+ukH0899xzMXTo0BV+MTpt2rQ44IADorS0NC677LK44YYbonbt2it0Hz8HkyZNiqFDh8aYMWOquxQAloExxYpjTAHA4uhzVxx97prn3HPPjbvuuqu6y2ANUbO6C2DVc+aZZ0arVq0ipRRTpkyJ4cOHx6677hr33ntv7L777iu9nqOPPjq22GKL+O677+LVV1+Nq666Ku6///4YO3ZslJWV/aRtP/fcc3HGGWfEgAEDokGDBium4Ih46aWXYubMmXHWWWfFDjvssMK2uyr5zW9+EwcddFAUFxdX2T4mTZoUZ5xxRrRs2TI6d+6ct+zqq6+OBQsWVNm+AfjpjCl+ulVpTPHwww9X6/4ByKbP/emqqs8dN25cFBb6DGt1OPfcc2O//faLvfbaq7pLYQ0gRKeS3r17x+abb567PXjw4GjWrFncfPPN1dL5du/ePfbbb7+IiBg4cGBstNFGcfTRR8f1118fp5xyykqvZ2lMnTo1ImKFduirmho1akSNGjWqbf+1atWqtn0DsHSMKX66VWlMUVRUVN0lLNaCBQti3rx5UVJSUt2lAKx0+tyfrqr63Kr84Bmw8ngrjCVq0KBBlJaWRs2a+e+5zJ49O44//vhYf/31o7i4ONq2bRt//etfI6UUERFz5syJdu3aRbt27WLOnDm5+3355Zex7rrrxtZbbx3z589f5nq22267iIgYP378Ytd7/PHHo3v37lG7du1o0KBB7LnnnvH222/nlg8dOjROPPHEiIho1apV7utmS5rje+TIkdGlS5coLS2Nxo0bx69//euYOHFibnnPnj2jf//+ERGxxRZbREFBwRLnP5s4cWIMHjw4ysrKori4OFq1ahW/+93vYt68ebl1Pvzww9h///2jYcOGsdZaa8Uvf/nLuP/++/O2U/HVudtuuy3OOOOMWG+99aJu3bqx3377xfTp02Pu3Llx7LHHRtOmTaNOnToxcODAmDt37iJrGjFiRLRt2zZKSkqiS5cu8dRTT+UtX9RcqS1btozdd989nnnmmdhyyy2jpKQkWrduHf/+97/z7vvll1/GCSecEJtuumnUqVMn6tWrF717947XX38977FsscUWEfHDoKvi/FTMmbeoOdGX9JysUDHv+1133RUdOnSI4uLi2GSTTeKhhx7KPkmLUDGP31//+te47LLLonXr1rHWWmvFTjvtFJ988kmklOKss86K5s2bR2lpaey5557x5Zdf5m2j4pg9/PDDubn3Nt544/jPf/5TaX9vvPFG9OjRI0pLS6N58+Zx9tlnx7Bhw6p1bnqAZWFMkW9ljyk+/PDDKCgoiI
svvrjS/Z577rkoKCiIm2++OXPbP56/fOFxxznnnBPNmzePkpKS2H777eP999+vdP+rrroq2rRpE6WlpbHlllvG008/vcj9zJ07N04//fTYYIMNori4ONZff/046aSTKo1ZKvrzESNGxCabbBLFxcW5vvyWW26JLl26RN26daNevXqx6aabxiWXXLLYYwewJtHn5quKPveZZ56JLbbYIkpKSqJNmzZx5ZVXLnK9H8+J/t1338UZZ5wRG264YZSUlESjRo1im222iUceeSS3zoABA6JOnTrx4Ycfxs477xy1a9eOsrKyOPPMMytd3/7YhAkT4ogjjoi2bdtGaWlpNGrUKPbff/9FHqOvv/46jjvuuGjZsmUUFxdH8+bNo1+/fvHFF1/k1lnWfnnkyJGx8cYbR2lpaXTt2jXGjh0bERFXXnllbLDBBlFSUhI9e/ZcZD0vvvhi7LLLLlG/fv1Ya621okePHvHss8/mrVMx7/z777+f+zZC/fr1Y+DAgfHNN9/k1TN79uy4/vrrc8+TqpybnjWfT6JTyfTp0+OLL76IlFJMnTo1Lr300pg1a1b8+te/zq2TUoo99tgjRo8eHYMHD47OnTvHqFGj4sQTT4yJEyfGxRdfHKWlpXH99ddHt27d4tRTT42//e1vERExZMiQmD59egwfPny5Psn8wQcfREREo0aNMtd59NFHo3fv3tG6desYOnRozJkzJy699NLo1q1bvPrqq9GyZcvYZ5994t13342bb745Lr744mjcuHFERDRp0iRzu8OHD4+BAwfGFltsEeedd15MmTIlLrnkknj22WfjtddeiwYNGsSpp54abdu2jauuuir3lbo2bdpkbnPSpEmx5ZZbxtdffx2HHnpotGvXLiZOnBi33357fPPNN1FUVBRTpkyJrbfeOr755ps4+uijo1GjRnH99dfHHnvsEbfffnvsvffeeds877zzorS0NP7whz/E+++/H5deemnUqlUrCgsL46uvvoqhQ4fGCy+8EMOHD49WrVrFaaedlnf/J598Mm699dY4+uijo7i4OP71r3/FLrvsEv/973+jQ4cOiz0/77//fuy3334xePDg6N+/f1x33XUxYMCA6NKlS2yyySYR8cMbAnfddVfsv//+0apVq5gyZUpceeWV0aNHj3jrrbeirKws2rdvH2eeeWacdtppceihh0b37t0jImLrrbde5H6X5jm5sGeeeSb+85//xBFHHBF169aNf/zjH7HvvvvGxx9/vNjn1qKMGDEi5s2bF0cddVR8+eWXccEFF8QBBxwQ2223XTzxxBNx8skn587DCSecENddd13e/d9777048MAD4/DDD4/+/fvHsGHDYv/994+HHnoodtxxx4j4IRTp1atXFBQUxCmnnBK1a9eOa665xqcagFWaMcWqNaZo3bp1dOvWLUaMGBHHHXdc3n1HjBgRdevWjT333HNZDmFERPzlL3+JwsLCOOGEE2L69OlxwQUXRN++fePFF1/MrXPttdfGYYcdFltvvXUce+yx8eGHH8Yee+wRDRs2jPXXXz+33oIFC2KPPfaIZ555Jg499NBo3759jB07Ni6++OJ49913K81r+vjjj8dtt90WRx55ZDRu3DhatmwZjzzySBx88MGx/fbbx/nnnx8REW+//XY8++yzccwxxyzz4wNYHehzV26fO3bs2Nhpp52iSZMmMXTo0Pj+++/j9NNPj2bNmi3xWAwdOjTOO++8OOSQQ2LLLbeMGTNmxMsvvxyvvvpq7vovImL+/Pmxyy67xC9/+cu44IIL4qGHHorTTz89vv/++zjzzDMzt//SSy/Fc889FwcddFA0b948Pvroo7j88sujZ8+e8dZbb8Vaa60VERGzZs2K7t27x9tvvx2DBg2KX/ziF/HFF1/EPffcE59++mk0btx4mfvlp59+Ou65554YMmRIRPyQTey+++5x0kknxb/+9a844ogj4quvvooLLrggBg0aFI8//njuvo8//nj07t
07unTpEqeffnoUFhbGsGHDYrvttounn346ttxyy7x9HXDAAdGqVas477zz4tVXX41rrrkmmjZtmuv7b7jhhtwxPvT
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x1000 with 5 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 0 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Выбираем столбцы для очистки\n",
|
|||
|
"columns_to_clean = ['combination_mpg', 'cylinders', 'displacement', 'highway_mpg', 'city_mpg']\n",
|
|||
|
"\n",
|
|||
|
"# Функция для удаления выбросов\n",
|
|||
|
"def remove_outliers(df, columns):\n",
|
|||
|
" for col in columns:\n",
|
|||
|
" Q1 = df[col].quantile(0.25)\n",
|
|||
|
" Q3 = df[col].quantile(0.75)\n",
|
|||
|
" IQR = Q3 - Q1\n",
|
|||
|
" lower_bound = Q1 - 1.5 * IQR\n",
|
|||
|
" upper_bound = Q3 + 1.5 * IQR\n",
|
|||
|
" \n",
|
|||
|
" # Удаляем строки, содержащие выбросы\n",
|
|||
|
" df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
|
|||
|
" \n",
|
|||
|
" return df\n",
|
|||
|
"\n",
|
|||
|
"# Удаляем выбросы\n",
|
|||
|
"df_cars_clean = remove_outliers(df_cars, columns_to_clean)\n",
|
|||
|
"\n",
|
|||
|
"# Выводим количество удаленных строк\n",
|
|||
|
"print(f\"Количество удаленных строк: {len(df_cars) - len(df_cars_clean)}\")\n",
|
|||
|
"\n",
|
|||
|
"# Создаем диаграммы размаха для очищенных данных\n",
|
|||
|
"plt.figure(figsize=(15, 6))\n",
|
|||
|
"\n",
|
|||
|
"# Создаем диаграммы размахов\n",
|
|||
|
"plt.figure(figsize=(15, 10))\n",
|
|||
|
"for i, col in enumerate(columns_to_clean, 1):\n",
|
|||
|
" plt.subplot(2, 3, i)\n",
|
|||
|
" sns.boxplot(x=df_cars_clean[col])\n",
|
|||
|
" plt.title(f'Box Plot of {col}')\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"df_cars = df_cars_clean"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки (60% / 20% / 20%)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 206,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 306\n",
|
|||
|
"Размер контрольной выборки: 102\n",
|
|||
|
"Размер тестовой выборки: 102\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"train_df, test_df = train_test_split(df_cars, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 207,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение \"Комбинированный расход топлива\" в обучающей выборке:\n",
|
|||
|
"combination_mpg\n",
|
|||
|
"23 32\n",
|
|||
|
"22 29\n",
|
|||
|
"24 23\n",
|
|||
|
"25 22\n",
|
|||
|
"27 22\n",
|
|||
|
"18 21\n",
|
|||
|
"19 19\n",
|
|||
|
"29 18\n",
|
|||
|
"21 18\n",
|
|||
|
"26 17\n",
|
|||
|
"31 16\n",
|
|||
|
"28 14\n",
|
|||
|
"20 13\n",
|
|||
|
"32 12\n",
|
|||
|
"17 11\n",
|
|||
|
"30 10\n",
|
|||
|
"16 3\n",
|
|||
|
"34 3\n",
|
|||
|
"36 1\n",
|
|||
|
"33 1\n",
|
|||
|
"14 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Комбинированный расход топлива\" в контрольной выборке:\n",
|
|||
|
"combination_mpg\n",
|
|||
|
"20 17\n",
|
|||
|
"19 15\n",
|
|||
|
"21 13\n",
|
|||
|
"26 9\n",
|
|||
|
"27 7\n",
|
|||
|
"22 6\n",
|
|||
|
"30 5\n",
|
|||
|
"23 5\n",
|
|||
|
"18 4\n",
|
|||
|
"17 3\n",
|
|||
|
"24 3\n",
|
|||
|
"28 3\n",
|
|||
|
"29 3\n",
|
|||
|
"25 2\n",
|
|||
|
"34 2\n",
|
|||
|
"33 2\n",
|
|||
|
"32 1\n",
|
|||
|
"14 1\n",
|
|||
|
"31 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Комбинированный расход топлива\" в тестовой выборке:\n",
|
|||
|
"combination_mpg\n",
|
|||
|
"21 14\n",
|
|||
|
"18 13\n",
|
|||
|
"22 12\n",
|
|||
|
"27 12\n",
|
|||
|
"23 10\n",
|
|||
|
"31 5\n",
|
|||
|
"20 5\n",
|
|||
|
"26 5\n",
|
|||
|
"24 4\n",
|
|||
|
"29 4\n",
|
|||
|
"28 4\n",
|
|||
|
"19 4\n",
|
|||
|
"25 3\n",
|
|||
|
"32 3\n",
|
|||
|
"17 3\n",
|
|||
|
"30 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['combination_mpg'].value_counts()\n",
|
|||
|
" print(f\"Распределение \\\"Комбинированный расход топлива\\\" в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Оверсэмплинг и андерсэмплинг"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 208,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оверсэмплинг:\n",
|
|||
|
"Распределение \"Комбинированный расход топлива\" в обучающей выборке:\n",
|
|||
|
"combination_mpg\n",
|
|||
|
"21 32\n",
|
|||
|
"22 32\n",
|
|||
|
"25 32\n",
|
|||
|
"19 32\n",
|
|||
|
"29 32\n",
|
|||
|
"23 32\n",
|
|||
|
"28 32\n",
|
|||
|
"18 32\n",
|
|||
|
"27 32\n",
|
|||
|
"20 32\n",
|
|||
|
"16 32\n",
|
|||
|
"30 32\n",
|
|||
|
"32 32\n",
|
|||
|
"31 32\n",
|
|||
|
"24 32\n",
|
|||
|
"26 32\n",
|
|||
|
"17 32\n",
|
|||
|
"36 32\n",
|
|||
|
"34 32\n",
|
|||
|
"33 32\n",
|
|||
|
"14 32\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Комбинированный расход топлива\" в контрольной выборке:\n",
|
|||
|
"combination_mpg\n",
|
|||
|
"20 17\n",
|
|||
|
"19 17\n",
|
|||
|
"17 17\n",
|
|||
|
"27 17\n",
|
|||
|
"22 17\n",
|
|||
|
"26 17\n",
|
|||
|
"24 17\n",
|
|||
|
"32 17\n",
|
|||
|
"21 17\n",
|
|||
|
"18 17\n",
|
|||
|
"30 17\n",
|
|||
|
"23 17\n",
|
|||
|
"29 17\n",
|
|||
|
"28 17\n",
|
|||
|
"34 17\n",
|
|||
|
"25 17\n",
|
|||
|
"14 17\n",
|
|||
|
"33 17\n",
|
|||
|
"31 17\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Комбинированный расход топлива\" в тестовой выборке:\n",
|
|||
|
"combination_mpg\n",
|
|||
|
"28 14\n",
|
|||
|
"32 14\n",
|
|||
|
"30 14\n",
|
|||
|
"23 14\n",
|
|||
|
"20 14\n",
|
|||
|
"26 14\n",
|
|||
|
"21 14\n",
|
|||
|
"18 14\n",
|
|||
|
"27 14\n",
|
|||
|
"25 14\n",
|
|||
|
"22 14\n",
|
|||
|
"19 14\n",
|
|||
|
"29 14\n",
|
|||
|
"24 14\n",
|
|||
|
"31 14\n",
|
|||
|
"17 14\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Андерсэмплинг:\n",
|
|||
|
"Распределение \"Комбинированный расход топлива\" в обучающей выборке:\n",
|
|||
|
"combination_mpg\n",
|
|||
|
"14 1\n",
|
|||
|
"16 1\n",
|
|||
|
"17 1\n",
|
|||
|
"18 1\n",
|
|||
|
"19 1\n",
|
|||
|
"20 1\n",
|
|||
|
"21 1\n",
|
|||
|
"22 1\n",
|
|||
|
"23 1\n",
|
|||
|
"24 1\n",
|
|||
|
"25 1\n",
|
|||
|
"26 1\n",
|
|||
|
"27 1\n",
|
|||
|
"28 1\n",
|
|||
|
"29 1\n",
|
|||
|
"30 1\n",
|
|||
|
"31 1\n",
|
|||
|
"32 1\n",
|
|||
|
"33 1\n",
|
|||
|
"34 1\n",
|
|||
|
"36 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Комбинированный расход топлива\" в контрольной выборке:\n",
|
|||
|
"combination_mpg\n",
|
|||
|
"14 1\n",
|
|||
|
"17 1\n",
|
|||
|
"18 1\n",
|
|||
|
"19 1\n",
|
|||
|
"20 1\n",
|
|||
|
"21 1\n",
|
|||
|
"22 1\n",
|
|||
|
"23 1\n",
|
|||
|
"24 1\n",
|
|||
|
"25 1\n",
|
|||
|
"26 1\n",
|
|||
|
"27 1\n",
|
|||
|
"28 1\n",
|
|||
|
"29 1\n",
|
|||
|
"30 1\n",
|
|||
|
"31 1\n",
|
|||
|
"32 1\n",
|
|||
|
"33 1\n",
|
|||
|
"34 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Комбинированный расход топлива\" в тестовой выборке:\n",
|
|||
|
"combination_mpg\n",
|
|||
|
"17 1\n",
|
|||
|
"18 1\n",
|
|||
|
"19 1\n",
|
|||
|
"20 1\n",
|
|||
|
"21 1\n",
|
|||
|
"22 1\n",
|
|||
|
"23 1\n",
|
|||
|
"24 1\n",
|
|||
|
"25 1\n",
|
|||
|
"26 1\n",
|
|||
|
"27 1\n",
|
|||
|
"28 1\n",
|
|||
|
"29 1\n",
|
|||
|
"30 1\n",
|
|||
|
"31 1\n",
|
|||
|
"32 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"train_df_oversampled = oversample(train_df, 'combination_mpg')\n",
|
|||
|
"val_df_oversampled = oversample(val_df, 'combination_mpg')\n",
|
|||
|
"test_df_oversampled = oversample(test_df, 'combination_mpg')\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df, 'combination_mpg')\n",
|
|||
|
"val_df_undersampled = undersample(val_df, 'combination_mpg')\n",
|
|||
|
"test_df_undersampled = undersample(test_df, 'combination_mpg')\n",
|
|||
|
"\n",
|
|||
|
"print(\"Оверсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"Андерсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Датасет №3 (Экономика стран)\n",
|
|||
|
"Ссылка: https://www.kaggle.com/datasets/pratik453609/economic-data-9-countries-19802020\n",
|
|||
|
"\n",
|
|||
|
"Проблемная область: экономический анализ и прогнозирование макроэкономических показателей.\n",
|
|||
|
"\n",
|
|||
|
"Объекты наблюдения: экономические индексы по странам за определённые годы."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 209,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['stock index', 'country', 'year', 'index price', 'log_indexprice',\n",
|
|||
|
" 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n",
|
|||
|
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n",
|
|||
|
" 'tradebalance', 'USTreasury'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 369 entries, 0 to 368\n",
|
|||
|
"Data columns (total 14 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 stock index 369 non-null object \n",
|
|||
|
" 1 country 369 non-null object \n",
|
|||
|
" 2 year 369 non-null float64\n",
|
|||
|
" 3 index price 317 non-null float64\n",
|
|||
|
" 4 log_indexprice 369 non-null float64\n",
|
|||
|
" 5 inflationrate 326 non-null float64\n",
|
|||
|
" 6 oil prices 369 non-null float64\n",
|
|||
|
" 7 exchange_rate 367 non-null float64\n",
|
|||
|
" 8 gdppercent 350 non-null float64\n",
|
|||
|
" 9 percapitaincome 368 non-null float64\n",
|
|||
|
" 10 unemploymentrate 348 non-null float64\n",
|
|||
|
" 11 manufacturingoutput 278 non-null float64\n",
|
|||
|
" 12 tradebalance 365 non-null float64\n",
|
|||
|
" 13 USTreasury 369 non-null float64\n",
|
|||
|
"dtypes: float64(12), object(2)\n",
|
|||
|
"memory usage: 40.5+ KB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>stock index</th>\n",
|
|||
|
" <th>country</th>\n",
|
|||
|
" <th>year</th>\n",
|
|||
|
" <th>index price</th>\n",
|
|||
|
" <th>log_indexprice</th>\n",
|
|||
|
" <th>inflationrate</th>\n",
|
|||
|
" <th>oil prices</th>\n",
|
|||
|
" <th>exchange_rate</th>\n",
|
|||
|
" <th>gdppercent</th>\n",
|
|||
|
" <th>percapitaincome</th>\n",
|
|||
|
" <th>unemploymentrate</th>\n",
|
|||
|
" <th>manufacturingoutput</th>\n",
|
|||
|
" <th>tradebalance</th>\n",
|
|||
|
" <th>USTreasury</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1980.0</td>\n",
|
|||
|
" <td>168.61</td>\n",
|
|||
|
" <td>2.23</td>\n",
|
|||
|
" <td>0.14</td>\n",
|
|||
|
" <td>21.59</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.09</td>\n",
|
|||
|
" <td>12575.0</td>\n",
|
|||
|
" <td>0.07</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-13.06</td>\n",
|
|||
|
" <td>0.11</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1981.0</td>\n",
|
|||
|
" <td>203.15</td>\n",
|
|||
|
" <td>2.31</td>\n",
|
|||
|
" <td>0.10</td>\n",
|
|||
|
" <td>31.77</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.12</td>\n",
|
|||
|
" <td>13976.0</td>\n",
|
|||
|
" <td>0.08</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-12.52</td>\n",
|
|||
|
" <td>0.14</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1982.0</td>\n",
|
|||
|
" <td>188.98</td>\n",
|
|||
|
" <td>2.28</td>\n",
|
|||
|
" <td>0.06</td>\n",
|
|||
|
" <td>28.52</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.04</td>\n",
|
|||
|
" <td>14434.0</td>\n",
|
|||
|
" <td>0.10</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-19.97</td>\n",
|
|||
|
" <td>0.13</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1983.0</td>\n",
|
|||
|
" <td>285.43</td>\n",
|
|||
|
" <td>2.46</td>\n",
|
|||
|
" <td>0.03</td>\n",
|
|||
|
" <td>26.19</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.09</td>\n",
|
|||
|
" <td>15544.0</td>\n",
|
|||
|
" <td>0.10</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-51.64</td>\n",
|
|||
|
" <td>0.11</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>NASDAQ</td>\n",
|
|||
|
" <td>United States of America</td>\n",
|
|||
|
" <td>1984.0</td>\n",
|
|||
|
" <td>248.89</td>\n",
|
|||
|
" <td>2.40</td>\n",
|
|||
|
" <td>0.04</td>\n",
|
|||
|
" <td>25.88</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.11</td>\n",
|
|||
|
" <td>17121.0</td>\n",
|
|||
|
" <td>0.08</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>-102.73</td>\n",
|
|||
|
" <td>0.12</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" stock index country year index price log_indexprice \\\n",
|
|||
|
"0 NASDAQ United States of America 1980.0 168.61 2.23 \n",
|
|||
|
"1 NASDAQ United States of America 1981.0 203.15 2.31 \n",
|
|||
|
"2 NASDAQ United States of America 1982.0 188.98 2.28 \n",
|
|||
|
"3 NASDAQ United States of America 1983.0 285.43 2.46 \n",
|
|||
|
"4 NASDAQ United States of America 1984.0 248.89 2.40 \n",
|
|||
|
"\n",
|
|||
|
" inflationrate oil prices exchange_rate gdppercent percapitaincome \\\n",
|
|||
|
"0 0.14 21.59 1.0 0.09 12575.0 \n",
|
|||
|
"1 0.10 31.77 1.0 0.12 13976.0 \n",
|
|||
|
"2 0.06 28.52 1.0 0.04 14434.0 \n",
|
|||
|
"3 0.03 26.19 1.0 0.09 15544.0 \n",
|
|||
|
"4 0.04 25.88 1.0 0.11 17121.0 \n",
|
|||
|
"\n",
|
|||
|
" unemploymentrate manufacturingoutput tradebalance USTreasury \n",
|
|||
|
"0 0.07 NaN -13.06 0.11 \n",
|
|||
|
"1 0.08 NaN -12.52 0.14 \n",
|
|||
|
"2 0.10 NaN -19.97 0.13 \n",
|
|||
|
"3 0.10 NaN -51.64 0.11 \n",
|
|||
|
"4 0.08 NaN -102.73 0.12 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 209,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df_countries = pd.read_csv(\".//static//csv//Economic Data - 9 Countries (1980-2020).csv\")\n",
|
|||
|
"print(df_countries.columns)\n",
|
|||
|
"df_countries.info()\n",
|
|||
|
"df_countries.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Атрибуты объектов:\n",
|
|||
|
"1. stock index — индекс акций.\n",
|
|||
|
"2. country — страна.\n",
|
|||
|
"3. year — год.\n",
|
|||
|
"4. index price — цена индекса.\n",
|
|||
|
"5. log_indexprice — логарифм цены индекса.\n",
|
|||
|
"6. inflationrate — уровень инфляции.\n",
|
|||
|
"7. oil prices — цены на нефть.\n",
|
|||
|
"8. exchange_rate — валютный курс.\n",
|
|||
|
"9. gdppercent — процент роста ВВП.\n",
|
|||
|
"10. percapitaincome — доход на душу населения.\n",
|
|||
|
"11. unemploymentrate — уровень безработицы.\n",
|
|||
|
"12. manufacturingoutput — объём производства.\n",
|
|||
|
"13. tradebalance — торговый баланс.\n",
|
|||
|
"14. USTreasury — доходность казначейских облигаций США.\n",
|
|||
|
"\n",
|
|||
|
"Связи между объектами:\n",
|
|||
|
"Некоторые атрибуты могут быть связаны друг с другом, например, уровень инфляции и процент роста ВВП могут коррелировать с ценами на нефть, уровнем безработицы и торговым балансом.\n",
|
|||
|
"\n",
|
|||
|
"Примеры бизнес-целей и эффект:\n",
|
|||
|
"1. Прогнозирование экономического роста и планирование инвестиций:\n",
|
|||
|
" - Бизнес-цель: Создать модель прогнозирования роста экономики для стран, чтобы принять стратегические инвестиционные решения.\n",
|
|||
|
" - Эффект: Повышение точности экономических прогнозов и улучшение прибыльности инвестиционных стратегий.\n",
|
|||
|
"\n",
|
|||
|
"2. Анализ и оптимизация торговой политики:\n",
|
|||
|
" - Бизнес-цель: Изучение влияния изменений торгового баланса и валютных курсов на экономику стран.\n",
|
|||
|
" - Эффект: Улучшение торговых соглашений и политики, что приведёт к более устойчивому экономическому росту.\n",
|
|||
|
"\n",
|
|||
|
"Примеры целей технического проекта:\n",
|
|||
|
"1. Цель: Построение модели для прогнозирования уровня инфляции.\n",
|
|||
|
" - Вход: Уровень безработицы, ВВП, доход на душу населения, валютный курс, цены на нефть.\n",
|
|||
|
" - Целевой признак: inflationrate.\n",
|
|||
|
"\n",
|
|||
|
"2. Цель: Построение модели для оценки экономического роста.\n",
|
|||
|
" - Вход: Торговый баланс, доход на душу населения, валютный курс, инфляция.\n",
|
|||
|
" - Целевой признак: gdppercent."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверка на пустые значения и дубликаты"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 210,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Пустые значения по столбцам:\n",
|
|||
|
"stock index 0\n",
|
|||
|
"country 0\n",
|
|||
|
"year 0\n",
|
|||
|
"index price 52\n",
|
|||
|
"log_indexprice 0\n",
|
|||
|
"inflationrate 43\n",
|
|||
|
"oil prices 0\n",
|
|||
|
"exchange_rate 2\n",
|
|||
|
"gdppercent 19\n",
|
|||
|
"percapitaincome 1\n",
|
|||
|
"unemploymentrate 21\n",
|
|||
|
"manufacturingoutput 91\n",
|
|||
|
"tradebalance 4\n",
|
|||
|
"USTreasury 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Количество дубликатов: 0\n",
|
|||
|
"\n",
|
|||
|
"Статистический обзор данных:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>year</th>\n",
|
|||
|
" <th>index price</th>\n",
|
|||
|
" <th>log_indexprice</th>\n",
|
|||
|
" <th>inflationrate</th>\n",
|
|||
|
" <th>oil prices</th>\n",
|
|||
|
" <th>exchange_rate</th>\n",
|
|||
|
" <th>gdppercent</th>\n",
|
|||
|
" <th>percapitaincome</th>\n",
|
|||
|
" <th>unemploymentrate</th>\n",
|
|||
|
" <th>manufacturingoutput</th>\n",
|
|||
|
" <th>tradebalance</th>\n",
|
|||
|
" <th>USTreasury</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>count</th>\n",
|
|||
|
" <td>369.000000</td>\n",
|
|||
|
" <td>317.000000</td>\n",
|
|||
|
" <td>369.000000</td>\n",
|
|||
|
" <td>326.000000</td>\n",
|
|||
|
" <td>369.000000</td>\n",
|
|||
|
" <td>367.000000</td>\n",
|
|||
|
" <td>350.000000</td>\n",
|
|||
|
" <td>368.000000</td>\n",
|
|||
|
" <td>348.000000</td>\n",
|
|||
|
" <td>278.000000</td>\n",
|
|||
|
" <td>365.000000</td>\n",
|
|||
|
" <td>369.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>mean</th>\n",
|
|||
|
" <td>2000.000000</td>\n",
|
|||
|
" <td>7898.648297</td>\n",
|
|||
|
" <td>3.610542</td>\n",
|
|||
|
" <td>0.041748</td>\n",
|
|||
|
" <td>39.743171</td>\n",
|
|||
|
" <td>27.897548</td>\n",
|
|||
|
" <td>0.037114</td>\n",
|
|||
|
" <td>20719.964674</td>\n",
|
|||
|
" <td>0.068908</td>\n",
|
|||
|
" <td>328.084820</td>\n",
|
|||
|
" <td>-15.996384</td>\n",
|
|||
|
" <td>0.059024</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>std</th>\n",
|
|||
|
" <td>11.848225</td>\n",
|
|||
|
" <td>7811.336862</td>\n",
|
|||
|
" <td>0.482481</td>\n",
|
|||
|
" <td>0.039579</td>\n",
|
|||
|
" <td>25.452654</td>\n",
|
|||
|
" <td>49.620521</td>\n",
|
|||
|
" <td>0.037850</td>\n",
|
|||
|
" <td>17435.037783</td>\n",
|
|||
|
" <td>0.043207</td>\n",
|
|||
|
" <td>622.395923</td>\n",
|
|||
|
" <td>154.557170</td>\n",
|
|||
|
" <td>0.033086</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>min</th>\n",
|
|||
|
" <td>1980.000000</td>\n",
|
|||
|
" <td>168.610000</td>\n",
|
|||
|
" <td>2.230000</td>\n",
|
|||
|
" <td>-0.040000</td>\n",
|
|||
|
" <td>11.350000</td>\n",
|
|||
|
" <td>0.900000</td>\n",
|
|||
|
" <td>-0.110000</td>\n",
|
|||
|
" <td>27.000000</td>\n",
|
|||
|
" <td>0.020000</td>\n",
|
|||
|
" <td>0.590000</td>\n",
|
|||
|
" <td>-770.930000</td>\n",
|
|||
|
" <td>0.010000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>25%</th>\n",
|
|||
|
" <td>1990.000000</td>\n",
|
|||
|
" <td>2407.100000</td>\n",
|
|||
|
" <td>3.320000</td>\n",
|
|||
|
" <td>0.020000</td>\n",
|
|||
|
" <td>19.410000</td>\n",
|
|||
|
" <td>1.330000</td>\n",
|
|||
|
" <td>0.020000</td>\n",
|
|||
|
" <td>2090.250000</td>\n",
|
|||
|
" <td>0.040000</td>\n",
|
|||
|
" <td>80.380000</td>\n",
|
|||
|
" <td>-25.370000</td>\n",
|
|||
|
" <td>0.030000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>50%</th>\n",
|
|||
|
" <td>2000.000000</td>\n",
|
|||
|
" <td>5160.100000</td>\n",
|
|||
|
" <td>3.600000</td>\n",
|
|||
|
" <td>0.030000</td>\n",
|
|||
|
" <td>28.520000</td>\n",
|
|||
|
" <td>5.440000</td>\n",
|
|||
|
" <td>0.030000</td>\n",
|
|||
|
" <td>19969.500000</td>\n",
|
|||
|
" <td>0.060000</td>\n",
|
|||
|
" <td>188.160000</td>\n",
|
|||
|
" <td>-0.140000</td>\n",
|
|||
|
" <td>0.050000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>75%</th>\n",
|
|||
|
" <td>2010.000000</td>\n",
|
|||
|
" <td>10279.500000</td>\n",
|
|||
|
" <td>3.980000</td>\n",
|
|||
|
" <td>0.057500</td>\n",
|
|||
|
" <td>57.880000</td>\n",
|
|||
|
" <td>15.055000</td>\n",
|
|||
|
" <td>0.060000</td>\n",
|
|||
|
" <td>36384.000000</td>\n",
|
|||
|
" <td>0.090000</td>\n",
|
|||
|
" <td>271.977500</td>\n",
|
|||
|
" <td>19.080000</td>\n",
|
|||
|
" <td>0.080000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>max</th>\n",
|
|||
|
" <td>2020.000000</td>\n",
|
|||
|
" <td>47751.330000</td>\n",
|
|||
|
" <td>4.680000</td>\n",
|
|||
|
" <td>0.240000</td>\n",
|
|||
|
" <td>98.560000</td>\n",
|
|||
|
" <td>249.050000</td>\n",
|
|||
|
" <td>0.150000</td>\n",
|
|||
|
" <td>65280.000000</td>\n",
|
|||
|
" <td>0.260000</td>\n",
|
|||
|
" <td>3868.460000</td>\n",
|
|||
|
" <td>366.140000</td>\n",
|
|||
|
" <td>0.140000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" year index price log_indexprice inflationrate oil prices \\\n",
|
|||
|
"count 369.000000 317.000000 369.000000 326.000000 369.000000 \n",
|
|||
|
"mean 2000.000000 7898.648297 3.610542 0.041748 39.743171 \n",
|
|||
|
"std 11.848225 7811.336862 0.482481 0.039579 25.452654 \n",
|
|||
|
"min 1980.000000 168.610000 2.230000 -0.040000 11.350000 \n",
|
|||
|
"25% 1990.000000 2407.100000 3.320000 0.020000 19.410000 \n",
|
|||
|
"50% 2000.000000 5160.100000 3.600000 0.030000 28.520000 \n",
|
|||
|
"75% 2010.000000 10279.500000 3.980000 0.057500 57.880000 \n",
|
|||
|
"max 2020.000000 47751.330000 4.680000 0.240000 98.560000 \n",
|
|||
|
"\n",
|
|||
|
" exchange_rate gdppercent percapitaincome unemploymentrate \\\n",
|
|||
|
"count 367.000000 350.000000 368.000000 348.000000 \n",
|
|||
|
"mean 27.897548 0.037114 20719.964674 0.068908 \n",
|
|||
|
"std 49.620521 0.037850 17435.037783 0.043207 \n",
|
|||
|
"min 0.900000 -0.110000 27.000000 0.020000 \n",
|
|||
|
"25% 1.330000 0.020000 2090.250000 0.040000 \n",
|
|||
|
"50% 5.440000 0.030000 19969.500000 0.060000 \n",
|
|||
|
"75% 15.055000 0.060000 36384.000000 0.090000 \n",
|
|||
|
"max 249.050000 0.150000 65280.000000 0.260000 \n",
|
|||
|
"\n",
|
|||
|
" manufacturingoutput tradebalance USTreasury \n",
|
|||
|
"count 278.000000 365.000000 369.000000 \n",
|
|||
|
"mean 328.084820 -15.996384 0.059024 \n",
|
|||
|
"std 622.395923 154.557170 0.033086 \n",
|
|||
|
"min 0.590000 -770.930000 0.010000 \n",
|
|||
|
"25% 80.380000 -25.370000 0.030000 \n",
|
|||
|
"50% 188.160000 -0.140000 0.050000 \n",
|
|||
|
"75% 271.977500 19.080000 0.080000 \n",
|
|||
|
"max 3868.460000 366.140000 0.140000 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 210,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"null_values = df_countries.isnull().sum()\n",
|
|||
|
"print(\"Пустые значения по столбцам:\")\n",
|
|||
|
"print(null_values)\n",
|
|||
|
"\n",
|
|||
|
"duplicates = df_countries.duplicated().sum()\n",
|
|||
|
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"\\nСтатистический обзор данных:\")\n",
|
|||
|
"df_countries.describe()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Видим, что есть пустые данные, но нет дубликатов. Удаляем их"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 211,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"В наборе данных 'Countries' было удалено 150 строк с пустыми значениями.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df_countries = drop_missing_values(df_countries, \"Countries\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверка на выбросы:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 212,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Количество выбросов в столбце 'year': 0\n",
|
|||
|
"Количество выбросов в столбце 'index price': 17\n",
|
|||
|
"Количество выбросов в столбце 'log_indexprice': 1\n",
|
|||
|
"Количество выбросов в столбце 'inflationrate': 35\n",
|
|||
|
"Количество выбросов в столбце 'oil prices': 0\n",
|
|||
|
"Количество выбросов в столбце 'exchange_rate': 53\n",
|
|||
|
"Количество выбросов в столбце 'gdppercent': 13\n",
|
|||
|
"Количество выбросов в столбце 'percapitaincome': 0\n",
|
|||
|
"Количество выбросов в столбце 'unemploymentrate': 9\n",
|
|||
|
"Количество выбросов в столбце 'manufacturingoutput': 29\n",
|
|||
|
"Количество выбросов в столбце 'tradebalance': 47\n",
|
|||
|
"Количество выбросов в столбце 'USTreasury': 9\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdAAAAPeCAYAAAAMETjbAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeZyN9f//8eeZMRtjZgyDGTTW7OsgFENkX0qlRHaJSpJWZSspiVTWzwcjaVPiU0iEErJl30JDhWxjBtln3r8//M71nWOuMYszq8f9dnOrua73eV+v632u87re53Wucx2HMcYIAAAAAAAAAAC48MjqAAAAAAAAAAAAyI4ooAMAAAAAAAAAYIMCOgAAAAAAAAAANiigAwAAAAAAAABggwI6AAAAAAAAAAA2KKADAAAAAAAAAGCDAjoAAAAAAAAAADYooAMAAAAAAAAAYIMCOgAAAAAAAAAANiigI1s5dOiQHA6HoqKisjoUF99//71q1KghX19fORwOxcbGZnVIAHDLcnrOjYqKksPh0KFDh9y27YzoM7OsWrVKDodDq1atyupQALgJedo9GjdurMaNG2f7PjNCyZIl1aNHj6wOA0A63O7ngP3796t58+YKDAyUw+HQggULMuS8kl3HGdkLBfRcyplUEv8rXLiwmjRpoiVLlmR6PM439c5/Xl5eKl26tLp166Y//vjDLdtYu3atRowY4fbi9unTp9WpUyf5+flp0qRJmjNnjvLly+fWbQDI2ci57kPOBZARyNPuQ54GkNNwDnCfzDwHdO/eXTt27NDo0aM1Z84c1a5d+5b6+/TTT/X++++7J7gs8tZbb2nBggVZHcZtKU9WB4CMNWrUKJUqVUrGGB0/flxRUVFq3bq1vv32W7Vt2zbT4xk4cKDq1Kmjq1ev6rffftP06dO1aNEi7dixQ2FhYbfU99q1azVy5Ej16NFDQUFB7glY0saNG3Xu3Dm98cYbatasmdv6BZD7kHNvXVpy7uOPP65HH31UPj4+btt+TtaoUSNdvHhR3t7eWR0KkG2Rp29dTpwb//DDD1kdQpbZt2+fPDy4bg6QOAe4Q2bN1S9evKh169Zp6NChevrpp9MbrotPP/1UO3fu1KBBg1yWh4eH6+LFi/Ly8nLLdjLSW2+9pYceekj3339/Vody26GAnsu1atXK5VO63r17q0iRIvrss8+y5ATRsGFDPfTQQ5Kknj176s4779TAgQM1e/ZsvfLKK5keT2qcOHFCktx60skMly5dkre3NxNmIBORc29dWnKup6enPD09Mzii7C9xvvf19c3qcIBsjTx963Li3Ph2+2DRGKNLly7Jz8+PD5mBRDgH3LrMmqufPHky1du5VQ6HI0vm0AkJCbpy5Qrz9xyCytptJigoSH5+fsqTx/Wzk3///VfPP/+8SpQoIR8fH5UvX17jxo2TMUbS9U//KlSooAoVKujixYvW42JiYhQaGqoGDRooPj4+zfHce++9kqTo6OibtluxYoUaNmyofPnyKSgoSB06dNCePXus9SNGjNALL7wgSSpVqpT1VaiU7os1b948RUREyM/PT4UKFVLXrl115MgRa33jxo3VvXt3SVKdOnXkcDiSvYfgypUr5XA49M033yRZ9+mnn8rhcGjdunXWsr179+qhhx5ScHCwfH19Vbt2bf3vf/9zeVxMTIyGDBmiqlWryt/fXwEBAWrVqpW2bdvm0s75FbDPP/9cr732mooVK6a8efPq7NmzN91/ABmLnOvKnTlXsr+vYsmSJdW2bVv98ssvqlu3rnx9fVW6dGl9/PHHSR6/a9cu3XvvvfLz81Px4sX15ptvKiEhwXZbS5YsscYkf/78atOmjXbt2uUyZh4eHho2bJjL45z5f8qUKTcdm8aNG6tKlSravHmzGjRoID8/P5UqVUpTp051aXezfJ/cPdDXr1+v1q1bq0CBAsqXL5+qVaumiRMnurRJzTkJyI3I067cnaeTM3nyZF
WuXFk+Pj4KCwvTU089ZXubgUmTJql06dLy8/NT3bp1tXr16nTde/zGxzjz5ZdffqnRo0erePHi8vX1VdOmTXXgwIEkj58+fbrKlCnjEoedy5cva/jw4Spbtqx8fHxUokQJvfjii7p8+bLVpnv37vL19XV5viSpRYsWKlCggI4ePSrp/85xP//8s/r166eCBQsqICBA3bp105kzZ1we6zz3LV26VLVr15afn5+mTZtmrbvxOYqNjdVzzz2nkiVLysfHR8WLF1e3bt106tSpNO0LkNNxDnCVXebqI0aMUHh4uCTphRdekMPhUMmSJZPdzsKFC9WmTRuFhYXJx8dHZcqU0RtvvOHyHDRu3FiLFi3S4cOHrfFw9pncPdBTGmdnrA6HQwcOHLCu9g8MDFTPnj114cIFl7YOh0NPP/205s6da50Dv//+e0nSuHHj1KBBAxUsWFB+fn6KiIjQV199leTx//77r2bPnm3tQ+LxP3LkiHr16qUiRYrIx8dHlStX1syZM5MdN6QNV6DncnFxcTp16pSMMTpx4oQ+/PBDnT9/Xl27drXaGGPUvn17rVy5Ur1791aNGjW0dOlSvfDCCzpy5IgmTJggPz8/zZ49W3fffbeGDh2q8ePHS5KeeuopxcXFKSoqKl2fLB48eFCSVLBgwWTbLF++XK1atVLp0qU1YsQIXbx4UR9++KHuvvtu/fbbbypZsqQ6duyo33//XZ999pkmTJigQoUKSZJCQkKS7TcqKko9e/ZUnTp1NGbMGB0/flwTJ07UmjVrtGXLFgUFBWno0KEqX768pk+fbn3dq0yZMrb9NW7cWCVKlNDcuXP1wAMPuKybO3euypQpo/r160u6XrS5++67VaxYMb388svKly+fvvzyS91///36+uuvrcf/8ccfWrBggR5++GGVKlVKx48f17Rp0xQZGandu3cn+VrXG2+8IW9vbw0ZMkSXL1++7a62AbIaOTfzcu7NHDhwQA899JB69+6t7t27a+bMmerRo4ciIiJUuXJlSdI///yjJk2a6Nq1a1Yenj59uvz8/JL0N2fOHHXv3l0tWrTQO++8owsXLmjKlCm65557tGXLFpUsWVL33nuvBgwYoDFjxuj+++9XrVq1dOzYMT3zzDNq1qyZnnzyyRTjPnPmjFq3bq1OnTqpc+fO+vLLL9W/f395e3urV69eLm1Tm++XLVumtm3bKjQ0VM8++6yKFi2qPXv26LvvvtOzzz4rKfXnJCA3IE9nfZ4eMWKERo4cqWbNmql///7at2+fpkyZoo0bN2rNmjXWV+inTJmip59+Wg0bNtRzzz2nQ4cO6f7771eBAgVUvHjxNG0zOW+//bY8PDw0ZMgQxcXFaezYserSpYvWr19vtZkxY4b69eunBg0aaNCgQfrjjz/Uvn17BQcHq0SJEla7hIQEtW/fXr/88oueeOIJVaxYUTt27NCECRP0+++/W/esnThxolasWKHu3btr3bp18vT01LRp0/TDDz9ozpw5Seb3Tz/9tIKCgjRixAhrrA4fPmx9COC0b98+de7cWf369VPfvn1Vvnx5230+f/68GjZsqD179qhXr16qVauWTp06pf/973/6+++/VahQoVTvC5DTcA7I+nOAlPJcvWPHjgoKCtJzzz2nzp07q3Xr1vL3979p7P7+/ho8eLD8/f21YsUKDRs2TGfPntW7774rSRo6dKji4uL0999/a8KECZJ00z5TM86JderUSaVKldKYMWP022+/6b///a8KFy6sd955x6XdihUr9OWXX+rpp59WoUKFrH4mTpyo9u3bq0uXLrpy5Yo+//xzPfzww/ruu+/Upk0bSdffk/Tp00d169bVE088IUnW+B8/flz16tWzivQhISFasmSJevfurbNnzya5bQ3SwSBXmjVrlpGU5J+Pj4+JiopyabtgwQIjybz55psuyx966CHjcDjMgQMHrGWvvPKK8fDwMD///LOZN2+ekWTef//9FONZuXKlkWRmzp
xpTp48aY4ePWoWLVpkSpYsaRwOh9m4caMxxpjo6GgjycyaNct6bI0aNUzhwoXN6dOnrWXbtm0zHh4eplu3btayd99
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x1000 with 12 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Выбираем столбцы для анализа\n",
|
|||
|
"columns_to_check = ['year', 'index price', 'log_indexprice',\n",
|
|||
|
" 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n",
|
|||
|
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n",
|
|||
|
" 'tradebalance', 'USTreasury']\n",
|
|||
|
"\n",
|
|||
|
"# Подсчитываем выбросы\n",
|
|||
|
"outliers_count = count_outliers(df_countries, columns_to_check)\n",
|
|||
|
"\n",
|
|||
|
"# Выводим количество выбросов для каждого столбца\n",
|
|||
|
"for col, count in outliers_count.items():\n",
|
|||
|
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
|
|||
|
"\n",
|
|||
|
"# Создаем диаграммы размахов\n",
|
|||
|
"plt.figure(figsize=(15, 10))\n",
|
|||
|
"for i, col in enumerate(columns_to_check, 1):\n",
|
|||
|
" plt.subplot(3, 4, i)\n",
|
|||
|
" sns.boxplot(x=df_countries[col])\n",
|
|||
|
" plt.title(f'Box Plot of {col}')\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"В большинстве из выбранных столбцов присутствуют выбросы. Очистим их."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 213,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Количество удаленных строк: 136\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x600 with 0 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABc0AAAPeCAYAAADeQTKDAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gUZdfH8d8mpAEptEACmNB7bwJSVJQm2FGkhCKggAhWsNCUoviISIngo4CIXR4LgoAKFkSUDgpICSjSQQJIJ+f9w919s6SQhCSbhO/nunLpzt57z7nvHfbMnJ2dcZiZCQAAAAAAAAAAyMfbAQAAAAAAAAAAkFNQNAcAAAAAAAAAwImiOQAAAAAAAAAAThTNAQAAAAAAAABwomgOAAAAAAAAAIATRXMAAAAAAAAAAJwomgMAAAAAAAAA4ETRHAAAAAAAAAAAJ4rmAAAAAAAAAAA4UTQH0mnXrl1yOByaNWuWt0Px8OWXX6p27doKDAyUw+HQsWPHkm03a9YsORwO7dq1K9PWnRV9Zpdly5bJ4XBo2bJl3g4FAJAK8m/maNmypVq2bJnj+8wK0dHR6tGjh7fDAAAkcrXn923btunmm29WaGioHA6HPvnkkyzZZ8ip8wzkZBTN4TWuRJD4Lzw8XNdff70WLlyY7fG4iqeuPz8/P5UtW1bdu3fXzp07M2UdP/74o0aOHJliws2oI0eOqFOnTgoKCtLUqVM1Z84cFShQIFPXAQDIG8i/mYf8CwDIKcjvmSc783tMTIw2btyoMWPGaM6cOapfv/4V9ffOO+/olVdeyZzgvGTs2LH65JNPvB0GoHzeDgAYPXq0ypQpIzPTgQMHNGvWLLVr106ff/65brnllmyPZ9CgQWrQoIHOnz+vNWvWaMaMGfriiy+0ceNGRUZGXlHfP/74o0aNGqUePXooLCwscwKW9Msvv+jEiRN67rnn1KpVq1TbduvWTffee68CAgIybf25WfPmzXX69Gn5+/t7OxQAyFbk3yuXnvybUyxevNjbIXjN1q1b5ePDOUMA8jby+5XLruPr06dPa8WKFXr66ac1cODAjIbr4Z133tGmTZs0ePBgj+VRUVE6ffq0/Pz8MmU9WWns2LG66667dNttt3k7FFzlKJrD69q2bevxbWrv3r1VvHhxvfvuu15J6s2aNdNdd90lSerZs6cqVqyoQYMGafbs2Ro2bFi2x5MWBw8elKQ07Sj4+vrK19c3iyPK+c6cOSN/f3/5+PgoMDDQ2+EAQLYj/1659OTfnOJq+5LYzHTmzBkFBQVxwgCAqwL5/cpl1/H1oUOH0ryeK+VwOLxy3JuQkKBz585xzI1ciVMtkOOEhYUpKChI+fJ5fqfzzz//6NFHH1Xp0qUVEBCgSpUq6aWXXpKZSfr3W9rKlSurcuXKOn36tPt1R48eVUREhJo0aaKLFy+mO54bbrhBkhQXF5dqu2+++UbNmjVTgQIFFBYWpltvvVWbN292Pz9y5Eg9/vjjkqQyZcq4f6Z2ueuUffjhh6pXr56CgoJUtGhRde3aVX/99Zf7+ZYtWyomJkaS1KBBAzkcjlSv15nc9dGio6N1yy236IcfflDDhg0VGBiosmXL6q233kry+l9//VU33HCDgoKCVKpUKT3//PNKSEhIdl0LFy50z0lwcLDat2+vX3/91WPOfHx8NHz4cI/XvfPOO3I4HIqNjU11blq2bKnq1atr9erVatKkiYKCglSmTBm99tprHu1cPw1877339Mwzz6hkyZLKnz+/jh8/nuI1zVeuXKl27dqpUKFCKlCggGrWrKlJkyZ5tNmyZYvuuusuFS5cWIGBgapfv74+++yzVGMGgJyK/Osps/NvSqZNm6Zq1aopICBAkZGRGjBgQLI/M586darKli2roKAgNWzYUN9//32GriV+6WtcefCDDz7QmDFjVKpUKQUGBurGG2/U9u3bk7x+xowZKleunEccyTl79qxGjBih8uXLKy
AgQKVLl9YTTzyhs2fPutvExMQoMDDQ4/2SpNatW6tQoULau3evpP/fd/nuu+/Ur18/FSlSRCEhIerevbv+/vtvj9e69mkWLVqk+vXrKygoSNOnT3c/d+l7dOzYMQ0ZMkTR0dEKCAhQqVKl1L17dx0+fDhdYwGAnIr87imnHF+PHDlSUVFRkqTHH39cDodD0dHRKa7n008/Vfv27RUZGamAgACVK1dOzz33nMd70LJlS33xxRfavXu3ez5cfaZ0TfPLzbMrVofDoe3bt7vP6g8NDVXPnj116tQpj7YOh0MDBw7U3Llz3fs3X375pSTppZdeUpMmTVSkSBEFBQWpXr16+uijj5K8/p9//tHs2bPdY0g8/3/99Zd69eql4sWLKyAgQNWqVdObb76Z4rwBV4IzzeF18fHxOnz4sMxMBw8e1OTJk3Xy5El17drV3cbM1LFjRy1dulS9e/dW7dq1tWjRIj3++OP666+/NHHiRAUFBWn27Nlq2rSpnn76ab388suSpAEDBig+Pl6zZs3K0DfAO3bskCQVKVIkxTZfffWV2rZtq7Jly2rkyJE6ffq0Jk+erKZNm2rNmjWKjo7WHXfcod9//13vvvuuJk6cqKJFi0qSihUrlmK/s2bNUs+ePdWgQQONGzdOBw4c0KRJk7R8+XKtXbtWYWFhevrpp1WpUiXNmDHD/VO8cuXKpXuc27dv11133aXevXsrJiZGb775pnr06KF69eqpWrVqkqT9+/fr+uuv14ULFzR06FAVKFBAM2bMUFBQUJL+5syZo5iYGLVu3VovvPCCTp06pdjYWF133XVau3atoqOjdcMNN6h///4aN26cbrvtNtWtW1f79u3TQw89pFatWumBBx64bNx///232rVrp06dOqlz58764IMP9OCDD8rf31+9evXyaPvcc8/J399fjz32mM6ePZvi2XZLlizRLbfcooiICD388MMqUaKENm/erPnz5+vhhx+W9O+XB02bNlXJkiXdc/HBBx/otttu08cff6zbb789vW8BAGQr8q/38+/IkSM1atQotWrVSg8++KC2bt2q2NhY/fLLL1q+fLn7J9SxsbEaOHCgmjVrpiFDhmjXrl267bbbVKhQIZUqVSpd60zJ+PHj5ePjo8cee0zx8fF68cUX1aVLF61cudLd5o033lC/fv3UpEkTDR48WDt37lTHjh1VuHBhlS5d2t0uISFBHTt21A8//KC+ffuqSpUq2rhxoyZOnKjff//dfZ3SSZMm6ZtvvlFMTIxWrFghX19fTZ8+XYsXL9acOXOS/Gx/4MCBCgsL08iRI91ztXv3bnfh32Xr1q3q3Lmz+vXrpz59+qhSpUrJjvnkyZNq1qyZNm/erF69eqlu3bo6fPiwPvvsM+3Zs0dFixZN81gAIKcgv3s/v0uXP76+4447FBYWpiFDhqhz585q166dChYsmGrsBQsW1COPPKKCBQvqm2++0fDhw3X8+HFNmDBBkvT0008rPj5ee/bs0cSJEyUp1T7TMs+JderUSWXKlNG4ceO0Zs0a/fe//1V4eLheeOEFj3bffPONPvjgAw0cOFBFixZ19zNp0iR17NhRXbp00blz5/Tee+/p7rvv1vz589W+fXtJ/9YR7r//fjVs2FB9+/aVJPf8HzhwQNdee627MF+sWDEtXLhQvXv31vHjx5Nckga4YgZ4ycyZM01Skr+AgACbNWuWR9tPPvnEJNnzzz/vsfyuu+4yh8Nh27dvdy8bNmyY+fj42HfffWcffvihSbJXXnnlsvEsXbrUJNmbb75phw4dsr1799oXX3xh0dHR5nA47JdffjEzs7i4OJNkM2fOdL+2du3aFh4ebkeOHHEvW79+vfn4+Fj37t3dyyZMmGCSLC4u7rLxnDt3zsLDw6169ep2+vRp9/L58+ebJBs+fLh7mWsuXTGmxtU2cQxRUVEmyb777jv3soMHD1pAQIA9+uij7mWDBw82SbZy5UqPdqGhoR59njhxwsLCwq
xPnz4e696/f7+FhoZ6LP/nn3+sfPnyVq1aNTtz5oy1b9/eQkJCbPfu3ZcdS4sWLUyS/ec//3EvO3v2rPv9OHfunJn
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x1000 with 9 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 0 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Выбираем столбцы для очистки\n",
|
|||
|
"columns_to_clean = ['index price', 'log_indexprice',\n",
|
|||
|
" 'inflationrate', 'exchange_rate', 'gdppercent', 'unemploymentrate', 'manufacturingoutput',\n",
|
|||
|
" 'tradebalance', 'USTreasury']\n",
|
|||
|
"\n",
|
|||
|
"# Удаляем выбросы\n",
|
|||
|
"df_countries_clean = remove_outliers(df_countries, columns_to_clean)\n",
|
|||
|
"\n",
|
|||
|
"# Выводим количество удаленных строк\n",
|
|||
|
"print(f\"Количество удаленных строк: {len(df_countries) - len(df_countries_clean)}\")\n",
|
|||
|
"\n",
|
|||
|
"# Создаем диаграммы размаха для очищенных данных\n",
|
|||
|
"plt.figure(figsize=(15, 6))\n",
|
|||
|
"\n",
|
|||
|
"# Создаем диаграммы размахов\n",
|
|||
|
"plt.figure(figsize=(15, 10))\n",
|
|||
|
"for i, col in enumerate(columns_to_clean, 1):\n",
|
|||
|
" plt.subplot(3, 3, i)\n",
|
|||
|
" sns.boxplot(x=df_countries_clean[col])\n",
|
|||
|
" plt.title(f'Box Plot of {col}')\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"df_countries = df_countries_clean"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 214,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 49\n",
|
|||
|
"Размер контрольной выборки: 17\n",
|
|||
|
"Размер тестовой выборки: 17\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"train_df, test_df = train_test_split(df_countries, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 215,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение \"Уровень инфляции\" в обучающей выборке:\n",
|
|||
|
"inflationrate\n",
|
|||
|
"0.02 25\n",
|
|||
|
"0.03 11\n",
|
|||
|
"0.01 9\n",
|
|||
|
"0.04 4\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Уровень инфляции\" в контрольной выборке:\n",
|
|||
|
"inflationrate\n",
|
|||
|
"0.03 6\n",
|
|||
|
"0.01 6\n",
|
|||
|
"0.02 5\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Уровень инфляции\" в тестовой выборке:\n",
|
|||
|
"inflationrate\n",
|
|||
|
"0.02 6\n",
|
|||
|
"0.03 6\n",
|
|||
|
"0.01 4\n",
|
|||
|
"0.04 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['inflationrate'].value_counts()\n",
|
|||
|
" print(f\"Распределение \\\"Уровень инфляции\\\" в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
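"Оверсемплинг и андерсемплинг по бинаризованному признаку `inflationrate_binned` (два квантильных интервала `inflationrate`): выравнивается число строк в интервалах, а не число отдельных значений `inflationrate`"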
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 216,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оверсэмплинг:\n",
|
|||
|
"Распределение \"Уровень инфляции\" в обучающей выборке:\n",
|
|||
|
"inflationrate\n",
|
|||
|
"0.03 26\n",
|
|||
|
"0.02 25\n",
|
|||
|
"0.01 9\n",
|
|||
|
"0.04 8\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Уровень инфляции\" в контрольной выборке:\n",
|
|||
|
"inflationrate\n",
|
|||
|
"0.03 11\n",
|
|||
|
"0.01 6\n",
|
|||
|
"0.02 5\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Уровень инфляции\" в тестовой выборке:\n",
|
|||
|
"inflationrate\n",
|
|||
|
"0.03 8\n",
|
|||
|
"0.02 6\n",
|
|||
|
"0.01 4\n",
|
|||
|
"0.04 2\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Андерсэмплинг:\n",
|
|||
|
"Распределение \"Уровень инфляции\" в обучающей выборке:\n",
|
|||
|
"inflationrate\n",
|
|||
|
"0.03 11\n",
|
|||
|
"0.02 10\n",
|
|||
|
"0.01 5\n",
|
|||
|
"0.04 4\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Уровень инфляции\" в контрольной выборке:\n",
|
|||
|
"inflationrate\n",
|
|||
|
"0.03 6\n",
|
|||
|
"0.01 4\n",
|
|||
|
"0.02 2\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение \"Уровень инфляции\" в тестовой выборке:\n",
|
|||
|
"inflationrate\n",
|
|||
|
"0.03 6\n",
|
|||
|
"0.02 5\n",
|
|||
|
"0.01 2\n",
|
|||
|
"0.04 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def binning(target, bins):\n",
|
|||
|
" return pd.qcut(target, q=bins, labels=False)\n",
|
|||
|
"\n",
|
|||
|
"train_df['inflationrate_binned'] = binning(train_df['inflationrate'], bins=2)\n",
|
|||
|
"val_df['inflationrate_binned'] = binning(val_df['inflationrate'], bins=2)\n",
|
|||
|
"test_df['inflationrate_binned'] = binning(test_df['inflationrate'], bins=2)\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df, 'inflationrate_binned')\n",
|
|||
|
"val_df_oversampled = oversample(val_df, 'inflationrate_binned')\n",
|
|||
|
"test_df_oversampled = oversample(test_df, 'inflationrate_binned')\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df, 'inflationrate_binned')\n",
|
|||
|
"val_df_undersampled = undersample(val_df, 'inflationrate_binned')\n",
|
|||
|
"test_df_undersampled = undersample(test_df, 'inflationrate_binned')\n",
|
|||
|
"\n",
|
|||
|
"print(\"Оверсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"Андерсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimvenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|