diff --git a/lab_2/lab2.ipynb b/lab_2/lab2.ipynb new file mode 100644 index 0000000..fcf0162 --- /dev/null +++ b/lab_2/lab2.ipynb @@ -0,0 +1,2493 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Датасет №1 (Использование мобильных устройств и поведение пользователей)\n", + "Ссылка: https://www.kaggle.com/datasets/valakhorasani/mobile-device-usage-and-user-behavior-dataset\n", + "\n", + "Проблемная область: прогнозирование пользовательского поведения и сегментация пользователей для улучшения работы приложений, оптимизации потребления энергии, анализа пользовательского опыта или рекламы.\n", + "\n", + "Объекты наблюдения: пользователи мобильных устройств, чьи данные об использовании собираются и анализируются." + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['User ID', 'Device Model', 'Operating System',\n", + " 'App Usage Time (min/day)', 'Screen On Time (hours/day)',\n", + " 'Battery Drain (mAh/day)', 'Number of Apps Installed',\n", + " 'Data Usage (MB/day)', 'Age', 'Gender', 'User Behavior Class'],\n", + " dtype='object')\n", + "\n", + "RangeIndex: 700 entries, 0 to 699\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 User ID 700 non-null int64 \n", + " 1 Device Model 700 non-null object \n", + " 2 Operating System 700 non-null object \n", + " 3 App Usage Time (min/day) 700 non-null int64 \n", + " 4 Screen On Time (hours/day) 700 non-null float64\n", + " 5 Battery Drain (mAh/day) 700 non-null int64 \n", + " 6 Number of Apps Installed 700 non-null int64 \n", + " 7 Data Usage (MB/day) 700 non-null int64 \n", + " 8 Age 700 non-null int64 \n", + " 9 Gender 700 non-null object \n", + " 10 User Behavior Class 700 non-null int64 \n", + "dtypes: float64(1), int64(7), object(3)\n", + "memory usage: 60.3+ KB\n" + ] + }, + { + "data": { + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
User IDDevice ModelOperating SystemApp Usage Time (min/day)Screen On Time (hours/day)Battery Drain (mAh/day)Number of Apps InstalledData Usage (MB/day)AgeGenderUser Behavior Class
01Google Pixel 5Android3936.4187267112240Male4
12OnePlus 9Android2684.713314294447Female3
23Xiaomi Mi 11Android1544.07613232242Male2
34Google Pixel 5Android2394.816765687120Male3
45iPhone 12iOS1874.313675898831Female3
\n", + "
" + ], + "text/plain": [ + " User ID Device Model Operating System App Usage Time (min/day) \\\n", + "0 1 Google Pixel 5 Android 393 \n", + "1 2 OnePlus 9 Android 268 \n", + "2 3 Xiaomi Mi 11 Android 154 \n", + "3 4 Google Pixel 5 Android 239 \n", + "4 5 iPhone 12 iOS 187 \n", + "\n", + " Screen On Time (hours/day) Battery Drain (mAh/day) \\\n", + "0 6.4 1872 \n", + "1 4.7 1331 \n", + "2 4.0 761 \n", + "3 4.8 1676 \n", + "4 4.3 1367 \n", + "\n", + " Number of Apps Installed Data Usage (MB/day) Age Gender \\\n", + "0 67 1122 40 Male \n", + "1 42 944 47 Female \n", + "2 32 322 42 Male \n", + "3 56 871 20 Male \n", + "4 58 988 31 Female \n", + "\n", + " User Behavior Class \n", + "0 4 \n", + "1 3 \n", + "2 2 \n", + "3 3 \n", + "4 3 " + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "df_mobiles = pd.read_csv(\".//static//csv//user_behavior_dataset.csv\")\n", + "print(df_mobiles.columns)\n", + "df_mobiles.info()\n", + "df_mobiles.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Атрибуты объектов:\n", + "1. User ID — уникальный идентификатор пользователя.\n", + "2. Device Model — модель устройства.\n", + "3. Operating System — операционная система устройства.\n", + "4. App Usage Time (min/day) — время использования приложений в минутах в день.\n", + "5. Screen On Time (hours/day) — время включенного экрана в часах в день.\n", + "6. Battery Drain (mAh/day) — потребление батареи в мАч в день.\n", + "7. Number of Apps Installed — количество установленных приложений.\n", + "8. Data Usage (MB/day) — объем данных в мегабайтах в день.\n", + "9. Age — возраст пользователя.\n", + "10. Gender — пол пользователя.\n", + "11. 
User Behavior Class — класс поведения пользователя (категория для классификации).\n", + "\n", + "Связи между объектами:\n", + "Атрибуты, такие как модель устройства, ОС и время использования приложений, могут быть связаны с классом поведения, представляя зависимости между действиями пользователя и его характеристиками.\n", + "\n", + "Примеры бизнес-целей и эффекты для бизнеса:\n", + "1. Оптимизация энергопотребления устройств:\n", + " - Бизнес-цель: Оптимизировать работу приложений для снижения расхода батареи, что увеличит время работы устройства и улучшит пользовательский опыт.\n", + " - Эффект: Повышение удовлетворенности клиентов и снижение вероятности перехода на конкурентные приложения.\n", + "\n", + "2. Сегментация пользователей для рекламы:\n", + " - Бизнес-цель: Создание таргетированной рекламы на основе поведения пользователей (классы поведения).\n", + " - Эффект: Увеличение конверсий и доходов от рекламных кампаний за счет более точной сегментации.\n", + "\n", + "Примеры целей технического проекта:\n", + "1. Цель: Построение модели для прогнозирования расхода батареи.\n", + " - Вход: Модель устройства, ОС, время использования приложений, количество приложений, возраст.\n", + " - Целевой признак: Battery Drain (mAh/day).\n", + "\n", + "2. Цель: Сегментация пользователей для рекламных кампаний.\n", + " - Вход: Время использования приложений, возраст, пол, объем данных.\n", + " - Целевой признак: User Behavior Class." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проверка на пустые значения и дубликаты" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Пустые значения по столбцам:\n", + "User ID 0\n", + "Device Model 0\n", + "Operating System 0\n", + "App Usage Time (min/day) 0\n", + "Screen On Time (hours/day) 0\n", + "Battery Drain (mAh/day) 0\n", + "Number of Apps Installed 0\n", + "Data Usage (MB/day) 0\n", + "Age 0\n", + "Gender 0\n", + "User Behavior Class 0\n", + "dtype: int64\n", + "\n", + "Количество дубликатов: 0\n", + "\n", + "Статистический обзор данных:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
User IDApp Usage Time (min/day)Screen On Time (hours/day)Battery Drain (mAh/day)Number of Apps InstalledData Usage (MB/day)AgeUser Behavior Class
count700.00000700.000000700.000000700.000000700.000000700.000000700.000000700.000000
mean350.50000271.1285715.2727141525.15857150.681429929.74285738.4828572.990000
std202.21688177.1994843.068584819.13641426.943324640.45172912.0129161.401476
min1.0000030.0000001.000000302.00000010.000000102.00000018.0000001.000000
25%175.75000113.2500002.500000722.25000026.000000373.00000028.0000002.000000
50%350.50000227.5000004.9000001502.50000049.000000823.50000038.0000003.000000
75%525.25000434.2500007.4000002229.50000074.0000001341.00000049.0000004.000000
max700.00000598.00000012.0000002993.00000099.0000002497.00000059.0000005.000000
\n", + "
" + ], + "text/plain": [ + " User ID App Usage Time (min/day) Screen On Time (hours/day) \\\n", + "count 700.00000 700.000000 700.000000 \n", + "mean 350.50000 271.128571 5.272714 \n", + "std 202.21688 177.199484 3.068584 \n", + "min 1.00000 30.000000 1.000000 \n", + "25% 175.75000 113.250000 2.500000 \n", + "50% 350.50000 227.500000 4.900000 \n", + "75% 525.25000 434.250000 7.400000 \n", + "max 700.00000 598.000000 12.000000 \n", + "\n", + " Battery Drain (mAh/day) Number of Apps Installed Data Usage (MB/day) \\\n", + "count 700.000000 700.000000 700.000000 \n", + "mean 1525.158571 50.681429 929.742857 \n", + "std 819.136414 26.943324 640.451729 \n", + "min 302.000000 10.000000 102.000000 \n", + "25% 722.250000 26.000000 373.000000 \n", + "50% 1502.500000 49.000000 823.500000 \n", + "75% 2229.500000 74.000000 1341.000000 \n", + "max 2993.000000 99.000000 2497.000000 \n", + "\n", + " Age User Behavior Class \n", + "count 700.000000 700.000000 \n", + "mean 38.482857 2.990000 \n", + "std 12.012916 1.401476 \n", + "min 18.000000 1.000000 \n", + "25% 28.000000 2.000000 \n", + "50% 38.000000 3.000000 \n", + "75% 49.000000 4.000000 \n", + "max 59.000000 5.000000 " + ] + }, + "execution_count": 196, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "null_values = df_mobiles.isnull().sum()\n", + "print(\"Пустые значения по столбцам:\")\n", + "print(null_values)\n", + "\n", + "duplicates = df_mobiles.duplicated().sum()\n", + "print(f\"\\nКоличество дубликатов: {duplicates}\")\n", + "\n", + "print(\"\\nСтатистический обзор данных:\")\n", + "df_mobiles.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Пустых значений и дубликатов нет, проверим на выбросы:" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Количество выбросов в столбце 'App Usage Time (min/day)': 0\n", + "Количество выбросов в столбце 
'Screen On Time (hours/day)': 0\n", + "Количество выбросов в столбце 'Battery Drain (mAh/day)': 0\n", + "Количество выбросов в столбце 'Number of Apps Installed': 0\n", + "Количество выбросов в столбце 'Data Usage (MB/day)': 0\n", + "Количество выбросов в столбце 'User Behavior Class': 0\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Выбираем столбцы для анализа\n", + "columns_to_check = ['App Usage Time (min/day)', 'Screen On Time (hours/day)', 'Battery Drain (mAh/day)', 'Number of Apps Installed', 'Data Usage (MB/day)', 'User Behavior Class']\n", + "\n", + "# Функция для подсчета выбросов\n", + "def count_outliers(data, columns):\n", + " outliers_count = {}\n", + " for col in columns:\n", + " Q1 = data[col].quantile(0.25)\n", + " Q3 = data[col].quantile(0.75)\n", + " IQR = Q3 - Q1\n", + " lower_bound = Q1 - 1.5 * IQR\n", + " upper_bound = Q3 + 1.5 * IQR\n", + " \n", + " # Считаем количество выбросов\n", + " outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]\n", + " outliers_count[col] = len(outliers)\n", + " \n", + " return outliers_count\n", + "\n", + "# Подсчитываем выбросы\n", + "outliers_count = count_outliers(df_mobiles, columns_to_check)\n", + "\n", + "# Выводим количество выбросов для каждого столбца\n", + "for col, count in outliers_count.items():\n", + " print(f\"Количество выбросов в столбце '{col}': {count}\")\n", + "\n", + "# Создаем диаграммы размахов\n", + "plt.figure(figsize=(15, 10))\n", + "for i, col in enumerate(columns_to_check, 1):\n", + " plt.subplot(2, 3, i)\n", + " sns.boxplot(x=df_mobiles[col])\n", + " plt.title(f'Box Plot of {col}')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выбросов нет" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Разбиение набора данных на обучающую, контрольную и тестовую выборки" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 420\n", + "Размер контрольной выборки: 140\n", + "Размер тестовой выборки: 140\n" + ] + } + ], + "source": [ + "train_df, test_df = train_test_split(df_mobiles, 
test_size=0.2, random_state=42)\n", + "\n", + "train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n", + "\n", + "print(\"Размер обучающей выборки:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Распределение \"Класс поведения пользователя\" в обучающей выборке:\n", + "User Behavior Class\n", + "2 88\n", + "5 88\n", + "4 86\n", + "3 84\n", + "1 74\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Класс поведения пользователя\" в контрольной выборке:\n", + "User Behavior Class\n", + "1 35\n", + "2 29\n", + "4 26\n", + "5 25\n", + "3 25\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Класс поведения пользователя\" в тестовой выборке:\n", + "User Behavior Class\n", + "3 34\n", + "2 29\n", + "4 27\n", + "1 27\n", + "5 23\n", + "Name: count, dtype: int64\n", + "\n" + ] + } + ], + "source": [ + "def check_balance(df, name):\n", + " counts = df['User Behavior Class'].value_counts()\n", + " print(f\"Распределение \\\"Класс поведения пользователя\\\" в {name}:\")\n", + " print(counts)\n", + " print()\n", + "\n", + "check_balance(train_df, \"обучающей выборке\")\n", + "check_balance(val_df, \"контрольной выборке\")\n", + "check_balance(test_df, \"тестовой выборке\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оверсемплинг и андерсемплинг" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Оверсэмплинг:\n", + "Распределение \"Класс поведения пользователя\" в обучающей выборке:\n", + "User Behavior Class\n", + "1 88\n", + "2 88\n", + "5 88\n", + "4 88\n", + "3 88\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Класс 
поведения пользователя\" в контрольной выборке:\n", + "User Behavior Class\n", + "5 35\n", + "3 35\n", + "1 35\n", + "2 35\n", + "4 35\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Класс поведения пользователя\" в тестовой выборке:\n", + "User Behavior Class\n", + "4 34\n", + "1 34\n", + "2 34\n", + "3 34\n", + "5 34\n", + "Name: count, dtype: int64\n", + "\n", + "Андерсэмплинг:\n", + "Распределение \"Класс поведения пользователя\" в обучающей выборке:\n", + "User Behavior Class\n", + "1 74\n", + "2 74\n", + "3 74\n", + "4 74\n", + "5 74\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Класс поведения пользователя\" в контрольной выборке:\n", + "User Behavior Class\n", + "1 25\n", + "2 25\n", + "3 25\n", + "4 25\n", + "5 25\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Класс поведения пользователя\" в тестовой выборке:\n", + "User Behavior Class\n", + "1 23\n", + "2 23\n", + "3 23\n", + "4 23\n", + "5 23\n", + "Name: count, dtype: int64\n", + "\n" + ] + } + ], + "source": [ + "from imblearn.over_sampling import RandomOverSampler\n", + "from imblearn.under_sampling import RandomUnderSampler\n", + "\n", + "def oversample(df, target_column):\n", + " X = df.drop(target_column, axis=1)\n", + " y = df[target_column]\n", + " \n", + " oversampler = RandomOverSampler(random_state=42)\n", + " x_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n", + " \n", + " resampled_df = pd.concat([x_resampled, y_resampled], axis=1) \n", + " return resampled_df\n", + "\n", + "def undersample(df, target_column):\n", + " X = df.drop(target_column, axis=1)\n", + " y = df[target_column]\n", + " \n", + " undersampler = RandomUnderSampler(random_state=42)\n", + " x_resampled, y_resampled = undersampler.fit_resample(X, y) # type: ignore\n", + " \n", + " resampled_df = pd.concat([x_resampled, y_resampled], axis=1)\n", + " return resampled_df\n", + "\n", + "train_df_oversampled = oversample(train_df, 'User Behavior 
Class')\n", + "val_df_oversampled = oversample(val_df, 'User Behavior Class')\n", + "test_df_oversampled = oversample(test_df, 'User Behavior Class')\n", + "\n", + "train_df_undersampled = undersample(train_df, 'User Behavior Class')\n", + "val_df_undersampled = undersample(val_df, 'User Behavior Class')\n", + "test_df_undersampled = undersample(test_df, 'User Behavior Class')\n", + "\n", + "print(\"Оверсэмплинг:\")\n", + "check_balance(train_df_oversampled, \"обучающей выборке\")\n", + "check_balance(val_df_oversampled, \"контрольной выборке\")\n", + "check_balance(test_df_oversampled, \"тестовой выборке\")\n", + "\n", + "print(\"Андерсэмплинг:\")\n", + "check_balance(train_df_undersampled, \"обучающей выборке\")\n", + "check_balance(val_df_undersampled, \"контрольной выборке\")\n", + "check_balance(test_df_undersampled, \"тестовой выборке\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Датасет №2 (Характеристики автомобиля: данные об экономии топлива)\n", + "Ссылка: https://www.kaggle.com/datasets/arslaan5/explore-car-performance-fuel-efficiency-data\n", + "\n", + "Проблемная область: производительность и экономичность транспортных средств.\n", + "\n", + "Объекты наблюдения: автомобили, представленные набором характеристик." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 201, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['city_mpg', 'class', 'combination_mpg', 'cylinders', 'displacement',\n", + " 'drive', 'fuel_type', 'highway_mpg', 'make', 'model', 'transmission',\n", + " 'year'],\n", + " dtype='object')\n", + "\n", + "RangeIndex: 550 entries, 0 to 549\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 city_mpg 550 non-null int64 \n", + " 1 class 550 non-null object \n", + " 2 combination_mpg 550 non-null int64 \n", + " 3 cylinders 548 non-null float64\n", + " 4 displacement 548 non-null float64\n", + " 5 drive 550 non-null object \n", + " 6 fuel_type 550 non-null object \n", + " 7 highway_mpg 550 non-null int64 \n", + " 8 make 550 non-null object \n", + " 9 model 550 non-null object \n", + " 10 transmission 550 non-null object \n", + " 11 year 550 non-null int64 \n", + "dtypes: float64(2), int64(4), object(6)\n", + "memory usage: 51.7+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city_mpgclasscombination_mpgcylindersdisplacementdrivefuel_typehighway_mpgmakemodeltransmissionyear
025midsize car294.02.5fwdgas36mazda6m2014
126midsize car304.02.5fwdgas37mazda6a2014
225small sport utility vehicle274.02.5fwdgas31mazdacx-5 2wda2014
326small sport utility vehicle294.02.0fwdgas34mazdacx-5 2wdm2014
426small sport utility vehicle284.02.0fwdgas32mazdacx-5 2wda2014
\n", + "
" + ], + "text/plain": [ + " city_mpg class combination_mpg cylinders \\\n", + "0 25 midsize car 29 4.0 \n", + "1 26 midsize car 30 4.0 \n", + "2 25 small sport utility vehicle 27 4.0 \n", + "3 26 small sport utility vehicle 29 4.0 \n", + "4 26 small sport utility vehicle 28 4.0 \n", + "\n", + " displacement drive fuel_type highway_mpg make model transmission \\\n", + "0 2.5 fwd gas 36 mazda 6 m \n", + "1 2.5 fwd gas 37 mazda 6 a \n", + "2 2.5 fwd gas 31 mazda cx-5 2wd a \n", + "3 2.0 fwd gas 34 mazda cx-5 2wd m \n", + "4 2.0 fwd gas 32 mazda cx-5 2wd a \n", + "\n", + " year \n", + "0 2014 \n", + "1 2014 \n", + "2 2014 \n", + "3 2014 \n", + "4 2014 " + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_cars = pd.read_csv(\".//static//csv//car_data.csv\")\n", + "print(df_cars.columns)\n", + "df_cars.info()\n", + "df_cars.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Атрибуты объектов:\n", + "\n", + "1. city_mpg — расход топлива в городе (миль на галлон).\n", + "2. class — класс автомобиля (например, седан среднего размера, малый внедорожник).\n", + "3. combination_mpg — комбинированный расход топлива (миль на галлон).\n", + "4. cylinders — количество цилиндров.\n", + "5. displacement — объем двигателя (в литрах).\n", + "6. drive — тип привода (например, передний, полный).\n", + "7. fuel_type — тип топлива (бензин, дизель и др.).\n", + "8. highway_mpg — расход топлива на шоссе (миль на галлон).\n", + "9. make — марка автомобиля.\n", + "10. model — модель автомобиля.\n", + "11. transmission — тип трансмиссии (автоматическая, механическая).\n", + "12. year — год выпуска автомобиля.\n", + "\n", + "Связи между объектами:\n", + "Атрибуты, такие как объем двигателя, тип топлива, количество цилиндров и класс автомобиля, могут быть связаны с комбинированным расходом топлива (combination_mpg). 
Это позволяет выявлять зависимости между характеристиками автомобиля и его экономичностью.\n", + "\n", + "Примеры бизнес-целей и эффекты для бизнеса:\n", + "\n", + "1. Оптимизация ассортимента автомобилей:\n", + " - Бизнес-цель: Анализировать топливную экономичность различных моделей для оптимизации ассортимента, предлагать более популярные и экономичные модели.\n", + " - Эффект: Снижение затрат на производство низкоэффективных моделей и увеличение продаж популярных, экономичных автомобилей.\n", + "\n", + "2. Снижение углеродного следа:\n", + " - Бизнес-цель: Определение моделей с высоким расходом топлива для улучшения их эффективности и снижения выбросов.\n", + " - Эффект: Соответствие экологическим стандартам, улучшение репутации компании и соблюдение требований законодательства.\n", + "\n", + "Примеры целей технического проекта:\n", + "\n", + "1. Цель: Создание модели для прогнозирования топливной эффективности.\n", + " - Вход: Объем двигателя, тип топлива, количество цилиндров, класс, тип трансмиссии.\n", + " - Целевой признак: combination_mpg.\n", + "\n", + "2. Цель: Модель для предсказания углеродного следа автомобиля.\n", + " - Вход: Тип топлива, объем двигателя, класс автомобиля, тип привода.\n", + " - Целевой признак: combination_mpg." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проверка на пустые значения и дубликаты" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Пустые значения по столбцам:\n", + "city_mpg 0\n", + "class 0\n", + "combination_mpg 0\n", + "cylinders 2\n", + "displacement 2\n", + "drive 0\n", + "fuel_type 0\n", + "highway_mpg 0\n", + "make 0\n", + "model 0\n", + "transmission 0\n", + "year 0\n", + "dtype: int64\n", + "\n", + "Количество дубликатов: 2\n", + "\n", + "Статистический обзор данных:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city_mpgcombination_mpgcylindersdisplacementhighway_mpgyear
count550.000000550.000000548.000000548.000000550.000000550.000000
mean21.46000024.0690915.3156932.93175228.6090912019.000000
std8.1473927.4783691.7599991.2484196.8322283.165156
min11.00000014.0000003.0000001.20000018.0000002014.000000
25%17.00000020.0000004.0000002.00000024.0000002016.000000
50%20.00000023.0000004.0000002.50000028.0000002019.000000
75%24.00000027.0000006.0000003.50000032.0000002022.000000
max126.000000112.00000012.0000006.800000102.0000002024.000000
\n", + "
" + ], + "text/plain": [ + " city_mpg combination_mpg cylinders displacement highway_mpg \\\n", + "count 550.000000 550.000000 548.000000 548.000000 550.000000 \n", + "mean 21.460000 24.069091 5.315693 2.931752 28.609091 \n", + "std 8.147392 7.478369 1.759999 1.248419 6.832228 \n", + "min 11.000000 14.000000 3.000000 1.200000 18.000000 \n", + "25% 17.000000 20.000000 4.000000 2.000000 24.000000 \n", + "50% 20.000000 23.000000 4.000000 2.500000 28.000000 \n", + "75% 24.000000 27.000000 6.000000 3.500000 32.000000 \n", + "max 126.000000 112.000000 12.000000 6.800000 102.000000 \n", + "\n", + " year \n", + "count 550.000000 \n", + "mean 2019.000000 \n", + "std 3.165156 \n", + "min 2014.000000 \n", + "25% 2016.000000 \n", + "50% 2019.000000 \n", + "75% 2022.000000 \n", + "max 2024.000000 " + ] + }, + "execution_count": 202, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "null_values = df_cars.isnull().sum()\n", + "print(\"Пустые значения по столбцам:\")\n", + "print(null_values)\n", + "\n", + "duplicates = df_cars.duplicated().sum()\n", + "print(f\"\\nКоличество дубликатов: {duplicates}\")\n", + "\n", + "print(\"\\nСтатистический обзор данных:\")\n", + "df_cars.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Видим, что есть пустые данные, и дубликаты, удаляем их:" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "В наборе данных 'Cars' было удалено 2 строк с пустыми значениями.\n" + ] + } + ], + "source": [ + "df_cars = df_cars.drop_duplicates()\n", + "\n", + "def drop_missing_values(dataframe, name):\n", + " before_shape = dataframe.shape \n", + " cleaned_dataframe = dataframe.dropna() \n", + " after_shape = cleaned_dataframe.shape \n", + " print(f\"В наборе данных '{name}' было удалено {before_shape[0] - after_shape[0]} строк с пустыми значениями.\")\n", + " return cleaned_dataframe\n", + 
"\n", + "df_cars = drop_missing_values(df_cars, \"Cars\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проверка на выбросы:" + ] + }, + { + "cell_type": "code", + "execution_count": 204, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Количество выбросов в столбце 'combination_mpg': 8\n", + "Количество выбросов в столбце 'cylinders': 10\n", + "Количество выбросов в столбце 'displacement': 21\n", + "Количество выбросов в столбце 'highway_mpg': 3\n", + "Количество выбросов в столбце 'city_mpg': 9\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Выбираем столбцы для анализа\n", + "columns_to_check = ['combination_mpg', 'cylinders', 'displacement', 'highway_mpg', 'city_mpg']\n", + "\n", + "# Подсчитываем выбросы\n", + "outliers_count = count_outliers(df_cars, columns_to_check)\n", + "\n", + "# Выводим количество выбросов для каждого столбца\n", + "for col, count in outliers_count.items():\n", + " print(f\"Количество выбросов в столбце '{col}': {count}\")\n", + "\n", + "# Создаем диаграммы размахов\n", + "plt.figure(figsize=(15, 10))\n", + "for i, col in enumerate(columns_to_check, 1):\n", + " plt.subplot(2, 3, i)\n", + " sns.boxplot(x=df_cars[col])\n", + " plt.title(f'Box Plot of {col}')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В каждом из выбранных столбцов присутствуют выбросы. Очистим их." + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Количество удаленных строк: 36\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Выбираем столбцы для очистки\n", + "columns_to_clean = ['combination_mpg', 'cylinders', 'displacement', 'highway_mpg', 'city_mpg']\n", + "\n", + "# Функция для удаления выбросов\n", + "def remove_outliers(df, columns):\n", + " for col in columns:\n", + " Q1 = df[col].quantile(0.25)\n", + " Q3 = df[col].quantile(0.75)\n", + " IQR = Q3 - Q1\n", + " lower_bound = Q1 - 1.5 * IQR\n", + " upper_bound = Q3 + 1.5 * IQR\n", + " \n", + " # Удаляем строки, содержащие выбросы\n", + " df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n", + " \n", + " return df\n", + "\n", + "# Удаляем выбросы\n", + "df_cars_clean = remove_outliers(df_cars, columns_to_clean)\n", + "\n", + "# Выводим количество удаленных строк\n", + "print(f\"Количество удаленных строк: {len(df_cars) - len(df_cars_clean)}\")\n", + "\n", + "# Создаем диаграммы размаха для очищенных данных\n", + "plt.figure(figsize=(15, 6))\n", + "\n", + "# Создаем диаграммы размахов\n", + "plt.figure(figsize=(15, 10))\n", + "for i, col in enumerate(columns_to_clean, 1):\n", + " plt.subplot(2, 3, i)\n", + " sns.boxplot(x=df_cars_clean[col])\n", + " plt.title(f'Box Plot of {col}')\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "df_cars = df_cars_clean" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Разбиение набора данных на обучающую, контрольную и тестовую выборки" + ] + }, + { + "cell_type": "code", + "execution_count": 206, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 306\n", + "Размер контрольной выборки: 102\n", + "Размер тестовой выборки: 102\n" + ] + } + ], + "source": [ + "train_df, test_df = train_test_split(df_cars, test_size=0.2, random_state=42)\n", + "\n", + "train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n", + "\n", + 
"print(\"Размер обучающей выборки:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))" + ] + }, + { + "cell_type": "code", + "execution_count": 207, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Распределение \"Комбинированный расход топлива\" в обучающей выборке:\n", + "combination_mpg\n", + "23 32\n", + "22 29\n", + "24 23\n", + "25 22\n", + "27 22\n", + "18 21\n", + "19 19\n", + "29 18\n", + "21 18\n", + "26 17\n", + "31 16\n", + "28 14\n", + "20 13\n", + "32 12\n", + "17 11\n", + "30 10\n", + "16 3\n", + "34 3\n", + "36 1\n", + "33 1\n", + "14 1\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Комбинированный расход топлива\" в контрольной выборке:\n", + "combination_mpg\n", + "20 17\n", + "19 15\n", + "21 13\n", + "26 9\n", + "27 7\n", + "22 6\n", + "30 5\n", + "23 5\n", + "18 4\n", + "17 3\n", + "24 3\n", + "28 3\n", + "29 3\n", + "25 2\n", + "34 2\n", + "33 2\n", + "32 1\n", + "14 1\n", + "31 1\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Комбинированный расход топлива\" в тестовой выборке:\n", + "combination_mpg\n", + "21 14\n", + "18 13\n", + "22 12\n", + "27 12\n", + "23 10\n", + "31 5\n", + "20 5\n", + "26 5\n", + "24 4\n", + "29 4\n", + "28 4\n", + "19 4\n", + "25 3\n", + "32 3\n", + "17 3\n", + "30 1\n", + "Name: count, dtype: int64\n", + "\n" + ] + } + ], + "source": [ + "def check_balance(df, name):\n", + " counts = df['combination_mpg'].value_counts()\n", + " print(f\"Распределение \\\"Комбинированный расход топлива\\\" в {name}:\")\n", + " print(counts)\n", + " print()\n", + "\n", + "check_balance(train_df, \"обучающей выборке\")\n", + "check_balance(val_df, \"контрольной выборке\")\n", + "check_balance(test_df, \"тестовой выборке\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оверсемплинг и андерсемплинг" + ] + }, + { + "cell_type": "code", + 
"execution_count": 208, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Оверсэмплинг:\n", + "Распределение \"Комбинированный расход топлива\" в обучающей выборке:\n", + "combination_mpg\n", + "21 32\n", + "22 32\n", + "25 32\n", + "19 32\n", + "29 32\n", + "23 32\n", + "28 32\n", + "18 32\n", + "27 32\n", + "20 32\n", + "16 32\n", + "30 32\n", + "32 32\n", + "31 32\n", + "24 32\n", + "26 32\n", + "17 32\n", + "36 32\n", + "34 32\n", + "33 32\n", + "14 32\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Комбинированный расход топлива\" в контрольной выборке:\n", + "combination_mpg\n", + "20 17\n", + "19 17\n", + "17 17\n", + "27 17\n", + "22 17\n", + "26 17\n", + "24 17\n", + "32 17\n", + "21 17\n", + "18 17\n", + "30 17\n", + "23 17\n", + "29 17\n", + "28 17\n", + "34 17\n", + "25 17\n", + "14 17\n", + "33 17\n", + "31 17\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Комбинированный расход топлива\" в тестовой выборке:\n", + "combination_mpg\n", + "28 14\n", + "32 14\n", + "30 14\n", + "23 14\n", + "20 14\n", + "26 14\n", + "21 14\n", + "18 14\n", + "27 14\n", + "25 14\n", + "22 14\n", + "19 14\n", + "29 14\n", + "24 14\n", + "31 14\n", + "17 14\n", + "Name: count, dtype: int64\n", + "\n", + "Андерсэмплинг:\n", + "Распределение \"Комбинированный расход топлива\" в обучающей выборке:\n", + "combination_mpg\n", + "14 1\n", + "16 1\n", + "17 1\n", + "18 1\n", + "19 1\n", + "20 1\n", + "21 1\n", + "22 1\n", + "23 1\n", + "24 1\n", + "25 1\n", + "26 1\n", + "27 1\n", + "28 1\n", + "29 1\n", + "30 1\n", + "31 1\n", + "32 1\n", + "33 1\n", + "34 1\n", + "36 1\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Комбинированный расход топлива\" в контрольной выборке:\n", + "combination_mpg\n", + "14 1\n", + "17 1\n", + "18 1\n", + "19 1\n", + "20 1\n", + "21 1\n", + "22 1\n", + "23 1\n", + "24 1\n", + "25 1\n", + "26 1\n", + "27 1\n", + "28 1\n", + "29 1\n", + "30 1\n", + "31 
# Resample every split by the fuel-consumption column, then print the
# resulting value distributions: oversampled splits first, undersampled second.
split_titles = ("обучающей выборке", "контрольной выборке", "тестовой выборке")

train_df_oversampled, val_df_oversampled, test_df_oversampled = (
    oversample(part, 'combination_mpg') for part in (train_df, val_df, test_df)
)

train_df_undersampled, val_df_undersampled, test_df_undersampled = (
    undersample(part, 'combination_mpg') for part in (train_df, val_df, test_df)
)

print("Оверсэмплинг:")
for part, title in zip(
    (train_df_oversampled, val_df_oversampled, test_df_oversampled), split_titles
):
    check_balance(part, title)

print("Андерсэмплинг:")
for part, title in zip(
    (train_df_undersampled, val_df_undersampled, test_df_undersampled), split_titles
):
    check_balance(part, title)
+ ] + }, + { + "cell_type": "code", + "execution_count": 209, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['stock index', 'country', 'year', 'index price', 'log_indexprice',\n", + " 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n", + " 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n", + " 'tradebalance', 'USTreasury'],\n", + " dtype='object')\n", + "\n", + "RangeIndex: 369 entries, 0 to 368\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 stock index 369 non-null object \n", + " 1 country 369 non-null object \n", + " 2 year 369 non-null float64\n", + " 3 index price 317 non-null float64\n", + " 4 log_indexprice 369 non-null float64\n", + " 5 inflationrate 326 non-null float64\n", + " 6 oil prices 369 non-null float64\n", + " 7 exchange_rate 367 non-null float64\n", + " 8 gdppercent 350 non-null float64\n", + " 9 percapitaincome 368 non-null float64\n", + " 10 unemploymentrate 348 non-null float64\n", + " 11 manufacturingoutput 278 non-null float64\n", + " 12 tradebalance 365 non-null float64\n", + " 13 USTreasury 369 non-null float64\n", + "dtypes: float64(12), object(2)\n", + "memory usage: 40.5+ KB\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stock indexcountryyearindex pricelog_indexpriceinflationrateoil pricesexchange_rategdppercentpercapitaincomeunemploymentratemanufacturingoutputtradebalanceUSTreasury
0NASDAQUnited States of America1980.0168.612.230.1421.591.00.0912575.00.07NaN-13.060.11
1NASDAQUnited States of America1981.0203.152.310.1031.771.00.1213976.00.08NaN-12.520.14
2NASDAQUnited States of America1982.0188.982.280.0628.521.00.0414434.00.10NaN-19.970.13
3NASDAQUnited States of America1983.0285.432.460.0326.191.00.0915544.00.10NaN-51.640.11
4NASDAQUnited States of America1984.0248.892.400.0425.881.00.1117121.00.08NaN-102.730.12
\n", + "
" + ], + "text/plain": [ + " stock index country year index price log_indexprice \\\n", + "0 NASDAQ United States of America 1980.0 168.61 2.23 \n", + "1 NASDAQ United States of America 1981.0 203.15 2.31 \n", + "2 NASDAQ United States of America 1982.0 188.98 2.28 \n", + "3 NASDAQ United States of America 1983.0 285.43 2.46 \n", + "4 NASDAQ United States of America 1984.0 248.89 2.40 \n", + "\n", + " inflationrate oil prices exchange_rate gdppercent percapitaincome \\\n", + "0 0.14 21.59 1.0 0.09 12575.0 \n", + "1 0.10 31.77 1.0 0.12 13976.0 \n", + "2 0.06 28.52 1.0 0.04 14434.0 \n", + "3 0.03 26.19 1.0 0.09 15544.0 \n", + "4 0.04 25.88 1.0 0.11 17121.0 \n", + "\n", + " unemploymentrate manufacturingoutput tradebalance USTreasury \n", + "0 0.07 NaN -13.06 0.11 \n", + "1 0.08 NaN -12.52 0.14 \n", + "2 0.10 NaN -19.97 0.13 \n", + "3 0.10 NaN -51.64 0.11 \n", + "4 0.08 NaN -102.73 0.12 " + ] + }, + "execution_count": 209, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_countries = pd.read_csv(\".//static//csv//Economic Data - 9 Countries (1980-2020).csv\")\n", + "print(df_countries.columns)\n", + "df_countries.info()\n", + "df_countries.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Атрибуты объектов:\n", + "1. stock index — индекс акций.\n", + "2. country — страна.\n", + "3. year — год.\n", + "4. index price — цена индекса.\n", + "5. log_indexprice — логарифм цены индекса.\n", + "6. inflationrate — уровень инфляции.\n", + "7. oil prices — цены на нефть.\n", + "8. exchange_rate — валютный курс.\n", + "9. gdppercent — процент роста ВВП.\n", + "10. percapitaincome — доход на душу населения.\n", + "11. unemploymentrate — уровень безработицы.\n", + "12. manufacturingoutput — объём производства.\n", + "13. tradebalance — торговый баланс.\n", + "14. 
USTreasury — доходность казначейских облигаций США.\n", + "\n", + "Связи между объектами:\n", + "Некоторые атрибуты могут быть связаны друг с другом, например, уровень инфляции и процент роста ВВП могут коррелировать с ценами на нефть, уровнем безработицы и торговым балансом.\n", + "\n", + "Примеры бизнес-целей и эффект:\n", + "1. Прогнозирование экономического роста и планирование инвестиций:\n", + " - Бизнес-цель: Создать модель прогнозирования роста экономики для стран, чтобы принять стратегические инвестиционные решения.\n", + " - Эффект: Повышение точности экономических прогнозов и улучшение прибыльности инвестиционных стратегий.\n", + "\n", + "2. Анализ и оптимизация торговой политики:\n", + " - Бизнес-цель: Изучение влияния изменений торгового баланса и валютных курсов на экономику стран.\n", + " - Эффект: Улучшение торговых соглашений и политики, что приведёт к более устойчивому экономическому росту.\n", + "\n", + "Примеры целей технического проекта:\n", + "1. Цель: Построение модели для прогнозирования уровня инфляции.\n", + " - Вход: Уровень безработицы, ВВП, доход на душу населения, валютный курс, цены на нефть.\n", + " - Целевой признак: inflationrate.\n", + "\n", + "2. Цель: Построение модели для оценки экономического роста.\n", + " - Вход: Торговый баланс, доход на душу населения, валютный курс, инфляция.\n", + " - Целевой признак: gdppercent." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проверка на пустые значения и дубликаты" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Пустые значения по столбцам:\n", + "stock index 0\n", + "country 0\n", + "year 0\n", + "index price 52\n", + "log_indexprice 0\n", + "inflationrate 43\n", + "oil prices 0\n", + "exchange_rate 2\n", + "gdppercent 19\n", + "percapitaincome 1\n", + "unemploymentrate 21\n", + "manufacturingoutput 91\n", + "tradebalance 4\n", + "USTreasury 0\n", + "dtype: int64\n", + "\n", + "Количество дубликатов: 0\n", + "\n", + "Статистический обзор данных:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearindex pricelog_indexpriceinflationrateoil pricesexchange_rategdppercentpercapitaincomeunemploymentratemanufacturingoutputtradebalanceUSTreasury
count369.000000317.000000369.000000326.000000369.000000367.000000350.000000368.000000348.000000278.000000365.000000369.000000
mean2000.0000007898.6482973.6105420.04174839.74317127.8975480.03711420719.9646740.068908328.084820-15.9963840.059024
std11.8482257811.3368620.4824810.03957925.45265449.6205210.03785017435.0377830.043207622.395923154.5571700.033086
min1980.000000168.6100002.230000-0.04000011.3500000.900000-0.11000027.0000000.0200000.590000-770.9300000.010000
25%1990.0000002407.1000003.3200000.02000019.4100001.3300000.0200002090.2500000.04000080.380000-25.3700000.030000
50%2000.0000005160.1000003.6000000.03000028.5200005.4400000.03000019969.5000000.060000188.160000-0.1400000.050000
75%2010.00000010279.5000003.9800000.05750057.88000015.0550000.06000036384.0000000.090000271.97750019.0800000.080000
max2020.00000047751.3300004.6800000.24000098.560000249.0500000.15000065280.0000000.2600003868.460000366.1400000.140000
\n", + "
" + ], + "text/plain": [ + " year index price log_indexprice inflationrate oil prices \\\n", + "count 369.000000 317.000000 369.000000 326.000000 369.000000 \n", + "mean 2000.000000 7898.648297 3.610542 0.041748 39.743171 \n", + "std 11.848225 7811.336862 0.482481 0.039579 25.452654 \n", + "min 1980.000000 168.610000 2.230000 -0.040000 11.350000 \n", + "25% 1990.000000 2407.100000 3.320000 0.020000 19.410000 \n", + "50% 2000.000000 5160.100000 3.600000 0.030000 28.520000 \n", + "75% 2010.000000 10279.500000 3.980000 0.057500 57.880000 \n", + "max 2020.000000 47751.330000 4.680000 0.240000 98.560000 \n", + "\n", + " exchange_rate gdppercent percapitaincome unemploymentrate \\\n", + "count 367.000000 350.000000 368.000000 348.000000 \n", + "mean 27.897548 0.037114 20719.964674 0.068908 \n", + "std 49.620521 0.037850 17435.037783 0.043207 \n", + "min 0.900000 -0.110000 27.000000 0.020000 \n", + "25% 1.330000 0.020000 2090.250000 0.040000 \n", + "50% 5.440000 0.030000 19969.500000 0.060000 \n", + "75% 15.055000 0.060000 36384.000000 0.090000 \n", + "max 249.050000 0.150000 65280.000000 0.260000 \n", + "\n", + " manufacturingoutput tradebalance USTreasury \n", + "count 278.000000 365.000000 369.000000 \n", + "mean 328.084820 -15.996384 0.059024 \n", + "std 622.395923 154.557170 0.033086 \n", + "min 0.590000 -770.930000 0.010000 \n", + "25% 80.380000 -25.370000 0.030000 \n", + "50% 188.160000 -0.140000 0.050000 \n", + "75% 271.977500 19.080000 0.080000 \n", + "max 3868.460000 366.140000 0.140000 " + ] + }, + "execution_count": 210, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "null_values = df_countries.isnull().sum()\n", + "print(\"Пустые значения по столбцам:\")\n", + "print(null_values)\n", + "\n", + "duplicates = df_countries.duplicated().sum()\n", + "print(f\"\\nКоличество дубликатов: {duplicates}\")\n", + "\n", + "print(\"\\nСтатистический обзор данных:\")\n", + "df_countries.describe()" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "Видим, что есть пустые данные, но нет дубликатов. Удаляем их" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "В наборе данных 'Countries' было удалено 150 строк с пустыми значениями.\n" + ] + } + ], + "source": [ + "df_countries = drop_missing_values(df_countries, \"Countries\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проверка на выбросы:" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Количество выбросов в столбце 'year': 0\n", + "Количество выбросов в столбце 'index price': 17\n", + "Количество выбросов в столбце 'log_indexprice': 1\n", + "Количество выбросов в столбце 'inflationrate': 35\n", + "Количество выбросов в столбце 'oil prices': 0\n", + "Количество выбросов в столбце 'exchange_rate': 53\n", + "Количество выбросов в столбце 'gdppercent': 13\n", + "Количество выбросов в столбце 'percapitaincome': 0\n", + "Количество выбросов в столбце 'unemploymentrate': 9\n", + "Количество выбросов в столбце 'manufacturingoutput': 29\n", + "Количество выбросов в столбце 'tradebalance': 47\n", + "Количество выбросов в столбце 'USTreasury': 9\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Выбираем столбцы для анализа\n", + "columns_to_check = ['year', 'index price', 'log_indexprice',\n", + " 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n", + " 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n", + " 'tradebalance', 'USTreasury']\n", + "\n", + "# Подсчитываем выбросы\n", + "outliers_count = count_outliers(df_countries, columns_to_check)\n", + "\n", + "# Выводим количество выбросов для каждого столбца\n", + "for col, count in outliers_count.items():\n", + " print(f\"Количество выбросов в столбце '{col}': {count}\")\n", + "\n", + "# Создаем диаграммы размахов\n", + "plt.figure(figsize=(15, 10))\n", + "for i, col in enumerate(columns_to_check, 1):\n", + " plt.subplot(3, 4, i)\n", + " sns.boxplot(x=df_countries[col])\n", + " plt.title(f'Box Plot of {col}')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В большинстве из выбранных столбцов присутствуют выбросы. Очистим их." + ] + }, + { + "cell_type": "code", + "execution_count": 213, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Количество удаленных строк: 136\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Выбираем столбцы для очистки\n", + "columns_to_clean = ['index price', 'log_indexprice',\n", + " 'inflationrate', 'exchange_rate', 'gdppercent', 'unemploymentrate', 'manufacturingoutput',\n", + " 'tradebalance', 'USTreasury']\n", + "\n", + "# Удаляем выбросы\n", + "df_countries_clean = remove_outliers(df_countries, columns_to_clean)\n", + "\n", + "# Выводим количество удаленных строк\n", + "print(f\"Количество удаленных строк: {len(df_countries) - len(df_countries_clean)}\")\n", + "\n", + "# Создаем диаграммы размаха для очищенных данных\n", + "plt.figure(figsize=(15, 6))\n", + "\n", + "# Создаем диаграммы размахов\n", + "plt.figure(figsize=(15, 10))\n", + "for i, col in enumerate(columns_to_clean, 1):\n", + " plt.subplot(3, 3, i)\n", + " sns.boxplot(x=df_countries_clean[col])\n", + " plt.title(f'Box Plot of {col}')\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "df_countries = df_countries_clean" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Разбиение набора данных на обучающую, контрольную и тестовую выборки" + ] + }, + { + "cell_type": "code", + "execution_count": 214, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 49\n", + "Размер контрольной выборки: 17\n", + "Размер тестовой выборки: 17\n" + ] + } + ], + "source": [ + "train_df, test_df = train_test_split(df_countries, test_size=0.2, random_state=42)\n", + "\n", + "train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n", + "\n", + "print(\"Размер обучающей выборки:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))" + ] + }, + { + "cell_type": "code", + "execution_count": 215, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Распределение \"Уровень инфляции\" в обучающей выборке:\n", + "inflationrate\n", + "0.02 25\n", + "0.03 11\n", + "0.01 9\n", + "0.04 4\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Уровень инфляции\" в контрольной выборке:\n", + "inflationrate\n", + "0.03 6\n", + "0.01 6\n", + "0.02 5\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Уровень инфляции\" в тестовой выборке:\n", + "inflationrate\n", + "0.02 6\n", + "0.03 6\n", + "0.01 4\n", + "0.04 1\n", + "Name: count, dtype: int64\n", + "\n" + ] + } + ], + "source": [ + "def check_balance(df, name):\n", + " counts = df['inflationrate'].value_counts()\n", + " print(f\"Распределение \\\"Уровень инфляции\\\" в {name}:\")\n", + " print(counts)\n", + " print()\n", + "\n", + "check_balance(train_df, \"обучающей выборке\")\n", + "check_balance(val_df, \"контрольной выборке\")\n", + "check_balance(test_df, \"тестовой выборке\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оверсемплинг и андерсемплинг" + ] + }, + { + "cell_type": "code", + "execution_count": 216, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Оверсэмплинг:\n", + "Распределение \"Уровень инфляции\" в обучающей выборке:\n", + "inflationrate\n", + "0.03 26\n", + "0.02 25\n", + "0.01 9\n", + "0.04 8\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Уровень инфляции\" в контрольной выборке:\n", + "inflationrate\n", + "0.03 11\n", + "0.01 6\n", + "0.02 5\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Уровень инфляции\" в тестовой выборке:\n", + "inflationrate\n", + "0.03 8\n", + "0.02 6\n", + "0.01 4\n", + "0.04 2\n", + "Name: count, dtype: int64\n", + "\n", + "Андерсэмплинг:\n", + "Распределение \"Уровень инфляции\" в обучающей выборке:\n", + "inflationrate\n", + "0.03 11\n", + "0.02 10\n", + "0.01 5\n", + "0.04 4\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение 
\"Уровень инфляции\" в контрольной выборке:\n", + "inflationrate\n", + "0.03 6\n", + "0.01 4\n", + "0.02 2\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение \"Уровень инфляции\" в тестовой выборке:\n", + "inflationrate\n", + "0.03 6\n", + "0.02 5\n", + "0.01 2\n", + "0.04 1\n", + "Name: count, dtype: int64\n", + "\n" + ] + } + ], + "source": [ + "def binning(target, bins):\n", + " return pd.qcut(target, q=bins, labels=False)\n", + "\n", + "train_df['inflationrate_binned'] = binning(train_df['inflationrate'], bins=2)\n", + "val_df['inflationrate_binned'] = binning(val_df['inflationrate'], bins=2)\n", + "test_df['inflationrate_binned'] = binning(test_df['inflationrate'], bins=2)\n", + "\n", + "train_df_oversampled = oversample(train_df, 'inflationrate_binned')\n", + "val_df_oversampled = oversample(val_df, 'inflationrate_binned')\n", + "test_df_oversampled = oversample(test_df, 'inflationrate_binned')\n", + "\n", + "train_df_undersampled = undersample(train_df, 'inflationrate_binned')\n", + "val_df_undersampled = undersample(val_df, 'inflationrate_binned')\n", + "test_df_undersampled = undersample(test_df, 'inflationrate_binned')\n", + "\n", + "print(\"Оверсэмплинг:\")\n", + "check_balance(train_df_oversampled, \"обучающей выборке\")\n", + "check_balance(val_df_oversampled, \"контрольной выборке\")\n", + "check_balance(test_df_oversampled, \"тестовой выборке\")\n", + "\n", + "print(\"Андерсэмплинг:\")\n", + "check_balance(train_df_undersampled, \"обучающей выборке\")\n", + "check_balance(val_df_undersampled, \"контрольной выборке\")\n", + "check_balance(test_df_undersampled, \"тестовой выборке\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aimvenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lab_2/requirements.txt b/lab_2/requirements.txt new file mode 100644 index 0000000..649b7fd Binary files /dev/null and b/lab_2/requirements.txt differ