From 523bb0852dce751019d281960ca434b03cab7c45 Mon Sep 17 00:00:00 2001 From: GokaPek Date: Fri, 11 Oct 2024 13:42:11 +0400 Subject: [PATCH 1/4] init --- .gitignore | 4 +- lab_3/lab2.ipynb | 1337 ++++++++++++++++++++++++++++++++++++++++++++++ lab_3/lab3.ipynb | 94 ++++ 3 files changed, 1434 insertions(+), 1 deletion(-) create mode 100644 lab_3/lab2.ipynb create mode 100644 lab_3/lab3.ipynb diff --git a/.gitignore b/.gitignore index c5cb3da..d229cc0 100644 --- a/.gitignore +++ b/.gitignore @@ -176,4 +176,6 @@ cython_debug/ *.csv -/lab_2/aimenv \ No newline at end of file +/lab_2/aimenv + +/lab_3/aimenv \ No newline at end of file diff --git a/lab_3/lab2.ipynb b/lab_3/lab2.ipynb new file mode 100644 index 0000000..f7ec25d --- /dev/null +++ b/lab_3/lab2.ipynb @@ -0,0 +1,1337 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Выгрузка в датафрейм первый набор (игры в Steam)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://www.kaggle.com/datasets/wajihulhassan369/steam-games-dataset. Набор представляет собой данные об экшенах, доступных в Steam. Эта информация полезна для изучения игровых паттернов, моделирования цен и исследования корреляции между игровыми тегами и методами ценообразования. Этот набор позволяет провести предварительный анализ данных, построить модели машинного обучения или исследовать игровую индустрию. В наборе представлены дата, различные теги, рейтинг отзывов. Так можно понять, какие теги популярнее, что в играх людям нравится больше, изменилось ли качество игр со временем и т.д. Для бизнеса такой набор данных может быть полезен для прогнозирования, в разработку каких игр целесообразнее вкладываться. Так компания не потеряет деньги.\n", + "Пример цели: Разработка игры на ПК в нужную фазу рынка\n", + "Входные данные: год выпуска, сумма продаж\n", + "Целевой признак: продаваемость игр в текущей фазе рынка в сравнении с предыдущими." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['Name', 'Price', 'Release_date', 'Review_no', 'Review_type', 'Tags',\n", + " 'Description'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "df = pd.read_csv(\".//static//csv//steam_cleaned.csv\")\n", + "print(df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Преобразуем дату выпуска в формат datetime\n", + "df['Release_date'] = pd.to_datetime(df['Release_date'])\n", + "\n", + "# Визуализация данных\n", + "plt.figure(figsize=(10, 6))\n", + "plt.scatter(df['Release_date'], df['Review_no'])\n", + "plt.xlabel('Release Date')\n", + "plt.ylabel('Review Number')\n", + "plt.title('Scatter Plot of Review Number vs Release Date')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "При проверке на шум можно заметить выброс в 2014 году. количество обзоров там запредельное. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Все выбросы удалены путём определения порогов квантилями. Зашумленность не очень высокая. Покрытие данных высокое и подошло бы для поставленной задачи по актуальности." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Выбросы:\n", + " Name Price Release_date Review_no \\\n", + "18 GUNDAM BREAKER 4 59.99 2024-08-29 1846.0 \n", + "22 LOCKDOWN Protocol 5.49 2024-07-22 2192.0 \n", + "34 CarX Street 19.99 2024-08-29 4166.0 \n", + "45 Harry Potter: Quidditch Champions 25.99 2024-09-03 1216.0 \n", + "61 SMITE 2 18.00 2024-08-27 1633.0 \n", + "... ... ... ... ... \n", + "7695 Dude Simulator 2 2.99 2018-07-28 1734.0 \n", + "7717 Golfing Over It with Alva Majo 2.39 2018-03-28 1367.0 \n", + "7740 Dungeon Siege II 4.99 2005-08-16 2274.0 \n", + "7765 Phantom Doctrine 12.99 2018-08-14 3538.0 \n", + "7768 NECROPOLIS: BRUTAL EDITION 19.99 2016-07-12 3668.0 \n", + "\n", + " Review_type Tags \\\n", + "18 Very Positive Action,Robots,Hack and Slash,RPG,Mechs,Action ... \n", + "22 Very Positive Multiplayer,Social Deduction,Conversation,Acti... \n", + "34 Mixed Racing,Open World,Automobile Sim,PvP,Multiplay... 
\n", + "45 Mostly Positive Action,Sports,Flight,Arcade,Third Person,Magic... \n", + "61 Mixed Action,MOBA,Third Person,Strategy,Adventure,Ca... \n", + "... ... ... \n", + "7695 Mixed Life Sim,Indie,Simulation,Racing,Action,Advent... \n", + "7717 Mostly Positive Difficult,Physics,Golf,Platformer,Precision Pl... \n", + "7740 Mostly Positive RPG,Fantasy,Action RPG,Hack and Slash,Singlepl... \n", + "7765 Mostly Positive Turn-Based Tactics,Strategy,Cold War,Stealth,R... \n", + "7768 Mixed Souls-like,Action Roguelike,Co-op,Adventure,Ro... \n", + "\n", + " Description \n", + "18 Create your own ultimate Gundam in the newest ... \n", + "22 A first person social deduction game, combinin... \n", + "34 Conquer mountain roads, highways, and city str... \n", + "45 Your next chapter takes flight! Immerse yourse... \n", + "61 Become a god and wage war in SMITE 2, the Unre... \n", + "... ... \n", + "7695 Dude Simulator 2 is an open world sandbox game... \n", + "7717 The higher you climb, the bigger the fall. \n", + "7740 NaN \n", + "7765 The year is 1983. The world teeters on the ver... \n", + "7768 NECROPOLIS: BRUTAL EDITION is a major update f... \n", + "\n", + "[1049 rows x 7 columns]\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Преобразуем дату выпуска в формат datetime\n", + "df['Release_date'] = pd.to_datetime(df['Release_date'])\n", + "\n", + "# Статистический анализ для определения выбросов\n", + "Q1 = df['Review_no'].quantile(0.25)\n", + "Q3 = df['Review_no'].quantile(0.75)\n", + "IQR = Q3 - Q1\n", + "\n", + "# Определение порога для выбросов\n", + "threshold = 1.5 * IQR\n", + "outliers = (df['Review_no'] < (Q1 - threshold)) | (df['Review_no'] > (Q3 + threshold))\n", + "\n", + "# Вывод выбросов\n", + "print(\"Выбросы:\")\n", + "print(df[outliers])\n", + "\n", + "# Обработка выбросов\n", + "# В данном случае мы заменим выбросы на медианное значение\n", + "median_review_no = df['Review_no'].median()\n", + "df.loc[outliers, 'Review_no'] = median_review_no\n", + "\n", + "# Визуализация данных после обработки\n", + "plt.figure(figsize=(10, 6))\n", + "plt.scatter(df['Release_date'], df['Review_no'])\n", + "plt.xlabel('Release Date')\n", + "plt.ylabel('Review Number')\n", + "plt.title('Scatter Plot of Review Number vs Release Date (After Handling Outliers)')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Очистим от строк с пустыми значениями наш датасет" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Количество удаленных строк: 515\n", + "\n", + "DataFrame после удаления строк с пропущенными значениями:\n", + " Name Price Release_date \\\n", + "0 Black Myth: Wukong 59.99 2024-08-20 \n", + "2 Counter-Strike 2 0.00 2012-08-21 \n", + "4 Grand Theft Auto V 10.48 2015-04-14 \n", + "5 Red Dead Redemption 2 17.99 2019-12-05 \n", + "6 PUBG: BATTLEGROUNDS 0.00 2017-12-21 \n", + "... ... ... ... \n", + "7807 Monster Hunter World: Iceborne - MHW:I Monster... 
2.99 2020-02-06 \n", + "7808 Gene Shift Auto: Deluxe Edition 8.99 2022-11-28 \n", + "7809 Run Ralph Run 0.45 2021-03-03 \n", + "7810 Quadroids 6.19 2024-02-22 \n", + "7811 Divekick 4.99 2013-08-20 \n", + "\n", + " Review_no Review_type \\\n", + "0 270.0 Overwhelmingly Positive \n", + "2 270.0 Very Positive \n", + "4 270.0 Very Positive \n", + "5 270.0 Very Positive \n", + "6 270.0 Mixed \n", + "... ... ... \n", + "7807 39.0 Positive \n", + "7808 16.0 Positive \n", + "7809 26.0 Mostly Positive \n", + "7810 15.0 Positive \n", + "7811 1118.0 Very Positive \n", + "\n", + " Tags \\\n", + "0 Mythology,Action RPG,Action,Souls-like,RPG,Com... \n", + "2 FPS,Shooter,Multiplayer,Competitive,Action,Tea... \n", + "4 Open World,Action,Multiplayer,Crime,Automobile... \n", + "5 Open World,Story Rich,Western,Adventure,Multip... \n", + "6 Survival,Shooter,Battle Royale,Multiplayer,FPS... \n", + "... ... \n", + "7807 Action \n", + "7808 Indie,Action,Free to Play,Battle Royale,Roguel... \n", + "7809 Adventure,Action,Puzzle,Arcade,Platformer,Shoo... \n", + "7810 Precision Platformer,Puzzle Platformer,2D Plat... \n", + "7811 Fighting,Indie,2D Fighter,Parody ,Local Multip... \n", + "\n", + " Description \n", + "0 Black Myth: Wukong is an action RPG rooted in ... \n", + "2 For over two decades, Counter-Strike has offer... \n", + "4 Grand Theft Auto V for PC offers players the o... \n", + "5 Winner of over 175 Game of the Year Awards and... \n", + "6 Play PUBG: BATTLEGROUNDS for free.\\n\\nLand on ... \n", + "... ... \n", + "7807 A monster figure you can use to decorate your ... \n", + "7808 Gene Shift Auto is a roguelike-inspired battle... \n", + "7809 Ralph is a smart dinosaur, and a great shooter. \n", + "7810 Quadroids is a single-player puzzle platformer... \n", + "7811 Divekick is the world’s first two-button fight... 
\n", + "\n", + "[7297 rows x 7 columns]\n" + ] + } + ], + "source": [ + "# Удаление строк с пропущенными значениями\n", + "df_dropna = df.dropna()\n", + "\n", + "# Вывод количества удаленных строк\n", + "num_deleted_rows = len(df) - len(df_dropna)\n", + "print(f\"\\nКоличество удаленных строк: {num_deleted_rows}\")\n", + "\n", + "print(\"\\nDataFrame после удаления строк с пропущенными значениями:\")\n", + "print(df_dropna)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Теперь создадим выборки." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 4687\n", + "Размер контрольной выборки: 1562\n", + "Размер тестовой выборки: 1563\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "df = pd.read_csv(\".//static//csv//steam_cleaned.csv\")\n", + "\n", + "train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)\n", + "\n", + "# Разделение остатка на контрольную и тестовую выборки\n", + "val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n", + "\n", + "# Проверка размеров выборок\n", + "print(\"Размер обучающей выборки:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))\n", + "\n", + "# Сохранение выборок в файлы\n", + "train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n", + "val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n", + "test_df.to_csv(\".//static//csv//test_data.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Проанализируем сбалансированность выборок" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Распределение Review_type в обучающей выборке:\n", + "Review_type\n", + "Very Positive 2117\n", + "Mostly Positive 810\n", + "Mixed 797\n", + "Positive 710\n", + "Overwhelmingly Positive 209\n", + "Mostly Negative 15\n", + "Very Negative 2\n", + "Overwhelmingly Negative 1\n", + "Name: count, dtype: int64\n", + "Процент положительных отзывов: 17.28%\n", + "Процент отрицательных отзывов: 4.46%\n", + "\n", + "Распределение Review_type в контрольной выборке:\n", + "Review_type\n", + "Very Positive 708\n", + "Mostly Positive 290\n", + "Mixed 241\n", + "Positive 224\n", + "Overwhelmingly Positive 78\n", + "Mostly Negative 6\n", + "Very Negative 2\n", + "Name: count, dtype: int64\n", + "Процент положительных отзывов: 18.57%\n", + "Процент отрицательных отзывов: 4.99%\n", + "\n", + "Распределение Review_type в тестовой выборке:\n", + "Review_type\n", + "Very Positive 713\n", + "Mostly Positive 276\n", + "Mixed 253\n", + "Positive 240\n", + "Overwhelmingly Positive 67\n", + "Mostly Negative 5\n", + "Very Negative 1\n", + "Name: count, dtype: int64\n", + "Процент положительных отзывов: 17.66%\n", + "Процент отрицательных отзывов: 4.29%\n", + "\n", + "Необходима аугментация данных для балансировки классов.\n", + "Необходима аугментация данных для балансировки классов.\n", + "Необходима аугментация данных для балансировки классов.\n" + ] + } + ], + "source": [ + "train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n", + "val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n", + "test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n", + "\n", + "# Оценка сбалансированности\n", + "def check_balance(df, name):\n", + " counts = df['Review_type'].value_counts()\n", + " print(f\"Распределение Review_type в {name}:\")\n", + " print(counts)\n", + " print(f\"Процент положительных отзывов: {counts['Mostly Positive'] / len(df) * 100:.2f}%\")\n", + " print(f\"Процент отрицательных отзывов: {counts['Overwhelmingly Positive'] / len(df) 
* 100:.2f}%\")\n", + " print()\n", + "\n", + "# Определение необходимости аугментации данных\n", + "def need_augmentation(df):\n", + " counts = df['Review_type'].value_counts()\n", + " ratio = counts['Mostly Positive'] / counts['Overwhelmingly Positive']\n", + " if ratio > 1.5 or ratio < 0.67:\n", + " print(\"Необходима аугментация данных для балансировки классов.\")\n", + " else:\n", + " print(\"Аугментация данных не требуется.\")\n", + " \n", + "check_balance(train_df, \"обучающей выборке\")\n", + "check_balance(val_df, \"контрольной выборке\")\n", + "check_balance(test_df, \"тестовой выборке\")\n", + "\n", + "\n", + "\n", + "need_augmentation(train_df)\n", + "need_augmentation(val_df)\n", + "need_augmentation(test_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "По результатам анализа требуется приращение, соотношения отзывов вне допустимого диапазона" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Оверсэмплинг:\n", + "Распределение Review_type в обучающей выборке:\n", + "Review_type\n", + "Mostly Positive 2117\n", + "Mixed 2117\n", + "Very Positive 2117\n", + "Positive 2117\n", + "Overwhelmingly Positive 2117\n", + "Mostly Negative 2117\n", + "Very Negative 2117\n", + "Overwhelmingly Negative 2117\n", + "Name: count, dtype: int64\n", + "Отсутствуют один или оба класса (Positive/Negative).\n", + "\n", + "Распределение Review_type в контрольной выборке:\n", + "Review_type\n", + "Very Negative 708\n", + "Mostly Positive 708\n", + "Mixed 708\n", + "Overwhelmingly Positive 708\n", + "Overwhelmingly Negative 708\n", + "Positive 708\n", + "Mostly Negative 708\n", + "Very Positive 708\n", + "Name: count, dtype: int64\n", + "Отсутствуют один или оба класса (Positive/Negative).\n", + "\n", + "Распределение Review_type в тестовой выборке:\n", + "Review_type\n", + "Very Negative 713\n", + "Mostly Positive 713\n", + "Overwhelmingly 
Positive 713\n", + "Mixed 713\n", + "Overwhelmingly Negative 713\n", + "Very Positive 713\n", + "Mostly Negative 713\n", + "Positive 713\n", + "Name: count, dtype: int64\n", + "Отсутствуют один или оба класса (Positive/Negative).\n", + "\n", + "Андерсэмплинг:\n", + "Распределение Review_type в обучающей выборке:\n", + "Review_type\n", + "Mixed 1\n", + "Mostly Negative 1\n", + "Mostly Positive 1\n", + "Overwhelmingly Negative 1\n", + "Overwhelmingly Positive 1\n", + "Positive 1\n", + "Very Negative 1\n", + "Very Positive 1\n", + "Name: count, dtype: int64\n", + "Отсутствуют один или оба класса (Positive/Negative).\n", + "\n", + "Распределение Review_type в контрольной выборке:\n", + "Review_type\n", + "Mixed 2\n", + "Mostly Negative 2\n", + "Mostly Positive 2\n", + "Overwhelmingly Negative 2\n", + "Overwhelmingly Positive 2\n", + "Positive 2\n", + "Very Negative 2\n", + "Very Positive 2\n", + "Name: count, dtype: int64\n", + "Отсутствуют один или оба класса (Positive/Negative).\n", + "\n", + "Распределение Review_type в тестовой выборке:\n", + "Review_type\n", + "Mixed 1\n", + "Mostly Negative 1\n", + "Mostly Positive 1\n", + "Overwhelmingly Negative 1\n", + "Overwhelmingly Positive 1\n", + "Positive 1\n", + "Very Negative 1\n", + "Very Positive 1\n", + "Name: count, dtype: int64\n", + "Отсутствуют один или оба класса (Positive/Negative).\n", + "\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from imblearn.under_sampling import RandomUnderSampler\n", + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "# Загрузка данных\n", + "train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n", + "val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n", + "test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n", + "\n", + "# Преобразование категориальных признаков в числовые\n", + "def encode(df):\n", + " label_encoders = {}\n", + " for column in 
df.select_dtypes(include=['object']).columns:\n", + " if column != 'Review_type': # Пропускаем целевую переменную\n", + " le = LabelEncoder()\n", + " df[column] = le.fit_transform(df[column])\n", + " label_encoders[column] = le\n", + " return label_encoders\n", + "\n", + "# Преобразование целевой переменной в числовые значения\n", + "def encode_target(df):\n", + " le = LabelEncoder()\n", + " df['Review_type'] = le.fit_transform(df['Review_type'])\n", + " return le\n", + "\n", + "# Применение кодирования\n", + "label_encoders = encode(train_df)\n", + "encode(val_df)\n", + "encode(test_df)\n", + "\n", + "# Кодирование целевой переменной\n", + "le_target = encode_target(train_df)\n", + "encode_target(val_df)\n", + "encode_target(test_df)\n", + "\n", + "# Проверка типов данных\n", + "def check_data_types(df):\n", + " for column in df.columns:\n", + " if df[column].dtype == 'object':\n", + " print(f\"Столбец '{column}' содержит строковые данные.\")\n", + "\n", + "check_data_types(train_df)\n", + "check_data_types(val_df)\n", + "check_data_types(test_df)\n", + "\n", + "# Функция для выполнения oversampling\n", + "def oversample(df):\n", + " if 'Review_type' not in df.columns:\n", + " print(\"Столбец 'Review_type' отсутствует.\")\n", + " return df\n", + " \n", + " X = df.drop('Review_type', axis=1)\n", + " y = df['Review_type']\n", + " \n", + " oversampler = RandomOverSampler(random_state=42)\n", + " X_resampled, y_resampled = oversampler.fit_resample(X, y)\n", + " \n", + " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n", + " return resampled_df\n", + "\n", + "# Функция для выполнения undersampling\n", + "def undersample(df):\n", + " if 'Review_type' not in df.columns:\n", + " print(\"Столбец 'Review_type' отсутствует.\")\n", + " return df\n", + " \n", + " X = df.drop('Review_type', axis=1)\n", + " y = df['Review_type']\n", + " \n", + " undersampler = RandomUnderSampler(random_state=42)\n", + " X_resampled, y_resampled = undersampler.fit_resample(X, y)\n", 
+ " \n", + " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n", + " return resampled_df\n", + "\n", + "# Применение oversampling и undersampling к каждой выборке\n", + "train_df_oversampled = oversample(train_df)\n", + "val_df_oversampled = oversample(val_df)\n", + "test_df_oversampled = oversample(test_df)\n", + "\n", + "train_df_undersampled = undersample(train_df)\n", + "val_df_undersampled = undersample(val_df)\n", + "test_df_undersampled = undersample(test_df)\n", + "\n", + "# Обратное преобразование целевой переменной в строковые метки\n", + "def decode_target(df, le_target):\n", + " df['Review_type'] = le_target.inverse_transform(df['Review_type'])\n", + "\n", + "decode_target(train_df_oversampled, le_target)\n", + "decode_target(val_df_oversampled, le_target)\n", + "decode_target(test_df_oversampled, le_target)\n", + "\n", + "decode_target(train_df_undersampled, le_target)\n", + "decode_target(val_df_undersampled, le_target)\n", + "decode_target(test_df_undersampled, le_target)\n", + "\n", + "# Проверка результатов\n", + "def check_balance(df, name):\n", + " if 'Review_type' not in df.columns:\n", + " print(f\"Столбец 'Review_type' отсутствует в {name}.\")\n", + " return\n", + " \n", + " counts = df['Review_type'].value_counts()\n", + " print(f\"Распределение Review_type в {name}:\")\n", + " print(counts)\n", + " \n", + " if 'Positive' in counts and 'Negative' in counts:\n", + " print(f\"Процент положительных отзывов: {counts['Positive'] / len(df) * 100:.2f}%\")\n", + " print(f\"Процент отрицательных отзывов: {counts['Negative'] / len(df) * 100:.2f}%\")\n", + " else:\n", + " print(\"Отсутствуют один или оба класса (Positive/Negative).\")\n", + " print()\n", + "\n", + "# Проверка сбалансированности после oversampling\n", + "print(\"Оверсэмплинг:\")\n", + "check_balance(train_df_oversampled, \"обучающей выборке\")\n", + "check_balance(val_df_oversampled, \"контрольной выборке\")\n", + "check_balance(test_df_oversampled, \"тестовой выборке\")\n", 
+ "\n", + "# Проверка сбалансированности после undersampling\n", + "print(\"Андерсэмплинг:\")\n", + "check_balance(train_df_undersampled, \"обучающей выборке\")\n", + "check_balance(val_df_undersampled, \"контрольной выборке\")\n", + "check_balance(test_df_undersampled, \"тестовой выборке\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 14,400 Classic Rock Tracks (with Spotify Data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://www.kaggle.com/datasets/thebumpkin/14400-classic-rock-tracks-with-spotify-data Этот набор данных, содержащий 1200 уникальных альбомов и 14 400 треков, представляет собой не просто коллекцию — это хроника эволюции классического рока. Каждый трек тщательно каталогизирован с 18 столбцами данных, включая ключевые метаданные, такие как название трека, исполнитель, альбом и год выпуска, наряду с функциями Spotify audio, которые позволяют получить представление о звуковом ландшафте этих неподвластных времени мелодий. Бизнес-цель может заключаться в улучшении стратегии маркетинга и продвижения музыкальных треков. 
Предположим, как этот набор может быть полезен для бизнеса:\n", + "Персонализированные рекомендации: Создание алгоритмов, которые будут рекомендовать пользователям музыку на основе их предпочтений.\n", + "Цель технического проекта: Разработать и внедрить систему рекомендаций, которая будет предсказывать и рекомендовать пользователям музыкальные треки на основе их предпочтений и поведения.\n", + "Входные данные:\n", + "Данные о пользователях: Идентификатор пользователя, история прослушиваний, оценки треков, время прослушивания, частота прослушивания.\n", + "Данные о треках: Атрибуты треков (название, исполнитель, альбом, год, длительность, танцевальность, энергичность, акустичность и т.д.).\n", + "Данные о взаимодействии: Время и частота взаимодействия пользователя с определенными треками.\n", + "Целевой признак:\n", + "Рекомендации: Булева переменная, указывающая, должен ли конкретный трек быть рекомендован пользователю (1 - рекомендуется, 0 - не рекомендуется)." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['Track', 'Artist', 'Album', 'Year', 'Duration', 'Time_Signature',\n", + " 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness',\n", + " 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo',\n", + " 'Popularity'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "df = pd.read_csv(\".//static//csv//UltimateClassicRock.csv\")\n", + "print(df.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Анализируем датафрейм при помощи \"ящика с усами\". Есть смещение в сторону меньших значений, это можно исправить при помощи oversampling и undersampling." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "# Box plot для столбца 'Popularity'\n", + "plt.figure(figsize=(10, 6))\n", + "sns.boxplot(x=df['Popularity'])\n", + "plt.title('Box Plot для Popularity')\n", + "plt.xlabel('Popularity')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Решим проблему пустых значений при помощи удаления таких строк." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "df_cleaned = df.dropna()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Разбиение набора данных на обучающую, контрольную и тестовую выборки" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 8650\n", + "Размер контрольной выборки: 2884\n", + "Размер тестовой выборки: 2884\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Разделение на обучающую и тестовую выборки\n", + "train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n", + "\n", + "# Разделение обучающей выборки на обучающую и контрольную\n", + "train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n", + "\n", + "print(\"Размер обучающей выборки:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оценка сбалансированности выборок, по результатам видно что баланса тут мало" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Распределение Popularity в обучающей выборке:\n", 
+ "Popularity\n", + "23 258\n", + "15 250\n", + "26 246\n", + "21 245\n", + "14 245\n", + " ... \n", + "84 1\n", + "87 1\n", + "91 1\n", + "79 1\n", + "86 1\n", + "Name: count, Length: 88, dtype: int64\n", + "\n", + "Распределение Popularity в контрольной выборке:\n", + "Popularity\n", + "17 90\n", + "26 86\n", + "21 83\n", + "24 83\n", + "28 80\n", + " ..\n", + "85 1\n", + "83 1\n", + "84 1\n", + "80 1\n", + "77 1\n", + "Name: count, Length: 85, dtype: int64\n", + "\n", + "Распределение Popularity в тестовой выборке:\n", + "Popularity\n", + "22 86\n", + "21 85\n", + "12 84\n", + "20 82\n", + "26 81\n", + " ..\n", + "76 2\n", + "71 2\n", + "79 1\n", + "82 1\n", + "80 1\n", + "Name: count, Length: 80, dtype: int64\n", + "\n" + ] + } + ], + "source": [ + "def check_balance(df, name):\n", + " counts = df['Popularity'].value_counts()\n", + " print(f\"Распределение Popularity в {name}:\")\n", + " print(counts)\n", + " print()\n", + "\n", + "check_balance(train_df, \"обучающей выборке\")\n", + "check_balance(val_df, \"контрольной выборке\")\n", + "check_balance(test_df, \"тестовой выборке\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Выполним овер- и андер- слемпинг." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Распределение Popularity в обучающей выборке после oversampling:\n", + "Popularity\n", + "44 258\n", + "20 258\n", + "30 258\n", + "27 258\n", + "8 258\n", + " ... 
\n", + "78 258\n", + "79 258\n", + "74 258\n", + "81 258\n", + "86 258\n", + "Name: count, Length: 88, dtype: int64\n", + "\n", + "Распределение Popularity в контрольной выборке после oversampling:\n", + "Popularity\n", + "21 90\n", + "11 90\n", + "28 90\n", + "23 90\n", + "37 90\n", + " ..\n", + "61 90\n", + "84 90\n", + "80 90\n", + "77 90\n", + "0 90\n", + "Name: count, Length: 85, dtype: int64\n", + "\n", + "Распределение Popularity в тестовой выборке после oversampling:\n", + "Popularity\n", + "14 86\n", + "47 86\n", + "27 86\n", + "13 86\n", + "66 86\n", + " ..\n", + "63 86\n", + "79 86\n", + "71 86\n", + "82 86\n", + "80 86\n", + "Name: count, Length: 80, dtype: int64\n", + "\n" + ] + } + ], + "source": [ + "from imblearn.over_sampling import RandomOverSampler\n", + "\n", + "def oversample(df):\n", + " X = df.drop('Popularity', axis=1)\n", + " y = df['Popularity']\n", + " \n", + " oversampler = RandomOverSampler(random_state=42)\n", + " X_resampled, y_resampled = oversampler.fit_resample(X, y)\n", + " \n", + " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n", + " return resampled_df\n", + "\n", + "train_df_oversampled = oversample(train_df)\n", + "val_df_oversampled = oversample(val_df)\n", + "test_df_oversampled = oversample(test_df)\n", + "\n", + "check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n", + "check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n", + "check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Распределение Popularity в обучающей выборке после undersampling:\n", + "Popularity\n", + "0 1\n", + "1 1\n", + "2 1\n", + "3 1\n", + "4 1\n", + " ..\n", + "84 1\n", + "85 1\n", + "86 1\n", + "87 1\n", + "91 1\n", + "Name: count, Length: 88, dtype: int64\n", + "\n", + "Распределение Popularity в 
контрольной выборке после undersampling:\n", + "Popularity\n", + "0 1\n", + "1 1\n", + "2 1\n", + "3 1\n", + "4 1\n", + " ..\n", + "82 1\n", + "83 1\n", + "84 1\n", + "85 1\n", + "87 1\n", + "Name: count, Length: 85, dtype: int64\n", + "\n", + "Распределение Popularity в тестовой выборке после undersampling:\n", + "Popularity\n", + "0 1\n", + "1 1\n", + "2 1\n", + "3 1\n", + "4 1\n", + " ..\n", + "76 1\n", + "77 1\n", + "79 1\n", + "80 1\n", + "82 1\n", + "Name: count, Length: 80, dtype: int64\n", + "\n" + ] + } + ], + "source": [ + "from imblearn.under_sampling import RandomUnderSampler\n", + "\n", + "def undersample(df):\n", + " X = df.drop('Popularity', axis=1)\n", + " y = df['Popularity']\n", + " \n", + " undersampler = RandomUnderSampler(random_state=42)\n", + " X_resampled, y_resampled = undersampler.fit_resample(X, y)\n", + " \n", + " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n", + " return resampled_df\n", + "\n", + "train_df_undersampled = undersample(train_df)\n", + "val_df_undersampled = undersample(val_df)\n", + "test_df_undersampled = undersample(test_df)\n", + "\n", + "check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n", + "check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n", + "check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Police Shootings in the United States: 2015-2024" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "В этом наборе данных, составленном The Washington Post, регистрируется каждый человек, застреленный дежурным полицейским в Соединенных Штатах с 2015 по 2024 год. Он решает проблему занижения органами власти статистики реальных инцидентов. Это может быть использовано в журналистской работе, например, для прогнозирования или выявления закономерностей преступлений. 
Цель технического проекта — установить закономерность в убийствах полицейскими определённых групп граждан. Входные данные: возраст, пол, штат, вооружённость. Целевой признак: общий портрет убитого гражданина." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['date', 'name', 'age', 'gender', 'armed', 'race', 'city', 'state',\n", + " 'flee', 'body_camera', 'signs_of_mental_illness',\n", + " 'police_departments_involved'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "df = pd.read_csv(\".//static//csv//2024-07-23-washington-post-police-shootings-export.csv\")\n", + "print(df.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "При помощи ящика с усами и колонки возраста проверим набор на баланс. Он достаточно сбалансирован." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "# Box plot для столбца 'age'\n", + "plt.figure(figsize=(10, 6))\n", + "sns.boxplot(x=df['age'])\n", + "plt.title('Box Plot для age')\n", + "plt.xlabel('Age')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Теперь проверим на шум, здесь тоже особо проблем нет, однако смущает сочетание white и black, вероятно это мулаты." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "# Scatter plot для столбцов 'age' и 'race'\n", + "plt.figure(figsize=(10, 6))\n", + "sns.scatterplot(x='age', y='race', data=df)\n", + "plt.title('Scatter Plot для age и race')\n", + "plt.xlabel('Age')\n", + "plt.ylabel('Race')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Удаление строк с пустыми значениями" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "df_cleaned = df.dropna()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Разбиение набора данных на обучающую, контрольную и тестовую выборки" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 4770\n", + "Размер контрольной выборки: 1591\n", + "Размер тестовой выборки: 1591\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Разделение на обучающую и тестовую выборки\n", + "train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n", + "\n", + "# Разделение обучающей выборки на обучающую и контрольную\n", + "train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n", + "\n", + "print(\"Размер обучающей выборки:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Применение методов приращения данных (аугментации)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Распределение reace в обучающей выборке после 
oversampling:\n", + "race\n", + "Black 2187\n", + "White 2187\n", + "Hispanic 2187\n", + "Unknown 2187\n", + "Native American 2187\n", + "Asian 2187\n", + "White,Black,Native American 2187\n", + "Other 2187\n", + "White,Black 2187\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение reace в контрольной выборке после oversampling:\n", + "race\n", + "White 718\n", + "Black 718\n", + "Unknown 718\n", + "Hispanic 718\n", + "Asian 718\n", + "Native American 718\n", + "Other 718\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение reace в тестовой выборке после oversampling:\n", + "race\n", + "Unknown 750\n", + "White 750\n", + "Black 750\n", + "Hispanic 750\n", + "Asian 750\n", + "Native American 750\n", + "Black,Hispanic 750\n", + "Other 750\n", + "White,Black 750\n", + "Native American,Hispanic 750\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение reace в обучающей выборке после undersampling:\n", + "race\n", + "Asian 1\n", + "Black 1\n", + "Hispanic 1\n", + "Native American 1\n", + "Other 1\n", + "Unknown 1\n", + "White 1\n", + "White,Black 1\n", + "White,Black,Native American 1\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение reace в контрольной выборке после undersampling:\n", + "race\n", + "Asian 7\n", + "Black 7\n", + "Hispanic 7\n", + "Native American 7\n", + "Other 7\n", + "Unknown 7\n", + "White 7\n", + "Name: count, dtype: int64\n", + "\n", + "Распределение reace в тестовой выборке после undersampling:\n", + "race\n", + "Asian 1\n", + "Black 1\n", + "Black,Hispanic 1\n", + "Hispanic 1\n", + "Native American 1\n", + "Native American,Hispanic 1\n", + "Other 1\n", + "Unknown 1\n", + "White 1\n", + "White,Black 1\n", + "Name: count, dtype: int64\n", + "\n" + ] + } + ], + "source": [ + "from imblearn.over_sampling import RandomOverSampler\n", + "\n", + "def check_balance(df, name):\n", + " counts = df['race'].value_counts()\n", + " print(f\"Распределение reace в {name}:\")\n", + " print(counts)\n", + " print()\n", + "\n", 
+ "def oversample(df):\n", + " X = df.drop('race', axis=1)\n", + " y = df['race']\n", + " \n", + " oversampler = RandomOverSampler(random_state=42)\n", + " X_resampled, y_resampled = oversampler.fit_resample(X, y)\n", + " \n", + " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n", + " return resampled_df\n", + "\n", + "train_df_oversampled = oversample(train_df)\n", + "val_df_oversampled = oversample(val_df)\n", + "test_df_oversampled = oversample(test_df)\n", + "\n", + "check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n", + "check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n", + "check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")\n", + "\n", + "def undersample(df):\n", + " X = df.drop('race', axis=1)\n", + " y = df['race']\n", + " \n", + " undersampler = RandomUnderSampler(random_state=42)\n", + " X_resampled, y_resampled = undersampler.fit_resample(X, y)\n", + " \n", + " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n", + " return resampled_df\n", + "\n", + "train_df_undersampled = undersample(train_df)\n", + "val_df_undersampled = undersample(val_df)\n", + "test_df_undersampled = undersample(test_df)\n", + "\n", + "check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n", + "check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n", + "check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aimenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lab_3/lab3.ipynb b/lab_3/lab3.ipynb new file mode 100644 
index 0000000..024426d --- /dev/null +++ b/lab_3/lab3.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Вариант задания: Прогнозирование цен на автомобили\n", + "### Бизнес-цели:\n", + "Повышение эффективности ценообразования на вторичном рынке автомобилей:\n", + "Цель: Разработать модель машинного обучения, которая позволит точно прогнозировать рыночную стоимость автомобилей на вторичном рынке.\n", + "Ключевые показатели успеха (KPI):\n", + "Точность прогнозирования цены (например, RMSE, MAE).\n", + "Сокращение времени на оценку стоимости автомобиля.\n", + "Увеличение количества продаж за счет более конкурентоспособных цен.\n", + "Оптимизация рекламных бюджетов для онлайн-площадок по продаже автомобилей:\n", + "Цель: Использовать прогнозы цен на автомобили для оптимизации таргетинга рекламы и повышения конверсии на онлайн-площадках.\n", + "Ключевые показатели успеха (KPI):\n", + "Увеличение CTR (Click-Through Rate) рекламных объявлений.\n", + "Повышение конверсии (процент пользователей, совершивших покупку после клика на рекламу).\n", + "Снижение стоимости привлечения клиента (CPA).\n", + "### Цели технического проекта:\n", + "Для бизнес-цели 1:\n", + "Сбор и подготовка данных:\n", + "Очистка данных от пропусков, выбросов и дубликатов.\n", + "Преобразование категориальных переменных в числовые.\n", + "Разделение данных на обучающую и тестовую выборки.\n", + "Разработка и обучение модели:\n", + "Исследование различных алгоритмов машинного обучения (линейная регрессия, деревья решений, случайный лес и т.д.).\n", + "Обучение моделей на обучающей выборке.\n", + "Оценка качества моделей на тестовой выборке с помощью метрик RMSE, MAE и др.\n", + "Развертывание модели:\n", + "Интеграция модели в существующую систему или разработка нового API для доступа к прогнозам.\n", + "Создание веб-интерфейса или мобильного приложения для удобного использования модели.\n", + "Для бизнес-цели 2:\n", + "Анализ данных о 
пользователях и поведении:\n", + "Анализ данных о просмотрах, кликах и покупках на онлайн-площадке.\n", + "Определение сегментов пользователей с разным уровнем интереса к покупке автомобилей.\n", + "Разработка рекомендательной системы:\n", + "Создание модели, которая будет рекомендовать пользователям автомобили, соответствующие их предпочтениям и бюджету.\n", + "Интеграция рекомендательной системы в рекламные кампании.\n", + "Оптимизация таргетинга рекламы:\n", + "Использование прогнозов цен на автомобили для более точного таргетинга рекламы на пользователей, готовых к покупке.\n", + "Тестирование различных стратегий таргетинга и оценка их эффективности." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod. year',\n", + " 'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',\n", + " 'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n", + " 'Airbags'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pn\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib\n", + "import matplotlib.ticker as ticker\n", + "df = pn.read_csv(\".//static//csv//car_price_prediction.csv\").head(15000)\n", + "print(df.columns)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- 2.25.1 From 9926ca3e2d421c0913ad19e447695e3ee7b7dd31 Mon Sep 17 00:00:00 2001 From: GokaPek Date: Fri, 11 Oct 2024 23:17:25 +0400 Subject: [PATCH 2/4] =?UTF-8?q?=D0=BF=D0=BE=D1=87=D1=82=D0=B8=20=D0=B2?= 
=?UTF-8?q?=D1=81=D1=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_3/lab3.ipynb | 461 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 459 insertions(+), 2 deletions(-) diff --git a/lab_3/lab3.ipynb b/lab_3/lab3.ipynb index 024426d..337db01 100644 --- a/lab_3/lab3.ipynb +++ b/lab_3/lab3.ipynb @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -65,9 +65,466 @@ "import matplotlib.pyplot as plt\n", "import matplotlib\n", "import matplotlib.ticker as ticker\n", - "df = pn.read_csv(\".//static//csv//car_price_prediction.csv\").head(15000)\n", + "df = pn.read_csv(\".//static//csv//car_price_prediction.csv\")\n", "print(df.columns)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Разделим на 3 выборки\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 12311\n", + "Размер контрольной выборки: 3078\n", + "Размер тестовой выборки: 3848\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Разделение данных на обучающую и тестовую выборки (80% - обучение, 20% - тест)\n", + "train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)\n", + "\n", + "# Разделение обучающей выборки на обучающую и контрольную (80% - обучение, 20% - контроль)\n", + "train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)\n", + "\n", + "print(\"Размер обучающей выборки:\", len(train_data))\n", + "print(\"Размер контрольной выборки:\", len(val_data))\n", + "print(\"Размер тестовой выборки:\", len(test_data))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Пример оценки сбалансированности целевой переменной (цена автомобиля)\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Гистограмма распределения цены в обучающей выборке\n", + "sns.histplot(train_data['Price'], kde=True)\n", + "plt.title('Распределение цены в обучающей выборке')\n", + "plt.show()\n", + "\n", + "# Гистограмма распределения цены в контрольной выборке\n", + "sns.histplot(val_data['Price'], kde=True)\n", + "plt.title('Распределение цены в контрольной выборке')\n", + "plt.show()\n", + "\n", + "# Гистограмма распределения цены в тестовой выборке\n", + "sns.histplot(test_data['Price'], kde=True)\n", + "plt.title('Распределение цены в тестовой выборке')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Процесс конструирования признаков\n", + "Задача 1: Прогнозирование цен на автомобили\n", + "Цель технического проекта: Разработка модели машинного обучения для точного прогнозирования рыночной стоимости автомобилей.\n", + "\n", + "Задача 2: Оптимизация рекламных бюджетов\n", + "Цель технического проекта: Использование прогнозов цен на автомобили для оптимизации таргетинга рекламы и повышения конверсии на онлайн-площадках.\n", + "\n", + "\n", + "### Унитарное кодирование категориальных признаков (one-hot encoding)\n", + "\n", + "One-hot encoding: Преобразование категориальных признаков в бинарные векторы." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ID Price Levy Manufacturer Prod. year Engine volume \\\n", + "3438 45793776 13485 781 VOLKSWAGEN 2012 2.5 \n", + "3185 45760664 314 781 SUBARU 2012 2.5 \n", + "5529 45777845 5645 5908 BMW 1999 2.5 \n", + "7891 45651201 7997 1850 LEXUS 2008 3.5 \n", + "12167 45798755 15681 765 VOLKSWAGEN 2015 2 \n", + "... ... ... ... ... ... ... 
\n", + "2750 45656065 941 1055 LEXUS 2013 3.5 \n", + "17390 45785069 12000 - FORD 1998 2.5 \n", + "5563 45815001 941 777 TOYOTA 2014 2.5 \n", + "3813 45809829 54850 831 HONDA 2018 1.5 \n", + "6041 45397141 9095 - FORD 2003 1.7 \n", + "\n", + " Mileage Cylinders Drive wheels Doors ... Fuel type_Hybrid \\\n", + "3438 160000 km 4.0 Rear 04-May ... False \n", + "3185 204579 km 4.0 4x4 04-May ... False \n", + "5529 0 km 6.0 Rear 04-May ... False \n", + "7891 244731 km 6.0 Front 04-May ... True \n", + "12167 103000 km 4.0 Front 04-May ... False \n", + "... ... ... ... ... ... ... \n", + "2750 361603 km 6.0 Front 04-May ... True \n", + "17390 220000 km 4.0 Rear 04-May ... False \n", + "5563 202355 km 4.0 Front 04-May ... False \n", + "3813 13048 km 4.0 Front 04-May ... False \n", + "6041 159000 km 4.0 Front 04-May ... False \n", + "\n", + " Fuel type_LPG Fuel type_Petrol Fuel type_Plug-in Hybrid \\\n", + "3438 False True False \n", + "3185 False True False \n", + "5529 False True False \n", + "7891 False False False \n", + "12167 False True False \n", + "... ... ... ... \n", + "2750 False False False \n", + "17390 False False False \n", + "5563 False True False \n", + "3813 False True False \n", + "6041 False False False \n", + "\n", + " Gear box type_Automatic Gear box type_Manual Gear box type_Tiptronic \\\n", + "3438 True False False \n", + "3185 True False False \n", + "5529 False False True \n", + "7891 True False False \n", + "12167 False False True \n", + "... ... ... ... \n", + "2750 True False False \n", + "17390 False True False \n", + "5563 True False False \n", + "3813 True False False \n", + "6041 False True False \n", + "\n", + " Gear box type_Variator Leather interior_No Leather interior_Yes \n", + "3438 False False True \n", + "3185 False False True \n", + "5529 False True False \n", + "7891 False False True \n", + "12167 False True False \n", + "... ... ... ... 
\n", + "2750 False False True \n", + "17390 False True False \n", + "5563 False False True \n", + "3813 False False True \n", + "6041 False True False \n", + "\n", + "[12311 rows x 1247 columns]\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Пример категориальных признаков\n", + "categorical_features = ['Model', 'Category', 'Fuel type', 'Gear box type', 'Leather interior']\n", + "\n", + "# Применение one-hot encoding\n", + "train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)\n", + "val_data_encoded = pd.get_dummies(val_data, columns=categorical_features)\n", + "test_data_encoded = pd.get_dummies(test_data, columns=categorical_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Дискретизация числовых признаков \n", + "это процесс преобразования непрерывных числовых значений в дискретные категории или интервалы (бины). Этот процесс может быть полезен по нескольким причинам" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ID Price Levy Manufacturer Prod. year Engine volume \\\n", + "736 45753963 27284 259 CHEVROLET 2014 1.4 \n", + "8674 45786053 10349 - MERCEDES-BENZ 1997 2.9 Turbo \n", + "5971 45757478 40769 - MERCEDES-BENZ 1996 1.8 \n", + "1957 45732345 38737 639 HYUNDAI 2014 2 \n", + "11075 45729790 42102 831 SSANGYONG 2017 1.6 \n", + "... ... ... ... ... ... ... \n", + "12026 45786994 12231 650 CHEVROLET 2016 1.4 Turbo \n", + "17893 45756187 15681 - FORD 2003 2.4 Turbo \n", + "5339 45769967 314 2410 MERCEDES-BENZ 2010 6.2 \n", + "11859 45801865 14069 687 HYUNDAI 2010 1.6 \n", + "9276 45803366 15681 891 HYUNDAI 2016 2 \n", + "\n", + " Mileage Cylinders Drive wheels Doors ... Fuel type_LPG \\\n", + "736 65000 km 4.0 Front 04-May ... False \n", + "8674 3333 km 6.0 Rear 02-Mar ... False \n", + "5971 212485 km 8.0 Rear 04-May ... 
False \n", + "1957 132756 km 4.0 Front 04-May ... False \n", + "11075 50750 km 4.0 Front 04-May ... False \n", + "... ... ... ... ... ... ... \n", + "12026 9000 km 4.0 Front 04-May ... False \n", + "17893 250000 km 4.0 Rear 04-May ... False \n", + "5339 274771 km 8.0 Rear 04-May ... False \n", + "11859 100403 km 4.0 Front 04-May ... False \n", + "9276 322292 km 4.0 Front 04-May ... True \n", + "\n", + " Fuel type_Petrol Fuel type_Plug-in Hybrid Gear box type_Automatic \\\n", + "736 False True True \n", + "8674 False False False \n", + "5971 True False False \n", + "1957 False False True \n", + "11075 True False True \n", + "... ... ... ... \n", + "12026 True False False \n", + "17893 False False False \n", + "5339 True False True \n", + "11859 True False True \n", + "9276 False False True \n", + "\n", + " Gear box type_Manual Gear box type_Tiptronic Gear box type_Variator \\\n", + "736 False False False \n", + "8674 True False False \n", + "5971 True False False \n", + "1957 False False False \n", + "11075 False False False \n", + "... ... ... ... \n", + "12026 False True False \n", + "17893 True False False \n", + "5339 False False False \n", + "11859 False False False \n", + "9276 False False False \n", + "\n", + " Leather interior_No Leather interior_Yes Year bin \n", + "736 True False 4 \n", + "8674 False True 3 \n", + "5971 True False 3 \n", + "1957 False True 4 \n", + "11075 False True 4 \n", + "... ... ... ... \n", + "12026 True False 4 \n", + "17893 True False 3 \n", + "5339 False True 4 \n", + "11859 False True 4 \n", + "9276 False True 4 \n", + "\n", + "[3848 rows x 658 columns]\n" + ] + } + ], + "source": [ + "# Пример дискретизации признака 'year'\n", + "train_data_encoded['Year bin'] = pd.cut(train_data_encoded['Prod. year'], bins=5, labels=False)\n", + "val_data_encoded['Year bin'] = pd.cut(val_data_encoded['Prod. year'], bins=5, labels=False)\n", + "test_data_encoded['Year bin'] = pd.cut(test_data_encoded['Prod. 
year'], bins=5, labels=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Ручной синтез\n", + "Создание новых признаков на основе экспертных знаний и логики предметной области. Например, для данных о продаже автомобилей можно создать признак \"возраст автомобиля\" как разницу между текущим годом и годом выпуска." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ID Price Levy Manufacturer Prod. year Engine volume \\\n", + "3438 45793776 13485 781 VOLKSWAGEN 2012 2.5 \n", + "3185 45760664 314 781 SUBARU 2012 2.5 \n", + "5529 45777845 5645 5908 BMW 1999 2.5 \n", + "7891 45651201 7997 1850 LEXUS 2008 3.5 \n", + "12167 45798755 15681 765 VOLKSWAGEN 2015 2 \n", + "... ... ... ... ... ... ... \n", + "2750 45656065 941 1055 LEXUS 2013 3.5 \n", + "17390 45785069 12000 - FORD 1998 2.5 \n", + "5563 45815001 941 777 TOYOTA 2014 2.5 \n", + "3813 45809829 54850 831 HONDA 2018 1.5 \n", + "6041 45397141 9095 - FORD 2003 1.7 \n", + "\n", + " Mileage Cylinders Drive wheels Doors ... Fuel type_Petrol \\\n", + "3438 160000 km 4.0 Rear 04-May ... True \n", + "3185 204579 km 4.0 4x4 04-May ... True \n", + "5529 0 km 6.0 Rear 04-May ... True \n", + "7891 244731 km 6.0 Front 04-May ... False \n", + "12167 103000 km 4.0 Front 04-May ... True \n", + "... ... ... ... ... ... ... \n", + "2750 361603 km 6.0 Front 04-May ... False \n", + "17390 220000 km 4.0 Rear 04-May ... False \n", + "5563 202355 km 4.0 Front 04-May ... True \n", + "3813 13048 km 4.0 Front 04-May ... True \n", + "6041 159000 km 4.0 Front 04-May ... False \n", + "\n", + " Fuel type_Plug-in Hybrid Gear box type_Automatic Gear box type_Manual \\\n", + "3438 False True False \n", + "3185 False True False \n", + "5529 False False False \n", + "7891 False True False \n", + "12167 False False False \n", + "... ... ... ... 
\n", + "2750 False True False \n", + "17390 False False True \n", + "5563 False True False \n", + "3813 False True False \n", + "6041 False False True \n", + "\n", + " Gear box type_Tiptronic Gear box type_Variator Leather interior_No \\\n", + "3438 False False False \n", + "3185 False False False \n", + "5529 True False True \n", + "7891 False False False \n", + "12167 True False True \n", + "... ... ... ... \n", + "2750 False False False \n", + "17390 False False True \n", + "5563 False False False \n", + "3813 False False False \n", + "6041 False False True \n", + "\n", + " Leather interior_Yes Year bin Age \n", + "3438 True 4 12 \n", + "3185 True 4 12 \n", + "5529 False 3 25 \n", + "7891 True 4 16 \n", + "12167 False 4 9 \n", + "... ... ... ... \n", + "2750 True 4 11 \n", + "17390 False 3 26 \n", + "5563 True 4 10 \n", + "3813 True 4 6 \n", + "6041 False 3 21 \n", + "\n", + "[12311 rows x 1249 columns]\n" + ] + } + ], + "source": [ + "# Пример синтеза признака \"возраст автомобиля\"\n", + "train_data_encoded['Age'] = 2024 - train_data_encoded['Prod. year']\n", + "val_data_encoded['Age'] = 2024 - val_data_encoded['Prod. year']\n", + "test_data_encoded['Age'] = 2024 - test_data_encoded['Prod. year']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", + "\n", + "# Пример масштабирования числовых признаков\n", + "numerical_features = ['Airbags', 'Age']\n", + "\n", + "scaler = StandardScaler()\n", + "train_data_encoded[numerical_features] = scaler.fit_transform(train_data_encoded[numerical_features])\n", + "val_data_encoded[numerical_features] = scaler.transform(val_data_encoded[numerical_features])\n", + "test_data_encoded[numerical_features] = scaler.transform(test_data_encoded[numerical_features])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Конструирование признаков с применением фреймворка Featuretools" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'pkg_resources'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[25], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mft\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Определение сущностей\u001b[39;00m\n\u001b[0;32m 4\u001b[0m es \u001b[38;5;241m=\u001b[39m ft\u001b[38;5;241m.\u001b[39mEntitySet(\u001b[38;5;28mid\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcar_data\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", + "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\__init__.py:4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig_init\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m config\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m primitives\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msynthesis\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", + "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", + "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\api.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdeserialize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m read_entityset\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m EntitySet\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrelationship\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Relationship\n", + "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\deserialize.py:8\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01minspect\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m getfullargspec\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwoodwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtype_sys\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtype_system\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mww_type_system\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwoodwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdeserialize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m read_woodwork_table\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrelationship\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Relationship\n", + "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'pkg_resources'" + ] + } + ], + "source": [ + "import featuretools as ft\n", + "\n", + "# Определение сущностей\n", + "es = ft.EntitySet(id='car_data')\n", + "es = es.entity_from_dataframe(entity_id='cars', dataframe=train_data_encoded, index='id')\n", + "\n", + "# Определение связей между сущностями (если есть)\n", + "# es = es.add_relationship(...)\n", + "\n", + "# Генерация признаков\n", + "feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='cars', max_depth=2)\n", + "\n", + "# Преобразование признаков для контрольной и тестовой выборок\n", + "val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n", + "test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)" + ] } ], "metadata": { -- 2.25.1 From 1f9b7fcbe97e147f4e5adc18db1413bec63375c8 Mon Sep 17 00:00:00 2001 From: GokaPek Date: Fri, 11 Oct 2024 23:18:43 +0400 Subject: [PATCH 3/4] =?UTF-8?q?=D0=BF=D0=BE=D1=87=D1=82=D0=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_3/lab3.ipynb | 484 
+++++++++++++++++++++++------------------------ 1 file changed, 239 insertions(+), 245 deletions(-) diff --git a/lab_3/lab3.ipynb b/lab_3/lab3.ipynb index 337db01..7c9e25e 100644 --- a/lab_3/lab3.ipynb +++ b/lab_3/lab3.ipynb @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -181,82 +181,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " ID Price Levy Manufacturer Prod. year Engine volume \\\n", - "3438 45793776 13485 781 VOLKSWAGEN 2012 2.5 \n", - "3185 45760664 314 781 SUBARU 2012 2.5 \n", - "5529 45777845 5645 5908 BMW 1999 2.5 \n", - "7891 45651201 7997 1850 LEXUS 2008 3.5 \n", - "12167 45798755 15681 765 VOLKSWAGEN 2015 2 \n", - "... ... ... ... ... ... ... \n", - "2750 45656065 941 1055 LEXUS 2013 3.5 \n", - "17390 45785069 12000 - FORD 1998 2.5 \n", - "5563 45815001 941 777 TOYOTA 2014 2.5 \n", - "3813 45809829 54850 831 HONDA 2018 1.5 \n", - "6041 45397141 9095 - FORD 2003 1.7 \n", - "\n", - " Mileage Cylinders Drive wheels Doors ... Fuel type_Hybrid \\\n", - "3438 160000 km 4.0 Rear 04-May ... False \n", - "3185 204579 km 4.0 4x4 04-May ... False \n", - "5529 0 km 6.0 Rear 04-May ... False \n", - "7891 244731 km 6.0 Front 04-May ... True \n", - "12167 103000 km 4.0 Front 04-May ... False \n", - "... ... ... ... ... ... ... \n", - "2750 361603 km 6.0 Front 04-May ... True \n", - "17390 220000 km 4.0 Rear 04-May ... False \n", - "5563 202355 km 4.0 Front 04-May ... False \n", - "3813 13048 km 4.0 Front 04-May ... False \n", - "6041 159000 km 4.0 Front 04-May ... 
False \n", - "\n", - " Fuel type_LPG Fuel type_Petrol Fuel type_Plug-in Hybrid \\\n", - "3438 False True False \n", - "3185 False True False \n", - "5529 False True False \n", - "7891 False False False \n", - "12167 False True False \n", - "... ... ... ... \n", - "2750 False False False \n", - "17390 False False False \n", - "5563 False True False \n", - "3813 False True False \n", - "6041 False False False \n", - "\n", - " Gear box type_Automatic Gear box type_Manual Gear box type_Tiptronic \\\n", - "3438 True False False \n", - "3185 True False False \n", - "5529 False False True \n", - "7891 True False False \n", - "12167 False False True \n", - "... ... ... ... \n", - "2750 True False False \n", - "17390 False True False \n", - "5563 True False False \n", - "3813 True False False \n", - "6041 False True False \n", - "\n", - " Gear box type_Variator Leather interior_No Leather interior_Yes \n", - "3438 False False True \n", - "3185 False False True \n", - "5529 False True False \n", - "7891 False False True \n", - "12167 False True False \n", - "... ... ... ... \n", - "2750 False False True \n", - "17390 False True False \n", - "5563 False False True \n", - "3813 False False True \n", - "6041 False True False \n", - "\n", - "[12311 rows x 1247 columns]\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "\n", @@ -279,82 +206,9 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 16, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " ID Price Levy Manufacturer Prod. year Engine volume \\\n", - "736 45753963 27284 259 CHEVROLET 2014 1.4 \n", - "8674 45786053 10349 - MERCEDES-BENZ 1997 2.9 Turbo \n", - "5971 45757478 40769 - MERCEDES-BENZ 1996 1.8 \n", - "1957 45732345 38737 639 HYUNDAI 2014 2 \n", - "11075 45729790 42102 831 SSANGYONG 2017 1.6 \n", - "... ... ... ... ... ... ... 
\n", - "12026 45786994 12231 650 CHEVROLET 2016 1.4 Turbo \n", - "17893 45756187 15681 - FORD 2003 2.4 Turbo \n", - "5339 45769967 314 2410 MERCEDES-BENZ 2010 6.2 \n", - "11859 45801865 14069 687 HYUNDAI 2010 1.6 \n", - "9276 45803366 15681 891 HYUNDAI 2016 2 \n", - "\n", - " Mileage Cylinders Drive wheels Doors ... Fuel type_LPG \\\n", - "736 65000 km 4.0 Front 04-May ... False \n", - "8674 3333 km 6.0 Rear 02-Mar ... False \n", - "5971 212485 km 8.0 Rear 04-May ... False \n", - "1957 132756 km 4.0 Front 04-May ... False \n", - "11075 50750 km 4.0 Front 04-May ... False \n", - "... ... ... ... ... ... ... \n", - "12026 9000 km 4.0 Front 04-May ... False \n", - "17893 250000 km 4.0 Rear 04-May ... False \n", - "5339 274771 km 8.0 Rear 04-May ... False \n", - "11859 100403 km 4.0 Front 04-May ... False \n", - "9276 322292 km 4.0 Front 04-May ... True \n", - "\n", - " Fuel type_Petrol Fuel type_Plug-in Hybrid Gear box type_Automatic \\\n", - "736 False True True \n", - "8674 False False False \n", - "5971 True False False \n", - "1957 False False True \n", - "11075 True False True \n", - "... ... ... ... \n", - "12026 True False False \n", - "17893 False False False \n", - "5339 True False True \n", - "11859 True False True \n", - "9276 False False True \n", - "\n", - " Gear box type_Manual Gear box type_Tiptronic Gear box type_Variator \\\n", - "736 False False False \n", - "8674 True False False \n", - "5971 True False False \n", - "1957 False False False \n", - "11075 False False False \n", - "... ... ... ... \n", - "12026 False True False \n", - "17893 True False False \n", - "5339 False False False \n", - "11859 False False False \n", - "9276 False False False \n", - "\n", - " Leather interior_No Leather interior_Yes Year bin \n", - "736 True False 4 \n", - "8674 False True 3 \n", - "5971 True False 3 \n", - "1957 False True 4 \n", - "11075 False True 4 \n", - "... ... ... ... 
\n", - "12026 True False 4 \n", - "17893 True False 3 \n", - "5339 False True 4 \n", - "11859 False True 4 \n", - "9276 False True 4 \n", - "\n", - "[3848 rows x 658 columns]\n" - ] - } - ], + "outputs": [], "source": [ "# Пример дискретизации признака 'year'\n", "train_data_encoded['Year bin'] = pd.cut(train_data_encoded['Prod. year'], bins=5, labels=False)\n", @@ -372,82 +226,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 15, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " ID Price Levy Manufacturer Prod. year Engine volume \\\n", - "3438 45793776 13485 781 VOLKSWAGEN 2012 2.5 \n", - "3185 45760664 314 781 SUBARU 2012 2.5 \n", - "5529 45777845 5645 5908 BMW 1999 2.5 \n", - "7891 45651201 7997 1850 LEXUS 2008 3.5 \n", - "12167 45798755 15681 765 VOLKSWAGEN 2015 2 \n", - "... ... ... ... ... ... ... \n", - "2750 45656065 941 1055 LEXUS 2013 3.5 \n", - "17390 45785069 12000 - FORD 1998 2.5 \n", - "5563 45815001 941 777 TOYOTA 2014 2.5 \n", - "3813 45809829 54850 831 HONDA 2018 1.5 \n", - "6041 45397141 9095 - FORD 2003 1.7 \n", - "\n", - " Mileage Cylinders Drive wheels Doors ... Fuel type_Petrol \\\n", - "3438 160000 km 4.0 Rear 04-May ... True \n", - "3185 204579 km 4.0 4x4 04-May ... True \n", - "5529 0 km 6.0 Rear 04-May ... True \n", - "7891 244731 km 6.0 Front 04-May ... False \n", - "12167 103000 km 4.0 Front 04-May ... True \n", - "... ... ... ... ... ... ... \n", - "2750 361603 km 6.0 Front 04-May ... False \n", - "17390 220000 km 4.0 Rear 04-May ... False \n", - "5563 202355 km 4.0 Front 04-May ... True \n", - "3813 13048 km 4.0 Front 04-May ... True \n", - "6041 159000 km 4.0 Front 04-May ... False \n", - "\n", - " Fuel type_Plug-in Hybrid Gear box type_Automatic Gear box type_Manual \\\n", - "3438 False True False \n", - "3185 False True False \n", - "5529 False False False \n", - "7891 False True False \n", - "12167 False False False \n", - "... ... ... ... 
\n", - "2750 False True False \n", - "17390 False False True \n", - "5563 False True False \n", - "3813 False True False \n", - "6041 False False True \n", - "\n", - " Gear box type_Tiptronic Gear box type_Variator Leather interior_No \\\n", - "3438 False False False \n", - "3185 False False False \n", - "5529 True False True \n", - "7891 False False False \n", - "12167 True False True \n", - "... ... ... ... \n", - "2750 False False False \n", - "17390 False False True \n", - "5563 False False False \n", - "3813 False False False \n", - "6041 False False True \n", - "\n", - " Leather interior_Yes Year bin Age \n", - "3438 True 4 12 \n", - "3185 True 4 12 \n", - "5529 False 3 25 \n", - "7891 True 4 16 \n", - "12167 False 4 9 \n", - "... ... ... ... \n", - "2750 True 4 11 \n", - "17390 False 3 26 \n", - "5563 True 4 10 \n", - "3813 True 4 6 \n", - "6041 False 3 21 \n", - "\n", - "[12311 rows x 1249 columns]\n" - ] - } - ], + "outputs": [], "source": [ "# Пример синтеза признака \"возраст автомобиля\"\n", "train_data_encoded['Age'] = 2024 - train_data_encoded['Prod. 
year']\n", @@ -464,7 +245,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -488,23 +269,33 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 51, "metadata": {}, "outputs": [ { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'pkg_resources'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[25], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mft\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Определение сущностей\u001b[39;00m\n\u001b[0;32m 4\u001b[0m es \u001b[38;5;241m=\u001b[39m ft\u001b[38;5;241m.\u001b[39mEntitySet(\u001b[38;5;28mid\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcar_data\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\__init__.py:4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig_init\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m config\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m 
\u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m primitives\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msynthesis\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", - "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", - "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\api.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdeserialize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m read_entityset\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m EntitySet\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrelationship\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Relationship\n", - "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\deserialize.py:8\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01minspect\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m getfullargspec\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwoodwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtype_sys\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtype_system\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mww_type_system\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwoodwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdeserialize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m read_woodwork_table\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrelationship\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Relationship\n", - "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m 
\u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n", - "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'pkg_resources'" + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:724: UserWarning: A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: index\n", + " warnings.warn(\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", + " warnings.warn(\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n" ] } ], @@ -513,17 +304,220 @@ "\n", "# Определение сущностей\n", "es = ft.EntitySet(id='car_data')\n", - "es = es.entity_from_dataframe(entity_id='cars', dataframe=train_data_encoded, index='id')\n", + "es = es.add_dataframe(dataframe_name='cars', dataframe=train_data_encoded, index='id')\n", "\n", "# Определение связей между сущностями (если есть)\n", "# es = es.add_relationship(...)\n", "\n", "# Генерация признаков\n", - "feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='cars', max_depth=2)\n", + "feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='cars', max_depth=2)\n", "\n", "# Преобразование признаков для контрольной и тестовой выборок\n", "val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n", - "test_feature_matrix = 
ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)" + "test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Оценка качества каждого набора признаков\n", + "Предсказательная способность\n", + "Метрики: RMSE, MAE, R²\n", + "\n", + "Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n", + "\n", + "Скорость вычисления\n", + "Методы: Измерение времени выполнения генерации признаков и обучения модели.\n", + "\n", + "Надежность\n", + "Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n", + "\n", + "Корреляция\n", + "Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n", + "\n", + "Цельность\n", + "Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:724: UserWarning: A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: index\n", + " warnings.warn(\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", + " warnings.warn(\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. 
In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. 
In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n" + ] + } + ], + "source": [ + "import featuretools as ft\n", + "\n", + "# Определение сущностей\n", + "es = ft.EntitySet(id='car_data')\n", + "es = es.add_dataframe(dataframe_name='cars', dataframe=train_data_encoded, index='id')\n", + "\n", + "# Генерация признаков\n", + "feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='cars', max_depth=2)\n", + "\n", + "# Преобразование признаков для контрольной и тестовой выборок\n", + "val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n", + "test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE: 234661.34107821883\n", + "R²: 0.8029264507217629\n", + "MAE: 7964.677649030692\n", + "Cross-validated RMSE: 259310.71680259163\n", + "Train RMSE: 109324.02870848698\n", + "Train R²: 0.7887252013114727\n", + "Train MAE: 3471.173866063129\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. 
To calculate the root mean squared error, use the function'root_mean_squared_error'.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n", + "from sklearn.model_selection import cross_val_score\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "# Удаление строк с NaN\n", + "feature_matrix = feature_matrix.dropna()\n", + "val_feature_matrix = val_feature_matrix.dropna()\n", + "test_feature_matrix = test_feature_matrix.dropna()\n", + "\n", + "# Разделение данных на обучающую и тестовую выборки\n", + "X_train = feature_matrix.drop('Price', axis=1)\n", + "y_train = feature_matrix['Price']\n", + "X_val = val_feature_matrix.drop('Price', axis=1)\n", + "y_val = val_feature_matrix['Price']\n", + "X_test = test_feature_matrix.drop('Price', axis=1)\n", + "y_test = test_feature_matrix['Price']\n", + "\n", + "# Выбор модели\n", + "model = RandomForestRegressor(random_state=42)\n", + "\n", + "# Обучение модели\n", + "model.fit(X_train, y_train)\n", + "\n", + "# Предсказание и оценка\n", + "y_pred = model.predict(X_test)\n", + "\n", + "rmse = mean_squared_error(y_test, y_pred, squared=False)\n", + "r2 = r2_score(y_test, y_pred)\n", + "mae = mean_absolute_error(y_test, y_pred)\n", + "\n", + "print(f\"RMSE: {rmse}\")\n", + "print(f\"R²: {r2}\")\n", + "print(f\"MAE: {mae}\")\n", + "\n", + "# Кросс-валидация\n", + "scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n", + "rmse_cv = (-scores.mean())**0.5\n", + "print(f\"Cross-validated RMSE: {rmse_cv}\")\n", + "\n", + "# Анализ важности признаков\n", + "feature_importances = model.feature_importances_\n", + "feature_names = X_train.columns\n", + "\n", + "# importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n", + "# importance_df = 
importance_df.sort_values(by='Importance', ascending=False)\n", + "\n", + "# plt.figure(figsize=(10, 6))\n", + "# sns.barplot(x='Importance', y='Feature', data=importance_df)\n", + "# plt.title('Feature Importance')\n", + "# plt.show()\n", + "\n", + "# Проверка на переобучение\n", + "y_train_pred = model.predict(X_train)\n", + "\n", + "rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)\n", + "r2_train = r2_score(y_train, y_train_pred)\n", + "mae_train = mean_absolute_error(y_train, y_train_pred)\n", + "\n", + "print(f\"Train RMSE: {rmse_train}\")\n", + "print(f\"Train R²: {r2_train}\")\n", + "print(f\"Train MAE: {mae_train}\")\n", + "\n", + "# Визуализация результатов\n", + "plt.figure(figsize=(10, 6))\n", + "plt.scatter(y_test, y_pred, alpha=0.5)\n", + "plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n", + "plt.xlabel('Actual Price')\n", + "plt.ylabel('Predicted Price')\n", + "plt.title('Actual vs Predicted Price')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Точность предсказаний: Модель показывает довольно высокий R² (0.8029), что указывает на хорошее объяснение вариации цен. Однако значения RMSE и MAE довольно высоки, что говорит о том, что модель не очень точно предсказывает цены, особенно для высоких значений.\n", + "\n", + "Переобучение: RMSE на тестовой выборке (≈234 661) примерно вдвое выше, чем на обучающей (≈109 324), а MAE — ≈7 965 против ≈3 471. При этом R² на обеих выборках близок (0.803 на тестовой и 0.789 на обучающей), что указывает на то, что переобучение не является критическим; разница в RMSE, вероятно, связана с выбросами в ценах. Однако стоит быть осторожным и продолжать мониторинг этого показателя.\n", + "\n", + "Кросс-валидация: Значение RMSE после кросс-валидации немного выше, чем на тестовой выборке, что может указывать на некоторую нестабильность модели."
] } ], -- 2.25.1 From b9f5eaf38d95f7bd377248bda444c3c1b35bdcb4 Mon Sep 17 00:00:00 2001 From: GokaPek Date: Fri, 11 Oct 2024 23:52:22 +0400 Subject: [PATCH 4/4] =?UTF-8?q?=D1=84=D0=B8=D0=BD=D0=B0=D0=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_3/lab2.ipynb | 1337 ---------------------------------------------- 1 file changed, 1337 deletions(-) delete mode 100644 lab_3/lab2.ipynb diff --git a/lab_3/lab2.ipynb b/lab_3/lab2.ipynb deleted file mode 100644 index f7ec25d..0000000 --- a/lab_3/lab2.ipynb +++ /dev/null @@ -1,1337 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Выгрузка в датафрейм первый набор (игры в Steam)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "https://www.kaggle.com/datasets/wajihulhassan369/steam-games-dataset. Набор представляет собой данные об экшенах, доступных в Steam. Эта информация полезна для изучения игровых паттернов, моделирования цен и исследования корреляции между игровыми тегами и методами ценообразования. Этот набор позволяет провести предварительный анализ данных, построить модели машинного обучения или исследовать игровую индустрию. В наборе пресдтавлена дата, различные теги, рейтинг отзывов. Так можно понять, какие теги популярнее, что в играх людям нравится больше, изменилось ли качество игр со временем и т.д. Для бизнеса такой набор данных может быть полезен для прогнозирования, в разработку каки игр целесообразнее вкладываться. Так компания не потеряет деньги.\n", - "Пример цели: Разработка игры на пк в нужную фазу рынка\n", - "Входные данные: год выпуска, сумма продаж\n", - "Целевой признак: продаваемость игр в текущей фазе рынка в сравнении с предыдущими." 
- ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['Name', 'Price', 'Release_date', 'Review_no', 'Review_type', 'Tags',\n", - " 'Description'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "df = pd.read_csv(\".//static//csv//steam_cleaned.csv\")\n", - "print(df.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Преобразуем дату выпуска в формат datetime\n", - "df['Release_date'] = pd.to_datetime(df['Release_date'])\n", - "\n", - "# Визуализация данных\n", - "plt.figure(figsize=(10, 6))\n", - "plt.scatter(df['Release_date'], df['Review_no'])\n", - "plt.xlabel('Release Date')\n", - "plt.ylabel('Review Number')\n", - "plt.title('Scatter Plot of Review Number vs Release Date')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "При проверке на шум можно заметить выброс в 2014 году. количество обзоров там запредельное. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Все выбросы удалены путём определения порогов квантилями. Зашумленность не очень высокая. Покрытие данных высокое и подошло бы для поставленной задачи по актуальности." - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Выбросы:\n", - " Name Price Release_date Review_no \\\n", - "18 GUNDAM BREAKER 4 59.99 2024-08-29 1846.0 \n", - "22 LOCKDOWN Protocol 5.49 2024-07-22 2192.0 \n", - "34 CarX Street 19.99 2024-08-29 4166.0 \n", - "45 Harry Potter: Quidditch Champions 25.99 2024-09-03 1216.0 \n", - "61 SMITE 2 18.00 2024-08-27 1633.0 \n", - "... ... ... ... ... \n", - "7695 Dude Simulator 2 2.99 2018-07-28 1734.0 \n", - "7717 Golfing Over It with Alva Majo 2.39 2018-03-28 1367.0 \n", - "7740 Dungeon Siege II 4.99 2005-08-16 2274.0 \n", - "7765 Phantom Doctrine 12.99 2018-08-14 3538.0 \n", - "7768 NECROPOLIS: BRUTAL EDITION 19.99 2016-07-12 3668.0 \n", - "\n", - " Review_type Tags \\\n", - "18 Very Positive Action,Robots,Hack and Slash,RPG,Mechs,Action ... \n", - "22 Very Positive Multiplayer,Social Deduction,Conversation,Acti... \n", - "34 Mixed Racing,Open World,Automobile Sim,PvP,Multiplay... 
\n", - "45 Mostly Positive Action,Sports,Flight,Arcade,Third Person,Magic... \n", - "61 Mixed Action,MOBA,Third Person,Strategy,Adventure,Ca... \n", - "... ... ... \n", - "7695 Mixed Life Sim,Indie,Simulation,Racing,Action,Advent... \n", - "7717 Mostly Positive Difficult,Physics,Golf,Platformer,Precision Pl... \n", - "7740 Mostly Positive RPG,Fantasy,Action RPG,Hack and Slash,Singlepl... \n", - "7765 Mostly Positive Turn-Based Tactics,Strategy,Cold War,Stealth,R... \n", - "7768 Mixed Souls-like,Action Roguelike,Co-op,Adventure,Ro... \n", - "\n", - " Description \n", - "18 Create your own ultimate Gundam in the newest ... \n", - "22 A first person social deduction game, combinin... \n", - "34 Conquer mountain roads, highways, and city str... \n", - "45 Your next chapter takes flight! Immerse yourse... \n", - "61 Become a god and wage war in SMITE 2, the Unre... \n", - "... ... \n", - "7695 Dude Simulator 2 is an open world sandbox game... \n", - "7717 The higher you climb, the bigger the fall. \n", - "7740 NaN \n", - "7765 The year is 1983. The world teeters on the ver... \n", - "7768 NECROPOLIS: BRUTAL EDITION is a major update f... \n", - "\n", - "[1049 rows x 7 columns]\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "# Преобразуем дату выпуска в формат datetime\n", - "df['Release_date'] = pd.to_datetime(df['Release_date'])\n", - "\n", - "# Статистический анализ для определения выбросов\n", - "Q1 = df['Review_no'].quantile(0.25)\n", - "Q3 = df['Review_no'].quantile(0.75)\n", - "IQR = Q3 - Q1\n", - "\n", - "# Определение порога для выбросов\n", - "threshold = 1.5 * IQR\n", - "outliers = (df['Review_no'] < (Q1 - threshold)) | (df['Review_no'] > (Q3 + threshold))\n", - "\n", - "# Вывод выбросов\n", - "print(\"Выбросы:\")\n", - "print(df[outliers])\n", - "\n", - "# Обработка выбросов\n", - "# В данном случае мы заменим выбросы на медианное значение\n", - "median_review_no = df['Review_no'].median()\n", - "df.loc[outliers, 'Review_no'] = median_review_no\n", - "\n", - "# Визуализация данных после обработки\n", - "plt.figure(figsize=(10, 6))\n", - "plt.scatter(df['Release_date'], df['Review_no'])\n", - "plt.xlabel('Release Date')\n", - "plt.ylabel('Review Number')\n", - "plt.title('Scatter Plot of Review Number vs Release Date (After Handling Outliers)')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Очистим от строк с пустыми значениями наш датасет" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Количество удаленных строк: 515\n", - "\n", - "DataFrame после удаления строк с пропущенными значениями:\n", - " Name Price Release_date \\\n", - "0 Black Myth: Wukong 59.99 2024-08-20 \n", - "2 Counter-Strike 2 0.00 2012-08-21 \n", - "4 Grand Theft Auto V 10.48 2015-04-14 \n", - "5 Red Dead Redemption 2 17.99 2019-12-05 \n", - "6 PUBG: BATTLEGROUNDS 0.00 2017-12-21 \n", - "... ... ... ... \n", - "7807 Monster Hunter World: Iceborne - MHW:I Monster... 
2.99 2020-02-06 \n", - "7808 Gene Shift Auto: Deluxe Edition 8.99 2022-11-28 \n", - "7809 Run Ralph Run 0.45 2021-03-03 \n", - "7810 Quadroids 6.19 2024-02-22 \n", - "7811 Divekick 4.99 2013-08-20 \n", - "\n", - " Review_no Review_type \\\n", - "0 270.0 Overwhelmingly Positive \n", - "2 270.0 Very Positive \n", - "4 270.0 Very Positive \n", - "5 270.0 Very Positive \n", - "6 270.0 Mixed \n", - "... ... ... \n", - "7807 39.0 Positive \n", - "7808 16.0 Positive \n", - "7809 26.0 Mostly Positive \n", - "7810 15.0 Positive \n", - "7811 1118.0 Very Positive \n", - "\n", - " Tags \\\n", - "0 Mythology,Action RPG,Action,Souls-like,RPG,Com... \n", - "2 FPS,Shooter,Multiplayer,Competitive,Action,Tea... \n", - "4 Open World,Action,Multiplayer,Crime,Automobile... \n", - "5 Open World,Story Rich,Western,Adventure,Multip... \n", - "6 Survival,Shooter,Battle Royale,Multiplayer,FPS... \n", - "... ... \n", - "7807 Action \n", - "7808 Indie,Action,Free to Play,Battle Royale,Roguel... \n", - "7809 Adventure,Action,Puzzle,Arcade,Platformer,Shoo... \n", - "7810 Precision Platformer,Puzzle Platformer,2D Plat... \n", - "7811 Fighting,Indie,2D Fighter,Parody ,Local Multip... \n", - "\n", - " Description \n", - "0 Black Myth: Wukong is an action RPG rooted in ... \n", - "2 For over two decades, Counter-Strike has offer... \n", - "4 Grand Theft Auto V for PC offers players the o... \n", - "5 Winner of over 175 Game of the Year Awards and... \n", - "6 Play PUBG: BATTLEGROUNDS for free.\\n\\nLand on ... \n", - "... ... \n", - "7807 A monster figure you can use to decorate your ... \n", - "7808 Gene Shift Auto is a roguelike-inspired battle... \n", - "7809 Ralph is a smart dinosaur, and a great shooter. \n", - "7810 Quadroids is a single-player puzzle platformer... \n", - "7811 Divekick is the world’s first two-button fight... 
\n", - "\n", - "[7297 rows x 7 columns]\n" - ] - } - ], - "source": [ - "# Удаление строк с пропущенными значениями\n", - "df_dropna = df.dropna()\n", - "\n", - "# Вывод количества удаленных строк\n", - "num_deleted_rows = len(df) - len(df_dropna)\n", - "print(f\"\\nКоличество удаленных строк: {num_deleted_rows}\")\n", - "\n", - "print(\"\\nDataFrame после удаления строк с пропущенными значениями:\")\n", - "print(df_dropna)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Теперь создадим выборки." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Размер обучающей выборки: 4687\n", - "Размер контрольной выборки: 1562\n", - "Размер тестовой выборки: 1563\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "df = pd.read_csv(\".//static//csv//steam_cleaned.csv\")\n", - "\n", - "train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)\n", - "\n", - "# Разделение остатка на контрольную и тестовую выборки\n", - "val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n", - "\n", - "# Проверка размеров выборок\n", - "print(\"Размер обучающей выборки:\", len(train_df))\n", - "print(\"Размер контрольной выборки:\", len(val_df))\n", - "print(\"Размер тестовой выборки:\", len(test_df))\n", - "\n", - "# Сохранение выборок в файлы\n", - "train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n", - "val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n", - "test_df.to_csv(\".//static//csv//test_data.csv\", index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Проанализируем сбалансированность выборок" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Распределение Review_type в обучающей выборке:\n", - "Review_type\n", - "Very Positive 2117\n", - "Mostly Positive 810\n", - "Mixed 797\n", - "Positive 710\n", - "Overwhelmingly Positive 209\n", - "Mostly Negative 15\n", - "Very Negative 2\n", - "Overwhelmingly Negative 1\n", - "Name: count, dtype: int64\n", - "Процент положительных отзывов: 17.28%\n", - "Процент отрицательных отзывов: 4.46%\n", - "\n", - "Распределение Review_type в контрольной выборке:\n", - "Review_type\n", - "Very Positive 708\n", - "Mostly Positive 290\n", - "Mixed 241\n", - "Positive 224\n", - "Overwhelmingly Positive 78\n", - "Mostly Negative 6\n", - "Very Negative 2\n", - "Name: count, dtype: int64\n", - "Процент положительных отзывов: 18.57%\n", - "Процент отрицательных отзывов: 4.99%\n", - "\n", - "Распределение Review_type в тестовой выборке:\n", - "Review_type\n", - "Very Positive 713\n", - "Mostly Positive 276\n", - "Mixed 253\n", - "Positive 240\n", - "Overwhelmingly Positive 67\n", - "Mostly Negative 5\n", - "Very Negative 1\n", - "Name: count, dtype: int64\n", - "Процент положительных отзывов: 17.66%\n", - "Процент отрицательных отзывов: 4.29%\n", - "\n", - "Необходима аугментация данных для балансировки классов.\n", - "Необходима аугментация данных для балансировки классов.\n", - "Необходима аугментация данных для балансировки классов.\n" - ] - } - ], - "source": [ - "train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n", - "val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n", - "test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n", - "\n", - "# Оценка сбалансированности\n", - "def check_balance(df, name):\n", - " counts = df['Review_type'].value_counts()\n", - " print(f\"Распределение Review_type в {name}:\")\n", - " print(counts)\n", - " print(f\"Процент положительных отзывов: {counts['Mostly Positive'] / len(df) * 100:.2f}%\")\n", - " print(f\"Процент отрицательных отзывов: {counts['Overwhelmingly Positive'] / len(df) 
* 100:.2f}%\")\n", - " print()\n", - "\n", - "# Определение необходимости аугментации данных\n", - "def need_augmentation(df):\n", - " counts = df['Review_type'].value_counts()\n", - " ratio = counts['Mostly Positive'] / counts['Overwhelmingly Positive']\n", - " if ratio > 1.5 or ratio < 0.67:\n", - " print(\"Необходима аугментация данных для балансировки классов.\")\n", - " else:\n", - " print(\"Аугментация данных не требуется.\")\n", - " \n", - "check_balance(train_df, \"обучающей выборке\")\n", - "check_balance(val_df, \"контрольной выборке\")\n", - "check_balance(test_df, \"тестовой выборке\")\n", - "\n", - "\n", - "\n", - "need_augmentation(train_df)\n", - "need_augmentation(val_df)\n", - "need_augmentation(test_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "По результатам анализа требуется приращение, соотношения отзывов вне допустимого диапазона" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Оверсэмплинг:\n", - "Распределение Review_type в обучающей выборке:\n", - "Review_type\n", - "Mostly Positive 2117\n", - "Mixed 2117\n", - "Very Positive 2117\n", - "Positive 2117\n", - "Overwhelmingly Positive 2117\n", - "Mostly Negative 2117\n", - "Very Negative 2117\n", - "Overwhelmingly Negative 2117\n", - "Name: count, dtype: int64\n", - "Отсутствуют один или оба класса (Positive/Negative).\n", - "\n", - "Распределение Review_type в контрольной выборке:\n", - "Review_type\n", - "Very Negative 708\n", - "Mostly Positive 708\n", - "Mixed 708\n", - "Overwhelmingly Positive 708\n", - "Overwhelmingly Negative 708\n", - "Positive 708\n", - "Mostly Negative 708\n", - "Very Positive 708\n", - "Name: count, dtype: int64\n", - "Отсутствуют один или оба класса (Positive/Negative).\n", - "\n", - "Распределение Review_type в тестовой выборке:\n", - "Review_type\n", - "Very Negative 713\n", - "Mostly Positive 713\n", - "Overwhelmingly 
Positive 713\n", - "Mixed 713\n", - "Overwhelmingly Negative 713\n", - "Very Positive 713\n", - "Mostly Negative 713\n", - "Positive 713\n", - "Name: count, dtype: int64\n", - "Отсутствуют один или оба класса (Positive/Negative).\n", - "\n", - "Андерсэмплинг:\n", - "Распределение Review_type в обучающей выборке:\n", - "Review_type\n", - "Mixed 1\n", - "Mostly Negative 1\n", - "Mostly Positive 1\n", - "Overwhelmingly Negative 1\n", - "Overwhelmingly Positive 1\n", - "Positive 1\n", - "Very Negative 1\n", - "Very Positive 1\n", - "Name: count, dtype: int64\n", - "Отсутствуют один или оба класса (Positive/Negative).\n", - "\n", - "Распределение Review_type в контрольной выборке:\n", - "Review_type\n", - "Mixed 2\n", - "Mostly Negative 2\n", - "Mostly Positive 2\n", - "Overwhelmingly Negative 2\n", - "Overwhelmingly Positive 2\n", - "Positive 2\n", - "Very Negative 2\n", - "Very Positive 2\n", - "Name: count, dtype: int64\n", - "Отсутствуют один или оба класса (Positive/Negative).\n", - "\n", - "Распределение Review_type в тестовой выборке:\n", - "Review_type\n", - "Mixed 1\n", - "Mostly Negative 1\n", - "Mostly Positive 1\n", - "Overwhelmingly Negative 1\n", - "Overwhelmingly Positive 1\n", - "Positive 1\n", - "Very Negative 1\n", - "Very Positive 1\n", - "Name: count, dtype: int64\n", - "Отсутствуют один или оба класса (Positive/Negative).\n", - "\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from imblearn.over_sampling import RandomOverSampler\n", - "from imblearn.under_sampling import RandomUnderSampler\n", - "from sklearn.preprocessing import LabelEncoder\n", - "\n", - "# Загрузка данных\n", - "train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n", - "val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n", - "test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n", - "\n", - "# Преобразование категориальных признаков в числовые\n", - "def encode(df):\n", - " label_encoders = {}\n", - " for column in 
df.select_dtypes(include=['object']).columns:\n", - " if column != 'Review_type': # Пропускаем целевую переменную\n", - " le = LabelEncoder()\n", - " df[column] = le.fit_transform(df[column])\n", - " label_encoders[column] = le\n", - " return label_encoders\n", - "\n", - "# Преобразование целевой переменной в числовые значения\n", - "def encode_target(df):\n", - " le = LabelEncoder()\n", - " df['Review_type'] = le.fit_transform(df['Review_type'])\n", - " return le\n", - "\n", - "# Применение кодирования\n", - "label_encoders = encode(train_df)\n", - "encode(val_df)\n", - "encode(test_df)\n", - "\n", - "# Кодирование целевой переменной\n", - "le_target = encode_target(train_df)\n", - "encode_target(val_df)\n", - "encode_target(test_df)\n", - "\n", - "# Проверка типов данных\n", - "def check_data_types(df):\n", - " for column in df.columns:\n", - " if df[column].dtype == 'object':\n", - " print(f\"Столбец '{column}' содержит строковые данные.\")\n", - "\n", - "check_data_types(train_df)\n", - "check_data_types(val_df)\n", - "check_data_types(test_df)\n", - "\n", - "# Функция для выполнения oversampling\n", - "def oversample(df):\n", - " if 'Review_type' not in df.columns:\n", - " print(\"Столбец 'Review_type' отсутствует.\")\n", - " return df\n", - " \n", - " X = df.drop('Review_type', axis=1)\n", - " y = df['Review_type']\n", - " \n", - " oversampler = RandomOverSampler(random_state=42)\n", - " X_resampled, y_resampled = oversampler.fit_resample(X, y)\n", - " \n", - " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n", - " return resampled_df\n", - "\n", - "# Функция для выполнения undersampling\n", - "def undersample(df):\n", - " if 'Review_type' not in df.columns:\n", - " print(\"Столбец 'Review_type' отсутствует.\")\n", - " return df\n", - " \n", - " X = df.drop('Review_type', axis=1)\n", - " y = df['Review_type']\n", - " \n", - " undersampler = RandomUnderSampler(random_state=42)\n", - " X_resampled, y_resampled = undersampler.fit_resample(X, y)\n", 
- " \n", - " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n", - " return resampled_df\n", - "\n", - "# Применение oversampling и undersampling к каждой выборке\n", - "train_df_oversampled = oversample(train_df)\n", - "val_df_oversampled = oversample(val_df)\n", - "test_df_oversampled = oversample(test_df)\n", - "\n", - "train_df_undersampled = undersample(train_df)\n", - "val_df_undersampled = undersample(val_df)\n", - "test_df_undersampled = undersample(test_df)\n", - "\n", - "# Обратное преобразование целевой переменной в строковые метки\n", - "def decode_target(df, le_target):\n", - " df['Review_type'] = le_target.inverse_transform(df['Review_type'])\n", - "\n", - "decode_target(train_df_oversampled, le_target)\n", - "decode_target(val_df_oversampled, le_target)\n", - "decode_target(test_df_oversampled, le_target)\n", - "\n", - "decode_target(train_df_undersampled, le_target)\n", - "decode_target(val_df_undersampled, le_target)\n", - "decode_target(test_df_undersampled, le_target)\n", - "\n", - "# Проверка результатов\n", - "def check_balance(df, name):\n", - " if 'Review_type' not in df.columns:\n", - " print(f\"Столбец 'Review_type' отсутствует в {name}.\")\n", - " return\n", - " \n", - " counts = df['Review_type'].value_counts()\n", - " print(f\"Распределение Review_type в {name}:\")\n", - " print(counts)\n", - " \n", - " if 'Positive' in counts and 'Negative' in counts:\n", - " print(f\"Процент положительных отзывов: {counts['Positive'] / len(df) * 100:.2f}%\")\n", - " print(f\"Процент отрицательных отзывов: {counts['Negative'] / len(df) * 100:.2f}%\")\n", - " else:\n", - " print(\"Отсутствуют один или оба класса (Positive/Negative).\")\n", - " print()\n", - "\n", - "# Проверка сбалансированности после oversampling\n", - "print(\"Оверсэмплинг:\")\n", - "check_balance(train_df_oversampled, \"обучающей выборке\")\n", - "check_balance(val_df_oversampled, \"контрольной выборке\")\n", - "check_balance(test_df_oversampled, \"тестовой выборке\")\n", 
- "\n", - "# Проверка сбалансированности после undersampling\n", - "print(\"Андерсэмплинг:\")\n", - "check_balance(train_df_undersampled, \"обучающей выборке\")\n", - "check_balance(val_df_undersampled, \"контрольной выборке\")\n", - "check_balance(test_df_undersampled, \"тестовой выборке\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 14,400 Classic Rock Tracks (with Spotify Data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "https://www.kaggle.com/datasets/thebumpkin/14400-classic-rock-tracks-with-spotify-data Этот набор данных, содержащий 1200 уникальных альбомов и 14 400 треков, представляет собой не просто коллекцию — это хроника эволюции классического рока. Каждый трек тщательно каталогизирован с 18 столбцами данных, включая ключевые метаданные, такие как название трека, исполнитель, альбом и год выпуска, наряду с функциями Spotify audio, которые позволяют получить представление о звуковом ландшафте этих неподвластных времени мелодий. Бизнес-цель может заключаться в улучшении стратегии маркетинга и продвижения музыкальных треков. 
Предположим как этот набор может быть полезен для бизнеса:\n", - "Персонализированные рекомендации: Создание алгоритмов, которые будут рекомендовать пользователям музыку на основе их предпочтений.\n", - "Цель технического проекта: Разработать и внедрить систему рекомендаций, которая будет предсказывать и рекомендовать пользователям музыкальные треки на основе их предпочтений и поведения.\n", - "Входные данные:\n", - "Данные о пользователях: Идентификатор пользователя, история прослушиваний, оценки треков, время прослушивания, частота прослушивания.\n", - "Данные о треках: Атрибуты треков (название, исполнитель, альбом, год, длительность, танцевальность, энергичность, акустичность и т.д.).\n", - "Данные о взаимодействии: Время и частота взаимодействия пользователя с определенными треками.\n", - "Целевой признак:\n", - "Рекомендации: Булева переменная, указывающая, должен ли конкретный трек быть рекомендован пользователю (1 - рекомендуется, 0 - не рекомендуется)." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['Track', 'Artist', 'Album', 'Year', 'Duration', 'Time_Signature',\n", - " 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness',\n", - " 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo',\n", - " 'Popularity'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "df = pd.read_csv(\".//static//csv//UltimateClassicRock.csv\")\n", - "print(df.columns)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Анализируем датафрейм при помощи \"ящика с усами\". Естьсмещение в сторону меньших значений, это можно исправить при помощи oversampling и undersampling." 
- ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "# Box plot для столбца 'Popularity'\n", - "plt.figure(figsize=(10, 6))\n", - "sns.boxplot(x=df['Popularity'])\n", - "plt.title('Box Plot для Popularity')\n", - "plt.xlabel('Popularity')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Решим проблему пустых значений при помощи удаления таких строк." - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "df_cleaned = df.dropna()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Разбиение набора данных на обучающую, контрольную и тестовую выборки" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Размер обучающей выборки: 8650\n", - "Размер контрольной выборки: 2884\n", - "Размер тестовой выборки: 2884\n" - ] - } - ], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "# Разделение на обучающую и тестовую выборки\n", - "train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n", - "\n", - "# Разделение обучающей выборки на обучающую и контрольную\n", - "train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n", - "\n", - "print(\"Размер обучающей выборки:\", len(train_df))\n", - "print(\"Размер контрольной выборки:\", len(val_df))\n", - "print(\"Размер тестовой выборки:\", len(test_df))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Оценка сбалансированности выборок, по результатам видно что баланса тут мало" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Распределение Popularity в обучающей выборке:\n", 
- "Popularity\n", - "23 258\n", - "15 250\n", - "26 246\n", - "21 245\n", - "14 245\n", - " ... \n", - "84 1\n", - "87 1\n", - "91 1\n", - "79 1\n", - "86 1\n", - "Name: count, Length: 88, dtype: int64\n", - "\n", - "Распределение Popularity в контрольной выборке:\n", - "Popularity\n", - "17 90\n", - "26 86\n", - "21 83\n", - "24 83\n", - "28 80\n", - " ..\n", - "85 1\n", - "83 1\n", - "84 1\n", - "80 1\n", - "77 1\n", - "Name: count, Length: 85, dtype: int64\n", - "\n", - "Распределение Popularity в тестовой выборке:\n", - "Popularity\n", - "22 86\n", - "21 85\n", - "12 84\n", - "20 82\n", - "26 81\n", - " ..\n", - "76 2\n", - "71 2\n", - "79 1\n", - "82 1\n", - "80 1\n", - "Name: count, Length: 80, dtype: int64\n", - "\n" - ] - } - ], - "source": [ - "def check_balance(df, name):\n", - " counts = df['Popularity'].value_counts()\n", - " print(f\"Распределение Popularity в {name}:\")\n", - " print(counts)\n", - " print()\n", - "\n", - "check_balance(train_df, \"обучающей выборке\")\n", - "check_balance(val_df, \"контрольной выборке\")\n", - "check_balance(test_df, \"тестовой выборке\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Выполним овер- и андер- слемпинг." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Распределение Popularity в обучающей выборке после oversampling:\n", - "Popularity\n", - "44 258\n", - "20 258\n", - "30 258\n", - "27 258\n", - "8 258\n", - " ... 
\n", - "78 258\n", - "79 258\n", - "74 258\n", - "81 258\n", - "86 258\n", - "Name: count, Length: 88, dtype: int64\n", - "\n", - "Распределение Popularity в контрольной выборке после oversampling:\n", - "Popularity\n", - "21 90\n", - "11 90\n", - "28 90\n", - "23 90\n", - "37 90\n", - " ..\n", - "61 90\n", - "84 90\n", - "80 90\n", - "77 90\n", - "0 90\n", - "Name: count, Length: 85, dtype: int64\n", - "\n", - "Распределение Popularity в тестовой выборке после oversampling:\n", - "Popularity\n", - "14 86\n", - "47 86\n", - "27 86\n", - "13 86\n", - "66 86\n", - " ..\n", - "63 86\n", - "79 86\n", - "71 86\n", - "82 86\n", - "80 86\n", - "Name: count, Length: 80, dtype: int64\n", - "\n" - ] - } - ], - "source": [ - "from imblearn.over_sampling import RandomOverSampler\n", - "\n", - "def oversample(df):\n", - " X = df.drop('Popularity', axis=1)\n", - " y = df['Popularity']\n", - " \n", - " oversampler = RandomOverSampler(random_state=42)\n", - " X_resampled, y_resampled = oversampler.fit_resample(X, y)\n", - " \n", - " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n", - " return resampled_df\n", - "\n", - "train_df_oversampled = oversample(train_df)\n", - "val_df_oversampled = oversample(val_df)\n", - "test_df_oversampled = oversample(test_df)\n", - "\n", - "check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n", - "check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n", - "check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Распределение Popularity в обучающей выборке после undersampling:\n", - "Popularity\n", - "0 1\n", - "1 1\n", - "2 1\n", - "3 1\n", - "4 1\n", - " ..\n", - "84 1\n", - "85 1\n", - "86 1\n", - "87 1\n", - "91 1\n", - "Name: count, Length: 88, dtype: int64\n", - "\n", - "Распределение Popularity в 
контрольной выборке после undersampling:\n", - "Popularity\n", - "0 1\n", - "1 1\n", - "2 1\n", - "3 1\n", - "4 1\n", - " ..\n", - "82 1\n", - "83 1\n", - "84 1\n", - "85 1\n", - "87 1\n", - "Name: count, Length: 85, dtype: int64\n", - "\n", - "Распределение Popularity в тестовой выборке после undersampling:\n", - "Popularity\n", - "0 1\n", - "1 1\n", - "2 1\n", - "3 1\n", - "4 1\n", - " ..\n", - "76 1\n", - "77 1\n", - "79 1\n", - "80 1\n", - "82 1\n", - "Name: count, Length: 80, dtype: int64\n", - "\n" - ] - } - ], - "source": [ - "from imblearn.under_sampling import RandomUnderSampler\n", - "\n", - "def undersample(df):\n", - " X = df.drop('Popularity', axis=1)\n", - " y = df['Popularity']\n", - " \n", - " undersampler = RandomUnderSampler(random_state=42)\n", - " X_resampled, y_resampled = undersampler.fit_resample(X, y)\n", - " \n", - " resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n", - " return resampled_df\n", - "\n", - "train_df_undersampled = undersample(train_df)\n", - "val_df_undersampled = undersample(val_df)\n", - "test_df_undersampled = undersample(test_df)\n", - "\n", - "check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n", - "check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n", - "check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Police Shootings in the United States: 2015-2024" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "В этом наборе данных, составленном The Washington Post, регистрируется каждый человек, застреленный дежурным полицейским в Соединенных Штатах с 2015 по 2024 год. Он решает проблему занижения органами власти статистики реальных инцедентов. Это может быть использовано в журналисткой работе, например для прогнозирования или выявления закономерностей преступлений. 
Цель технического проекта установить закономерность в убийствах полицейскими определённых групп граждан. Входные данные: возраст, пол, штат, вооружённость. Целевой признак: общий портрет убитого гражданина." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['date', 'name', 'age', 'gender', 'armed', 'race', 'city', 'state',\n", - " 'flee', 'body_camera', 'signs_of_mental_illness',\n", - " 'police_departments_involved'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "df = pd.read_csv(\".//static//csv//2024-07-23-washington-post-police-shootings-export.csv\")\n", - "print(df.columns)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "При помощи ящика с усами и колонки возраста проверим набор на баланс. Он достаточно сбалансирован." - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "# Box plot для столбца 'age'\n", - "plt.figure(figsize=(10, 6))\n", - "sns.boxplot(x=df['age'])\n", - "plt.title('Box Plot для age')\n", - "plt.xlabel('Age')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Теперь проверим на шум, здесь тоже особо проблем нет, однако смущает сочетание white и black, вероятно это мулаты." - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "# Scatter plot для столбцов 'age' и 'race'\n", - "plt.figure(figsize=(10, 6))\n", - "sns.scatterplot(x='age', y='race', data=df)\n", - "plt.title('Scatter Plot для age и race')\n", - "plt.xlabel('Age')\n", - "plt.ylabel('Race')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Удаление строк с пустыми значениями" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "df_cleaned = df.dropna()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Разбиение набора данных на обучающую, контрольную и тестовую выборки" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Размер обучающей выборки: 4770\n", - "Размер контрольной выборки: 1591\n", - "Размер тестовой выборки: 1591\n" - ] - } - ], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "# Разделение на обучающую и тестовую выборки\n", - "train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n", - "\n", - "# Разделение обучающей выборки на обучающую и контрольную\n", - "train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n", - "\n", - "print(\"Размер обучающей выборки:\", len(train_df))\n", - "print(\"Размер контрольной выборки:\", len(val_df))\n", - "print(\"Размер тестовой выборки:\", len(test_df))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Применение методов приращения данных (аугментации)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Распределение reace в обучающей выборке после 
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
# RandomUnderSampler was used below without ever being imported, so the cell
# raised NameError on a fresh kernel (Restart & Run All).
from imblearn.under_sampling import RandomUnderSampler


def check_balance(df, name, target='race'):
    """Print the class distribution of ``target`` in ``df``.

    Args:
        df: DataFrame that contains the ``target`` column.
        name: Human-readable description of the split, inserted into the
            printed header.
        target: Name of the class column to count. Defaults to ``'race'``.
    """
    counts = df[target].value_counts()
    print(f"Распределение {target} в {name}:")
    print(counts)
    print()


def _resample(df, sampler, target):
    """Apply an imbalanced-learn ``sampler`` to ``df`` and return one frame.

    The ``target`` column is split off as the label vector, the sampler's
    ``fit_resample`` is run on (features, labels), and the two results are
    concatenated column-wise so that ``target`` ends up as the last column.
    """
    features = df.drop(columns=target)
    labels = df[target]
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)
    return pd.concat([features_resampled, labels_resampled], axis=1)


def oversample(df, target='race', random_state=42):
    """Return a copy of ``df`` balanced on ``target`` by random oversampling.

    Args:
        df: DataFrame that contains the ``target`` column.
        target: Name of the class column to balance. Defaults to ``'race'``.
        random_state: Seed passed to ``RandomOverSampler``. Defaults to 42.

    Returns:
        A new DataFrame with the resampled features followed by ``target``.
    """
    return _resample(df, RandomOverSampler(random_state=random_state), target)


def undersample(df, target='race', random_state=42):
    """Return a copy of ``df`` balanced on ``target`` by random undersampling.

    Args:
        df: DataFrame that contains the ``target`` column.
        target: Name of the class column to balance. Defaults to ``'race'``.
        random_state: Seed passed to ``RandomUnderSampler``. Defaults to 42.

    Returns:
        A new DataFrame with the resampled features followed by ``target``.
    """
    return _resample(df, RandomUnderSampler(random_state=random_state), target)


# NOTE(review): the validation and test splits are resampled here as well,
# exactly as in the original cell. Resampling is normally applied to the
# training split only, so that evaluation reflects the real class
# distribution -- confirm that balancing all three splits is intended.
train_df_oversampled = oversample(train_df)
val_df_oversampled = oversample(val_df)
test_df_oversampled = oversample(test_df)

check_balance(train_df_oversampled, "обучающей выборке после oversampling")
check_balance(val_df_oversampled, "контрольной выборке после oversampling")
check_balance(test_df_oversampled, "тестовой выборке после oversampling")

# NOTE(review): the recorded outputs show undersampling leaving a single row
# per class in the training and test splits, which suggests that some 'race'
# values occur only once there -- consider merging or dropping such rare
# classes before undersampling.
train_df_undersampled = undersample(train_df)
val_df_undersampled = undersample(val_df)
test_df_undersampled = undersample(test_df)

check_balance(train_df_undersampled, "обучающей выборке после undersampling")
check_balance(val_df_undersampled, "контрольной выборке после undersampling")
check_balance(test_df_undersampled, "тестовой выборке после undersampling")