From d62f03a3a9c31d1754f8759b231239237913e989 Mon Sep 17 00:00:00 2001 From: Timourka Date: Fri, 4 Oct 2024 23:54:02 +0400 Subject: [PATCH] =?UTF-8?q?=D1=81=D0=B4=D0=B5=D0=BB=D0=B0=D0=BB=20=D0=B0?= =?UTF-8?q?=D1=83=D0=B3=D0=BC=D0=B5=D0=BD=D1=82=D0=B0=D1=86=D0=B8=D1=8E=20?= =?UTF-8?q?=D0=BE=D0=B2=D0=B5=D1=80=D1=81=D0=B5=D0=BC=D0=BF=D0=BB=D0=B8?= =?UTF-8?q?=D0=BD=D0=B3=D0=BE=D0=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_2/laba2.ipynb | 324 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 319 insertions(+), 5 deletions(-) diff --git a/lab_2/laba2.ipynb b/lab_2/laba2.ipynb index 925cd79..bbccf58 100644 --- a/lab_2/laba2.ipynb +++ b/lab_2/laba2.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -161,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -245,7 +245,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -316,6 +316,320 @@ " plt.grid() # Добавление сетки для удобства восприятия\n", " plt.show() # Отображение графика" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Создадим выборки данных. разбивать будем относительно параметра опасный, ведь это тот самый параметр по которому наша выборка разбивается на классы. 
def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
):
    """
    Split a Pandas dataframe into stratified train, val and test subsets.

    The split is done by calling train_test_split() twice: first to carve
    the train subset off the input, then to divide the remainder into val
    and test. Both calls stratify on ``stratify_colname``, so every subset
    keeps the relative frequency of the values found in that column.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split. All of its columns are kept in the
        returned subsets.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val : float
    frac_test : float
        The ratios with which the dataframe will be split into train, val, and
        test data. Each value must be a positive float fraction and together
        they must sum to 1.0 (compared with a small tolerance, so rounding
        noise such as 0.7 + 0.2 + 0.1 is accepted).
    random_state : int, None, or RandomStateInstance
        Value to be passed to both train_test_split() calls.

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.

    Raises
    ------
    ValueError
        If the fractions do not sum to 1.0, if any fraction is not positive,
        or if ``stratify_colname`` is not a column of ``df_input``.
    """

    # Compare with a tolerance instead of `!=`: in binary floating point a
    # valid request such as 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999.
    if abs((frac_train + frac_val + frac_test) - 1.0) > 1e-9:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    # A zero or negative share would make the relative test fraction below
    # degenerate (0, 1, or a division by zero), so reject it up front.
    if min(frac_train, frac_val, frac_test) <= 0:
        raise ValueError(
            "fractions %f, %f, %f must all be positive"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes. The test share
    # is re-expressed relative to the remainder (val + test), not the input.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    # Sanity check: every input row ended up in one of the three subsets.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Контрольная выборка: (18167, 6)\n", + "hazardous\n", + "False 16399\n", + "True 1768\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Тестовая выборка: (18168, 6)\n", + "hazardous\n", + "False 16400\n", + "True 1768\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
# Class distribution (number of observations per label) in the full dataset.
print(df.hazardous.value_counts())
print()


data = df[['est_diameter_min', 'est_diameter_max', 'relative_velocity', 'miss_distance', 'absolute_magnitude', 'hazardous']].copy()

# A fixed seed keeps the row membership of the three subsets identical on
# every "Restart & Run All"; without it each run produces a different split.
df_train, df_val, df_test = split_stratified_into_train_val_test(
    data,
    stratify_colname="hazardous",
    frac_train=0.60,
    frac_val=0.20,
    frac_test=0.20,
    random_state=42,
)

# (printed label, subset, pie-chart title) for each of the three subsets.
subsets = [
    ("Обучающая выборка: ", df_train, 'Распределение классов hazardous в обучающей выборке'),
    ("Контрольная выборка: ", df_val, 'Распределение классов hazardous в контрольной выборке'),
    ("Тестовая выборка: ", df_test, 'Распределение классов hazardous в тестовой выборке'),
]

# Report shape and class balance of every subset with one shared routine
# instead of three copy-pasted blocks.
for label, subset, title in subsets:
    print(label, subset.shape)
    hazardous_counts = subset['hazardous'].value_counts()
    print(hazardous_counts)
    plt.figure(figsize=(2, 2))  # small figure: the pie only shows two classes
    plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)
    plt.title(title)
    plt.show()
обучении модели, так как модель будет обучаться в основном на одном классе. В таких случаях стоит рассмотреть методы аугментации данных." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "аугментация данных оверсемплингом(Этот метод увеличивает количество примеров меньшинства)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting imblearn\n", + " Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)\n", + "Collecting imbalanced-learn (from imblearn)\n", + " Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)\n", + "Requirement already satisfied: numpy>=1.17.3 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from imbalanced-learn->imblearn) (2.1.1)\n", + "Requirement already satisfied: scipy>=1.5.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from imbalanced-learn->imblearn) (1.14.1)\n", + "Requirement already satisfied: scikit-learn>=1.0.2 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from imbalanced-learn->imblearn) (1.5.2)\n", + "Requirement already satisfied: joblib>=1.1.1 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from imbalanced-learn->imblearn) (1.4.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in d:\\мии\\aim-pibd-31-kouvshinoff-t-a\\laba\\lib\\site-packages (from imbalanced-learn->imblearn) (3.5.0)\n", + "Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)\n", + "Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)\n", + "Installing collected packages: imbalanced-learn, imblearn\n", + "Successfully installed imbalanced-learn-0.12.4 imblearn-0.0\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install imblearn" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Обучающая выборка после oversampling: (100447, 6)\n", + "hazardous\n", + "True 51250\n", + "False 49197\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
from imblearn.over_sampling import ADASYN

# Seeded ADASYN instance: the synthetic minority samples are generated
# randomly, so a fixed seed is required for reproducible results.
ada = ADASYN(random_state=42)

# Oversample the training subset only; validation and test data stay untouched.
X_resampled, y_resampled = ada.fit_resample(
    df_train.drop(columns=['hazardous']), df_train['hazardous']
)

# Re-attach the target as the last column, producing a new frame.
df_train_adasyn = X_resampled.assign(hazardous=y_resampled)

# Report shape and class balance of the oversampled training subset.
print("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
hazardous_counts = df_train_adasyn['hazardous'].value_counts()
print(hazardous_counts)
plt.figure(figsize=(2, 2))
plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)
# Title typos fixed: "тренировачной" -> "тренировочной", "почле" -> "после".
plt.title('Распределение классов hazardous в тренировочной выборке после ADASYN')
plt.show()